Skip to content

Commit 97d5aaa

Browse files
committed
Updating tests
1 parent b47148d commit 97d5aaa

5 files changed

Lines changed: 73 additions & 2 deletions

File tree

Chapter02/testing/helper.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,24 @@
1010

1111
# Define simulate ride data function
1212
def simulate_ride_data():
13+
"""
14+
Simulates ride data.
15+
16+
Simulates 370 ride distances with normal distribution around 10,
17+
10 ride distances with normal distribution around 30 (long distances),
18+
10 ride distances with normal distribution around 10 (same distance),
19+
and 10 ride distances with normal distribution around 10 (same distance).
20+
21+
Simulates 370 ride speeds with normal distribution around 30,
22+
10 ride speeds with normal distribution around 30 (same speed),
23+
10 ride speeds with normal distribution around 50 (high speed),
24+
and 10 ride speeds with normal distribution around 15 (low speed).
25+
26+
Assembles them into a Data Frame with ride_id as the index.
27+
28+
Returns:
29+
df_sim (pandas.DataFrame): A DataFrame containing simulated ride data.
30+
"""
1331
# Simulate some ride data ...
1432
ride_dists = np.concatenate(
1533
(
@@ -44,9 +62,19 @@ def simulate_ride_data():
4462

4563

4664
def get_taxi_data():
65+
66+
"""
67+
Reads in taxi ride data from a csv file or simulates it if not present.
68+
69+
Args:
70+
None
71+
72+
Returns:
73+
df (pandas.DataFrame): A DataFrame containing taxi ride data.
74+
"""
4775
# If data present, read it in
48-
#file_path = f'''../../chapter1/batch-anomaly/data/taxi-rides.csv''' #relative
49-
file_path = f'''chapter1/batch-anomaly/data/taxi-rides.csv''' #from top dir
76+
file_path = '../../Chapter01/clustering/taxi-rides.csv' #relative
77+
#file_path = f'''chapter1/batch-anomaly/data/taxi-rides.csv''' #from top dir
5078
if os.path.exists(file_path):
5179
df = pd.read_csv(file_path)
5280
else:

Chapter02/testing/model.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,21 @@
55

66

77
def cluster_and_label(X):
8+
"""
9+
Clusters the given data with DBSCAN algorithm and returns the results.
10+
11+
Parameters
12+
----------
13+
X : numpy.ndarray
14+
Array of data points to be clustered.
15+
16+
Returns
17+
-------
18+
run_metadata : dict
19+
A dictionary containing the results of the clustering algorithm.
20+
It includes the estimated number of clusters, the estimated number of noise points,
21+
the silhouette coefficient, and the labels of the data points.
22+
"""
823
X = StandardScaler().fit_transform(X)
924
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
1025

Chapter02/testing/test_basic.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
def test_example():
    """Trivial placeholder confirming the test suite is wired up and runs."""
    pass

Chapter02/testing/test_model_performance.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212

1313
@pytest.fixture
1414
def test_dataset() -> Union[np.array, np.array]:
15+
"""
16+
Returns a tuple containing the test dataset and the corresponding labels.
17+
The dataset is the wine dataset, with the label being True for class 2 and False otherwise.
18+
The dataset is split into a training and test set using `train_test_split` with a random state of 42.
19+
"""
1520
# Load the dataset
1621
X, y = load_wine(return_X_y=True)
1722
# create an array of True for 2 and False otherwise
@@ -22,18 +27,31 @@ def test_dataset() -> Union[np.array, np.array]:
2227

2328
@pytest.fixture
def model() -> sklearn.ensemble._forest.RandomForestClassifier:
    """Provide the pre-trained wine-classification random forest.

    Downloads the serialized classifier from the Hugging Face Hub and
    deserializes it with joblib so the performance tests can run
    inference against a known, fixed model artifact.

    Returns:
        sklearn.ensemble._forest.RandomForestClassifier: the trained model.
    """
    repo_id = "electricweegie/mlewp-sklearn-wine"
    filename = "rfc.joblib"
    # hf_hub_download caches the artifact locally; joblib rebuilds the estimator.
    return joblib.load(hf_hub_download(repo_id, filename))
2938

3039

3140
def test_model_inference_types(model, test_dataset):
41+
"""
42+
Tests that the model's predict method returns a numpy array and that the test dataset is composed of numpy arrays.
43+
"""
44+
3245
assert isinstance(model.predict(test_dataset[0]), np.ndarray)
3346
assert isinstance(test_dataset[0], np.ndarray)
3447
assert isinstance(test_dataset[1], np.ndarray)
3548

3649
def test_model_performance(model, test_dataset):
50+
"""
51+
Tests the performance of the model on the test dataset.
52+
The performance is measured using the F1-score and precision metrics.
53+
The model is expected to achieve an F1-score greater than 0.95 and a precision greater than 0.9 for class False, and an F1-score greater than 0.8 and a precision greater than 0.8 for class True.
54+
"""
3755
metrics = classification_report(y_true=test_dataset[1], y_pred=model.predict(test_dataset[0]), output_dict=True)
3856
assert metrics['False']['f1-score'] > 0.95
3957
assert metrics['False']['precision'] > 0.9

Chapter02/testing/test_taxi_cluster_basic.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@
44

55
@pytest.mark.skip(reason="From edition 1, does not work due to not uploading taxi data in repo")
def test_cluster_and_label():
    """Verify the clustering pipeline yields a metadata dictionary.

    Loads the taxi ride data and runs cluster_and_label over it; the
    result is expected to be a dict. Currently skipped because the taxi
    data file is not present in the repository.
    """
    taxi_df = get_taxi_data()
    assert isinstance(cluster_and_label(taxi_df), dict)

0 commit comments

Comments
 (0)