RAITE_CV_MLOps/evaluate.py at main · abubake/RAITE_CV_MLOps · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
import os
import sys
import json
from typing import Union
from typing import List
from typing import Dict
from typing import Optional
from argparse import ArgumentParser

import torch
import torch.nn as nn
import numpy as np
from torchvision.ops import box_iou
import matplotlib.pyplot as plt

from RAITEDataset import RAITEDataset
from arguments import GroupParams
from arguments import EvalParams


class DetectionEvaluator:
    """ Evaluates object detection models given a dataset using average percision (AP), mAP, anf f1-score.

    DetectionEvaluator provides flexibility to evaluate object detection models
    with either a trained model or precomputed predictions from a JSON file.
    It supports custom IoU and confidence thresholds, and configurable area
    ranges for small, medium, and large object categories.

    Args:
        model: The object detection model, e.g., Faster R-CNN.
        dataset The dataset object (e.g., RAITEDataset).
        predictions_json: Path to or dictionary of precomputed predictions.
        iou_thresholds: IoU thresholds to evaluate AP at.
        confidence_thresholds: List of confidence thresholds.
        area_ranges: Dictionary defining area ranges for small, medium, and large objects.
        display_predictions: Whether to display predictions during evaluation.


    Raises:
    - ValueError: If neither a model nor predictions JSON is provided.
    """

    def __init__(
        self,
        model: Optional[nn.Module] = None,
        dataset: Optional[RAITEDataset] = None,
        predictions_json: Optional[Union[str, dict]] = None,
        iou_thresholds: Optional[List[float]] = None,
        confidence_thresholds: Optional[List[float]] = None,
        area_ranges: Optional[Dict[str, List[float]]] = None,
        display_predictions: bool = False,
    ):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.display_predictions = display_predictions
        self.iou_thresholds = iou_thresholds
        self.confidence_thresholds = confidence_thresholds
        self.area_ranges = area_ranges

        # Validate inputs and set attributes
        if model is not None:
            self._initialize_with_model(model, dataset)
        elif predictions_json is not None:
            self._initialize_with_json(predictions_json)
        else:
            raise ValueError(
                "Either a model or predictions_json must be provided for evaluation."
            )

    def _initialize_with_model(self, model, dataset):
        """Initialize evaluator with a model and dataset."""
        self.model = model.to(self.device)
        self.dataset = dataset
        self.predictions = None

    def _initialize_with_json(self, predictions_json):
        """Initialize evaluator with precomputed predictions from JSON."""
        self.model = None
        self.dataset = None
        self.predictions = self._load_predictions_from_json(predictions_json)

    def _load_predictions_from_json(self, predictions_json):
        """Load predictions from a JSON file or dictionary.

        Args:
        - predictions_json (str or dict): File path or dictionary with predictions.

        Returns:
        - dict: Loaded predictions data.
        """
        if isinstance(predictions_json, str):
            with open(predictions_json, "r") as file:
                return json.load(file)
        elif isinstance(predictions_json, dict):
            return predictions_json
        else:
            raise TypeError(
                "predictions_json must be a file path (str) or a dictionary."
            )

    def evaluate(self):
        """
        Evaluates the model on the dataset.
        Args:
        - confidence_threshold_step: Step size for confidence thresholding.
        """
        if self.predictions is not None:
            # Use precomputed predictions from JSON
            predictions = self.predictions
            res_dict = self.compute_tp_fp_fn_for_all_params()
            res_dict = self._calculate_ap(res_dict)
            area_based_results = None  # TODO: implement
        else:
            self.model.eval()

            res_dict = self.compute_tp_fp_fn_for_all_params()

            # Compute AP at different IoU thresholds
            res_dict = self._calculate_ap(res_dict)

            # Compute area-based AP (AS, AM, AL)
            area_based_results = self._calculate_area_based_ap()

        return res_dict, area_based_results

    def compute_tp_fp_fn_for_all_params(self):
        """
        Takes in the iou thresholds and interval of confidence score to evaluate across, and computes dictionary of
        TP, FP, FN.

        Returns:
            - Dictionary for each IOU threshold and each score threshold.
        """
        res_dict = {
            iou_thresh: {
                confidence_thresh: {"TP": 0, "FP": 0, "FN": 0}
                for confidence_thresh in self.confidence_thresholds
            }
            for iou_thresh in self.iou_thresholds
        }

        if self.predictions is not None:

            for iou_thresh in self.iou_thresholds:
                for confidence_thresh in self.confidence_thresholds:
                    for frame_id, data in self.predictions.items():

                        # Define the device, e.g., 'cuda:0' if available
                        device = "cuda:0" if torch.cuda.is_available() else "cpu"

                        # Convert prediction data to tensors on the correct device
                        boxes_pred = torch.tensor(
                            data["boxes"], device=device, dtype=torch.float32
                        )
                        scores_pred = torch.tensor(
                            data["scores"], device=device, dtype=torch.float32
                        )
                        labels_pred = torch.tensor(
                            data["labels"], device=device, dtype=torch.int64
                        )

                        predictions = [
                            {
                                "boxes": boxes_pred,
                                "scores": scores_pred,
                                "labels": labels_pred,
                            }
                        ]

                        # Convert target data to tensors on the correct device
                        boxes_true = torch.tensor(
                            data["target_boxes"], device=device, dtype=torch.float32
                        )
                        labels_true = torch.tensor(
                            data["target_labels"], device=device, dtype=torch.int64
                        )

                        target = {"boxes": boxes_true, "labels": labels_true}

                        tp, fp, fn = self._compute_tp_fp_fn(
                            predictions, target, iou_thresh, confidence_thresh
                        )  # add predictions in right format

                        res_dict[iou_thresh][confidence_thresh]["TP"] += tp
                        res_dict[iou_thresh][confidence_thresh]["FP"] += fp
                        res_dict[iou_thresh][confidence_thresh]["FN"] += fn

        else:

            for iou_thresh in self.iou_thresholds:
                for confidence_thresh in self.confidence_thresholds:
                    for image, target, _, _ in self.dataset:

                        image_tensor = (
                            torch.from_numpy(image).permute(2, 0, 1).float()
                        )  # Convert from (H, W, C) to (C, H, W)

                        with torch.no_grad():
                            predictions = self.model([image_tensor.to(self.device)])
                            tp, fp, fn = self._compute_tp_fp_fn(
                                predictions, target, iou_thresh, confidence_thresh
                            )

                            res_dict[iou_thresh][confidence_thresh]["TP"] += tp
                            res_dict[iou_thresh][confidence_thresh]["FP"] += fp
                            res_dict[iou_thresh][confidence_thresh]["FN"] += fn

                            if self.display_predictions:
                                self.display_predictions(image, predictions[0], target)
        return res_dict

    def _compute_tp_fp_fn(self, predictions, target, iou_thresh, confidence_thresh):
        """
        Computes true positives, false positives, and false negatives.
        Args:
        - predictions: Model predictions.
        - target: Ground truth annotations.

        Returns:
        - tp, fp, fn: True positives, false positives, and false negatives.
        """
        boxes_pred = predictions[0]["boxes"].to(
            self.device
        )  # Move predictions to the same device
        labels_pred = predictions[0]["labels"].to(
            self.device
        )  # Ensure labels are also on the same device
        scores_pred = predictions[0]["scores"].to(
            self.device
        )  # Ensure scores are on the same device

        boxes_true = target["boxes"].to(
            self.device
        )  # Move ground truth boxes to the same device
        labels_true = target["labels"].to(self.device)

        tp_count = 0
        fp_count = 0
        fn_count = 0

        #  this function operates on each image in the test set.

        if len(boxes_pred) == 0:  # No predictions
            fn_count += len(boxes_true)  # All true boxes are false negatives
            return tp_count, fp_count, fn_count  # Return counts

        if len(boxes_true) == 0:
            fp_count += len(boxes_pred)
            return tp_count, fp_count, fn_count

        ious = box_iou(boxes_pred, boxes_true)  # TODO: verify the logic here
        max_iou, max_idx = ious.max(dim=1)

        # Initialize a boolean array to keep track of matched ground truth boxes
        matched_gt = torch.zeros(
            boxes_true.size(0), dtype=torch.bool, device=self.device
        )

        for i, (score, iou, label_pred) in enumerate(
            zip(scores_pred, max_iou, labels_pred)
        ):
            if score > confidence_thresh:  # Use the provided confidence threshold
                gt_idx = max_idx[i]
                label_true = labels_true[gt_idx]

                if iou >= iou_thresh and label_pred == label_true:
                    tp_count += 1
                    matched_gt[gt_idx] = True  # Mark this ground truth box as matched
                elif (
                    iou < iou_thresh and label_pred == label_true
                ):  # Correct class but IoU less than threshold:
                    fp_count += 1
                else:
                    fp_count += 1

        fn_count = len(boxes_true) - matched_gt.sum().item()

        return tp_count, fp_count, fn_count

    def _calculate_ap(self, res_dict):
        """
        Calculates average precision at different IoU thresholds.
        """
        ap_results = {}

        for i, iou_thresh in enumerate(self.iou_thresholds):
            ap_results[iou_thresh] = (
                {}
            )  # Initialize a sub-dictionary for this IoU threshold

            # Iterate through each confidence threshold
            ap_terms = []
            for i, confidence_thresh in enumerate(self.confidence_thresholds):

                if i == (len(self.confidence_thresholds) - 1):
                    break
                # Retrieve TP, FP, FN counts for the current IoU and confidence threshold
                tp_sum = res_dict[iou_thresh][confidence_thresh]["TP"]
                fp_sum = res_dict[iou_thresh][confidence_thresh]["FP"]
                fn_sum = res_dict[iou_thresh][confidence_thresh]["FN"]

                # Calculate precision and recall
                precision = tp_sum / (tp_sum + fp_sum + 1e-10)
                recall = tp_sum / (tp_sum + fn_sum + 1e-10)

                # Retrieve TP, FP, FN counts for the current IoU and confidence threshold
                tp_sum = res_dict[iou_thresh][self.confidence_thresholds[i + 1]]["TP"]
                fn_sum = res_dict[iou_thresh][self.confidence_thresholds[i + 1]]["FN"]

                next_recall = tp_sum / (tp_sum + fn_sum)

                # Calculate AP using the formula
                ap_terms.append((recall - next_recall) * precision)

                # Store the AP result in the ap_results dictionary
                ap_results[iou_thresh][confidence_thresh] = [
                    precision,
                    recall,
                ]  # previously stored the result at each confidence

            ap_results[iou_thresh]["final_ap"] = np.sum(ap_terms)

        # Add the AP results to the original res_dict
        res_dict["ap_results"] = ap_results
        return res_dict

    def _calculate_area_based_ap(self):
        """
        Calculate area-based AP (small, medium, large objects).
        """
        area_results = {"AS": 0, "AM": 0, "AL": 0}
        # Calculate AP for each area range
        for area, (min_area, max_area) in self.area_ranges.items():
            # Area-specific logic goes here, similar to how AP is computed
            pass

        return area_results

    def find_highest_f1_score(self, ap_results_dict):
        """
        Finds the highest F1 score across all IoU and confidence thresholds.

        Args:
            ap_results_dict (dict): Dictionary containing precision and recall results at different IoU and confidence thresholds.

        Returns:
            dict: A dictionary containing the highest F1 score and its corresponding precision, recall, IoU, and confidence threshold.
        """
        max_f1 = 0
        best_result = {
            "f1_score": 0,
            "precision": 0,
            "recall": 0,
            "iou_threshold": None,
            "confidence_threshold": None,
        }

        for iou_thresh in self.iou_thresholds:
            for conf_thresh in self.confidence_thresholds:
                # Get precision and recall from the results dictionary
                res = ap_results_dict["ap_results"].get(iou_thresh, {}).get(conf_thresh)
                if res is None:
                    continue  # Skip if no result for this threshold

                precision = res[0]
                recall = res[1]

                # Calculate F1 score
                f1_val = self.f1_score(precision=precision, recall=recall)

                # Update if this is the highest F1 score found so far
                if f1_val > max_f1:
                    max_f1 = f1_val
                    best_result.update(
                        {
                            "f1_score": f1_val,
                            "precision": precision,
                            "recall": recall,
                            "iou_threshold": iou_thresh,
                            "confidence_threshold": conf_thresh,
                        }
                    )

        return best_result

    def f1_score(self, precision, recall):
        if precision + recall == 0:
            return 0  # Avoid division by zero
        return 2 * (precision * recall) / (precision + recall)

    def plot_precision_recall_curve(
        self, ap_results_dict, savePath, highlight_point=None
    ) -> None:
        """
        Plots separate Precision-Recall (PR) curves for each IoU threshold and highlights the point
        with the highest F1 score if provided.

        Args:
            ap_results_dict : Dictionary containing precision and recall
            results at different IoU and confidence thresholds.
            savePath: Directory path to save the plot.
            highlight_point: Dictionary with keys 'f1_score', 'precision', 'recall',
                                            'iou_threshold', and 'confidence_threshold' to mark
                                            the best F1 score on the plot.
        """
        plt.style.use("ggplot")
        # Iterate over IoU thresholds
        for iou_thresh in self.iou_thresholds:
            plt.figure(figsize=(12, 8))  # Create a new figure for each IoU threshold

            precisions = []
            recalls = []

            # Iterate over confidence thresholds
            for i, conf_thresh in enumerate(self.confidence_thresholds):
                if i == (len(self.confidence_thresholds) - 1):
                    break  # Stop at the last threshold

                res = ap_results_dict["ap_results"][iou_thresh][conf_thresh]
                precisions.append(res[0])
                recalls.append(res[1])
            plt.plot(
                recalls,
                precisions,
                label=f"Confidence thresholds : Step size 0.001",
                marker="o",
                markersize=6,
                linewidth=2,
                zorder=1,
            )
            if highlight_point and highlight_point["iou_threshold"] == iou_thresh:
                plt.scatter(
                    highlight_point["recall"],
                    highlight_point["precision"],
                    color="orange",
                    s=200,  # Increase size for visibility
                    marker="*",
                    edgecolor="black",  # Add black edge for contrast
                    linewidth=1,
                    zorder=3,
                    label=f'Best F1: {highlight_point["f1_score"]:.2f}',
                )
            plt.xlabel("Recall", fontsize=15)
            plt.ylabel("Precision", fontsize=15)
            plt.xticks(fontsize=12)
            plt.yticks(fontsize=12)
            plt.title(
                f"Precision-Recall Curve for IoU {iou_thresh}",
                fontsize=15,
                weight="bold",
            )
            plt.legend(
                loc="lower left",
                fontsize=15,
                shadow=True,
                fancybox=True,
                framealpha=0.8,
            )
            plt.grid(color="gray", linestyle="--", linewidth=0.5, alpha=0.7)
            plt.xlim(0, 1)
            plt.ylim(0, 1)
            # Save the plot with a unique filename for each IoU threshold
            plt.savefig(f"{savePath}/pr_curve_iou_{iou_thresh}.png")
            plt.close()

def evaluate_all_test_sets(
    model_path : Optional[str] = "models/ugvs/fasterrcnn_resnet50_fpn_ugv_v7.pth",
    eval_set_path : Optional[str] = "data/archive/test_sets/ugv",
    results_path : Optional[str] = "results",
    width : int = 400,
    height : int = 400,
    class_label_mapping : Dict[int, int] = {1: 1}
) -> None:
    """
    Evaluates all test sets specified.
    """
    model_name = os.path.basename(model_path)
    model_name = os.path.splitext(model_name)[0]
    model = torch.load(model_path)

    average_precisions_50 = []
    average_precisions_75 = []
    folder_paths = [
        os.path.join(eval_set_path, name)
        for name in os.listdir(eval_set_path)
        if os.path.isdir(os.path.join(eval_set_path, name))
    ]
    for path in folder_paths:

        dataset = RAITEDataset(

            f"{path}/images", f"{path}/labels", width, height, class_label_mapping
        )
        test_evaluator = DetectionEvaluator(model, dataset)
        ap_results, area_results = test_evaluator.evaluate()

        # for each test set, create a folder f in: test_sets/results/model_type
        # or results and print results to it.
        folder_name = os.path.basename(path)

        dir_name = os.path.dirname(eval_set_path)
        base_name = os.path.basename(eval_set_path)
        # results_dir = f'{dir_name}/results/{base_name}/{model_name}'
        results_dir = f"{results_path}/{base_name}/{model_name}"
        os.makedirs(results_dir, exist_ok=True)

        fp = os.path.join(
            results_dir, folder_name
        )  # makes a folder for each test for the model
        os.makedirs(fp, exist_ok=True)
        result_file = os.path.join(fp, "results.json")  # results file for that test

        # ALL PLOTS HERE:
        # Add all plots to the results folder for each test
        test_evaluator.plot_precision_recall_curve(ap_results, savePath=fp)

        # DICTIONARY DATA HERE:
        # Dump dictionary into the json file.
        with open(result_file, "w") as file:
            json.dump(ap_results, file, indent=4)

        # SAVE AVERAGE PRECISION FOR TEST TO BE USED FOR mAP
        average_precisions_50.append(ap_results["ap_results"][0.5]["final_ap"])
        average_precisions_75.append(ap_results["ap_results"][0.75]["final_ap"])

    # Once complete, find mAP across all.
    file_path = os.path.join(results_dir, "overall_results.txt")  # creates results.txt
    mAP_50 = sum(average_precisions_50) / len(average_precisions_50)
    mAP_75 = sum(average_precisions_75) / len(average_precisions_75)

    with open(file_path, "a") as f:  # Use 'a' to append to the file
        f.write(f"Mean Average Precision at 50% IoU : {mAP_50:.4f}\n")
        f.write(f"Mean Average Precision at 75% IoU : {mAP_75:.4f}\n")


if __name__ == "__main__":
    parser = ArgumentParser(description="evaluation script parameters")
    eval_params = EvalParams(parser)
    args = parser.parse_args(sys.argv[1:])
    eval_args: GroupParams = eval_params.extract(args)

    if eval_args.json:  # If only JSON is provided:
        evaluator = DetectionEvaluator(
            model=None, dataset=None, predictions_json=eval_args.json
        )
        ap_results, area_results = evaluator.evaluate()
        f1 = evaluator.find_highest_f1_score(ap_results)
        print(f1)
        evaluator.plot_precision_recall_curve(
            ap_results, savePath=eval_args.results_path, highlight_point=f1
        )
    elif (
        eval_args.dataset_path
    ):  # Requires model path, dataset, and class label mappings
        detection_dataset = RAITEDataset(
            eval_args.dataset_path + "/images",
            eval_args.dataset_path + "/labels",
            eval_args.Width,
            eval_args.Height,
            eval_args.label_mappings,
        )  # {0:0, 4:4, 6:6}
        detection_model = torch.load(eval_args.model_path)
        evaluator = DetectionEvaluator(
            model=detection_model, dataset=detection_dataset, predictions_json=None
        )
        ap_results, area_results = evaluator.evaluate()
        print(ap_results)
        f1 = evaluator.find_highest_f1_score(ap_results)
        print(f1)
        evaluator.plot_precision_recall_curve(
            ap_results, savePath=eval_args.results_path, highlight_point=f1
        )
    else:  # If evaluation set (multiple datasets all on one model)
        evaluate_all_test_sets(
            model_path=eval_args.model_path,
            eval_set_path=eval_args.evaluation_set_path,
            results_path=eval_args.results_path,
            width=eval_args.Width,
            height=eval_args.Height,
            class_label_mapping=eval_args.label_mappings,
        )