diff --git a/971-Robot-Code b/971-Robot-Code index 117a420eb..4bb8d3d20 160000 --- a/971-Robot-Code +++ b/971-Robot-Code @@ -1 +1 @@ -Subproject commit 117a420ebbb98937e13ab05519d06c401b19140e +Subproject commit 4bb8d3d202e6415582f20a93290df787888e98fc diff --git a/zebROS_ws/src/deeptag_ros/CMakeLists.txt b/zebROS_ws/src/deeptag_ros/CMakeLists.txt index 4e5e389ea..012f6885a 100644 --- a/zebROS_ws/src/deeptag_ros/CMakeLists.txt +++ b/zebROS_ws/src/deeptag_ros/CMakeLists.txt @@ -201,6 +201,10 @@ target_link_libraries(run_inference_benchmark PUBLIC deeptag) set_target_properties(run_inference_benchmark PROPERTIES CUDA_ARCHITECTURES "86") set_property(TARGET run_inference_benchmark PROPERTY CUDA_SEPARABLE_COMPILATION ON) +add_executable(decoder_test src/decoder_test.cpp) +target_link_libraries(decoder_test PUBLIC deeptag) +set_target_properties(decoder_test PROPERTIES CUDA_ARCHITECTURES "86") +set_property(TARGET decoder_test PROPERTY CUDA_SEPARABLE_COMPILATION ON) # add_executable(softmax_test src/softmax_test.cpp) # target_link_libraries(softmax_test PUBLIC deeptag) # set_target_properties(softmax_test PROPERTIES CUDA_ARCHITECTURES "86") diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/confidence_filter.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/confidence_filter.h index 84ec7975b..6eaf355f6 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/confidence_filter.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/confidence_filter.h @@ -1,7 +1,7 @@ #ifndef CONFIDENCE_FILTER_INC__ #define CONFIDENCE_FILTER_INC__ -#include "span.hpp" +#include "deeptag_ros/span.hpp" #ifndef __host__ #define __host__ diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_image_tile.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_image_tile.h index 1005c8f2f..f425a961f 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_image_tile.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_image_tile.h @@ -3,7 +3,7 @@ #include #include "device_types.h" -#include "image_format.h" +#include "deeptag_ros/image_format.h" cudaError_t cudaImageTileRGB(const void *input, const imageFormat format, diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_ssd_preprocess.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_ssd_preprocess.h index 0e0e27f56..d69354eac 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_ssd_preprocess.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/cuda_ssd_preprocess.h @@ -1,7 +1,7 @@ #ifndef CUDA_SSD_PREPROCESSING_H__ #define CUDA_SSD_PREPROCESSING_H__ -#include "image_format.h" +#include "deeptag_ros/image_format.h" /* * Downsample and apply pixel normalization, NCHW format diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoded_tag.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoded_tag.h index 748b10114..f7e85ed1d 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoded_tag.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoded_tag.h @@ -3,7 +3,7 @@ #include #include #include "opencv2/core.hpp" -#include "points_and_ids.h" +#include "deeptag_ros/points_and_ids.h" template class DecodedTag @@ -13,10 +13,7 @@ class DecodedTag cv::Mat m_HCrop; int m_tagId; uint64_t m_binaryId; - std::array m_keypointsWithIds; - // TODO : this is just the points from the previous field copied into a different var - // see about combining them - std::array m_keypointsInImage; + PointsAndIDs m_keypointsWithIds; std::array m_roi; int m_mainIdx{0}; diff --git 
a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_engine.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_engine.h index 019dc8962..867ca939b 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_engine.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_engine.h @@ -1,10 +1,10 @@ #ifndef DECODER_ENGINE_INC__ #define DECODER_ENGINE_INC__ -#include "span.hpp" +#include "deeptag_ros/span.hpp" -#include "decoder_preprocess.h" -#include "engine.h" +#include "deeptag_ros/decoder_preprocess.h" +#include "deeptag_ros/engine.h" class DecoderEngineCalibrator : public Int8EntropyCalibrator2 { diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_preprocess.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_preprocess.h index 758d9f94a..38a32b640 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_preprocess.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_preprocess.h @@ -3,7 +3,7 @@ #include #include "cuda_runtime.h" // for cudaError -#include "image_format.h" +#include "deeptag_ros/image_format.h" class DecoderPreprocess { diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_softmax.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_softmax.h index 19dacc637..2ae4969dc 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_softmax.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/decoder_softmax.h @@ -2,8 +2,8 @@ #define DECODER_SOFTMAX_ #include -#include "decoder_softmax_results.h" -#include "span.hpp" +#include "deeptag_ros/decoder_softmax_results.h" +#include "deeptag_ros/span.hpp" // Class to handle DecoderSoftmax ops. diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/deeptag_impls.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/deeptag_impls.h index 47f4004a2..dbb5474da 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/deeptag_impls.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/deeptag_impls.h @@ -1,13 +1,13 @@ #ifndef DEEPTAG_IMPLS_INC__ #define DEEPTAG_IMPLS_INC__ -#include "cuda_event_timing.h" -#include "deeptag.h" -#include "detection_engine.h" -#include "marker_dict.h" -#include "pose_estimator.h" -#include "stag_decoder.h" -#include "stag_detector.h" +#include "deeptag_ros/cuda_event_timing.h" +#include "deeptag_ros/deeptag.h" +#include "deeptag_ros/detection_engine.h" +#include "deeptag_ros/marker_dict.h" +#include "deeptag_ros/pose_estimator.h" +#include "deeptag_ros/stag_decoder.h" +#include "deeptag_ros/stag_detector.h" class DeepTagImplBase { diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/detection_engine.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/detection_engine.h index 3c47e9608..9f3f2a352 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/detection_engine.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/detection_engine.h @@ -1,7 +1,7 @@ #ifndef DETECTION_ENGINE_INC__ #define DETECTION_ENGINE_INC__ -#include "engine.h" +#include "deeptag_ros/engine.h" class DetectionEngineCalibrator : public Int8EntropyCalibrator2 { public: diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/distorted_h_transform.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/distorted_h_transform.h index a3e6140d1..bdfeb5d71 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/distorted_h_transform.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/distorted_h_transform.h @@ -4,9 +4,9 @@ #include #include "opencv2/core.hpp" #include "opencv2/calib3d.hpp" -#include "warp_perspective_points.h" +#include 
"deeptag_ros/warp_perspective_points.h" #undef DEBUG -#include "debug.h" +#include "deeptag_ros/debug.h" template std::array fromCropToUndistorted(const std::array &pointsInCrop, diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/engine.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/engine.h index a1e9950d1..e59bb0477 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/engine.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/engine.h @@ -16,7 +16,7 @@ #include "NvInferRuntimeCommon.h" // for Dims, ILogger #include "driver_types.h" // for cudaStream_t, CUstream_st, CUgraph... -#include "cuda_event_timing.h" // for Timings +#include "deeptag_ros/cuda_event_timing.h" // for Timings class GpuImageWrapper; // Utility methods diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/gpu_image_wrapper.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/gpu_image_wrapper.h index e55351770..565373f6d 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/gpu_image_wrapper.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/gpu_image_wrapper.h @@ -3,7 +3,7 @@ #include #include "driver_types.h" // for cudaStream_t -#include "image_format.h" +#include "deeptag_ros/image_format.h" namespace cv { class Mat; diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/grid_prior.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/grid_prior.h index ee7eb6dd1..6de7431c4 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/grid_prior.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/grid_prior.h @@ -1,8 +1,8 @@ #ifndef GRID_PRIOR_INC__ #define GRID_PRIOR_INC__ #include -#include "grid_prior_value.h" -#include "span.hpp" +#include "deeptag_ros/grid_prior_value.h" +#include "deeptag_ros/span.hpp" // Class to handle all of the GridPrior ops. 
// Stores the grid priors (a map from model output indexes to image coords) along diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/image_format.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/image_format.h index 985ae2338..4ba4aafbe 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/image_format.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/image_format.h @@ -4,6 +4,7 @@ enum class imageFormat { IMAGE_MONO8, + IMAGE_MONO32F, IMAGE_RGB8, IMAGE_BGR8, IMAGE_RGBA8, diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/points_and_ids.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/points_and_ids.h index abcc4242d..a2df2153d 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/points_and_ids.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/points_and_ids.h @@ -2,25 +2,27 @@ #define POINTS_AND_IDS_INC__ #include "opencv2/core.hpp" +template struct PointsAndIDs { PointsAndIDs() - : m_point{0., 0.} - , m_id{-1} - , m_score{0.} { + std::ranges::fill(m_id, -1); + std::ranges::fill(m_score, -1); } - PointsAndIDs(const double x, const double y, const int id, const double score = 0) - : m_point{cv::Point2d{x, y}}, m_id{id}, m_score{score} - { - } - cv::Point2d m_point; - int m_id; - double m_score; + + std::array m_point{}; + std::array m_id{}; + std::array m_score{}; + + size_t size(void) const { return m_point.size(); } // all arrays are the same size friend std::ostream& operator<<(std::ostream &os, const PointsAndIDs &pid) { - os << pid.m_point.x << ", " << pid.m_point.y << " id = " << pid.m_id << " score = " << pid.m_score; + for (size_t ii = 0; ii < pid.m_point.size(); ii++) + { + os << pid.m_point[ii].x << ", " << pid.m_point[ii].y << " id = " << pid.m_id[ii] << " score = " << pid.m_score[ii] << std::endl; + } return os; } }; diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/pose_estimator.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/pose_estimator.h index 4a87858ef..0f3689647 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/pose_estimator.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/pose_estimator.h @@ -8,7 +8,7 @@ template struct BoolTag; #undef DEBUG -#include "debug.h" +#include "deeptag_ros/debug.h" template class PoseEstimator diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/roi_generator.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/roi_generator.h index 583312233..6f7efb979 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/roi_generator.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/roi_generator.h @@ -3,7 +3,7 @@ #include #include // for Point2d -#include "span.hpp" +#include "deeptag_ros/span.hpp" template class Stage1GridGroup; class Stage1SSDGroup; diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/softmax.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/softmax.h index 24ec19e55..bef98eceb 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/softmax.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/softmax.h @@ -3,7 +3,7 @@ #include // for int32_t #include "driver_types.h" // for cudaStream_t -#include "span.hpp" // for span +#include "deeptag_ros/span.hpp" // for span // Class to handle Softmax ops. 
// Input is result from model, all bg scores (h x w) followed by all fg scores for grid diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_grid_prior.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_grid_prior.h index e37dda26e..7e7f604bc 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_grid_prior.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_grid_prior.h @@ -6,8 +6,8 @@ #include #include -#include "span.hpp" -#include "ssd_box.h" +#include "deeptag_ros/span.hpp" +#include "deeptag_ros/ssd_box.h" // Class to handle all of the SSDGridPrior ops. // Stores the grid priors (a map from model output indexes to image coords) along diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_tag_keypoint.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_tag_keypoint.h index 445e5ed9b..2d0643fb7 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_tag_keypoint.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/ssd_tag_keypoint.h @@ -9,7 +9,7 @@ #endif #include -#include "ssd_box.h" +#include "deeptag_ros/ssd_box.h" class SSDTagKeypoint { public: diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_decoder.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_decoder.h index de9efb010..dde2f7551 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_decoder.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_decoder.h @@ -8,22 +8,22 @@ #include // for Point2d #include // for string #include // for vector -#include "decoded_tag.h" // for DecodedTag -#include "decoder_engine.h" // for DecoderEngine -#include "decoder_softmax.h" // for DecoderSoftmax -#include "grid_prior.h" // for GridPrior -#include "confidence_filter.h" // for ConfidenceFilter -#include "span.hpp" // for span -#include "stage2_corners.h" // for Stage2Corners -#include "stage2_keypoint_trust.h" // for Stage2KeypointTrust -#include "suppress_and_average_keypoints.h" // for SuppressAndAverageKeypoints +#include "deeptag_ros/decoded_tag.h" // for DecodedTag +#include "deeptag_ros/decoder_engine.h" // for DecoderEngine +#include "deeptag_ros/decoder_softmax.h" // for DecoderSoftmax +#include "deeptag_ros/grid_prior.h" // for GridPrior +#include "deeptag_ros/confidence_filter.h" // for ConfidenceFilter +#include "deeptag_ros/span.hpp" // for span +#include "deeptag_ros/stage2_corners.h" // for Stage2Corners +#include "deeptag_ros/stage2_keypoint_trust.h" // for Stage2KeypointTrust +#include "deeptag_ros/suppress_and_average_keypoints.h" // for SuppressAndAverageKeypoints class GpuImageWrapper; struct GridPriorValue; class Stage2Keypoint; class Stage2KeypointGroup; class Timings; -struct PointsAndIDs; +template struct PointsAndIDs; template class STagDecoder @@ -46,13 +46,14 @@ class STagDecoder void setMinGridMatchRatio(const double minGridMatchRatio); double getMinGridMatchRatio(void) const; ushort2 getModelSize(void) const; + cudaStream_t getCudaStream(void); private: void runInference(std::vector> &stage2KeypointGroupss, - std::vector> &stage2Corners, + std::vector> &stage2Corners, const std::vector> &detectInputs, const tcb::span> &rois); - void fillEmptyIds(std::array &orderedFineGridPointsIds, + void fillEmptyIds(PointsAndIDs &orderedFineGridPointsIds, const tcb::span &fineGridPointsWithIdsCandidates) const; const MARKER_DICT &m_markerDict; diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_detector.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_detector.h index 08f3d94b6..aa3bed456 100644 --- 
a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_detector.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stag_detector.h @@ -7,14 +7,14 @@ #include // for unique_ptr:w #include // for string #include // for vector -#include "confidence_filter.h" // for Stage1Predicate, Confide... +#include "deeptag_ros/confidence_filter.h" // for Stage1Predicate, Confide... #include "driver_types.h" // for CUevent_st, cudaEvent_t -#include "grid_prior.h" // for GridPrior -#include "softmax.h" // for Softmax -#include "span.hpp" // for span -#include "ssd_grid_prior.h" // for SSDGridPrior -#include "suppress_and_average_keypoints.h" // for SuppressAndAverageKeypoints -#include "tag_detect_info.h" // for TagDetectInfo +#include "deeptag_ros/grid_prior.h" // for GridPrior +#include "deeptag_ros/softmax.h" // for Softmax +#include "deeptag_ros/span.hpp" // for span +#include "deeptag_ros/ssd_grid_prior.h" // for SSDGridPrior +#include "deeptag_ros/suppress_and_average_keypoints.h" // for SuppressAndAverageKeypoints +#include "deeptag_ros/tag_detect_info.h" // for TagDetectInfo #include "vector_types.h" // for ushort2 template class DetectionEngine; class GpuImageWrapper; diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_grid.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_grid.h index c38640bab..ffe7349b4 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_grid.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_grid.h @@ -3,7 +3,7 @@ #include #include "vector_types.h" -#include "grid_prior_value.h" +#include "deeptag_ros/grid_prior_value.h" #ifndef __host__ #define __host__ diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_ssd_group.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_ssd_group.h index 15912d9ed..8480275c0 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_ssd_group.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage1_ssd_group.h @@ -6,7 +6,7 @@ #ifndef __device__ #define __device__ #endif -#include "ssd_box.h" +#include "deeptag_ros/ssd_box.h" // Used to store merged Stage1Grid similar data as a single // group rather than deal with duplicates of individual results diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_corners.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_corners.h index 438c7750d..89b167675 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_corners.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_corners.h @@ -2,8 +2,8 @@ #define STAGE2_CORNERS_INC__ #include "cuda_runtime.h" -#include "grid_prior_value.h" -#include "span.hpp" +#include "deeptag_ros/grid_prior_value.h" +#include "deeptag_ros/span.hpp" class Stage2Corners { @@ -18,7 +18,7 @@ class Stage2Corners virtual ~Stage2Corners(); void compute(const float *input, const tcb::span &priors, const float centerVariance, cudaStream_t cudaStream); - const tcb::span getDeviceOutput(void); + // const tcb::span getDeviceOutput(void); const tcb::span getHostOutput(void); private: diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint.h index 22553ed57..e719341a7 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint.h @@ -1,7 +1,7 @@ #ifndef STAGE2_KEYPOINT_INC__ #define STAGE2_KEYPOINT_INC__ #include -#include "grid_prior_value.h" +#include "deeptag_ros/grid_prior_value.h" #ifndef __host__ #define __host__ 
diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint_trust.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint_trust.h index 00289e3f8..b110da2f4 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint_trust.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/stage2_keypoint_trust.h @@ -1,8 +1,8 @@ #ifndef STAGE2_KEYPOINT_TRUST_INC__ #define STAGE2_KEYPOINT_TRUST_INC__ #include -#include "stage2_keypoint.h" -#include "span.hpp" +#include "deeptag_ros/stage2_keypoint.h" +#include "deeptag_ros/span.hpp" // Given a list of keypoints, check that enough of them // have a high enough confidence that we trust that the group diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/suppress_and_average_keypoints.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/suppress_and_average_keypoints.h index a699881c8..23c7dd8fb 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/suppress_and_average_keypoints.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/suppress_and_average_keypoints.h @@ -1,7 +1,7 @@ #ifndef SUPPRESS_AND_AVERAGE_KEYPOINTS_H__ #define SUPPRESS_AND_AVERAGE_KEYPOINTS_H__ -#include "span.hpp" +#include "deeptag_ros/span.hpp" template class SuppressAndAverageKeypoints diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_arucotag.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_arucotag.h index a44591add..ef79a5014 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_arucotag.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_arucotag.h @@ -4,7 +4,7 @@ #include #include -#include "unit_chessboard_tag.h" +#include "deeptag_ros/unit_chessboard_tag.h" template std::array constexpr makeBinaryIDArray() diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_chessboard_tag.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_chessboard_tag.h index c0e288cb9..9ae5dad2a 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_chessboard_tag.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_chessboard_tag.h @@ -9,7 +9,7 @@ #include "opencv2/core/mat.hpp" #include "opencv2/core/types.hpp" -#include "bool_tag.h" +#include "deeptag_ros/bool_tag.h" class UnitChessboardTagBase { diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_tag_template.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_tag_template.h index 00e315a39..2019ac781 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_tag_template.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/unit_tag_template.h @@ -8,7 +8,7 @@ #include "opencv2/core/mat.hpp" #include "opencv2/core/types.hpp" -#include "span.hpp" +#include "deeptag_ros/span.hpp" #include "vector_types.h" //#define DEBUG @@ -17,8 +17,7 @@ #endif class Stage2KeypointGroup; -struct PointsAndIDs; - +template struct PointsAndIDs; template class UnitTagTemplate @@ -37,11 +36,11 @@ class UnitTagTemplate // mainIdx is the orientation of the tag, in 90* increments (0-3) // the two outputs are the rotated fine grid points (points including tag borders), // and rotated keypoints (just the "data" bits of the tag) - template - void reorderPointsWithMainIdx(std::array &fineGridPointsRotated, - std::array &keypointsRotated, + template + void reorderPointsWithMainIdx(T &fineGridPointsRotated, + T1 &keypointsRotated, const size_t mainIdx, - const std::array &fineGridPoints) const + const T &fineGridPoints) const { constexpr auto N = GRID_SIZE + 2; auto reorderedXYs = m_unitTags.getFineGridPoints(mainIdx, false, STEP_ELEM_NUM); @@ -67,6 
+66,26 @@ class UnitTagTemplate #endif } } + + void reorderPointsWithMainIdx(PointsAndIDs &fineGridPointsRotated, + const size_t mainIdx, + const PointsAndIDs &fineGridPoints) const + { + constexpr auto N = GRID_SIZE + 2; + const auto reorderedXYs = m_unitTags.getFineGridPoints(mainIdx, false, STEP_ELEM_NUM); + for (size_t i = 0; i < fineGridPointsRotated.size(); i++) + { + const auto &p = reorderedXYs[i]; + const size_t idx = p.y / STEP_ELEM_NUM * N + p.x / STEP_ELEM_NUM; + fineGridPointsRotated.m_point[i] = fineGridPoints.m_point[idx]; + fineGridPointsRotated.m_id[i] = fineGridPoints.m_id[idx]; + fineGridPointsRotated.m_score[i] = fineGridPoints.m_score[idx]; +#ifdef DEBUG + std::cout << "fineGridPointsRotated[" << i << "] = " << fineGridPointsRotated[i] << std::endl; +#endif + } + } + #if 0 template void reorderPointsWithMainIdx(std::vector &fineGridPointsRotated, std::vector &keypointsRotated, @@ -93,16 +112,17 @@ class UnitTagTemplate keypointsRotated.push_back(fineGridPoints[idx]); } } + #endif void matchFineGrid(double &matchRatio, - std::array &fineGridPointsAndIDs, + PointsAndIDs &bestOrderedPoints, const tcb::span &stage2KeypointGroups, const cv::Mat &H, const tcb::span &stage2PredCorners, const cv::Mat &cameraMatrix, const cv::Mat &distCoeffs) const; - std::array updateCornersInImage(const std::array &orderedPoints, + std::array updateCornersInImage(const PointsAndIDs &orderedPoints, const cv::Mat &HCrop, const cv::Mat &cameraMatrix, const cv::Mat &distCoeffs) const; @@ -115,7 +135,7 @@ class UnitTagTemplate const UNIT_TAG_CLASS m_unitTags; }; -#include "unit_arucotag.h" +#include "deeptag_ros/unit_arucotag.h" template class UnitTagTemplateArucotag : public UnitTagTemplate, false, 1, 0> { diff --git a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/warp_perspective_points.h b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/warp_perspective_points.h index dc64f2da5..7d48f065c 100644 --- a/zebROS_ws/src/deeptag_ros/include/deeptag_ros/warp_perspective_points.h +++ b/zebROS_ws/src/deeptag_ros/include/deeptag_ros/warp_perspective_points.h @@ -11,7 +11,7 @@ #include // for vector #undef DEBUG -#include "debug.h" +#include "deeptag_ros/debug.h" std::vector warpPerspectivePts(const cv::Mat &H, const std::vector &points, const double image_scale = 1.); diff --git a/zebROS_ws/src/deeptag_ros/models/aprilltag_decoder_mono.onnx b/zebROS_ws/src/deeptag_ros/models/aprilltag_decoder_mono.onnx new file mode 100644 index 000000000..b1e6dbb32 --- /dev/null +++ b/zebROS_ws/src/deeptag_ros/models/aprilltag_decoder_mono.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29678b7526e597a35b28b35bdd845c2977e1a099f8a7c36a93b4eb68551a29cf +size 901355 diff --git a/zebROS_ws/src/deeptag_ros/models/apriltag_decoder_mono.onnx b/zebROS_ws/src/deeptag_ros/models/apriltag_decoder_mono.onnx new file mode 100644 index 000000000..409d32873 --- /dev/null +++ b/zebROS_ws/src/deeptag_ros/models/apriltag_decoder_mono.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79292d8d2b66cfa1fc3582bb6df07dce0296d49f0a5032676f2da53c349ea0e1 +size 779902 diff --git a/zebROS_ws/src/deeptag_ros/models/create_engine.sh b/zebROS_ws/src/deeptag_ros/models/create_engine.sh index 430411d1d..5ad51bcb1 100755 --- a/zebROS_ws/src/deeptag_ros/models/create_engine.sh +++ b/zebROS_ws/src/deeptag_ros/models/create_engine.sh @@ -2,6 +2,12 @@ # Then run deeptag_ros and see what specific int8 file name it wants (e.g. 
Searching for engine file with name: /home/ubuntu/.900RobotCode.readonly/zebROS_ws/src/deeptag_ros/models/arucotag_roi_detector.engine.NVIDIAGeForceRTX4060LaptopGPU.int8.4.4) # ln -s arucotag_roi_detector_int8_batch4.engine aruco_roi_detector.engine.NVIDIAGeForceRTX4060LaptopGPU.int8.4.4 python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py arucotag_decoder.onnx --output arucotag_decoder.engine --fp16 --batch-size-min=1 --batch-size-max=4 --batch-size=4 --max-workspace-size=1073741824 +python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py apriltag_decoder_mono.onnx --output apriltag_decoder_mono_fp16_batch1.engine --fp16 --batch-size-min=1 --batch-size-max=1 --batch-size=1 --max-workspace-size=1073741824 +python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py apriltag_decoder_mono.onnx --output apriltag_decoder_mono_fp16_batch2.engine --fp16 --batch-size-min=2 --batch-size-max=2 --batch-size=2 --max-workspace-size=1073741824 +python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py apriltag_decoder_mono.onnx --output apriltag_decoder_mono_fp16_batch3.engine --fp16 --batch-size-min=3 --batch-size-max=3 --batch-size=3 --max-workspace-size=1073741824 +python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py apriltag_decoder_mono.onnx --output apriltag_decoder_mono_int8_batch1.engine --int8 --fp16 --batch-size-min=1 --batch-size-max=1 --batch-size=1 --max-workspace-size=1073741824 +python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py apriltag_decoder_mono.onnx --output apriltag_decoder_mono_int8_batch2.engine --int8 --fp16 --batch-size-min=2 --batch-size-max=2 --batch-size=2 --max-workspace-size=1073741824 +python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py apriltag_decoder_mono.onnx --output apriltag_decoder_mono_int8_batch3.engine --int8 --fp16 --batch-size-min=3 --batch-size-max=3 --batch-size=3 --max-workspace-size=1073741824 #python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py arucotag_decoder.onnx --output arucotag_decoder_int8.engine --fp16 --int8 --batch-size-min=1 --batch-size-max=4 --batch-size=4 --max-workspace-size=1073741824 --dataset-path /home/ubuntu/tensorflow_workspace/2023Game/data/combined_88_test --calibration-file arucotag_decoder.calib #python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py arucotag_roi_detector.onnx --output arucotag_roi_detector.engine --fp16 --input-width=1088 --input-width-min=512 --input-width-max=2048 --max-workspace-size=1073741824 @@ -10,3 +16,4 @@ python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py arucotag_roi_detector.onnx --output arucotag_roi_detector_int8_batch5.engine --fp16 --input-width=1088 --input-width-min=960 --input-width-max=1280 --batch-size=5 --int8 --max-workspace-size=1073741824 --dataset-path /home/ubuntu --calibration-file arucotag_roi_detector.calib python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py arucotag_roi_detector.onnx --output arucotag_roi_detector_int8_batch9.engine --fp16 --input-width=1088 --input-width-min=960 --input-width-max=1280 --batch-size=9 --int8 --max-workspace-size=1073741824 --dataset-path /home/ubuntu --calibration-file arucotag_roi_detector.calib python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py
arucotag_roi_detector.onnx --output arucotag_roi_detector_int8_batch10.engine --fp16 --input-width=1088 --input-width-min=960 --input-width-max=1280 --batch-size=10 --int8 --max-workspace-size=1073741824 --dataset-path /home/ubuntu --calibration-file arucotag_roi_detector.calib +python3 ~/900RobotCode/zebROS_ws/src/tf_object_detection/src/onnx_to_tensorrt.py arucotag_roi_detector.onnx --output arucotag_roi_detector_int8_batch10.engine --fp16 --input-width=1088 --input-width-min=960 --input-width-max=1280 --batch-size=10 --int8 --max-workspace-size=1073741824 --dataset-path /home/ubuntu --calibration-file arucotag_roi_detector.calib diff --git a/zebROS_ws/src/deeptag_ros/src/decoder_engine.cpp b/zebROS_ws/src/deeptag_ros/src/decoder_engine.cpp index 6717ca859..64c4f9c62 100644 --- a/zebROS_ws/src/deeptag_ros/src/decoder_engine.cpp +++ b/zebROS_ws/src/deeptag_ros/src/decoder_engine.cpp @@ -151,8 +151,8 @@ void DecoderEngine::blobFromGpuImageWrappers(const std::vector #ifdef DEBUG static int callNum = 0; #endif - constexpr size_t outputHW = 256; - constexpr size_t imgSize = outputHW * outputHW * 3; + const size_t outputHW = 256; // This assumes a square image + const size_t imgSize = outputHW * outputHW * batchInput[0].channels(); const size_t thisBatchSize = std::min(m_rois.size(), static_cast(m_options.maxBatchSize)); //std::cout << "thisBatchSize = " << thisBatchSize << std::endl; // Get crop images ordered corners @@ -173,7 +173,11 @@ void DecoderEngine::blobFromGpuImageWrappers(const std::vector cv::Mat inputRoi(4, 2, CV_64FC1); for (size_t batchIdx = 0; batchIdx < thisBatchSize; batchIdx++) { - // std::cout << "batchIdx = " << batchIdx << std::endl; + // Create a mapping from the input roi (tag corners) in + // the input image to a fixed position in the output image + // Assign to a location in the output image with a border + // to catch the tag corners even if the initial detection + // is off by a bit. for (int i = 0; i < 4; i++) { inputRoi.at(i, 0) = m_rois[batchIdx][i].x; @@ -193,39 +197,46 @@ void DecoderEngine::blobFromGpuImageWrappers(const std::vector std::cout << "H = " << std::endl << "\t" << H << std::endl; std::cout << "H.inv() = " << std::endl << "\t" << H.inv() << std::endl; #endif - m_hH[batchIdx][0] = static_cast(H.at(0, 0)); - m_hH[batchIdx][1] = static_cast(H.at(0, 1)); - m_hH[batchIdx][2] = static_cast(H.at(0, 2)); - m_hH[batchIdx][3] = static_cast(H.at(1, 0)); - m_hH[batchIdx][4] = static_cast(H.at(1, 1)); - m_hH[batchIdx][5] = static_cast(H.at(1, 2)); - m_hH[batchIdx][6] = static_cast(H.at(2, 0)); - m_hH[batchIdx][7] = static_cast(H.at(2, 1)); - m_hH[batchIdx][8] = static_cast(H.at(2, 2)); - // Calculate H mat from roi - // Get inv transform matrix (from dest to src) - // pass this into a kernel which creates output image at requested - // buffer output - // Needs to pick appropriate input pixel for each output pixel, possible with bilinear filtering - // then split channels, convert to float. 
- cudaSafeCall(m_decoderPreprocess[batchIdx].decoderPreprocessRGB(m_hH[batchIdx], - batchInput[0].getDataPtr(), - imageFormat::IMAGE_RGB8, - batchInput[0].cols(), - batchInput[0].rows(), - static_cast(m_buffers[inputIdx]) + batchIdx * imgSize, - outputHW, - outputHW, - float2{0., 1.}, - m_preprocCudaStreams[batchIdx])); - + // Use a separate H matrix for each batch entry since they + // might not be copied to the device before the next + // iteration writes over the same memory + for (size_t i = 0; i < 9; i++) + { + m_hH[batchIdx][i] = static_cast(H.at(i / 3, i % 3)); + } + if (batchInput[0].channels() == 1) + { + cudaSafeCall(m_decoderPreprocess[batchIdx].decoderPreprocessGray(m_hH[batchIdx], + batchInput[0].getDataPtr(), + imageFormat::IMAGE_MONO8, + batchInput[0].cols(), + batchInput[0].rows(), + static_cast(m_buffers[inputIdx]) + batchIdx * imgSize, + outputHW, + outputHW, + float2{0., 1.}, + m_preprocCudaStreams[batchIdx])); + } + else + { + cudaSafeCall(m_decoderPreprocess[batchIdx].decoderPreprocessRGB(m_hH[batchIdx], + batchInput[0].getDataPtr(), + imageFormat::IMAGE_RGB8, + batchInput[0].cols(), + batchInput[0].rows(), + static_cast(m_buffers[inputIdx]) + batchIdx * imgSize, + outputHW, + outputHW, + float2{0., 1.}, + m_preprocCudaStreams[batchIdx])); + } cudaSafeCall(cudaEventRecord(m_preprocCudaEvents[batchIdx], m_preprocCudaStreams[batchIdx])); #ifdef DEBUG cv::Mat m = getDebugImage(batchIdx); std::stringstream s; s << "C" << callNum << "B" << batchIdx; cv::imshow(s.str().c_str(), m); - cv::imwrite(s.str() + ".png", m); + // cv::imwrite(s.str() + ".png", m); #endif } for (size_t batchIdx = 0; batchIdx < thisBatchSize; batchIdx++) @@ -235,7 +246,7 @@ void DecoderEngine::blobFromGpuImageWrappers(const std::vector #ifdef DEBUG callNum += 1; - constexpr size_t channelStride = outputHW * outputHW; + const size_t channelStride = outputHW * outputHW; #if 0 std::cout << " imgSize = " << imgSize << std::endl; cv::Mat hR(outputHW, outputHW, CV_32FC1); @@ -282,7 +293,7 @@ nvinfer1::Dims DecoderEngine::inputDimsFromInputImage(const GpuImageWrapper &gpu // Decoder is fixed at 3, 256, 256 return nvinfer1::Dims{4, {modelInputDims.d[0], - 3, + static_cast(gpuImg.channels()), 256, 256}}; } diff --git a/zebROS_ws/src/deeptag_ros/src/decoder_preprocess.cu b/zebROS_ws/src/deeptag_ros/src/decoder_preprocess.cu index fa189dfa1..73a6742bb 100644 --- a/zebROS_ws/src/deeptag_ros/src/decoder_preprocess.cu +++ b/zebROS_ws/src/deeptag_ros/src/decoder_preprocess.cu @@ -64,7 +64,7 @@ __global__ void gpuDecoderPreprocess(const float *H, }; const float3 rgb = isBGR ? 
make_float3(px.z, px.y, px.x) - : make_float3(px.x, px.y, px.z); + : make_float3(px.x, px.y, px.z); output[n * 0 + m] = rgb.x * multiplier + min_value; output[n * 1 + m] = rgb.y * multiplier + min_value; @@ -127,8 +127,10 @@ cudaError_t DecoderPreprocess::launchDecoderPreprocess(const float *hH, void *in } else { - // Mono8 option - 1 channel grayscale image + // Mono8 option - 1 channel grayscale image, in either uint8_t or float format if (format == imageFormat::IMAGE_MONO8) + gpuDecoderPreprocess<<>>(m_dH, (uint8_t *)input, inputWidth, inputHeight, output, outputWidth, outputHeight, multiplier, range.x); + else if (format == imageFormat::IMAGE_MONO32F) gpuDecoderPreprocess<<>>(m_dH, (float *)input, inputWidth, inputHeight, output, outputWidth, outputHeight, multiplier, range.x); else return cudaErrorInvalidValue; diff --git a/zebROS_ws/src/deeptag_ros/src/decoder_test.cpp b/zebROS_ws/src/deeptag_ros/src/decoder_test.cpp new file mode 100644 index 000000000..7ca656d70 --- /dev/null +++ b/zebROS_ws/src/deeptag_ros/src/decoder_test.cpp @@ -0,0 +1,142 @@ +#include + +#include "deeptag_ros/cuda_event_timing.h" +#include "deeptag_ros/cuda_utils.h" +#include "deeptag_ros/gpu_image_wrapper.h" +#include "deeptag_ros/marker_dict.h" +#include "deeptag_ros/stag_decoder.h" + +constexpr size_t MARKER_GRID_SIZE = 6; + +static cv::Mat getTag(const cv::Mat &image, const size_t outputHW, const cv::Mat H) +{ + cv::Mat tag; + warpPerspective(image, tag, H, cv::Size(outputHW, outputHW)); + return tag; +} + +template +static void writeStage2Debug(cv::Mat &image, + const PointsAndIDs &keypointsAndIds, + const uint16_t tagId) +{ + for (size_t kp = 0; kp < keypointsAndIds.m_point.size(); kp++) + { + const auto id = keypointsAndIds.m_id[kp]; + cv::circle(image, + cv::Point2d(keypointsAndIds.m_point[kp].x, keypointsAndIds.m_point[kp].y), + 3, + (id < 0) ? + cv::Scalar(255, 0, 0) : (id == 0) ? 
cv::Scalar(0, 0, 255) + : cv::Scalar(0, 255, 0)); + } + std::stringstream s; + s << tagId; + cv::putText(image, s.str(), cv::Point(5, 35), 0, 1.5, cv::Scalar(0, 255, 255), 2); +} + +template +void visualizeStage2(cv::Mat &image, + const size_t outputHW, + const std::vector, 2>> &result) +{ + cv::Mat output(outputHW * 2, // rows + outputHW * std::max(result.size(), static_cast(1)), // cols + CV_8UC3, + cv::Scalar(255, 255, 255)); + if (result.empty()) + { + image = output; + return; + } + cv::Mat tag; + // Arrange tags horizontally, with the first pass of the tag decode on top and second on the bottom + for (size_t i = 0; i < result.size(); i++) + { + tag = getTag(image, outputHW, result[i][0].m_HCrop); + writeStage2Debug(tag, result[i][0].m_keypointsWithIds, result[i][0].m_tagId); + tag.copyTo(output(cv::Rect(i * outputHW, 0, outputHW, outputHW))); + + tag = getTag(image, outputHW, result[i][1].m_HCrop); + writeStage2Debug(tag, result[i][1].m_keypointsWithIds, result[i][1].m_tagId); + tag.copyTo(output(cv::Rect(i * outputHW, outputHW, outputHW, outputHW))); + } + image = output; +} +int main(int argc, char **argv) +{ + const auto cameraMatrix = (cv::Mat_(3, 3) << 128., 0.0, 128., 0.0, 128., 128., 0.0, 0.0, 1.0); + const auto distCoeffs = (cv::Mat_(1, 8) << 0, 0, 0, 0, 0, 0, 0, 0); + Timings timings{}; + ArucoMarkerDict arucoMarkerDict{cv::aruco::PREDEFINED_DICTIONARY_NAME::DICT_APRILTAG_36h11}; + ArucoSTagDecoder sTagDecoder{arucoMarkerDict, cameraMatrix, distCoeffs, timings}; + + sTagDecoder.initEngine("/home/ubuntu/900RobotCode/zebROS_ws/src/deeptag_ros/models", "apriltag_decoder_mono.onnx"); + + const std::string inputImage = argv[1]; + auto cpuImg = cv::imread(inputImage, cv::IMREAD_GRAYSCALE); + if (cpuImg.empty()) + { + throw std::runtime_error("Unable to read image at path: " + inputImage); + } + std::vector> detectInputs; + detectInputs.emplace_back(); + detectInputs[0].emplace_back(); + detectInputs[0][0].upload(cpuImg, sTagDecoder.getCudaStream()); + std::vector> rois; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = cv::Point2d{.15 * 256, .85 * 256}; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = cv::Point2d{.15 * 256, .85 * 256}; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = cv::Point2d{.15 * 256, .85 * 256}; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = cv::Point2d{.15 * 256, .85 * 256}; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = cv::Point2d{.15 * 256, .85 * 256}; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = cv::Point2d{.15 * 256, .85 * 256}; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = 
cv::Point2d{.15 * 256, .85 * 256}; + rois.emplace_back(); + rois[0][0] = cv::Point2d{.15 * 256, .15 * 256}; + rois[0][1] = cv::Point2d{.85 * 256, .15 * 256}; + rois[0][2] = cv::Point2d{.85 * 256, .85 * 256}; + rois[0][3] = cv::Point2d{.15 * 256, .85 * 256}; + for (size_t i = 0; i < 1000; i++) + { + const auto decodedTags = sTagDecoder.detectTags(detectInputs, rois); + for (const auto &decodedTag : decodedTags) + { + std::cout << "Tag " << decodedTag[1].m_tagId << std::endl; + } + } + cv::Mat stage2DebugImg = cpuImg.clone(); + cv::cvtColor(stage2DebugImg, stage2DebugImg, cv::COLOR_GRAY2BGR); + visualizeStage2(stage2DebugImg, sTagDecoder.getModelSize().x, decodedTags); + cv::imshow((inputImage + "_stage_2").c_str(), stage2DebugImg); + + cv::waitKey(0); + return 0; +} \ No newline at end of file diff --git a/zebROS_ws/src/deeptag_ros/src/deeptag_impls.cpp b/zebROS_ws/src/deeptag_ros/src/deeptag_impls.cpp index 0cc9aaf77..c720af94f 100644 --- a/zebROS_ws/src/deeptag_ros/src/deeptag_impls.cpp +++ b/zebROS_ws/src/deeptag_ros/src/deeptag_impls.cpp @@ -83,14 +83,12 @@ std::vector DeepTagImpl channels; - channels.push_back(hR); - channels.push_back(hG); - channels.push_back(hB); - cv::Mat fin_img; - cv::merge(channels, fin_img); - return fin_img; + if (inputDims.d[1] == 1) + { + cv::Mat fin_img(outputHeight, outputWidth, CV_32FC1); + cudaSafeCall(cudaMemcpyAsync(fin_img.data, destBuffer + imageIdx * channelStride, channelStride * sizeof(float), cudaMemcpyDeviceToHost, getCudaStream())); + return fin_img; + } + else + { + cv::Mat hR(outputHeight, outputWidth, CV_32FC1); + cv::Mat hG(outputHeight, outputWidth, CV_32FC1); + cv::Mat hB(outputHeight, outputWidth, CV_32FC1); + cudaSafeCall(cudaMemcpyAsync(hR.data, destBuffer + (imageIdx * 3 + 0) * channelStride, channelStride * sizeof(float), cudaMemcpyDeviceToHost, getCudaStream())); + cudaSafeCall(cudaMemcpyAsync(hG.data, destBuffer + (imageIdx * 3 + 1) * channelStride, channelStride * sizeof(float), cudaMemcpyDeviceToHost, getCudaStream())); + cudaSafeCall(cudaMemcpyAsync(hB.data, destBuffer + (imageIdx * 3 + 2) * channelStride, channelStride * sizeof(float), cudaMemcpyDeviceToHost, getCudaStream())); + cudaSafeCall(cudaStreamSynchronize(getCudaStream())); + std::vector channels; + channels.push_back(hR); + channels.push_back(hG); + channels.push_back(hB); + cv::Mat fin_img; + cv::merge(channels, fin_img); + return fin_img; + } } Int8EntropyCalibrator2::Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, diff --git a/zebROS_ws/src/deeptag_ros/src/stag_decoder.cpp b/zebROS_ws/src/deeptag_ros/src/stag_decoder.cpp index d89641c6d..2e1ef0fd6 100644 --- a/zebROS_ws/src/deeptag_ros/src/stag_decoder.cpp +++ b/zebROS_ws/src/deeptag_ros/src/stag_decoder.cpp @@ -67,7 +67,7 @@ void STagDecoder::initEngine(const std::string &modelPat // (although TODO : those might be duplicates of data in the KeyPoints) template void STagDecoder::runInference(std::vector> &stage2KeypointGroups, - std::vector> &stage2Corners, + std::vector> &stage2Corners, const std::vector> &detectInputs, const tcb::span> &rois) { @@ -93,28 +93,38 @@ void STagDecoder::runInference(std::vectorgetCudaStream()); // model input size and image size are the same + // Divide x&y by 128 to get a 2x2 grid giving four corner points + // of the outer black border of the tag m_stage2CornerPrior.generate(getModelSize(), 128, getModelSize(), {}, m_decodeEngine->getCudaStream()); m_timing.end("stage2_corner_priors"); + // Grid priors create a 32x32 grid of anchor points for keypoint 
detection + // Each has an associated offset from the anchor point along with a class confidence + // (background or foreground black or white) m_timing.start("stage2_grid_priors", m_decodeEngine->getCudaStream()); m_stage2GridPrior.generate(getModelSize(), 8, getModelSize(), {}, m_decodeEngine->getCudaStream()); m_timing.end("stage2_grid_priors"); for (size_t roiNum = 0; roiNum < rois.size(); roiNum++) { + // Run softmax on the keypoint grid output, giving a confidence for + // each keypoint being a black or white corner. We drop any keypoints + // which are part of the background class. m_timing.start("stage2_softmax", m_decodeEngine->getCudaStream()); m_stage2DecoderSoftmax.compute(m_decodeEngine->getBufferByName("confidences_pred", roiNum), 32 * 32, m_decodeEngine->getCudaStream()); m_timing.end("stage2_softmax"); + // Grab keypoint coordinates by applying offsets to the grid anchor points + // Filter out keypoints with low confidence m_timing.start("stage2_keypoint_detect", m_decodeEngine->getCudaStream()); m_confidenceFilter.detect({m_stage2DecoderSoftmax.getOutput().data(), m_decodeEngine->getBufferByName("locations_pred", roiNum), nullptr /* not used */}, m_stage2GridPrior.getOutput(), 0.05f, // centerVariance - 0.0f, // sizeVariance - not used for corners + 0.0f, // sizeVariance - not used for keypoints 0.6f, // min confidence // TODO : configurable m_decodeEngine->getCudaStream(), buffersResized); @@ -132,10 +142,14 @@ void STagDecoder::runInference(std::vectorgetCudaStream()); m_keypointGrouper.compute(m_confidenceFilter.getOutput(), 12, 0.0, m_decodeEngine->getCudaStream()); m_timing.end("stage2_keypoint_group"); + // Compute corner locations as offsets from the corner prior anchor points + // Do this here so the memcpy from the keypoint grouper above has time + // to possibly finish m_timing.start("stage2_corner_locations", m_decodeEngine->getCudaStream()); m_corners.compute(m_decodeEngine->getBufferByName("corner_locations_pred", roiNum), m_stage2CornerPrior.getOutput(), @@ -143,6 +157,7 @@ void STagDecoder::runInference(std::vectorgetCudaStream()); m_timing.end("stage2_corner_locations"); + // Grab the host outputs of each of the above operations m_timing.start("stage2_keypoint_group_out", m_decodeEngine->getCudaStream()); const tcb::span hStage2KeypointGroup = m_keypointGrouper.getOutput(); for (const auto &k : hStage2KeypointGroup) @@ -153,10 +168,7 @@ void STagDecoder::runInference(std::vectorgetCudaStream()); const tcb::span hStage2Corners = m_corners.getHostOutput(); - for (const auto &c : hStage2Corners) - { - stage2Corners.back().push_back(c); - } + std::copy(hStage2Corners.begin(), hStage2Corners.end(), stage2Corners.back().begin()); m_timing.end("stage2_corners_out"); } } @@ -164,12 +176,17 @@ void STagDecoder::runInference(std::vector std::vector, 2>> STagDecoder::detectTags(const std::vector> &detectInputs, - const std::vector> &rois) + const std::vector> &rois) { + // Array of tag corners detected in the input image std::vector> thisRois{rois}; + // Output of model inference on the extracted rois std::vector> stage2KeypointGroups; - std::vector> stage2Corners; + std::vector> stage2Corners; + + // Decoded tag info. 
2 iterations per tag to refine corners std::vector, 2>> ret; + for (size_t iter = 0; iter < 2; iter++) { #ifdef DEBUG @@ -190,8 +207,8 @@ std::vector, 2>> STagDecoder, 2>{}); - ret[retIdx][0].m_HCrop = m_decodeEngine->getH(ii); } + ret[retIdx][iter].m_HCrop = m_decodeEngine->getH(ii); ret[retIdx][iter].m_isValid = stage2KeypointGroups[retIdx].size() > 0; #ifdef DEBUG std::cout << "iter = " << iter << " ret[" << retIdx << "].m_isValid = " << ret[retIdx].m_isValid << std::endl; @@ -204,7 +221,10 @@ std::vector, 2>> STagDecodergetCudaStream()); double matchRatio; constexpr auto FINE_GRID_SIZE = MARKER_DICT::getGridSize() + 2; - std::array orderedFineGridPointsIds; + PointsAndIDs orderedFineGridPointsIds; + // Assign the points detected in the crop to actual grid + // points in the proposed tag. This is done by matching the + // detected keypoints to the nearest grid points in the tag m_markerDict.getUnitTagTemplate().matchFineGrid(matchRatio, orderedFineGridPointsIds, stage2KeypointGroups[retIdx], @@ -219,9 +239,9 @@ std::vector, 2>> STagDecoder m_minGridMatchRatio) { - m_timing.start("stage2_fillemptyids", m_decodeEngine->getCudaStream()); + // m_timing.start("stage2_fillemptyids", m_decodeEngine->getCudaStream()); //fillEmptyIds(orderedFineGridPointsIds, stage2KeypointGroups[retIdx]); - m_timing.end("stage2_fillemptyids"); + // m_timing.end("stage2_fillemptyids"); m_timing.start("stage2_updatecornersinimage", m_decodeEngine->getCudaStream()); const auto roiUpdated = m_markerDict.getUnitTagTemplate().updateCornersInImage(orderedFineGridPointsIds, @@ -233,44 +253,32 @@ std::vector, 2>> STagDecodergetCudaStream()); thisRois[retIdx] = roiUpdated; ret[retIdx][iter].m_roi = roiUpdated; - std::array tagBits; - for (size_t i = 0; i < orderedFineGridPointsIds.size(); i++) - { - tagBits[i] = orderedFineGridPointsIds[i].m_id; - } + int hammingDist = 2; // TODO - configurable, dynamic reconfig potential - m_markerDict.getMainIdx(ret[retIdx][iter].m_mainIdx, ret[retIdx][iter].m_tagId, ret[retIdx][iter].m_binaryId, tagBits, hammingDist); + // Decode tag bits into a tagID and binaryID + // Main index is the rotation of the tag (in 90* increments) + m_markerDict.getMainIdx(ret[retIdx][iter].m_mainIdx, + ret[retIdx][iter].m_tagId, + ret[retIdx][iter].m_binaryId, + orderedFineGridPointsIds.m_id, + hammingDist); #ifdef DEBUG std::cout << "mainIdx = " << ret[retIdx].m_mainIdx << " tagId = " << ret[retIdx].m_tagId << std::endl; #endif m_timing.end("stage2_getmainindex"); m_timing.start("stage2_reorderpointswithmainidx", m_decodeEngine->getCudaStream()); - std::array orderedKptsWithIds; m_markerDict.getUnitTagTemplate().reorderPointsWithMainIdx(ret[retIdx][iter].m_keypointsWithIds, // [re] orderedFineGridPointsIds - orderedKptsWithIds, ret[retIdx][iter].m_mainIdx, orderedFineGridPointsIds); - for (size_t i = 0; i < ret[retIdx][iter].m_keypointsWithIds.size(); i++) - { - ret[retIdx][iter].m_keypointsInImage[i] = ret[retIdx][iter].m_keypointsWithIds[i].m_point; - } - warpPerspectivePts(ret[retIdx][0].m_HCrop.inv(), ret[retIdx][iter].m_keypointsInImage); + warpPerspectivePts(ret[retIdx][iter].m_HCrop.inv(), ret[retIdx][iter].m_keypointsWithIds.m_point); m_timing.end("stage2_reorderpointswithmainidx"); #ifdef DEBUG - for (const auto &r : orderedFineGridPointsIds) - { - std::cout << " O : " << r.m_point.x << " " << r.m_point.y << " " << r.m_id << std::endl; - } - for (const auto &r : ret[retIdx].m_keypointsWithIds) - { - std::cout << " R : " << r.m_point.x << " " << r.m_point.y << " " << r.m_id << std::endl; - } - 
for (const auto &r : orderedKptsWithIds) - { - std::cout << " K : " << r.m_point.x << " " << r.m_point.y << " " << r.m_id << std::endl; - } + std::cout << "orderedFineGripPointsIds" << std::endl + << orderedFineGridPointsIds << std::endl; + std::cout << "ret[retIdx][iter].m_keypointsWithIds.m_point" << std::endl + << ret[retIdx][iter].m_keypointsWithIds << std::endl; for (const auto &kg : stage2KeypointGroups[retIdx]) { kg.print(); @@ -279,66 +287,38 @@ std::vector, 2>> STagDecodergetDebugImage(ii); - for (const auto &kg : stage2KeypointGroups[retIdx]) - { - cv::circle(debugImg, - cv::Point2d(kg.m_keypoint.x, kg.m_keypoint.y), - 3, - (kg.m_label < 0) ? cv::Scalar(128, 0, 0) : (kg.m_label == 0) ? cv::Scalar(0, 0, 128) - : cv::Scalar(0, 128, 0)); - } - if (stage2Corners[retIdx].size() >= 4) - { - const std::array orderedCorners = {stage2Corners[retIdx][0], - stage2Corners[retIdx][1], - stage2Corners[retIdx][3], - stage2Corners[retIdx][2]}; - cv::line(debugImg, cv::Point2f(orderedCorners[0].x, orderedCorners[0].y), cv::Point2f(orderedCorners[1].x, orderedCorners[1].y), cv::Scalar(0, 128, 0), 2); - cv::line(debugImg, cv::Point2f(orderedCorners[1].x, orderedCorners[1].y), cv::Point2f(orderedCorners[2].x, orderedCorners[2].y), cv::Scalar(0, 128, 0), 2); - cv::line(debugImg, cv::Point2f(orderedCorners[2].x, orderedCorners[2].y), cv::Point2f(orderedCorners[3].x, orderedCorners[3].y), cv::Scalar(0, 128, 0), 2); - cv::line(debugImg, cv::Point2f(orderedCorners[3].x, orderedCorners[3].y), cv::Point2f(orderedCorners[0].x, orderedCorners[0].y), cv::Scalar(0, 128, 0), 2); - } - std::stringstream s; - s << ret[retIdx].m_tagId; - cv::putText(debugImg, s.str(), cv::Point(5, 35), 0, 1.5, cv::Scalar(0, 255, 255), 2); - s.str(""); - s << "Debug_Iter" << iter << "Idx" << retIdx; - cv::imshow(s.str().c_str(), debugImg); #endif } else { ret[retIdx][iter].m_isValid = false; } - } + } // if tag is valid retIdx += 1; - } - } + } // loop over tags in batch + } // loop over batch in batches } -#ifdef DEBUG - cv::waitKey(20); -#endif return ret; } template -void STagDecoder::fillEmptyIds(std::array &orderedFineGridPointsIds, +void STagDecoder::fillEmptyIds(PointsAndIDs &orderedFineGridPointsIds, const tcb::span &fineGridPointsWithIdsCandidates) const { - for (auto &kpt1 : orderedFineGridPointsIds) + for (size_t i = 0; i < orderedFineGridPointsIds.m_point.size(); i++) { - if (kpt1.m_id == -1) + const auto &kpt1 = orderedFineGridPointsIds.m_point[i]; + auto kid1 = orderedFineGridPointsIds.m_id[i]; + if (kid1 == -1) { - double minDist = 1000.; + auto minDist = std::numeric_limits::max(); for (const auto &kpt2 : fineGridPointsWithIdsCandidates) { - const auto dist = hypot(kpt1.m_point.x - kpt2.m_keypoint.x, kpt1.m_point.y - kpt2.m_keypoint.y); + const auto dist = hypot(kpt1.x - kpt2.m_keypoint.x, kpt1.y - kpt2.m_keypoint.y); if (dist < minDist) { minDist = dist; - kpt1.m_id = kpt2.m_label; + kid1 = kpt2.m_label; } } } @@ -363,6 +343,12 @@ ushort2 STagDecoder::getModelSize(void) const return ushort2{inputDim.d[2], inputDim.d[3]}; } +template +cudaStream_t STagDecoder::getCudaStream(void) +{ + return m_decodeEngine->getCudaStream(); +} + #include "deeptag_ros/marker_dict.h" template class STagDecoder, 4>; template class STagDecoder, 5>; diff --git a/zebROS_ws/src/deeptag_ros/src/stage2_corners.cu b/zebROS_ws/src/deeptag_ros/src/stage2_corners.cu index 0a89d2648..d387f9943 100644 --- a/zebROS_ws/src/deeptag_ros/src/stage2_corners.cu +++ b/zebROS_ws/src/deeptag_ros/src/stage2_corners.cu @@ -17,7 +17,7 @@ __global__ static void 
calculateStage2CornerKernel(float2 *output, } //printf("idx = %d, input[idx] = %f %f, centerVariance = %f, priors[idx] = %f %f\n", idx, input[idx], input[idx + length], centerVariance, priors[idx].x, priors[idx].y); output[idx].x = (input[idx] * centerVariance + priors[idx].m_scale.x) * priors[idx].m_imageSize.x + priors[idx].m_offset.x; - output[idx].y = (input[idx + length] * centerVariance + priors[idx].m_scale.y) * priors[idx].m_imageSize.x + priors[idx].m_offset.y; + output[idx].y = (input[idx + length] * centerVariance + priors[idx].m_scale.y) * priors[idx].m_imageSize.y + priors[idx].m_offset.y; } Stage2Corners::Stage2Corners(void) @@ -45,11 +45,11 @@ void Stage2Corners::compute(const float *input, cudaSafeCall(cudaEventRecord(m_dataReadyEvent, cudaStream)); } -const tcb::span Stage2Corners::getDeviceOutput(void) -{ - cudaEventSynchronize(m_dataReadyEvent); - return tcb::span(m_dResults, 4); -} +// const tcb::span Stage2Corners::getDeviceOutput(void) +// { +// cudaEventSynchronize(m_dataReadyEvent); +// return tcb::span(m_dResults, 4); +// } const tcb::span Stage2Corners::getHostOutput(void) { diff --git a/zebROS_ws/src/deeptag_ros/src/unit_tag_template.cpp b/zebROS_ws/src/deeptag_ros/src/unit_tag_template.cpp index 695015fb9..5e6ab6d3f 100644 --- a/zebROS_ws/src/deeptag_ros/src/unit_tag_template.cpp +++ b/zebROS_ws/src/deeptag_ros/src/unit_tag_template.cpp @@ -1,4 +1,4 @@ -// #include +#include #include #include "opencv2/calib3d.hpp" #include "deeptag_ros/distorted_h_transform.h" @@ -11,21 +11,21 @@ static constexpr size_t maxWarpTry = 3; template -std::array iterativeMatchAndWarp(const tcb::span &unorderedPoints, - const std::array &unitPoints, - const std::array &unitCorners, - const std::array &cptsInCrop, // ordered corners in crop - const std::array &cameraMatrix, - const std::array &distCoeffs, - const cv::Mat &H, - const std::vector &HListForCtpsInCrop, - const size_t maxWarpTry); +PointsAndIDs iterativeMatchAndWarp(const tcb::span &unorderedPoints, + const std::array &unitPoints, + const std::array &unitCorners, + const std::array &cptsInCrop, // ordered corners in crop + const std::array &cameraMatrix, + const std::array &distCoeffs, + const cv::Mat &H, + const std::vector &HListForCtpsInCrop, + const size_t maxWarpTry); template static void checkMatchRatio(double &matchRatio, int &count, int &totalCount, - const std::array &orderedPoints, + const PointsAndIDs &orderedPoints, const int unorderedPointsNum); template @@ -36,7 +36,7 @@ UnitTagTemplate void UnitTagTemplate::matchFineGrid(double &maxMatchRatio, - std::array &bestOrderedPoints, + PointsAndIDs &bestOrderedPoints, const tcb::span &unorderedPoints, const cv::Mat &H, const tcb::span &stage2PredCorners, @@ -44,10 +44,6 @@ void UnitTagTemplate -std::array UnitTagTemplate::updateCornersInImage(const std::array &orderedPointsAndIds, +std::array UnitTagTemplate::updateCornersInImage(const PointsAndIDs &orderedPointsAndIds, const cv::Mat &HCrop, const cv::Mat &cameraMatrix, const cv::Mat &distCoeffs) const { const auto unitPoints = m_unitTags.getFineGridPoints(0, true, STEP_ELEM_NUM); - std::array orderedPoints; - for (size_t i = 0; i < orderedPointsAndIds.size(); i++) - { - orderedPoints[i] = orderedPointsAndIds[i].m_point; - } const auto unitCorners = m_unitTags.getOrderedCorners(); - const auto cornersInCropUpdated = controlpointsToKeypointsInCropWithH(unitPoints, orderedPoints, unitCorners, cameraMatrix, distCoeffs, HCrop); + const auto cornersInCropUpdated = controlpointsToKeypointsInCropWithH(unitPoints, + 
orderedPointsAndIds.m_point, + unitCorners, + cameraMatrix, + distCoeffs, + HCrop); return warpPerspectivePts(HCrop.inv(), cornersInCropUpdated); } @@ -240,15 +237,15 @@ static void matchAndWarp(cv::Mat &HNew, } template -std::array iterativeMatchAndWarp(const tcb::span &stage2KeypointGroups, // keypoints in crop - const std::array &orderedKptsGt, // unit points - const std::array &cptsGt, // unit corners - const std::array &cptsInCrop, // ordered corners in crop - const cv::Mat &cameraMatrix, - const cv::Mat &distCoeffs, - const cv::Mat &H, - const std::vector &HListForCtpsInCrop, - const size_t maxWarpTry) +PointsAndIDs iterativeMatchAndWarp(const tcb::span &stage2KeypointGroups, // keypoints in crop + const std::array &orderedKptsGt, // unit points + const std::array &cptsGt, // unit corners + const std::array &cptsInCrop, // ordered corners in crop + const cv::Mat &cameraMatrix, + const cv::Mat &distCoeffs, + const cv::Mat &H, + const std::vector &HListForCtpsInCrop, + const size_t maxWarpTry) { #ifdef DEBUG std::cout << "iterativeMatchAndWarp" << std::endl; @@ -416,14 +413,11 @@ std::array iterativeMatchAndWarp( #endif } } - std::array orderedKptsWithIds; + PointsAndIDs orderedKptsWithIds; //std::cout << "HCurr = " << HCurr << std::endl; if (HCurr.empty()) { - for (size_t ii = 0; ii < orderedKptsWithIds.size(); ii++) - { - orderedKptsWithIds[ii] = PointsAndIDs{orderedKptsGt[ii].x, orderedKptsGt[ii].y, -1}; - } + orderedKptsWithIds.m_point = orderedKptsGt; } else { @@ -440,28 +434,26 @@ std::array iterativeMatchAndWarp( if (matchFlagsCandBest[ii]) { const auto &kp = stage2KeypointGroups[matchIdsBest[ii]]; - orderedKptsWithIds[ii] = PointsAndIDs{kp.m_keypoint.x, - kp.m_keypoint.y, - kp.m_label, - kp.m_score}; + orderedKptsWithIds.m_point[ii].x = kp.m_keypoint.x; + orderedKptsWithIds.m_point[ii].y = kp.m_keypoint.y; + orderedKptsWithIds.m_id[ii] = kp.m_label; + orderedKptsWithIds.m_score[ii] = kp.m_score; } else { - orderedKptsWithIds[ii] = PointsAndIDs{orderedKptCandidatesWarp[ii].x, - orderedKptCandidatesWarp[ii].y, - -1}; + orderedKptsWithIds.m_point[ii] = orderedKptCandidatesWarp[ii]; + orderedKptsWithIds.m_id[ii] = -1; } } } #ifdef DEBUG std::cout << "orderedKptsWithIds" << std::endl; - for (const auto &o : orderedKptsWithIds) + for (size_t i = 0; i < orderedKptsWithIds.m_point.size(); i++) { - std::cout << o.m_point.x << " " << o.m_point.y << " " << o.m_id << std::endl; + std::cout << orderedKptsWithIds.m_point[i].x << " " << orderedKptsWithIds.m_point[i].y << " " << orderedKptsWithIds.m_id[i] << std::endl; } #endif - return orderedKptsWithIds; } @@ -469,18 +461,18 @@ template static void checkMatchRatio(double &matchRatio, int &count, int &totalCount, - const std::array &orderedPoints, + PointsAndIDs &orderedPoints, const int unorderedPointsNum) { count = 0; - for (const auto &op: orderedPoints) + for (const auto &id: orderedPoints.m_id) { - if (op.m_id >= 0) + if (id >= 0) { count += 1; } } - totalCount = std::max(static_cast(orderedPoints.size()), count); + totalCount = std::max(static_cast(orderedPoints.m_id.size()), count); matchRatio = static_cast(count) / totalCount; #ifdef DEBUG std::cout << "checkMatchRatio : count = " << count << " unorderedPointNum = " << unorderedPointsNum << " totalCount = " << totalCount << " matchRatio = " << matchRatio << std::endl; diff --git a/zebROS_ws/src/gpu_apriltag/CMakeLists.txt b/zebROS_ws/src/gpu_apriltag/CMakeLists.txt index d0b6bdcd2..cd9751741 100644 --- a/zebROS_ws/src/gpu_apriltag/CMakeLists.txt +++ 
b/zebROS_ws/src/gpu_apriltag/CMakeLists.txt @@ -5,6 +5,8 @@ enable_language(CUDA) include("../cmake_modules/CMakeOpt.cmake") set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations") +add_compile_options(-Wno-deprecated-declarations) ## Find catkin macros and libraries ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) ## is used, also find other catkin packages @@ -25,14 +27,16 @@ find_package(catkin REQUIRED COMPONENTS ## System dependencies are found with CMake's conventions # find_package(Boost REQUIRED COMPONENTS system) - set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) + # Use the correct version of CUDA set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda) +find_package(TensorRT REQUIRED) find_package(CUDA REQUIRED) find_package(OpenCV REQUIRED) find_library(LIBNVTOOLSEXT nvToolsExt PATHS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) @@ -147,6 +151,7 @@ include_directories( ../../../cccl/thrust ../../../cccl/libcudacxx/include ${CUDA_INCLUDE_DIRS} + ${TensorRT_INCLUDE_DIRS} ${catkin_INCLUDE_DIRS} ${FRC971} ${FRC971}/third_party/apriltag @@ -168,12 +173,31 @@ add_library(${PROJECT_NAME} src/stubs.cpp ${FRC971}/frc971/orin/apriltag.cc ${FRC971}/frc971/orin/apriltag_detect.cc + ${FRC971}/frc971/orin/aruco_dict.cpp + ${FRC971}/frc971/orin/confidence_filter.cu ${FRC971}/frc971/orin/cuda.cc ${FRC971}/frc971/orin/cuda_event_timing.cc ${FRC971}/frc971/orin/cuda_utils.cc ${FRC971}/frc971/orin/labeling_allegretti_2019_BKE.cc ${FRC971}/frc971/orin/line_fit_filter.cc ${FRC971}/frc971/orin/threshold.cc + ${FRC971}/frc971/orin/engine.cpp + ${FRC971}/frc971/orin/decoder_engine.cpp + ${FRC971}/frc971/orin/decoder_preprocess.cu + ${FRC971}/frc971/orin/decoder_softmax.cu + ${FRC971}/frc971/orin/grid_prior.cu + ${FRC971}/frc971/orin/marker_dict.cpp + ${FRC971}/frc971/orin/stag_decoder.cpp + ${FRC971}/frc971/orin/stage2_corners.cu + ${FRC971}/frc971/orin/stage2_keypoint_group.cu + ${FRC971}/frc971/orin/stage2_keypoint_group_trust.cpp + ${FRC971}/frc971/orin/stage2_keypoint_trust.cu + ${FRC971}/frc971/orin/stage2_keypoint.cu + ${FRC971}/frc971/orin/suppress_and_average_keypoints.cu + ${FRC971}/frc971/orin/unit_arucotag.cpp + ${FRC971}/frc971/orin/unit_chessboard_tag.cpp + ${FRC971}/frc971/orin/unit_tag_template.cpp + ${FRC971}/frc971/orin/warp_perspective_points.cpp ${FRC971}/third_party/apriltag/apriltag.c ${FRC971}/third_party/apriltag/apriltag_pose.c ${FRC971}/third_party/apriltag/apriltag_quad_thresh.c @@ -200,6 +224,7 @@ target_link_libraries(${PROJECT_NAME} ${catkin_LIBRARIES} ${OpenCV_LIBS} ${CUDA_LIBRARIES} + ${TensorRT_LIBRARIES} ${LIBNVTOOLSEXT} ${glog_LIBRARIES} # absl::flags_internal diff --git a/zebROS_ws/src/gpu_apriltag/cmake/FindTensorRT.cmake b/zebROS_ws/src/gpu_apriltag/cmake/FindTensorRT.cmake new file mode 100644 index 000000000..087cffc17 --- /dev/null +++ b/zebROS_ws/src/gpu_apriltag/cmake/FindTensorRT.cmake @@ -0,0 +1,87 @@ +# source: +# https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake + +# This module defines the following variables: +# +# :: +# +# TensorRT_INCLUDE_DIRS +# TensorRT_LIBRARIES +# TensorRT_FOUND +# +# :: +# +# TensorRT_VERSION_STRING - version (x.y.z) +# TensorRT_VERSION_MAJOR - major version (x) +# TensorRT_VERSION_MINOR - minor version (y) +# TensorRT_VERSION_PATCH - 
patch version (z) +# +# Hints +# ^^^^^ +# A user may set ``TensorRT_DIR`` to an installation root to tell this module where to look. +# +set(_TensorRT_SEARCHES) + +if(TensorRT_DIR) + set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_DIR} NO_DEFAULT_PATH) + list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT) +endif() + +# appends some common paths +set(_TensorRT_SEARCH_NORMAL + PATHS "/usr" + ) +list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL) + +# Include dir +foreach(search ${_TensorRT_SEARCHES}) + find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include) +endforeach() + +if(NOT TensorRT_LIBRARY) + foreach(search ${_TensorRT_SEARCHES}) + find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib) + endforeach() +endif() + +if(NOT TensorRT_PARSERS_LIBRARY) + foreach(search ${_TensorRT_SEARCHES}) + find_library(TensorRT_NVPARSERS_LIBRARY NAMES nvparsers ${${search}} PATH_SUFFIXES lib) + endforeach() +endif() + +if(NOT TensorRT_NVONNXPARSER_LIBRARY) + foreach(search ${_TensorRT_SEARCHES}) + find_library(TensorRT_NVONNXPARSER_LIBRARY NAMES nvonnxparser ${${search}} PATH_SUFFIXES lib) + endforeach() +endif() + +mark_as_advanced(TensorRT_INCLUDE_DIR) + +if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") + + string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") + set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") +endif() + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING) + +if(TensorRT_FOUND) + set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) + + if(NOT TensorRT_LIBRARIES) + set(TensorRT_LIBRARIES ${TensorRT_LIBRARY} ${TensorRT_NVONNXPARSER_LIBRARY} ${TensorRT_NVPARSERS_LIBRARY}) + endif() + + if(NOT TARGET TensorRT::TensorRT) + add_library(TensorRT::TensorRT UNKNOWN IMPORTED) + set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}") + set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}") + endif() +endif() \ No newline at end of file diff --git a/zebROS_ws/src/gpu_apriltag/models/.gitignore b/zebROS_ws/src/gpu_apriltag/models/.gitignore new file mode 100644 index 000000000..4c785b991 --- /dev/null +++ b/zebROS_ws/src/gpu_apriltag/models/.gitignore @@ -0,0 +1 @@ +*engine* diff --git a/zebROS_ws/src/gpu_apriltag/models/arucotag_decoder.calib b/zebROS_ws/src/gpu_apriltag/models/arucotag_decoder.calib new file mode 100644 index 000000000..a1e8d1384 --- /dev/null +++ b/zebROS_ws/src/gpu_apriltag/models/arucotag_decoder.calib @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d4a006023f0ec0afd03640606d8893a8f9899008183396328364cf6c74f65e +size 5054 diff --git a/zebROS_ws/src/gpu_apriltag/models/arucotag_decoder.onnx 
b/zebROS_ws/src/gpu_apriltag/models/arucotag_decoder.onnx new file mode 100644 index 000000000..b1e6dbb32 --- /dev/null +++ b/zebROS_ws/src/gpu_apriltag/models/arucotag_decoder.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29678b7526e597a35b28b35bdd845c2977e1a099f8a7c36a93b4eb68551a29cf +size 901355 diff --git a/zebROS_ws/src/tf_object_detection/src/apriltag_decoder_calibrator.py b/zebROS_ws/src/tf_object_detection/src/apriltag_decoder_calibrator.py new file mode 100644 index 000000000..6a25d9b45 --- /dev/null +++ b/zebROS_ws/src/tf_object_detection/src/apriltag_decoder_calibrator.py @@ -0,0 +1,156 @@ +"""calibrator.py +From https://raw.githubusercontent.com/jkjung-avt/tensorrt_demos/master/yolo/calibrator.py + +The original code could be found in TensorRT-7.x sample code: +"samples/python/int8_caffe_mnist/calibrator.py". I made the +modification so that the Calibrator could handle MS-COCO dataset +images instead of MNIST. +""" + +# +# Copyright 1993-2019 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO LICENSEE: +# +# This source code and/or documentation ("Licensed Deliverables") are +# subject to NVIDIA intellectual property rights under U.S. and +# international Copyright laws. +# +# These Licensed Deliverables contained herein is PROPRIETARY and +# CONFIDENTIAL to NVIDIA and is being provided under the terms and +# conditions of a form of NVIDIA software license agreement by and +# between NVIDIA and Licensee ("License Agreement") or electronically +# accepted by Licensee. Notwithstanding any terms or conditions to +# the contrary in the License Agreement, reproduction or disclosure +# of the Licensed Deliverables to any third party without the express +# written consent of NVIDIA is prohibited. +# +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THESE LICENSED DELIVERABLES. +# +# U.S. Government End Users. These Licensed Deliverables are a +# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +# 1995), consisting of "commercial computer software" and "commercial +# computer software documentation" as such terms are used in 48 +# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +# only as a commercial end item. Consistent with 48 C.F.R.12.212 and +# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +# U.S. Government End Users acquire the Licensed Deliverables with +# only those rights set forth herein. +# +# Any use of the Licensed Deliverables in individual and commercial +# software must include, in the user documentation and internal +# comments to the code, the above Disclaimer and U.S. Government End +# Users Notice. 
+ + +import os +import numpy as np +import cv2 +import pycuda.driver as cuda +import tensorrt as trt +from random import shuffle + + +def _preprocess_yolo(img, input_shape): + """Preprocess an image before TRT YOLO inferencing. + + # Args + img: uint8 numpy array of shape either (img_h, img_w, 3) + or (img_h, img_w) + input_shape: a tuple of (H, W) + + # Returns + preprocessed img: float32 numpy array of shape (H, W) + """ + if img.ndim != 2: + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if img.shape != input_shape: + img = cv2.resize(img, (input_shape[1], input_shape[0])) + img = img.astype(np.float32) / 255.0 + return img + + +class ApriltagDecoderEntropyCalibrator(trt.IInt8EntropyCalibrator2): + """ApriltagDecoderEntropyCalibrator + + This class implements TensorRT's IInt8EntropyCalibtrator2 interface. + It reads all images from the specified directory and generates INT8 + calibration data for YOLO models accordingly. + """ + + def __init__(self, img_dir, net_hw, cache_file, batch_size=1): + if not os.path.isdir(img_dir) and not cache_file: # is there is a calibration file, we don't care about the images dir + raise FileNotFoundError('%s does not exist' % img_dir) + if len(net_hw) != 2 or net_hw[0] % 32 or net_hw[1] % 32: + raise ValueError('bad net shape: %s' % str(net_hw)) + + super().__init__() # trt.IInt8EntropyCalibrator2.__init__(self) + + self.img_dir = img_dir + self.net_hw = net_hw + self.cache_file = cache_file + self.batch_size = batch_size + self.blob_size = net_hw[0] * net_hw[1] * np.dtype('float32').itemsize * batch_size + + self.pngs = [f for f in os.listdir(img_dir) if f.endswith('.png')] + shuffle(self.pngs) + # The number "500" is NVIDIA's suggestion. See here: + # https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#optimizing_int8_c + if len(self.pngs) < 500: + print('WARNING: found less than 500 images in %s!' % img_dir) + else: + self.pngs = self.pngs[:9500] + self.current_index = 0 + + # Allocate enough memory for a whole batch. + self.device_input = cuda.mem_alloc(self.blob_size) + + def __del__(self): + del self.device_input # free CUDA memory + + def get_batch_size(self): + return self.batch_size + + def get_batch(self, names): + if self.current_index + self.batch_size > len(self.pngs): + return None + # current_batch = int(self.current_index / self.batch_size) + + batch = [] + for i in range(self.batch_size): + img_path = os.path.join( + self.img_dir, self.pngs[self.current_index + i]) + img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) + assert img is not None, 'failed to read %s' % img_path + batch.append(_preprocess_yolo(img, self.net_hw)) + batch = np.stack(batch) + + assert batch.nbytes == self.blob_size, "Calibrator: batch.nbytes / blob size mismatch" + + cuda.memcpy_htod(self.device_input, np.ascontiguousarray(batch)) + self.current_index += self.batch_size + return [self.device_input] + + def read_calibration_cache(self): + # If there is a cache, use it instead of calibrating again. + # Otherwise, implicitly return None. + if os.path.exists(self.cache_file): + with open(self.cache_file, 'rb') as f: + return f.read() + + def write_calibration_cache(self, cache): + with open(self.cache_file, 'wb') as f: + f.write(cache) \ No newline at end of file