rxv801 · lukitasxue · Jun 14, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/python/README.md b/python/README.md
@@ -41,13 +41,15 @@ on exit.
 python/
 ├── main.py                 # FastAPI + WebSocket server (not implemented yet)
 ├── models/                 # detection model files (gitignored; fetched by setup.sh)
-│   └── yolox_s.onnx        # YOLOX-S phone detector (Apache-2.0)
+│   ├── yolox_s.onnx        # YOLOX-S phone detector (Apache-2.0)
+│   └── face_landmarker.task# MediaPipe FaceLandmarker for gaze (Apache-2.0)
 └── cv/
     ├── camera.py           # owns the webcam handle: start / read / stop
     ├── detection_loop.py   # the loop: grab frame -> run detectors -> emit result
     ├── phone_detector.py   # detect_phone(frame) -> event dict (YOLOX via onnxruntime)
     ├── phone_detect_test.py# manual visual test: draws boxes on the webcam feed
-    └── gaze_detector.py    # gaze/face detection (planned)
+    ├── gaze_detector.py    # detect_gaze(frame) -> event dict (MediaPipe head pose)
+    └── gaze_detect_test.py # manual visual test: FOCUSED/DISTRACTED + head angles
 ```
 
 ### Design: why `camera.py` and `detection_loop.py` are separate
@@ -91,3 +93,28 @@ is policy that belongs in the loop/state layer, not here.
 
 The model file (`models/yolox_s.onnx`, ~34 MB) is gitignored and downloaded
 by `setup.sh`.
+
+### Gaze detection
+
+`gaze_detector.detect_gaze()` decides whether the user is looking at the
+screen, using **head pose** (which way the face points) from **MediaPipe
+FaceLandmarker** (Apache-2.0, local). Head pose is far more robust than
+eye/iris gaze to lighting, glasses, and distance.
+
+- `analyze_gaze(frame)` → detailed facts (angles, offsets, face count) for the test UI.
+- `detect_gaze(frame)` → the protocol event (`focused` / `distracted`).
+- `calibrate(frame)` / `reset_reference()` → manage the reference pose.
+
+Key behaviours:
+- **Auto-calibration** — the first frame with a face becomes the "looking at
+  screen" reference (yaw 0 / pitch 0); later frames are judged as +/- deviation from it.
+  This makes it work with any camera angle, including off to the side.
+- **Multi-person tracking** — detects up to `NUM_FACES`, locks onto the
+  intended user (biggest/closest face), and follows them by position so other
+  people entering the frame don't steal the signal.
+- **Perception only** — answers "looking at screen *right now*?". The
+  "distracted after N seconds of looking away" timer is policy for the
+  loop/state layer.
+
+The model file (`models/face_landmarker.task`, ~3.6 MB) is gitignored and
+downloaded by `setup.sh`.
diff --git a/python/cv/gaze_detect_test.py b/python/cv/gaze_detect_test.py
@@ -0,0 +1,117 @@
+"""Manual visual test for the gaze detector.
+
+Opens the webcam, runs gaze_detector.analyze_gaze() on each frame, and draws
+the head-pose angles plus a big FOCUSED / DISTRACTED label. Use it to tune
+YAW_LIMIT_DEG / PITCH_LIMIT_DEG in gaze_detector.py: turn your head until the
+label flips, and read the angle where that happens.
+Run it:
+    cd python
+    source .venv/bin/activate
+    python cv/gaze_detect_test.py
+
+Keys (video window focused): 'c' re-baselines, 'q' quits. Ctrl+C also quits.
+"""
+
+import cv2
+
+import gaze_detector
+
+
+def _put_line(image, text: str, y: int, scale: float, colour: tuple) -> None:
+    """Draw one line of overlay text onto *image*, in place.
+
+    Args:
+        image: frame to draw on; cv2.putText modifies it directly.
+        text: the string to render.
+        y: y coordinate of the text's bottom-left corner; x is fixed at 20.
+        scale: font scale passed to cv2.putText.
+        colour: colour tuple in OpenCV's BGR order.
+    """
+    cv2.putText(image, text, (20, y), cv2.FONT_HERSHEY_SIMPLEX, scale, colour, 2)
+
+
+def main() -> None:
+    """Run the interactive webcam loop until 'q' or Ctrl+C.
+
+    Each frame is passed to gaze_detector.analyze_gaze() and the result is
+    drawn on a *copy* of the frame, so the image handed to
+    gaze_detector.calibrate() when 'c' is pressed is the raw camera frame
+    rather than one with overlay text burned into it.
+
+    Raises:
+        RuntimeError: if webcam index 0 cannot be opened.
+    """
+    capture = cv2.VideoCapture(0)
+    if not capture.isOpened():
+        raise RuntimeError("Could not open webcam (index 0).")
+
+    print("Running. Look at the screen — the FIRST detected pose auto-sets as 0.")
+    print("Turn your head away to see it flip. 'c' re-baselines, 'q'/Ctrl+C quits.\n")
+
+    # Colour of the secondary status lines.
+    grey = (180, 180, 180)
+
+    try:
+        while True:
+            ok, frame = capture.read()
+            if not ok or frame is None:
+                # Dropped frame: skip it and try the next read.
+                continue
+
+            gaze = gaze_detector.analyze_gaze(frame)
+
+            # Annotate a copy; `frame` stays untouched for calibrate() below.
+            canvas = frame.copy()
+
+            # Decide label + colour. Green when focused, red otherwise.
+            if gaze["has_face"] and gaze["looking_at_screen"]:
+                label, colour = "FOCUSED", (0, 200, 0)
+            elif gaze["has_face"]:
+                label, colour = "DISTRACTED (turned away)", (0, 0, 255)
+            else:
+                label, colour = "NO FACE", (0, 0, 255)
+            _put_line(canvas, label, 50, 1.0, colour)
+
+            # Show the offset-from-reference angles (what the decision uses).
+            if gaze["has_face"]:
+                offsets = (
+                    f"yaw {gaze['yaw_offset']:+.0f}  pitch {gaze['pitch_offset']:+.0f}"
+                )
+                _put_line(canvas, offsets, 90, 0.7, (255, 255, 255))
+
+            # Whether a reference pose has been set yet.
+            cal_text = (
+                "calibrated (c = re-baseline)"
+                if gaze["is_calibrated"]
+                else "waiting for face..."
+            )
+            _put_line(canvas, cal_text, 125, 0.6, grey)
+
+            # How many people are in frame — tracking only the locked user.
+            faces_text = f"faces in frame: {gaze['face_count']}  (tracking user)"
+            _put_line(canvas, faces_text, 155, 0.6, grey)
+
+            cv2.imshow("gaze detector test  (c=calibrate, q=quit)", canvas)
+
+            key = cv2.waitKey(1) & 0xFF
+            if key == ord("q"):
+                break
+            if key == ord("c"):
+                # Calibrate on the raw frame, not the annotated canvas.
+                if gaze_detector.calibrate(frame):
+                    print("\ncalibrated to current pose as 'looking at screen'")
+                else:
+                    print("\ncalibration failed — no face visible")
+
+    except KeyboardInterrupt:
+        # Ctrl+C is a normal way to quit; fall through to cleanup.
+        pass
+    finally:
+        # Release the camera and close the window however the loop ended.
+        capture.release()
+        cv2.destroyAllWindows()
+        print("\nstopped.")
+
+
+if __name__ == "__main__":
+    main()