-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain_controller.py
More file actions
327 lines (288 loc) · 13.3 KB
/
main_controller.py
File metadata and controls
327 lines (288 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import time
import math
import collections
import numpy as np
import cv2
import mediapipe as mp
import traceback
import pyautogui
# import your modular controllers (these files should be present)
from NewHandEye import MouseHandEyeController
from NewVol import VolumeControl
from NewBright import BrightnessControl
# -------------------------
# Settings / Hyperparams (tweak these)
# -------------------------
# Performance mode: request a lower camera resolution for better FPS.
PERFORMANCE_MODE = True
CAM_ID = 0  # OpenCV camera index handed to cv2.VideoCapture
# preferred request size (camera may ignore)
# Lower resolution for better performance
REQUEST_W, REQUEST_H = (320, 240) if PERFORMANCE_MODE else (520, 360)
# -------------------------
# Length / smoothing defaults
# -------------------------
# Thumb-index pixel-distance range mapped onto volume/brightness levels.
# MIN: fingers very close (almost touching)
# MAX: fingers comfortably apart (natural spread)
LENGTH_MIN = 20.0
LENGTH_MAX = 120.0
VOLUME_SMOOTH = 0.15      # smoothing factor passed to VolumeControl
BRIGHTNESS_SMOOTH = 0.15  # smoothing factor passed to BrightnessControl
MIRROR_FRAME = True       # mirror camera frames so on-screen motion matches the user
SWAP_HANDS = False        # swap MediaPipe's "Left"/"Right" handedness labels
DISABLE_VISUALS = False   # True: skip the preview window and overlays entirely
INVERT_LENGTH_MAPPING = False  # reverse the distance -> level mapping
# MediaPipe hand-landmark indices: finger tips and the joints used to test "finger up".
TIP_IDS = {"index": 8, "middle": 12, "ring": 16, "thumb": 4}
PIP_IDS = {"index": 6, "middle": 10, "ring": 14, "thumb_ip": 3}
# MediaPipe face-mesh landmark indices used for the eyelid-gap metric.
LEFT_EYE_UP = 159
LEFT_EYE_DOWN = 145
RIGHT_EYE_UP = 386
RIGHT_EYE_DOWN = 374
CALIBRATE_FRAMES = 25      # face frames to sample for the open-eye baseline
CALIBRATE_WAIT_SECS = 1.0  # pause before calibration so the user can settle
# Click timing - optimized for quick response
# This is the minimum time eyes must be detected as closed
CLICK_HOLD = 0.01 # Very quick response once eyes detected as closed
DOUBLE_CLICK_WINDOW = 0.5 # Window for double click detection
# -------------------------
# Utilities
# -------------------------
def landmark_to_pixel(lm, frame_w, frame_h):
    """Convert a normalized MediaPipe landmark (x, y in [0, 1]) to integer pixel coords."""
    px = int(lm.x * frame_w)
    py = int(lm.y * frame_h)
    return px, py
def compute_eye_metric(face_landmarks, up_idx, down_idx):
    """Return the vertical gap |y_up - y_down| between two eyelid landmarks.

    Returns 0.0 if either landmark index cannot be resolved.
    """
    try:
        pts = face_landmarks.landmark
        gap = pts[up_idx].y - pts[down_idx].y
    except Exception:
        return 0.0
    return abs(gap)
def fingers_up_count(hand_landmarks):
    """Count raised non-thumb fingers on a hand.

    A finger counts as "up" when its tip landmark sits above (smaller y than)
    its PIP-joint landmark. Returns (count, per-finger bool dict).
    """
    def _raised(tip_id, pip_id):
        try:
            pts = hand_landmarks.landmark
            return pts[tip_id].y < pts[pip_id].y
        except Exception:
            # Missing/odd landmark data -> treat the finger as down.
            return False
    fingers = {
        name: _raised(TIP_IDS[name], PIP_IDS[name])
        for name in ('index', 'middle', 'ring')
    }
    return sum(fingers.values()), fingers
def thumb_index_distance_px(hand_landmarks, frame_w, frame_h):
    """Return the pixel distance between thumb tip and index-finger tip.

    Returns 0.0 if the landmarks cannot be read.
    """
    try:
        pts = hand_landmarks.landmark
        tx, ty = landmark_to_pixel(pts[TIP_IDS['thumb']], frame_w, frame_h)
        ix, iy = landmark_to_pixel(pts[TIP_IDS['index']], frame_w, frame_h)
        dx = tx - ix
        dy = ty - iy
        return math.hypot(dx, dy)
    except Exception:
        return 0.0
# -------------------------
# Main
# -------------------------
def main():
    """Run the gesture-control loop.

    The RIGHT hand's index finger drives the mouse cursor; eye blinks trigger
    clicks (handled by MouseHandEyeController). The LEFT hand selects the mode
    by raised-finger count: 2 -> VOLUME, 3 -> BRIGHTNESS, anything else ->
    MOUSE. In VOLUME/BRIGHTNESS modes the left hand's thumb-index distance
    sets the level. Press 'q' in the preview window (or Ctrl+C) to quit.
    """
    # --- Camera: open once, request a size, then query what we actually got ---
    cap = cv2.VideoCapture(CAM_ID)
    if not cap.isOpened():
        # Without this guard the calibration loop below would spin forever
        # on cap.read() failures when no camera is available.
        print(f"ERROR: cannot open camera {CAM_ID}")
        return
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, REQUEST_W)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, REQUEST_H)
    time.sleep(0.15)  # let camera settle
    # The driver may ignore the requested size; fall back to the request values.
    FRAME_W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or REQUEST_W
    FRAME_H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or REQUEST_H
    screen_w, screen_h = pyautogui.size()
    frame_w_ratio = float(screen_w) / float(FRAME_W)
    frame_h_ratio = float(screen_h) / float(FRAME_H)
    print(f"Camera actual: {FRAME_W}x{FRAME_H}, Screen: {screen_w}x{screen_h}")
    print(f"Mapping ratios -> w:{frame_w_ratio:.3f}, h:{frame_h_ratio:.3f}")
    # --- Controllers (notify callbacks disabled to cut console spam/overhead) ---
    mouse = MouseHandEyeController(smooth=0.20, click_threshold=CLICK_HOLD, double_click_threshold=DOUBLE_CLICK_WINDOW)
    volume = VolumeControl(length_range=(LENGTH_MIN, LENGTH_MAX), volume_range=(0.0,1.0),
                           smooth_factor=VOLUME_SMOOTH,
                           notify_callback=None)
    brightness = BrightnessControl(length_range=(LENGTH_MIN, LENGTH_MAX), brightness_range=(0,100),
                                   smooth_factor=BRIGHTNESS_SMOOTH,
                                   notify_callback=None)
    state = "MOUSE"
    last_state_change = time.time()
    # --- MediaPipe solutions (confidences lowered, refine_landmarks off, for FPS) ---
    mp_hands = mp.solutions.hands
    mp_face = mp.solutions.face_mesh
    hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                           min_detection_confidence=0.5, min_tracking_confidence=0.5)
    face_mesh = mp_face.FaceMesh(static_image_mode=False, max_num_faces=1,
                                 refine_landmarks=False,
                                 min_detection_confidence=0.5, min_tracking_confidence=0.5)
    # --- Calibration: sample the open-eye lid gap to derive blink thresholds ---
    print("Calibration: look at camera (open eyes). Starting in %.1f s..." % CALIBRATE_WAIT_SECS)
    time.sleep(CALIBRATE_WAIT_SECS)
    left_metrics, right_metrics = [], []
    captured = 0
    while captured < CALIBRATE_FRAMES:
        ret, frame = cap.read()
        if not ret:
            continue
        if MIRROR_FRAME:
            frame = cv2.flip(frame, 1)
        # NOTE: the frame is already FRAME_W x FRAME_H (queried from this same
        # capture), so the previous cv2.resize here was a no-op and was removed.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        fres = face_mesh.process(rgb)
        if fres.multi_face_landmarks:
            fl = fres.multi_face_landmarks[0]
            lm_l = compute_eye_metric(fl, LEFT_EYE_UP, LEFT_EYE_DOWN)
            lm_r = compute_eye_metric(fl, RIGHT_EYE_UP, RIGHT_EYE_DOWN)
            if lm_l > 0 and lm_r > 0:
                left_metrics.append(lm_l); right_metrics.append(lm_r); captured += 1
        time.sleep(0.015)
    # Fall back to a plausible open-eye gap if calibration captured nothing.
    left_open = float(np.mean(left_metrics)) if left_metrics else 0.03
    right_open = float(np.mean(right_metrics)) if right_metrics else 0.03
    # An eye counts as "closed" when its lid gap drops below this fraction of
    # the calibrated open gap; a HIGHER ratio registers "closed" sooner.
    EYE_CLOSED_RATIO = 0.85
    left_thresh = left_open * EYE_CLOSED_RATIO
    right_thresh = right_open * EYE_CLOSED_RATIO
    print(f"[CALIB] left_open={left_open:.4f}, right_open={right_open:.4f}")
    print(f"[CALIB] left_thresh={left_thresh:.4f}, right_thresh={right_thresh:.4f}")
    # Short rolling windows smooth per-frame eye-metric jitter.
    left_q = collections.deque(maxlen=5)
    right_q = collections.deque(maxlen=5)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                time.sleep(0.02)
                continue
            if MIRROR_FRAME:
                frame = cv2.flip(frame, 1)  # flip before processing so processing matches display
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            hand_res = hands.process(frame_rgb)
            face_res = face_mesh.process(frame_rgb)
            left_hand = None
            right_hand = None
            # Assign hands from MediaPipe handedness labels when available.
            if getattr(hand_res, "multi_hand_landmarks", None) and getattr(hand_res, "multi_handedness", None):
                for idx, hland in enumerate(hand_res.multi_hand_landmarks):
                    try:
                        label = hand_res.multi_handedness[idx].classification[0].label  # "Left"/"Right"
                    except Exception:
                        label = "Unknown"
                    if SWAP_HANDS:
                        label = "Left" if label == "Right" else ("Right" if label == "Left" else label)
                    if label == "Left":
                        left_hand = hland
                    elif label == "Right":
                        right_hand = hland
            # Fallback: no handedness info -> infer the side from index-tip x.
            if (left_hand is None and right_hand is None) and getattr(hand_res, "multi_hand_landmarks", None):
                for hland in hand_res.multi_hand_landmarks:
                    try:
                        idxx = hland.landmark[TIP_IDS['index']].x
                        if idxx < 0.5:
                            left_hand = hland
                        else:
                            right_hand = hland
                    except Exception:
                        pass
            # Mode switching via left hand (2 fingers -> VOLUME, 3 -> BRIGHTNESS).
            left_count = 0
            if left_hand is not None:
                left_count, _ = fingers_up_count(left_hand)
            prev_state = state
            if left_count == 2:
                state = "VOLUME"
            elif left_count == 3:
                state = "BRIGHTNESS"
            else:
                state = "MOUSE"
            if state != prev_state:
                print(f"[STATE] {prev_state} -> {state}")
                last_state_change = time.time()
            # Keep the cursor controller active only while in MOUSE mode.
            if state == "MOUSE":
                mouse.enable()
            else:
                mouse.disable()
            # Volume / brightness follow the LEFT hand's thumb-index distance.
            if left_hand is not None and state in ("VOLUME","BRIGHTNESS"):
                # note: use FRAME_W/FRAME_H (actual camera size) here
                dist = thumb_index_distance_px(left_hand, FRAME_W, FRAME_H)
                if INVERT_LENGTH_MAPPING:
                    # Reflect dist inside [LENGTH_MIN, LENGTH_MAX] so MIN <-> MAX.
                    # (The previous formula added LENGTH_MIN twice, shifting the
                    # inverted value out of range.)
                    dist = LENGTH_MIN + LENGTH_MAX - dist
                if state == "VOLUME":
                    volume.update_from_length(dist)
                else:
                    brightness.update_from_length(dist)
            # Cursor follows the RIGHT hand's index tip while in MOUSE mode.
            if right_hand is not None and state == "MOUSE":
                try:
                    idx_lm = right_hand.landmark[TIP_IDS['index']]
                    x_norm = idx_lm.x
                    y_norm = idx_lm.y
                    mouse.move_cursor(x_norm, y_norm, normalized=True)
                except Exception as e:
                    print("Mouse move failed:", e)
            # Eye blink detection for clicks (only in MOUSE mode).
            left_closed = False
            right_closed = False
            if state == "MOUSE" and getattr(face_res, "multi_face_landmarks", None):
                fl = face_res.multi_face_landmarks[0]
                lm_l = compute_eye_metric(fl, LEFT_EYE_UP, LEFT_EYE_DOWN)
                lm_r = compute_eye_metric(fl, RIGHT_EYE_UP, RIGHT_EYE_DOWN)
                left_q.append(lm_l); right_q.append(lm_r)
                avg_l = float(np.mean(left_q))
                avg_r = float(np.mean(right_q))
                left_closed = avg_l < left_thresh
                right_closed = avg_r < right_thresh
                mouse.update_eyes(left_closed, right_closed)
            if not DISABLE_VISUALS:
                # Draw on the frame in place rather than copying, to save CPU.
                vis = frame
                cv2.putText(vis, f"State:{state}", (12,28), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2)
                if left_hand:
                    cv2.putText(vis, f"LCount:{left_count}", (12,56), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,200,255), 2)
                    # Show finger distance and the thumb-index segment while
                    # volume/brightness control is active.
                    if state in ("VOLUME", "BRIGHTNESS"):
                        dist = thumb_index_distance_px(left_hand, FRAME_W, FRAME_H)
                        cv2.putText(vis, f"Dist:{dist:.1f}px", (12,84), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,255,0), 2)
                        try:
                            tx, ty = landmark_to_pixel(left_hand.landmark[TIP_IDS['thumb']], FRAME_W, FRAME_H)
                            ix, iy = landmark_to_pixel(left_hand.landmark[TIP_IDS['index']], FRAME_W, FRAME_H)
                            cv2.line(vis, (tx, ty), (ix, iy), (0, 255, 255), 2)
                            cv2.circle(vis, (tx, ty), 8, (255, 0, 255), -1)
                            cv2.circle(vis, (ix, iy), 8, (255, 0, 255), -1)
                        except Exception:
                            pass
                # Best-effort status readouts; the controllers may not expose them.
                try:
                    volp = volume.get_current_volume_percent()
                    brtp = brightness.get_current_brightness()
                except Exception:
                    volp = None; brtp = None
                if volp is not None:
                    cv2.putText(vis, f"Vol:{volp}%", (12,110), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,180,180), 2)
                if brtp is not None:
                    cv2.putText(vis, f"Brt:{brtp}%", (12,140), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (180,255,180), 2)
                # Show eye status for click debugging.
                if state == "MOUSE":
                    eye_status = "Eyes: "
                    if left_closed:
                        eye_status += "L-CLOSED "
                    if right_closed:
                        eye_status += "R-CLOSED"
                    if left_closed or right_closed:
                        cv2.putText(vis, eye_status, (12,170), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,255), 2)
                cv2.imshow("Controller (q to quit)", vis)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    except KeyboardInterrupt:
        print("Interrupted")
    except Exception as e:
        print("Unhandled error:", e)
        traceback.print_exc()
    finally:
        # Release hardware and MediaPipe graphs even on error paths.
        cap.release()
        cv2.destroyAllWindows()
        hands.close()
        face_mesh.close()
        print("Exited")
# Entry point: run the controller only when executed as a script, not on import.
if __name__ == "__main__":
    main()