diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..70f4f7b --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Virtual environment +.venv/ +venv/ +env/ +ENV/ +visionmate/ + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Model downloads & cache +app/models/ +*.onnx +*.pth +*.pb +*.tflite + +# Temporary files +*.log +*.tmp +*.bak +*.swp +*.DS_Store +Thumbs.db + +# Jupyter/IPython +.ipynb_checkpoints + +# Environment variables +.env +.env.* + +# VSCode/IDE +.vscode/ +.idea/ + +# macOS +.DS_Store + +# Windows +*.lnk +desktop.ini diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..b2c9e15 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,88 @@ +# πŸ“œ Code of Conduct + +## πŸ‘‹ Welcome to VisionMate + +**VisionMate** is an open-source initiative committed to empowering visually impaired individuals through inclusive, innovative assistive technology. We believe in creating a **respectful, collaborative, and safe environment** for everyone involved in this missionβ€”regardless of background, identity, or skill level. + +This Code of Conduct outlines the behavior expected of all contributors and participants in VisionMate spaces. 
+ +--- + +## πŸ’‘ Our Values + +We expect all members of the VisionMate community to: + +- 🀝 Treat others with **respect, kindness, and empathy** +- 🌍 Embrace **diversity and inclusion** +- πŸ“£ Communicate **clearly and constructively** +- 🧠 Encourage **learning, sharing, and collaboration** +- 🎯 Focus on **problem-solving** and positive contributions + +--- + +## πŸ—£οΈ Feedback Process + +We welcome input from all contributors to help improve our community, processes, and codebase. + +- πŸ›  Share suggestions through GitHub discussions, issues, or pull requests +- 🧩 Be open to differing viewpoints and respectful debate +- βœ… Encourage reviews that are kind, specific, and constructive +- πŸ“ Feedback will be considered carefully by maintainers and incorporated when appropriate + +--- + +## 🚫 Unacceptable Behavior + +To ensure a supportive space, the following will **not** be tolerated: + +- ❌ Harassment, discrimination, or hate speech +- ❌ Personal attacks, threats, or derogatory comments +- ❌ Sexualized or inappropriate content or language +- ❌ Spamming, trolling, or sustained disruption +- ❌ Sharing private information without explicit consent + +--- + +## πŸ™‹ Reporting Issues + +If you witness or experience any behavior that violates this Code of Conduct: +**Report it immediately.** + +All reports will be handled **discreetly and respectfully** by the project maintainers. 
+ +--- + +## βš–οΈ Enforcement + +Violations of this Code of Conduct may result in: + +| Consequence | Description | +|-------------------|---------------------------------------------------------------------------| +| 🟒 Warning | A private warning and clarification of the issue | +| 🟑 Temporary Ban | Temporary removal from participation in discussions or contributions | +| πŸ”΄ Permanent Ban | Full removal from the project and blocking of further contributions | + +### 🧾 Accountability + +- Repeated or severe violations may lead to stricter consequences, including permanent bans +- Maintainers reserve the right to evaluate each case on a situational basis +- Appeals may be discussed with the core maintainer team if needed + +--- + +## πŸ‘₯ Scope + +This Code of Conduct applies to: + +- All VisionMate GitHub repositories (issues, pull requests, discussions) +- Community communication platforms (e.g., chats, forums) +- Public or private conversations related to the project +- Any events, meetings, or collaborative spaces + +--- + +## πŸ“ Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html), version 2.1. +Thank you for helping make **VisionMate** a safe, accessible, and inclusive space for everyone. πŸ’™ +Let’s build a world where technology supports **everyone’s independence.** diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..0afc095 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,53 @@ +# πŸ‘₯ Contribution Guide + +Welcome to the Hall of Fame! πŸ† + +Every line of code, every bug fix, every pixel of design β€” it all comes from people like **you**. + +------------------------------------------------------------------------------------------ +**VisionMate** is more than just an AI project β€” it’s a growing community of learners, builders, and innovators πŸ’‘. 
+ +🧠💻🎨🚀💬 +From developers to designers to curious first-timers — we see you, we appreciate you, and we welcome you. + +------------------------------------------------------------------------------------------- + + +## 🛠️ Contribution Areas + +You can contribute to: + +🎨Python – Core backend and computer vision +🧠OpenCV – Image processing and recognition +🗃️Flask / Django – Backend framework (to be finalized) +📊React.js / Flutter – Frontend or app interface +📊MySQL – Data storage +📋Google Cloud Vision API – (future integration) +📋Text-to-Speech / Speech-to-Text APIs – Accessibility tools + + +------------------------------------------------------------------------------------------- + +## 🚀 Getting Started + + +Follow these steps to contribute to the VisionMate project on your local machine: + +# 1. Fork the Repository +Click the Fork button on the repository page to create your own copy, to which you can commit and push changes on GitHub. + + +# 2. Clone the repository +git clone https://github.com/kaushav07/VisionMate.git + +# 3. Navigate into the project directory +cd VisionMate + +# 4. 
Install all the required dependencies +pip install -r requirements.txt + + + + +------------------------------------------------------------------------------------------- + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..56af90a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 kaushav07 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index 6af5802..7b80b6c 100644 --- a/README.md +++ b/README.md @@ -4,30 +4,58 @@ ## Project Overview -VisionMate aims to integrate advanced computer vision, text recognition, and speech technologies to help users: -- Identify objects and surroundings -- Read printed or handwritten text aloud -- Provide real-time feedback through a simple, user-friendly interface +VisionMate uses computer vision, text recognition, and speech tech to help users: + +- Recognize objects and surroundings +- Read printed or handwritten text out loud +- Control the system easily with voice commands +- Get real-time alerts about obstacles +- Add new features easily in the future This is the early development and ideation phase. The repository will include prototypes, research notes, and starter code as the project progresses. ## Features (Planned) -βœ… Object detection using computer vision -βœ… Text-to-speech functionality -βœ… Speech-based user controls -βœ… Environment awareness for obstacle detection -βœ… Modular architecture for future feature integration +- βœ… Real-time object detection +- βœ… Text-to-speech to read out text +- βœ… Speech-based user controls +- βœ… Environment awareness for obstacle detection +- βœ… Modular architecture for future feature integration + +## Technology Stack + +| Part | Technology / Tools | +|---------------------|-------------------------------------------| +| Programming Language| Python | +| Computer Vision | OpenCV, Google Cloud Vision API (planned) | +| Backend Framework | Flask / Django (to be decided) | +| Frontend / App | React.js / Flutter (planned) | +| Database | MySQL | +| Accessibility APIs | Text-to-Speech / Speech-to-Text APIs | + +## How It Works + +Here’s how VisionMate works step-by-step: + +1. **Captures Input:** + Uses a camera to take live pictures or videos of the surroundings. + +2. **Detects Objects:** + Uses computer vision to find and identify things like doors, obstacles, signs, etc. 
-## Tech Stack +3. **Reads Text:** + Uses OCR (Optical Character Recognition) to detect printed or handwritten text. + +4. **Speech Processing:** + - Converts detected text to speech so the user can hear it. + - Listens to user’s voice commands to control the system. + +5. **Gives Feedback:** + Provides real-time audio alerts about obstacles and text info to help the user move safely. + +6. **Modular Design:** + Built so new features and better AI can be added later easily. -- **Python** – Core backend and computer vision -- **OpenCV** – Image processing and recognition -- **Flask / Django** – Backend framework (to be finalized) -- **React.js / Flutter** – Frontend or app interface -- **MySQL** – Data storage -- **Google Cloud Vision API** – (future integration) -- **Text-to-Speech / Speech-to-Text APIs** – Accessibility tools ## Getting Started @@ -39,3 +67,11 @@ Clone the repository and install dependencies: git clone https://github.com/kaushav07/VisionMate.git cd VisionMate pip install -r requirements.txt +``` +## Contributing + +We’d love your help! Please see [CONTRIBUTING.md](CONTRIBUTING.md) to learn how you can contribute. + +## πŸ“„ License + +This project is licensed under the [MIT License](LICENSE). 
\ No newline at end of file diff --git a/__pycache__/config.cpython-313.pyc b/__pycache__/config.cpython-313.pyc new file mode 100644 index 0000000..5a603bd Binary files /dev/null and b/__pycache__/config.cpython-313.pyc differ diff --git a/__pycache__/tts_utils.cpython-313.pyc b/__pycache__/tts_utils.cpython-313.pyc new file mode 100644 index 0000000..6ea8951 Binary files /dev/null and b/__pycache__/tts_utils.cpython-313.pyc differ diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/perception/vision/__init__.py b/app/api/perception/vision/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/data/__init__.py b/app/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/data/users/dummy_user/pictures/Paul.jpeg b/app/data/users/dummy_user/pictures/Paul.jpeg new file mode 100644 index 0000000..fb881c9 Binary files /dev/null and b/app/data/users/dummy_user/pictures/Paul.jpeg differ diff --git a/app/data/users/dummy_user/pictures/Peter.jpeg b/app/data/users/dummy_user/pictures/Peter.jpeg new file mode 100644 index 0000000..8b645b6 Binary files /dev/null and b/app/data/users/dummy_user/pictures/Peter.jpeg differ diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/authentication/auth_utils.py b/app/services/authentication/auth_utils.py new file mode 100644 index 0000000..e9878f6 --- /dev/null +++ b/app/services/authentication/auth_utils.py @@ -0,0 +1,4 @@ +# app/services/authentication/auth_utils.py +""" +This file contains utility functions for authentication-related tasks. 
+""" \ No newline at end of file diff --git a/app/services/authentication/password_utils.py b/app/services/authentication/password_utils.py new file mode 100644 index 0000000..066e788 --- /dev/null +++ b/app/services/authentication/password_utils.py @@ -0,0 +1,4 @@ +# app/services/authentication/password_utils.py +""" +This file contains utility functions for password-related tasks. +""" \ No newline at end of file diff --git a/app/services/perception/audio/__init__.py b/app/services/perception/audio/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/perception/audio/stt_utils.py b/app/services/perception/audio/stt_utils.py new file mode 100644 index 0000000..dbaf069 --- /dev/null +++ b/app/services/perception/audio/stt_utils.py @@ -0,0 +1,4 @@ +# app/services/perception/audio/stt_utils.py +""" +This file is for Speech-to-text (STT) logic utilities. +""" diff --git a/app/services/perception/audio/tts_utils.py b/app/services/perception/audio/tts_utils.py new file mode 100644 index 0000000..8a8ecf3 --- /dev/null +++ b/app/services/perception/audio/tts_utils.py @@ -0,0 +1,4 @@ +# app/services/perception/audio/tts_utils.py +""" +This file is for Text-to-speech (TTS) logic utilities. +""" diff --git a/app/services/perception/audio/voice_utils.py b/app/services/perception/audio/voice_utils.py new file mode 100644 index 0000000..94135e0 --- /dev/null +++ b/app/services/perception/audio/voice_utils.py @@ -0,0 +1,4 @@ +# app/services/perception/audio/voice_utils.py +""" +This file is for voice-related utilities. 
+""" \ No newline at end of file diff --git a/app/services/perception/vision/__init__.py b/app/services/perception/vision/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/perception/vision/face_utils.py b/app/services/perception/vision/face_utils.py new file mode 100644 index 0000000..49db945 --- /dev/null +++ b/app/services/perception/vision/face_utils.py @@ -0,0 +1,4 @@ +# app/services/perception/vision/face_utils.py +""" +This file contains all the utility services related to faces. +""" \ No newline at end of file diff --git a/app/services/perception/vision/gesture_utils.py b/app/services/perception/vision/gesture_utils.py new file mode 100644 index 0000000..5d50d0a --- /dev/null +++ b/app/services/perception/vision/gesture_utils.py @@ -0,0 +1,4 @@ +# app/services/perception/vision/gesture_utils.py +""" +This file is for hand and pose gesture utilities. +""" \ No newline at end of file diff --git a/app/services/perception/vision/object_utils.py b/app/services/perception/vision/object_utils.py new file mode 100644 index 0000000..8ce1a18 --- /dev/null +++ b/app/services/perception/vision/object_utils.py @@ -0,0 +1,4 @@ +# app/services/perception/vision/object_utils.py +""" +This file contains all the utility services related to objects. +""" \ No newline at end of file diff --git a/app/services/perception/vision/scene_analysis.py b/app/services/perception/vision/scene_analysis.py new file mode 100644 index 0000000..d6c675d --- /dev/null +++ b/app/services/perception/vision/scene_analysis.py @@ -0,0 +1,4 @@ +# app/services/perception/vision/scene_analysis.py +""" +This file is for scene analysis and description utilities. 
+""" \ No newline at end of file diff --git a/app/services/storage/database_storage.py b/app/services/storage/database_storage.py new file mode 100644 index 0000000..ca3773b --- /dev/null +++ b/app/services/storage/database_storage.py @@ -0,0 +1,4 @@ +# app/services/storage/database_storage.py +""" +This file is for utilities to Save & retrieve from database. +""" \ No newline at end of file diff --git a/app/services/storage/file_storage.py b/app/services/storage/file_storage.py new file mode 100644 index 0000000..31eb877 --- /dev/null +++ b/app/services/storage/file_storage.py @@ -0,0 +1,4 @@ +# app/services/storage/file_storage.py +""" +This file is for utilities to Save & retrieve files locally. +""" \ No newline at end of file diff --git a/app/shared/__init__.py b/app/shared/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/shared/config.py b/app/shared/config.py new file mode 100644 index 0000000..f3d3c1c --- /dev/null +++ b/app/shared/config.py @@ -0,0 +1,4 @@ +# app/shared/config.py +""" +This file contains configuration variables and methods for the VisionMate application. +""" \ No newline at end of file diff --git a/app/shared/logger.py b/app/shared/logger.py new file mode 100644 index 0000000..7660851 --- /dev/null +++ b/app/shared/logger.py @@ -0,0 +1,4 @@ +# app/shared/logger.py +""" +This file contains Centralized logging setup. +""" \ No newline at end of file diff --git a/app/shared/paths.py b/app/shared/paths.py new file mode 100644 index 0000000..daffb53 --- /dev/null +++ b/app/shared/paths.py @@ -0,0 +1,4 @@ +# app/shared/paths.py +""" +This file defines paths and utility functions for managing user directories in the VisionMate application. +""" diff --git a/app/shared/utils.py b/app/shared/utils.py new file mode 100644 index 0000000..7361ccc --- /dev/null +++ b/app/shared/utils.py @@ -0,0 +1,4 @@ +# app/shared/utils.py +""" +This file contains utility functions for the VisionMate application. 
Common utility methods are defined here. +""" diff --git a/config.py b/config.py new file mode 100644 index 0000000..64a2631 --- /dev/null +++ b/config.py @@ -0,0 +1,11 @@ +# config.py +# Select the text-to-speech engine. +# Options: +# "gTTS" β†’ uses Google Text-to-Speech (online, clearer) +# "pyttsx3" β†’ uses system speech engine (offline, basic) + +TTS_ENGINE = "gTTS" # or "pyttsx3" + +# IP Webcam URL ((replace with your phone's IP)) +# Make sure your phone and laptop are on the same Wi-Fi +IP_WEBCAM_URL = "http://192.168.29.169:8080/video" \ No newline at end of file diff --git a/dummy_user_data/pictures/Paul.jpeg b/dummy_user_data/pictures/Paul.jpeg new file mode 100644 index 0000000..fb881c9 Binary files /dev/null and b/dummy_user_data/pictures/Paul.jpeg differ diff --git a/dummy_user_data/pictures/Peter.jpeg b/dummy_user_data/pictures/Peter.jpeg new file mode 100644 index 0000000..8b645b6 Binary files /dev/null and b/dummy_user_data/pictures/Peter.jpeg differ diff --git a/face_utils.py b/face_utils.py new file mode 100644 index 0000000..0ad584b --- /dev/null +++ b/face_utils.py @@ -0,0 +1,77 @@ +import insightface +import numpy as np, math +import cv2 +from load_model import load_face_model + +# Global model instance +model = load_face_model() +FACE_MATCH_THRESHOLD = 0.6 + +def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float: + """ + Calculate cosine similarity between two vectors. + + Args: + vec_a: First vector as a list of numbers. + vec_b: Second vector as a list of numbers. + + Returns: + Cosine similarity value between -1 and 1. 
+ """ + # Check length + if len(vec_a) != len(vec_b): + raise ValueError("Vectors must be of the same length.") + + # Dot product + dot_product = sum(a * b for a, b in zip(vec_a, vec_b)) + + # Norms (magnitudes) + norm_a = math.sqrt(sum(a * a for a in vec_a)) + norm_b = math.sqrt(sum(b * b for b in vec_b)) + + if norm_a == 0 or norm_b == 0: + raise ValueError("Vectors must not be zero-length.") + + return dot_product / (norm_a * norm_b) + +def detect_faces(image: np.ndarray) -> list[dict]: + """Returns list of (embedding, bbox, landmarks) for all detected faces.""" + faces = model.get(image) + results = [] + for face in faces: + results.append({ + "embedding": face.embedding, + "bbox": face.bbox.tolist(), + "landmarks": face.landmark_2d_106.tolist(), + }) + return results + +def face_recog( + embedding: np.ndarray, + known_encodings: dict[str, np.ndarray], + threshold: float = FACE_MATCH_THRESHOLD, + return_score: bool = False +) -> tuple[bool, str, float | None]: + """Matches the given embedding against known encodings.""" + + best_score = -1 + best_match = None + + for name, known_embedding in known_encodings.items(): + score = cosine_similarity(embedding, known_embedding) + if score > best_score: + best_score = score + best_match = name + + if best_score > (1 - threshold): + return True, best_match, best_score if return_score else None + return False, None, best_score if return_score else None + +def crop_face(image: np.ndarray, bbox: list[int], margin: int = 10) -> np.ndarray: + x1, y1, x2, y2 = map(int, bbox) + h, w = image.shape[:2] + x1 = max(x1 - margin, 0) + y1 = max(y1 - margin, 0) + x2 = min(x2 + margin, w) + y2 = min(y2 + margin, h) + return image[y1:y2, x1:x2] diff --git a/load_model.py b/load_model.py new file mode 100644 index 0000000..61bb9ae --- /dev/null +++ b/load_model.py @@ -0,0 +1,54 @@ +import os +import zipfile +import requests +from pathlib import Path +from insightface.app import FaceAnalysis + +# Model config +FACE_MODEL_NAME = 
"buffalo_l" +FACE_MODEL_PROVIDERS = ["CPUExecutionProvider"] +FACE_MODEL_URL = f"https://github.com/deepinsight/insightface/releases/download/v0.7/{FACE_MODEL_NAME}.zip" +FACE_MODEL_DIR = Path(os.getcwd()) / "models" / FACE_MODEL_NAME +FACE_ZIP_PATH = FACE_MODEL_DIR.parent / f"{FACE_MODEL_NAME}.zip" +FACE_REQUIRED_FILES = [ + "1k3d68.onnx", + "2d106det.onnx", + "det_10g.onnx", + "genderage.onnx", + "w600k_r50.onnx" +] + + +def _ensure_model_exists(model_path, required_files, model_url, zip_path) -> Path: + """Ensure the buffalo_l model is downloaded and extracted.""" + if model_path.exists() and all((model_path / f).exists() for f in required_files): + return model_path + + model_path.mkdir(parents=True, exist_ok=True) + + print("Downloading buffalo_l model...") + response = requests.get(model_url, stream=True) + with open(zip_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + print("Extracting model...") + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(model_path) + + zip_path.unlink() + print("Model is ready.") + return model_path + + +def load_face_model() -> FaceAnalysis: + """Load the buffalo_l model using InsightFace.""" + model_dir = _ensure_model_exists( + model_path=FACE_MODEL_DIR, + required_files=FACE_REQUIRED_FILES, + model_url=FACE_MODEL_URL, + zip_path=FACE_ZIP_PATH + ) + model = FaceAnalysis(name=str(model_dir), providers=FACE_MODEL_PROVIDERS) + model.prepare(ctx_id=0) + return model diff --git a/main.py b/main.py index 26160be..38a6ea3 100644 --- a/main.py +++ b/main.py @@ -1,43 +1,110 @@ import cv2 -import pyttsx3 -import google.generativeai as genai import numpy as np -import time import threading +import os +import base64 + import speech_recognition as sr from PIL import Image -import io +from dotenv import load_dotenv +import socket +import google.generativeai as genai +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_core.prompts import PromptTemplate 
# Load the Gemini API key from .env (variable name: API_KEY).
load_dotenv()
api_key = os.getenv("API_KEY")

# The LangChain wrapper picks its key up from GOOGLE_API_KEY, so export the
# real key *before* the client is built. (Previously a placeholder string was
# written to this variable after the first client had already been created.)
if api_key:
    os.environ["GOOGLE_API_KEY"] = api_key

genai.configure(api_key=api_key)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    timeout=None,
    max_retries=2,
)


def is_connected():
    """Return True when a TCP connection to www.google.com:80 succeeds.

    The probe uses a 2-second timeout; any ``OSError`` (DNS failure, refusal,
    timeout) is reported as "not connected".
    """
    try:
        # The context manager closes the probe socket instead of leaking it.
        with socket.create_connection(("www.google.com", 80), timeout=2):
            return True
    except OSError:
        return False


# Camera: IP Webcam (phone camera)
cap = cv2.VideoCapture(IP_WEBCAM_URL)

status = "Press 's' or say 'scan' to scan surroundings..."
scan_triggered = False


def process_frame(frame):
    """Ask the Gemini model for a short spoken-style description of ``frame``.

    The frame is PNG-encoded in memory, base64-encoded, and sent together with
    a text prompt through the LangChain ``llm`` client.

    Args:
        frame: Image array as read from the OpenCV capture.

    Returns:
        The model's description as a string, or a fallback sentence when the
        frame cannot be encoded or the request fails.
    """
    # Encode the frame as a PNG image in memory (done once; the frame was
    # previously converted and encoded twice, with one copy discarded).
    encoded_ok, buffer = cv2.imencode('.png', frame)
    if not encoded_ok:
        print("[Gemini Error] Could not encode the camera frame as PNG.")
        return "Unable to analyze surroundings because the camera frame could not be read."

    # The image is passed to the model as a base64 data URL.
    encoded_image = base64.b64encode(buffer).decode('utf-8')

    try:
        # One multimodal message: the instruction text plus the image.
        message = HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": (
                        "Provide a short, clear, and concise description of this scene "
                        "(1–2 sentences) for a blind person. Focus only on key visual elements "
                        "or signs like STOP signs, vehicles, people, or traffic lights."
                    )
                },
                {
                    "type": "image_url",
                    "image_url": f"data:image/png;base64,{encoded_image}"
                },
            ]
        )

        response = llm.invoke([message])
        content = response.content
        # Callers treat the result as text (they call .lower() and speak it),
        # so make sure a string is returned even for structured content.
        return content if isinstance(content, str) else str(content)

    except Exception as e:
        # Best-effort: log for debugging and hand back a speakable fallback.
        print(f"[Gemini Error] {e}")
        return "Unable to analyze surroundings due to internet issue."
+ + def listen_for_scan(): global scan_triggered recognizer = sr.Recognizer() @@ -50,44 +117,38 @@ def listen_for_scan(): while True: with mic as source: try: - print("Listening...") audio = recognizer.listen(source, timeout=3, phrase_time_limit=5) command = recognizer.recognize_google(audio).lower() - print(f"Recognized: {command}") if "scan" in command: - print("Voice command 'scan' detected.") scan_triggered = True - except sr.WaitTimeoutError: - print("Listening timed out, retrying...") - except sr.UnknownValueError: - print("Could not understand audio.") - except sr.RequestError as e: - print(f"Speech recognition service error: {e}") - -# Start voice recognition in a separate thread + except (sr.WaitTimeoutError, sr.UnknownValueError, sr.RequestError): + pass + +# Start voice recognition in background voice_thread = threading.Thread(target=listen_for_scan, daemon=True) voice_thread.start() +if not is_connected(): + speak("Warning. You are offline. Scene analysis will not work.") + while True: ret, frame = cap.read() if not ret: - print("Camera error.") + print("Camera error. Check your IP Webcam app and Wi-Fi.") continue - cv2.putText(frame, status, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, - 0.8, (0, 255, 0), 2) - + cv2.putText(frame, status, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2) cv2.imshow("Blind Assist Tool", frame) key = cv2.waitKey(1) & 0xFF if key == ord('s') or scan_triggered: - scan_triggered = False # reset flag + scan_triggered = False status = "Analyzing surroundings..." speak("Analyzing surroundings") desc = process_frame(frame) speak(desc) - # Detect important keywords + # Alert on specific signs if "stop sign" in desc.lower() or "stop" in desc.lower(): speak("Stop! 
# In-memory list of scan entries for the current process, oldest first.
scan_history = []


def log_scan(caption, user_command):
    """Append one scan entry to ``scan_history`` and print a confirmation.

    Args:
        caption: Description of the scanned scene.
        user_command: Command associated with the scan.

    The entry is a dict with the keys ``"timestamp"`` (local time formatted
    as ``DD-MM-YYYY, HH:MM:SS``), ``"caption"`` and ``"user_command"``.
    """
    timestamp = datetime.datetime.now().strftime('%d-%m-%Y, %H:%M:%S')
    entry = {
        "timestamp": timestamp,
        "caption": caption,
        "user_command": user_command,
    }
    scan_history.append(entry)
    print("✅ Logged entry successfully!")


def show_history():
    """Print every entry in ``scan_history``, numbered from 1.

    Prints a short notice instead when no scans have been logged.
    """
    if not scan_history:
        print("\n📭 No scan history available.")
        return

    print("\n📜 Scan History:")
    for i, entry in enumerate(scan_history, start=1):
        print(f"\n🔹 Entry {i}")
        print(f" 🕒 Time: {entry['timestamp']}")
        print(f" 🖼️ Caption: {entry['caption']}")
        print(f" 💬 User Command: {entry['user_command']}")


# Small demo when the module is run directly.
if __name__ == "__main__":
    log_scan("A man passing the main road", "Alert User")
    log_scan("A family roaming in a busy market", "Describe the surroundings")
    log_scan("Animal moving freely in the zoo", "Describe the surroundings")

    show_history()
# tests/test_face_utils.py
"""Smoke tests for detect_faces and face_recog from face_utils."""
import os
import pathlib
import sys

import cv2
import numpy as np
import pytest

# Make the repository root importable so ``face_utils`` resolves when the
# tests are started from inside the tests directory.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
from face_utils import (
    detect_faces,
    face_recog,
)

# Repository root. The sample pictures live in <root>/dummy_user_data, not in
# the tests directory, so go one level above this file's folder.
APP_PATH = pathlib.Path(__file__).parent.parent


def _load_sample_image(name):
    """Read ``dummy_user_data/pictures/<name>.jpeg`` and fail clearly if absent."""
    image_path = APP_PATH / f"dummy_user_data/pictures/{name}.jpeg"
    image = cv2.imread(str(image_path))
    # cv2.imread returns None instead of raising when the file is unreadable.
    assert image is not None, f"Could not read test image: {image_path}"
    return image


def test_detect_faces():
    """detect_faces finds at least one face and returns the documented keys."""
    image = _load_sample_image("Paul")
    faces = detect_faces(image)
    assert len(faces) > 0
    assert 'embedding' in faces[0]
    assert 'bbox' in faces[0]
    assert 'landmarks' in faces[0]


def test_face_recog():
    """An embedding matched against itself is recognised under its own name."""
    image = _load_sample_image("Paul")
    faces = detect_faces(image)
    assert len(faces) > 0
    face_embedding = faces[0]['embedding']

    known_encodings = {
        "Paul": face_embedding  # Using the same face to guarantee a match
    }
    match_result = face_recog(face_embedding, known_encodings)
    assert match_result[0] is True
    assert match_result[1] == "Paul"
    # return_score defaults to False, so no score is reported.
    assert match_result[2] is None
# Initialize pyttsx3 up front only when it is the configured engine; the gTTS
# path creates its own engine on demand inside fallback_speak().
if TTS_ENGINE == "pyttsx3":
    engine = pyttsx3.init()


def speak(text):
    """Speak ``text`` aloud with the engine selected by ``TTS_ENGINE``.

    * ``"pyttsx3"``: uses the module-level ``engine``; errors are printed.
    * ``"gTTS"``: synthesises to a temporary mp3, plays it, and always deletes
      the file afterwards; on any error falls back to ``fallback_speak``.
    * any other value: reports the bad setting and uses ``fallback_speak``
      rather than staying silent.

    Args:
        text: The message to speak.
    """
    print(f"Speaking ({TTS_ENGINE}): {text}")

    if TTS_ENGINE == "pyttsx3":
        try:
            engine.say(text)
            engine.runAndWait()
        except Exception as e:
            print(f"[pyttsx3 Error] {e}")

    elif TTS_ENGINE == "gTTS":
        tmp_path = None
        try:
            tts = gTTS(text=text, lang='en')
            # Reserve a file name, then close the handle before writing and
            # playing: reopening a still-open NamedTemporaryFile by name is
            # not portable across platforms.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
                tmp_path = tmp.name
            tts.save(tmp_path)
            playsound(tmp_path)
        except Exception as e:
            print(f"[gTTS Error] {e}. Falling back to pyttsx3.")
            fallback_speak(text)
        finally:
            # Clean up on success and on failure so temp files do not pile up.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass  # Already gone or still locked; nothing more to do.

    else:
        print(f"[TTS Error] Unknown TTS_ENGINE {TTS_ENGINE!r}. Falling back to pyttsx3.")
        fallback_speak(text)


def fallback_speak(text):
    """Speak ``text`` with a freshly initialised pyttsx3 engine.

    Used when the primary engine fails; errors are printed, never raised.

    Args:
        text: The message to speak.
    """
    try:
        engine = pyttsx3.init()
        engine.say(text)
        engine.runAndWait()
    except Exception as e:
        print(f"[Fallback pyttsx3 Error] {e}")