diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..70f4f7b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,70 @@
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Virtual environment
+.venv/
+venv/
+env/
+ENV/
+visionmate/
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Model downloads & cache
+app/models/
+*.onnx
+*.pth
+*.pb
+*.tflite
+
+# Temporary files
+*.log
+*.tmp
+*.bak
+*.swp
+*.DS_Store
+Thumbs.db
+
+# Jupyter/IPython
+.ipynb_checkpoints
+
+# Environment variables
+.env
+.env.*
+
+# VSCode/IDE
+.vscode/
+.idea/
+
+# macOS
+.DS_Store
+
+# Windows
+*.lnk
+desktop.ini
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..b2c9e15
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,88 @@
+# 📜 Code of Conduct
+
+## 👋 Welcome to VisionMate
+
+**VisionMate** is an open-source initiative committed to empowering visually impaired individuals through inclusive, innovative assistive technology. We believe in creating a **respectful, collaborative, and safe environment** for everyone involved in this mission—regardless of background, identity, or skill level.
+
+This Code of Conduct outlines the behavior expected of all contributors and participants in VisionMate spaces.
+
+---
+
+## 💡 Our Values
+
+We expect all members of the VisionMate community to:
+
+- 🤝 Treat others with **respect, kindness, and empathy**
+- 🌍 Embrace **diversity and inclusion**
+- 📣 Communicate **clearly and constructively**
+- 🧠 Encourage **learning, sharing, and collaboration**
+- 🎯 Focus on **problem-solving** and positive contributions
+
+---
+
+## 🗣️ Feedback Process
+
+We welcome input from all contributors to help improve our community, processes, and codebase.
+
+- 🛠 Share suggestions through GitHub discussions, issues, or pull requests  
+- 🧩 Be open to differing viewpoints and respectful debate  
+- ✅ Encourage reviews that are kind, specific, and constructive  
+- 📝 Feedback will be considered carefully by maintainers and incorporated when appropriate
+
+---
+
+## 🚫 Unacceptable Behavior
+
+To ensure a supportive space, the following will **not** be tolerated:
+
+- ❌ Harassment, discrimination, or hate speech  
+- ❌ Personal attacks, threats, or derogatory comments  
+- ❌ Sexualized or inappropriate content or language  
+- ❌ Spamming, trolling, or sustained disruption  
+- ❌ Sharing private information without explicit consent  
+
+---
+
+## 🙋 Reporting Issues
+
+If you witness or experience any behavior that violates this Code of Conduct:  
+**Report it immediately.**  
+
+All reports will be handled **discreetly and respectfully** by the project maintainers.
+
+---
+
+## ⚖️ Enforcement
+
+Violations of this Code of Conduct may result in:
+
+| Consequence       | Description                                                               |
+|-------------------|---------------------------------------------------------------------------|
+| 🟢 Warning         | A private warning and clarification of the issue                          |
+| 🟡 Temporary Ban   | Temporary removal from participation in discussions or contributions      |
+| 🔴 Permanent Ban   | Full removal from the project and blocking of further contributions       |
+
+### 🧾 Accountability
+
+- Repeated or severe violations may lead to stricter consequences, including permanent bans  
+- Maintainers reserve the right to evaluate each case on a situational basis  
+- Appeals may be discussed with the core maintainer team if needed  
+
+---
+
+## 👥 Scope
+
+This Code of Conduct applies to:
+
+- All VisionMate GitHub repositories (issues, pull requests, discussions)  
+- Community communication platforms (e.g., chats, forums)  
+- Public or private conversations related to the project  
+- Any events, meetings, or collaborative spaces  
+
+---
+
+## 📝 Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html), version 2.1.
+Thank you for helping make **VisionMate** a safe, accessible, and inclusive space for everyone. 💙  
+Let’s build a world where technology supports **everyone’s independence.**
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..0afc095
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,53 @@
+# 👥 Contribution Guide
+
+Welcome to the Hall of Fame! 🏆 
+
+Every line of code, every bug fix, every pixel of design — it all comes from people like **you**.  
+
+------------------------------------------------------------------------------------------
+**VisionMate** is more than just an AI project — it’s a growing community of learners, builders, and innovators 💡.
+
+🧠💻🎨🚀💬  
+From developers to designers to curious first-timers — we see you, we appreciate you, and we welcome you.
+
+-------------------------------------------------------------------------------------------
+
+
+## 🛠️ Contribution Areas
+
+You can contribute to:
+
+- 🎨 **Python** – Core backend and computer vision
+- 🧠 **OpenCV** – Image processing and recognition
+- 🗃️ **Flask / Django** – Backend framework (to be finalized)
+- 📊 **React.js / Flutter** – Frontend or app interface
+- 📊 **MySQL** – Data storage
+- 📋 **Google Cloud Vision API** – (future integration)
+- 📋 **Text-to-Speech / Speech-to-Text APIs** – Accessibility tools
+
+
+-------------------------------------------------------------------------------------------
+
+## 🚀 Getting Started
+
+
+Follow these steps to contribute to the VisionMate project on your local machine:
+
+### 1. Fork the repository
+Click the **Fork** button on the repository page to create your own copy, which you can commit and push changes to on GitHub.
+
+
+### 2. Clone the repository
+`git clone https://github.com/kaushav07/VisionMate.git`
+
+### 3. Navigate into the project directory
+`cd VisionMate`
+
+### 4. Install all the required dependencies
+`pip install -r requirements.txt`
+
+
+
+
+-------------------------------------------------------------------------------------------
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..56af90a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 kaushav07
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 6af5802..7b80b6c 100644
--- a/README.md
+++ b/README.md
@@ -4,30 +4,58 @@
 
 ## Project Overview
 
-VisionMate aims to integrate advanced computer vision, text recognition, and speech technologies to help users:
-- Identify objects and surroundings
-- Read printed or handwritten text aloud
-- Provide real-time feedback through a simple, user-friendly interface
+VisionMate uses computer vision, text recognition, and speech tech to help users:  
+
+- Recognize objects and surroundings  
+- Read printed or handwritten text out loud  
+- Control the system easily with voice commands  
+- Get real-time alerts about obstacles  
+- Add new features easily in the future  
 
 This is the early development and ideation phase. The repository will include prototypes, research notes, and starter code as the project progresses.
 
 ## Features (Planned)
 
-✅ Object detection using computer vision  
-✅ Text-to-speech functionality  
-✅ Speech-based user controls  
-✅ Environment awareness for obstacle detection  
-✅ Modular architecture for future feature integration  
+- ✅ Real-time object detection  
+- ✅ Text-to-speech to read out text
+- ✅ Speech-based user controls  
+- ✅ Environment awareness for obstacle detection  
+- ✅ Modular architecture for future feature integration
+
+## Technology Stack
+
+| Part                | Technology / Tools                        |
+|---------------------|-------------------------------------------|
+| Programming Language| Python                                    |
+| Computer Vision     | OpenCV, Google Cloud Vision API (planned) |
+| Backend Framework   | Flask / Django (to be decided)            |
+| Frontend / App      | React.js / Flutter (planned)              |
+| Database            | MySQL                                     |
+| Accessibility APIs  | Text-to-Speech / Speech-to-Text APIs      |
+
+## How It Works 
+
+Here’s how VisionMate works step-by-step:
+
+1. **Captures Input:**  
+   Uses a camera to take live pictures or videos of the surroundings.
+
+2. **Detects Objects:**  
+   Uses computer vision to find and identify things like doors, obstacles, signs, etc.
 
-## Tech Stack
+3. **Reads Text:**  
+   Uses OCR (Optical Character Recognition) to detect printed or handwritten text.
+
+4. **Speech Processing:**  
+   - Converts detected text to speech so the user can hear it.  
+   - Listens to user’s voice commands to control the system.
+
+5. **Gives Feedback:**  
+   Provides real-time audio alerts about obstacles and text info to help the user move safely.
+
+6. **Modular Design:**  
+   Built so new features and better AI can be added later easily.
 
-- **Python** – Core backend and computer vision  
-- **OpenCV** – Image processing and recognition  
-- **Flask / Django** – Backend framework (to be finalized)  
-- **React.js / Flutter** – Frontend or app interface  
-- **MySQL** – Data storage  
-- **Google Cloud Vision API** – (future integration)  
-- **Text-to-Speech / Speech-to-Text APIs** – Accessibility tools
 
 ## Getting Started
 
@@ -39,3 +67,11 @@ Clone the repository and install dependencies:
 git clone https://github.com/kaushav07/VisionMate.git
 cd VisionMate
 pip install -r requirements.txt
+```
+## Contributing 
+
+We’d love your help! Please see [CONTRIBUTING.md](CONTRIBUTING.md) to learn how you can contribute.
+
+## 📄 License
+
+This project is licensed under the [MIT License](LICENSE).
\ No newline at end of file
diff --git a/__pycache__/config.cpython-313.pyc b/__pycache__/config.cpython-313.pyc
new file mode 100644
index 0000000..5a603bd
Binary files /dev/null and b/__pycache__/config.cpython-313.pyc differ
diff --git a/__pycache__/tts_utils.cpython-313.pyc b/__pycache__/tts_utils.cpython-313.pyc
new file mode 100644
index 0000000..6ea8951
Binary files /dev/null and b/__pycache__/tts_utils.cpython-313.pyc differ
diff --git a/app/api/__init__.py b/app/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/api/perception/vision/__init__.py b/app/api/perception/vision/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/data/__init__.py b/app/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/data/users/dummy_user/pictures/Paul.jpeg b/app/data/users/dummy_user/pictures/Paul.jpeg
new file mode 100644
index 0000000..fb881c9
Binary files /dev/null and b/app/data/users/dummy_user/pictures/Paul.jpeg differ
diff --git a/app/data/users/dummy_user/pictures/Peter.jpeg b/app/data/users/dummy_user/pictures/Peter.jpeg
new file mode 100644
index 0000000..8b645b6
Binary files /dev/null and b/app/data/users/dummy_user/pictures/Peter.jpeg differ
diff --git a/app/services/__init__.py b/app/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/services/authentication/auth_utils.py b/app/services/authentication/auth_utils.py
new file mode 100644
index 0000000..e9878f6
--- /dev/null
+++ b/app/services/authentication/auth_utils.py
@@ -0,0 +1,4 @@
+# app/services/authentication/auth_utils.py
+"""
+This file contains utility functions for authentication-related tasks.
+"""
\ No newline at end of file
diff --git a/app/services/authentication/password_utils.py b/app/services/authentication/password_utils.py
new file mode 100644
index 0000000..066e788
--- /dev/null
+++ b/app/services/authentication/password_utils.py
@@ -0,0 +1,4 @@
+# app/services/authentication/password_utils.py
+"""
+This file contains utility functions for password-related tasks.
+"""
\ No newline at end of file
diff --git a/app/services/perception/audio/__init__.py b/app/services/perception/audio/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/services/perception/audio/stt_utils.py b/app/services/perception/audio/stt_utils.py
new file mode 100644
index 0000000..dbaf069
--- /dev/null
+++ b/app/services/perception/audio/stt_utils.py
@@ -0,0 +1,4 @@
+# app/services/perception/audio/stt_utils.py
+"""
+This file is for Speech-to-text (STT) logic utilities.
+"""
diff --git a/app/services/perception/audio/tts_utils.py b/app/services/perception/audio/tts_utils.py
new file mode 100644
index 0000000..8a8ecf3
--- /dev/null
+++ b/app/services/perception/audio/tts_utils.py
@@ -0,0 +1,4 @@
+# app/services/perception/audio/tts_utils.py
+"""
+This file is for Text-to-speech (TTS) logic utilities.
+"""
diff --git a/app/services/perception/audio/voice_utils.py b/app/services/perception/audio/voice_utils.py
new file mode 100644
index 0000000..94135e0
--- /dev/null
+++ b/app/services/perception/audio/voice_utils.py
@@ -0,0 +1,4 @@
+# app/services/perception/audio/voice_utils.py
+"""
+This file is for voice related utilities.
+"""
\ No newline at end of file
diff --git a/app/services/perception/vision/__init__.py b/app/services/perception/vision/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/services/perception/vision/face_utils.py b/app/services/perception/vision/face_utils.py
new file mode 100644
index 0000000..49db945
--- /dev/null
+++ b/app/services/perception/vision/face_utils.py
@@ -0,0 +1,4 @@
+# app/services/perception/vision/face_utils.py
+"""
+This file contains all the utility services related to face.
+"""
\ No newline at end of file
diff --git a/app/services/perception/vision/gesture_utils.py b/app/services/perception/vision/gesture_utils.py
new file mode 100644
index 0000000..5d50d0a
--- /dev/null
+++ b/app/services/perception/vision/gesture_utils.py
@@ -0,0 +1,4 @@
+# app/services/perception/vision/gesture_utils.py
+"""
+This file is for hand/pose gesture utilities.
+"""
\ No newline at end of file
diff --git a/app/services/perception/vision/object_utils.py b/app/services/perception/vision/object_utils.py
new file mode 100644
index 0000000..8ce1a18
--- /dev/null
+++ b/app/services/perception/vision/object_utils.py
@@ -0,0 +1,4 @@
+# app/services/perception/vision/object_utils.py
+"""
+This file contains all the utility services related to objects.
+"""
\ No newline at end of file
diff --git a/app/services/perception/vision/scene_analysis.py b/app/services/perception/vision/scene_analysis.py
new file mode 100644
index 0000000..d6c675d
--- /dev/null
+++ b/app/services/perception/vision/scene_analysis.py
@@ -0,0 +1,4 @@
+# app/services/perception/vision/scene_analysis.py
+"""
+This file is for scene analysis and description utilities.
+"""
\ No newline at end of file
diff --git a/app/services/storage/database_storage.py b/app/services/storage/database_storage.py
new file mode 100644
index 0000000..ca3773b
--- /dev/null
+++ b/app/services/storage/database_storage.py
@@ -0,0 +1,4 @@
+# app/services/storage/database_storage.py
+"""
+This file is for utilities to Save & retrieve from database.
+"""
\ No newline at end of file
diff --git a/app/services/storage/file_storage.py b/app/services/storage/file_storage.py
new file mode 100644
index 0000000..31eb877
--- /dev/null
+++ b/app/services/storage/file_storage.py
@@ -0,0 +1,4 @@
+# app/services/storage/file_storage.py
+"""
+This file is for utilities to Save & retrieve files locally.
+"""
\ No newline at end of file
diff --git a/app/shared/__init__.py b/app/shared/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/shared/config.py b/app/shared/config.py
new file mode 100644
index 0000000..f3d3c1c
--- /dev/null
+++ b/app/shared/config.py
@@ -0,0 +1,4 @@
+# app/shared/config.py
+"""
+This file contains configuration variables and methods for the VisionMate application.
+"""
\ No newline at end of file
diff --git a/app/shared/logger.py b/app/shared/logger.py
new file mode 100644
index 0000000..7660851
--- /dev/null
+++ b/app/shared/logger.py
@@ -0,0 +1,4 @@
+# app/shared/logger.py
+"""
+This file contains Centralized logging setup.
+"""
\ No newline at end of file
diff --git a/app/shared/paths.py b/app/shared/paths.py
new file mode 100644
index 0000000..daffb53
--- /dev/null
+++ b/app/shared/paths.py
@@ -0,0 +1,4 @@
+# app/shared/paths.py
+"""
+This file defines paths and utility functions for managing user directories in the VisionMate application.
+"""
diff --git a/app/shared/utils.py b/app/shared/utils.py
new file mode 100644
index 0000000..7361ccc
--- /dev/null
+++ b/app/shared/utils.py
@@ -0,0 +1,4 @@
+# app/shared/utils.py
+"""
+This file contains utility functions for the VisionMate application. Common utility methods are defined here.
+"""
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..64a2631
--- /dev/null
+++ b/config.py
@@ -0,0 +1,11 @@
+# config.py
+# Select the text-to-speech engine.
+# Options:
+#   "gTTS"     → uses Google Text-to-Speech (online, clearer)
+#   "pyttsx3"  → uses system speech engine (offline, basic)
+
+TTS_ENGINE = "gTTS"  # or "pyttsx3"
+
+# IP Webcam video stream URL (replace the address with your phone's IP).
+# Make sure your phone and laptop are on the same Wi-Fi
+IP_WEBCAM_URL = "http://192.168.29.169:8080/video"
\ No newline at end of file
diff --git a/dummy_user_data/pictures/Paul.jpeg b/dummy_user_data/pictures/Paul.jpeg
new file mode 100644
index 0000000..fb881c9
Binary files /dev/null and b/dummy_user_data/pictures/Paul.jpeg differ
diff --git a/dummy_user_data/pictures/Peter.jpeg b/dummy_user_data/pictures/Peter.jpeg
new file mode 100644
index 0000000..8b645b6
Binary files /dev/null and b/dummy_user_data/pictures/Peter.jpeg differ
diff --git a/face_utils.py b/face_utils.py
new file mode 100644
index 0000000..0ad584b
--- /dev/null
+++ b/face_utils.py
@@ -0,0 +1,77 @@
+import insightface
+import numpy as np, math
+import cv2
+from load_model import load_face_model
+
+# Global model instance, created once when this module is imported
+model = load_face_model()
+FACE_MATCH_THRESHOLD = 0.6  # default `threshold` of face_recog
+
+def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
+    """
+    Calculate cosine similarity between two vectors.
+
+    Args:
+        vec_a: First vector (list of numbers or 1-D NumPy array).
+        vec_b: Second vector, with the same length as vec_a.
+
+    Returns:
+        Cosine similarity as a plain Python float between -1 and 1.
+
+    Raises:
+        ValueError: If the lengths differ or either vector has zero norm.
+    """
+    # Vectorized in float64: face_recog passes NumPy embeddings, and looping
+    # over their elements in Python is slow.
+    a = np.asarray(vec_a, dtype=np.float64)
+    b = np.asarray(vec_b, dtype=np.float64)
+    if a.shape != b.shape:
+        raise ValueError("Vectors must be of the same length.")
+
+    norm_product = float(np.linalg.norm(a) * np.linalg.norm(b))
+    if norm_product == 0:
+        raise ValueError("Vectors must not be zero-length.")
+
+    return float(np.dot(a, b)) / norm_product
+
+def detect_faces(image: np.ndarray) -> list[dict]:
+    """Return one dict (embedding, bbox, landmarks) per face found in ``image``."""
+    return [
+        {
+            "embedding": detected.embedding,
+            "bbox": detected.bbox.tolist(),
+            "landmarks": detected.landmark_2d_106.tolist(),
+        }
+        # The module-level InsightFace model performs the actual detection.
+        for detected in model.get(image)
+    ]
+
+def face_recog(
+    embedding: np.ndarray,
+    known_encodings: dict[str, np.ndarray],
+    threshold: float = FACE_MATCH_THRESHOLD,
+    return_score: bool = False
+) -> tuple[bool, str, float | None]:
+    """Match ``embedding`` against ``known_encodings``.
+
+    Returns (matched, name or None, best score or None when not requested).
+    """
+    top_score, top_name = -1, None
+    for candidate, reference in known_encodings.items():
+        similarity = cosine_similarity(embedding, reference)
+        if similarity > top_score:
+            top_score, top_name = similarity, candidate
+    reported = top_score if return_score else None
+    # A match needs a similarity above 1 - threshold.
+    if not top_score > (1 - threshold):
+        return False, None, reported
+    return True, top_name, reported
+
+def crop_face(image: np.ndarray, bbox: list[int], margin: int = 10) -> np.ndarray:
+    """Crop ``bbox`` (x1, y1, x2, y2) plus ``margin`` pixels, clamped to the image."""
+    left, top, right, bottom = (int(coord) for coord in bbox)
+    height, width = image.shape[:2]
+    # Grow the box by the margin without leaving the image bounds.
+    rows = slice(max(top - margin, 0), min(bottom + margin, height))
+    cols = slice(max(left - margin, 0), min(right + margin, width))
+    return image[rows, cols]
diff --git a/load_model.py b/load_model.py
new file mode 100644
index 0000000..61bb9ae
--- /dev/null
+++ b/load_model.py
@@ -0,0 +1,54 @@
+import os
+import zipfile
+import requests
+from pathlib import Path
+from insightface.app import FaceAnalysis
+
+# Model config: which InsightFace model to use and where its files live
+FACE_MODEL_NAME = "buffalo_l"
+FACE_MODEL_PROVIDERS = ["CPUExecutionProvider"]  # passed to FaceAnalysis(providers=...)
+FACE_MODEL_URL = f"https://github.com/deepinsight/insightface/releases/download/v0.7/{FACE_MODEL_NAME}.zip"
+FACE_MODEL_DIR = Path(os.getcwd()) / "models" / FACE_MODEL_NAME  # relative to the current working directory
+FACE_ZIP_PATH = FACE_MODEL_DIR.parent / f"{FACE_MODEL_NAME}.zip"  # temporary download location
+FACE_REQUIRED_FILES = [  # every file must exist or the model is downloaded again
+    "1k3d68.onnx",
+    "2d106det.onnx",
+    "det_10g.onnx",
+    "genderage.onnx",
+    "w600k_r50.onnx"
+]
+
+
+def _ensure_model_exists(model_path, required_files, model_url, zip_path) -> Path:
+    """Download and extract the model unless every required file is already present."""
+    if model_path.exists() and all((model_path / f).exists() for f in required_files):
+        return model_path
+    model_path.mkdir(parents=True, exist_ok=True)
+
+    print("Downloading buffalo_l model...")
+    # Fail fast on HTTP errors and stalled connections rather than saving a bad zip.
+    with requests.get(model_url, stream=True, timeout=30) as response:
+        response.raise_for_status()
+        with open(zip_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+    print("Extracting model...")
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(model_path)
+    zip_path.unlink()
+    print("Model is ready.")
+    return model_path
+
+
+def load_face_model() -> FaceAnalysis:
+    """Return a prepared InsightFace FaceAnalysis for the configured face model."""
+    # Make sure the model files are on disk first (downloads them if missing).
+    weights_dir = _ensure_model_exists(
+        FACE_MODEL_DIR, FACE_REQUIRED_FILES, FACE_MODEL_URL, FACE_ZIP_PATH
+    )
+
+    # The model directory path itself is passed as the model name.
+    analyzer = FaceAnalysis(name=str(weights_dir), providers=FACE_MODEL_PROVIDERS)
+    analyzer.prepare(ctx_id=0)
+    return analyzer
diff --git a/main.py b/main.py
index 26160be..38a6ea3 100644
--- a/main.py
+++ b/main.py
@@ -1,43 +1,110 @@
 import cv2
-import pyttsx3
-import google.generativeai as genai
 import numpy as np
-import time
 import threading
+import os
+import base64
+
 import speech_recognition as sr
 from PIL import Image
-import io
+from dotenv import load_dotenv
+import socket
+import google.generativeai as genai
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.prompts import PromptTemplate
+from langchain_core.messages import HumanMessage
 
-# Initialize TTS engine
-engine = pyttsx3.init()
 
-def speak(text):
-    print("Speaking:", text)
-    engine.say(text)
-    engine.runAndWait()
+from tts_utils import speak
+from config import TTS_ENGINE, IP_WEBCAM_URL
 
-# Initialize Gemini
-genai.configure(api_key="API KEY")
-model = genai.GenerativeModel(model_name="gemini-1.5-flash")
+# Load Gemini API key from .env
+load_dotenv()
+api_key = os.getenv("API_KEY")
 
-# IP Webcam URL (replace with your phone's IP)
-url = 'http://10.134.93.78:8080/video'  # Update IP
-cap = cv2.VideoCapture(url)
+
+# Export the key loaded from .env before building the LangChain client, which
+# looks up GOOGLE_API_KEY in the environment. Never hard-code a key here.
+# (os.environ only accepts strings, hence the guard for a missing API_KEY.)
+if api_key:
+    os.environ["GOOGLE_API_KEY"] = api_key
+
+# Direct Gemini SDK client.
+genai.configure(api_key=api_key)
+model = genai.GenerativeModel(model_name="gemini-1.5-flash")
+
+# LangChain chat model that process_frame sends its image prompt to.
+llm = ChatGoogleGenerativeAI(
+    model="gemini-2.0-flash",
+    temperature=0,
+    timeout=None,
+    max_retries=2,
+)
+
+def is_connected():
+    try:  # Probe connectivity; the context manager closes the probe socket.
+        with socket.create_connection(("www.google.com", 80), timeout=2):
+            return True
+    except OSError:
+        return False
+
+# Camera: IP Webcam (phone camera)
+cap = cv2.VideoCapture(IP_WEBCAM_URL)
 
 status = "Press 's' or say 'scan' to scan surroundings..."
-scan_triggered = False  # Flag for voice activation
+scan_triggered = False
 
 def process_frame(frame):
-    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    pil_image = Image.fromarray(rgb_frame)
-    response = model.generate_content([
-        "Describe this scene briefly for a blind user. If there are any signs like STOP or traffic lights, mention them clearly.",
-        pil_image
-    ])
-    description = response.text.strip()
-    return description
-
-# Voice recognition function
+    """Describe a camera frame in one or two sentences for a blind user.
+
+    The frame is PNG-encoded, base64-wrapped and sent together with a text prompt.
+
+    Args:
+        frame: OpenCV image (BGR), such as a frame read from the camera.
+
+    Returns:
+        The description returned by the Gemini chat model, or a fixed
+        fallback message if building or sending the request raises.
+    """
+    # Encode the frame exactly once as an in-memory PNG. The request only
+    # needs this PNG, so no RGB or PIL copy of the frame is created and the
+    # (comparatively expensive) encode is not repeated.
+    _, buffer = cv2.imencode('.png', frame)
+
+    # Convert the PNG image bytes to a base64-encoded string for the data URL
+    encoded_image = base64.b64encode(buffer).decode('utf-8')
+
+    try:
+        # Create a HumanMessage for the Gemini model, combining text and image
+        message = HumanMessage(
+            content=[
+                # Text prompt asking for a brief visual description for the blind
+                {
+                    "type": "text",
+                    "text": (
+                        "Provide a short, clear, and concise description of this scene "
+                        "(1–2 sentences) for a blind person. Focus only on key visual elements "
+                        "or signs like STOP signs, vehicles, people, or traffic lights."
+                    )
+                },
+                # Embed the base64 image data as an image input
+                {
+                    "type": "image_url",
+                    "image_url": f"data:image/png;base64,{encoded_image}"
+                },
+            ]
+        )
+
+        # Send the prompt to Gemini model using LangChain wrapper and return the description
+        response = llm.invoke([message])
+        return response.content
+
+    except Exception as e:
+        # Best-effort boundary: a failed model call must not crash the capture
+        # loop, so log the error and return a message that can be spoken.
+        print(f"[Gemini Error] {e}")
+        return "Unable to analyze surroundings due to internet issue."
+
+
 def listen_for_scan():
     global scan_triggered
     recognizer = sr.Recognizer()
@@ -50,44 +117,38 @@ def listen_for_scan():
     while True:
         with mic as source:
             try:
-                print("Listening...")
                 audio = recognizer.listen(source, timeout=3, phrase_time_limit=5)
                 command = recognizer.recognize_google(audio).lower()
-                print(f"Recognized: {command}")
                 if "scan" in command:
-                    print("Voice command 'scan' detected.")
                     scan_triggered = True
-            except sr.WaitTimeoutError:
-                print("Listening timed out, retrying...")
-            except sr.UnknownValueError:
-                print("Could not understand audio.")
-            except sr.RequestError as e:
-                print(f"Speech recognition service error: {e}")
-
-# Start voice recognition in a separate thread
+            except (sr.WaitTimeoutError, sr.UnknownValueError, sr.RequestError):
+                pass
+
+# Start voice recognition in background
 voice_thread = threading.Thread(target=listen_for_scan, daemon=True)
 voice_thread.start()
 
+if not is_connected():
+    speak("Warning. You are offline. Scene analysis will not work.")
+
 while True:
     ret, frame = cap.read()
     if not ret:
-        print("Camera error.")
+        print("Camera error. Check your IP Webcam app and Wi-Fi.")
         continue
 
-    cv2.putText(frame, status, (20, 50), cv2.FONT_HERSHEY_SIMPLEX,
-                0.8, (0, 255, 0), 2)
-
+    cv2.putText(frame, status, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
     cv2.imshow("Blind Assist Tool", frame)
     key = cv2.waitKey(1) & 0xFF
 
     if key == ord('s') or scan_triggered:
-        scan_triggered = False  # reset flag
+        scan_triggered = False
         status = "Analyzing surroundings..."
         speak("Analyzing surroundings")
         desc = process_frame(frame)
         speak(desc)
 
-        # Detect important keywords
+        # Alert on specific signs
         if "stop sign" in desc.lower() or "stop" in desc.lower():
             speak("Stop! There's a stop sign.")
         elif "red light" in desc.lower():
diff --git a/requirements.txt b/requirements.txt
index 804c823..9f37db8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,13 @@
 opencv-python
 pyttsx3
+gTTS
 google-generativeai
 numpy
 Pillow
 SpeechRecognition
 pyaudio
+playsound
+python-dotenv
+langchain-google-genai
+langchain-core
+insightface
\ No newline at end of file
diff --git a/scan_logger.py b/scan_logger.py
new file mode 100644
index 0000000..926fad6
--- /dev/null
+++ b/scan_logger.py
@@ -0,0 +1,36 @@
+import datetime
+
+
+scan_history = []
+
+# Record one scan in the in-memory history.
+def log_scan(caption, user_command):
+    """Append a timestamped entry for this scan to the module-level scan_history."""
+    logged_at = datetime.datetime.now()
+    scan_history.append({
+        "timestamp": logged_at.strftime('%d-%m-%Y, %H:%M:%S'),  # local time
+        "caption": caption,
+        "user_command": user_command,
+    })
+    print("✅ Logged entry successfully!")
+
+# Print the whole in-memory scan history.
+def show_history():
+    """Print every logged scan, numbered from 1, or a notice when there are none."""
+    if scan_history:
+        print("\n📜 Scan History:")
+        for number, record in enumerate(scan_history, start=1):
+            print(f"\n🔹 Entry {number}")
+            print(f"   🕒 Time: {record['timestamp']}")
+            print(f"   🖼️ Caption: {record['caption']}")
+            print(f"   💬 User Command: {record['user_command']}")
+    else:
+        print("\n📭 No scan history available.")
+
+# Demo: when run as a script, log three sample scans and print the history.
+if __name__ == "__main__":
+    log_scan("A man passing the main road", "Alert User")
+    log_scan("A family roaming in a busy market", "Describe the surroundings")
+    log_scan("Animal moving freely in the zoo", "Describe the surroundings")
+
+    show_history()
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_face_utils.py b/tests/test_face_utils.py
new file mode 100644
index 0000000..4f939ac
--- /dev/null
+++ b/tests/test_face_utils.py
@@ -0,0 +1,43 @@
+# tests/test_face_utils.py
+import pytest
+import numpy as np
+import pathlib
+import cv2
+import sys
+import os
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
+from face_utils import (
+    detect_faces,
+    face_recog,
+)
+
+# Repository root: the sample pictures live in <root>/dummy_user_data/pictures.
+APP_PATH = pathlib.Path(__file__).resolve().parent.parent
+def test_detect_faces():
+    """detect_faces finds a face in the sample picture and returns the expected keys."""
+    image_path = APP_PATH / "dummy_user_data/pictures/Paul.jpeg"
+    image = cv2.imread(str(image_path))
+    # cv2.imread returns None instead of raising when the file cannot be read.
+    assert image is not None, f"Could not read test image: {image_path}"
+    faces = detect_faces(image)
+    assert len(faces) > 0
+    for key in ("embedding", "bbox", "landmarks"):
+        assert key in faces[0]
+
+def test_face_recog():
+    """face_recog matches an embedding against itself and hides the score by default."""
+    name = "Paul"
+    image_path = APP_PATH / f"dummy_user_data/pictures/{name}.jpeg"
+    image = cv2.imread(str(image_path))
+    # cv2.imread returns None instead of raising when the file cannot be read.
+    assert image is not None, f"Could not read test image: {image_path}"
+    faces = detect_faces(image)
+    assert len(faces) > 0
+    face_embedding = faces[0]['embedding']
+
+    # Using the same embedding as the known encoding guarantees a match.
+    known_encodings = {name: face_embedding}
+    matched, matched_name, score = face_recog(face_embedding, known_encodings)
+    assert matched is True
+    assert matched_name == name
+    assert score is None  # return_score defaults to False
diff --git a/tts_utils.py b/tts_utils.py
new file mode 100644
index 0000000..6664625
--- /dev/null
+++ b/tts_utils.py
@@ -0,0 +1,51 @@
+"""
+tts_utils.py
+
+Handles Text-to-Speech functionality for the Blind Assist Tool.
+Supports both gTTS (online) and pyttsx3 (offline) engines with fallback.
+
+Usage:
+- Import the speak() function and call speak("your message")
+- Engine can be set in config.py using TTS_ENGINE = "gTTS" or "pyttsx3"
+
+"""
+
+from config import TTS_ENGINE
+import pyttsx3
+from gtts import gTTS
+from playsound import playsound
+import tempfile
+import os
+
+# Create the shared pyttsx3 engine at import time, only if it is the selected engine
+if TTS_ENGINE == "pyttsx3":
+    engine = pyttsx3.init()
+
+def speak(text):
+    """Speak ``text`` with the engine selected by TTS_ENGINE (gTTS falls back to pyttsx3)."""
+    print(f"Speaking ({TTS_ENGINE}): {text}")
+    if TTS_ENGINE == "pyttsx3":
+        try:
+            engine.say(text)
+            engine.runAndWait()
+        except Exception as e:
+            print(f"[pyttsx3 Error] {e}")
+    elif TTS_ENGINE == "gTTS":
+        try:
+            # Write the mp3 into a temporary directory: the file is not held open
+            # while gTTS saves and playsound reads it, and it is cleaned up on errors too.
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                mp3_path = os.path.join(tmp_dir, "speech.mp3")
+                gTTS(text=text, lang='en').save(mp3_path)
+                playsound(mp3_path)
+        except Exception as e:
+            print(f"[gTTS Error] {e}. Falling back to pyttsx3.")
+            fallback_speak(text)
+
+def fallback_speak(text):
+    try:  # best-effort: use a fresh pyttsx3 engine; failures are only printed
+        offline_engine = pyttsx3.init()
+        offline_engine.say(text)
+        offline_engine.runAndWait()
+    except Exception as e:
+        print(f"[Fallback pyttsx3 Error] {e}")