theapprenticeproject · ravencore06 · May 3, 2026 · May 9, 2026 · May 14, 2026 · May 21, 2026
diff --git a/README.md b/README.md
@@ -1 +1,13 @@
-# C4GT_2026
+# theApprenticeProject (C4GT 2026)
+
+This repository contains two distinct AI initiatives developed for The Apprentice Project:
+
+## 1. Voice-Based Conversational AI System
+A voice-based AI system that captures audio, generates conversational responses using an LLM, and converts the responses back to speech.
+- **Key Files**: `asr.py`, `llm.py`, `tts.py`, `main.py`
+- **Dependencies**: See `./requirements.txt`
+
+## 2. VLM Evaluation Pipeline
+A cost-efficient Vision Language Model (VLM) pipeline designed to evaluate student artifacts (images/videos) against 21st-century skills rubrics.
+- **Key Directory**: `vlm_evaluation/`
+- **Dependencies**: See `vlm_evaluation/requirements.txt`
diff --git a/__pycache__/asr.cpython-313.pyc b/__pycache__/asr.cpython-313.pyc
diff --git a/__pycache__/llm.cpython-313.pyc b/__pycache__/llm.cpython-313.pyc
diff --git a/__pycache__/tts.cpython-313.pyc b/__pycache__/tts.cpython-313.pyc
diff --git a/asr.py b/asr.py
@@ -0,0 +1,33 @@
+import speech_recognition as sr
+
+
+def capture_audio():
+    """Record one utterance from the microphone and return its transcription.
+
+    Progress and error messages are printed to stdout. Returns the text
+    produced by ``recognize_google``, or ``None`` when the audio could not be
+    understood or the recognition request failed.
+    """
+    rec = sr.Recognizer()
+    with sr.Microphone() as mic:
+        print("\nListening...")
+        # Adjust for ambient noise over 0.5 s before capturing the utterance.
+        rec.adjust_for_ambient_noise(mic, duration=0.5)
+        recording = rec.listen(mic)
+
+    try:
+        transcript = rec.recognize_google(recording)
+    except sr.UnknownValueError:
+        print("Sorry, I could not understand the audio.")
+        return None
+    except sr.RequestError as err:
+        print(f"Could not request results from Google Speech Recognition service; {err}")
+        return None
+
+    print(f"You said: {transcript}")
+    return transcript
+
+
+def validate_transcription(text):
+    """Check that a transcription is usable as chat input.
+
+    Returns ``(True, None)`` when the whitespace-stripped text is between 2
+    and 500 characters long, otherwise ``(False, reason)`` where ``reason``
+    is a short human-readable message.
+    """
+    if text is None:
+        return False, "No speech detected."
+
+    length = len(text.strip())
+    if length == 0:
+        return False, "Empty transcription."
+    if length > 500:
+        return False, "Input too long."
+    if length < 2:
+        return False, "Input too short."
+    return True, None
diff --git a/llm.py b/llm.py
@@ -0,0 +1,54 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+
+class ConversationalAgent:
+    """Chat agent that keeps a rolling conversation history for a causal LM.
+
+    The tokenizer and model are loaded once in ``__init__``. Each call to
+    ``generate_response`` appends the user's turn to the stored history,
+    samples a reply, and stores the full generated sequence as the new
+    history.
+    """
+
+    # Longest user turn, in tokenizer tokens, accepted by generate_response.
+    MAX_INPUT_TOKENS = 200
+    # Longest user turn in characters; checked before tokenizing.
+    MAX_INPUT_CHARS = 1000
+    # Number of trailing history tokens carried into the next prompt.
+    HISTORY_WINDOW_TOKENS = 100
+    # Upper bound on prompt-plus-reply length passed to generate().
+    MAX_TOTAL_TOKENS = 1000
+    # Returned when the model produces no visible text, so that callers
+    # (e.g. text-to-speech) never receive an empty string.
+    EMPTY_REPLY_FALLBACK = "I'm not sure how to respond to that."
+
+    def __init__(self, model_name="microsoft/DialoGPT-small"):
+        """Load the tokenizer and model weights.
+
+        Args:
+            model_name: Hugging Face model id passed to ``from_pretrained``.
+                Defaults to the DialoGPT-small checkpoint.
+        """
+        # Show only the short model name (the part after the last "/").
+        print(f"Loading local conversational model ({model_name.split('/')[-1]})...")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name)
+        self.chat_history_ids = None
+
+    def generate_response(self, user_input):
+        """Return the model's reply to ``user_input``.
+
+        Blank or over-long input short-circuits with a canned message and
+        leaves the conversation history untouched. Otherwise the reply is
+        sampled from the model and the history is updated. If the decoded
+        reply is blank, ``EMPTY_REPLY_FALLBACK`` is returned instead.
+        """
+        if not user_input or not user_input.strip():
+            return "I didn't catch that. Could you please repeat?"
+
+        if len(user_input) > self.MAX_INPUT_CHARS:
+            return "That's quite long! Could you keep it shorter?"
+
+        input_ids = self.tokenizer.encode(user_input, return_tensors="pt")
+        if input_ids.shape[1] > self.MAX_INPUT_TOKENS:
+            # The figure in the message is derived from the limit so the two
+            # cannot drift apart.
+            return (
+                f"I can only process about {self.MAX_INPUT_TOKENS} words at a time. "
+                "Please say that in fewer words."
+            )
+
+        # The end-of-sequence token marks the end of the user's turn.
+        new_user_input_ids = self.tokenizer.encode(
+            user_input + self.tokenizer.eos_token, return_tensors="pt"
+        )
+
+        if self.chat_history_ids is not None:
+            # Keep only the tail of the history to bound the prompt length.
+            recent_history = self.chat_history_ids[:, -self.HISTORY_WINDOW_TOKENS:]
+            bot_input_ids = torch.cat([recent_history, new_user_input_ids], dim=-1)
+        else:
+            bot_input_ids = new_user_input_ids
+
+        attention_mask = torch.ones(bot_input_ids.shape, dtype=torch.long)
+
+        self.chat_history_ids = self.model.generate(
+            bot_input_ids,
+            attention_mask=attention_mask,
+            max_length=self.MAX_TOTAL_TOKENS,
+            pad_token_id=self.tokenizer.eos_token_id,
+            no_repeat_ngram_size=3,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95,
+            temperature=0.7,
+        )
+
+        # Everything after the prompt is the newly generated reply.
+        reply_ids = self.chat_history_ids[:, bot_input_ids.shape[-1]:][0]
+        response = self.tokenizer.decode(reply_ids, skip_special_tokens=True)
+        if not response.strip():
+            return self.EMPTY_REPLY_FALLBACK
+        return response
diff --git a/main.py b/main.py
@@ -0,0 +1,32 @@
+from asr import capture_audio, validate_transcription
+from llm import ConversationalAgent
+from tts import text_to_speech
+
+
+def main():
+    """Run the listen -> reply -> speak loop until the user says an exit word.
+
+    Each turn captures audio, validates the transcription, and either skips
+    the turn (invalid input), says goodbye and stops (exit word), or speaks
+    the agent's response.
+    """
+    banner_rule = "====================================================="
+    print(banner_rule)
+    print("Initializing Voice-Based Conversational AI System...")
+    print(banner_rule)
+    agent = ConversationalAgent()
+    print("\nSystem ready! Speak into your microphone.")
+    print("Say 'exit', 'quit', or 'stop' to end the conversation.")
+
+    exit_words = ("exit", "quit", "stop")
+    while True:
+        heard = capture_audio()
+
+        is_ok, reason = validate_transcription(heard)
+        if not is_ok:
+            print(f"Validation: {reason}")
+            continue
+
+        utterance = heard.strip()
+        if utterance.lower() in exit_words:
+            text_to_speech("Goodbye!")
+            return
+
+        text_to_speech(agent.generate_response(utterance))
+
+
+# Start the voice loop only when this file is executed as a script,
+# not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,5 @@
+SpeechRecognition
+PyAudio==0.2.14
+transformers==4.38.2
+torch
+pyttsx3==2.90
diff --git a/tts.py b/tts.py
@@ -0,0 +1,13 @@
+import pyttsx3
+
+def text_to_speech(text):
+    """Echo ``text`` to stdout and speak it with the offline pyttsx3 engine.
+
+    Blocks until the engine has finished speaking.
+    """
+    engine = pyttsx3.init()
+
+    # Speak slightly slower than the engine's current rate for clarity.
+    slower_rate = engine.getProperty('rate') - 20
+    engine.setProperty('rate', slower_rate)
+
+    print(f"AI: {text}")
+    engine.say(text)
+    engine.runAndWait()
diff --git a/vlm_evaluation/__pycache__/dataset.cpython-313.pyc b/vlm_evaluation/__pycache__/dataset.cpython-313.pyc
diff --git a/vlm_evaluation/__pycache__/evaluate.cpython-313.pyc b/vlm_evaluation/__pycache__/evaluate.cpython-313.pyc
diff --git a/vlm_evaluation/__pycache__/prompts.cpython-313.pyc b/vlm_evaluation/__pycache__/prompts.cpython-313.pyc
diff --git a/vlm_evaluation/dataset.py b/vlm_evaluation/dataset.py
@@ -0,0 +1,49 @@
+import json
+import os
+from PIL import Image
+
+class ArtifactDataset:
+    """Indexable collection of evaluation records loaded from a JSON file.
+
+    Indexing returns the record's metadata together with its image decoded
+    as RGB, or ``None`` for the image when it cannot be loaded.
+    """
+
+    def __init__(self, data_path: str):
+        """
+        Initializes the dataset loader.
+        Assumes data_path points to a JSON file containing evaluation metadata:
+        [
+            {
+                "image_path": "data/images/student1.jpg",
+                "student_id": "123",
+                "artifact_type": "Origami",
+                "rubric": "1: No effort, 5: Perfect folds and presentation",
+                "ground_truth_score": 4
+            }, ...
+        ]
+        A missing file is not an error: a warning is printed and the dataset
+        is left empty.
+        """
+        self.data_path = data_path
+        self.data = []
+
+        if os.path.exists(data_path):
+            # JSON is UTF-8 by specification; do not rely on the locale default.
+            with open(data_path, 'r', encoding="utf-8") as f:
+                self.data = json.load(f)
+        else:
+            print(f"Warning: Dataset file {data_path} not found. Returning empty dataset.")
+            print("Please create this file or generate a sample dataset.")
+
+    def __len__(self):
+        """Return the number of records loaded from the JSON file."""
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        """Return ``{"image": <RGB image or None>, "metadata": <record>}``.
+
+        Relative image paths are resolved against the directory of the JSON
+        file. Loading is best-effort: any failure is printed and reported to
+        the caller as ``image=None`` rather than raised.
+        """
+        item = self.data[idx]
+        image_path = item.get("image_path")
+        image = None
+
+        if not image_path:
+            # Handle the missing key explicitly instead of relying on a
+            # TypeError from os.path being swallowed below.
+            print(f"Error loading image {image_path}: record has no image_path")
+        else:
+            try:
+                # Handle absolute or relative paths gracefully based on the json directory
+                base_dir = os.path.dirname(self.data_path)
+                if os.path.isabs(image_path):
+                    full_image_path = image_path
+                else:
+                    full_image_path = os.path.join(base_dir, image_path)
+                # convert() returns an independent copy, so the source file
+                # handle can be closed as soon as the block exits.
+                with Image.open(full_image_path) as source_image:
+                    image = source_image.convert("RGB")
+            except Exception as e:
+                print(f"Error loading image {image_path}: {e}")
+                image = None
+
+        return {
+            "image": image,
+            "metadata": item
+        }
diff --git a/vlm_evaluation/evaluate.py b/vlm_evaluation/evaluate.py
@@ -0,0 +1,189 @@
+import argparse
+import json
+import os
+import re
+import torch
+from tqdm import tqdm
+from pydantic import BaseModel
+from lmformatenforcer import JsonSchemaParser
+from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn
+from transformers import (
+    LlavaForConditionalGeneration,
+    AutoProcessor,
+    BitsAndBytesConfig,
+)
+from dataset import ArtifactDataset
+from prompts import SYSTEM_PROMPT, generate_evaluation_prompt
+
+
+# Pydantic model describing the JSON object the VLM is expected to emit.
+# main() derives a JSON schema from it and hands that schema to
+# JsonSchemaParser to constrain generation. Field names become JSON keys,
+# so renaming a field changes the required output format.
+class EvaluationOutput(BaseModel):
+    skill: str  # presumably the skill being assessed — confirm against prompts.py
+    dimension: str  # presumably the rubric dimension being scored — TODO confirm
+    score: int  # read back by extract_score(), which only accepts values 1..5
+    max: int  # presumably the maximum attainable score — TODO confirm
+
+
+def parse_args():
+    """Parse the evaluation pipeline's command-line options from ``sys.argv``.
+
+    Returns an ``argparse.Namespace`` with ``data_path`` (required),
+    ``model_name``, ``quantize``, ``output_path`` and ``max_new_tokens``.
+    """
+    cli = argparse.ArgumentParser(description="VLM Evaluation Pipeline")
+    add_option = cli.add_argument
+
+    add_option("--data_path", type=str, required=True, help="Path to dataset JSON")
+    add_option("--model_name", type=str, default="llava-hf/llava-1.5-7b-hf")
+    # Quantization is on by default; --no_quantize stores False in the same dest.
+    add_option("--quantize", action="store_true", default=True)
+    add_option("--no_quantize", dest="quantize", action="store_false")
+    add_option("--output_path", type=str, default="results.json")
+    add_option("--max_new_tokens", type=int, default=256)
+
+    return cli.parse_args()
+
+
+def load_model(model_name, quantize=True):
+    """Load a LLaVA model and its processor from ``model_name``.
+
+    When ``quantize`` is true the model is loaded with a 4-bit NF4
+    bitsandbytes configuration (double quantization, float16 compute);
+    otherwise no quantization config is passed. Returns ``(model, processor)``.
+    """
+    bnb_config = None
+    if quantize:
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+    vlm = LlavaForConditionalGeneration.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        quantization_config=bnb_config,
+    )
+    return vlm, AutoProcessor.from_pretrained(model_name)
+
+
+def extract_score(text):
+    """Extract the integer ``score`` field from a JSON model response.
+
+    Args:
+        text: Raw response text, expected to be a JSON object.
+
+    Returns:
+        The score when ``text`` parses to a JSON object whose ``"score"`` is
+        an integer from 1 to 5 inclusive; otherwise ``None``. Malformed JSON,
+        non-object JSON (lists, bare numbers), non-string input and boolean
+        scores all yield ``None`` instead of raising.
+    """
+    try:
+        parsed = json.loads(text)
+    except (json.JSONDecodeError, TypeError):
+        # TypeError covers text=None or other non-string input.
+        return None
+
+    # Valid JSON need not be an object; only a dict has a "score" key.
+    if not isinstance(parsed, dict):
+        return None
+
+    score = parsed.get("score")
+    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
+    if isinstance(score, bool) or not isinstance(score, int):
+        return None
+    if 1 <= score <= 5:
+        return score
+    return None
+
+
+def compute_metrics(predictions, ground_truths):
+    """Compute agreement metrics between predicted and ground-truth scores.
+
+    Args:
+        predictions: Predicted scores aligned with ``ground_truths``. An entry
+            may be ``None`` to mark a response that could not be parsed.
+        ground_truths: Reference scores.
+
+    Returns:
+        ``{}`` when ``ground_truths`` is empty, otherwise a dict with
+        ``total_samples``, ``exact_accuracy`` and ``within_1_accuracy``
+        (percentages of all samples, so unparsed predictions count as
+        misses), ``mean_absolute_error`` (averaged over parsed predictions
+        only; ``None`` when there are none) and ``parse_rate`` (percentage
+        of samples with a non-``None`` prediction).
+    """
+    total = len(ground_truths)
+    if total == 0:
+        return {}
+
+    # Absolute errors for the pairs that actually have a prediction; a None
+    # prediction cannot take part in arithmetic.
+    errors = [
+        abs(p - g) for p, g in zip(predictions, ground_truths) if p is not None
+    ]
+    parsed = len(errors)
+    exact = sum(1 for err in errors if err == 0)
+    within_1 = sum(1 for err in errors if err <= 1)
+    mae = round(sum(errors) / parsed, 4) if parsed else None
+
+    return {
+        "total_samples": total,
+        "exact_accuracy": round(exact / total * 100, 2),
+        "within_1_accuracy": round(within_1 / total * 100, 2),
+        "mean_absolute_error": mae,
+        "parse_rate": round(parsed / total * 100, 2),
+    }
+
+
+def main():
+    """Run the VLM evaluation end to end and write the results as JSON.
+
+    Loads the dataset and model named on the command line, generates a
+    schema-constrained JSON response for every sample whose image loads,
+    extracts the predicted score, and writes the config, sample counts,
+    aggregate metrics and per-sample results to ``--output_path``. Returns
+    early, writing nothing, when the dataset is empty.
+    """
+    args = parse_args()
+
+    if not torch.cuda.is_available():
+        print("Warning: CUDA not available. Inference will be slow on CPU.")
+
+    print(f"Loading dataset from {args.data_path}...")
+    dataset = ArtifactDataset(args.data_path)
+    if len(dataset) == 0:
+        print("Dataset is empty. Exiting.")
+        return
+
+    print(f"Loading model {args.model_name} (quantize={args.quantize})...")
+    model, processor = load_model(args.model_name, quantize=args.quantize)
+
+    # The output schema is the same for every sample, so build it once.
+    # model_json_schema() is tried first, with schema() as the fallback when
+    # that attribute does not exist.
+    try:
+        schema = EvaluationOutput.model_json_schema()
+    except AttributeError:
+        schema = EvaluationOutput.schema()
+
+    results = []
+    preds = []
+    truths = []
+    # Samples dropped from the metrics are counted so they stay visible.
+    skipped_images = 0
+    unparsed_responses = 0
+
+    for i in tqdm(range(len(dataset)), desc="Evaluating"):
+        sample = dataset[i]
+        meta = sample["metadata"]
+        image = sample["image"]
+
+        if image is None:
+            skipped_images += 1
+            continue
+
+        student_id = meta.get("student_id", "unknown")
+        artifact_type = meta.get("artifact_type", "unknown")
+
+        prompt_text = generate_evaluation_prompt(
+            student_id=student_id,
+            artifact_type=artifact_type,
+            rubric=meta.get("rubric", {}),
+        )
+
+        # Move inputs to wherever the model was placed and cast floating
+        # tensors (pixel values) to the model's dtype, instead of assuming a
+        # bare "cuda" device and float32 inputs.
+        inputs = processor(text=prompt_text, images=image, return_tensors="pt").to(
+            model.device, model.dtype
+        )
+
+        parser = JsonSchemaParser(schema)
+        prefix_function = build_transformers_prefix_allowed_tokens_fn(processor.tokenizer, parser)
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=args.max_new_tokens,
+                do_sample=False,
+                prefix_allowed_tokens_fn=prefix_function,
+            )
+
+        # Decode only the tokens generated after the prompt.
+        prompt_length = inputs["input_ids"].shape[1]
+        decoded = processor.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
+        response = decoded.strip()
+
+        predicted_score = extract_score(response)
+        ground_truth = meta.get("ground_truth_score")
+
+        results.append(
+            {
+                "student_id": student_id,
+                "predicted_score": predicted_score,
+                "ground_truth_score": ground_truth,
+                "raw_response": response,
+                "artifact_type": artifact_type,
+            }
+        )
+
+        if predicted_score is None:
+            unparsed_responses += 1
+        elif ground_truth is not None:
+            preds.append(predicted_score)
+            truths.append(ground_truth)
+
+    metrics = compute_metrics(preds, truths)
+
+    output = {
+        "config": {
+            "model_name": args.model_name,
+            "quantize": args.quantize,
+            "dataset": args.data_path,
+        },
+        "sample_counts": {
+            "dataset_size": len(dataset),
+            "skipped_unloadable_images": skipped_images,
+            "unparsed_responses": unparsed_responses,
+            "scored": len(preds),
+        },
+        "metrics": metrics,
+        "results": results,
+    }
+
+    with open(args.output_path, "w", encoding="utf-8") as f:
+        json.dump(output, f, indent=2)
+
+    print("\n" + "=" * 50)
+    print("EVALUATION METRICS")
+    print("=" * 50)
+    for k, v in metrics.items():
+        print(f"  {k}: {v}")
+    print(f"  skipped_unloadable_images: {skipped_images}")
+    print(f"  unparsed_responses: {unparsed_responses}")
+    print("=" * 50)
+    print(f"Results saved to {args.output_path}")
+
+
+# Run the evaluation only when this file is executed as a script,
+# not when it is imported.
+if __name__ == "__main__":
+    main()