Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,13 @@
# C4GT_2026
# theApprenticeProject (C4GT 2026)

This repository contains two distinct AI initiatives developed for The Apprentice Project:

## 1. Voice-Based Conversational AI System
A voice-based AI system that captures audio, generates conversational responses using an LLM, and converts the responses back to speech.
- **Key Files**: `asr.py`, `llm.py`, `tts.py`, `main.py`
- **Dependencies**: See `./requirements.txt`

## 2. VLM Evaluation Pipeline
A cost-efficient Vision Language Model (VLM) pipeline designed to evaluate student artifacts (images/videos) against 21st-century skills rubrics.
- **Key Directory**: `vlm_evaluation/`
- **Dependencies**: See `vlm_evaluation/requirements.txt`
Binary file added __pycache__/asr.cpython-313.pyc
Binary file not shown.
Binary file added __pycache__/llm.cpython-313.pyc
Binary file not shown.
Binary file added __pycache__/tts.cpython-313.pyc
Binary file not shown.
33 changes: 33 additions & 0 deletions asr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import speech_recognition as sr


def capture_audio():
    """Record one utterance from the microphone and return its transcription.

    The recognizer is calibrated against ambient noise for half a second
    before listening. The recording is transcribed with
    ``recognize_google``.

    Returns:
        The recognized text, or ``None`` when the audio could not be
        understood or the recognition request failed. In both failure
        cases a message is printed.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        print("\nListening...")
        recognizer.adjust_for_ambient_noise(mic, duration=0.5)
        recording = recognizer.listen(mic)

    try:
        transcript = recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        # The service answered but could not make sense of the audio.
        print("Sorry, I could not understand the audio.")
        return None
    except sr.RequestError as err:
        # The request itself failed.
        print(f"Could not request results from Google Speech Recognition service; {err}")
        return None

    print(f"You said: {transcript}")
    return transcript


def validate_transcription(text, min_length=2, max_length=500):
    """Check that a transcription is usable as conversational input.

    Length is measured on the text after surrounding whitespace is removed.

    Args:
        text: The transcription to check, or ``None`` when nothing was
            recognized.
        min_length: Smallest accepted length in characters (default 2).
        max_length: Largest accepted length in characters (default 500).

    Returns:
        A ``(valid, error_message)`` tuple. ``error_message`` is ``None``
        when ``valid`` is ``True``; otherwise it names the first check that
        failed.
    """
    if text is None:
        return False, "No speech detected."

    stripped = text.strip()
    if not stripped:
        return False, "Empty transcription."

    # The upper bound is checked before the lower bound, so an input that
    # violates both (only possible with unusual custom bounds) is reported
    # as too long.
    length = len(stripped)
    if length > max_length:
        return False, "Input too long."
    if length < min_length:
        return False, "Input too short."
    return True, None
54 changes: 54 additions & 0 deletions llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


class ConversationalAgent:
    """Multi-turn chat agent built on the ``microsoft/DialoGPT-small`` model.

    The token ids returned by the most recent generation are kept in
    ``chat_history_ids`` and their tail is prepended to the next user turn.
    """

    # Largest number of tokens accepted for a single user turn.
    MAX_INPUT_TOKENS = 200

    def __init__(self):
        """Load the tokenizer and model and start with no chat history."""
        print("Loading local conversational model (DialoGPT-small)...")
        checkpoint = "microsoft/DialoGPT-small"
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForCausalLM.from_pretrained(checkpoint)
        self.chat_history_ids = None

    def generate_response(self, user_input):
        """Produce the agent's reply to ``user_input``.

        Args:
            user_input: Text of the user's turn.

        Returns:
            The decoded reply, or a fixed prompt asking the user to repeat
            or shorten the input when it is empty, longer than 1000
            characters, or longer than ``MAX_INPUT_TOKENS`` tokens.
        """
        if not (user_input and user_input.strip()):
            return "I didn't catch that. Could you please repeat?"

        if len(user_input) > 1000:
            return "That's quite long! Could you keep it shorter?"

        token_count = self.tokenizer.encode(user_input, return_tensors="pt").shape[1]
        if token_count > self.MAX_INPUT_TOKENS:
            return "I can only process about 200 words at a time. Please say that in fewer words."

        # The end-of-sequence token marks the end of the user's turn.
        turn_ids = self.tokenizer.encode(
            user_input + self.tokenizer.eos_token, return_tensors="pt"
        )

        history = self.chat_history_ids
        if history is None:
            model_input = turn_ids
        else:
            # Only the last 100 history tokens are carried into the prompt.
            model_input = torch.cat([history[:, -100:], turn_ids], dim=-1)

        self.chat_history_ids = self.model.generate(
            model_input,
            attention_mask=torch.ones(model_input.shape, dtype=torch.long),
            max_length=1000,
            pad_token_id=self.tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )

        # Everything after the prompt is the newly generated reply.
        prompt_length = model_input.shape[-1]
        reply_ids = self.chat_history_ids[:, prompt_length:][0]
        return self.tokenizer.decode(reply_ids, skip_special_tokens=True)
32 changes: 32 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from asr import capture_audio, validate_transcription
from llm import ConversationalAgent
from tts import text_to_speech


def main():
    """Run the interactive voice conversation loop.

    Each pass captures audio, validates the transcription, asks the agent
    for a reply and speaks it. An invalid transcription prints the reason
    and listens again. Hearing exactly 'exit', 'quit' or 'stop' (in any
    letter case) says goodbye and ends the loop.
    """
    rule = "====================================================="
    print(rule)
    print("Initializing Voice-Based Conversational AI System...")
    print(rule)
    agent = ConversationalAgent()
    print("\nSystem ready! Speak into your microphone.")
    print("Say 'exit', 'quit', or 'stop' to end the conversation.")

    exit_words = ["exit", "quit", "stop"]
    while True:
        heard = capture_audio()

        is_valid, problem = validate_transcription(heard)
        if not is_valid:
            print(f"Validation: {problem}")
            continue

        utterance = heard.strip()
        if utterance.lower() in exit_words:
            text_to_speech("Goodbye!")
            return

        text_to_speech(agent.generate_response(utterance))


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SpeechRecognition
PyAudio==0.2.14
transformers==4.38.2
torch
pyttsx3==2.90
13 changes: 13 additions & 0 deletions tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pyttsx3

def text_to_speech(text):
    """Print ``text`` prefixed with ``AI:`` and speak it with pyttsx3.

    The call returns after ``runAndWait()`` has finished.
    """
    # pyttsx3 provides offline text-to-speech.
    tts = pyttsx3.init()

    # Drop the speaking rate by 20 from its current value for clarity.
    tts.setProperty('rate', tts.getProperty('rate') - 20)

    print(f"AI: {text}")
    tts.say(text)
    tts.runAndWait()
Binary file not shown.
Binary file not shown.
Binary file not shown.
49 changes: 49 additions & 0 deletions vlm_evaluation/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import json
import os
from PIL import Image

class ArtifactDataset:
    """Index-addressable set of student artifacts described by a JSON file.

    Indexing returns the record's metadata together with its image converted
    to RGB. An image that cannot be loaded is returned as ``None`` so that
    one bad file does not abort a whole run.
    """

    def __init__(self, data_path: str):
        """
        Initializes the dataset loader.
        Assumes data_path points to a JSON file containing evaluation metadata:
        [
            {
                "image_path": "data/images/student1.jpg",
                "student_id": "123",
                "artifact_type": "Origami",
                "rubric": "1: No effort, 5: Perfect folds and presentation",
                "ground_truth_score": 4
            }, ...
        ]
        If the file does not exist a warning is printed and the dataset is empty.
        """
        self.data_path = data_path
        self.data = []

        if os.path.exists(data_path):
            # JSON is UTF-8 by specification; do not depend on the platform's
            # default encoding (which is not UTF-8 on every system).
            with open(data_path, "r", encoding="utf-8") as f:
                self.data = json.load(f)
        else:
            print(f"Warning: Dataset file {data_path} not found. Returning empty dataset.")
            print("Please create this file or generate a sample dataset.")

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": ..., "metadata": ...}`` for record ``idx``.

        Relative image paths are resolved against the directory that holds
        the JSON file; absolute paths are used unchanged. ``image`` is
        ``None`` when the record has no ``image_path`` or the image cannot
        be opened; the reason is printed.

        Raises:
            IndexError: If ``idx`` is outside the loaded data.
        """
        item = self.data[idx]
        image_path = item.get("image_path")
        image = None

        if not image_path:
            # Report the real problem instead of an indirect path-handling error.
            print(f"Error loading image {image_path}: no 'image_path' in metadata")
            return {"image": image, "metadata": item}

        try:
            if os.path.isabs(image_path):
                full_image_path = image_path
            else:
                base_dir = os.path.dirname(self.data_path)
                full_image_path = os.path.join(base_dir, image_path)
            # The context manager closes the underlying file; convert()
            # returns an independent in-memory image that stays usable.
            with Image.open(full_image_path) as source_image:
                image = source_image.convert("RGB")
        except Exception as e:
            # Deliberately broad: any per-image failure becomes a skipped
            # sample rather than an aborted run.
            print(f"Error loading image {image_path}: {e}")
            image = None

        return {
            "image": image,
            "metadata": item
        }
189 changes: 189 additions & 0 deletions vlm_evaluation/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import argparse
import json
import os
import re
import torch
from tqdm import tqdm
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn
from transformers import (
LlavaForConditionalGeneration,
AutoProcessor,
BitsAndBytesConfig,
)
from dataset import ArtifactDataset
from prompts import SYSTEM_PROMPT, generate_evaluation_prompt


class EvaluationOutput(BaseModel):
    # Shape of the JSON object the model is asked to emit. main() passes this
    # model's JSON schema to JsonSchemaParser to constrain generation.
    # Comments are used instead of a docstring so the generated schema is
    # unchanged.

    # NOTE(review): presumably the name of the skill being assessed - confirm
    # against the prompt template.
    skill: str
    # NOTE(review): presumably the rubric dimension within that skill - confirm.
    dimension: str
    # Integer score; extract_score() reads this key from the response.
    score: int
    # NOTE(review): presumably the top of the scoring scale - confirm. The
    # name is a schema key, so it stays as is despite matching a builtin.
    max: int


def parse_args(argv=None):
    """Parse the command-line options of the evaluation pipeline.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) the arguments are taken from ``sys.argv``, as before.

    Returns:
        An ``argparse.Namespace`` with ``data_path``, ``model_name``,
        ``quantize``, ``output_path`` and ``max_new_tokens``.
    """
    parser = argparse.ArgumentParser(description="VLM Evaluation Pipeline")
    parser.add_argument(
        "--data_path", type=str, required=True, help="Path to dataset JSON"
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="llava-hf/llava-1.5-7b-hf",
        help="Model identifier to load",
    )
    # Quantization is on by default, so --quantize is accepted only for
    # explicitness; --no_quantize is the switch that changes anything.
    parser.add_argument(
        "--quantize",
        action="store_true",
        default=True,
        help="Load the model with 4-bit quantization (default)",
    )
    parser.add_argument(
        "--no_quantize",
        action="store_false",
        dest="quantize",
        help="Load the model without quantization",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="results.json",
        help="Where to write the results JSON",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=256,
        help="Maximum number of tokens generated per sample",
    )
    return parser.parse_args(argv)


def load_model(model_name, quantize=True):
    """Load a LLaVA model and its processor.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the processor.
        quantize: When true, the model is loaded with a 4-bit NF4
            ``BitsAndBytesConfig`` using double quantization and float16
            compute; otherwise no quantization config is passed.

    Returns:
        A ``(model, processor)`` tuple.
    """
    bnb_config = None
    if quantize:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    vlm = LlavaForConditionalGeneration.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    return vlm, AutoProcessor.from_pretrained(model_name)


def extract_score(text, min_score=1, max_score=5):
    """Extract the integer ``score`` field from a JSON response.

    Args:
        text: Model response expected to be a JSON object.
        min_score: Smallest accepted score (default 1).
        max_score: Largest accepted score (default 5).

    Returns:
        The score when ``text`` parses to a JSON object whose ``"score"`` is
        an integer within ``[min_score, max_score]``; otherwise ``None``.
        Malformed or non-string input never raises.
    """
    try:
        parsed = json.loads(text)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (e.g. output truncated by
        # the token limit); TypeError covers None / non-string input.
        return None

    # Valid JSON need not be an object: "5" or "[1]" have no .get().
    if not isinstance(parsed, dict):
        return None

    score = parsed.get("score")
    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
    if isinstance(score, bool) or not isinstance(score, int):
        return None
    if min_score <= score <= max_score:
        return score
    return None


def compute_metrics(predictions, ground_truths):
    """Summarize how well predicted scores match ground-truth scores.

    Pairs in which either value is ``None`` are left out of the accuracy
    and error figures, so unparsed predictions no longer raise. When no
    value is ``None`` the result is the same as before.

    Args:
        predictions: Predicted scores; ``None`` marks a response that could
            not be parsed.
        ground_truths: Reference scores, paired with ``predictions`` by
            position.

    Returns:
        ``{}`` when ``ground_truths`` is empty. Otherwise a dict with:
        ``total_samples`` (length of ``ground_truths``), ``exact_accuracy``
        and ``within_1_accuracy`` (percentages over the comparable pairs,
        ``0.0`` when there are none), ``mean_absolute_error`` (``None`` when
        there are no comparable pairs) and ``parse_rate`` (percentage of
        predictions that are not ``None``, relative to ``total_samples``).
    """
    total = len(ground_truths)
    if total == 0:
        return {}

    parsed = sum(1 for p in predictions if p is not None)

    # Only pairs with both values present can be compared numerically.
    errors = [
        abs(p - g)
        for p, g in zip(predictions, ground_truths)
        if p is not None and g is not None
    ]
    compared = len(errors)

    if compared:
        exact = sum(1 for e in errors if e == 0)
        within_1 = sum(1 for e in errors if e <= 1)
        exact_accuracy = round(exact / compared * 100, 2)
        within_1_accuracy = round(within_1 / compared * 100, 2)
        mean_absolute_error = round(sum(errors) / compared, 4)
    else:
        exact_accuracy = 0.0
        within_1_accuracy = 0.0
        mean_absolute_error = None

    return {
        "total_samples": total,
        "exact_accuracy": exact_accuracy,
        "within_1_accuracy": within_1_accuracy,
        "mean_absolute_error": mean_absolute_error,
        "parse_rate": round(parsed / total * 100, 2),
    }


def main():
    """Run the evaluation pipeline from the command line.

    Loads the dataset and model, generates a schema-constrained JSON
    response for every sample, extracts the predicted score, and writes the
    configuration, metrics and per-sample results to ``args.output_path``.
    Samples whose image could not be loaded are recorded in the results with
    an ``error`` field instead of being dropped.
    """
    args = parse_args()

    if not torch.cuda.is_available():
        print("Warning: CUDA not available. Inference will be slow on CPU.")

    print(f"Loading dataset from {args.data_path}...")
    dataset = ArtifactDataset(args.data_path)
    if len(dataset) == 0:
        print("Dataset is empty. Exiting.")
        return

    print(f"Loading model {args.model_name} (quantize={args.quantize})...")
    model, processor = load_model(args.model_name, quantize=args.quantize)

    # The output schema and the token filter derived from it do not depend
    # on the sample, so build them once instead of on every iteration.
    try:
        schema = EvaluationOutput.model_json_schema()
    except AttributeError:
        # Older pydantic releases expose the schema as .schema().
        schema = EvaluationOutput.schema()
    prefix_function = build_transformers_prefix_allowed_tokens_fn(
        processor.tokenizer, JsonSchemaParser(schema)
    )

    results = []
    preds = []
    truths = []
    evaluated = 0  # samples the model actually produced a response for
    parsed = 0  # of those, responses that yielded a usable score

    for i in tqdm(range(len(dataset)), desc="Evaluating"):
        sample = dataset[i]
        meta = sample["metadata"]
        image = sample["image"]
        student_id = meta.get("student_id", "unknown")
        artifact_type = meta.get("artifact_type", "unknown")
        ground_truth = meta.get("ground_truth_score")

        if image is None:
            # Keep a trace of the sample so it does not silently disappear
            # from the report.
            results.append(
                {
                    "student_id": student_id,
                    "predicted_score": None,
                    "ground_truth_score": ground_truth,
                    "raw_response": None,
                    "artifact_type": artifact_type,
                    "error": "image could not be loaded",
                }
            )
            continue

        prompt_text = generate_evaluation_prompt(
            student_id=student_id,
            artifact_type=artifact_type,
            rubric=meta.get("rubric", {}),
        )

        # Follow the model's own placement rather than guessing a device.
        inputs = processor(text=prompt_text, images=image, return_tensors="pt").to(
            model.device
        )

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=args.max_new_tokens,
                do_sample=False,
                prefix_allowed_tokens_fn=prefix_function,
            )

        # Drop the prompt tokens; only the continuation is the response.
        prompt_length = inputs['input_ids'].shape[1]
        decoded = processor.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
        response = decoded.strip()

        predicted_score = extract_score(response)
        evaluated += 1
        if predicted_score is not None:
            parsed += 1

        results.append(
            {
                "student_id": student_id,
                "predicted_score": predicted_score,
                "ground_truth_score": ground_truth,
                "raw_response": response,
                "artifact_type": artifact_type,
            }
        )

        if predicted_score is not None and ground_truth is not None:
            preds.append(predicted_score)
            truths.append(ground_truth)

    metrics = compute_metrics(preds, truths)
    if evaluated:
        # compute_metrics only receives pairs whose score parsed, so it
        # cannot observe parse failures; report the rate over every sample
        # the model responded to.
        metrics["parse_rate"] = round(parsed / evaluated * 100, 2)

    output = {
        "config": {
            "model_name": args.model_name,
            "quantize": args.quantize,
            "dataset": args.data_path,
        },
        "metrics": metrics,
        "results": results,
    }

    with open(args.output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print("\n" + "=" * 50)
    print("EVALUATION METRICS")
    print("=" * 50)
    for k, v in metrics.items():
        print(f"  {k}: {v}")
    print("=" * 50)
    print(f"Results saved to {args.output_path}")


if __name__ == "__main__":
main()
Loading