From 09155bea09bab19b8624f3dfda2dd909fdc39636 Mon Sep 17 00:00:00 2001 From: VedaSiddhartha Date: Sat, 28 Mar 2026 22:29:19 +0530 Subject: [PATCH 1/3] refactor(models): standardize outputs to structured dictionary format and add config validation --- .gitignore | 3 ++- app/models/bertweet_model.py | 32 ++++++++++++++++++++++++++++---- app/models/whisper_model.py | 35 ++++++++++++++++++++++++++++++----- 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 564b8d3..4717a04 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ logs/ # Ignore Testing Coverage Results tests/coverage/.coverage -env/ \ No newline at end of file +env/venv/ +venv/ diff --git a/app/models/bertweet_model.py b/app/models/bertweet_model.py index 2342c7c..6058c4e 100644 --- a/app/models/bertweet_model.py +++ b/app/models/bertweet_model.py @@ -5,6 +5,7 @@ import torch.nn as nn from transformers import AutoTokenizer, AutoModelForSequenceClassification +from typing import Dict, Any class BertweetSentiment(nn.Module): def __init__(self,config: dict)->None: @@ -14,7 +15,16 @@ def __init__(self,config: dict)->None: """ self.debug = config.get('debug') - self.config = config.get('sentiment_analysis').get('bertweet') + # ✅ Add null check + sentiment_config = config.get('sentiment_analysis') + if not sentiment_config: + raise ValueError("'sentiment_analysis' not found in config") + + self.config = sentiment_config.get('bertweet') + if not self.config: + raise ValueError("'bertweet' not found in sentiment_analysis config") + + self.model_name = self.config.get('model_name') self.device = self.config.get('device') @@ -35,7 +45,7 @@ def __init__(self,config: dict)->None: else: self.class_labels = None - def forward(self,text)->tuple: + def forward(self,text)-> Dict[str, Any]: """ Perform sentiment analysis on the given text. @@ -43,7 +53,7 @@ def forward(self,text)->tuple: text (str): Input text for sentiment analysis. Returns: - tuple: Model outputs, probabilities, predicted label, and confidence score. + Dict: Model outputs, probabilities, predicted label, and confidence score. """ # Tokenize the input text inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device) @@ -57,10 +67,24 @@ def forward(self,text)->tuple: # Get the predicted sentiment predicted_class = torch.argmax(probabilities, dim=1).item() + # Get the predicted sentiment + # Convert to integer explicitly + predicted_class = int(torch.argmax(probabilities, dim=1).item()) + + # Add null check + if self.class_labels is None: + raise ValueError("Class labels not available") + + # Get the corresponding class label predicted_label = self.class_labels[predicted_class] - return outputs, probabilities, predicted_label, probabilities[0][predicted_class].item() + return { + "logits": outputs.logits.tolist(), + "probabilities": probabilities.tolist(), + "label": predicted_label, + "score": probabilities[0][predicted_class].item() +} # if __name__ == "__main__": diff --git a/app/models/whisper_model.py b/app/models/whisper_model.py index 9217bf2..ca10dd3 100644 --- a/app/models/whisper_model.py +++ b/app/models/whisper_model.py @@ -5,6 +5,7 @@ import torch.nn as nn from transformers import pipeline +from typing import Dict, Any class WhisperTranscript(nn.Module): @@ -14,8 +15,14 @@ def __init__(self, config: dict) -> None: :param config: The configuration object containing model and device info. """ self.debug = config.get('debug') - - self.config = config.get('transcription').get('whisper') + transcription_config = config.get('transcription') + if not transcription_config: + raise ValueError("'transcription' not found in config") + + self.config = transcription_config.get('whisper') + if not self.config: + raise ValueError("'whisper' not found in transcription config") + self.model_size = self.config.get('model_size') self.device = self.config.get('device') self.chunk_length_s = self.config.get('chunk_length_s') @@ -32,7 +39,7 @@ def __init__(self, config: dict) -> None: ) - def forward(self, audio_file: str) -> tuple: + def forward(self, audio_file: str) -> Dict[str, Any]: """ Perform transcription on the given audio file. @@ -40,12 +47,30 @@ def forward(self, audio_file: str) -> tuple: audio_file (str): Path to the audio file. Returns: - tuple: Transcribed text and timestamped chunks. + Dict: Transcribed text and timestamped chunks. """ # Forward pass out = self.pipeline(audio_file, return_timestamps=True) + + # Initialize to avoid "possibly unbound" error + text = "" + chunks = [] + - return out["text"], out["chunks"] + # Extract text and chunks safely + if isinstance(out, dict): + text = out.get("text", "") + chunks = out.get("chunks", []) + else: + # For dict-like objects (not necessarily dict type) + text = getattr(out, "text", "") + chunks = getattr(out, "chunks", []) + + return { + "text": text, + "chunks": chunks +} + # if __name__ == "__main__": # config = { From 7124ad1feb23ab4816651b6d44223d5b87d4c567 Mon Sep 17 00:00:00 2001 From: VedaSiddhartha Date: Sat, 28 Mar 2026 22:42:21 +0530 Subject: [PATCH 2/3] refactor(models): standardize outputs to structured dictionary format and add config validation --- app/models/bertweet_model.py | 20 ++++++++++---------- app/models/whisper_model.py | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/app/models/bertweet_model.py b/app/models/bertweet_model.py index 6058c4e..5d57d73 100644 --- a/app/models/bertweet_model.py +++ b/app/models/bertweet_model.py @@ -32,14 +32,14 @@ def __init__(self,config: dict)->None: # Initialize the Tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Initialize the Model + # Initializing the Model self.model= AutoModelForSequenceClassification.from_pretrained(self.model_name) self.model.to(self.device) - # Load the model configuration to get class labels + # Loading the model configuration to get class labels self.model_config = self.model.config - # Get Labels + # Geting the Labels if hasattr(self.model_config, 'id2label'): self.class_labels = [self.model_config.id2label[i] for i in range(len(self.model_config.id2label))] else: @@ -55,28 +55,28 @@ def forward(self,text)-> Dict[str, Any]: Returns: Dict: Model outputs, probabilities, predicted label, and confidence score. """ - # Tokenize the input text + # Tokenizing the input text inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device) # Forward pass outputs = self.model(**inputs) - # Convert logits to probabilities + # Converting logits to probabilities probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1) - # Get the predicted sentiment + # to get the predicted sentiment predicted_class = torch.argmax(probabilities, dim=1).item() - # Get the predicted sentiment - # Convert to integer explicitly + + # Converting it to the integer explicitly predicted_class = int(torch.argmax(probabilities, dim=1).item()) - # Add null check + # Adding a null check if self.class_labels is None: raise ValueError("Class labels not available") - # Get the corresponding class label + # Geting the corresponding class label predicted_label = self.class_labels[predicted_class] return { diff --git a/app/models/whisper_model.py b/app/models/whisper_model.py index ca10dd3..c2ea654 100644 --- a/app/models/whisper_model.py +++ b/app/models/whisper_model.py @@ -15,6 +15,7 @@ def __init__(self, config: dict) -> None: :param config: The configuration object containing model and device info. """ self.debug = config.get('debug') + transcription_config = config.get('transcription') if not transcription_config: raise ValueError("'transcription' not found in config") @@ -57,7 +58,7 @@ def forward(self, audio_file: str) -> Dict[str, Any]: chunks = [] - # Extract text and chunks safely + # Extracting the text and chunks safely if isinstance(out, dict): text = out.get("text", "") chunks = out.get("chunks", []) From 9d68b9cff80e68cb25e64318e1c8e9e7e412f7f9 Mon Sep 17 00:00:00 2001 From: VedaSiddhartha Date: Sun, 29 Mar 2026 00:43:04 +0530 Subject: [PATCH 3/3] feat(speech): integrated a SpeechEmotionModel using HuggingFace {superb/wav2vec2-base-superb-er} with config-driven setup and standardized output schema --- app/models/speech_emotion_model.py | 75 ++++++++++++++++++++++++++++++ config.yaml | 7 +++ 2 files changed, 82 insertions(+) create mode 100644 app/models/speech_emotion_model.py diff --git a/app/models/speech_emotion_model.py b/app/models/speech_emotion_model.py new file mode 100644 index 0000000..54eaaa7 --- /dev/null +++ b/app/models/speech_emotion_model.py @@ -0,0 +1,75 @@ +""" +Speech Emotion Model +Uses HuggingFace audio classification pipeline +""" + +import torch +import torch.nn as nn +import logging +from transformers import pipeline +from typing import Dict , Any , List + + +class SpeechEmotionModel(nn.Module): + + def __init__(self, config: dict) -> None: + super(SpeechEmotionModel, self).__init__() + + self.debug = config.get('debug') + + # added a null check + emotion_config = config.get('speech_emotion') + if not emotion_config: + raise ValueError("'speech_emotion' not found in config") + + self.config = emotion_config.get('default') + if not self.config: + raise ValueError("'default' speech_emotion config missing") + + self.model_name = self.config.get('model_name') + self.device = self.config.get('device') + + # Use logging for structured and production-ready output instead of print + logger = logging.getLogger(__name__) + logger.info(f"Loading SpeechEmotionModel: {self.model_name}") + + print(f"Loading Speech Emotion Model: {self.model_name}") + + # Initializing HuggingFace pipeline for audio classification. + # This abstracts preprocessing, model inference, and postprocessing + self.pipeline = pipeline( + task="audio-classification", + model=self.model_name, + device=self.device + ) + + def forward(self, audio_path: str) -> Dict[str, Any]: + """ + Perform emotion classification + """ + + # Running inference using Huggingface pipelines. + # The pipeline internally handles feature extraction + model prediction. + outputs = self.pipeline(audio_path) + + # Ensure output is valid and non-empty. + # This prevents runtime errors in case of unexpected model behavior/ safe handling. + if not isinstance(outputs, list) or len(outputs) == 0: + return { + "emotion": { + "label": "unknown", + "score": 0.0 + } + } + +# Most pipelines return sorted results, so index 0 is the best prediction + top = outputs[0] + +# Return structured output consistent with other models in the system. +# it ensures that the easy integration with downstream pipelines and API's. + return { + "emotion": { + "label": top.get("label", "unknown"), + "score": float(top.get("score", 0.0)) + } + } \ No newline at end of file diff --git a/config.yaml b/config.yaml index 930c68b..f1f133b 100644 --- a/config.yaml +++ b/config.yaml @@ -35,6 +35,13 @@ sentiment_analysis: # api_key: "your_api_key" # endpoint: "https://api.example.com/sentiment" +speech_emotion: + default: + model_name: "superb/wav2vec2-base-superb-er" + device: -1 # -1 = CPU, 0 = GPU + + + # AudioTranscriptionSentimentPipeline Configuration audio_transcription_sentiment_pipeline: remove_audio: false # Specify whether to remove audio files after processing \ No newline at end of file