-
Notifications
You must be signed in to change notification settings - Fork 0
Backend Inference Refactor #35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
ShafathZ
wants to merge
5
commits into
main
Choose a base branch
from
backend-inference-refactor
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
e6ab551
Constructed new easily scalable and modifiable backend structure
ShafathZ 0610e9e
Fixed backend local model code issues, working but slow on CPU
ShafathZ 5e9217e
Improved scalability defining usage class, improved comments
ShafathZ ba723d2
Addressed some PR comments
ShafathZ 015f071
Addressed more PR comments
ShafathZ File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,124 @@ | ||
| import json | ||
| import os | ||
| import time | ||
| from typing import List, Dict | ||
|
|
||
| from dotenv import load_dotenv | ||
|
|
||
| from backend.mongo.AniZenithMongoClient import AniZenithMongoClient | ||
| from backend.mongo.AniZenithVectorSearchResult import AniZenithVectorSearchResult | ||
| from backend.prometheus_utils import * | ||
| from backend.models import HFInferenceClientModel, HFLocalModel, Model | ||
| from backend.reranker import AniZenithReranker | ||
| from constants import * | ||
|
|
||
| load_dotenv(".env") | ||
| # TODO: Move these to config management system | ||
| MONGO_CONN_STRING = os.getenv("ATLAS_URI") | ||
| VECTOR_SEARCH_LIMIT = 20 | ||
|
ShafathZ marked this conversation as resolved.
ShafathZ marked this conversation as resolved.
|
||
| RERANK_LIMIT = 5 | ||
| MODEL_DOWNTIME_SECONDS = 120 | ||
|
|
||
| local_model_id = "Qwen/Qwen3-0.6B" | ||
| external_model_id = "openai/gpt-oss-20b" | ||
|
|
||
|
|
||
| class InferenceManager: | ||
|
|
||
| def __init__(self): | ||
| # Initialize Models for use in descending order of importance | ||
| self.models: List[Model] = [HFInferenceClientModel(external_model_id), HFLocalModel(local_model_id)] | ||
| self.current_model_idx = 0 # Current model idx being used | ||
| self.model_available_at = [0.0 for _ in self.models] # Controls fallback timer in case error occurs | ||
|
|
||
| # Load a DB Client Instance | ||
| self.db_client = AniZenithMongoClient(MONGO_CONN_STRING) | ||
|
|
||
| # Load a Reranker model instance | ||
| self.reranker = AniZenithReranker() | ||
|
|
||
| # Gets the current most prioritized model for use | ||
| def get_best_model(self) -> Model: | ||
| now = time.time() | ||
|
|
||
| for i, model in enumerate(self.models): | ||
| if now >= self.model_available_at[i]: | ||
| self.current_model_idx = i | ||
| return model | ||
|
|
||
| # If all models are cooling down, throw error models not available | ||
| # TODO: Make custom exception | ||
| raise Exception("No model available") | ||
|
|
||
|
|
||
| def chat(self, messages: List[Dict[str, str]], user_id: str = None): | ||
| """ | ||
| Enhanced inference chat function with retrieval and reranking. | ||
| Steps: | ||
| 1. Retrieve relevant documents from MongoDB | ||
| 2. Rerank them with the reranker | ||
| 3. Build LLM messages prompt | ||
| 4. Stream the model output | ||
| """ | ||
| # TODO: Make this an agentic framework using LangChain | ||
| # TODO: Use user_id in logging | ||
| # TODO: Add queue system to make blocking better | ||
| # TODO: Replace all print statements with logging | ||
| current_model = self.get_best_model() | ||
| with CHATBOT_PIPELINE_LATENCY_SUMMARY.labels(model=current_model.get_name(), stage="full_pipeline").time(): | ||
| # 1) Retrieve results from DB Client | ||
| with CHATBOT_PIPELINE_LATENCY_SUMMARY.labels(model=current_model.get_name(), stage="db_retrieval").time(): | ||
| # Use the last message as the user message (it should always be a user message) | ||
| user_query = messages[-1]['content'] | ||
| retrieved_docs: List[AniZenithVectorSearchResult] = self.db_client.perform_vector_search(user_query, limit=VECTOR_SEARCH_LIMIT) | ||
| print(f"Retrieved Docs: ({len(retrieved_docs)}) relevant docs") | ||
|
|
||
| # 2) Rerank results using the reranker based on document info and user query | ||
| with CHATBOT_PIPELINE_LATENCY_SUMMARY.labels(model=current_model.get_name(), stage="reranking").time(): | ||
| reranked_docs: List[AniZenithVectorSearchResult] = self.reranker.rerank(user_query, retrieved_docs, limit=RERANK_LIMIT) | ||
| print(f"Reranked Docs: ({len(reranked_docs)})") | ||
|
|
||
| # 3) Construct system prompt with recommended docs | ||
| system_prompt = self._build_system_prompt(reranked_docs) | ||
|
|
||
| # 4) Insert system prompt into messages | ||
| messages.insert(0, {"role": "system", "content": system_prompt}) | ||
| print("Completed System Prompt Building") | ||
|
|
||
| # 5) Stream output of the model using the stream method | ||
| output = "" | ||
| with CHATBOT_PIPELINE_LATENCY_SUMMARY.labels(model=current_model.get_name(), stage="model_generation").time(): | ||
| try: | ||
| for token in current_model.stream(messages): | ||
| output += token | ||
| yield token | ||
|
|
||
| except Exception as e: | ||
| # TODO: Log error | ||
| print(f"Model Error: {e}") | ||
| # Yield model terminated to user | ||
| yield "<OUTPUT_TERMINATED>" | ||
| # Sets the model's next available time to current time in seconds + downtime | ||
| self.model_available_at[self.current_model_idx] = time.time() + MODEL_DOWNTIME_SECONDS | ||
|
|
||
| # Record Usage Metrics | ||
| print(f"Streamed output: {output}") | ||
| usage = current_model.get_usage() | ||
| observe_user_message(user_id, user_query, usage.input_token_count, current_model.get_name()) | ||
| observe_bot_message(user_id, output, usage.output_token_count, current_model.get_name()) | ||
|
|
||
| def _build_system_prompt(self, recommendations: List[AniZenithVectorSearchResult]) -> str: | ||
|
ShafathZ marked this conversation as resolved.
|
||
| lines = [] | ||
|
|
||
| # Add base system prompt | ||
| lines.append(SYSTEM_PROMPT) | ||
| lines.append(RECOMMENDED_DOCS_PREAMBLE) | ||
|
|
||
| # Add recommendation docs | ||
| # model_dump() is a special Pydantic method to generate a dict representation of any Pydantic object | ||
| # Dumps JSON as string with indent | ||
| recommendations = [json.dumps(recommendation.model_dump(), indent=4) for recommendation in recommendations] | ||
| recommendation_string = "\n\n".join(recommendations) if recommendations else "No good recommendations found." | ||
| lines.append(recommendation_string) | ||
|
|
||
| return "\n".join(lines) | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.