ShafathZ · ShafathZ · Apr 28, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,11 @@
 # Ignore CSV files
 *.csv
 
+# Ignore data artifacts in the data folder (.bak, .json and .jsonl files only)
+data/*.bak
+data/*.json
+data/*.jsonl
+
 # Ignore SSH Keys
 *.pub
 group_key

diff --git a/data/create_db.py → archived/create_db.py b/data/create_db.py → archived/create_db.py
diff --git a/data/populate_mongo_db.py → archived/populate_mongo_db.py b/data/populate_mongo_db.py → archived/populate_mongo_db.py
diff --git a/backend/configs/app_config.yml b/backend/configs/app_config.yml
@@ -4,4 +4,7 @@ default:
   max_session_cookie_age: 3600
   same_site_protection: "lax"
 
+  mongo_anime_collection_name: "anime_enriched"
+  mongo_anime_db_name: "anizenith"
+
   log_level: "info"
diff --git a/backend/configs/backend_config.py b/backend/configs/backend_config.py
@@ -27,7 +27,11 @@ class BackendAppConfig(Config):
     MAL_CLIENT_ID: str = os.getenv("MAL_CLIENT_ID", "")
     MAL_CLIENT_SECRET: str = os.getenv("MAL_CLIENT_SECRET", "")
     BACKEND_SECRET: str = os.getenv("BACKEND_SECRET", "")
+
+    # MongoDB params
     ATLAS_URI: str = os.getenv("ATLAS_URI", "")
+    mongo_anime_collection_name: Optional[str] = None
+    mongo_anime_db_name: Optional[str] = None
 
 class ModelConfig(Config):
     """
@@ -38,6 +42,7 @@ class ModelConfig(Config):
     # Chatbot parameters
     local_model_id: Optional[str] = None
     external_model_id: Optional[str] = None
+    embedding_model_id: Optional[str] = None
     max_new_tokens: Optional[int] = None
     temperature: Optional[float] = None
     top_p: Optional[float] = None

diff --git a/backend/configs/model_config.yml b/backend/configs/model_config.yml
@@ -2,6 +2,7 @@ default:
   # Chatbot parameters
   local_model_id: "Qwen/Qwen3-0.6B"
   external_model_id: "openai/gpt-oss-20b"
+  embedding_model_id: "sentence-transformers/all-MiniLM-L6-v2"
   max_new_tokens: 2048
   temperature: 0.7
   top_p: 0.7

diff --git a/backend/mongo/AniZenithMongoClient.py b/backend/mongo/AniZenithMongoClient.py
@@ -5,6 +5,7 @@
 from backend.mongo.utils import create_text_metadata_and_embedding
 from backend.mongo.AniZenithVectorSearchResult import AniZenithVectorSearchResult
 
+from backend.configs import backend_app_config, model_config
 
 # Class to model Anizenith MongoDB Client related utilities
 class AniZenithMongoClient:
@@ -14,7 +15,7 @@ def __init__(self, conn_string):
             raise ValueError("ATLAS_URI must be set to a non-empty MongoDB connection string")
 
         self.conn_string = conn_string        
-        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+        self.embedding_model = SentenceTransformer(model_config.embedding_model_id)
 
         # Set internals to None for lazy init
         self._db_client = None
@@ -38,8 +39,7 @@ def anime_collection(self):
         @property decorator defines this as a property of a class, rather than a class method
         """
         if self._anime_collection is None:
-            # TODO: Move the hardcoded DB name and collection name into a central Config object
-            self._anime_collection = self.db_client["anizenith"]["anime"]
+            self._anime_collection = self.db_client[backend_app_config.mongo_anime_db_name][backend_app_config.mongo_anime_collection_name]
         return self._anime_collection
 
 
@@ -55,10 +55,7 @@ def add_anime(self, anime_document: AnimeDocument) -> None:
 
         # Create a new document to be inserted into MongoDB
         anime_document_dict = {
-            "name": anime_document.name,
-            "score": anime_document.score,
-            "synopsis": anime_document.synopsis,
-            "genres": anime_document.genres,
+            **anime_document.model_dump(),
             "text_metadata": text_metadata,
             "text_metadata_embedding": text_metadata_embedding
         }

diff --git a/backend/mongo/AnimeDocument.py b/backend/mongo/AnimeDocument.py
@@ -1,9 +1,22 @@
 from pydantic import BaseModel
-from typing import List
+from datetime import datetime
+from typing import List, Dict
 
 # Class to Model a typical Anime Document
 class AnimeDocument(BaseModel):
-    name: str
-    score: float
-    synopsis: str
-    genres: List[str]
+    mal_id: int                     # ID of anime on MyAnimeList
+    name: str                      # Title of anime -- NOTE(review): the scraper fills this with MAL's main title (same value as node_name), which may not be English; confirm
+    alt_titles: Dict[str, str]      # { "en": ENGLISH_TITLE, "jp": JAPANESE_TITLE, ...}
+    score: float                    # MAL User Mean Score -- Typically 6-10
+    synopsis: str                   # Short synopsis of shows, does not contain spoilers beyond first episode
+    genres: List[str]               # Genres list not including demographic genre
+    demographic: str                # Primary demographic ("All" when the scraper finds no demographic genre)
+    age_rating: str                 # g | pg | pg-13 | r | r+ | rx ("Unknown" when the scraper gets no rating)
+    cover_image_url: str            # link to MAL image (not CDN)
+    date_aired: datetime            # date aired as datetime (YYYY-MM-DD stored, not hr/mins/secs/ms)
+    status: str                     # finished_airing | currently_airing | not_aired
+    episode_count: int              # number of episodes in the anime
+    avg_episode_len_mins: int       # average duration per episode in mins
+    publishing_company: str         # publishing company name | Unknown
+    recommendations: Dict[str, int] # {"TITLE1":#_OF_RECOMMENDATIONS, "TITLE2":#_OF_RECOMMENDATIONS, ...}
+    node_name: str                  # Title of anime (node) -- May not be English, but always matches recommendations title
diff --git a/backend/mongo/utils.py b/backend/mongo/utils.py
@@ -6,7 +6,8 @@ def create_text_metadata_and_embedding(
         anime_name: str,
         anime_genres: List[str],
         anime_synopsis: str
-    ) -> Tuple[str, str]:
+    ) -> Tuple[str, List]:
+    # TODO: Make this metadata better for retrieval
     # Create text_metadata field using synopsis, genres and name
     text_metadata = f"Synopsis: {anime_synopsis}\n\nGenres: {', '.join(anime_genres)}\n\nName: {anime_name}"
 

diff --git a/data/anime.db b/data/anime.db
diff --git a/data/anime_scrape.py b/data/anime_scrape.py
@@ -0,0 +1,174 @@
+import json
+import time
+from datetime import datetime
+from typing import List, Dict
+
+import pandas as pd
+import requests
+
+from backend.configs import backend_app_config
+from backend.mongo.AnimeDocument import AnimeDocument
+
+# Jikan is a web scraping REST API for anime data; not every field is guaranteed to be available.
+# NOTE(review): the original note read "specific anime search 100% uptime" -- presumably
+# per-anime lookups are the reliable part; confirm.
+JIKAN_ENDPOINT = "https://api.jikan.moe/v4/anime"
+
+# MyAnimeList (MAL) is an anime database access REST API service with limited features
+MAL_ENDPOINT = "https://api.myanimelist.net/v2/anime/ranking"
+# Header sent with every MAL request; the client ID comes from the backend app config
+MAL_CLIENT_HEADER = {"X-MAL-CLIENT-ID": backend_app_config.MAL_CLIENT_ID}
+# Comma-separated list passed as the "fields" query parameter of the MAL ranking request
+MAL_FIELDS = "id,title,alternative_titles,mean,synopsis,genres,main_picture,start_date,status,studios,num_episodes,average_episode_duration,rating"
+
+# API limits
+# NOTE(review): ANIME_PER_PAGE is not referenced anywhere in this script
+ANIME_PER_PAGE = 25
+JIKAN_RATE_LIMIT = 1.0 # Seconds slept before each Jikan request (Jikan: ~3 req/sec, 60 req/min)
+MAL_RATE_LIMIT = 0.5 # Seconds slept between MAL page requests
+MAL_MAX_PER_PAGE = 100 # Largest "limit" this script requests from MAL in one page
+
+# Used for data cleaning: genre names treated as demographics rather than genres
+DEMOGRAPHIC_GENRES = {"Shounen", "Shoujo", "Seinen", "Josei", "Kids", "Demographic", "Shonen", "Shojo"}
+
+def _get_jikan_recommendations(mal_id: int, limit: int = 10) -> Dict[str, int]:
+    """Fetch anime user recommendations from Jikan."""
+    url = f"{JIKAN_ENDPOINT}/{mal_id}/recommendations"
+    try:
+        resp = requests.get(url, timeout=10)
+        resp.raise_for_status()
+        data = resp.json().get("data", [])
+        return {
+            entry.get("entry", {}).get("title"): entry.get("votes", 0)
+            for entry in data[:limit]
+        }
+    except Exception:
+        return {}
+
+def _normalize_from_mal(item: Dict) -> Dict:
+    """Convert MAL API response (node) into a flat dict with required fields for an AnimeDocument."""
+    node = item.get("node", item)
+
+    # ----- Data Cleaning -----
+    duration = node.get("average_episode_duration")
+    age_rating = node.get("rating", "")
+    studios = node.get("studios", [])
+    publishing_company = studios[0].get("name", "Unknown") if studios else "Unknown"
+    synopsis = node.get("synopsis").replace("[Written by MAL Rewrite]", "").strip()
+    date_aired = (datetime.strptime(node.get("start_date"), "%Y-%m-%d"))
+
+    # Alt titles (in case main is not en)
+    alt_titles = node.get("alternative_titles")
+    alt_titles.pop("synonyms")
+
+    # Extract genre
+    genre_list = [g["name"] for g in node.get("genres", [])]
+    genres = [g for g in genre_list if g not in DEMOGRAPHIC_GENRES]
+    demographic = next((g for g in genre_list if g in DEMOGRAPHIC_GENRES), "All")
+
+    return {
+        "mal_id": node["id"],
+        "title": node.get("title"),
+        "alt_titles": alt_titles,
+        "score": node.get("mean"),
+        "synopsis": synopsis,
+        "genres": genres,
+        "demographic": demographic,
+        "cover_image_url": node.get("main_picture").get("medium", ""),
+        "date_aired": date_aired,
+        "status": node.get("status", "not_aired"),
+        "episode_count": node.get("num_episodes", 0),
+        "publishing_company": publishing_company,
+        "avg_episode_len_mins": int(duration // 60),
+        "age_rating": age_rating.split(" - ")[0] if age_rating else "Unknown",
+    }
+
+def _build_documents(mal_items: List[Dict], search_recommended: bool = False) -> List[AnimeDocument]:
+    """Convert MAL items into AnimeDocument, enriching with Jikan."""
+    results = []
+
+    for raw in mal_items:
+        entry = _normalize_from_mal(raw)
+
+        # Fetch recommendations if requested
+        recs = []
+        if search_recommended:
+            time.sleep(JIKAN_RATE_LIMIT)
+            recs = _get_jikan_recommendations(entry["mal_id"])
+
+        try:
+            results.append(
+                AnimeDocument(
+                    mal_id=entry["mal_id"],
+                    node_name=entry["title"],
+                    name=entry["title"],
+                    alt_titles=entry["alt_titles"],
+                    score=entry["score"],
+                    synopsis=entry["synopsis"],
+                    genres=entry["genres"],
+                    demographic=entry["demographic"],
+                    age_rating=entry["age_rating"],
+                    cover_image_url=entry["cover_image_url"],
+                    date_aired=entry["date_aired"],
+                    status=entry["status"],
+                    episode_count=entry["episode_count"],
+                    avg_episode_len_mins=entry["avg_episode_len_mins"],
+                    publishing_company=entry["publishing_company"],
+                    recommendations=recs,
+                )
+            )
+        except Exception:
+            print(f"[SCRAPE ERR] Failed to append anime: {entry["title"]} ({entry['mal_id']})")
+
+    return results
+
+# Fetch data from MAL
+def _fetch_mal_page(ranking_type: str, page: int, limit: int) -> List[Dict]:
+    params = {
+        "ranking_type": ranking_type,
+        "limit": limit,
+        "offset": (page - 1) * limit,
+        "fields": MAL_FIELDS,
+    }
+    resp = requests.get(MAL_ENDPOINT, headers=MAL_CLIENT_HEADER, params=params, timeout=15)
+    resp.raise_for_status()
+    return resp.json().get("data", [])
+
+# Search API
+def search_anime(sort_by: str = "score", n: int = 10, search_recommended: bool = False) -> List[AnimeDocument]:
+    """
+    Search anime using MAL ranking, then enrich with Jikan.
+
+    Args:
+        sort_by: 'score' (top rated) or 'popularity' (most members)
+        n: number of results
+        search_recommended: if True, fetch recommendations from Jikan per anime
+    Returns:
+        List[AnimeDocument]
+    """
+    results = []
+    page = 1
+
+    ranking_type = {"score": "all", "popularity": "bypopularity"}.get(sort_by, "all")
+
+    with open("./data/anime_scrape.jsonl", "w", encoding="utf-8") as f:
+        while len(results) < n:
+            remaining = n - len(results)
+            limit = min(MAL_MAX_PER_PAGE, remaining) # Limit in case requesting more data than needed
+            print(f"[MAL INFO] Fetching page {page} sorted by {sort_by} (need {remaining} more items, requesting {limit} items)")
+            try:
+                data = _fetch_mal_page(ranking_type, page, limit)
+                documents = _build_documents(data, search_recommended)
+                [f.write(json.dumps(doc.model_dump(), ensure_ascii=False) + "\n") for doc in documents]
+                results.extend(documents)
+                page += 1
+                time.sleep(MAL_RATE_LIMIT)
+            except Exception as e:
+                print(f"[MAL WARN] Page {page} failed: {e}")
+                page += 1
+                continue
+
+    return results[:n]
+
+if __name__ == "__main__":
+    results = search_anime(sort_by="popularity", n=5000, search_recommended=True)
+
+    data = [r.model_dump() for r in results]
+    df = pd.DataFrame(data)
+    df.to_json("./data/anime.json", orient="records", indent=2, force_ascii=False)
+    print(f"Saved {len(df)} records to ./data/anime.json")