Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Ignore CSV files
*.csv

# Ignore data folder
data/*.bak
data/*.json
data/*.jsonl

# Ignore SSH Keys
*.pub
group_key
Expand Down
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions backend/configs/app_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ default:
max_session_cookie_age: 3600
same_site_protection: "lax"

mongo_anime_collection_name: "anime_enriched"
mongo_anime_db_name: "anizenith"

log_level: "info"
5 changes: 5 additions & 0 deletions backend/configs/backend_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ class BackendAppConfig(Config):
MAL_CLIENT_ID: str = os.getenv("MAL_CLIENT_ID", "")
MAL_CLIENT_SECRET: str = os.getenv("MAL_CLIENT_SECRET", "")
BACKEND_SECRET: str = os.getenv("BACKEND_SECRET", "")

# MongoDB params
ATLAS_URI: str = os.getenv("ATLAS_URI", "")
mongo_anime_collection_name: Optional[str] = None
mongo_anime_db_name: Optional[str] = None

class ModelConfig(Config):
"""
Expand All @@ -38,6 +42,7 @@ class ModelConfig(Config):
# Chatbot parameters
local_model_id: Optional[str] = None
external_model_id: Optional[str] = None
embedding_model_id: Optional[str] = None
max_new_tokens: Optional[int] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
Expand Down
1 change: 1 addition & 0 deletions backend/configs/model_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ default:
# Chatbot parameters
local_model_id: "Qwen/Qwen3-0.6B"
external_model_id: "openai/gpt-oss-20b"
embedding_model_id: "sentence-transformers/all-MiniLM-L6-v2"
max_new_tokens: 2048
temperature: 0.7
top_p: 0.7
Expand Down
11 changes: 4 additions & 7 deletions backend/mongo/AniZenithMongoClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from backend.mongo.utils import create_text_metadata_and_embedding
from backend.mongo.AniZenithVectorSearchResult import AniZenithVectorSearchResult

from backend.configs import backend_app_config, model_config

# Class to model Anizenith MongoDB Client related utilities
class AniZenithMongoClient:
Expand All @@ -14,7 +15,7 @@ def __init__(self, conn_string):
raise ValueError("ATLAS_URI must be set to a non-empty MongoDB connection string")

self.conn_string = conn_string
self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
self.embedding_model = SentenceTransformer(model_config.embedding_model_id)

# Set internals to None for lazy init
self._db_client = None
Expand All @@ -38,8 +39,7 @@ def anime_collection(self):
@property decorator defines this as a property of a class, rather than a class method
"""
if self._anime_collection is None:
# TODO: Move the hardcoded DB name and collection name into a central Config object
self._anime_collection = self.db_client["anizenith"]["anime"]
self._anime_collection = self.db_client[backend_app_config.mongo_anime_db_name][backend_app_config.mongo_anime_collection_name]
return self._anime_collection


Expand All @@ -55,10 +55,7 @@ def add_anime(self, anime_document: AnimeDocument) -> None:

# Create a new document to be inserted into MongoDB
anime_document_dict = {
"name": anime_document.name,
"score": anime_document.score,
"synopsis": anime_document.synopsis,
"genres": anime_document.genres,
**anime_document.model_dump(),
"text_metadata": text_metadata,
"text_metadata_embedding": text_metadata_embedding
}
Expand Down
23 changes: 18 additions & 5 deletions backend/mongo/AnimeDocument.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
from pydantic import BaseModel
from typing import List
from datetime import datetime
from typing import List, Dict

# Class to Model a typical Anime Document
class AnimeDocument(BaseModel):
name: str
score: float
synopsis: str
genres: List[str]
mal_id: int # ID of anime on MyAnimeList
name: str # English title of anime
alt_titles: Dict[str, str] # { "en": ENGLISH_TITLE, "jp": JAPANESE_TITLE, ...}
score: float # MAL User Mean Score -- Typically 6-10
synopsis: str # Short synopsis of shows, does not contain spoilers beyond first episode
genres: List[str] # Genres list not including demographic genre
demographic: str # Primary demographic
age_rating: str # g | pg | pg-13 | r | r+ | rx
cover_image_url: str # link to MAL image (not CDN)
date_aired: datetime # date aired as datetime (YYYY-MM-DD stored, not hr/mins/secs/ms)
status: str # finished_airing | currently_airing | not_aired
episode_count: int # number of episodes in the anime
avg_episode_len_mins: int # average duration per episode in mins
publishing_company: str # publishing company name | Unknown
recommendations: Dict[str, int] # {"TITLE1":#_OF_RECOMMENDATIONS, "TITLE2":#_OF_RECOMMENDATIONS, ...}
node_name: str # Title of anime (node) -- May not be English, but always matches recommendations title
3 changes: 2 additions & 1 deletion backend/mongo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ def create_text_metadata_and_embedding(
anime_name: str,
anime_genres: List[str],
anime_synopsis: str
) -> Tuple[str, str]:
) -> Tuple[str, List]:
# TODO: Make this metadata better for retrieval
# Create text_metadata field using synopsis, genres and name
text_metadata = f"Synopsis: {anime_synopsis}\n\nGenres: {', '.join(anime_genres)}\n\nName: {anime_name}"

Expand Down
Binary file removed data/anime.db
Binary file not shown.
174 changes: 174 additions & 0 deletions data/anime_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import json
import time
from datetime import datetime
from typing import List, Dict

import pandas as pd
import requests

from backend.configs import backend_app_config
from backend.mongo.AnimeDocument import AnimeDocument

# Jikan is a web-scraping REST API for anime data. This script only uses its
# per-anime "recommendations" endpoint; not all data may be available from it.
# NOTE(review): the original note read "specific anime search 100% uptime" --
# presumably lookups for one specific anime are reliable; confirm.
JIKAN_ENDPOINT = "https://api.jikan.moe/v4/anime"

# MyAnimeList (MAL) is an anime database REST API service with limited features.
# MAL_ENDPOINT is the ranking listing; requests are authenticated by sending the
# configured client ID in the X-MAL-CLIENT-ID header.
MAL_ENDPOINT = "https://api.myanimelist.net/v2/anime/ranking"
MAL_CLIENT_HEADER = {"X-MAL-CLIENT-ID": backend_app_config.MAL_CLIENT_ID}
# Comma-separated list of fields requested for every anime in a ranking page.
MAL_FIELDS = "id,title,alternative_titles,mean,synopsis,genres,main_picture,start_date,status,studios,num_episodes,average_episode_duration,rating"

# API limits
# NOTE(review): ANIME_PER_PAGE is not referenced anywhere else in this script.
ANIME_PER_PAGE = 25
# Seconds slept before each Jikan request (Jikan: ~3 req/sec, 60 req/min).
JIKAN_RATE_LIMIT = 1.0
# Seconds slept after each MAL ranking page.
MAL_RATE_LIMIT = 0.5
# Upper bound on the `limit` query parameter sent to the MAL ranking endpoint.
MAL_MAX_PER_PAGE = 100

# Used for data cleaning: genre names that are treated as a demographic and
# therefore split out of the regular genre list.
DEMOGRAPHIC_GENRES = {"Shounen", "Shoujo", "Seinen", "Josei", "Kids", "Demographic", "Shonen", "Shojo"}

def _get_jikan_recommendations(mal_id: int, limit: int = 10) -> Dict[str, int]:
    """Fetch user recommendations for one anime from Jikan.

    This is best-effort enrichment: any request or parsing problem yields an
    empty mapping instead of an exception.

    Args:
        mal_id: MyAnimeList ID of the anime whose recommendations are wanted.
        limit: Maximum number of recommendation entries read from the response.

    Returns:
        Mapping of recommended title -> number of votes. Entries without a
        title are skipped so that every key is a real string.
    """
    url = f"{JIKAN_ENDPOINT}/{mal_id}/recommendations"
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError):
        # Network error, HTTP error status, or a body that is not valid JSON.
        return {}

    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, list):
        return {}

    recommendations: Dict[str, int] = {}
    for rec in data[:limit]:
        if not isinstance(rec, dict):
            continue
        target = rec.get("entry")
        title = target.get("title") if isinstance(target, dict) else None
        if not title:
            # A None/empty key would not fit Dict[str, int] and would make the
            # whole anime document fail validation downstream.
            continue
        # `votes` may be present but null; treat that as zero votes.
        recommendations[title] = rec.get("votes") or 0
    return recommendations

def _parse_mal_date(raw_date):
    """Parse a MAL date string into a datetime.

    MAL may report a full date ("YYYY-MM-DD") or a partial one ("YYYY-MM" or
    "YYYY"); missing parts default to the first month/day.

    Returns:
        The parsed datetime, or None when the value is absent or unparseable.
    """
    if not raw_date:
        return None
    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(raw_date, fmt)
        except ValueError:
            continue
    return None


def _normalize_from_mal(item: Dict) -> Dict:
    """Convert a MAL API ranking item into a flat dict of AnimeDocument fields.

    Args:
        item: One element of the MAL ranking "data" list. The anime payload is
            read from its "node" key; if there is no "node", `item` itself is
            treated as the payload. The input is not modified.

    Returns:
        Dict with the keys mal_id, title, alt_titles, score, synopsis, genres,
        demographic, cover_image_url, date_aired, status, episode_count,
        publishing_company, avg_episode_len_mins and age_rating. Optional MAL
        fields that are absent or null fall back to neutral defaults; `score`
        and `date_aired` are None when MAL does not provide them.

    Raises:
        KeyError: If the payload has no "id".
    """
    node = item.get("node", item)

    # ----- Data Cleaning -----
    # `or <default>` covers both a missing key and an explicit null value.
    duration = node.get("average_episode_duration") or 0
    age_rating = node.get("rating") or ""
    studios = node.get("studios") or []
    publishing_company = studios[0].get("name", "Unknown") if studios else "Unknown"
    synopsis = (node.get("synopsis") or "").replace("[Written by MAL Rewrite]", "").strip()
    date_aired = _parse_mal_date(node.get("start_date"))

    # Alt titles (in case main is not en). "synonyms" is dropped; build a new
    # dict rather than popping so the caller's payload is left untouched.
    alt_titles = {
        lang: title
        for lang, title in (node.get("alternative_titles") or {}).items()
        if lang != "synonyms"
    }

    # Extract genre: demographic labels are separated from the regular genres.
    genre_list = [g["name"] for g in node.get("genres") or []]
    genres = [g for g in genre_list if g not in DEMOGRAPHIC_GENRES]
    demographic = next((g for g in genre_list if g in DEMOGRAPHIC_GENRES), "All")

    return {
        "mal_id": node["id"],
        "title": node.get("title"),
        "alt_titles": alt_titles,
        "score": node.get("mean"),
        "synopsis": synopsis,
        "genres": genres,
        "demographic": demographic,
        "cover_image_url": (node.get("main_picture") or {}).get("medium", ""),
        "date_aired": date_aired,
        "status": node.get("status", "not_aired"),
        "episode_count": node.get("num_episodes") or 0,
        "publishing_company": publishing_company,
        # MAL reports the duration in seconds; store whole minutes.
        "avg_episode_len_mins": int(duration // 60),
        "age_rating": age_rating.split(" - ")[0] if age_rating else "Unknown",
    }

def _build_documents(mal_items: List[Dict], search_recommended: bool = False) -> List[AnimeDocument]:
    """Convert MAL items into AnimeDocument objects, optionally enriched via Jikan.

    Items that cannot be normalized or that fail AnimeDocument validation are
    reported on stdout and skipped, so one bad entry never discards the rest
    of the batch.

    Args:
        mal_items: Raw items from the MAL ranking response.
        search_recommended: When True, fetch recommendations from Jikan for
            each anime (sleeping JIKAN_RATE_LIMIT seconds before every call).

    Returns:
        The successfully built documents, in input order.
    """
    results = []

    for raw in mal_items:
        try:
            entry = _normalize_from_mal(raw)
        except Exception as err:  # per-item boundary: report and move on
            print(f"[SCRAPE ERR] Failed to normalize anime item: {err!r}")
            continue

        # Must default to a dict: `recommendations` is declared Dict[str, int].
        recs = {}
        if search_recommended:
            time.sleep(JIKAN_RATE_LIMIT)
            recs = _get_jikan_recommendations(entry["mal_id"])

        try:
            document = AnimeDocument(
                mal_id=entry["mal_id"],
                node_name=entry["title"],
                name=entry["title"],
                alt_titles=entry["alt_titles"],
                score=entry["score"],
                synopsis=entry["synopsis"],
                genres=entry["genres"],
                demographic=entry["demographic"],
                age_rating=entry["age_rating"],
                cover_image_url=entry["cover_image_url"],
                date_aired=entry["date_aired"],
                status=entry["status"],
                episode_count=entry["episode_count"],
                avg_episode_len_mins=entry["avg_episode_len_mins"],
                publishing_company=entry["publishing_company"],
                recommendations=recs,
            )
        except Exception as err:  # per-item boundary: report and move on
            # Single quotes inside the f-string keep this valid before Python 3.12.
            print(f"[SCRAPE ERR] Failed to append anime: {entry['title']} ({entry['mal_id']}): {err}")
            continue

        results.append(document)

    return results

# Fetch data from MAL
def _fetch_mal_page(ranking_type: str, page: int, limit: int) -> List[Dict]:
    """Request one page of the MAL anime ranking.

    Args:
        ranking_type: Value sent as the `ranking_type` query parameter.
        page: 1-based page number; the offset sent is `(page - 1) * limit`.
        limit: Number of items requested for this page.

    Returns:
        The "data" list of the JSON response, or an empty list if it is absent.

    Raises:
        requests.HTTPError: If MAL answers with an error status.
    """
    offset = (page - 1) * limit
    query = {
        "ranking_type": ranking_type,
        "limit": limit,
        "offset": offset,
        "fields": MAL_FIELDS,
    }
    response = requests.get(
        MAL_ENDPOINT,
        headers=MAL_CLIENT_HEADER,
        params=query,
        timeout=15,
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("data", [])

# Search API
def search_anime(
    sort_by: str = "score",
    n: int = 10,
    search_recommended: bool = False,
    output_path: str = "./data/anime_scrape.jsonl",
) -> List[AnimeDocument]:
    """
    Search anime using MAL ranking, then enrich with Jikan.

    Every built document is also appended as one JSON line to `output_path`
    (the file is truncated first), so partial progress survives a crash.

    Args:
        sort_by: 'score' (top rated) or 'popularity' (most members); any other
            value falls back to 'score'.
        n: number of results
        search_recommended: if True, fetch recommendations from Jikan per anime
        output_path: JSONL file that receives the documents as they are built
    Returns:
        List[AnimeDocument] with at most `n` entries. Fewer are returned when
        the ranking runs out or too many consecutive pages fail.
    """
    # Consecutive failed pages tolerated before giving up; without a cap a
    # persistent error (bad client ID, outage) would loop forever.
    max_failed_pages = 5

    ranking_type = {"score": "all", "popularity": "bypopularity"}.get(sort_by, "all")

    results = []
    page = 1
    failed_pages = 0

    with open(output_path, "w", encoding="utf-8") as f:
        while len(results) < n:
            remaining = n - len(results)
            # Always request the same page size: the offset is derived from
            # page * limit, so a varying limit would overlap earlier pages.
            limit = MAL_MAX_PER_PAGE
            print(f"[MAL INFO] Fetching page {page} sorted by {sort_by} (need {remaining} more items, requesting {limit} items)")
            try:
                data = _fetch_mal_page(ranking_type, page, limit)
                if not data:
                    print(f"[MAL INFO] Page {page} returned no items; ranking exhausted")
                    break

                # Work through the page in slices of what is still needed, so
                # no extra (rate-limited) enrichment is done past `n`.
                pending = data
                while pending and len(results) < n:
                    needed = n - len(results)
                    batch, pending = pending[:needed], pending[needed:]
                    documents = _build_documents(batch, search_recommended)
                    # mode="json" turns datetimes into strings json.dumps can handle.
                    f.writelines(
                        json.dumps(doc.model_dump(mode="json"), ensure_ascii=False) + "\n"
                        for doc in documents
                    )
                    results.extend(documents)
                failed_pages = 0
            except Exception as e:  # page-level boundary: report, then skip the page
                print(f"[MAL WARN] Page {page} failed: {e}")
                failed_pages += 1
                if failed_pages >= max_failed_pages:
                    print(f"[MAL WARN] Giving up after {failed_pages} consecutive failed pages")
                    break
            page += 1
            time.sleep(MAL_RATE_LIMIT)

    return results[:n]

if __name__ == "__main__":
    # Scrape the 5000 most popular anime (with Jikan recommendations) and dump
    # them as a single JSON array of records.
    scraped = search_anime(sort_by="popularity", n=5000, search_recommended=True)

    frame = pd.DataFrame([doc.model_dump() for doc in scraped])
    frame.to_json(
        "./data/anime.json",
        orient="records",
        indent=2,
        force_ascii=False,
    )
    print(f"Saved {len(frame)} records to ./data/anime.json")
Loading
Loading