Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 112 additions & 13 deletions octogen/ai/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
from json_repair import repair_json
import logging
import math
import os
import random
import re
Expand Down Expand Up @@ -146,10 +147,10 @@ def analyze_listening_profile(self, favorited_songs: List[Dict], top_artists: Li

# Diversity score: higher when more evenly distributed
if total > 0:
# Calculate normalized entropy
entropy = sum(-(count/total) * (count/total).bit_length() for count in artist_counts.values() if count > 0)
max_entropy = total.bit_length() if total > 1 else 1
profile["diversity_score"] = entropy / max_entropy if max_entropy > 0 else 0
# Calculate normalized Shannon entropy
entropy = -sum((count/total) * math.log2(count/total) for count in artist_counts.values() if count > 0)
max_entropy = math.log2(len(artist_counts)) if len(artist_counts) > 1 else 1
profile["diversity_score"] = min(entropy / max_entropy, 1.0) if max_entropy > 0 else 0

profile["artist_distribution"] = dict(artist_counts.most_common(10))

Expand Down Expand Up @@ -267,6 +268,59 @@ def _invalidate_cache(self) -> None:
# Note: We don't delete call tracker to preserve daily limit
logger.info("Cache invalidation complete")

def _load_recent_songs(self) -> set:
"""Load recently recommended songs from disk.

Returns:
Set of "artist - title" strings from recent runs, or empty set on failure.
"""
recent_file = self.data_dir / "recent_playlist_songs.json"
try:
if recent_file.exists():
with open(recent_file, 'r', encoding='utf-8') as f:
data = json.load(f)
if isinstance(data, list):
return set(data)
Comment on lines +271 to +283
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_load_recent_songs() returns a set, which has no guaranteed iteration order. When this is converted back to a list via list(existing) on line 309, the "existing first (oldest)" ordering described in the comment is not preserved — the existing entries will be in arbitrary order. This means ordered[-200:] may drop recently recommended songs rather than the oldest ones.

To fix this, _load_recent_songs should return a list (preserving the on-disk order) instead of converting to a set. The deduplication in _save_recent_songs via the seen set already handles duplicates, so the return type can safely be list.

Copilot uses AI. Check for mistakes.
except Exception as e:
logger.warning("Could not load recent songs: %s", str(e)[:100])
return set()

def _save_recent_songs(self, songs: list) -> None:
    """Persist the latest recommended songs for cross-run deduplication.

    Keeps at most 200 "artist - title" entries (roughly the last two
    runs).  The JSON file is replaced atomically — data goes to a
    sibling temp file first, then os.replace() swaps it in — so an
    interrupted write cannot leave a truncated or corrupt file behind.

    Args:
        songs: Song dicts carrying "artist" and "title" keys taken from
            the freshly generated playlists.
    """
    target = self.data_dir / "recent_playlist_songs.json"
    try:
        fresh = [
            f"{song.get('artist', '')} - {song.get('title', '')}"
            for song in songs
            if song.get('artist') and song.get('title')
        ]
        # Existing entries first (oldest), new ones appended last.
        # dict.fromkeys() both deduplicates (keeping the first
        # occurrence) and preserves insertion order, so slicing the
        # tail retains the most recently recommended songs.
        merged = list(dict.fromkeys(list(self._load_recent_songs()) + fresh))
        # Cap at 200 entries (about two runs' worth); oldest drop first.
        capped = merged[-200:]
        # Atomic write: temp sibling file, then rename over the target.
        scratch = target.with_suffix(".json.tmp")
        with open(scratch, 'w', encoding='utf-8') as handle:
            json.dump(capped, handle, ensure_ascii=False)
        os.replace(scratch, target)
        logger.info("Saved %d recent songs to disk (%d total)", len(fresh), len(capped))
    except Exception as e:
        # Best-effort persistence: never let a disk error abort the run.
        logger.warning("Could not save recent songs: %s", str(e)[:100])

def _build_cached_context(
self,
top_artists: List[str],
Expand All @@ -285,13 +339,14 @@ def _build_cached_context(
Returns:
Context string for AI
"""
artist_list = ", ".join(top_artists[:10])
genre_list = ", ".join(top_genres[:6])
artist_list = ", ".join(random.sample(top_artists[:20], min(10, len(top_artists[:20]))))
genre_list = ", ".join(random.sample(top_genres[:12], min(6, len(top_genres[:12]))))

# Limit context for memory efficiency
# Randomly sample a subset for variety — avoids O(n) shuffle of the full library
k = min(self.max_context_songs, len(favorited_songs))
favorited_sample = [
f"{s.get('artist','')} - {s.get('title','')}"
for s in favorited_songs[: self.max_context_songs]
for s in random.sample(favorited_songs, k)
]
favorited_context = "\n".join(favorited_sample)

Expand Down Expand Up @@ -392,12 +447,18 @@ def _get_or_create_gemini_cache(
logger.info("Cache created: %s (expires in 24 hours)", cached_content.name)
return cached_content

def _build_task_prompt(self, top_genres: List[str], time_context: Optional[Dict[str, str]] = None) -> str:
def _build_task_prompt(
self,
top_genres: List[str],
time_context: Optional[Dict[str, str]] = None,
recent_songs: Optional[set] = None,
) -> str:
"""Build the task-specific prompt with optional time-of-day awareness.

Args:
top_genres: List of top genres
time_context: Optional time-of-day context from get_time_context()
recent_songs: Optional set of recently recommended "artist - title" strings to avoid

Returns:
Task prompt string
Expand All @@ -411,7 +472,11 @@ def _build_task_prompt(self, top_genres: List[str], time_context: Optional[Dict[
f'{i+2}. "Daily Mix {i+1}" (30 songs, genre: {genre_name}): 25 library + 5 new'
)

variety_seed = random.randint(1000, 9999)
variety_seed = random.randint(100000, 999999) # 6-digit range reduces collision probability across runs

# Pick a random decade bias hint for added variety
decade_hints = ["1970s", "1980s", "1990s", "2000s", "2010s", "2020s", "Mix of all eras"]
decade_hint = random.choice(decade_hints)

# Add time-of-day context if provided
time_guidance = ""
Expand All @@ -424,6 +489,16 @@ def _build_task_prompt(self, top_genres: List[str], time_context: Optional[Dict[
Guidance: {time_context.get('guidance', '')}

Apply this context when selecting NEW songs to match the current time of day.
"""

# Add recently recommended songs section if provided
recent_songs_section = ""
if recent_songs:
sample_size = min(40, len(recent_songs))
recent_sample = random.sample(sorted(recent_songs), sample_size)
recent_songs_section = f"""
RECENTLY RECOMMENDED (avoid repeating these):
{chr(10).join(recent_sample)}
"""

return f"""Generate exactly 11 playlists (Variety Seed: {variety_seed}):
Expand All @@ -434,7 +509,8 @@ def _build_task_prompt(self, top_genres: List[str], time_context: Optional[Dict[
9. "Workout Energy" (30 songs): 25 library + 5 new high-energy
10. "Focus Flow" (30 songs): 25 library + 5 new ambient/instrumental
11. "Drive Time" (30 songs): 25 library + 5 new upbeat
{time_guidance}
Decade focus: {decade_hint} — lean toward this era for new discoveries
{time_guidance}{recent_songs_section}
Respond ONLY with a valid JSON array of objects, each with "artist" and "title" fields, using double quotes.

{{
Expand All @@ -454,6 +530,9 @@ def _build_task_prompt(self, top_genres: List[str], time_context: Optional[Dict[
- ESCAPE ALL BACKSLASHES: Use \\\\ not \\
- If song title has backslash, use double backslash
- Example: "AC\\\\DC" not "AC\\DC"
- Maximize variety: no artist should appear more than 2 times per playlist
- Each playlist MUST have a different set of songs - NO song should appear in more than one playlist
- Prioritize LESS POPULAR and DEEPER CUTS over well-known hits
"""

def _generate_with_gemini(
Expand Down Expand Up @@ -483,7 +562,8 @@ def _generate_with_gemini(
if time_context:
logger.info(f"🕐 Time context: {time_context.get('description')} - {time_context.get('mood')}")

prompt = self._build_task_prompt(top_genres, time_context)
recent_songs = self._load_recent_songs()
prompt = self._build_task_prompt(top_genres, time_context, recent_songs)

# Set thinking budget
thinking_budget = 5000
Expand Down Expand Up @@ -515,6 +595,19 @@ def _generate_with_gemini(
logger.warning("Thinking budget nearly exhausted (%d/%d tokens)",
thoughts, thinking_budget)

# Check for empty response
if not response.text or response.text.strip() == "":
logger.error("Gemini returned empty response")
raise ValueError("Empty response from Gemini")

# Validate JSON structure
try:
json.loads(response.text)
except json.JSONDecodeError as e:
logger.error(f"Gemini returned invalid JSON: {e}")
logger.debug(f"Problematic response start: {response.text[:500]}")
raise ValueError("Invalid JSON response from Gemini") from e

Comment on lines +603 to +610
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The strict JSON validation here will reject responses with minor JSON issues (e.g., markdown fencing like ```json ... ```, trailing commas, unescaped characters) and raise a ValueError. This ValueError propagates through _generate_with_retry (which only retries rate-limit errors) and is caught in generate_all_playlists as a generic "api_error", completely bypassing the extensive JSON repair logic downstream (lines 710-754: fence stripping, repair_json, regex cleanup, truncation fallbacks).

Since response_mime_type="application/json" is already set in the Gemini config (line 578), the API should usually return valid JSON. But the existing repair chain was clearly added because malformed responses do occur in practice. This strict pre-validation converts previously recoverable situations into hard failures that consume an API call with no result.

Consider removing this strict JSON validation, or moving it to a warning-only check (log but don't raise), and let the existing downstream repair pipeline handle malformed responses.

Suggested change
# Validate JSON structure
try:
json.loads(response.text)
except json.JSONDecodeError as e:
logger.error(f"Gemini returned invalid JSON: {e}")
logger.debug(f"Problematic response start: {response.text[:500]}")
raise ValueError("Invalid JSON response from Gemini") from e
# Validate JSON structure (warning-only; allow downstream repair logic to handle issues)
try:
json.loads(response.text)
except json.JSONDecodeError as e:
logger.warning("Gemini returned non-strict JSON; deferring to downstream repair logic: %s", e)
logger.debug("Problematic response start: %s", response.text[:500])

Copilot uses AI. Check for mistakes.
return response.text

def _generate_with_openai(
Expand Down Expand Up @@ -544,7 +637,8 @@ def _generate_with_openai(
if time_context:
logger.info(f"🕐 Time context: {time_context.get('description')} - {time_context.get('mood')}")

task_prompt = self._build_task_prompt(top_genres, time_context)
recent_songs = self._load_recent_songs()
task_prompt = self._build_task_prompt(top_genres, time_context, recent_songs)
full_prompt = f"{cached_context}\n\n{task_prompt}"

response = self.client.chat.completions.create(
Expand Down Expand Up @@ -714,6 +808,11 @@ def generate_all_playlists(
self._record_ai_call()
total = sum(len(songs) for songs in all_playlists.values())
logger.info("Generated %d playlists (%d songs)", len(all_playlists), total)

# Persist all new songs for cross-run deduplication
all_new_songs = [song for songs in all_playlists.values() for song in songs]
self._save_recent_songs(all_new_songs)

return all_playlists, None

def _generate_with_retry(self, generate_func, *args, **kwargs) -> str:
Expand Down
Loading
Loading