def load_config():
    """Load the configuration from ``config.yaml`` in the working directory.

    Returns:
        The parsed configuration (an empty dict when the file is empty), or
        ``None`` after printing an error when the file is missing or is not
        valid YAML.
    """
    try:
        with open("config.yaml", encoding=locale.getpreferredencoding()) as file:
            # An empty file parses to None; return {} instead so callers can
            # tell "empty config" apart from the None that signals a failure.
            return yaml.load(file, Loader=yaml.SafeLoader) or {}
    except FileNotFoundError:
        print("Error: config.yaml not found. Please copy config.yaml.example to config.yaml")
        return None
    except yaml.YAMLError as e:
        print(f"Error parsing config.yaml: {e}")
        return None


def get_cache_dir(config):
    """Return the translation cache directory configured in *config*.

    Reads ``config['claude']['cache_dir']`` and falls back to
    ``./cache/translations`` when the config, the ``claude`` section or the
    ``cache_dir`` value is missing or empty (an empty ``claude:`` section
    parses to ``None`` and must not raise).
    """
    claude_config = config.get('claude') if isinstance(config, dict) else None
    if isinstance(claude_config, dict) and claude_config.get('cache_dir'):
        return claude_config['cache_dir']
    return os.path.join('.', 'cache', 'translations')


def get_content_hash(content):
    """Return the hexadecimal SHA256 digest of the UTF-8 encoded *content*."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()


# File names that are always scanned; the bare 'index' has no extension and
# would otherwise be missed by the extension filter below.
_INDEX_NAMES = ('index.org', 'index.md', 'index.html', 'index')
# Extensions of all other files that are treated as content.
_CONTENT_EXTENSIONS = ('.org', '.md', '.html', '.txt')


def scan_content_files(public_dir="public"):
    """Hash every content file below *public_dir*.

    A file is scanned when it is one of the index file names or ends in one
    of the content extensions. The raw file text is hashed; no org/markdown
    rendering is applied.

    Returns:
        A set with the SHA256 hex digest of each readable content file; an
        empty set when *public_dir* does not exist.
    """
    current_hashes = set()

    if not os.path.exists(public_dir):
        print(f"Warning: Public directory '{public_dir}' not found.")
        return current_hashes

    print(f"Scanning content files in {public_dir}...")
    encoding = locale.getpreferredencoding()

    for root, _dirs, files in os.walk(public_dir):
        for file in files:
            if file not in _INDEX_NAMES and not file.endswith(_CONTENT_EXTENSIONS):
                continue

            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    current_hashes.add(get_content_hash(f.read()))
            except (IOError, UnicodeDecodeError) as e:
                # Always report: a file that cannot be read contributes no
                # hash, so its cached translations will be treated as stale.
                print(f"Warning: Could not read {file_path}: {e}")

    print(f"Found {len(current_hashes)} unique content hashes.")
    return current_hashes


def parse_cache_filename(filename):
    """Split a cache file name of the form ``{hash}_{lang}.pkl``.

    The name is split at the FIRST underscore: a hex digest never contains
    one, whereas the language part may (e.g. ``pt_BR``).

    Returns:
        ``(content_hash, lang)``, or ``(None, None)`` when the name does not
        end in ``.pkl`` or either part is missing.
    """
    if not filename.endswith('.pkl'):
        return None, None

    content_hash, separator, lang = filename[:-len('.pkl')].partition('_')
    if not (separator and content_hash and lang):
        return None, None

    return content_hash, lang


def cleanup_stale_cache(cache_dir, current_hashes):
    """Delete cached translations whose content hash is not in *current_hashes*.

    Only ``.pkl`` files directly inside *cache_dir* are considered; files
    whose name cannot be parsed are reported and left untouched.

    Returns:
        The number of removed files, ``0`` when *cache_dir* does not exist,
        or ``-1`` when the directory cannot be listed.
    """
    if not os.path.exists(cache_dir):
        print(f"Cache directory {cache_dir} does not exist.")
        return 0

    try:
        filenames = os.listdir(cache_dir)
    except OSError as e:
        print(f"Error accessing cache directory: {e}")
        return -1

    removed_count = 0
    kept_count = 0

    for filename in filenames:
        if not filename.endswith('.pkl'):
            continue

        content_hash, lang = parse_cache_filename(filename)
        if content_hash is None:
            print(f"Warning: Could not parse cache filename: {filename}")
            continue

        if content_hash in current_hashes:
            kept_count += 1
            continue

        try:
            os.remove(os.path.join(cache_dir, filename))
        except OSError as e:
            # A single failed removal must not abort the whole cleanup.
            print(f"Error removing {filename}: {e}")
        else:
            print(f"Removed stale cache: {filename} (hash: {content_hash[:12]}..., lang: {lang})")
            removed_count += 1

    print("\nCleanup completed:")
    print(f"  - Removed: {removed_count} stale cache files")
    print(f"  - Kept: {kept_count} current cache files")

    return removed_count


def main():
    """Interactive entry point: scan content, confirm, then remove stale cache files.

    Exits with status 1 when the configuration cannot be loaded, no content
    is found or the cleanup fails, and with status 0 when the user declines.
    """
    print("Stale Translation Cache Cleanup")
    print("=" * 31)

    config = load_config()
    if config is None:
        sys.exit(1)

    cache_dir = get_cache_dir(config)
    print(f"Cache directory: {cache_dir}")

    # Hashes of the content that currently exists; everything else is stale.
    current_hashes = scan_content_files()

    if not current_hashes:
        # Without any hash every cache file would look stale, so refuse to run.
        print("No content files found. Nothing to validate against.")
        sys.exit(1)

    try:
        print("\nThis will remove cached translations that don't match any current content.")
        response = input("Continue? (y/N): ")
    except (KeyboardInterrupt, EOFError):
        # EOFError: stdin is closed or not interactive (e.g. piped input).
        print("\nOperation cancelled.")
        sys.exit(0)

    if response.strip().lower() not in ('y', 'yes'):
        print("Operation cancelled.")
        sys.exit(0)

    result = cleanup_stale_cache(cache_dir, current_hashes)
    if result >= 0:
        print("Stale cache cleanup completed successfully.")
    else:
        print("Stale cache cleanup failed.")
        sys.exit(1)


if __name__ == "__main__":
    main()
def load_config():
    """Load the configuration from ``config.yaml`` in the working directory.

    Returns:
        The parsed configuration (an empty dict when the file is empty), or
        ``None`` after printing an error when the file is missing or is not
        valid YAML.
    """
    try:
        with open("config.yaml", encoding=locale.getpreferredencoding()) as file:
            # An empty file parses to None; return {} instead so callers can
            # tell "empty config" apart from the None that signals a failure.
            return yaml.load(file, Loader=yaml.SafeLoader) or {}
    except FileNotFoundError:
        print("Error: config.yaml not found. Please copy config.yaml.example to config.yaml")
        return None
    except yaml.YAMLError as e:
        print(f"Error parsing config.yaml: {e}")
        return None


def get_cache_dir(config):
    """Return the translation cache directory configured in *config*.

    Reads ``config['claude']['cache_dir']`` and falls back to
    ``./cache/translations`` when the config, the ``claude`` section or the
    ``cache_dir`` value is missing or empty (an empty ``claude:`` section
    parses to ``None`` and must not raise).
    """
    claude_config = config.get('claude') if isinstance(config, dict) else None
    if isinstance(claude_config, dict) and claude_config.get('cache_dir'):
        return claude_config['cache_dir']
    return os.path.join('.', 'cache', 'translations')


def clear_cache(cache_dir):
    """Remove the whole translation cache directory.

    Returns:
        The number of ``.pkl`` files that were present before removal, ``0``
        when the directory does not exist or holds no ``.pkl`` file (nothing
        is deleted in that case), or ``-1`` when removal fails.
    """
    if not os.path.exists(cache_dir):
        print(f"Cache directory {cache_dir} does not exist.")
        return 0

    try:
        # Count the cache entries first so the summary can report them.
        file_count = sum(
            1
            for _root, _dirs, files in os.walk(cache_dir)
            for name in files
            if name.endswith('.pkl')
        )

        if file_count == 0:
            print("No cache files found.")
            return 0

        # NOTE(review): rmtree deletes EVERYTHING below cache_dir, not only
        # the counted .pkl files. cache_dir comes from config.yaml, so a
        # misconfigured path would be wiped as well - confirm this is wanted.
        shutil.rmtree(cache_dir)
    except OSError as e:
        print(f"Error clearing cache: {e}")
        return -1

    print(f"Successfully cleared {file_count} cached translations from {cache_dir}")
    return file_count


def main():
    """Interactive entry point: confirm, then clear the translation cache.

    Exits with status 1 when the configuration cannot be loaded or clearing
    fails, and with status 0 when the user declines.
    """
    print("Translation Cache Cleaner")
    print("=" * 25)

    config = load_config()
    if config is None:
        sys.exit(1)

    cache_dir = get_cache_dir(config)
    print(f"Cache directory: {cache_dir}")

    try:
        response = input("Are you sure you want to clear the entire cache? (y/N): ")
    except (KeyboardInterrupt, EOFError):
        # EOFError: stdin is closed or not interactive (e.g. piped input).
        print("\nOperation cancelled.")
        sys.exit(0)

    if response.strip().lower() not in ('y', 'yes'):
        print("Operation cancelled.")
        sys.exit(0)

    result = clear_cache(cache_dir)
    if result >= 0:
        print("Cache clearing completed successfully.")
    else:
        print("Cache clearing failed.")
        sys.exit(1)


if __name__ == "__main__":
    main()
+ + # API base URL (leave default unless using a proxy) + base_url: "https://api.anthropic.com" + + # Request timeout in seconds + timeout: 30 + + # Cache directory for translations (relative to project root) + cache_dir: "./cache/translations" diff --git a/manage_cache.sh b/manage_cache.sh new file mode 100755 index 0000000..ac9c1a5 --- /dev/null +++ b/manage_cache.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Translation Cache Management Script + +show_help() { + echo "Translation Cache Management" + echo "Usage: $0 [clear|cleanup|help]" + echo "" + echo "Commands:" + echo " clear - Clear the entire translation cache" + echo " cleanup - Remove stale translations (source content changed)" + echo " help - Show this help message" + echo "" + echo "Examples:" + echo " $0 clear # Clear all cached translations" + echo " $0 cleanup # Remove outdated cache entries" +} + +case "$1" in + "clear") + echo "Clearing entire translation cache..." + python3 clear_translation_cache.py + ;; + "cleanup") + echo "Cleaning up stale translations..." + python3 cleanup_stale_translations.py + ;; + "help"|"-h"|"--help") + show_help + ;; + "") + echo "Error: No command specified." 
def get_cache_dir():
    """Return the translation cache directory from the module-level ``config``.

    Reads ``config['claude']['cache_dir']`` and falls back to
    ``./cache/translations`` when the config, the ``claude`` section or the
    ``cache_dir`` value is missing or empty (an empty ``claude:`` section
    parses to ``None`` and must not raise).
    """
    claude_config = config.get('claude') if isinstance(config, dict) else None
    if isinstance(claude_config, dict) and claude_config.get('cache_dir'):
        return claude_config['cache_dir']
    return os.path.join('.', 'cache', 'translations')

def ensure_cache_dir():
    """Create the cache directory (and missing parents) if it does not exist."""
    # exist_ok makes a separate existence check unnecessary and race-free.
    os.makedirs(get_cache_dir(), exist_ok=True)

def get_cache_key(content_hash, target_lang):
    """Return the cache file name ``{content_hash}_{target_lang}.pkl``."""
    # NOTE(review): target_lang is used in a file name unchanged - confirm
    # callers never pass a value containing path separators.
    return f"{content_hash}_{target_lang}.pkl"

def get_cached_translation(content_hash, target_lang):
    """Return the cached translation for the given hash and language.

    Returns:
        The unpickled translation, or ``None`` when no cache file exists or
        the file cannot be read or unpickled (e.g. it is truncated).
    """
    ensure_cache_dir()
    cache_file = os.path.join(get_cache_dir(), get_cache_key(content_hash, target_lang))
    try:
        with open(cache_file, 'rb') as f:
            # NOTE(review): unpickling can execute code stored in the file;
            # the cache directory must only be writable by trusted users.
            return pickle.load(f)
    except (OSError, pickle.PickleError, EOFError, AttributeError, ImportError, IndexError):
        # A missing or damaged entry is a cache miss, not an error: besides
        # PickleError, pickle.load raises EOFError and others on corrupt data.
        return None
def cache_translation(content_hash, target_lang, translation):
    """Store *translation* in the cache for the given hash and language.

    Caching is best effort: write and pickling errors are ignored so a
    failing cache never breaks the caller.
    """
    ensure_cache_dir()
    cache_file = os.path.join(get_cache_dir(), get_cache_key(content_hash, target_lang))
    # Write to a temporary file and rename it into place, so an interrupted
    # write never leaves a truncated pickle under the real cache name.
    temp_file = f"{cache_file}.{os.getpid()}.tmp"
    try:
        with open(temp_file, 'wb') as f:
            pickle.dump(translation, f)
        os.replace(temp_file, cache_file)
    except (OSError, pickle.PickleError):
        try:
            os.remove(temp_file)
        except OSError:
            pass

# Language code to language name mapping (ISO 639-3)
_LANG_NAMES = {
    'afr': 'Afrikaans', 'ara': 'Arabic', 'deu': 'German', 'fra': 'French',
    'ita': 'Italian', 'ell': 'Greek', 'hun': 'Hungarian', 'por': 'Portuguese',
    'ces': 'Czech', 'slk': 'Slovak', 'slv': 'Slovenian', 'hrv': 'Croatian',
    'gsw': 'Swiss German', 'nld': 'Dutch', 'bul': 'Bulgarian', 'mnk': 'Mandinka',
    'jpn': 'Japanese', 'rus': 'Russian', 'gla': 'Scottish Gaelic',
}

def translate_claude(content, target_lang):
    """Translate *content* from English into *target_lang* using the Claude API.

    Args:
        content: The source text to translate.
        target_lang: A language code; known ISO 639-3 codes are replaced by
            the language name, any other value is passed to the model as is.

    Returns:
        The text of the first content block of the model's response.

    Raises:
        ImportError: The ``anthropic`` package is not installed.
        ValueError: The ``claude`` config section or a real API key is missing.
        RuntimeError: The API call or reading its response failed.
    """
    if not anthropic:
        raise ImportError(
            "anthropic package not installed. Please install with: pip install anthropic"
        )

    claude_config = config.get('claude') if isinstance(config, dict) else None
    if not isinstance(claude_config, dict):
        # Also covers an empty "claude:" section, which parses to None.
        raise ValueError("Claude configuration not found in config.yaml")

    api_key = claude_config.get('api_key')
    if not api_key or api_key == "your_anthropic_api_key_here":
        raise ValueError("Valid Claude API key not configured")

    # NOTE(review): the 'timeout' value offered in config.yaml.example is not
    # passed to the client here, so the SDK default applies - confirm intent.
    client = anthropic.Anthropic(
        api_key=api_key,
        base_url=claude_config.get('base_url', 'https://api.anthropic.com')
    )

    target_language = _LANG_NAMES.get(target_lang, target_lang)

    try:
        message = client.messages.create(
            model=claude_config.get('model', 'claude-3-5-sonnet-20241022'),
            max_tokens=claude_config.get('max_tokens', 4096),
            temperature=claude_config.get('temperature', 0.7),
            # A configured system_prompt replaces the translation
            # instructions below completely.
            system=claude_config.get(
                'system_prompt', 'You translate org-mode, markdown and HTML '
                'documents from english to other languages. You maintain the '
                'original formatting and tone. You only translate the text, not '
                'the code blocks or HTML tags. You do not add any additional '
                'text, except for a note at the top that this text has been '
                'translated by an AI. If you do not know the target language, '
                'you simply return the original text. You output only the '
                'resulting text, nothing else.'
            ),
            messages=[{
                "role": "user",
                "content": (
                    f"Translate this webpage content from English to {target_language}:\n\n"
                    + content
                )
            }]
        )
        return message.content[0].text
    except Exception as e:
        raise RuntimeError(f"Translation failed: {str(e)}") from e

def get_content_hash(content):
    """Return the hexadecimal SHA256 digest of the UTF-8 encoded *content*."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

def is_claude_available():
    """Return ``True`` when Claude translation can be used.

    That requires the ``anthropic`` package, a ``claude`` config section and
    an API key that is set and is not the placeholder from the example config.
    """
    if not anthropic:
        return False

    claude_config = config.get('claude') if isinstance(config, dict) else None
    if not isinstance(claude_config, dict):
        return False

    api_key = claude_config.get('api_key')

    # bool(): return a real boolean instead of leaking the key (or None).
    return bool(api_key) and api_key != "your_anthropic_api_key_here"
+ """ + full_path = os.path.join(pathprefix, path) + source_content = None + file_type = None + + # Check for index.org file + org_path = os.path.join(full_path, 'index.org') + if os.path.isfile(org_path): + source_content = readfile(org_path) + file_type = 'org' + else: + # Check for index.md file + md_path = os.path.join(full_path, 'index.md') + if os.path.isfile(md_path): + source_content = readfile(md_path) + file_type = 'md' + else: + # Check for index.html file + html_path = os.path.join(full_path, 'index.html') + if os.path.isfile(html_path): + source_content = readfile(html_path) + file_type = 'html' + else: + # Check for plain index file + index_path = os.path.join(full_path, 'index') + if os.path.isfile(index_path): + source_content = readfile(index_path) + file_type = 'plain' + else: + # Default fallback - return directory header + return f'
diff --git a/templates/nomike.com/directory.html b/templates/nomike.com/directory.html index c9c26a4..790c5d5 100644 --- a/templates/nomike.com/directory.html +++ b/templates/nomike.com/directory.html @@ -12,10 +12,4 @@ {{ name }} {% else %} {{ name }} {% endif %} {% endfor %} - {% endif -%} {% endblock -%} {% block content -%} {% if templatehelper.os.path.isfile(templatehelper.os.path.join(pathprefix, path,'index.org')) -%} {{ templatehelper.orgpython.to_html(templatehelper.readfile(templatehelper.os.path.join(pathprefix,path,'index.org'))) - | safe }} {% elif templatehelper.os.path.isfile(templatehelper.os.path.join(pathprefix, path,'index.md')) -%} {{ templatehelper.markdown.markdown(templatehelper.readfile(templatehelper.os.path.join(pathprefix,path,'index.md')),extensions=['fenced_code','toc','tables']) - | safe }} {% elif templatehelper.os.path.isfile(templatehelper.os.path.join(pathprefix, path, 'index.html')) -%} {{ templatehelper.readfile(templatehelper.os.path.join(pathprefix, path, 'index.html')) - | safe -}} {% elif templatehelper.os.path.isfile(templatehelper.os.path.join(pathprefix, path, 'index')) -%} {{ templatehelper.readfile(templatehelper.os.path.join(pathprefix, path, 'index')) - | safe -}} {% else -%} -