From e0905ef97909cd81c9a15b32df1478b252444e71 Mon Sep 17 00:00:00 2001 From: arahangua Date: Tue, 12 Aug 2025 17:10:52 +0900 Subject: [PATCH 1/7] minor hotfix: wrong statement on wan2.2 --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 5b1d92f..d25967f 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,7 @@ Ever burned through credits in minutes? Searching Reddit for that one optimizati ![Classic AI Frustration](assets/guy_freaking_out2.png) -**SCAPO** extracts **specific, actionable optimization techniques** from Reddit about AI services - not generic "write better prompts" advice, but real techniques like: -- "Use HeyGen API v1 for unlimited ElevenLabs access at $10/month" -- "GitHub Copilot has a 300 request/day limit" -- "Avoid slowing ElevenLabs speech >5% to prevent stutters" -- "Set temperature=0.7 for wan2.2 video generation" +**SCAPO** extracts **specific, actionable optimization techniques** from Reddit about AI services - not generic "write better prompts" advice, but real discussions. ## ✨ Two Approaches From 0c0facf2ee9b62d049071a36e382f6a3accde81c Mon Sep 17 00:00:00 2001 From: arahangua Date: Tue, 12 Aug 2025 17:21:11 +0900 Subject: [PATCH 2/7] fixed: left-out acknowledgements --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d25967f..9ad7341 100644 --- a/README.md +++ b/README.md @@ -296,6 +296,8 @@ Built as part of the CZero Engine project to improve AI application development. 
- Reddit communities for sharing real experiences - OpenRouter for accessible AI APIs - Coffee ☕ for making this possible +- Ollama and LMstudio for awesome local LLM experience +- All opensource contributors in this AI space --- From 2616490b1423ccfde0f04119bb3c19b72d72f851 Mon Sep 17 00:00:00 2001 From: arahangua Date: Tue, 12 Aug 2025 20:23:33 +0900 Subject: [PATCH 3/7] checkpoint:one-pass run of 22 high-priority services --- QUICKSTART.md | 10 +- README.md | 3 + models/audio/eleven-labs/cost_optimization.md | 16 ++- models/audio/eleven-labs/metadata.json | 6 +- models/audio/eleven-labs/parameters.json | 14 ++- models/audio/eleven-labs/pitfalls.md | 16 ++- models/audio/eleven-labs/prompting.md | 10 +- models/code/cursor/cost_optimization.md | 20 ++++ .../characterai => code/cursor}/metadata.json | 8 +- models/code/cursor/parameters.json | 15 +++ models/code/cursor/pitfalls.md | 21 ++++ models/code/cursor/prompting.md | 17 +++ .../code/github-copilot/cost_optimization.md | 15 +-- models/code/github-copilot/metadata.json | 8 +- models/code/github-copilot/parameters.json | 18 +-- models/code/github-copilot/pitfalls.md | 18 ++- models/code/github-copilot/prompting.md | 8 +- .../general/leonardo-ai/cost_optimization.md | 12 ++ models/general/leonardo-ai/metadata.json | 13 +++ models/general/leonardo-ai/parameters.json | 13 +++ models/general/leonardo-ai/pitfalls.md | 8 ++ models/multimodal/characterai/parameters.json | 17 --- models/multimodal/characterai/pitfalls.md | 16 --- models/multimodal/characterai/prompting.md | 18 --- models/text/characterai/metadata.json | 13 +++ models/text/characterai/parameters.json | 14 +++ models/text/characterai/pitfalls.md | 18 +++ models/text/characterai/prompting.md | 16 +++ models/text/fliki/cost_optimization.md | 8 ++ models/text/fliki/metadata.json | 13 +++ models/text/fliki/parameters.json | 12 ++ models/text/fliki/prompting.md | 12 ++ models/text/ideogram/cost_optimization.md | 8 ++ models/text/ideogram/metadata.json | 13 +++ 
models/text/ideogram/parameters.json | 12 ++ models/text/ideogram/pitfalls.md | 8 ++ models/text/ideogram/prompting.md | 13 +++ models/text/playht/cost_optimization.md | 8 ++ models/text/playht/metadata.json | 13 +++ models/text/playht/parameters.json | 12 ++ models/text/resemble-ai/cost_optimization.md | 8 ++ models/text/resemble-ai/metadata.json | 13 +++ models/text/resemble-ai/parameters.json | 12 ++ models/video/heygen/metadata.json | 13 +++ models/video/heygen/prompting.md | 18 +++ models/video/pika/metadata.json | 13 +++ models/video/pika/pitfalls.md | 8 ++ models/video/pika/prompting.md | 10 ++ models/video/runway/cost_optimization.md | 9 +- models/video/runway/metadata.json | 6 +- models/video/runway/parameters.json | 7 +- models/video/runway/pitfalls.md | 9 +- models/video/runway/prompting.md | 10 +- src/cli.py | 107 ++++++++++++++++++ src/services/batch_llm_processor.py | 12 +- 55 files changed, 627 insertions(+), 141 deletions(-) create mode 100644 models/code/cursor/cost_optimization.md rename models/{multimodal/characterai => code/cursor}/metadata.json (52%) create mode 100644 models/code/cursor/parameters.json create mode 100644 models/code/cursor/pitfalls.md create mode 100644 models/code/cursor/prompting.md create mode 100644 models/general/leonardo-ai/cost_optimization.md create mode 100644 models/general/leonardo-ai/metadata.json create mode 100644 models/general/leonardo-ai/parameters.json create mode 100644 models/general/leonardo-ai/pitfalls.md delete mode 100644 models/multimodal/characterai/parameters.json delete mode 100644 models/multimodal/characterai/pitfalls.md delete mode 100644 models/multimodal/characterai/prompting.md create mode 100644 models/text/characterai/metadata.json create mode 100644 models/text/characterai/parameters.json create mode 100644 models/text/characterai/pitfalls.md create mode 100644 models/text/characterai/prompting.md create mode 100644 models/text/fliki/cost_optimization.md create mode 100644 
models/text/fliki/metadata.json create mode 100644 models/text/fliki/parameters.json create mode 100644 models/text/fliki/prompting.md create mode 100644 models/text/ideogram/cost_optimization.md create mode 100644 models/text/ideogram/metadata.json create mode 100644 models/text/ideogram/parameters.json create mode 100644 models/text/ideogram/pitfalls.md create mode 100644 models/text/ideogram/prompting.md create mode 100644 models/text/playht/cost_optimization.md create mode 100644 models/text/playht/metadata.json create mode 100644 models/text/playht/parameters.json create mode 100644 models/text/resemble-ai/cost_optimization.md create mode 100644 models/text/resemble-ai/metadata.json create mode 100644 models/text/resemble-ai/parameters.json create mode 100644 models/video/heygen/metadata.json create mode 100644 models/video/heygen/prompting.md create mode 100644 models/video/pika/metadata.json create mode 100644 models/video/pika/pitfalls.md create mode 100644 models/video/pika/prompting.md diff --git a/QUICKSTART.md b/QUICKSTART.md index c8eadf9..4b4ae2b 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -60,13 +60,17 @@ scapo scrape targeted --service "GitHub Copilot" --limit 20 # Or batch process by category scapo scrape batch --category video --limit 15 -scapo scrape batch --max-services 3 --priority ultra + +# Process ALL priority services one by one +scapo scrape all --priority ultra --limit 20 # Process all ultra priority services +scapo scrape all --dry-run # Preview what will be processed ``` ### Key Commands: -- `discover --update` - Find services from GitHub Awesome lists +- `discover --update` - Find services from GitHub Awesome lists - `targeted --service NAME` - Extract tips for one service -- `batch --category TYPE` - Process multiple services +- `batch --category TYPE` - Process multiple services (limited) +- `all --priority LEVEL` - Process ALL services one by one - `update-status` - See what needs updating ## 📚 Approach 2: Legacy Sources diff 
--git a/README.md b/README.md index 9ad7341..469fa71 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,9 @@ scapo scrape targeted --service "Midjourney" --limit 20 # Or batch process multiple services scapo scrape batch --category video --limit 15 + +# Process ALL priority services one by one (no limits!) +scapo scrape all --priority ultra --limit 20 ``` #### Option B: Legacy Sources diff --git a/models/audio/eleven-labs/cost_optimization.md b/models/audio/eleven-labs/cost_optimization.md index a84fb7c..9dd260b 100644 --- a/models/audio/eleven-labs/cost_optimization.md +++ b/models/audio/eleven-labs/cost_optimization.md @@ -1,11 +1,17 @@ # Eleven Labs - Cost Optimization Guide -*Last updated: 2025-08-11* +*Last updated: 2025-08-12* ## Pricing Information -- $100-$200/month for multiple hours of audio per week -- Creator plan: $22/month for 100k characters -- $22/month provides approximately 2 hours of content -- Business subscription: over $1,300/month +- $10/month or $96/year for Reader trial +- $99/month plan +- $29/month for Reader unlimited plan +- $22/month for Creator plan (100k characters) +- $1,300+/month for Business subscription + +## Money-Saving Tips + +- Free plan: 12 minutes/month via web UI, 2h30 via API +- Approximately 400,000 credits were removed; 60% of credits left should last until June 5th. 
diff --git a/models/audio/eleven-labs/metadata.json b/models/audio/eleven-labs/metadata.json index 6334f55..ffa53d6 100644 --- a/models/audio/eleven-labs/metadata.json +++ b/models/audio/eleven-labs/metadata.json @@ -1,13 +1,13 @@ { "service": "Eleven Labs", "category": "audio", - "last_updated": "2025-08-11T23:07:25.519029", - "extraction_timestamp": "2025-08-11T23:06:24.446102", + "last_updated": "2025-08-12T20:08:46.442319", + "extraction_timestamp": "2025-08-12T20:08:37.495998", "data_sources": [ "Reddit API", "Community discussions" ], - "posts_analyzed": 49, + "posts_analyzed": 79, "confidence": "medium", "version": "1.0.0" } \ No newline at end of file diff --git a/models/audio/eleven-labs/parameters.json b/models/audio/eleven-labs/parameters.json index c33ac5c..4564290 100644 --- a/models/audio/eleven-labs/parameters.json +++ b/models/audio/eleven-labs/parameters.json @@ -1,13 +1,21 @@ { "service": "Eleven Labs", - "last_updated": "2025-08-11T23:07:25.519029", + "last_updated": "2025-08-12T20:08:46.346345", "recommended_settings": { "setting_0": { - "description": "In Manim implementation: set_speech_service(ElevenLabsService(voice_name=\"Mun W\"))" + "description": "Model ID: gemini-2.0-flash" + }, + "setting_1": { + "description": "API Key: (user-provided)" + }, + "setting_2": { + "description": "Server URL: https://9df9e70d40a2.ngrok-free.app/v1/big-chief" } }, "cost_optimization": { - "pricing": "Business subscription: over $1,300/month" + "pricing": "$1,300+/month for Business subscription", + "tip_1": "Approximately 400,000 credits were removed; 60% of credits left should last until June 5th.", + "tip_2": "Free plan: 12 minutes/month via web UI, 2h30 via API" }, "sources": [ "Reddit community", diff --git a/models/audio/eleven-labs/pitfalls.md b/models/audio/eleven-labs/pitfalls.md index 8cdd8b1..ecf442a 100644 --- a/models/audio/eleven-labs/pitfalls.md +++ b/models/audio/eleven-labs/pitfalls.md @@ -1,14 +1,22 @@ # Eleven Labs - Common Pitfalls & 
Issues -*Last updated: 2025-08-11* +*Last updated: 2025-08-12* ## Technical Issues -### ⚠️ Error when accessing Eleven Labs API from self-hosted n8n (specific error message not provided) +### ⚠️ Unable to switch back to a Custom LLM after testing with built-in model gemini-2.0-flash; interface shows 'Fix the errors to proceed' even though Server URL, Model ID, and API Key are correctly filled. +**Fix**: Store API keys in environment variables or use a secrets manager. + +## Policy & Account Issues + +### ⚠️ 400,000 credits were wiped from account on the $99/month plan; subscription renewal failed due to paywall issues, leaving 60% of credits unused. +**Note**: Be aware of terms of service regarding account creation. ## Cost & Limits -### 💰 Credits don't transfer to the next month +### 💰 Credits from ElevenLabs free plan do not roll over to the next month. + +### 💰 $29/month for Reader unlimited plan -### 💰 Creator plan: $22/month for 100k characters +### 💰 $22/month for Creator plan (100k characters) diff --git a/models/audio/eleven-labs/prompting.md b/models/audio/eleven-labs/prompting.md index b84d395..76b915b 100644 --- a/models/audio/eleven-labs/prompting.md +++ b/models/audio/eleven-labs/prompting.md @@ -1,14 +1,12 @@ # Eleven Labs Prompting Guide -*Last updated: 2025-08-11* - -## Technical Tips - -- For Manim voiceovers, use ElevenLabsService with voice_name='Mun W' parameter +*Last updated: 2025-08-12* ## Recommended Settings -- In Manim implementation: set_speech_service(ElevenLabsService(voice_name="Mun W")) +- Model ID: gemini-2.0-flash +- API Key: (user-provided) +- Server URL: https://9df9e70d40a2.ngrok-free.app/v1/big-chief ## Sources diff --git a/models/code/cursor/cost_optimization.md b/models/code/cursor/cost_optimization.md new file mode 100644 index 0000000..4cb6880 --- /dev/null +++ b/models/code/cursor/cost_optimization.md @@ -0,0 +1,20 @@ +# Cursor - Cost Optimization Guide + +*Last updated: 2025-08-12* + +## Pricing Information + +- API costs 
have decreased in recent months; prompt caching further lowers costs. +- Using an API key for Anthropic's Sonnet 4 can reduce cost to about $0.73 per 1,000 tokens compared to $10–15+ per 1,000 tokens when using Cursor’s built‑in billing. +- Monthly subscription remains $20. + +## Money-Saving Tips + +- Use the free Meta Llama models available in Cursor: meta-llama/llama-3.1-405b-instruct, meta-llama/llama-3.2-90b-vision-instruct, meta-llama/llama-3.1-70b-instruct. +- Pro plan indexing limit: 100,000 files +- deepseekcoder‑v2 is cheaper than other models. + +## Alternative Access Methods + +- Enable prompt caching in Cursor to reduce API costs; the deepseekcoder‑v2 model is noted to be cheaper than other models. + diff --git a/models/multimodal/characterai/metadata.json b/models/code/cursor/metadata.json similarity index 52% rename from models/multimodal/characterai/metadata.json rename to models/code/cursor/metadata.json index 0f27d4f..f0787d7 100644 --- a/models/multimodal/characterai/metadata.json +++ b/models/code/cursor/metadata.json @@ -1,13 +1,13 @@ { - "service": "Character.AI", - "category": "multimodal", - "last_updated": "2025-08-11T22:13:29.427007", + "service": "Cursor", + "category": "code", + "last_updated": "2025-08-12T20:05:11.499882", "extraction_timestamp": null, "data_sources": [ "Reddit API", "Community discussions" ], - "posts_analyzed": 75, + "posts_analyzed": 100, "confidence": "medium", "version": "1.0.0" } \ No newline at end of file diff --git a/models/code/cursor/parameters.json b/models/code/cursor/parameters.json new file mode 100644 index 0000000..8c88a18 --- /dev/null +++ b/models/code/cursor/parameters.json @@ -0,0 +1,15 @@ +{ + "service": "Cursor", + "last_updated": "2025-08-12T20:05:11.435746", + "recommended_settings": {}, + "cost_optimization": { + "tip_0": "API costs have decreased in recent months; prompt caching further lowers costs.", + "tip_1": "Pro plan indexing limit: 100,000 files", + "pricing": "Monthly subscription 
remains $20.", + "tip_3": "deepseekcoder\u2011v2 is cheaper than other models." + }, + "sources": [ + "Reddit community", + "User reports" + ] +} \ No newline at end of file diff --git a/models/code/cursor/pitfalls.md b/models/code/cursor/pitfalls.md new file mode 100644 index 0000000..369aa8f --- /dev/null +++ b/models/code/cursor/pitfalls.md @@ -0,0 +1,21 @@ +# Cursor - Common Pitfalls & Issues + +*Last updated: 2025-08-12* + +## Technical Issues + +### ⚠️ Cursor charges usage‑based pricing even when using an API key without a subscription. +**Fix**: Store API keys in environment variables or use a secrets manager. + +### ⚠️ BYOK (Bring Your Own API Key) no longer works: error message "Agent and Edit rely on custom models that cannot be billed to". +**Fix**: Store API keys in environment variables or use a secrets manager. + +### ⚠️ Using own OpenAI API key still subject to Free Tier limitations +**Fix**: Store API keys in environment variables or use a secrets manager. + +## Cost & Limits + +### 💰 Pro plan indexing limit of 100,000 files + +### 💰 Pro plan indexing limit: 100,000 files + diff --git a/models/code/cursor/prompting.md b/models/code/cursor/prompting.md new file mode 100644 index 0000000..a42f563 --- /dev/null +++ b/models/code/cursor/prompting.md @@ -0,0 +1,17 @@ +# Cursor Prompting Guide + +*Last updated: 2025-08-12* + +## Technical Tips + +- Enable prompt caching in Cursor to reduce API costs; the deepseekcoder‑v2 model is noted to be cheaper than other models. + +## Usage Tips + +- Use the free Meta Llama models available in Cursor: meta-llama/llama-3.1-405b-instruct, meta-llama/llama-3.2-90b-vision-instruct, meta-llama/llama-3.1-70b-instruct. +- Run any AI model (e.g., GROQ or local models) in Cursor by setting up a proxy server; use the R1SONQWEN implementation for a working example. 
+ +## Sources + +- Reddit community discussions +- User-reported experiences diff --git a/models/code/github-copilot/cost_optimization.md b/models/code/github-copilot/cost_optimization.md index 8de50cd..6c882d4 100644 --- a/models/code/github-copilot/cost_optimization.md +++ b/models/code/github-copilot/cost_optimization.md @@ -1,19 +1,14 @@ # GitHub Copilot - Cost Optimization Guide -*Last updated: 2025-08-11* +*Last updated: 2025-08-12* ## Pricing Information -- GitHub Copilot Pro: $10/month with unlimited standard usage (fair use policies apply) -- More cost-effective than Cursor Pro ($20/month with only 500 premium requests) +- $10/month for the standard Copilot plan ## Money-Saving Tips -- Unclear if Azure Credits can be used to pay for GitHub Copilot -- GitHub Copilot now has a free tier -- GitHub Copilot Pro has a daily request limit of 300 requests for all models including GPT-4.1 -- GitHub Copilot with Sonnet 4 is a cheaper option but slower -- Copilot Pro is mentioned as a higher tier than the free tier -- GitHub Copilot with Sonnet 4 is described as 'cheap' compared to alternatives -- This limit equals about 4 hours of usage per day +- Copilot Pro: 300 premium requests/month (before Pro+ upgrade) +- Free tier: 2,000 completions/month, 50 chat prompts/month +- Paid tier: unlimited completions and chat prompts diff --git a/models/code/github-copilot/metadata.json b/models/code/github-copilot/metadata.json index e1e17a5..35e3889 100644 --- a/models/code/github-copilot/metadata.json +++ b/models/code/github-copilot/metadata.json @@ -1,13 +1,13 @@ { "service": "GitHub Copilot", - "category": "coding", - "last_updated": "2025-08-11T22:04:52.548712", - "extraction_timestamp": "2025-08-11T22:03:30.694771", + "category": "code", + "last_updated": "2025-08-12T20:04:22.865982", + "extraction_timestamp": "2025-08-12T20:04:20.280914", "data_sources": [ "Reddit API", "Community discussions" ], - "posts_analyzed": 15, + "posts_analyzed": 82, "confidence": "medium", 
"version": "1.0.0" } \ No newline at end of file diff --git a/models/code/github-copilot/parameters.json b/models/code/github-copilot/parameters.json index 1abf98d..313405f 100644 --- a/models/code/github-copilot/parameters.json +++ b/models/code/github-copilot/parameters.json @@ -1,19 +1,19 @@ { "service": "GitHub Copilot", - "last_updated": "2025-08-11T22:04:52.544713", + "last_updated": "2025-08-12T20:04:22.768265", "recommended_settings": { "setting_0": { - "description": "GitHub Copilot can be configured to use Sonnet 4" + "description": "github.copilot.chat.agent.autoFix" + }, + "setting_1": { + "description": "chat.tools.autoApprove" } }, "cost_optimization": { - "tip_0": "Unclear if Azure Credits can be used to pay for GitHub Copilot", - "pricing": "More cost-effective than Cursor Pro ($20/month with only 500 premium requests)", - "tip_2": "GitHub Copilot now has a free tier", - "tip_3": "GitHub Copilot Pro has a daily request limit of 300 requests for all models including GPT-4.1", - "tip_4": "Copilot Pro is mentioned as a higher tier than the free tier", - "tip_5": "GitHub Copilot with Sonnet 4 is described as 'cheap' compared to alternatives", - "tip_6": "This limit equals about 4 hours of usage per day" + "pricing": "$10/month for the standard Copilot plan", + "tip_1": "Copilot Pro: 300 premium requests/month (before Pro+ upgrade)", + "tip_2": "Free tier: 2,000 completions/month, 50 chat prompts/month", + "unlimited_option": "Paid tier: unlimited completions and chat prompts" }, "sources": [ "Reddit community", diff --git a/models/code/github-copilot/pitfalls.md b/models/code/github-copilot/pitfalls.md index d2b709f..33c7f00 100644 --- a/models/code/github-copilot/pitfalls.md +++ b/models/code/github-copilot/pitfalls.md @@ -1,23 +1,21 @@ # GitHub Copilot - Common Pitfalls & Issues -*Last updated: 2025-08-11* +*Last updated: 2025-08-12* ## Technical Issues -### ⚠️ Slower performance compared to alternatives like Claude Code +### ⚠️ Extension 
'GitHub.copilot-chat' cannot use API proposal: chatParticipantPrivate. Its package.json#enabledApiProposals-property declares: but NOT chatParticipantPrivate. -## Policy & Account Issues +### ⚠️ 300 requests per day limit on VS Code LM API for Copilot Pro -### ⚠️ Technical issue with signing up for the free tier after trial expiration (continuous redirect between pages) -**Note**: Be aware of terms of service regarding account creation. +### ⚠️ GitHub Copilot extension lost Gemini API access; only Groq and OpenRouter API keys can be added. +**Fix**: Store API keys in environment variables or use a secrets manager. ## Cost & Limits -### 💰 Daily request limit of 300 requests for all models including GPT-4.1 (equals about 4 hours of usage per day) +### 💰 300 premium requests per month limit on Copilot Pro -### 💰 GitHub Copilot Pro: $10/month with unlimited standard usage (fair use policies apply) +### 💰 Free tier limited to 2,000 completions per month and 50 chat prompts -### 💰 GitHub Copilot Pro has a daily request limit of 300 requests for all models including GPT-4.1 - -### 💰 This limit equals about 4 hours of usage per day +### 💰 Paid tier: unlimited completions and chat prompts diff --git a/models/code/github-copilot/prompting.md b/models/code/github-copilot/prompting.md index 458c909..bd10987 100644 --- a/models/code/github-copilot/prompting.md +++ b/models/code/github-copilot/prompting.md @@ -1,14 +1,16 @@ # GitHub Copilot Prompting Guide -*Last updated: 2025-08-11* +*Last updated: 2025-08-12* ## Usage Tips -- Create a copilot-instructions.md file with specific sections including: Terminology (domain-specific terms), Architecture (key files and design decisions), and Task Plan +- When Gemini API access is unavailable in the GitHub Copilot extension, add a Groq or OpenRouter API key via the extension settings to regain model selection. 
+- Create a custom VS Code extension that uses the GitHub Copilot API as an LLM provider to bypass the standard completion limits ## Recommended Settings -- GitHub Copilot can be configured to use Sonnet 4 +- github.copilot.chat.agent.autoFix +- chat.tools.autoApprove ## Sources diff --git a/models/general/leonardo-ai/cost_optimization.md b/models/general/leonardo-ai/cost_optimization.md new file mode 100644 index 0000000..84c2c8a --- /dev/null +++ b/models/general/leonardo-ai/cost_optimization.md @@ -0,0 +1,12 @@ +# Leonardo AI - Cost Optimization Guide + +*Last updated: 2025-08-12* + +## Pricing Information + +- Maestro Unlimited Plan: $60 per month for 60,000 fast tokens (2,500 tokens per 8‑second clip, costing $2.5 per clip). + +## Money-Saving Tips + +- Free plan available, free trial. + diff --git a/models/general/leonardo-ai/metadata.json b/models/general/leonardo-ai/metadata.json new file mode 100644 index 0000000..641a893 --- /dev/null +++ b/models/general/leonardo-ai/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "Leonardo AI", + "category": "general", + "last_updated": "2025-08-12T20:05:49.276178", + "extraction_timestamp": "2025-08-12T20:05:47.750787", + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 45, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/general/leonardo-ai/parameters.json b/models/general/leonardo-ai/parameters.json new file mode 100644 index 0000000..b37aa61 --- /dev/null +++ b/models/general/leonardo-ai/parameters.json @@ -0,0 +1,13 @@ +{ + "service": "Leonardo AI", + "last_updated": "2025-08-12T20:05:49.209009", + "recommended_settings": {}, + "cost_optimization": { + "tip_0": "Free plan available, free trial.", + "pricing": "Maestro Unlimited Plan: $60 per month for 60,000 fast tokens (2,500 tokens per 8\u2011second clip, costing $2.5 per clip)." 
+ }, + "sources": [ + "Reddit community", + "User reports" + ] +} \ No newline at end of file diff --git a/models/general/leonardo-ai/pitfalls.md b/models/general/leonardo-ai/pitfalls.md new file mode 100644 index 0000000..d611e0a --- /dev/null +++ b/models/general/leonardo-ai/pitfalls.md @@ -0,0 +1,8 @@ +# Leonardo AI - Common Pitfalls & Issues + +*Last updated: 2025-08-12* + +## Cost & Limits + +### 💰 Maestro Unlimited Plan: $60 per month for 60,000 fast tokens (2,500 tokens per 8‑second clip, costing $2.5 per clip). + diff --git a/models/multimodal/characterai/parameters.json b/models/multimodal/characterai/parameters.json deleted file mode 100644 index 65fc345..0000000 --- a/models/multimodal/characterai/parameters.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "service": "Character.AI", - "last_updated": "2025-08-11T22:13:29.425767", - "recommended_settings": { - "setting_0": { - "description": "Goro setting in experimental section of Style tab" - }, - "setting_1": { - "description": "Limited chat style for users over 18" - } - }, - "cost_optimization": {}, - "sources": [ - "Reddit community", - "User reports" - ] -} \ No newline at end of file diff --git a/models/multimodal/characterai/pitfalls.md b/models/multimodal/characterai/pitfalls.md deleted file mode 100644 index 2b641e1..0000000 --- a/models/multimodal/characterai/pitfalls.md +++ /dev/null @@ -1,16 +0,0 @@ -# Character.AI - Common Pitfalls & Issues - -*Last updated: 2025-08-11* - -## Technical Issues - -### ⚠️ Image not displaying bug - -### ⚠️ Bug that stops Character.AI from replying to messages - -### ⚠️ Keyboard closing bug on mobile devices - -## Cost & Limits - -### 💰 Character.AI has a 32,000 character limit when creating AI characters - diff --git a/models/multimodal/characterai/prompting.md b/models/multimodal/characterai/prompting.md deleted file mode 100644 index f9907bd..0000000 --- a/models/multimodal/characterai/prompting.md +++ /dev/null @@ -1,18 +0,0 @@ -# Character.AI Prompting Guide - 
-*Last updated: 2025-08-11* - -## Usage Tips - -- To avoid keyboard closing bug on mobile: Use a Bluetooth keyboard -- The filtering system includes multiple layers: pre-processing filter, context-aware content classification (BERT-based), response generation filter, and user monitoring - -## Recommended Settings - -- Goro setting in experimental section of Style tab -- Limited chat style for users over 18 - -## Sources - -- Reddit community discussions -- User-reported experiences diff --git a/models/text/characterai/metadata.json b/models/text/characterai/metadata.json new file mode 100644 index 0000000..810d0fe --- /dev/null +++ b/models/text/characterai/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "Character.AI", + "category": "text", + "last_updated": "2025-08-12T20:04:44.876236", + "extraction_timestamp": "2025-08-12T20:04:40.376447", + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 100, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/text/characterai/parameters.json b/models/text/characterai/parameters.json new file mode 100644 index 0000000..53924aa --- /dev/null +++ b/models/text/characterai/parameters.json @@ -0,0 +1,14 @@ +{ + "service": "Character.AI", + "last_updated": "2025-08-12T20:04:44.782084", + "recommended_settings": { + "setting_0": { + "description": "Enable 'Goro' in the experimental settings under the Style tab" + } + }, + "cost_optimization": {}, + "sources": [ + "Reddit community", + "User reports" + ] +} \ No newline at end of file diff --git a/models/text/characterai/pitfalls.md b/models/text/characterai/pitfalls.md new file mode 100644 index 0000000..e0b44c1 --- /dev/null +++ b/models/text/characterai/pitfalls.md @@ -0,0 +1,18 @@ +# Character.AI - Common Pitfalls & Issues + +*Last updated: 2025-08-12* + +## Technical Issues + +### ⚠️ image bug where character images do not load + +### ⚠️ keyboard bug where the on-screen keyboard closes unexpectedly 
+ +### ⚠️ bug that stops replying to your messages + +### ⚠️ disappearing characters bug where characters are unavailable for some users but available for others + +## Cost & Limits + +### 💰 Character creation character limit of 32,000 characters per AI + diff --git a/models/text/characterai/prompting.md b/models/text/characterai/prompting.md new file mode 100644 index 0000000..59e4012 --- /dev/null +++ b/models/text/characterai/prompting.md @@ -0,0 +1,16 @@ +# Character.AI Prompting Guide + +*Last updated: 2025-08-12* + +## Technical Tips + +- Use Janitor AI to jailbreak Character AI and access the NSFW version via prompts + +## Recommended Settings + +- Enable 'Goro' in the experimental settings under the Style tab + +## Sources + +- Reddit community discussions +- User-reported experiences diff --git a/models/text/fliki/cost_optimization.md b/models/text/fliki/cost_optimization.md new file mode 100644 index 0000000..0594357 --- /dev/null +++ b/models/text/fliki/cost_optimization.md @@ -0,0 +1,8 @@ +# Fliki - Cost Optimization Guide + +*Last updated: 2025-08-12* + +## Money-Saving Tips + +- 25% off premium plans for a whole year + diff --git a/models/text/fliki/metadata.json b/models/text/fliki/metadata.json new file mode 100644 index 0000000..3313b01 --- /dev/null +++ b/models/text/fliki/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "Fliki", + "category": "text", + "last_updated": "2025-08-12T20:07:02.281408", + "extraction_timestamp": null, + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 23, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/text/fliki/parameters.json b/models/text/fliki/parameters.json new file mode 100644 index 0000000..55d84e4 --- /dev/null +++ b/models/text/fliki/parameters.json @@ -0,0 +1,12 @@ +{ + "service": "Fliki", + "last_updated": "2025-08-12T20:07:02.186904", + "recommended_settings": {}, + "cost_optimization": { + "tip_0": "25% off premium plans 
for a whole year" + }, + "sources": [ + "Reddit community", + "User reports" + ] +} \ No newline at end of file diff --git a/models/text/fliki/prompting.md b/models/text/fliki/prompting.md new file mode 100644 index 0000000..36aab56 --- /dev/null +++ b/models/text/fliki/prompting.md @@ -0,0 +1,12 @@ +# Fliki Prompting Guide + +*Last updated: 2025-08-12* + +## Usage Tips + +- Use the Fliki discount page (https://fliki.ai/pricing?via=subreddits) which includes the coupon, sign up with a new email, choose a plan and period, then proceed with subscription to activate a whole year at 25% discount. + +## Sources + +- Reddit community discussions +- User-reported experiences diff --git a/models/text/ideogram/cost_optimization.md b/models/text/ideogram/cost_optimization.md new file mode 100644 index 0000000..a4d986f --- /dev/null +++ b/models/text/ideogram/cost_optimization.md @@ -0,0 +1,8 @@ +# Ideogram - Cost Optimization Guide + +*Last updated: 2025-08-12* + +## Money-Saving Tips + +- 216 credits per day available for paid users using 3‑credit generations. 
+ diff --git a/models/text/ideogram/metadata.json b/models/text/ideogram/metadata.json new file mode 100644 index 0000000..ae3a3fa --- /dev/null +++ b/models/text/ideogram/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "Ideogram", + "category": "text", + "last_updated": "2025-08-12T20:05:32.879741", + "extraction_timestamp": "2025-08-12T20:05:25.727949", + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 49, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/text/ideogram/parameters.json b/models/text/ideogram/parameters.json new file mode 100644 index 0000000..4def834 --- /dev/null +++ b/models/text/ideogram/parameters.json @@ -0,0 +1,12 @@ +{ + "service": "Ideogram", + "last_updated": "2025-08-12T20:05:32.795932", + "recommended_settings": {}, + "cost_optimization": { + "tip_0": "216 credits per day available for paid users using 3\u2011credit generations." + }, + "sources": [ + "Reddit community", + "User reports" + ] +} \ No newline at end of file diff --git a/models/text/ideogram/pitfalls.md b/models/text/ideogram/pitfalls.md new file mode 100644 index 0000000..873e7f5 --- /dev/null +++ b/models/text/ideogram/pitfalls.md @@ -0,0 +1,8 @@ +# Ideogram - Common Pitfalls & Issues + +*Last updated: 2025-08-12* + +## Technical Issues + +### ⚠️ Leader/another role cannot be drafted due to a bug involving ideoligion objects such as shrines and ideograms. + diff --git a/models/text/ideogram/prompting.md b/models/text/ideogram/prompting.md new file mode 100644 index 0000000..95bac25 --- /dev/null +++ b/models/text/ideogram/prompting.md @@ -0,0 +1,13 @@ +# Ideogram Prompting Guide + +*Last updated: 2025-08-12* + +## Usage Tips + +- In dev mods, use the destroy tool to delete any object directly related to your ideoligion (shrines, ideograms, etc.), then randomize symbols and regenerate all buildings to resolve the drafting issue. 
+- Use 3‑credit generations to maximize throughput: 72 batches per day, totaling 216 credits per day. + +## Sources + +- Reddit community discussions +- User-reported experiences diff --git a/models/text/playht/cost_optimization.md b/models/text/playht/cost_optimization.md new file mode 100644 index 0000000..3c4303c --- /dev/null +++ b/models/text/playht/cost_optimization.md @@ -0,0 +1,8 @@ +# Play.ht - Cost Optimization Guide + +*Last updated: 2025-08-12* + +## Pricing Information + +- $100 per month + diff --git a/models/text/playht/metadata.json b/models/text/playht/metadata.json new file mode 100644 index 0000000..9b35b00 --- /dev/null +++ b/models/text/playht/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "Play.ht", + "category": "text", + "last_updated": "2025-08-12T20:09:36.214880", + "extraction_timestamp": "2025-08-12T20:09:36.064867", + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 28, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/text/playht/parameters.json b/models/text/playht/parameters.json new file mode 100644 index 0000000..ba866c0 --- /dev/null +++ b/models/text/playht/parameters.json @@ -0,0 +1,12 @@ +{ + "service": "Play.ht", + "last_updated": "2025-08-12T20:09:36.149442", + "recommended_settings": {}, + "cost_optimization": { + "pricing": "$100 per month" + }, + "sources": [ + "Reddit community", + "User reports" + ] +} \ No newline at end of file diff --git a/models/text/resemble-ai/cost_optimization.md b/models/text/resemble-ai/cost_optimization.md new file mode 100644 index 0000000..40bf4a8 --- /dev/null +++ b/models/text/resemble-ai/cost_optimization.md @@ -0,0 +1,8 @@ +# Resemble AI - Cost Optimization Guide + +*Last updated: 2025-08-12* + +## Pricing Information + +- basic package starts at $0.006 per second + diff --git a/models/text/resemble-ai/metadata.json b/models/text/resemble-ai/metadata.json new file mode 100644 index 0000000..96c6987 --- 
/dev/null +++ b/models/text/resemble-ai/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "Resemble AI", + "category": "text", + "last_updated": "2025-08-12T20:09:01.048329", + "extraction_timestamp": "2025-08-12T20:09:00.838795", + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 15, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/text/resemble-ai/parameters.json b/models/text/resemble-ai/parameters.json new file mode 100644 index 0000000..92c7bbc --- /dev/null +++ b/models/text/resemble-ai/parameters.json @@ -0,0 +1,12 @@ +{ + "service": "Resemble AI", + "last_updated": "2025-08-12T20:09:00.954576", + "recommended_settings": {}, + "cost_optimization": { + "pricing": "basic package starts at $0.006 per second" + }, + "sources": [ + "Reddit community", + "User reports" + ] +} \ No newline at end of file diff --git a/models/video/heygen/metadata.json b/models/video/heygen/metadata.json new file mode 100644 index 0000000..38c02e5 --- /dev/null +++ b/models/video/heygen/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "HeyGen", + "category": "video", + "last_updated": "2025-08-12T20:08:18.502912", + "extraction_timestamp": null, + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 48, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/video/heygen/prompting.md b/models/video/heygen/prompting.md new file mode 100644 index 0000000..3f8fe9d --- /dev/null +++ b/models/video/heygen/prompting.md @@ -0,0 +1,18 @@ +# HeyGen Prompting Guide + +*Last updated: 2025-08-12* + +## Technical Tips + +- To create a custom avatar with HeyGen, first record about 30 minutes of clean, high‑quality audio and use ElevenLabs’ Professional Voice Clone to train a custom voice model. +- After the avatar is generated, start a new video project in HeyGen and connect the custom voice model from ElevenLabs to the avatar. 
+ +## Usage Tips + +- Upload a clear, 2‑minute video of yourself to HeyGen’s avatar creation page. +- On HeyGen, click the "Create New Avatar" button, select the "Hyper‑Realistic" option, and upload the 2‑minute video to generate the avatar. + +## Sources + +- Reddit community discussions +- User-reported experiences diff --git a/models/video/pika/metadata.json b/models/video/pika/metadata.json new file mode 100644 index 0000000..d4f4cf2 --- /dev/null +++ b/models/video/pika/metadata.json @@ -0,0 +1,13 @@ +{ + "service": "Pika", + "category": "video", + "last_updated": "2025-08-12T20:07:40.706852", + "extraction_timestamp": "2025-08-12T20:07:30.860248", + "data_sources": [ + "Reddit API", + "Community discussions" + ], + "posts_analyzed": 86, + "confidence": "medium", + "version": "1.0.0" +} \ No newline at end of file diff --git a/models/video/pika/pitfalls.md b/models/video/pika/pitfalls.md new file mode 100644 index 0000000..e824d4e --- /dev/null +++ b/models/video/pika/pitfalls.md @@ -0,0 +1,8 @@ +# Pika - Common Pitfalls & Issues + +*Last updated: 2025-08-12* + +## Technical Issues + +### ⚠️ Blank window bug when using Pika backup on Intel HD 4000 GPU + diff --git a/models/video/pika/prompting.md b/models/video/pika/prompting.md new file mode 100644 index 0000000..1f51089 --- /dev/null +++ b/models/video/pika/prompting.md @@ -0,0 +1,10 @@ +# Pika Prompting Guide + +*Last updated: 2025-08-12* + +*No specific prompting tips available yet. 
Check back for updates.* + +## Sources + +- Reddit community discussions +- User-reported experiences diff --git a/models/video/runway/cost_optimization.md b/models/video/runway/cost_optimization.md index 5d9720d..147fe00 100644 --- a/models/video/runway/cost_optimization.md +++ b/models/video/runway/cost_optimization.md @@ -1,9 +1,12 @@ # Runway - Cost Optimization Guide -*Last updated: 2025-08-11* +*Last updated: 2025-08-12* ## Pricing Information -- Gen-4 References costs $0.08 (8 credits) per image generation -- Unlimited plan costs $95/month +- $0.08 (8 credits) per image generation + +## Money-Saving Tips + +- Daily image generation limit is 3 per day. diff --git a/models/video/runway/metadata.json b/models/video/runway/metadata.json index cde6d0f..2bc79dc 100644 --- a/models/video/runway/metadata.json +++ b/models/video/runway/metadata.json @@ -1,13 +1,13 @@ { "service": "Runway", "category": "video", - "last_updated": "2025-08-11T23:04:57.379218", - "extraction_timestamp": "2025-08-11T23:04:54.909165", + "last_updated": "2025-08-12T20:10:15.923183", + "extraction_timestamp": null, "data_sources": [ "Reddit API", "Community discussions" ], - "posts_analyzed": 30, + "posts_analyzed": 100, "confidence": "medium", "version": "1.0.0" } \ No newline at end of file diff --git a/models/video/runway/parameters.json b/models/video/runway/parameters.json index 696ae6e..9d887c9 100644 --- a/models/video/runway/parameters.json +++ b/models/video/runway/parameters.json @@ -1,13 +1,14 @@ { "service": "Runway", - "last_updated": "2025-08-11T23:04:57.371483", + "last_updated": "2025-08-12T20:10:15.827498", "recommended_settings": { "setting_0": { - "description": "For Gen-4 References, use up to 3 reference images per request" + "description": "Maximum of 3 reference images per request" } }, "cost_optimization": { - "pricing": "Unlimited plan costs $95/month" + "tip_0": "Daily image generation limit is 3 per day.", + "pricing": "$0.08 (8 credits) per image generation" }, 
"sources": [ "Reddit community", diff --git a/models/video/runway/pitfalls.md b/models/video/runway/pitfalls.md index 90a2902..c29c048 100644 --- a/models/video/runway/pitfalls.md +++ b/models/video/runway/pitfalls.md @@ -1,13 +1,8 @@ # Runway - Common Pitfalls & Issues -*Last updated: 2025-08-11* - -## Policy & Account Issues - -### ⚠️ Runway throttles Unlimited [$95/month] accounts -**Note**: Be aware of terms of service regarding account creation. +*Last updated: 2025-08-12* ## Cost & Limits -### 💰 Unlimited plan costs $95/month +### 💰 Daily image generation limit is 3 per day. diff --git a/models/video/runway/prompting.md b/models/video/runway/prompting.md index 3acca8f..4f3c323 100644 --- a/models/video/runway/prompting.md +++ b/models/video/runway/prompting.md @@ -1,18 +1,14 @@ # Runway Prompting Guide -*Last updated: 2025-08-11* +*Last updated: 2025-08-12* ## Usage Tips -- Use Gen-4 References in the API for image generation -- Use the API at https://useapi.net/docs/api-runwayml-v1 -- Use Python SDK v3.1 for Runway API integration -- Use up to 3 reference images per request for better results -- Use automation workaround available at https://useapi.net/docs/articles/runway-bash +- Use Python SDK v3.1 from https://github.com/runwayml/sdk-python ## Recommended Settings -- For Gen-4 References, use up to 3 reference images per request +- Maximum of 3 reference images per request ## Sources diff --git a/src/cli.py b/src/cli.py index 01edbc2..1969ea4 100644 --- a/src/cli.py +++ b/src/cli.py @@ -748,6 +748,113 @@ def update_status(): console.print(f"\n[dim]Tip: Run 'scapo scrape batch --max-services {min(3, len(status['stale_services']))}' to update stale services[/dim]") +@scrape.command(name="all") +@click.option('-l', '--limit', default=20, help='Max posts per search (default: 20)') +@click.option('-c', '--category', help='Filter by category (video, audio, code, etc)') +@click.option('-p', '--priority', + type=click.Choice(['ultra', 'critical', 'high', 'all']), 
+ default='ultra', + help='Service priority level') +@click.option('--dry-run', is_flag=True, help='Show what would be processed without running') +@click.option('--delay', default=5, help='Delay in seconds between services (default: 5)') +def scrape_all(limit: int, category: str, priority: str, dry_run: bool, delay: int): + """Process all priority services one by one.""" + show_banner() + + from src.scrapers.targeted_search_generator import TargetedSearchGenerator + from src.scrapers.intelligent_browser_scraper import IntelligentBrowserScraper + from src.services.batch_llm_processor import BatchLLMProcessor + from src.services.llm_processor import LLMProcessorFactory + from src.services.model_entry_generator import ModelEntryGenerator + from src.services.service_alias_manager import ServiceAliasManager + import asyncio + import time + from pathlib import Path + import json + + # Initialize components + generator = TargetedSearchGenerator() + alias_manager = ServiceAliasManager() + + # Get all priority services + priority_services = list(generator.priority_services) + + # Filter discovered services that match priority services + filtered_services = {} + for service_key, service_data in generator.services.items(): + display_name = service_data['display_name'].lower() + if any(priority in display_name for priority in priority_services): + # Apply category filter if specified + if category and service_data['category'] != category: + continue + # Mark with ultra priority for our priority services + service_data['priority'] = 'ultra' + filtered_services[service_key] = service_data + + # Apply priority filter + if priority != 'all': + filtered_services = {k: v for k, v in filtered_services.items() + if v.get('priority') == priority} + + services_to_process = list(filtered_services.values()) + + console.print(f"[cyan]Found {len(services_to_process)} services to process[/cyan]") + + if dry_run: + console.print("\n[yellow]DRY RUN - Services that would be 
processed:[/yellow]") + for i, service in enumerate(services_to_process, 1): + console.print(f" {i}. {service['display_name']} ({service['category']})") + console.print(f"\n[dim]Total: {len(services_to_process)} services × 5 queries × {limit} posts = {len(services_to_process) * 5 * limit} posts[/dim]") + return + + if not Confirm.ask(f"\n[yellow]Process {len(services_to_process)} services individually?[/yellow]", default=True): + console.print("[red]Cancelled[/red]") + return + + # Process each service one by one + successful = 0 + failed = 0 + + for i, service_data in enumerate(services_to_process, 1): + service_name = service_data['display_name'] + + console.print(f"\n[cyan][{i}/{len(services_to_process)}] Processing {service_name}...[/cyan]") + + try: + # Run targeted scraper for this service + from subprocess import run, PIPE + result = run( + ['uv', 'run', 'scapo', 'scrape', 'targeted', + '--service', service_name, + '--limit', str(limit), + '--max-queries', '5'], + capture_output=True, + text=True + ) + + if result.returncode == 0: + successful += 1 + console.print(f" ✅ {service_name} completed") + else: + failed += 1 + console.print(f" ❌ {service_name} failed: {result.stderr[:100]}") + + except Exception as e: + failed += 1 + console.print(f" ❌ {service_name} error: {str(e)[:100]}") + + # Delay between services (except for the last one) + if i < len(services_to_process): + console.print(f" [dim]Waiting {delay} seconds before next service...[/dim]") + time.sleep(delay) + + # Summary + console.print(f"\n[green]✨ Processing complete![/green]") + console.print(f" Successful: {successful}") + console.print(f" Failed: {failed}") + console.print(f" Total: {len(services_to_process)}") + + @scrape.command(name="status") def scrape_status(): """Show detailed scraper status with visual elements.""" diff --git a/src/services/batch_llm_processor.py b/src/services/batch_llm_processor.py index 4904698..faa602d 100644 --- a/src/services/batch_llm_processor.py +++ 
b/src/services/batch_llm_processor.py @@ -56,10 +56,16 @@ def __init__(self, model_name: str = "gpt-3.5-turbo"): def _get_dynamic_context_limit(self, model_name: str) -> Optional[int]: """Try to get context limit from OpenRouter API""" try: - # Only try if we have an API key - if os.getenv("OPENROUTER_API_KEY"): + # Try to get API key from settings or environment + from src.core.config import Settings + settings = Settings() + api_key = settings.openrouter_api_key or os.getenv("OPENROUTER_API_KEY") + + if api_key: from src.services.openrouter_context import OpenRouterContextManager - manager = OpenRouterContextManager() + manager = OpenRouterContextManager(api_key=api_key) + # Load from cache first + manager.load_cache() context = manager.get_context_length(model_name) if context: logger.info(f"Got context limit from OpenRouter: {context}") From 64ce26614f2d5f0717dd6f03aa2003eafa0387a9 Mon Sep 17 00:00:00 2001 From: arahangua Date: Tue, 12 Aug 2025 21:28:44 +0900 Subject: [PATCH 4/7] fixed: token context handling, updated readme, quickstart --- .env.example | 6 +- QUICKSTART.md | 22 ++-- README.md | 15 ++- .../code/github-copilot/cost_optimization.md | 8 +- models/code/github-copilot/metadata.json | 6 +- models/code/github-copilot/parameters.json | 16 +-- src/cli.py | 7 +- src/core/config.py | 6 +- src/scrapers/base.py | 1 - src/scrapers/intelligent_browser_scraper.py | 4 +- src/services/adaptive_processor.py | 34 ++--- src/services/batch_llm_processor.py | 53 ++------ src/services/llm_processor.py | 122 ++++++++++++++---- src/services/scraper_service.py | 5 - 14 files changed, 168 insertions(+), 137 deletions(-) diff --git a/.env.example b/.env.example index b89fc3e..e340946 100644 --- a/.env.example +++ b/.env.example @@ -11,9 +11,9 @@ LOCAL_LLM_URL=http://localhost:11434 # Ollama: http://localhost:11434, LM Studi LOCAL_LLM_MODEL=llama3 # Model name for Ollama (ignored by LM Studio) LOCAL_LLM_TYPE=ollama # Options: ollama, lmstudio -# LLM Character Limits 
-LLM_MAX_CHARS=4000 # Maximum characters to send to LLM (user-friendly limit) -LLM_CHAR_HARD_LIMIT=50000 # Absolute safety limit to prevent excessive API costs +# Local LLM Context Configuration (Important for performance!) +LOCAL_LLM_MAX_CONTEXT=4096 # Maximum context tokens for your local model (e.g., 4096, 8192, 32768) +LOCAL_LLM_OPTIMAL_CHUNK=1024 # Optimal chunk size for batching (typically 1/4 of max context) # Quality Filtering LLM_QUALITY_THRESHOLD=0.6 # Minimum quality score for practices (0.0-1.0, higher = stricter) diff --git a/QUICKSTART.md b/QUICKSTART.md index 4b4ae2b..57f1970 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -32,6 +32,9 @@ LLM_PROVIDER=local LOCAL_LLM_TYPE=ollama LOCAL_LLM_URL=http://localhost:11434 LOCAL_LLM_MODEL=model_alias +# Important: Set your model's context size! +LOCAL_LLM_MAX_CONTEXT=8192 # e.g., 4096, 8192, 32768 +LOCAL_LLM_OPTIMAL_CHUNK=2048 # Typically 1/4 of max ``` #### Option C: LM Studio (Local) 1. Install [LM Studio](https://lmstudio.ai/) @@ -42,6 +45,9 @@ LOCAL_LLM_MODEL=model_alias LLM_PROVIDER=local LOCAL_LLM_TYPE=lmstudio LOCAL_LLM_URL=http://localhost:1234 +# Important: Set your model's context size! +LOCAL_LLM_MAX_CONTEXT=8192 # Check your model's specs +LOCAL_LLM_OPTIMAL_CHUNK=2048 # Typically 1/4 of max ``` ### 3. 
Choose Your Approach @@ -115,19 +121,19 @@ models/ │ └── parameters.json # Recommended settings ``` -## ⚙️ Optimization Tips +## ⚙️ The --limit flag -### For Better Extraction: ```bash -# More posts = better tips (15-20 minimum) -scapo scrape targeted --service "HeyGen" --limit 20 +# ❌ Too few posts = no useful tips found +scapo scrape targeted --service "HeyGen" --limit 5 # ~20% success rate -# Multiple search types -scapo scrape targeted --service "Midjourney" --max-queries 10 +# ✅ Sweet spot = reliable extraction +scapo scrape targeted --service "HeyGen" --limit 20 # ~80% success rate -# Process similar services together -scapo scrape batch --category audio --limit 15 +# 🎯 Maximum insights = comprehensive coverage +scapo scrape targeted --service "HeyGen" --limit 30 # Finds rare edge cases ``` +**Why it matters:** LLMs need multiple examples to identify patterns. More posts = higher chance of finding specific pricing, bugs, and workarounds. ### Adjust Quality Threshold: ```bash diff --git a/README.md b/README.md index 469fa71..499df72 100644 --- a/README.md +++ b/README.md @@ -210,6 +210,10 @@ LLM_PROVIDER=openrouter OPENROUTER_API_KEY=your_key OPENROUTER_MODEL=your_favorite_model +# Local LLM Context (Important for Ollama/LM Studio!) 
+LOCAL_LLM_MAX_CONTEXT=8192 # Your model's context size in tokens +LOCAL_LLM_OPTIMAL_CHUNK=2048 # Optimal batch size (typically 1/4 of max) + # Extraction Quality LLM_QUALITY_THRESHOLD=0.6 # Min quality (0.0-1.0) @@ -218,10 +222,13 @@ SCRAPING_DELAY_SECONDS=2 # Be respectful MAX_POSTS_PER_SCRAPE=100 # Limit per source ``` -### Recommended Settings for Quality Extraction -- **Posts per query**: 15-20 minimum (more posts = better tips) -- **Queries per service**: 5-10 different search types -- **Batch size**: 3 services at a time for focused extraction +### Why --limit Matters (More Posts = Better Tips) ```bash +--limit 5 # ❌ Often finds nothing (too few samples) +--limit 15 # ✅ Good baseline (finds common issues) +--limit 25 # 🎯 Optimal (uncovers hidden gems & edge cases) +``` +So, a hand-wavy breakdown: with 5 posts, extraction success is ~20%; with 20+ posts, it jumps to ~80%. ## 🎨 Interactive TUI diff --git a/models/code/github-copilot/cost_optimization.md b/models/code/github-copilot/cost_optimization.md index 6c882d4..8301f6c 100644 --- a/models/code/github-copilot/cost_optimization.md +++ b/models/code/github-copilot/cost_optimization.md @@ -2,13 +2,7 @@ *Last updated: 2025-08-12* -## Pricing Information - -- $10/month for the standard Copilot plan - ## Money-Saving Tips -- Copilot Pro: 300 premium requests/month (before Pro+ upgrade) -- Free tier: 2,000 completions/month, 50 chat prompts/month -- Paid tier: unlimited completions and chat prompts +- €8.4 per month for GPT-4 to Copilot and text chatbox diff --git a/models/code/github-copilot/metadata.json b/models/code/github-copilot/metadata.json index 35e3889..9d2d659 100644 --- a/models/code/github-copilot/metadata.json +++ b/models/code/github-copilot/metadata.json @@ -1,13 +1,13 @@ { "service": "GitHub Copilot", "category": "code", - "last_updated": "2025-08-12T20:04:22.865982", - "extraction_timestamp": "2025-08-12T20:04:20.280914", + "last_updated": "2025-08-12T21:24:27.281886", + 
"extraction_timestamp": "2025-08-12T21:24:27.096753", "data_sources": [ "Reddit API", "Community discussions" ], - "posts_analyzed": 82, + "posts_analyzed": 2, "confidence": "medium", "version": "1.0.0" } \ No newline at end of file diff --git a/models/code/github-copilot/parameters.json b/models/code/github-copilot/parameters.json index 313405f..5766d0d 100644 --- a/models/code/github-copilot/parameters.json +++ b/models/code/github-copilot/parameters.json @@ -1,19 +1,9 @@ { "service": "GitHub Copilot", - "last_updated": "2025-08-12T20:04:22.768265", - "recommended_settings": { - "setting_0": { - "description": "github.copilot.chat.agent.autoFix" - }, - "setting_1": { - "description": "chat.tools.autoApprove" - } - }, + "last_updated": "2025-08-12T21:24:27.207908", + "recommended_settings": {}, "cost_optimization": { - "pricing": "$10/month for the standard Copilot plan", - "tip_1": "Copilot Pro: 300 premium requests/month (before Pro+ upgrade)", - "tip_2": "Free tier: 2,000 completions/month, 50 chat prompts/month", - "unlimited_option": "Paid tier: unlimited completions and chat prompts" + "tip_0": "\u20ac8.4 per month for GPT-4 to Copilot and text chatbox" }, "sources": [ "Reddit community", diff --git a/src/cli.py b/src/cli.py index 1969ea4..e9ef696 100644 --- a/src/cli.py +++ b/src/cli.py @@ -170,9 +170,8 @@ def scrape(): @click.option("--sources", "-s", multiple=True, help="Sources to scrape (e.g., reddit:LocalLLaMA)") @click.option("--limit", "-l", default=10, help="Maximum posts per source") -@click.option("--llm-max-chars", "-c", type=int, help="Max characters for LLM processing") @click.option("--interactive", "-i", is_flag=True, help="Interactive source selection") -def run_scraper(sources, limit, llm_max_chars, interactive): +def run_scraper(sources, limit, interactive): """Run intelligent scraper with enhanced UI.""" show_banner() @@ -223,8 +222,7 @@ async def _run(): console.print(Panel( f"[bold]Scraping Plan[/bold]\n\n" 
f"Sources:\n{source_text}\n\n" - f"Post limit: [cyan]{limit}[/cyan] per source\n" - f"LLM processing: [cyan]{'Limited' if llm_max_chars else 'Full'}[/cyan]", + f"Post limit: [cyan]{limit}[/cyan] per source", border_style="blue" )) @@ -251,7 +249,6 @@ async def _run(): result = await service.run_scrapers( sources=sources_list, max_posts_per_source=limit, - llm_max_chars=llm_max_chars, ) progress.update(task, completed=100) diff --git a/src/core/config.py b/src/core/config.py index e644e75..dbfb70c 100644 --- a/src/core/config.py +++ b/src/core/config.py @@ -49,9 +49,11 @@ class Settings(BaseSettings): local_llm_model: str = Field(default="llama3", description="Local LLM model") local_llm_type: str = Field(default="ollama", description="Local LLM type: ollama, lmstudio") llm_processing_enabled: bool = Field(default=True, description="Enable LLM processing of content") - llm_max_chars: int = Field(default=4000, description="Maximum characters to send to LLM (user-friendly)") - llm_char_hard_limit: int = Field(default=50000, description="Absolute maximum characters (safety limit)") llm_quality_threshold: float = Field(default=0.6, description="Minimum quality score for practices (0.0-1.0)") + + # Local LLM context configuration + local_llm_max_context: Optional[int] = Field(None, description="Maximum context tokens for local LLM (e.g., 4096, 8192, 32768)") + local_llm_optimal_chunk: Optional[int] = Field(None, description="Optimal chunk size for local LLM processing") @field_validator("models_dir", "scrapers_dir") @classmethod diff --git a/src/scrapers/base.py b/src/scrapers/base.py index 7bcb19c..270db4a 100644 --- a/src/scrapers/base.py +++ b/src/scrapers/base.py @@ -204,7 +204,6 @@ async def _enhance_with_llm( model=settings.local_llm_model, api_type=settings.local_llm_type, api_key=settings.openrouter_api_key, - max_chars=settings.llm_max_chars, ) enhanced_practices = initial_practices.copy() diff --git a/src/scrapers/intelligent_browser_scraper.py 
b/src/scrapers/intelligent_browser_scraper.py index 133be85..1a9b6f7 100644 --- a/src/scrapers/intelligent_browser_scraper.py +++ b/src/scrapers/intelligent_browser_scraper.py @@ -83,7 +83,7 @@ def extract_best_practices(self, posts: List[ScrapedPost]) -> Dict[str, Any]: """Required abstract method - handled by LLM processing instead.""" return {} - def _get_llm_processor(self, max_chars: Optional[int] = None): + def _get_llm_processor(self): """Get or create a cached LLM processor instance.""" if self._llm_processor is None: if settings.llm_provider == "openrouter": @@ -91,14 +91,12 @@ def _get_llm_processor(self, max_chars: Optional[int] = None): provider="openrouter", api_key=settings.openrouter_api_key, model=settings.openrouter_model, - max_chars=max_chars or settings.llm_max_chars ) else: self._llm_processor = LLMProcessorFactory.create_processor( provider="local", base_url=settings.local_llm_url, model=settings.local_llm_model, - max_chars=max_chars or settings.llm_max_chars ) return self._llm_processor diff --git a/src/services/adaptive_processor.py b/src/services/adaptive_processor.py index 5e4f694..da9ba5d 100644 --- a/src/services/adaptive_processor.py +++ b/src/services/adaptive_processor.py @@ -25,27 +25,21 @@ class LLMCapabilities: def detect_capabilities(cls, provider: str, model: str) -> 'LLMCapabilities': """Detect LLM capabilities based on provider and model.""" - # Local LLMs (limited context) + # Local LLMs - use environment variables if set if provider == "local": - if "llama" in model.lower(): - if "3" in model: - return cls(provider, model, 8192, False, 2000) - elif "2" in model: - return cls(provider, model, 4096, False, 1500) - else: - return cls(provider, model, 2048, False, 1000) - elif "mistral" in model.lower(): - return cls(provider, model, 8192, False, 2000) - elif "phi" in model.lower(): - return cls(provider, model, 4096, False, 1500) - elif "qwen" in model.lower(): - if "32k" in model.lower(): - return cls(provider, model, 32768, 
True, 8000) - else: - return cls(provider, model, 8192, True, 2000) - else: - # Conservative defaults for unknown local models - return cls(provider, model, 2048, False, 1000) + # Check for user-configured values + if settings.local_llm_max_context: + max_context = settings.local_llm_max_context + optimal_chunk = settings.local_llm_optimal_chunk or max_context // 4 + logger.info(f"Using user-configured local LLM context: {max_context} tokens, {optimal_chunk} chunk size") + return cls(provider, model, max_context, False, optimal_chunk) + + # Fallback to conservative defaults if not configured + logger.warning(f"No LOCAL_LLM_MAX_CONTEXT set for {model}. Using conservative defaults.") + logger.info("Set LOCAL_LLM_MAX_CONTEXT and LOCAL_LLM_OPTIMAL_CHUNK in .env for better performance") + + # Very conservative defaults for safety + return cls(provider, model, 2048, False, 500) # Cloud providers (larger context) elif provider == "openrouter": diff --git a/src/services/batch_llm_processor.py b/src/services/batch_llm_processor.py index faa602d..6a184df 100644 --- a/src/services/batch_llm_processor.py +++ b/src/services/batch_llm_processor.py @@ -16,21 +16,8 @@ class BatchLLMProcessor: """Processes multiple posts in a single LLM call with context window awareness""" - # Conservative token limits for different model families - CONTEXT_LIMITS = { - 'gpt-4': 8000, # 8k context (conservative for gpt-4) - 'gpt-4-32k': 30000, # 32k context - 'gpt-4-turbo': 120000, # 128k context - 'gpt-3.5-turbo': 15000, # 16k context - 'claude-3': 180000, # 200k context - 'claude-2': 90000, # 100k context - 'llama': 3500, # 4k context (conservative) - 'mistral': 30000, # 32k context - 'deepseek': 30000, # 32k context - 'glm': 120000, # GLM-4 models have 128k context - 'z-ai': 120000, # z-ai models typically have large context - 'default': 8000 # Safe default for cloud models - } + # Default context limit if we can't determine from API or env + DEFAULT_CONTEXT_LIMIT = 4096 # Conservative default 
# Reserved tokens for system prompt and response RESERVED_TOKENS = { @@ -43,10 +30,18 @@ def __init__(self, model_name: str = "gpt-3.5-turbo"): self.model_name = model_name self.encoder = self._get_encoder(model_name) - # Try to get context from OpenRouter first + # Try to get context from OpenRouter API or environment variables self.context_limit = self._get_dynamic_context_limit(model_name) if not self.context_limit: - self.context_limit = self._get_context_limit(model_name) + # For local models, check environment variable + from src.core.config import settings + if settings.llm_provider == "local" and settings.local_llm_max_context: + self.context_limit = settings.local_llm_max_context + logger.info(f"Using LOCAL_LLM_MAX_CONTEXT: {self.context_limit}") + else: + # Fall back to conservative default + self.context_limit = self.DEFAULT_CONTEXT_LIMIT + logger.warning(f"Using default context limit: {self.context_limit}. Set LOCAL_LLM_MAX_CONTEXT for better performance.") self.usable_tokens = self._calculate_usable_tokens() @@ -89,30 +84,6 @@ def _get_encoder(self, model_name: str): logger.warning(f"Could not get specific encoder for {model_name}: {e}") return tiktoken.get_encoding('cl100k_base') - def _get_context_limit(self, model_name: str) -> int: - """Get conservative context limit for the model""" - model_lower = model_name.lower() - - # Check for specific model patterns - for key in self.CONTEXT_LIMITS: - if key in model_lower: - return self.CONTEXT_LIMITS[key] - - # Check for context size indicators in model name - if '32k' in model_lower: - return 30000 - elif '16k' in model_lower: - return 15000 - elif '8k' in model_lower: - return 7500 - elif '4k' in model_lower: - return 3500 - elif '100k' in model_lower or '128k' in model_lower: - return 90000 - elif '200k' in model_lower: - return 180000 - - return self.CONTEXT_LIMITS['default'] def _calculate_usable_tokens(self) -> int: """Calculate tokens available for actual content""" diff --git 
a/src/services/llm_processor.py b/src/services/llm_processor.py index a3164d8..3ec75e7 100644 --- a/src/services/llm_processor.py +++ b/src/services/llm_processor.py @@ -8,6 +8,7 @@ from pydantic import BaseModel, Field import litellm from litellm import acompletion, RateLimitError, AuthenticationError +import tiktoken from src.core.logging import get_logger from src.core.config import settings @@ -36,9 +37,15 @@ class ProcessedPractice(BaseModel): class BaseLLMProcessor(ABC): """Base class for LLM processors using LiteLLM.""" - def __init__(self, max_chars: Optional[int] = None): - self.max_chars = max_chars or settings.llm_max_chars + # Default context limit if we can't determine from API or env + DEFAULT_CONTEXT_LIMIT = 4096 # Conservative default + + def __init__(self, model_name: str = None): + self.model_name = model_name or "gpt-3.5-turbo" self.logger = logger + self.encoder = self._get_encoder(self.model_name) + self.context_limit = self._get_context_limit() + self.usable_tokens = self._calculate_usable_tokens() @abstractmethod async def process_content(self, content: str, content_type: str) -> List[ProcessedPractice]: @@ -50,23 +57,93 @@ async def process_raw_prompt(self, prompt: str) -> str: """Process a raw prompt and return the response as a string.""" pass - def truncate_to_limit(self, text: str) -> Tuple[str, bool]: - """Truncate text to character limit. 
+ def _get_encoder(self, model_name: str): + """Get appropriate tokenizer for the model""" + try: + # Try to get model-specific encoder + if 'gpt-4' in model_name.lower(): + return tiktoken.encoding_for_model('gpt-4') + elif 'gpt-3.5' in model_name.lower(): + return tiktoken.encoding_for_model('gpt-3.5-turbo') + else: + # Default to cl100k_base for most modern models + return tiktoken.get_encoding('cl100k_base') + except Exception as e: + self.logger.warning(f"Could not get specific encoder for {model_name}: {e}") + return tiktoken.get_encoding('cl100k_base') + + def _get_context_limit(self) -> int: + """Get context limit for the model""" + # Try to get from OpenRouter API first (if available) + try: + from src.services.openrouter_context import OpenRouterContextManager + api_key = settings.openrouter_api_key + if api_key and settings.llm_provider == "openrouter": + manager = OpenRouterContextManager(api_key=api_key) + manager.load_cache() + context = manager.get_context_length(self.model_name) + if context: + self.logger.info(f"Got context limit from OpenRouter: {context}") + return context + except Exception as e: + self.logger.debug(f"Could not get context from OpenRouter: {e}") + + # For local models, check environment variable + if settings.llm_provider == "local" and settings.local_llm_max_context: + self.logger.info(f"Using LOCAL_LLM_MAX_CONTEXT: {settings.local_llm_max_context}") + return settings.local_llm_max_context + + # Fall back to conservative default + self.logger.warning(f"Using default context limit: {self.DEFAULT_CONTEXT_LIMIT}") + return self.DEFAULT_CONTEXT_LIMIT + + def _calculate_usable_tokens(self) -> int: + """Calculate tokens available for actual content""" + # Reserve tokens for system prompt and response + reserved_tokens = 1500 # Conservative reservation + usable = self.context_limit - reserved_tokens + + if usable < 1000: + self.logger.warning(f"Very limited context space: {usable} tokens") + return max(500, usable) # Minimum 500 
tokens for content + + return usable + + def count_tokens(self, text: str) -> int: + """Count tokens in text""" + try: + return len(self.encoder.encode(text)) + except Exception as e: + self.logger.warning(f"Token counting failed, using approximation: {e}") + # Fallback: approximate 1 token per 4 characters + return len(text) // 4 + + def truncate_to_token_limit(self, text: str) -> Tuple[str, bool]: + """Truncate text to fit within token limit. Returns: Tuple of (truncated_text, was_truncated) """ - # Apply hard limit first - if len(text) > settings.llm_char_hard_limit: - text = text[:settings.llm_char_hard_limit] - self.logger.warning(f"Applied hard limit of {settings.llm_char_hard_limit} chars") + token_count = self.count_tokens(text) - # Apply user-specified limit - if len(text) > self.max_chars: - self.logger.info(f"Truncating content from {len(text)} to {self.max_chars} chars") - return text[:self.max_chars] + "\n\n[Content truncated...]", True + if token_count <= self.usable_tokens: + return text, False - return text, False + # Binary search to find the right truncation point + left, right = 0, len(text) + best_fit = "" + + while left < right: + mid = (left + right) // 2 + truncated = text[:mid] + if self.count_tokens(truncated) <= self.usable_tokens: + best_fit = truncated + left = mid + 1 + else: + right = mid + + self.logger.info(f"Truncated content from {token_count} to {self.count_tokens(best_fit)} tokens") + return best_fit + "\n\n[Content truncated...]", True def create_extraction_prompt(self, content: str, content_type: str) -> str: """Create a prompt for extracting practices from content.""" @@ -107,13 +184,14 @@ def __init__(self, provider: str = None, model: str = None, api_key: str = None, - base_url: str = None, - max_chars: Optional[int] = None): - super().__init__(max_chars) - - # Set up provider-specific configuration + base_url: str = None): + # Set up provider-specific configuration first self.provider = provider or settings.llm_provider - 
self.model_name = model or (settings.openrouter_model if self.provider == "openrouter" else settings.local_llm_model) + model_name = model or (settings.openrouter_model if self.provider == "openrouter" else settings.local_llm_model) + super().__init__(model_name) + + # Store the full model name for litellm + self.model_name = model_name # Initialize smart processors self.capabilities = LLMCapabilities.detect_capabilities(self.provider, self.model_name) @@ -222,10 +300,10 @@ async def process_content(self, content: str, content_type: str) -> List[Process except Exception as e: self.logger.error(f"Smart processing failed, falling back to basic truncation: {e}") - # Fallback to basic truncation - content, was_truncated = self.truncate_to_limit(content) + # Fallback to token-based truncation + content, was_truncated = self.truncate_to_token_limit(content) if was_truncated: - self.logger.info(f"Content truncated to {self.max_chars} characters for {content_type}") + self.logger.info(f"Content truncated to fit token limit for {content_type}") prompt = self.create_extraction_prompt(content, content_type) practices = await self._extract_practices_from_chunk(prompt, content_type) diff --git a/src/services/scraper_service.py b/src/services/scraper_service.py index 30bafb9..087e38e 100644 --- a/src/services/scraper_service.py +++ b/src/services/scraper_service.py @@ -62,7 +62,6 @@ async def run_scrapers( self, sources: List[str] = None, max_posts_per_source: int = 10, - llm_max_chars: Optional[int] = None, ) -> Dict[str, Any]: """Run intelligent scraper on specified sources.""" @@ -70,10 +69,6 @@ async def run_scrapers( if sources is None: sources = self._get_default_sources() - # Override LLM max chars if provided - if llm_max_chars is not None: - settings.llm_max_chars = llm_max_chars - logger.info(f"Using custom LLM character limit: {llm_max_chars}") status = self.scraper_status["intelligent"] From 4d4233f60d000002b664f0dbf738ddd23fa365fd Mon Sep 17 00:00:00 2001 From: 
arahangua Date: Wed, 13 Aug 2025 16:55:38 +0900 Subject: [PATCH 5/7] updated: readme, .github contents for repo management --- .github/CODEOWNERS | 24 ++++++++ .github/ISSUE_TEMPLATE/feature_request.yaml | 61 +++++++++++++++++++++ .github/dependabot.yml | 45 +++++++++++++++ .github/release.yml | 34 ++++++++++++ .github/topics.txt | 39 +++++++++++++ CITATION.cff | 28 ++++++++++ CODE_OF_CONDUCT.md | 58 ++++++++++++++++++++ README.md | 14 +++-- 8 files changed, 299 insertions(+), 4 deletions(-) create mode 100644 .github/CODEOWNERS create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yaml create mode 100644 .github/dependabot.yml create mode 100644 .github/release.yml create mode 100644 .github/topics.txt create mode 100644 CITATION.cff create mode 100644 CODE_OF_CONDUCT.md diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..8cadd0b --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,24 @@ +# CODEOWNERS file helps GitHub route pull requests to the right reviewers +# This also shows up in GitHub's interface showing who maintains what + +# Global owners +* @czero-cc + +# Documentation +*.md @czero-cc +/docs/ @czero-cc + +# Core functionality +/src/core/ @czero-cc +/src/scrapers/ @czero-cc +/src/services/ @czero-cc + +# Models and data +/models/ @czero-cc +/data/ @czero-cc + +# MCP Server +/mcp/ @czero-cc + +# CI/CD +/.github/ @czero-cc \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml new file mode 100644 index 0000000..993ed37 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -0,0 +1,61 @@ +name: "✨ Feature Request" +description: Suggest an idea or improvement for SCAPO +title: "[Feature]: " +labels: ["enhancement", "needs-triage"] +body: + - type: markdown + attributes: + value: | + Thanks for helping make SCAPO better! 
🚀 + + - type: dropdown + id: feature-type + attributes: + label: Feature Type + description: What kind of feature are you requesting? + options: + - New AI service support + - Scraping improvement + - Data extraction enhancement + - UI/UX improvement + - API feature + - Documentation + - Other + validations: + required: true + + - type: textarea + id: problem + attributes: + label: Problem Statement + description: What problem does this feature solve? + placeholder: | + I'm always frustrated when... + It would be helpful if... + validations: + required: true + + - type: textarea + id: solution + attributes: + label: Proposed Solution + description: How would you like to see this implemented? + placeholder: Describe your ideal solution + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives Considered + description: Have you considered any alternative solutions? + placeholder: Optional - any other approaches you've thought about + + - type: checkboxes + id: contribution + attributes: + label: Contribution + options: + - label: I'm willing to help implement this feature + - label: I can provide test cases or examples + - label: I can help with documentation \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..965bd2b --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,45 @@ +# Dependabot configuration for automatic dependency updates +# This keeps the project secure and up-to-date + +version: 2 +updates: + # Python dependencies + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + day: "monday" + open-pull-requests-limit: 5 + labels: + - "dependencies" + - "python" + commit-message: + prefix: "chore" + include: "scope" + + # npm dependencies for MCP server + - package-ecosystem: "npm" + directory: "/mcp" + schedule: + interval: "weekly" + day: "monday" + open-pull-requests-limit: 3 + labels: + - "dependencies" + - 
"javascript" + - "mcp" + commit-message: + prefix: "chore" + include: "scope" + + # GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + labels: + - "dependencies" + - "ci" + commit-message: + prefix: "ci" + include: "scope" \ No newline at end of file diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000..bb76c0f --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,34 @@ +# GitHub Release configuration +# This helps auto-generate better release notes + +changelog: + exclude: + labels: + - ignore-for-release + authors: + - dependabot + categories: + - title: 🚀 New Features + labels: + - enhancement + - feature + - title: 🐛 Bug Fixes + labels: + - bug + - fix + - title: 📚 Documentation + labels: + - documentation + - docs + - title: 🔧 Maintenance + labels: + - chore + - maintenance + - dependencies + - title: 🎉 New AI Services + labels: + - new-service + - model + - title: Other Changes + labels: + - "*" \ No newline at end of file diff --git a/.github/topics.txt b/.github/topics.txt new file mode 100644 index 0000000..5584bcc --- /dev/null +++ b/.github/topics.txt @@ -0,0 +1,39 @@ +# GitHub Topics to add (via repo settings): +# These help with GitHub search and discovery + +prompt-engineering +ai-optimization +llm-tools +cost-optimization +reddit-scraper +ai-tips +generative-ai +mcp-server +openai +anthropic +claude +gpt-4 +midjourney +stable-diffusion +eleven-labs +github-copilot +ai-tools +machine-learning +nlp +text-generation +image-generation +video-generation +audio-generation +community-driven +knowledge-base +best-practices +optimization-tips +cost-reduction +performance-optimization +web-scraping +python +automation +developer-tools +ai-services +prompt-templates +parameter-tuning \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..d02549a --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,28 @@ +# This CITATION file helps 
researchers cite your project properly +cff-version: 1.2.0 +title: 'SCAPO: Stay Calm and Prompt On' +message: 'If you use this software, please cite it as below.' +type: software +authors: + - name: "CZero Engine Team" + email: "info@czero.cc" + affiliation: "Fiefworks, Inc." +repository-code: 'https://github.com/czero-cc/scapo' +url: 'https://czero.cc' +abstract: 'A community-driven knowledge base that automatically extracts specific AI service optimization techniques from Reddit discussions, providing actionable tips for cost reduction and performance improvement.' +keywords: + - prompt-engineering + - ai-optimization + - llm-tools + - cost-optimization + - reddit-scraper + - ai-tips + - generative-ai + - mcp-server + - openai + - anthropic + - midjourney + - stable-diffusion +license: MIT +version: 0.1.0 +date-released: '2024-08-01' \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..05985d4 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,58 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members +* Sharing optimization tips and discoveries openly + +Examples of unacceptable behavior: + +* The use of sexualized language or imagery +* Trolling, insulting or derogatory comments, and personal attacks +* Public or private harassment +* Publishing others' private information without permission +* Other conduct which could reasonably be considered inappropriate + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +info@czero.cc. + +All complaints will be reviewed and investigated promptly and fairly. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 
+ +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html \ No newline at end of file diff --git a/README.md b/README.md index 499df72..2296e1d 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,15 @@ **The Community-Driven Knowledge Base for AI Service Optimization** +[![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Made with Love](https://img.shields.io/badge/Made%20with-❤️-red.svg)](https://github.com/czero-cc/scapo) -[![No API Keys](https://img.shields.io/badge/API%20Keys-Not%20Required-brightgreen.svg)]() +[![No API Keys](https://img.shields.io/badge/API%20Keys-Not%20Required-brightgreen.svg)]()
[![LLM Powered](https://img.shields.io/badge/LLM-Powered-blue.svg)]() [![Browser Magic](https://img.shields.io/badge/Scraping-Browser%20Based-orange.svg)]() [![MCP Ready](https://img.shields.io/badge/Claude-MCP%20Ready-purple.svg)]() [![PRs Welcome](https://img.shields.io/badge/PRs-Welcome-brightgreen.svg)](CONTRIBUTING.md) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) ### 🎯 Real optimization tips from real users for AI services @@ -19,6 +22,8 @@ ## 🤔 What is SCAPO? +**Keywords**: AI cost optimization, prompt engineering, LLM tips, OpenAI, Claude, Anthropic, Midjourney, Stable Diffusion, ElevenLabs, GitHub Copilot, reduce AI costs, AI service best practices, Reddit scraper, community knowledge base + Ever burned through credits in minutes? Searching Reddit for that one optimization tip? Getting generic advice when you need specific settings? ![Classic AI Frustration](assets/guy_freaking_out2.png) @@ -293,7 +298,7 @@ Help us build the community knowledge base for AI service optimization! - [Configuration Guide](docs/CONFIGURATION.md) - [Quick Start Guide](QUICKSTART.md) - [Contributing Guide](CONTRIBUTING.md) -- [Add New Source Tutorial](docs/ADD_NEW_SOURCE.md) +- [Add New Source Tutorial (legacy method)](docs/ADD_NEW_SOURCE.md) ## 📜 License @@ -304,9 +309,10 @@ Built as part of the CZero Engine project to improve AI application development. 
## 🙏 Acknowledgments - Reddit communities for sharing real experiences -- OpenRouter for accessible AI APIs +- [OpenRouter](https://openrouter.ai/) for accessible AI APIs - Coffee ☕ for making this possible -- Ollama and LMstudio for awesome local LLM experience +- [Ollama](https://ollama.com/) and [LMstudio](https://lmstudio.ai/) for awesome local LLM experience +- [Awesome Generative AI](https://github.com/steven2358/awesome-generative-ai) & [Awesome AI Tools](https://github.com/mahseema/awesome-ai-tools) for service discovery - All opensource contributors in this AI space --- From a8a6cb410b7cccbf903ba696e40424a5f25d28d3 Mon Sep 17 00:00:00 2001 From: arahangua Date: Wed, 13 Aug 2025 16:59:55 +0900 Subject: [PATCH 6/7] fixed:citation date alias --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index d02549a..51383c4 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -25,4 +25,4 @@ keywords: - stable-diffusion license: MIT version: 0.1.0 -date-released: '2024-08-01' \ No newline at end of file +date-released: '2025-08-13' \ No newline at end of file From d8d0e6d3c36c4851c1b4e4862933743c3aead380 Mon Sep 17 00:00:00 2001 From: arahangua Date: Wed, 13 Aug 2025 17:15:15 +0900 Subject: [PATCH 7/7] fixed:dependabot to target dev branch --- .github/dependabot.yml | 3 +++ .github/scripts/create_labels.sh | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 .github/scripts/create_labels.sh diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 965bd2b..cea6e6c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,6 +6,7 @@ updates: # Python dependencies - package-ecosystem: "pip" directory: "/" + target-branch: "dev" schedule: interval: "weekly" day: "monday" @@ -20,6 +21,7 @@ updates: # npm dependencies for MCP server - package-ecosystem: "npm" directory: "/mcp" + target-branch: "dev" schedule: interval: "weekly" day: "monday" @@ -35,6 +37,7 @@ updates: # GitHub 
Actions - package-ecosystem: "github-actions" directory: "/" + target-branch: "dev" schedule: interval: "monthly" labels: diff --git a/.github/scripts/create_labels.sh b/.github/scripts/create_labels.sh new file mode 100644 index 0000000..0daa287 --- /dev/null +++ b/.github/scripts/create_labels.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Script to create GitHub labels for Dependabot +# Run this script with GitHub CLI (gh) installed and authenticated + +echo "Creating GitHub labels for Dependabot..." + +# Create labels with appropriate colors +gh label create dependencies --description "Dependency updates" --color "0366d6" 2>/dev/null || echo "Label 'dependencies' already exists" +gh label create javascript --description "JavaScript related changes" --color "f7df1e" 2>/dev/null || echo "Label 'javascript' already exists" +gh label create mcp --description "MCP server related" --color "008672" 2>/dev/null || echo "Label 'mcp' already exists" +gh label create python --description "Python related changes" --color "3776ab" 2>/dev/null || echo "Label 'python' already exists" +gh label create ci --description "Continuous Integration" --color "e11d21" 2>/dev/null || echo "Label 'ci' already exists" + +echo "Label creation complete!" +echo "" +echo "Current labels:" +gh label list | grep -E "dependencies|javascript|mcp|python|ci" \ No newline at end of file