diff --git a/docs/skills/README.md b/docs/skills/README.md
index a83d484..9c1f962 100644
--- a/docs/skills/README.md
+++ b/docs/skills/README.md
@@ -24,6 +24,13 @@ Middleware skills that operate on text or state to increase performance, securit
 | :--- | :--- | :--- |
 | **[Prompt Token Rewriter](prompt_rewriter.md)** | `optimization/prompt_rewriter` | Aggressively compresses massive prompts or context histories while retaining semantic meaning to save tokens. |
 
+## Data Engineering
+Skills tailored for generating, parsing, and orchestrating large datasets for machine learning or analytics workflows.
+
+| Skill | ID | Description |
+| :--- | :--- | :--- |
+| **[Synthetic Data Generator](synthetic_generator.md)** | `data_engineering/synthetic_generator` | Generates high-entropy structured synthetic data for model fine-tuning to avoid mode collapse. |
+
 ---
 
 ## 📥 Installing Skills
diff --git a/docs/skills/synthetic_generator.md b/docs/skills/synthetic_generator.md
new file mode 100644
index 0000000..892eb15
--- /dev/null
+++ b/docs/skills/synthetic_generator.md
@@ -0,0 +1,83 @@
+# Synthetic Data Generator Skill
+
+**Domain:** `data_engineering`
+**Skill ID:** `data_engineering/synthetic_generator`
+
+A specialized data engineering capability that combats "model collapse" by generating high-entropy, highly structured synthetic data intentionally designed to fine-tune other models.
+
+## Capabilities
+
+* **Model Agnosticism**: Supports dynamic internal LLM configuration, letting the user trigger generation via Ollama (local), Google Gemini, or Anthropic Claude.
+* **Combinatorial Entropy Injection**: Designed to explicitly seek out edge-case personas via the `diversity_prompt`, significantly raising the variance of training data.
+* **Zero-Dependency Evaluation Heuristic**: Employs built-in `zlib` string compression ratios to calculate a dynamic entropy score, allowing the coordinating agent to reject low-entropy boilerplate data instantly.
+
+## Internal Architecture
+
+The skill is located in `skillware/skills/data_engineering/synthetic_generator/`.
+
+### 1. The Mind (`instructions.md`)
+The system instructions emphasize boundary-pushing data generation. It prohibits standard AI tropes and enforces schema obedience.
+
+### 2. The Body (`skill.py`)
+* **Data Generation**: The skill handles invoking the LLM behind the scenes, using the configured provider and isolating the `temperature` specifically for the data generation task so the primary coordinating agent doesn't need to run at high temperature.
+* **Validation**: Attempts to automatically parse out code blocks to extract standard JSON object arrays.
+* **Entropy Scoring**: Converts text sequences into `zlib` compressed bytes. A poor compression ratio implies high lexical variance (less repetitive syntax).
+
+## Integration Guide
+
+### Environment Variables
+Depending on the requested `model_provider`, ensure you have the necessary API key exported:
+
+```bash
+ANTHROPIC_API_KEY="sk-ant-..."
+GOOGLE_API_KEY="AIzaSy..."
+# Or run an Ollama server locally on default port 11434
+```
+
+### Usage (Skillware Loader)
+
+```python
+from skillware.core.loader import SkillLoader
+import json
+
+# 1. Load the Skill
+skill_bundle = SkillLoader.load_skill("data_engineering/synthetic_generator")
+SyntheticGeneratorSkill = skill_bundle['module'].SyntheticGeneratorSkill()
+
+# 2. Execute
+result = SyntheticGeneratorSkill.execute({
+    "domain": "medical_coding_disputes",
+    "num_samples": 5,
+    "entropy_temperature": 0.9,
+    "diversity_prompt": "Ensure edge-case scenarios involving dual-insurance coverage overlaps.",
+    "model_provider": "gemini"
+})
+
+print(f"Generated {result['samples_generated']} samples with Entropy Score: {result['entropy_score']}")
+print(json.dumps(result['samples'], indent=2))
+```
+
+## Data Schema
+
+The skill constructs a response validating the pipeline and containing the raw samples.
+
+```json
+{
+  "samples": [
+    {
+      "instruction": "Resolve the coding dispute for CPT 99291...",
+      "input": "Patient A admitted with BlueCross and Medicare...",
+      "output": "Since primary is exhausted..."
+    }
+  ],
+  "entropy_score": 0.88,
+  "status": "success",
+  "provider_used": "gemini",
+  "samples_generated": 1
+}
+```
+
+## Limitations
+
+* **Structure Consistency**: If the LLM generates improperly formatted JSON (despite the strict prompt), the parsing step may fail, requiring the agent to retry the skill execution.
+* **Heuristic Entropy**: The `zlib` entropy score evaluates lexical byte-variance, not semantic variance. It serves as a guardrail against robotic boilerplate repetition but is not mathematically bulletproof.
diff --git a/examples/build_dataset_demo.py b/examples/build_dataset_demo.py
new file mode 100644
index 0000000..d02d6ce
--- /dev/null
+++ b/examples/build_dataset_demo.py
@@ -0,0 +1,51 @@
+import json
+import time
+from skillware.core.loader import SkillLoader
+from skillware.core.env import load_env_file
+
+def main():
+    load_env_file()
+
+    print("Loading Synthetic Data Generator Skill...")
+    skill_bundle = SkillLoader.load_skill("data_engineering/synthetic_generator")
+    SyntheticGeneratorSkill = skill_bundle['module'].SyntheticGeneratorSkill
+
+    generator = SyntheticGeneratorSkill()
+
+    dataset = []
+
+    # We will generate 1 batch of 10 samples
+    print(f"\nGenerating 10 samples using Gemini...")
+    start_time = time.time()
+
+    result = generator.execute({
+        "domain": "medical_coding_disputes",
+        "num_samples": 10,
+        "entropy_temperature": 0.9,
+        "diversity_prompt": "Ensure personas are extremely erratic. Use rare edge-case medical scenarios like obscure comorbidities fighting with dual-insurance.",
+        "model_provider": "gemini",
+        "model_name": "gemini-2.5-flash-lite"
+    })
+
+    elapsed = time.time() - start_time
+    print(f"Time Taken: {elapsed:.2f} seconds")
+
+    if result.get("status") == "success":
+        score = result.get('entropy_score')
+        samples = result.get('samples', [])
+        print(f"✅ Success! Entropy Score: {score}")
+        print(f"Extracted {len(samples)} samples out of requested 10.")
+        dataset.extend(samples)
+    else:
+        print(f"❌ Failed: {result.get('message')}")
+
+    # Save the dataset
+    out_file = "synthetic_dataset.jsonl"
+    with open(out_file, "w", encoding="utf-8") as f:
+        for d in dataset:
+            f.write(json.dumps(d) + "\n")
+
+    print(f"\nSaved {len(dataset)} high-entropy samples to {out_file}")
+
+if __name__ == "__main__":
+    main()
diff --git a/flake8_report.txt b/flake8_report.txt
deleted file mode 100644
index a0b0355..0000000
Binary files a/flake8_report.txt and /dev/null differ
diff --git a/skills/data_engineering/synthetic_generator/__init__.py b/skills/data_engineering/synthetic_generator/__init__.py
new file mode 100644
index 0000000..49ae1d1
--- /dev/null
+++ b/skills/data_engineering/synthetic_generator/__init__.py
@@ -0,0 +1,3 @@
+from .skill import SyntheticGeneratorSkill
+
+__all__ = ["SyntheticGeneratorSkill"]
diff --git a/skills/data_engineering/synthetic_generator/card.json b/skills/data_engineering/synthetic_generator/card.json
new file mode 100644
index 0000000..3e67c0d
--- /dev/null
+++ b/skills/data_engineering/synthetic_generator/card.json
@@ -0,0 +1,27 @@
+{
+  "name": "Synthetic Data Generator",
+  "description": "Generates high-entropy structured synthetic data.",
+  "icon": "database",
+  "color": "blue",
+  "ui_schema": {
+    "type": "card",
+    "fields": [
+      {
+        "key": "status",
+        "label": "Status"
+      },
+      {
+        "key": "entropy_score",
+        "label": "Entropy Score"
+      },
+      {
+        "key": "samples_generated",
+        "label": "Samples Generated"
+      },
+      {
+        "key": "provider_used",
+        "label": "LLM Provider"
+      }
+    ]
+  }
+}
diff --git a/skills/data_engineering/synthetic_generator/instructions.md b/skills/data_engineering/synthetic_generator/instructions.md
new file mode 100644
index 0000000..c841fa4
--- /dev/null
+++ b/skills/data_engineering/synthetic_generator/instructions.md
@@ -0,0 +1,6 @@
+# Synthesize High-Entropy Data
+
+You are using the `data_engineering/synthetic_generator` skill.
+Use this skill when the user asks you to generate robust, varied, and edge-case synthetic data (such as JSON fine-tuning data) for machine learning training.
+
+Ensure that your `diversity_prompt` is highly descriptive and enforces non-standard formulations, preventing "model collapse" by pushing boundaries.
diff --git a/skills/data_engineering/synthetic_generator/manifest.yaml b/skills/data_engineering/synthetic_generator/manifest.yaml
new file mode 100644
index 0000000..4f5c3e8
--- /dev/null
+++ b/skills/data_engineering/synthetic_generator/manifest.yaml
@@ -0,0 +1,30 @@
+name: "data_engineering/synthetic_generator"
+version: "0.1.0"
+description: "Generates high-entropy structured synthetic data for model fine-tuning to avoid mode collapse."
+requirements: []
+parameters:
+  type: "object"
+  properties:
+    domain:
+      type: "string"
+      description: "The core domain or topic (e.g. 'medical_coding_disputes')."
+    num_samples:
+      type: "integer"
+      description: "Number of JSONL samples to generate."
+    entropy_temperature:
+      type: "number"
+      description: "Temperature setting for the generation model (higher = more unique/random)."
+    diversity_prompt:
+      type: "string"
+      description: "Instruction for edge-cases or combinatorial personas to boost entropy."
+    model_provider:
+      type: "string"
+      description: "Which LLM provider to use internally ('ollama', 'gemini', 'anthropic'). Default is 'ollama'."
+    model_name:
+      type: "string"
+      description: "Specific model name (e.g., 'llama3', 'gemini-1.5-pro', 'claude-3-haiku-20240307')."
+  required:
+    - domain
+    - num_samples
+    - entropy_temperature
+    - diversity_prompt
diff --git a/skills/data_engineering/synthetic_generator/skill.py b/skills/data_engineering/synthetic_generator/skill.py
new file mode 100644
index 0000000..7a9ad89
--- /dev/null
+++ b/skills/data_engineering/synthetic_generator/skill.py
@@ -0,0 +1,128 @@
+import os
+import zlib
+import json
+from typing import Dict, Any
+from skillware.core.base_skill import BaseSkill
+
+class SyntheticGeneratorSkill(BaseSkill):
+    """
+    A skill that generates high-entropy synthetic data using supported internal LLMs,
+    and validates the generated text with zlib-based entropy scoring.
+    """
+
+    @property
+    def manifest(self) -> Dict[str, Any]:
+        manifest_path = os.path.join(os.path.dirname(__file__), "manifest.yaml")
+        if os.path.exists(manifest_path):
+            import yaml
+            with open(manifest_path, "r", encoding="utf-8") as f:
+                return yaml.safe_load(f)
+        return {}
+
+    def _calculate_entropy_score(self, text: str) -> float:
+        """
+        Calculates a heuristic entropy score using zlib compression ratio.
+        Higher score = less compressible = higher entropy (more random/diverse).
+        """
+        if not text:
+            return 0.0
+        encoded = text.encode("utf-8")
+        compressed = zlib.compress(encoded)
+        ratio = len(compressed) / len(encoded)
+        # Ratio often ranges from 0.2 (low entropy) to 0.9 (high entropy, random)
+        return round(min(ratio * 1.5, 1.0), 3) # Scaled for readability
+
+    def _call_gemini(self, prompt: str, temperature: float, model_name: str) -> str:
+        import google.generativeai as genai
+        # Initialize with config or env
+        api_key = self.config.get("GOOGLE_API_KEY") or os.environ.get("GOOGLE_API_KEY")
+        if api_key:
+            genai.configure(api_key=api_key)
+        model = genai.GenerativeModel(model_name)
+        response = model.generate_content(
+            prompt,
+            generation_config=genai.types.GenerationConfig(temperature=temperature)
+        )
+        return response.text
+
+    def _call_anthropic(self, prompt: str, temperature: float, model_name: str) -> str:
+        import anthropic
+        api_key = self.config.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_API_KEY")
+        client = anthropic.Anthropic(api_key=api_key)
+        message = client.messages.create(
+            model=model_name,
+            max_tokens=4096,
+            temperature=temperature,
+            messages=[{"role": "user", "content": prompt}]
+        )
+        return message.content[0].text
+
+    def _call_ollama(self, prompt: str, temperature: float, model_name: str) -> str:
+        import ollama
+        response = ollama.chat(
+            model=model_name,
+            messages=[{"role": "user", "content": prompt}],
+            options={"temperature": temperature}
+        )
+        return response.get("message", {}).get("content", "")
+
+    def execute(self, params: Dict[str, Any]) -> Any:
+        domain = params.get("domain")
+        num_samples = params.get("num_samples")
+        temperature = float(params.get("entropy_temperature", 0.8))
+        diversity_prompt = params.get("diversity_prompt")
+
+        provider = params.get("model_provider", "ollama").lower()
+        model_name = params.get("model_name")
+
+        if not model_name:
+            if provider == "ollama": model_name = "llama3"
+            elif provider == "gemini": model_name = "gemini-1.5-flash"
+            elif provider == "anthropic": model_name = "claude-3-haiku-20240307"
+
+        system_prompt = (
+            f"You are a synthetic data generator mimicking extreme diversity for the domain: '{domain}'.\n"
+            f"Your output MUST be exactly {num_samples} distinct samples combined into a strict JSON array.\n"
+            f"Constraint: {diversity_prompt}\n"
+            "Format: Return ONLY a valid JSON array of objects. Do not add any conversational text. Use keys 'instruction', 'input', and 'output'."
+        )
+
+        try:
+            if provider == "gemini":
+                raw_text = self._call_gemini(system_prompt, temperature, model_name)
+            elif provider == "anthropic":
+                raw_text = self._call_anthropic(system_prompt, temperature, model_name)
+            else:
+                raw_text = self._call_ollama(system_prompt, temperature, model_name)
+        except Exception as e:
+            return {"status": "error", "message": f"LLM Call Failed via {provider}: {str(e)}"}
+
+        # Attempt to parse json
+        samples = []
+        try:
+            # Basic cleanup to extract array
+            cleaned = raw_text.strip()
+            if "```json" in cleaned:
+                cleaned = cleaned.split("```json")[-1].split("```")[0].strip()
+            elif "```" in cleaned:
+                # Take the content BETWEEN the first fence pair ([1]), not after the
+                # last fence ([-1]), which is empty when the block is properly closed.
+                cleaned = cleaned.split("```")[1].split("```")[0].strip()
+
+            parsed = json.loads(cleaned)
+            if isinstance(parsed, list):
+                samples = parsed
+            else:
+                samples = [parsed]
+        except Exception as e:
+            return {"status": "error", "message": f"Failed to parse LLM output into JSON array: {e}", "raw_output": raw_text}
+
+        # Calculate Entropy
+        all_text = " ".join([str(s) for s in samples])
+        score = self._calculate_entropy_score(all_text)
+
+        return {
+            "samples": samples,
+            "entropy_score": score,
+            "status": "success",
+            "provider_used": provider,
+            "samples_generated": len(samples)
+        }
diff --git a/templates/python_skill/card.json b/templates/python_skill/card.json
new file mode 100644
index 0000000..e3cc63d
--- /dev/null
+++ b/templates/python_skill/card.json
@@ -0,0 +1,15 @@
+{
+  "name": "My Skill",
+  "description": "A template description.",
+  "icon": "zap",
+  "color": "gray",
+  "ui_schema": {
+    "type": "card",
+    "fields": [
+      {
+        "key": "status",
+        "label": "Status"
+      }
+    ]
+  }
+}
diff --git a/tests/skills/data_engineering/test_synthetic_generator.py b/tests/skills/data_engineering/test_synthetic_generator.py
new file mode 100644
index 0000000..d2e63d7
--- /dev/null
+++ b/tests/skills/data_engineering/test_synthetic_generator.py
@@ -0,0 +1,48 @@
+import pytest
+from skillware.core.loader import SkillLoader
+
+def test_synthetic_generator_manifest():
+    bundle = SkillLoader.load_skill("data_engineering/synthetic_generator")
+    assert bundle["manifest"]["name"] == "data_engineering/synthetic_generator"
+    assert "entropy_temperature" in bundle["manifest"]["parameters"]['properties']
+
+def test_entropy_score():
+    bundle = SkillLoader.load_skill("data_engineering/synthetic_generator")
+    skill_class = bundle["module"].SyntheticGeneratorSkill
+    skill = skill_class()
+
+    # Highly repetitive, low entropy
+    low_entropy_text = "test " * 100
+    score_low = skill._calculate_entropy_score(low_entropy_text)
+
+    # More diverse, higher entropy
+    high_entropy_text = "The quick brown fox jumps over the lazy dog. Programming is fun! Diverse sentences."
+    score_high = skill._calculate_entropy_score(high_entropy_text)
+
+    assert score_high > score_low
+
+def test_execute_success(mocker):
+    bundle = SkillLoader.load_skill("data_engineering/synthetic_generator")
+    skill = bundle["module"].SyntheticGeneratorSkill()
+
+    mock_json_response = '''```json
+[
+    {"instruction": "do this", "input": "input data", "output": "output data"}
+]
+```'''
+
+    # Mock the gemini call to avoid hitting realistic endpoints
+    mocker.patch.object(skill, '_call_gemini', return_value=mock_json_response)
+
+    result = skill.execute({
+        "domain": "test domain",
+        "num_samples": 1,
+        "diversity_prompt": "be diverse",
+        "model_provider": "gemini"
+    })
+
+    assert result["status"] == "success"
+    assert result["provider_used"] == "gemini"
+    assert result["samples_generated"] == 1
+    assert "samples" in result
+    assert result["samples"][0]["instruction"] == "do this"