From 23224bf5e0abad557ceeb605986a10b16e6f6ca9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Oct 2025 10:00:19 +0000
Subject: [PATCH 1/3] Initial plan
From a21475723e2a2729279dc94c196c3b81b2989440 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Oct 2025 10:08:03 +0000
Subject: [PATCH 2/3] Add purpose-map.md and feature-surface.json reports
Co-authored-by: DeepExtrema <175066046+DeepExtrema@users.noreply.github.com>
---
reports/feature-surface.json | 621 +++++++++++++++++++++++++++++++++++
reports/purpose-map.md | 156 +++++++++
2 files changed, 777 insertions(+)
create mode 100644 reports/feature-surface.json
create mode 100644 reports/purpose-map.md
diff --git a/reports/feature-surface.json b/reports/feature-surface.json
new file mode 100644
index 0000000..c635a8b
--- /dev/null
+++ b/reports/feature-surface.json
@@ -0,0 +1,621 @@
+{
+ "repository": {
+ "name": "Sherlock-Multiagent-Data-Scientist",
+ "owner": "DeepExtrema",
+ "version": "2.1.0",
+ "purpose": "End-to-end, orchestrator-driven data science platform enabling EDA, data quality validation, feature engineering, and model training through microservices agents with real-time observability",
+ "architecture": "microservices",
+ "license": {
+ "type": "hybrid",
+ "server": "BUSL-1.1",
+ "clients": "Apache-2.0"
+ }
+ },
+ "features": [
+ {
+ "id": "dataset-upload-load",
+ "name": "Upload & Load Dataset",
+ "description": "Upload CSV/Parquet datasets and load into memory for analysis",
+ "primary_files": [
+ "mcp-server/master_orchestrator_api.py",
+ "mcp-server/eda_agent.py",
+ "mcp-server/data_sources/"
+ ],
+ "endpoints": [
+ {
+ "method": "POST",
+ "path": "/datasets/upload",
+ "service": "master_orchestrator",
+ "port": 8000
+ },
+ {
+ "method": "POST",
+ "path": "/load_data",
+ "service": "eda_agent",
+ "port": 8001
+ }
+ ],
+ "entry_commands": [
+ "curl -X POST http://localhost:8000/datasets/upload -F \"file=@data.csv\" -F \"name=my_dataset\"",
+ "python start_master_orchestrator.py"
+ ],
+ "data_dependencies": [
+ "local_filesystem",
+ "mongodb_optional"
+ ],
+ "risk_level": "medium",
+ "risk_notes": "File upload validation needed; supports CSV, Parquet formats"
+ },
+ {
+ "id": "exploratory-data-analysis",
+ "name": "Exploratory Data Analysis (EDA)",
+ "description": "Statistical summaries, missing data analysis, correlation matrices, outlier detection, and publication-ready visualizations",
+ "primary_files": [
+ "mcp-server/eda_agent.py",
+ "mcp-server/server.py",
+ "docs/USER_GUIDE.md"
+ ],
+ "endpoints": [
+ {
+ "method": "GET",
+ "path": "/health",
+ "service": "eda_agent",
+ "port": 8001
+ },
+ {
+ "method": "POST",
+ "path": "/basic_info",
+ "service": "eda_agent",
+ "port": 8001
+ },
+ {
+ "method": "POST",
+ "path": "/statistical_summary",
+ "service": "eda_agent",
+ "port": 8001
+ },
+ {
+ "method": "POST",
+ "path": "/missing_data_analysis",
+ "service": "eda_agent",
+ "port": 8001
+ },
+ {
+ "method": "POST",
+ "path": "/detect_outliers",
+ "service": "eda_agent",
+ "port": 8001
+ },
+ {
+ "method": "POST",
+ "path": "/create_visualization",
+ "service": "eda_agent",
+ "port": 8001
+ }
+ ],
+ "entry_commands": [
+ "python start_eda_service.py",
+ "curl http://localhost:8001/health"
+ ],
+ "data_dependencies": [
+ "redis_caching",
+ "dataset_in_memory"
+ ],
+ "risk_level": "medium",
+ "risk_notes": "Large datasets may cause memory issues; 10k row sampling used for correlations"
+ },
+ {
+ "id": "data-quality-validation",
+ "name": "Data Quality Validation",
+ "description": "Schema consistency checks, missing value analysis, distribution validation, drift detection, and comprehensive quality reports",
+ "primary_files": [
+ "mcp-server/refinery_agent.py",
+ "mcp-server/REFINERY_AGENT_GUIDE.md",
+ "mcp-server/dq/"
+ ],
+ "endpoints": [
+ {
+ "method": "POST",
+ "path": "/execute",
+ "service": "refinery_agent",
+ "port": 8005
+ },
+ {
+ "method": "GET",
+ "path": "/health",
+ "service": "refinery_agent",
+ "port": 8005
+ },
+ {
+ "method": "GET",
+ "path": "/metrics",
+ "service": "refinery_agent",
+ "port": 8005
+ }
+ ],
+ "actions": [
+ "check_schema_consistency",
+ "check_missing_values",
+ "check_distributions",
+ "check_duplicates",
+ "check_leakage",
+ "check_drift",
+ "comprehensive_quality_report"
+ ],
+ "entry_commands": [
+ "python -m uvicorn refinery_agent:app --port 8005"
+ ],
+ "data_dependencies": [
+ "evidently_library",
+ "dataset_files"
+ ],
+ "risk_level": "low",
+ "risk_notes": "Read-only operations; comprehensive validation framework"
+ },
+ {
+ "id": "feature-engineering",
+ "name": "Feature Engineering Pipeline",
+ "description": "Imputation, scaling, encoding, datetime features, text vectorization, feature interactions, and selection",
+ "primary_files": [
+ "mcp-server/refinery_agent.py",
+ "mcp-server/fe/"
+ ],
+ "endpoints": [
+ {
+ "method": "POST",
+ "path": "/execute",
+ "service": "refinery_agent",
+ "port": 8005
+ }
+ ],
+ "actions": [
+ "assign_feature_roles",
+ "basic_impute_missing_values",
+ "basic_scale_numeric_features",
+ "basic_encode_categorical_features",
+ "basic_generate_datetime_features",
+ "basic_vectorise_text_features",
+ "basic_generate_interactions",
+ "basic_select_features",
+ "save_fe_pipeline",
+ "execute_feature_pipeline",
+ "advanced_impute_missing_values",
+ "advanced_encode_categorical_features",
+ "advanced_feature_selection"
+ ],
+ "entry_commands": [
+ "curl -X POST http://localhost:8005/execute -H \"Content-Type: application/json\" -d '{\"action\": \"basic_impute_missing_values\", \"params\": {\"data_path\": \"data.csv\"}}'"
+ ],
+ "data_dependencies": [
+ "scikit_learn_pipelines",
+ "feature_metadata"
+ ],
+ "risk_level": "medium",
+ "risk_notes": "Transformation logic; mode validation prevents accidental transforms"
+ },
+ {
+ "id": "model-training-evaluation",
+ "name": "Model Training & Evaluation",
+ "description": "Class imbalance analysis, train/validation/test splits, cross-validation, baseline models, experiment tracking with MLflow",
+ "primary_files": [
+ "mcp-server/ml_agent.py",
+ "mcp-server/ML_WORKFLOW_GUIDE.md",
+ "mcp-server/mlruns/"
+ ],
+ "endpoints": [
+ {
+ "method": "GET",
+ "path": "/health",
+ "service": "ml_agent",
+ "port": 8002
+ },
+ {
+ "method": "POST",
+ "path": "/class_imbalance",
+ "service": "ml_agent",
+ "port": 8002
+ },
+ {
+ "method": "POST",
+ "path": "/train_validation_test",
+ "service": "ml_agent",
+ "port": 8002
+ },
+ {
+ "method": "POST",
+ "path": "/baseline_sanity",
+ "service": "ml_agent",
+ "port": 8002
+ },
+ {
+ "method": "POST",
+ "path": "/experiment_tracking",
+ "service": "ml_agent",
+ "port": 8002
+ }
+ ],
+ "entry_commands": [
+ "python -m uvicorn ml_agent:app --port 8002"
+ ],
+ "data_dependencies": [
+ "mlflow_backend",
+ "training_datasets",
+ "scikit_learn"
+ ],
+ "risk_level": "high",
+ "risk_notes": "Model persistence, hyperparameter tuning; GPU/CPU resource management required"
+ },
+ {
+ "id": "workflow-orchestration",
+ "name": "Workflow Orchestration",
+ "description": "Task coordination, dependency management, retry logic, deadlock monitoring, graceful cancellation, artifact management",
+ "primary_files": [
+ "mcp-server/master_orchestrator_api.py",
+ "mcp-server/orchestrator/workflow_manager.py",
+ "mcp-server/orchestrator/agent_registry.py",
+ "mcp-server/orchestrator/sla_monitor.py"
+ ],
+ "endpoints": [
+ {
+ "method": "GET",
+ "path": "/health",
+ "service": "master_orchestrator",
+ "port": 8000
+ },
+ {
+ "method": "POST",
+ "path": "/workflows/start",
+ "service": "master_orchestrator",
+ "port": 8000
+ },
+ {
+ "method": "GET",
+ "path": "/runs/{run_id}/status",
+ "service": "master_orchestrator",
+ "port": 8000
+ },
+ {
+ "method": "GET",
+ "path": "/runs/{run_id}/artifacts",
+ "service": "master_orchestrator",
+ "port": 8000
+ },
+ {
+ "method": "DELETE",
+ "path": "/runs/{run_id}",
+ "service": "master_orchestrator",
+ "port": 8000
+ }
+ ],
+ "entry_commands": [
+ "python start_master_orchestrator.py",
+ "curl http://localhost:8000/docs"
+ ],
+ "data_dependencies": [
+ "mongodb_runs",
+ "redis_locks",
+ "kafka_events"
+ ],
+ "risk_level": "medium",
+ "risk_notes": "Deadlock monitoring, graceful cancellation; task dependencies managed"
+ },
+ {
+ "id": "realtime-observability",
+ "name": "Real-time Observability Dashboard",
+ "description": "React-based dashboard with live event streaming, workflow status, agent health monitoring, and performance charts",
+ "primary_files": [
+ "dashboard-ui/src/",
+ "dashboard-ui/package.json"
+ ],
+ "endpoints": [
+ {
+ "method": "WebSocket",
+ "path": "/ws/events",
+ "service": "master_orchestrator",
+ "port": 8000
+ }
+ ],
+ "entry_commands": [
+ "cd dashboard-ui && npm run dev",
+ "open http://localhost:3000"
+ ],
+ "data_dependencies": [
+ "fastapi_websocket",
+ "kafka_event_stream",
+ "recent_runs_api"
+ ],
+ "risk_level": "medium",
+ "risk_notes": "WebSocket connection stability; event streaming from Kafka"
+ }
+ ],
+ "runnable_surfaces": {
+ "services": [
+ {
+ "name": "master_orchestrator",
+ "type": "fastapi_service",
+ "entrypoint": "mcp-server/start_master_orchestrator.py",
+ "alternative_entrypoint": "python master_orchestrator_api.py",
+ "port": 8000,
+ "health_endpoint": "/health",
+ "api_docs": "/docs",
+ "description": "Workflow coordination, task dispatch, artifact management",
+ "startup_command": "python start_master_orchestrator.py"
+ },
+ {
+ "name": "eda_agent",
+ "type": "fastapi_service",
+ "entrypoint": "mcp-server/start_eda_service.py",
+ "alternative_entrypoint": "python eda_agent.py",
+ "port": 8001,
+ "health_endpoint": "/health",
+ "api_docs": "/docs",
+ "description": "Data loading, statistical analysis, visualization, outlier detection",
+ "startup_command": "python start_eda_service.py"
+ },
+ {
+ "name": "refinery_agent",
+ "type": "fastapi_service",
+ "entrypoint": "mcp-server/refinery_agent.py",
+ "port": 8005,
+ "health_endpoint": "/health",
+ "api_docs": "/docs",
+ "description": "Data quality validation, feature engineering, drift detection",
+ "startup_command": "python -m uvicorn refinery_agent:app --port 8005"
+ },
+ {
+ "name": "ml_agent",
+ "type": "fastapi_service",
+ "entrypoint": "mcp-server/ml_agent.py",
+ "port": 8002,
+ "health_endpoint": "/health",
+ "api_docs": "/docs",
+ "description": "Model training, cross-validation, baseline checks, experiment tracking",
+ "startup_command": "python -m uvicorn ml_agent:app --port 8002"
+ },
+ {
+ "name": "mcp_server",
+ "type": "mcp_protocol",
+ "entrypoint": "mcp-server/server.py",
+ "alternative_entrypoint": "python launch_server.py",
+ "port": null,
+ "health_endpoint": null,
+ "description": "Claude Desktop integration for conversational data analysis",
+ "startup_command": "python launch_server.py"
+ },
+ {
+ "name": "dashboard_ui",
+ "type": "react_spa",
+ "entrypoint": "dashboard-ui/",
+ "port": 3000,
+ "health_endpoint": null,
+ "description": "React-based real-time monitoring interface",
+ "startup_command": "cd dashboard-ui && npm run dev"
+ }
+ ],
+ "infrastructure": [
+ {
+ "name": "mongodb",
+ "type": "database",
+ "startup_command": "docker-compose up -d mongodb",
+ "required_by": [
+ "master_orchestrator"
+ ],
+ "purpose": "Run persistence, task history"
+ },
+ {
+ "name": "redis",
+ "type": "cache",
+ "startup_command": "docker-compose up -d redis",
+ "required_by": [
+ "all_agents"
+ ],
+ "purpose": "Caching, distributed locks, translation queue"
+ },
+ {
+ "name": "kafka",
+ "type": "message_queue",
+ "startup_command": "docker-compose up -d kafka",
+ "required_by": [
+ "master_orchestrator",
+ "dashboard_ui"
+ ],
+ "purpose": "Event streaming, task routing"
+ },
+ {
+ "name": "nginx",
+ "type": "reverse_proxy",
+ "startup_command": "docker-compose up -d nginx",
+ "required_by": [
+ "production_deployment"
+ ],
+ "purpose": "Load balancing, reverse proxy",
+ "optional": true
+ }
+ ],
+ "docker_deployment": [
+ {
+ "name": "full_stack",
+ "compose_file": "mcp-server/docker-compose.yml",
+ "startup_command": "cd mcp-server && docker-compose up -d",
+ "description": "Full multi-service deployment with infrastructure"
+ },
+ {
+ "name": "local_development",
+ "compose_file": "mcp-server/docker-compose.local.yml",
+ "startup_command": "cd mcp-server && docker-compose -f docker-compose.local.yml up -d",
+ "description": "Local development variant"
+ },
+ {
+ "name": "production",
+ "script": "mcp-server/production_deployment.py",
+ "startup_command": "python production_deployment.py",
+ "description": "Production-ready deployment with monitoring"
+ }
+ ],
+ "scripts": [
+ {
+ "name": "install_orchestrator",
+ "path": "mcp-server/install_orchestrator.py",
+ "purpose": "Setup dependencies, create directories, verify infrastructure",
+ "command": "python install_orchestrator.py"
+ },
+ {
+ "name": "production_deployment",
+ "path": "mcp-server/production_deployment.py",
+ "purpose": "Production-ready deployment with monitoring",
+ "command": "python production_deployment.py"
+ },
+ {
+ "name": "connectivity_tester",
+ "path": "mcp-server/connectivity_tester.py",
+ "purpose": "Test inter-service connectivity",
+ "command": "python connectivity_tester.py"
+ },
+ {
+ "name": "verify_setup",
+ "path": "mcp-server/verify_setup.py",
+ "purpose": "Verify Python dependencies and configuration",
+ "command": "python verify_setup.py"
+ },
+ {
+ "name": "bug_hunter",
+ "path": "mcp-server/bug_hunter.py",
+ "purpose": "Diagnostic tool for troubleshooting",
+ "command": "python bug_hunter.py"
+ }
+ ]
+ },
+ "data_flow": {
+ "entry_point": "master_orchestrator:8000",
+ "event_bus": "kafka",
+ "storage": [
+ {
+ "type": "mongodb",
+ "database": "deepline",
+ "collections": [
+ "workflow_runs",
+ "task_status",
+ "artifacts_metadata"
+ ]
+ },
+ {
+ "type": "redis",
+ "purpose": [
+ "caching",
+ "distributed_locks",
+ "translation_queue"
+ ],
+ "queue_name": "translation:q"
+ },
+ {
+ "type": "mlflow",
+ "path": "mcp-server/mlruns/",
+ "purpose": [
+ "experiment_tracking",
+ "model_registry"
+ ]
+ },
+ {
+ "type": "filesystem",
+ "paths": [
+ {
+ "path": "artifacts/",
+ "purpose": "workflow_artifacts_reports_visualizations"
+ },
+ {
+ "path": "snapshots/",
+ "purpose": "data_versioning"
+ },
+ {
+ "path": "mcp-server/data_sources/",
+ "purpose": "uploaded_datasets"
+ }
+ ]
+ }
+ ]
+ },
+ "configuration": {
+ "primary": "mcp-server/config.yaml",
+ "environment": ".env",
+ "claude_integration": "claude_desktop_config.json",
+ "key_settings": {
+ "max_concurrent_workflows": 1,
+ "max_rows_processed": 100000,
+ "correlation_sample_size": 10000,
+ "deadlock_check_interval_s": 60,
+ "task_timeout_s": 600,
+ "workflow_timeout_s": 3600
+ }
+ },
+ "risk_summary": {
+ "high": {
+ "count": 1,
+ "features": [
+ "model-training-evaluation"
+ ],
+ "mitigation": "Resource quotas, GPU/CPU agent routing"
+ },
+ "medium": {
+ "count": 5,
+ "features": [
+ "dataset-upload-load",
+ "exploratory-data-analysis",
+ "feature-engineering",
+ "workflow-orchestration",
+ "realtime-observability"
+ ],
+ "mitigation": "Health checks, deadlock monitoring, graceful cancellation, SLA monitoring"
+ },
+ "low": {
+ "count": 1,
+ "features": [
+ "data-quality-validation"
+ ],
+ "mitigation": "Read-only operations, comprehensive validation"
+ }
+ },
+ "assumptions": {
+ "confidence_level": "high",
+ "uncertainty_percentage": 15,
+ "list": [
+ "Primary use case: Data scientists performing end-to-end analysis workflows via API/UI",
+ "Deployment target: Single-node or small-cluster deployment with Docker Compose",
+ "Data scale: Datasets up to 100k rows (configurable)",
+ "Concurrency: 1 workflow at a time (configurable)",
+ "Authentication: Development mode - production deployment includes API key support"
+ ]
+ },
+ "uncertainties": [
+ {
+ "item": "Missing Mission Definition Module",
+ "details": "Mentioned in COMPREHENSIVE_SYSTEM_AUDIT_REPORT.md but no implementation found",
+ "impact": "low"
+ },
+ {
+ "item": "Data Governance Implementation",
+ "details": "Configuration exists but enforcement unclear",
+ "impact": "medium"
+ },
+ {
+ "item": "Kafka/MongoDB fallback behavior",
+ "details": "System continues with reduced functionality per CONNECTIVITY_TEST_REPORT.md",
+ "impact": "low"
+ },
+ {
+ "item": "Multi-tenant support",
+ "details": "Not implemented; roadmap item per README.md",
+ "impact": "medium"
+ },
+ {
+ "item": "Hybrid API translation",
+ "details": "LLM integration references Claude/Llama2 but implementation incomplete",
+ "impact": "medium"
+ }
+ ],
+ "metadata": {
+ "generated_date": "2025-10-13",
+ "generator": "A0 Purpose & Surface Mapper",
+ "repository_version": "2.1.0",
+ "last_updated": "2024-01"
+ }
+}
diff --git a/reports/purpose-map.md b/reports/purpose-map.md
new file mode 100644
index 0000000..a67e811
--- /dev/null
+++ b/reports/purpose-map.md
@@ -0,0 +1,156 @@
+# Purpose & Surface Map: Sherlock Multiagent Data Scientist
+
+## MVP Purpose
+
+**Sherlock is an end-to-end, orchestrator-driven data science platform that enables users to perform exploratory data analysis, data quality validation, feature engineering, and model training through microservices agents coordinated by a master orchestrator with real-time observability.**
+
+---
+
+## Feature & User Journey Mapping
+
+| Journey | Primary Files | Entry Command | Data Dependencies | Risk |
+|---------|--------------|---------------|-------------------|------|
+| **1. Upload & Load Dataset** | `mcp-server/master_orchestrator_api.py` (POST /datasets/upload)<br>`mcp-server/eda_agent.py` (POST /load_data)<br>`mcp-server/data_sources/` | `curl -X POST http://localhost:8000/datasets/upload -F "file=@data.csv" -F "name=my_dataset"`<br>OR<br>`python start_master_orchestrator.py` (then API call) | MongoDB (optional), Local filesystem | **M** - File upload validation needed; supports CSV, Parquet |
+| **2. Exploratory Data Analysis (EDA)** | `mcp-server/eda_agent.py` (/basic_info, /statistical_summary, /missing_data_analysis, /detect_outliers)<br>`mcp-server/server.py` (MCP tools)<br>`docs/USER_GUIDE.md` | `python start_eda_service.py`<br>Service: http://localhost:8001<br>Health: http://localhost:8001/health | Redis (caching), Dataset loaded in memory | **M** - Large datasets may cause memory issues; 10k row sampling used for correlations |
+| **3. Data Quality Validation** | `mcp-server/refinery_agent.py` (POST /execute)<br>Actions: check_schema_consistency, check_missing_values, check_distributions, check_drift<br>`mcp-server/REFINERY_AGENT_GUIDE.md`<br>`mcp-server/dq/` modules | `python -m uvicorn refinery_agent:app --port 8005`<br>OR via orchestrator workflow | Evidently library, Dataset files | **L** - Read-only operations; comprehensive validation framework |
+| **4. Feature Engineering Pipeline** | `mcp-server/refinery_agent.py` (feature engineering actions)<br>`mcp-server/fe/` modules<br>Actions: basic_impute_missing_values, basic_scale_numeric_features, basic_encode_categorical_features | Via POST /execute to http://localhost:8005<br>Dual-mode operation (data_quality vs feature_engineering) | Scikit-learn pipelines, Feature metadata | **M** - Transformation logic; mode validation prevents accidental transforms |
+| **5. Model Training & Evaluation** | `mcp-server/ml_agent.py` (POST /train_validation_test, /class_imbalance, /baseline_sanity)<br>`mcp-server/ML_WORKFLOW_GUIDE.md`<br>`mcp-server/mlruns/` (MLflow) | `python -m uvicorn ml_agent:app --port 8002`<br>Service: http://localhost:8002 | MLflow backend, Training datasets, Scikit-learn | **H** - Model persistence, hyperparameter tuning; GPU/CPU resource management |
+| **6. Workflow Orchestration** | `mcp-server/master_orchestrator_api.py` (POST /workflows/start, GET /runs/{run_id}/status)<br>`mcp-server/orchestrator/workflow_manager.py`<br>`mcp-server/orchestrator/` (agent_registry, sla_monitor) | `python start_master_orchestrator.py`<br>API: http://localhost:8000<br>Docs: http://localhost:8000/docs | MongoDB (run persistence), Redis (locks), Kafka (events) | **M** - Deadlock monitoring, graceful cancellation; task dependencies managed |
+| **7. Real-time Observability Dashboard** | `dashboard-ui/src/` (React SPA)<br>`dashboard-ui/package.json`<br>Backend: WebSocket `/ws/events` | `cd dashboard-ui && npm run dev`<br>UI: http://localhost:3000 | FastAPI WebSocket, Kafka event stream, Recent runs API | **M** - WebSocket connection stability; event streaming from Kafka |
+
+---
+
+## Runnable Surfaces & Entrypoints
+
+### Core Services (Microservices Architecture)
+
+| Service | Entrypoint Script | Port | Health Check | Description |
+|---------|------------------|------|-------------|-------------|
+| **Master Orchestrator** | `mcp-server/start_master_orchestrator.py`<br>OR `python master_orchestrator_api.py` | 8000 | `/health` | Workflow coordination, task dispatch, artifact management |
+| **EDA Agent** | `mcp-server/start_eda_service.py`<br>OR `python eda_agent.py` | 8001 | `/health` | Data loading, statistical analysis, visualization, outlier detection |
+| **Refinery Agent** | `python -m uvicorn refinery_agent:app --port 8005`<br>OR via Docker | 8005 | `/health` | Data quality validation, feature engineering, drift detection |
+| **ML Agent** | `python -m uvicorn ml_agent:app --port 8002` | 8002 | `/health` | Model training, cross-validation, baseline checks, experiment tracking |
+| **MCP Server** | `mcp-server/server.py` (MCP protocol)<br>OR `python launch_server.py` | N/A | N/A | Claude Desktop integration for conversational data analysis |
+| **Dashboard UI** | `cd dashboard-ui && npm run dev` | 3000 | N/A | React-based real-time monitoring interface |
+
+### Infrastructure Dependencies
+
+| Component | Start Command | Required By | Notes |
+|-----------|--------------|-------------|-------|
+| **MongoDB** | `docker-compose up -d mongodb` | Master Orchestrator | Run persistence, task history |
+| **Redis** | `docker-compose up -d redis` | All agents | Caching, distributed locks, translation queue |
+| **Kafka** | `docker-compose up -d kafka` | Master Orchestrator, Dashboard | Event streaming, task routing |
+| **Nginx** (Optional) | `docker-compose up -d nginx` | Production deployment | Load balancing, reverse proxy |
+
+### Docker Deployment
+
+```bash
+# Start all services with infrastructure
+cd mcp-server
+docker-compose up -d
+
+# Production deployment
+python production_deployment.py
+```
+
+**Docker Compose Files:**
+- `mcp-server/docker-compose.yml` - Full multi-service deployment
+- `mcp-server/docker-compose.local.yml` - Local development variant
+- `docker-compose.yml` (root) - Simplified deployment
+
+### Scripts & Utilities
+
+| Script | Purpose | Usage |
+|--------|---------|-------|
+| `mcp-server/install_orchestrator.py` | Setup dependencies, create directories, verify infrastructure | `python install_orchestrator.py` |
+| `mcp-server/production_deployment.py` | Production-ready deployment with monitoring | `python production_deployment.py` |
+| `mcp-server/connectivity_tester.py` | Test inter-service connectivity | `python connectivity_tester.py` |
+| `mcp-server/verify_setup.py` | Verify Python dependencies and configuration | `python verify_setup.py` |
+| `mcp-server/bug_hunter.py` | Diagnostic tool for troubleshooting | `python bug_hunter.py` |
+| `mcp-server/test_*.py` | Test suites (pytest) | `pytest test_refinery_e2e.py` |
+
+---
+
+## Data Dependencies & Flow
+
+```
+User → Master Orchestrator (8000)
+ ↓
+ Kafka Events
+ ↓
+ ┌────┴────┬────────┬─────────┐
+ ↓ ↓ ↓ ↓
+EDA Agent Refinery ML Agent Dashboard
+ (8001) (8005) (8002) (3000)
+ ↓ ↓ ↓ ↓
+ Redis MongoDB MLflow WebSocket
+```
+
+**Key Data Stores:**
+- **MongoDB**: `deepline` database - workflow runs, task status, artifacts metadata
+- **Redis**: Caching layer, distributed locks, translation queue (`translation:q`)
+- **MLflow**: `mcp-server/mlruns/` - experiment tracking, model registry
+- **Local Filesystem**:
+ - `artifacts/` - workflow artifacts (reports, visualizations)
+ - `snapshots/` - data versioning
+ - `mcp-server/data_sources/` - uploaded datasets
+
+**Configuration:**
+- Primary: `mcp-server/config.yaml` (comprehensive settings)
+- Environment: `.env` files (production deployment)
+- Claude Desktop: `claude_desktop_config.json` (MCP integration)
+
+---
+
+## Risk Assessment Summary
+
+| Risk Level | Count | Mitigation Strategy |
+|------------|-------|-------------------|
+| **High (H)** | 1 | Model training - Resource quotas in `resource-quota.yaml`, GPU/CPU agent routing |
+| **Medium (M)** | 5 | Multiple agents - Health checks, deadlock monitoring, graceful cancellation, SLA monitoring |
+| **Low (L)** | 1 | Data quality - Read-only operations, comprehensive validation |
+
+---
+
+## Assumptions & Uncertainties
+
+### Assumptions (<20% uncertainty):
+1. **Primary use case**: Data scientists performing end-to-end analysis workflows via API/UI
+2. **Deployment target**: Single-node or small-cluster deployment with Docker Compose
+3. **Data scale**: Datasets up to 100k rows (configurable: `max_rows_processed: 100000`)
+4. **Concurrency**: 1 workflow at a time (configurable: `max_concurrent_workflows: 1`)
+5. **Authentication**: Development mode - production deployment includes API key support
+
+### Uncertainties Identified:
+1. **Missing Mission Definition Module** (mentioned in COMPREHENSIVE_SYSTEM_AUDIT_REPORT.md) - No implementation found in codebase
+2. **Data Governance Implementation** - Configuration exists but enforcement unclear
+3. **Kafka/MongoDB fallback behavior** - System continues with reduced functionality (per CONNECTIVITY_TEST_REPORT.md)
+4. **Multi-tenant support** - Not implemented; roadmap item per README.md
+5. **Hybrid API translation** - LLM integration unclear; references Claude/Llama2 but implementation incomplete
+
+---
+
+## Architecture Principles
+
+1. **Microservices**: Each agent is an independently deployable FastAPI service
+2. **Async-first**: Non-blocking translation, background task processing
+3. **Resilience**: Deadlock detection, automatic retry with exponential backoff, graceful cancellation
+4. **Observability**: OpenTelemetry integration, Kafka event streams, health endpoints
+5. **Configuration-driven**: YAML-based tuning, environment variable overrides
+6. **License Model**: Hybrid Apache 2.0 (clients/SDKs) + BUSL 1.1 (server core)
+
+---
+
+## Next Steps for Users
+
+1. **Quick Start**: Follow `docs/INSTALLATION.md` for Windows/Linux setup
+2. **First Workflow**: Upload CSV → Run EDA → Generate quality report (see `docs/EXAMPLES.md`)
+3. **Configuration**: Tune thresholds in `config.yaml` per `docs/CONFIGURATION.md`
+4. **Advanced**: Build custom workflows with DSL or natural language API
+5. **Production**: Use `production_deployment.py` for containerized deployment
+
+---
+
+*Report generated: 2025-10-13*
+*Repository: DeepExtrema/Sherlock-Multiagent-Data-Scientist*
+*Version: 2.1.0*
From 700de5c72b1d122762b473a9b68ddad670da6716 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Oct 2025 10:09:20 +0000
Subject: [PATCH 3/3] Add README.md for reports directory with usage guide
Co-authored-by: DeepExtrema <175066046+DeepExtrema@users.noreply.github.com>
---
reports/README.md | 130 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 130 insertions(+)
create mode 100644 reports/README.md
diff --git a/reports/README.md b/reports/README.md
new file mode 100644
index 0000000..2e7b5d5
--- /dev/null
+++ b/reports/README.md
@@ -0,0 +1,130 @@
+# Reports Directory
+
+This directory contains automated analysis and mapping reports for the Sherlock Multiagent Data Scientist platform.
+
+## Files
+
+### `purpose-map.md`
+**Human-readable purpose and surface mapping document**
+
+Contains:
+- Single-sentence MVP purpose statement
+- Feature & User Journey Mapping table with 7 journeys
+ - Journey description
+ - Primary files involved
+ - Entry commands
+ - Data dependencies
+ - Risk assessment (H/M/L)
+- Runnable surfaces documentation
+ - Core services with entrypoints
+ - Infrastructure dependencies
+ - Docker deployment options
+ - Scripts and utilities
+- Data dependencies & flow diagram
+- Risk assessment summary
+- Assumptions & uncertainties
+
+### `feature-surface.json`
+**Machine-readable feature surface mapping**
+
+Structured JSON containing:
+- Repository metadata
+- 7 features with detailed specifications:
+ - Endpoints (methods, paths, services, ports)
+ - Entry commands
+ - Data dependencies
+ - Risk levels and notes
+- Runnable surfaces:
+ - 6 services (FastAPI, React SPA, MCP protocol)
+ - 4 infrastructure components (MongoDB, Redis, Kafka, Nginx)
+ - Docker deployment configurations
+ - Utility scripts
+- Data flow and storage architecture
+- Configuration details
+- Risk summary
+- Assumptions (15% uncertainty)
+- Identified uncertainties
+
+## Generation Details
+
+- **Generated**: 2025-10-13
+- **Generator**: A0 Purpose & Surface Mapper
+- **Repository Version**: 2.1.0
+- **Uncertainty Level**: 15%
+
+## Usage
+
+### View Human-Readable Report
+```bash
+cat reports/purpose-map.md
+```
+
+### Parse Machine-Readable JSON
+```python
+import json
+
+with open('reports/feature-surface.json') as f:
+ data = json.load(f)
+
+# Access features
+for feature in data['features']:
+ print(f"{feature['name']}: {feature['risk_level']} risk")
+
+# Access services
+for service in data['runnable_surfaces']['services']:
+ print(f"{service['name']} on port {service['port']}")
+```
+
+### Quick Stats
+```bash
+# Validate JSON
+python -m json.tool reports/feature-surface.json > /dev/null && echo "✓ Valid JSON"
+
+# Count features
+python -c "import json; print(f\"{len(json.load(open('reports/feature-surface.json'))['features'])} features mapped\")"
+
+# Line counts
+wc -l reports/*
+```
+
+## Key Findings
+
+### MVP Purpose
+Sherlock is an **end-to-end, orchestrator-driven data science platform** that enables users to perform exploratory data analysis, data quality validation, feature engineering, and model training through microservices agents coordinated by a master orchestrator with real-time observability.
+
+### Architecture
+- **Type**: Microservices
+- **Services**: 6 (Master Orchestrator, EDA, Refinery, ML, MCP, Dashboard)
+- **Infrastructure**: MongoDB, Redis, Kafka, Nginx
+- **License**: Hybrid (Apache 2.0 for clients, BUSL 1.1 for server)
+
+### Risk Distribution
+- **High**: 1 feature (Model Training)
+- **Medium**: 5 features (Upload, EDA, Feature Eng, Orchestration, Dashboard)
+- **Low**: 1 feature (Data Quality)
+
+### Deployment
+Multiple options:
+1. Individual services: `python start_<service>.py` (e.g. `python start_master_orchestrator.py`, `python start_eda_service.py`)
+2. Docker Compose: `docker-compose up -d`
+3. Production: `python production_deployment.py`
+
+## Assumptions
+
+1. Primary use case: Data scientists performing end-to-end analysis workflows
+2. Deployment: Single-node or small-cluster with Docker Compose
+3. Data scale: Up to 100k rows (configurable)
+4. Concurrency: 1 workflow at a time (configurable)
+5. Mode: Development (production has API key support)
+
+## Identified Uncertainties
+
+1. Mission Definition Module (mentioned but not implemented)
+2. Data Governance enforcement details
+3. Kafka/MongoDB fallback behavior
+4. Multi-tenant support (roadmap item)
+5. Hybrid API/LLM integration completeness
+
+---
+
+*For questions about these reports, refer to the problem statement that generated them or examine the source documentation in `/docs`, `README.md`, and agent files in `/mcp-server`.*