From 23224bf5e0abad557ceeb605986a10b16e6f6ca9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Oct 2025 10:00:19 +0000
Subject: [PATCH 1/3] Initial plan


From a21475723e2a2729279dc94c196c3b81b2989440 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Oct 2025 10:08:03 +0000
Subject: [PATCH 2/3] Add purpose-map.md and feature-surface.json reports

Co-authored-by: DeepExtrema <175066046+DeepExtrema@users.noreply.github.com>
---
 reports/feature-surface.json | 621 +++++++++++++++++++++++++++++++++++
 reports/purpose-map.md       | 156 +++++++++
 2 files changed, 777 insertions(+)
 create mode 100644 reports/feature-surface.json
 create mode 100644 reports/purpose-map.md

diff --git a/reports/feature-surface.json b/reports/feature-surface.json
new file mode 100644
index 0000000..c635a8b
--- /dev/null
+++ b/reports/feature-surface.json
@@ -0,0 +1,621 @@
+{
+  "repository": {
+    "name": "Sherlock-Multiagent-Data-Scientist",
+    "owner": "DeepExtrema",
+    "version": "2.1.0",
+    "purpose": "End-to-end, orchestrator-driven data science platform enabling EDA, data quality validation, feature engineering, and model training through microservices agents with real-time observability",
+    "architecture": "microservices",
+    "license": {
+      "type": "hybrid",
+      "server": "BUSL-1.1",
+      "clients": "Apache-2.0"
+    }
+  },
+  "features": [
+    {
+      "id": "dataset-upload-load",
+      "name": "Upload & Load Dataset",
+      "description": "Upload CSV/Parquet datasets and load into memory for analysis",
+      "primary_files": [
+        "mcp-server/master_orchestrator_api.py",
+        "mcp-server/eda_agent.py",
+        "mcp-server/data_sources/"
+      ],
+      "endpoints": [
+        {
+          "method": "POST",
+          "path": "/datasets/upload",
+          "service": "master_orchestrator",
+          "port": 8000
+        },
+        {
+          "method": "POST",
+          "path": "/load_data",
+          "service": "eda_agent",
+          "port": 8001
+        }
+      ],
+      "entry_commands": [
+        "curl -X POST http://localhost:8000/datasets/upload -F \"file=@data.csv\" -F \"name=my_dataset\"",
+        "python start_master_orchestrator.py"
+      ],
+      "data_dependencies": [
+        "local_filesystem",
+        "mongodb_optional"
+      ],
+      "risk_level": "medium",
+      "risk_notes": "File upload validation needed; supports CSV, Parquet formats"
+    },
+    {
+      "id": "exploratory-data-analysis",
+      "name": "Exploratory Data Analysis (EDA)",
+      "description": "Statistical summaries, missing data analysis, correlation matrices, outlier detection, and publication-ready visualizations",
+      "primary_files": [
+        "mcp-server/eda_agent.py",
+        "mcp-server/server.py",
+        "docs/USER_GUIDE.md"
+      ],
+      "endpoints": [
+        {
+          "method": "GET",
+          "path": "/health",
+          "service": "eda_agent",
+          "port": 8001
+        },
+        {
+          "method": "POST",
+          "path": "/basic_info",
+          "service": "eda_agent",
+          "port": 8001
+        },
+        {
+          "method": "POST",
+          "path": "/statistical_summary",
+          "service": "eda_agent",
+          "port": 8001
+        },
+        {
+          "method": "POST",
+          "path": "/missing_data_analysis",
+          "service": "eda_agent",
+          "port": 8001
+        },
+        {
+          "method": "POST",
+          "path": "/detect_outliers",
+          "service": "eda_agent",
+          "port": 8001
+        },
+        {
+          "method": "POST",
+          "path": "/create_visualization",
+          "service": "eda_agent",
+          "port": 8001
+        }
+      ],
+      "entry_commands": [
+        "python start_eda_service.py",
+        "curl http://localhost:8001/health"
+      ],
+      "data_dependencies": [
+        "redis_caching",
+        "dataset_in_memory"
+      ],
+      "risk_level": "medium",
+      "risk_notes": "Large datasets may cause memory issues; 10k row sampling used for correlations"
+    },
+    {
+      "id": "data-quality-validation",
+      "name": "Data Quality Validation",
+      "description": "Schema consistency checks, missing value analysis, distribution validation, drift detection, and comprehensive quality reports",
+      "primary_files": [
+        "mcp-server/refinery_agent.py",
+        "mcp-server/REFINERY_AGENT_GUIDE.md",
+        "mcp-server/dq/"
+      ],
+      "endpoints": [
+        {
+          "method": "POST",
+          "path": "/execute",
+          "service": "refinery_agent",
+          "port": 8005
+        },
+        {
+          "method": "GET",
+          "path": "/health",
+          "service": "refinery_agent",
+          "port": 8005
+        },
+        {
+          "method": "GET",
+          "path": "/metrics",
+          "service": "refinery_agent",
+          "port": 8005
+        }
+      ],
+      "actions": [
+        "check_schema_consistency",
+        "check_missing_values",
+        "check_distributions",
+        "check_duplicates",
+        "check_leakage",
+        "check_drift",
+        "comprehensive_quality_report"
+      ],
+      "entry_commands": [
+        "python -m uvicorn refinery_agent:app --port 8005"
+      ],
+      "data_dependencies": [
+        "evidently_library",
+        "dataset_files"
+      ],
+      "risk_level": "low",
+      "risk_notes": "Read-only operations; comprehensive validation framework"
+    },
+    {
+      "id": "feature-engineering",
+      "name": "Feature Engineering Pipeline",
+      "description": "Imputation, scaling, encoding, datetime features, text vectorization, feature interactions, and selection",
+      "primary_files": [
+        "mcp-server/refinery_agent.py",
+        "mcp-server/fe/"
+      ],
+      "endpoints": [
+        {
+          "method": "POST",
+          "path": "/execute",
+          "service": "refinery_agent",
+          "port": 8005
+        }
+      ],
+      "actions": [
+        "assign_feature_roles",
+        "basic_impute_missing_values",
+        "basic_scale_numeric_features",
+        "basic_encode_categorical_features",
+        "basic_generate_datetime_features",
+        "basic_vectorise_text_features",
+        "basic_generate_interactions",
+        "basic_select_features",
+        "save_fe_pipeline",
+        "execute_feature_pipeline",
+        "advanced_impute_missing_values",
+        "advanced_encode_categorical_features",
+        "advanced_feature_selection"
+      ],
+      "entry_commands": [
+        "curl -X POST http://localhost:8005/execute -H \"Content-Type: application/json\" -d '{\"action\": \"basic_impute_missing_values\", \"params\": {\"data_path\": \"data.csv\"}}'"
+      ],
+      "data_dependencies": [
+        "scikit_learn_pipelines",
+        "feature_metadata"
+      ],
+      "risk_level": "medium",
+      "risk_notes": "Transformation logic; mode validation prevents accidental transforms"
+    },
+    {
+      "id": "model-training-evaluation",
+      "name": "Model Training & Evaluation",
+      "description": "Class imbalance analysis, train/validation/test splits, cross-validation, baseline models, experiment tracking with MLflow",
+      "primary_files": [
+        "mcp-server/ml_agent.py",
+        "mcp-server/ML_WORKFLOW_GUIDE.md",
+        "mcp-server/mlruns/"
+      ],
+      "endpoints": [
+        {
+          "method": "GET",
+          "path": "/health",
+          "service": "ml_agent",
+          "port": 8002
+        },
+        {
+          "method": "POST",
+          "path": "/class_imbalance",
+          "service": "ml_agent",
+          "port": 8002
+        },
+        {
+          "method": "POST",
+          "path": "/train_validation_test",
+          "service": "ml_agent",
+          "port": 8002
+        },
+        {
+          "method": "POST",
+          "path": "/baseline_sanity",
+          "service": "ml_agent",
+          "port": 8002
+        },
+        {
+          "method": "POST",
+          "path": "/experiment_tracking",
+          "service": "ml_agent",
+          "port": 8002
+        }
+      ],
+      "entry_commands": [
+        "python -m uvicorn ml_agent:app --port 8002"
+      ],
+      "data_dependencies": [
+        "mlflow_backend",
+        "training_datasets",
+        "scikit_learn"
+      ],
+      "risk_level": "high",
+      "risk_notes": "Model persistence, hyperparameter tuning; GPU/CPU resource management required"
+    },
+    {
+      "id": "workflow-orchestration",
+      "name": "Workflow Orchestration",
+      "description": "Task coordination, dependency management, retry logic, deadlock monitoring, graceful cancellation, artifact management",
+      "primary_files": [
+        "mcp-server/master_orchestrator_api.py",
+        "mcp-server/orchestrator/workflow_manager.py",
+        "mcp-server/orchestrator/agent_registry.py",
+        "mcp-server/orchestrator/sla_monitor.py"
+      ],
+      "endpoints": [
+        {
+          "method": "GET",
+          "path": "/health",
+          "service": "master_orchestrator",
+          "port": 8000
+        },
+        {
+          "method": "POST",
+          "path": "/workflows/start",
+          "service": "master_orchestrator",
+          "port": 8000
+        },
+        {
+          "method": "GET",
+          "path": "/runs/{run_id}/status",
+          "service": "master_orchestrator",
+          "port": 8000
+        },
+        {
+          "method": "GET",
+          "path": "/runs/{run_id}/artifacts",
+          "service": "master_orchestrator",
+          "port": 8000
+        },
+        {
+          "method": "DELETE",
+          "path": "/runs/{run_id}",
+          "service": "master_orchestrator",
+          "port": 8000
+        }
+      ],
+      "entry_commands": [
+        "python start_master_orchestrator.py",
+        "curl http://localhost:8000/docs"
+      ],
+      "data_dependencies": [
+        "mongodb_runs",
+        "redis_locks",
+        "kafka_events"
+      ],
+      "risk_level": "medium",
+      "risk_notes": "Deadlock monitoring, graceful cancellation; task dependencies managed"
+    },
+    {
+      "id": "realtime-observability",
+      "name": "Real-time Observability Dashboard",
+      "description": "React-based dashboard with live event streaming, workflow status, agent health monitoring, and performance charts",
+      "primary_files": [
+        "dashboard-ui/src/",
+        "dashboard-ui/package.json"
+      ],
+      "endpoints": [
+        {
+          "method": "WebSocket",
+          "path": "/ws/events",
+          "service": "master_orchestrator",
+          "port": 8000
+        }
+      ],
+      "entry_commands": [
+        "cd dashboard-ui && npm run dev",
+        "open http://localhost:3000"
+      ],
+      "data_dependencies": [
+        "fastapi_websocket",
+        "kafka_event_stream",
+        "recent_runs_api"
+      ],
+      "risk_level": "medium",
+      "risk_notes": "WebSocket connection stability; event streaming from Kafka"
+    }
+  ],
+  "runnable_surfaces": {
+    "services": [
+      {
+        "name": "master_orchestrator",
+        "type": "fastapi_service",
+        "entrypoint": "mcp-server/start_master_orchestrator.py",
+        "alternative_entrypoint": "python master_orchestrator_api.py",
+        "port": 8000,
+        "health_endpoint": "/health",
+        "api_docs": "/docs",
+        "description": "Workflow coordination, task dispatch, artifact management",
+        "startup_command": "python start_master_orchestrator.py"
+      },
+      {
+        "name": "eda_agent",
+        "type": "fastapi_service",
+        "entrypoint": "mcp-server/start_eda_service.py",
+        "alternative_entrypoint": "python eda_agent.py",
+        "port": 8001,
+        "health_endpoint": "/health",
+        "api_docs": "/docs",
+        "description": "Data loading, statistical analysis, visualization, outlier detection",
+        "startup_command": "python start_eda_service.py"
+      },
+      {
+        "name": "refinery_agent",
+        "type": "fastapi_service",
+        "entrypoint": "mcp-server/refinery_agent.py",
+        "port": 8005,
+        "health_endpoint": "/health",
+        "api_docs": "/docs",
+        "description": "Data quality validation, feature engineering, drift detection",
+        "startup_command": "python -m uvicorn refinery_agent:app --port 8005"
+      },
+      {
+        "name": "ml_agent",
+        "type": "fastapi_service",
+        "entrypoint": "mcp-server/ml_agent.py",
+        "port": 8002,
+        "health_endpoint": "/health",
+        "api_docs": "/docs",
+        "description": "Model training, cross-validation, baseline checks, experiment tracking",
+        "startup_command": "python -m uvicorn ml_agent:app --port 8002"
+      },
+      {
+        "name": "mcp_server",
+        "type": "mcp_protocol",
+        "entrypoint": "mcp-server/server.py",
+        "alternative_entrypoint": "python launch_server.py",
+        "port": null,
+        "health_endpoint": null,
+        "description": "Claude Desktop integration for conversational data analysis",
+        "startup_command": "python launch_server.py"
+      },
+      {
+        "name": "dashboard_ui",
+        "type": "react_spa",
+        "entrypoint": "dashboard-ui/",
+        "port": 3000,
+        "health_endpoint": null,
+        "description": "React-based real-time monitoring interface",
+        "startup_command": "cd dashboard-ui && npm run dev"
+      }
+    ],
+    "infrastructure": [
+      {
+        "name": "mongodb",
+        "type": "database",
+        "startup_command": "docker-compose up -d mongodb",
+        "required_by": [
+          "master_orchestrator"
+        ],
+        "purpose": "Run persistence, task history"
+      },
+      {
+        "name": "redis",
+        "type": "cache",
+        "startup_command": "docker-compose up -d redis",
+        "required_by": [
+          "all_agents"
+        ],
+        "purpose": "Caching, distributed locks, translation queue"
+      },
+      {
+        "name": "kafka",
+        "type": "message_queue",
+        "startup_command": "docker-compose up -d kafka",
+        "required_by": [
+          "master_orchestrator",
+          "dashboard_ui"
+        ],
+        "purpose": "Event streaming, task routing"
+      },
+      {
+        "name": "nginx",
+        "type": "reverse_proxy",
+        "startup_command": "docker-compose up -d nginx",
+        "required_by": [
+          "production_deployment"
+        ],
+        "purpose": "Load balancing, reverse proxy",
+        "optional": true
+      }
+    ],
+    "docker_deployment": [
+      {
+        "name": "full_stack",
+        "compose_file": "mcp-server/docker-compose.yml",
+        "startup_command": "cd mcp-server && docker-compose up -d",
+        "description": "Full multi-service deployment with infrastructure"
+      },
+      {
+        "name": "local_development",
+        "compose_file": "mcp-server/docker-compose.local.yml",
+        "startup_command": "cd mcp-server && docker-compose -f docker-compose.local.yml up -d",
+        "description": "Local development variant"
+      },
+      {
+        "name": "production",
+        "script": "mcp-server/production_deployment.py",
+        "startup_command": "python production_deployment.py",
+        "description": "Production-ready deployment with monitoring"
+      }
+    ],
+    "scripts": [
+      {
+        "name": "install_orchestrator",
+        "path": "mcp-server/install_orchestrator.py",
+        "purpose": "Setup dependencies, create directories, verify infrastructure",
+        "command": "python install_orchestrator.py"
+      },
+      {
+        "name": "production_deployment",
+        "path": "mcp-server/production_deployment.py",
+        "purpose": "Production-ready deployment with monitoring",
+        "command": "python production_deployment.py"
+      },
+      {
+        "name": "connectivity_tester",
+        "path": "mcp-server/connectivity_tester.py",
+        "purpose": "Test inter-service connectivity",
+        "command": "python connectivity_tester.py"
+      },
+      {
+        "name": "verify_setup",
+        "path": "mcp-server/verify_setup.py",
+        "purpose": "Verify Python dependencies and configuration",
+        "command": "python verify_setup.py"
+      },
+      {
+        "name": "bug_hunter",
+        "path": "mcp-server/bug_hunter.py",
+        "purpose": "Diagnostic tool for troubleshooting",
+        "command": "python bug_hunter.py"
+      }
+    ]
+  },
+  "data_flow": {
+    "entry_point": "master_orchestrator:8000",
+    "event_bus": "kafka",
+    "storage": [
+      {
+        "type": "mongodb",
+        "database": "deepline",
+        "collections": [
+          "workflow_runs",
+          "task_status",
+          "artifacts_metadata"
+        ]
+      },
+      {
+        "type": "redis",
+        "purpose": [
+          "caching",
+          "distributed_locks",
+          "translation_queue"
+        ],
+        "queue_name": "translation:q"
+      },
+      {
+        "type": "mlflow",
+        "path": "mcp-server/mlruns/",
+        "purpose": [
+          "experiment_tracking",
+          "model_registry"
+        ]
+      },
+      {
+        "type": "filesystem",
+        "paths": [
+          {
+            "path": "artifacts/",
+            "purpose": "workflow_artifacts_reports_visualizations"
+          },
+          {
+            "path": "snapshots/",
+            "purpose": "data_versioning"
+          },
+          {
+            "path": "mcp-server/data_sources/",
+            "purpose": "uploaded_datasets"
+          }
+        ]
+      }
+    ]
+  },
+  "configuration": {
+    "primary": "mcp-server/config.yaml",
+    "environment": ".env",
+    "claude_integration": "claude_desktop_config.json",
+    "key_settings": {
+      "max_concurrent_workflows": 1,
+      "max_rows_processed": 100000,
+      "correlation_sample_size": 10000,
+      "deadlock_check_interval_s": 60,
+      "task_timeout_s": 600,
+      "workflow_timeout_s": 3600
+    }
+  },
+  "risk_summary": {
+    "high": {
+      "count": 1,
+      "features": [
+        "model-training-evaluation"
+      ],
+      "mitigation": "Resource quotas, GPU/CPU agent routing"
+    },
+    "medium": {
+      "count": 5,
+      "features": [
+        "dataset-upload-load",
+        "exploratory-data-analysis",
+        "feature-engineering",
+        "workflow-orchestration",
+        "realtime-observability"
+      ],
+      "mitigation": "Health checks, deadlock monitoring, graceful cancellation, SLA monitoring"
+    },
+    "low": {
+      "count": 1,
+      "features": [
+        "data-quality-validation"
+      ],
+      "mitigation": "Read-only operations, comprehensive validation"
+    }
+  },
+  "assumptions": {
+    "confidence_level": "high",
+    "uncertainty_percentage": 15,
+    "list": [
+      "Primary use case: Data scientists performing end-to-end analysis workflows via API/UI",
+      "Deployment target: Single-node or small-cluster deployment with Docker Compose",
+      "Data scale: Datasets up to 100k rows (configurable)",
+      "Concurrency: 1 workflow at a time (configurable)",
+      "Authentication: Development mode - production deployment includes API key support"
+    ]
+  },
+  "uncertainties": [
+    {
+      "item": "Missing Mission Definition Module",
+      "details": "Mentioned in COMPREHENSIVE_SYSTEM_AUDIT_REPORT.md but no implementation found",
+      "impact": "low"
+    },
+    {
+      "item": "Data Governance Implementation",
+      "details": "Configuration exists but enforcement unclear",
+      "impact": "medium"
+    },
+    {
+      "item": "Kafka/MongoDB fallback behavior",
+      "details": "System continues with reduced functionality per CONNECTIVITY_TEST_REPORT.md",
+      "impact": "low"
+    },
+    {
+      "item": "Multi-tenant support",
+      "details": "Not implemented; roadmap item per README.md",
+      "impact": "medium"
+    },
+    {
+      "item": "Hybrid API translation",
+      "details": "LLM integration references Claude/Llama2 but implementation incomplete",
+      "impact": "medium"
+    }
+  ],
+  "metadata": {
+    "generated_date": "2025-10-13",
+    "generator": "A0 Purpose & Surface Mapper",
+    "repository_version": "2.1.0",
+    "last_updated": "2024-01"
+  }
+}
diff --git a/reports/purpose-map.md b/reports/purpose-map.md
new file mode 100644
index 0000000..a67e811
--- /dev/null
+++ b/reports/purpose-map.md
@@ -0,0 +1,156 @@
+# Purpose & Surface Map: Sherlock Multiagent Data Scientist
+
+## MVP Purpose
+
+**Sherlock is an end-to-end, orchestrator-driven data science platform that enables users to perform exploratory data analysis, data quality validation, feature engineering, and model training through microservices agents coordinated by a master orchestrator with real-time observability.**
+
+---
+
+## Feature & User Journey Mapping
+
+| Journey | Primary Files | Entry Command | Data Dependencies | Risk |
+|---------|--------------|---------------|-------------------|------|
+| **1. Upload & Load Dataset** | `mcp-server/master_orchestrator_api.py` (POST /datasets/upload)<br>`mcp-server/eda_agent.py` (POST /load_data)<br>`mcp-server/data_sources/` | `curl -X POST http://localhost:8000/datasets/upload -F "file=@data.csv" -F "name=my_dataset"`<br>OR<br>`python start_master_orchestrator.py` (then API call) | MongoDB (optional), Local filesystem | **M** - File upload validation needed; supports CSV, Parquet |
+| **2. Exploratory Data Analysis (EDA)** | `mcp-server/eda_agent.py` (/basic_info, /statistical_summary, /missing_data_analysis, /detect_outliers)<br>`mcp-server/server.py` (MCP tools)<br>`docs/USER_GUIDE.md` | `python start_eda_service.py`<br>Service: http://localhost:8001<br>Health: http://localhost:8001/health | Redis (caching), Dataset loaded in memory | **M** - Large datasets may cause memory issues; 10k row sampling used for correlations |
+| **3. Data Quality Validation** | `mcp-server/refinery_agent.py` (POST /execute)<br>Actions: check_schema_consistency, check_missing_values, check_distributions, check_drift<br>`mcp-server/REFINERY_AGENT_GUIDE.md`<br>`mcp-server/dq/` modules | `python -m uvicorn refinery_agent:app --port 8005`<br>OR via orchestrator workflow | Evidently library, Dataset files | **L** - Read-only operations; comprehensive validation framework |
+| **4. Feature Engineering Pipeline** | `mcp-server/refinery_agent.py` (feature engineering actions)<br>`mcp-server/fe/` modules<br>Actions: basic_impute_missing_values, basic_scale_numeric_features, basic_encode_categorical_features | Via POST /execute to http://localhost:8005<br>Dual-mode operation (data_quality vs feature_engineering) | Scikit-learn pipelines, Feature metadata | **M** - Transformation logic; mode validation prevents accidental transforms |
+| **5. Model Training & Evaluation** | `mcp-server/ml_agent.py` (POST /train_validation_test, /class_imbalance, /baseline_sanity)<br>`mcp-server/ML_WORKFLOW_GUIDE.md`<br>`mcp-server/mlruns/` (MLflow) | `python -m uvicorn ml_agent:app --port 8002`<br>Service: http://localhost:8002 | MLflow backend, Training datasets, Scikit-learn | **H** - Model persistence, hyperparameter tuning; GPU/CPU resource management |
+| **6. Workflow Orchestration** | `mcp-server/master_orchestrator_api.py` (POST /workflows/start, GET /runs/{run_id}/status)<br>`mcp-server/orchestrator/workflow_manager.py`<br>`mcp-server/orchestrator/` (agent_registry, sla_monitor) | `python start_master_orchestrator.py`<br>API: http://localhost:8000<br>Docs: http://localhost:8000/docs | MongoDB (run persistence), Redis (locks), Kafka (events) | **M** - Deadlock monitoring, graceful cancellation; task dependencies managed |
+| **7. Real-time Observability Dashboard** | `dashboard-ui/src/` (React SPA)<br>`dashboard-ui/package.json`<br>Backend: WebSocket `/ws/events` | `cd dashboard-ui && npm run dev`<br>UI: http://localhost:3000 | FastAPI WebSocket, Kafka event stream, Recent runs API | **M** - WebSocket connection stability; event streaming from Kafka |
+
+---
+
+## Runnable Surfaces & Entrypoints
+
+### Core Services (Microservices Architecture)
+
+| Service | Entrypoint Script | Port | Health Check | Description |
+|---------|------------------|------|-------------|-------------|
+| **Master Orchestrator** | `mcp-server/start_master_orchestrator.py`<br>OR `python master_orchestrator_api.py` | 8000 | `/health` | Workflow coordination, task dispatch, artifact management |
+| **EDA Agent** | `mcp-server/start_eda_service.py`<br>OR `python eda_agent.py` | 8001 | `/health` | Data loading, statistical analysis, visualization, outlier detection |
+| **Refinery Agent** | `python -m uvicorn refinery_agent:app --port 8005`<br>OR via Docker | 8005 | `/health` | Data quality validation, feature engineering, drift detection |
+| **ML Agent** | `python -m uvicorn ml_agent:app --port 8002` | 8002 | `/health` | Model training, cross-validation, baseline checks, experiment tracking |
+| **MCP Server** | `mcp-server/server.py` (MCP protocol)<br>OR `python launch_server.py` | N/A | N/A | Claude Desktop integration for conversational data analysis |
+| **Dashboard UI** | `cd dashboard-ui && npm run dev` | 3000 | N/A | React-based real-time monitoring interface |
+
+### Infrastructure Dependencies
+
+| Component | Start Command | Required By | Notes |
+|-----------|--------------|-------------|-------|
+| **MongoDB** | `docker-compose up -d mongodb` | Master Orchestrator | Run persistence, task history |
+| **Redis** | `docker-compose up -d redis` | All agents | Caching, distributed locks, translation queue |
+| **Kafka** | `docker-compose up -d kafka` | Master Orchestrator, Dashboard | Event streaming, task routing |
+| **Nginx** (Optional) | `docker-compose up -d nginx` | Production deployment | Load balancing, reverse proxy |
+
+### Docker Deployment
+
+```bash
+# Start all services with infrastructure
+cd mcp-server
+docker-compose up -d
+
+# Production deployment
+python production_deployment.py
+```
+
+**Docker Compose Files:**
+- `mcp-server/docker-compose.yml` - Full multi-service deployment
+- `mcp-server/docker-compose.local.yml` - Local development variant
+- `docker-compose.yml` (root) - Simplified deployment
+
+### Scripts & Utilities
+
+| Script | Purpose | Usage |
+|--------|---------|-------|
+| `mcp-server/install_orchestrator.py` | Setup dependencies, create directories, verify infrastructure | `python install_orchestrator.py` |
+| `mcp-server/production_deployment.py` | Production-ready deployment with monitoring | `python production_deployment.py` |
+| `mcp-server/connectivity_tester.py` | Test inter-service connectivity | `python connectivity_tester.py` |
+| `mcp-server/verify_setup.py` | Verify Python dependencies and configuration | `python verify_setup.py` |
+| `mcp-server/bug_hunter.py` | Diagnostic tool for troubleshooting | `python bug_hunter.py` |
+| `mcp-server/test_*.py` | Test suites (pytest) | `pytest test_refinery_e2e.py` |
+
+---
+
+## Data Dependencies & Flow
+
+```
+User → Master Orchestrator (8000)
+         ↓
+      Kafka Events
+         ↓
+    ┌────┴────┬────────┬─────────┐
+    ↓         ↓        ↓         ↓
+EDA Agent  Refinery  ML Agent  Dashboard
+  (8001)    (8005)   (8002)    (3000)
+    ↓         ↓        ↓         ↓
+  Redis    MongoDB   MLflow   WebSocket
+```
+
+**Key Data Stores:**
+- **MongoDB**: `deepline` database - workflow runs, task status, artifacts metadata
+- **Redis**: Caching layer, distributed locks, translation queue (`translation:q`)
+- **MLflow**: `mcp-server/mlruns/` - experiment tracking, model registry
+- **Local Filesystem**: 
+  - `artifacts/` - workflow artifacts (reports, visualizations)
+  - `snapshots/` - data versioning
+  - `mcp-server/data_sources/` - uploaded datasets
+
+**Configuration:**
+- Primary: `mcp-server/config.yaml` (comprehensive settings)
+- Environment: `.env` files (production deployment)
+- Claude Desktop: `claude_desktop_config.json` (MCP integration)
+
+---
+
+## Risk Assessment Summary
+
+| Risk Level | Count | Mitigation Strategy |
+|------------|-------|-------------------|
+| **High (H)** | 1 | Model training - Resource quotas in `resource-quota.yaml`, GPU/CPU agent routing |
+| **Medium (M)** | 5 | Multiple agents - Health checks, deadlock monitoring, graceful cancellation, SLA monitoring |
+| **Low (L)** | 1 | Data quality - Read-only operations, comprehensive validation |
+
+---
+
+## Assumptions & Uncertainties
+
+### Assumptions (<20% uncertainty):
+1. **Primary use case**: Data scientists performing end-to-end analysis workflows via API/UI
+2. **Deployment target**: Single-node or small-cluster deployment with Docker Compose
+3. **Data scale**: Datasets up to 100k rows (configurable: `max_rows_processed: 100000`)
+4. **Concurrency**: 1 workflow at a time (configurable: `max_concurrent_workflows: 1`)
+5. **Authentication**: Development mode - production deployment includes API key support
+
+### Uncertainties Identified:
+1. **Missing Mission Definition Module** (mentioned in COMPREHENSIVE_SYSTEM_AUDIT_REPORT.md) - No implementation found in codebase
+2. **Data Governance Implementation** - Configuration exists but enforcement unclear
+3. **Kafka/MongoDB fallback behavior** - System continues with reduced functionality (per CONNECTIVITY_TEST_REPORT.md)
+4. **Multi-tenant support** - Not implemented; roadmap item per README.md
+5. **Hybrid API translation** - LLM integration unclear; references Claude/Llama2 but implementation incomplete
+
+---
+
+## Architecture Principles
+
+1. **Microservices**: Each agent is an independently deployable FastAPI service
+2. **Async-first**: Non-blocking translation, background task processing
+3. **Resilience**: Deadlock detection, automatic retry with exponential backoff, graceful cancellation
+4. **Observability**: OpenTelemetry integration, Kafka event streams, health endpoints
+5. **Configuration-driven**: YAML-based tuning, environment variable overrides
+6. **License Model**: Hybrid Apache 2.0 (clients/SDKs) + BUSL 1.1 (server core)
+
+---
+
+## Next Steps for Users
+
+1. **Quick Start**: Follow `docs/INSTALLATION.md` for Windows/Linux setup
+2. **First Workflow**: Upload CSV → Run EDA → Generate quality report (see `docs/EXAMPLES.md`)
+3. **Configuration**: Tune thresholds in `config.yaml` per `docs/CONFIGURATION.md`
+4. **Advanced**: Build custom workflows with DSL or natural language API
+5. **Production**: Use `production_deployment.py` for containerized deployment
+
+---
+
+*Report generated: 2025-10-13*  
+*Repository: DeepExtrema/Sherlock-Multiagent-Data-Scientist*  
+*Version: 2.1.0*

From 700de5c72b1d122762b473a9b68ddad670da6716 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Oct 2025 10:09:20 +0000
Subject: [PATCH 3/3] Add README.md for reports directory with usage guide

Co-authored-by: DeepExtrema <175066046+DeepExtrema@users.noreply.github.com>
---
 reports/README.md | 130 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 reports/README.md

diff --git a/reports/README.md b/reports/README.md
new file mode 100644
index 0000000..2e7b5d5
--- /dev/null
+++ b/reports/README.md
@@ -0,0 +1,130 @@
+# Reports Directory
+
+This directory contains automated analysis and mapping reports for the Sherlock Multiagent Data Scientist platform.
+
+## Files
+
+### `purpose-map.md`
+**Human-readable purpose and surface mapping document**
+
+Contains:
+- Single-sentence MVP purpose statement
+- Feature & User Journey Mapping table with 7 journeys
+  - Journey description
+  - Primary files involved
+  - Entry commands
+  - Data dependencies
+  - Risk assessment (H/M/L)
+- Runnable surfaces documentation
+  - Core services with entrypoints
+  - Infrastructure dependencies
+  - Docker deployment options
+  - Scripts and utilities
+- Data dependencies & flow diagram
+- Risk assessment summary
+- Assumptions & uncertainties
+
+### `feature-surface.json`
+**Machine-readable feature surface mapping**
+
+Structured JSON containing:
+- Repository metadata
+- 7 features with detailed specifications:
+  - Endpoints (methods, paths, services, ports)
+  - Entry commands
+  - Data dependencies
+  - Risk levels and notes
+- Runnable surfaces:
+  - 6 services (FastAPI, React SPA, MCP protocol)
+  - 4 infrastructure components (MongoDB, Redis, Kafka, Nginx)
+  - Docker deployment configurations
+  - Utility scripts
+- Data flow and storage architecture
+- Configuration details
+- Risk summary
+- Assumptions (15% uncertainty)
+- Identified uncertainties
+
+## Generation Details
+
+- **Generated**: 2025-10-13
+- **Generator**: A0 Purpose & Surface Mapper
+- **Repository Version**: 2.1.0
+- **Uncertainty Level**: 15%
+
+## Usage
+
+### View Human-Readable Report
+```bash
+cat reports/purpose-map.md
+```
+
+### Parse Machine-Readable JSON
+```python
+import json
+
+with open('reports/feature-surface.json') as f:
+    data = json.load(f)
+
+# Access features
+for feature in data['features']:
+    print(f"{feature['name']}: {feature['risk_level']} risk")
+
+# Access services
+for service in data['runnable_surfaces']['services']:
+    print(f"{service['name']} on port {service['port']}")
+```
+
+### Quick Stats
+```bash
+# Validate JSON
+python -m json.tool reports/feature-surface.json > /dev/null && echo "✓ Valid JSON"
+
+# Count features
+python -c "import json; print(f\"{len(json.load(open('reports/feature-surface.json'))['features'])} features mapped\")"
+
+# Line counts
+wc -l reports/*
+```
+
+## Key Findings
+
+### MVP Purpose
+Sherlock is an **end-to-end, orchestrator-driven data science platform** that enables users to perform exploratory data analysis, data quality validation, feature engineering, and model training through microservices agents coordinated by a master orchestrator with real-time observability.
+
+### Architecture
+- **Type**: Microservices
+- **Services**: 6 (Master Orchestrator, EDA, Refinery, ML, MCP, Dashboard)
+- **Infrastructure**: MongoDB, Redis, Kafka, Nginx
+- **License**: Hybrid (Apache 2.0 for clients, BUSL 1.1 for server)
+
+### Risk Distribution
+- **High**: 1 feature (Model Training)
+- **Medium**: 5 features (Upload, EDA, Feature Eng, Orchestration, Dashboard)
+- **Low**: 1 feature (Data Quality)
+
+### Deployment
+Multiple options:
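+1. Individual services: `python start_master_orchestrator.py`, `python start_eda_service.py`, or `python -m uvicorn <agent>:app --port <port>` for the Refinery and ML agents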
+2. Docker Compose: `docker-compose up -d`
+3. Production: `python production_deployment.py`
+
+## Assumptions
+
+1. Primary use case: Data scientists performing end-to-end analysis workflows
+2. Deployment: Single-node or small-cluster with Docker Compose
+3. Data scale: Up to 100k rows (configurable)
+4. Concurrency: 1 workflow at a time (configurable)
+5. Mode: Development (production has API key support)
+
+## Identified Uncertainties
+
+1. Mission Definition Module (mentioned but not implemented)
+2. Data Governance enforcement details
+3. Kafka/MongoDB fallback behavior
+4. Multi-tenant support (roadmap item)
+5. Hybrid API/LLM integration completeness
+
+---
+
+*For questions about these reports, refer to the original task description (the issue or pull request that requested them) or examine the source documentation in `docs/`, `README.md`, and the agent files in `mcp-server/`.*