From 35c4998852221830d7bdf431dbb53c8d174fa995 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 13:33:13 +0800 Subject: [PATCH 01/26] refactor: move token and instance-name to top-level command --- src/datapilot/cli/main.py | 11 +++++- src/datapilot/core/platforms/dbt/cli/cli.py | 37 +++++++++++++-------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index f0f1796..cd8261a 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -5,8 +5,17 @@ @click.group() -def datapilot(): +@click.option("--token", required=False, help="Your API token for authentication.") +@click.option("--instance-name", required=False, help="Your tenant ID.") +@click.option("--backend-url", required=False, help="Altimate's Backend URL", default="https://api.myaltimate.com") +@click.pass_context +def datapilot(ctx, token, instance_name, backend_url): """Altimate CLI for DBT project management.""" + # Store common options in context + ctx.ensure_object(dict) + ctx.obj['token'] = token + ctx.obj['instance_name'] = instance_name + ctx.obj['backend_url'] = backend_url datapilot.add_command(dbt) diff --git a/src/datapilot/core/platforms/dbt/cli/cli.py b/src/datapilot/core/platforms/dbt/cli/cli.py index 8472ebf..233f6b1 100644 --- a/src/datapilot/core/platforms/dbt/cli/cli.py +++ b/src/datapilot/core/platforms/dbt/cli/cli.py @@ -24,13 +24,14 @@ # New dbt group @click.group() -def dbt(): +@click.pass_context +def dbt(ctx): """DBT specific commands.""" + # Ensure context object exists + ctx.ensure_object(dict) @dbt.command("project-health") -@click.option("--token", required=False, help="Your API token for authentication.") -@click.option("--instance-name", required=False, help="Your tenant ID.") @click.option( "--manifest-path", required=True, @@ -57,21 +58,24 @@ def dbt(): default=None, help="Selective model testing. Specify one or more models to run tests on.", ) -@click.option("--backend-url", required=False, help="Altimate's Backend URL", default="https://api.myaltimate.com") +@click.pass_context def project_health( - token, - instance_name, + ctx, manifest_path, catalog_path, config_path=None, config_name=None, select=None, - backend_url="https://api.myaltimate.com", ): """ Validate the DBT project's configuration and structure. :param manifest_path: Path to the DBT manifest file. """ + # Get common options from parent context + token = ctx.parent.obj.get('token') + instance_name = ctx.parent.obj.get('instance_name') + backend_url = ctx.parent.obj.get('backend_url') + config = None if config_path: config = load_config(config_path) @@ -131,25 +135,32 @@ def project_health( @dbt.command("onboard") -@click.option("--token", prompt="API Token", help="Your API token for authentication.") -@click.option("--instance-name", prompt="Instance Name", help="Your tenant ID.") @click.option("--dbt_core_integration_id", prompt="DBT Core Integration ID", help="DBT Core Integration ID") @click.option( "--dbt_core_integration_environment", default="PROD", prompt="DBT Core Integration Environment", help="DBT Core Integration Environment" ) @click.option("--manifest-path", required=True, prompt="Manifest Path", help="Path to the manifest file.") @click.option("--catalog-path", required=False, prompt=False, help="Path to the catalog file.") -@click.option("--backend-url", required=False, help="Altimate's Backend URL", default="https://api.myaltimate.com") +@click.pass_context def onboard( - token, - instance_name, + ctx, dbt_core_integration_id, dbt_core_integration_environment, manifest_path, catalog_path, - backend_url="https://api.myaltimate.com", ): """Onboard a manifest file to DBT.""" + # Get common options from parent context + token = ctx.parent.obj.get('token') + instance_name = ctx.parent.obj.get('instance_name') + backend_url = ctx.parent.obj.get('backend_url') + + # For onboard command, token and instance_name are required + if not token: + token = click.prompt("API Token") + if not instance_name: + instance_name = click.prompt("Instance Name") + check_token_and_instance(token, instance_name) if not validate_credentials(token, backend_url, instance_name): From 98e123217359a0ddc857772f181e18108bab110c Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 13:37:41 +0800 Subject: [PATCH 02/26] feat: add config file support with env var substitution --- src/datapilot/cli/main.py | 82 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index cd8261a..21bc2f1 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -1,9 +1,54 @@ +import json +import os +import re +from pathlib import Path + import click +from dotenv import load_dotenv from datapilot.core.mcp_utils.mcp import mcp from datapilot.core.platforms.dbt.cli.cli import dbt +def load_config_from_file(): + """Load configuration from ~/.altimate/altimate.json if it exists.""" + config_path = Path.home() / ".altimate" / "altimate.json" + + if not config_path.exists(): + return {} + + try: + with open(config_path, 'r') as f: + config = json.load(f) + return config + except (json.JSONDecodeError, IOError) as e: + click.echo(f"Warning: Failed to load config from {config_path}: {e}", err=True) + return {} + + +def substitute_env_vars(value): + """Replace ${env:ENV_VARIABLE} patterns with actual environment variable values.""" + if not isinstance(value, str): + return value + + # Pattern to match ${env:VARIABLE_NAME} + pattern = r'\$\{env:([^}]+)\}' + + def replacer(match): + env_var = match.group(1) + return os.environ.get(env_var, match.group(0)) + + return re.sub(pattern, replacer, value) + + +def process_config(config): + """Process configuration dictionary to substitute environment variables.""" + processed = {} + for key, value in config.items(): + processed[key] = substitute_env_vars(value) + return processed + + @click.group() @click.option("--token", required=False, help="Your API token for authentication.") @click.option("--instance-name", required=False, help="Your tenant ID.") @@ -11,11 +56,40 @@ @click.pass_context def datapilot(ctx, token, instance_name, backend_url): """Altimate CLI for DBT project management.""" - # Store common options in context + # Load .env file from current directory if it exists + load_dotenv() + + # Load configuration from file + file_config = load_config_from_file() + file_config = process_config(file_config) + + # Map config file keys to CLI option names + config_mapping = { + 'altimateApiKey': 'token', + 'altimateInstanceName': 'instance_name', + 'altimateUrl': 'backend_url' + } + + # Store common options in context, with CLI args taking precedence ctx.ensure_object(dict) - ctx.obj['token'] = token - ctx.obj['instance_name'] = instance_name - ctx.obj['backend_url'] = backend_url + + # Apply file config first + for file_key, cli_key in config_mapping.items(): + if file_key in file_config: + ctx.obj[cli_key] = file_config[file_key] + + # Override with CLI arguments if provided + if token is not None: + ctx.obj['token'] = token + if instance_name is not None: + ctx.obj['instance_name'] = instance_name + if backend_url != "https://api.myaltimate.com": # Only override if not default + ctx.obj['backend_url'] = backend_url + + # Set defaults if nothing was provided + ctx.obj.setdefault('token', None) + ctx.obj.setdefault('instance_name', None) + ctx.obj.setdefault('backend_url', 'https://api.myaltimate.com') datapilot.add_command(dbt) From e9d3f71500124c0d8e6be0574839c9d59e4cd593 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 13:38:51 +0800 Subject: [PATCH 03/26] build: add python-dotenv dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 2da8546..d24279d 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ def read(*names, **kwargs): "sqlglot~=25.30.0", "mcp~=1.9.0", "pyperclip~=1.8.2", + "python-dotenv~=1.0.0", ], extras_require={ # eg: From ff3dca603a9788ecec3e0089623f49a22d6ee643 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 13:53:32 +0800 Subject: [PATCH 04/26] refactor: standardize string quotes and improve code consistency --- src/datapilot/cli/main.py | 46 ++++++++++----------- src/datapilot/core/platforms/dbt/cli/cli.py | 18 ++++---- tests/core/platform/dbt/test_cli.py | 22 ++++++---- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index 21bc2f1..570050a 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -13,15 +13,15 @@ def load_config_from_file(): """Load configuration from ~/.altimate/altimate.json if it exists.""" config_path = Path.home() / ".altimate" / "altimate.json" - + if not config_path.exists(): return {} - + try: - with open(config_path, 'r') as f: + with open(config_path) as f: config = json.load(f) return config - except (json.JSONDecodeError, IOError) as e: + except (OSError, json.JSONDecodeError) as e: click.echo(f"Warning: Failed to load config from {config_path}: {e}", err=True) return {} @@ -30,14 +30,14 @@ def substitute_env_vars(value): """Replace ${env:ENV_VARIABLE} patterns with actual environment variable values.""" if not isinstance(value, str): return value - + # Pattern to match ${env:VARIABLE_NAME} - pattern = r'\$\{env:([^}]+)\}' - + pattern = r"\$\{env:([^}]+)\}" + def replacer(match): env_var = match.group(1) return os.environ.get(env_var, match.group(0)) - + return re.sub(pattern, replacer, value) @@ -58,38 +58,34 @@ def datapilot(ctx, token, instance_name, backend_url): """Altimate CLI for DBT project management.""" # Load .env file from current directory if it exists load_dotenv() - + # Load configuration from file file_config = load_config_from_file() file_config = process_config(file_config) - + # Map config file keys to CLI option names - config_mapping = { - 'altimateApiKey': 'token', - 'altimateInstanceName': 'instance_name', - 'altimateUrl': 'backend_url' - } - + config_mapping = {"altimateApiKey": "token", "altimateInstanceName": "instance_name", "altimateUrl": "backend_url"} + # Store common options in context, with CLI args taking precedence ctx.ensure_object(dict) - + # Apply file config first for file_key, cli_key in config_mapping.items(): if file_key in file_config: ctx.obj[cli_key] = file_config[file_key] - + # Override with CLI arguments if provided if token is not None: - ctx.obj['token'] = token + ctx.obj["token"] = token if instance_name is not None: - ctx.obj['instance_name'] = instance_name + ctx.obj["instance_name"] = instance_name if backend_url != "https://api.myaltimate.com": # Only override if not default - ctx.obj['backend_url'] = backend_url - + ctx.obj["backend_url"] = backend_url + # Set defaults if nothing was provided - ctx.obj.setdefault('token', None) - ctx.obj.setdefault('instance_name', None) - ctx.obj.setdefault('backend_url', 'https://api.myaltimate.com') + ctx.obj.setdefault("token", None) + ctx.obj.setdefault("instance_name", None) + ctx.obj.setdefault("backend_url", "https://api.myaltimate.com") datapilot.add_command(dbt) diff --git a/src/datapilot/core/platforms/dbt/cli/cli.py b/src/datapilot/core/platforms/dbt/cli/cli.py index 233f6b1..0f63167 100644 --- a/src/datapilot/core/platforms/dbt/cli/cli.py +++ b/src/datapilot/core/platforms/dbt/cli/cli.py @@ -72,10 +72,10 @@ def project_health( :param manifest_path: Path to the DBT manifest file. """ # Get common options from parent context - token = ctx.parent.obj.get('token') - instance_name = ctx.parent.obj.get('instance_name') - backend_url = ctx.parent.obj.get('backend_url') - + token = ctx.parent.obj.get("token") + instance_name = ctx.parent.obj.get("instance_name") + backend_url = ctx.parent.obj.get("backend_url") + config = None if config_path: config = load_config(config_path) @@ -151,16 +151,16 @@ def onboard( ): """Onboard a manifest file to DBT.""" # Get common options from parent context - token = ctx.parent.obj.get('token') - instance_name = ctx.parent.obj.get('instance_name') - backend_url = ctx.parent.obj.get('backend_url') - + token = ctx.parent.obj.get("token") + instance_name = ctx.parent.obj.get("instance_name") + backend_url = ctx.parent.obj.get("backend_url") + # For onboard command, token and instance_name are required if not token: token = click.prompt("API Token") if not instance_name: instance_name = click.prompt("Instance Name") - + check_token_and_instance(token, instance_name) if not validate_credentials(token, backend_url, instance_name): diff --git a/tests/core/platform/dbt/test_cli.py b/tests/core/platform/dbt/test_cli.py index 04242b7..c79fc72 100644 --- a/tests/core/platform/dbt/test_cli.py +++ b/tests/core/platform/dbt/test_cli.py @@ -1,7 +1,7 @@ # test_app.py from click.testing import CliRunner -from datapilot.core.platforms.dbt.cli.cli import project_health +from datapilot.cli.main import datapilot def test_project_health_with_required_and_optional_args(): @@ -11,7 +11,9 @@ def test_project_health_with_required_and_optional_args(): config_path = "tests/data/config.yml" # Simulate command invocation - result = runner.invoke(project_health, ["--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path]) + result = runner.invoke( + datapilot, ["project-health", "--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path] + ) assert result.exit_code == 0 # Ensure the command executed successfully # Add more assertions here to validate the behavior of your command, @@ -25,8 +27,9 @@ def test_project_health_with_only_required_arg(): # Simulate command invocation without optional arguments result = runner.invoke( - project_health, + datapilot, [ + "project-health", "--manifest-path", manifest_path, ], @@ -43,8 +46,9 @@ def test_project_health_with_only_required_arg_version1_6(): # Simulate command invocation without optional arguments result = runner.invoke( - project_health, + datapilot, [ + "project-health", "--manifest-path", manifest_path, ], @@ -61,8 +65,9 @@ def test_project_health_with_macro_args(): # Simulate command invocation without optional arguments result = runner.invoke( - project_health, + datapilot, [ + "project-health", "--manifest-path", manifest_path, ], @@ -76,8 +81,9 @@ def test_project_health_with_macro_args(): # Simulate command invocation without optional arguments result = runner.invoke( - project_health, + datapilot, [ + "project-health", "--manifest-path", manifest_path, ], @@ -95,7 +101,9 @@ def test_project_health_with_required_and_optional_args_v12(): config_path = "tests/data/config.yml" # Simulate command invocation - result = runner.invoke(project_health, ["--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path]) + result = runner.invoke( + datapilot, ["project-health", "--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path] + ) assert result.exit_code == 0 # Ensure the command executed successfully # Add more assertions here to validate the behavior of your command, From 34288ef9e85e84d1813caecfc9741b760a808b18 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 13:54:40 +0800 Subject: [PATCH 05/26] feat: hide token input in CLI for security --- src/datapilot/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index 570050a..0e2a9be 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -50,7 +50,7 @@ def process_config(config): @click.group() -@click.option("--token", required=False, help="Your API token for authentication.") +@click.option("--token", required=False, help="Your API token for authentication.", hide_input=True) @click.option("--instance-name", required=False, help="Your tenant ID.") @click.option("--backend-url", required=False, help="Altimate's Backend URL", default="https://api.myaltimate.com") @click.pass_context From c4059c73d95406da23b5e021584f26b30d871532 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 13:56:19 +0800 Subject: [PATCH 06/26] chore: update config file loading to use Path.open --- setup.py | 0 src/datapilot/cli/main.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 setup.py diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index 0e2a9be..4839fc7 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -18,7 +18,7 @@ def load_config_from_file(): return {} try: - with open(config_path) as f: + with Path.open(config_path) as f: config = json.load(f) return config except (OSError, json.JSONDecodeError) as e: From e5eaab4ea650ac46410eaae2327dc413e9390024 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:00:34 +0800 Subject: [PATCH 07/26] fix: add 'dbt' prefix to project-health command in tests --- tests/core/platform/dbt/test_cli.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/core/platform/dbt/test_cli.py b/tests/core/platform/dbt/test_cli.py index c79fc72..b7f6f45 100644 --- a/tests/core/platform/dbt/test_cli.py +++ b/tests/core/platform/dbt/test_cli.py @@ -12,7 +12,7 @@ def test_project_health_with_required_and_optional_args(): # Simulate command invocation result = runner.invoke( - datapilot, ["project-health", "--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path] + datapilot, ["dbt", "project-health", "--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path] ) assert result.exit_code == 0 # Ensure the command executed successfully @@ -29,6 +29,7 @@ def test_project_health_with_only_required_arg(): result = runner.invoke( datapilot, [ + "dbt", "project-health", "--manifest-path", manifest_path, @@ -48,6 +49,7 @@ def test_project_health_with_only_required_arg_version1_6(): result = runner.invoke( datapilot, [ + "dbt", "project-health", "--manifest-path", manifest_path, @@ -67,6 +69,7 @@ def test_project_health_with_macro_args(): result = runner.invoke( datapilot, [ + "dbt", "project-health", "--manifest-path", manifest_path, @@ -83,6 +86,7 @@ def test_project_health_with_macro_args(): result = runner.invoke( datapilot, [ + "dbt", "project-health", "--manifest-path", manifest_path, @@ -102,7 +106,7 @@ def test_project_health_with_required_and_optional_args_v12(): # Simulate command invocation result = runner.invoke( - datapilot, ["project-health", "--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path] + datapilot, ["dbt", "project-health", "--manifest-path", manifest_path, "--catalog-path", catalog_path, "--config-path", config_path] ) assert result.exit_code == 0 # Ensure the command executed successfully From a23748faeb8bea6b6d8393b151a563fb8f85928f Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:01:51 +0800 Subject: [PATCH 08/26] refactor: simplify config file opening syntax --- src/datapilot/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index 4839fc7..a58c2cb 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -18,7 +18,7 @@ def load_config_from_file(): return {} try: - with Path.open(config_path) as f: + with config_path.open() as f: config = json.load(f) return config except (OSError, json.JSONDecodeError) as e: From 2aeda41992de5cf3da6204aa6acacd088fa64d9f Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:12:04 +0800 Subject: [PATCH 09/26] feat: add knowledge CLI command group --- src/datapilot/cli/main.py | 2 ++ src/datapilot/core/knowledge/__init__.py | 0 src/datapilot/core/knowledge/cli.py | 7 +++++++ 3 files changed, 9 insertions(+) create mode 100644 src/datapilot/core/knowledge/__init__.py create mode 100644 src/datapilot/core/knowledge/cli.py diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index a58c2cb..875c7b6 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -8,6 +8,7 @@ from datapilot.core.mcp_utils.mcp import mcp from datapilot.core.platforms.dbt.cli.cli import dbt +from datapilot.core.knowledge import cli as knowledge def load_config_from_file(): @@ -90,3 +91,4 @@ def datapilot(ctx, token, instance_name, backend_url): datapilot.add_command(dbt) datapilot.add_command(mcp) +datapilot.add_command(knowledge) diff --git a/src/datapilot/core/knowledge/__init__.py b/src/datapilot/core/knowledge/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py new file mode 100644 index 0000000..b7707a4 --- /dev/null +++ b/src/datapilot/core/knowledge/cli.py @@ -0,0 +1,7 @@ +import click + + +@click.group() +def cli(): + """knowledge specific commands.""" + From 9de835b2af42aa70ef006b708c289fc02c267ad6 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:20:31 +0800 Subject: [PATCH 10/26] feat: add knowledge serve command with HTTP server --- src/datapilot/core/knowledge/cli.py | 94 +++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index b7707a4..455e232 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -1,7 +1,101 @@ import click +import json +from http.server import HTTPServer, BaseHTTPRequestHandler +from urllib.parse import urlparse +from urllib.request import Request, urlopen +from urllib.error import URLError, HTTPError +import re @click.group() def cli(): """knowledge specific commands.""" + +@cli.command() +@click.option('--port', default=3000, help='Port to run the server on') +@click.pass_context +def serve(ctx, port): + """Serve knowledge bases via HTTP server.""" + # Get configuration from parent context + token = ctx.parent.obj.get('token') + instance_name = ctx.parent.obj.get('instance_name') + backend_url = ctx.parent.obj.get('backend_url') + + if not token or not instance_name: + click.echo("Error: API token and instance name are required. Use --token and --instance-name options or set them in config.", err=True) + ctx.exit(1) + + class KnowledgeBaseHandler(BaseHTTPRequestHandler): + def do_GET(self): + """Handle GET requests.""" + path = urlparse(self.path).path + + # Match /knowledge_bases/{uuid} pattern + match = re.match(r'^/knowledge_bases/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$', path) + + if match: + public_id = match.group(1) + self.handle_knowledge_base(public_id) + elif path == '/health': + self.handle_health() + else: + self.send_error(404, "Not Found") + + def handle_knowledge_base(self, public_id): + """Fetch and return knowledge base data.""" + url = f"{backend_url}/knowledge_bases/public/{public_id}" + + headers = { + 'Authorization': f'Bearer {token}', + 'X-Tenant': instance_name, + 'Content-Type': 'application/json' + } + + req = Request(url, headers=headers) + + try: + with urlopen(req) as response: + data = response.read() + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + self.wfile.write(data) + except HTTPError as e: + error_data = e.read().decode('utf-8') if e.read() else '{"error": "HTTP Error"}' + self.send_response(e.code) + self.send_header('Content-Type', 'application/json') + self.end_headers() + self.wfile.write(error_data.encode('utf-8')) + except URLError as e: + self.send_response(500) + self.send_header('Content-Type', 'application/json') + self.end_headers() + error_msg = json.dumps({'error': str(e)}) + self.wfile.write(error_msg.encode('utf-8')) + + def handle_health(self): + """Handle health check endpoint.""" + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps({'status': 'ok'}).encode('utf-8')) + + def log_message(self, format, *args): + """Override to use click.echo for logging.""" + click.echo(f"{self.address_string()} - {format % args}") + + server_address = ('', port) + httpd = HTTPServer(server_address, KnowledgeBaseHandler) + + click.echo(f"Starting knowledge base server on port {port}...") + click.echo(f"Backend URL: {backend_url}") + click.echo(f"Instance: {instance_name}") + click.echo(f"Server running at http://localhost:{port}") + + try: + httpd.serve_forever() + except KeyboardInterrupt: + click.echo("\nShutting down server...") + httpd.shutdown() + From 6236ac1dc5ee08915cf645923125da82bfee7d78 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:26:10 +0800 Subject: [PATCH 11/26] fix: correct knowledge cli import path in main.py --- src/datapilot/cli/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index 875c7b6..643f86c 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -8,7 +8,7 @@ from datapilot.core.mcp_utils.mcp import mcp from datapilot.core.platforms.dbt.cli.cli import dbt -from datapilot.core.knowledge import cli as knowledge +from datapilot.core.knowledge.cli import cli as knowledge def load_config_from_file(): From 521b381879ccd632a26daf8b51cebea401ab2389 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:27:09 +0800 Subject: [PATCH 12/26] feat: add knowledge CLI group and serve command --- src/datapilot/core/knowledge/cli.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index 455e232..3061e13 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -7,7 +7,7 @@ import re -@click.group() +@click.group(name="knowledge") def cli(): """knowledge specific commands.""" @@ -21,19 +21,19 @@ def serve(ctx, port): token = ctx.parent.obj.get('token') instance_name = ctx.parent.obj.get('instance_name') backend_url = ctx.parent.obj.get('backend_url') - + if not token or not instance_name: click.echo("Error: API token and instance name are required. Use --token and --instance-name options or set them in config.", err=True) ctx.exit(1) - + class KnowledgeBaseHandler(BaseHTTPRequestHandler): def do_GET(self): """Handle GET requests.""" path = urlparse(self.path).path - + # Match /knowledge_bases/{uuid} pattern match = re.match(r'^/knowledge_bases/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$', path) - + if match: public_id = match.group(1) self.handle_knowledge_base(public_id) @@ -41,19 +41,19 @@ def do_GET(self): self.handle_health() else: self.send_error(404, "Not Found") - + def handle_knowledge_base(self, public_id): """Fetch and return knowledge base data.""" url = f"{backend_url}/knowledge_bases/public/{public_id}" - + headers = { 'Authorization': f'Bearer {token}', 'X-Tenant': instance_name, 'Content-Type': 'application/json' } - + req = Request(url, headers=headers) - + try: with urlopen(req) as response: data = response.read() @@ -73,26 +73,26 @@ def handle_knowledge_base(self, public_id): self.end_headers() error_msg = json.dumps({'error': str(e)}) self.wfile.write(error_msg.encode('utf-8')) - + def handle_health(self): """Handle health check endpoint.""" self.send_response(200) self.send_header('Content-Type', 'application/json') self.end_headers() self.wfile.write(json.dumps({'status': 'ok'}).encode('utf-8')) - + def log_message(self, format, *args): """Override to use click.echo for logging.""" click.echo(f"{self.address_string()} - {format % args}") - + server_address = ('', port) httpd = HTTPServer(server_address, KnowledgeBaseHandler) - + click.echo(f"Starting knowledge base server on port {port}...") click.echo(f"Backend URL: {backend_url}") click.echo(f"Instance: {instance_name}") click.echo(f"Server running at http://localhost:{port}") - + try: httpd.serve_forever() except KeyboardInterrupt: From 2f3858c59806d70ff16b8d7bda26a3efa3e29f91 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:29:09 +0800 Subject: [PATCH 13/26] feat: change default server port from 3000 to 4000 --- src/datapilot/core/knowledge/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index 3061e13..21c345b 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -13,7 +13,7 @@ def cli(): @cli.command() -@click.option('--port', default=3000, help='Port to run the server on') +@click.option('--port', default=4000, help='Port to run the server on') @click.pass_context def serve(ctx, port): """Serve knowledge bases via HTTP server.""" From f9637219d1e01accfdf692a1c823f8d0e17e25b6 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:30:09 +0800 Subject: [PATCH 14/26] style: reformat imports and strings in knowledge CLI --- src/datapilot/cli/main.py | 2 +- src/datapilot/core/knowledge/cli.py | 57 +++++++++++++++-------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index 643f86c..ecb5cfa 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -6,9 +6,9 @@ import click from dotenv import load_dotenv +from datapilot.core.knowledge.cli import cli as knowledge from datapilot.core.mcp_utils.mcp import mcp from datapilot.core.platforms.dbt.cli.cli import dbt -from datapilot.core.knowledge.cli import cli as knowledge def load_config_from_file(): diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index 21c345b..147ecef 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -1,10 +1,14 @@ -import click import json -from http.server import HTTPServer, BaseHTTPRequestHandler -from urllib.parse import urlparse -from urllib.request import Request, urlopen -from urllib.error import URLError, HTTPError import re +from http.server import BaseHTTPRequestHandler +from http.server import HTTPServer +from urllib.error import HTTPError +from urllib.error import URLError +from urllib.parse import urlparse +from urllib.request import Request +from urllib.request import urlopen + +import click @click.group(name="knowledge") @@ -13,17 +17,19 @@ def cli(): @cli.command() -@click.option('--port', default=4000, help='Port to run the server on') +@click.option("--port", default=4000, help="Port to run the server on") @click.pass_context def serve(ctx, port): """Serve knowledge bases via HTTP server.""" # Get configuration from parent context - token = ctx.parent.obj.get('token') - instance_name = ctx.parent.obj.get('instance_name') - backend_url = ctx.parent.obj.get('backend_url') + token = ctx.parent.obj.get("token") + instance_name = ctx.parent.obj.get("instance_name") + backend_url = ctx.parent.obj.get("backend_url") if not token or not instance_name: - click.echo("Error: API token and instance name are required. Use --token and --instance-name options or set them in config.", err=True) + click.echo( + "Error: API token and instance name are required. Use --token and --instance-name options or set them in config.", err=True + ) ctx.exit(1) class KnowledgeBaseHandler(BaseHTTPRequestHandler): @@ -32,12 +38,12 @@ def do_GET(self): path = urlparse(self.path).path # Match /knowledge_bases/{uuid} pattern - match = re.match(r'^/knowledge_bases/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$', path) + match = re.match(r"^/knowledge_bases/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$", path) if match: public_id = match.group(1) self.handle_knowledge_base(public_id) - elif path == '/health': + elif path == "/health": self.handle_health() else: self.send_error(404, "Not Found") @@ -46,11 +52,7 @@ def handle_knowledge_base(self, public_id): """Fetch and return knowledge base data.""" url = f"{backend_url}/knowledge_bases/public/{public_id}" - headers = { - 'Authorization': f'Bearer {token}', - 'X-Tenant': instance_name, - 'Content-Type': 'application/json' - } + headers = {"Authorization": f"Bearer {token}", "X-Tenant": instance_name, "Content-Type": "application/json"} req = Request(url, headers=headers) @@ -58,34 +60,34 @@ def handle_knowledge_base(self, public_id): with urlopen(req) as response: data = response.read() self.send_response(200) - self.send_header('Content-Type', 'application/json') + self.send_header("Content-Type", "application/json") self.end_headers() self.wfile.write(data) except HTTPError as e: - error_data = e.read().decode('utf-8') if e.read() else '{"error": "HTTP Error"}' + error_data = e.read().decode("utf-8") if e.read() else '{"error": "HTTP Error"}' self.send_response(e.code) - self.send_header('Content-Type', 'application/json') + self.send_header("Content-Type", "application/json") self.end_headers() - self.wfile.write(error_data.encode('utf-8')) + self.wfile.write(error_data.encode("utf-8")) except URLError as e: self.send_response(500) - self.send_header('Content-Type', 'application/json') + self.send_header("Content-Type", "application/json") self.end_headers() - error_msg = json.dumps({'error': str(e)}) - self.wfile.write(error_msg.encode('utf-8')) + error_msg = json.dumps({"error": str(e)}) + self.wfile.write(error_msg.encode("utf-8")) def handle_health(self): """Handle health check endpoint.""" self.send_response(200) - self.send_header('Content-Type', 'application/json') + self.send_header("Content-Type", "application/json") self.end_headers() - self.wfile.write(json.dumps({'status': 'ok'}).encode('utf-8')) + self.wfile.write(json.dumps({"status": "ok"}).encode("utf-8")) def log_message(self, format, *args): """Override to use click.echo for logging.""" click.echo(f"{self.address_string()} - {format % args}") - server_address = ('', port) + server_address = ("", port) httpd = HTTPServer(server_address, KnowledgeBaseHandler) click.echo(f"Starting knowledge base server on port {port}...") @@ -98,4 +100,3 @@ def log_message(self, format, *args): except KeyboardInterrupt: click.echo("\nShutting down server...") httpd.shutdown() - From c9f42296cf4b29425ab6f86b949287028633f3ea Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:38:25 +0800 Subject: [PATCH 15/26] fix: validate URL scheme to allow only HTTP/HTTPS in knowledge base handler --- src/datapilot/core/knowledge/cli.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index 147ecef..b6e5894 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -51,6 +51,16 @@ def do_GET(self): def handle_knowledge_base(self, public_id): """Fetch and return knowledge base data.""" url = f"{backend_url}/knowledge_bases/public/{public_id}" + + # Validate URL scheme for security + parsed_url = urlparse(url) + if parsed_url.scheme not in ('http', 'https'): + self.send_response(400) + self.send_header("Content-Type", "application/json") + self.end_headers() + error_msg = json.dumps({"error": "Invalid URL scheme. Only HTTP and HTTPS are allowed."}) + self.wfile.write(error_msg.encode("utf-8")) + return headers = {"Authorization": f"Bearer {token}", "X-Tenant": instance_name, "Content-Type": "application/json"} From 54e349529cb2337b9020847bafabd0eaac708477 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:39:12 +0800 Subject: [PATCH 16/26] feat: Add timeout to urlopen() calls --- src/datapilot/core/knowledge/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index b6e5894..bcd8f88 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -67,7 +67,7 @@ def handle_knowledge_base(self, public_id): req = Request(url, headers=headers) try: - with urlopen(req) as response: + with urlopen(req, timeout=30) as response: data = response.read() self.send_response(200) self.send_header("Content-Type", "application/json") From 13b3d1bbd846c33ddbbac723d9944ff724aaf382 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:39:30 +0800 Subject: [PATCH 17/26] fix: store HTTPError.read() result to avoid double read --- src/datapilot/core/knowledge/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index bcd8f88..238fb0f 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -74,7 +74,8 @@ def handle_knowledge_base(self, public_id): self.end_headers() self.wfile.write(data) except HTTPError as e: - error_data = e.read().decode("utf-8") if e.read() else '{"error": "HTTP Error"}' + error_body = e.read() + error_data = error_body.decode("utf-8") if error_body else '{"error": "HTTP Error"}' self.send_response(e.code) self.send_header("Content-Type", "application/json") self.end_headers() From 1ceafceb39563bc9058b994a54b3266324f3ad05 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:41:18 +0800 Subject: [PATCH 18/26] style: standardize string quotes in URL scheme check --- src/datapilot/core/knowledge/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index 238fb0f..d1bc87d 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -51,10 +51,10 @@ def do_GET(self): def handle_knowledge_base(self, public_id): """Fetch and return knowledge base data.""" url = f"{backend_url}/knowledge_bases/public/{public_id}" - + # Validate URL scheme for security parsed_url = urlparse(url) - if parsed_url.scheme not in ('http', 'https'): + if parsed_url.scheme not in ("http", "https"): self.send_response(400) self.send_header("Content-Type", "application/json") self.end_headers() From 973bb0f8a4427ef3fd1f61a2bb813f7ad0ad4f10 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:41:21 +0800 Subject: [PATCH 19/26] fix: suppress S310 warning for validated URL scheme --- src/datapilot/core/knowledge/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index d1bc87d..43ecb5a 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -67,7 +67,7 @@ def handle_knowledge_base(self, public_id): req = Request(url, headers=headers) try: - with urlopen(req, timeout=30) as response: + with urlopen(req, timeout=30) as response: # noqa: S310 - URL scheme already validated above data = response.read() self.send_response(200) self.send_header("Content-Type", "application/json") From 9f439e90738a417bff9473ed4032b5dea2c0153d Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:42:05 +0800 Subject: [PATCH 20/26] fix: Shorten noqa comment for urlopen security warning --- src/datapilot/core/knowledge/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index 43ecb5a..be0ef33 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -67,7 +67,7 @@ def handle_knowledge_base(self, public_id): req = Request(url, headers=headers) try: - with urlopen(req, timeout=30) as response: # noqa: S310 - URL scheme already validated above + with urlopen(req, timeout=30) as response: # noqa: S310 data = response.read() self.send_response(200) self.send_header("Content-Type", "application/json") From 849436652cbb492ff8c6fae2ee5992f3fdb6d777 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Tue, 10 Jun 2025 14:43:00 +0800 Subject: [PATCH 21/26] fix: add security comment for urlopen usage --- src/datapilot/core/knowledge/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index be0ef33..6f8e957 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -67,6 +67,7 @@ def handle_knowledge_base(self, public_id): req = Request(url, headers=headers) try: + # URL scheme validated above - only HTTP/HTTPS allowed with urlopen(req, timeout=30) as response: # noqa: S310 data = response.read() self.send_response(200) From 5222d4a2a60b11787987d44060335c804f6befe1 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Tue, 10 Jun 2025 14:44:50 +0800 Subject: [PATCH 22/26] fix: add noqa comment for bandit security check --- src/datapilot/core/knowledge/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index 6f8e957..f590f68 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -64,7 +64,7 @@ def handle_knowledge_base(self, public_id): headers = {"Authorization": f"Bearer {token}", "X-Tenant": instance_name, "Content-Type": "application/json"} - req = Request(url, headers=headers) + req = Request(url, headers=headers) # noqa: S310 try: # URL scheme validated above - only HTTP/HTTPS allowed From a02e6c239e351901d0063e20e7a8f24fc60a2a92 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Wed, 11 Jun 2025 13:43:03 +0800 Subject: [PATCH 23/26] feat: add --version option to CLI --- src/datapilot/cli/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datapilot/cli/main.py b/src/datapilot/cli/main.py index ecb5cfa..87a6135 100644 --- a/src/datapilot/cli/main.py +++ b/src/datapilot/cli/main.py @@ -6,6 +6,7 @@ import click from dotenv import load_dotenv +from datapilot import __version__ from datapilot.core.knowledge.cli import cli as knowledge from datapilot.core.mcp_utils.mcp import mcp from datapilot.core.platforms.dbt.cli.cli import dbt @@ -51,6 +52,7 @@ def process_config(config): @click.group() +@click.version_option(version=__version__, prog_name="datapilot") @click.option("--token", required=False, help="Your API token for authentication.", hide_input=True) @click.option("--instance-name", required=False, help="Your tenant ID.") @click.option("--backend-url", required=False, help="Altimate's Backend URL", default="https://api.myaltimate.com") From 7954b55d3754fcd73fd7d25ad6ab959c2a67b230 Mon Sep 17 00:00:00 2001 From: "Michiel De Smet (aider)" Date: Wed, 11 Jun 2025 18:00:45 +0800 Subject: [PATCH 24/26] refactor: extract KnowledgeBaseHandler to separate file --- src/datapilot/core/knowledge/cli.py | 80 ++----------------------- src/datapilot/core/knowledge/server.py | 83 ++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 75 deletions(-) create mode 100644 src/datapilot/core/knowledge/server.py diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index f590f68..a65335b 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -1,13 +1,5 @@ -import json -import re -from http.server import BaseHTTPRequestHandler from http.server import HTTPServer -from urllib.error import HTTPError -from urllib.error import URLError -from urllib.parse import urlparse -from urllib.request import Request -from urllib.request import urlopen - +from .server import KnowledgeBaseHandler import click @@ -32,72 +24,10 @@ def serve(ctx, port): ) ctx.exit(1) - class KnowledgeBaseHandler(BaseHTTPRequestHandler): - def do_GET(self): - """Handle GET requests.""" - path = urlparse(self.path).path - - # Match /knowledge_bases/{uuid} pattern - match = re.match(r"^/knowledge_bases/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$", path) - - if match: - public_id = match.group(1) - self.handle_knowledge_base(public_id) - elif path == "/health": - self.handle_health() - else: - self.send_error(404, "Not Found") - - def handle_knowledge_base(self, public_id): - """Fetch and return knowledge base data.""" - url = f"{backend_url}/knowledge_bases/public/{public_id}" - - # Validate URL scheme for security - parsed_url = urlparse(url) - if parsed_url.scheme not in ("http", "https"): - self.send_response(400) - self.send_header("Content-Type", "application/json") - self.end_headers() - error_msg = json.dumps({"error": "Invalid URL scheme. Only HTTP and HTTPS are allowed."}) - self.wfile.write(error_msg.encode("utf-8")) - return - - headers = {"Authorization": f"Bearer {token}", "X-Tenant": instance_name, "Content-Type": "application/json"} - - req = Request(url, headers=headers) # noqa: S310 - - try: - # URL scheme validated above - only HTTP/HTTPS allowed - with urlopen(req, timeout=30) as response: # noqa: S310 - data = response.read() - self.send_response(200) - self.send_header("Content-Type", "application/json") - self.end_headers() - self.wfile.write(data) - except HTTPError as e: - error_body = e.read() - error_data = error_body.decode("utf-8") if error_body else '{"error": "HTTP Error"}' - self.send_response(e.code) - self.send_header("Content-Type", "application/json") - self.end_headers() - self.wfile.write(error_data.encode("utf-8")) - except URLError as e: - self.send_response(500) - self.send_header("Content-Type", "application/json") - self.end_headers() - error_msg = json.dumps({"error": str(e)}) - self.wfile.write(error_msg.encode("utf-8")) - - def handle_health(self): - """Handle health check endpoint.""" - self.send_response(200) - self.send_header("Content-Type", "application/json") - self.end_headers() - self.wfile.write(json.dumps({"status": "ok"}).encode("utf-8")) - - def log_message(self, format, *args): - """Override to use click.echo for logging.""" - click.echo(f"{self.address_string()} - {format % args}") + # Set context data for the handler + KnowledgeBaseHandler.token = token + KnowledgeBaseHandler.instance_name = instance_name + KnowledgeBaseHandler.backend_url = backend_url server_address = ("", port) httpd = HTTPServer(server_address, KnowledgeBaseHandler) diff --git a/src/datapilot/core/knowledge/server.py b/src/datapilot/core/knowledge/server.py new file mode 100644 index 0000000..21fecf8 --- /dev/null +++ b/src/datapilot/core/knowledge/server.py @@ -0,0 +1,83 @@ +import json +import re +from http.server import BaseHTTPRequestHandler +from urllib.error import HTTPError +from urllib.error import URLError +from urllib.parse import urlparse +from urllib.request import Request +from urllib.request import urlopen + +import click + +class KnowledgeBaseHandler(BaseHTTPRequestHandler): + """HTTP request handler for serving knowledge bases and health checks.""" + + token: str = "" + instance_name: str = "" + backend_url: str = "" + + def do_GET(self): + """Handle GET requests.""" + path = urlparse(self.path).path + + # Match /knowledge_bases/{uuid} pattern + match = re.match(r"^/knowledge_bases/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$", path) + + if match: + public_id = match.group(1) + self.handle_knowledge_base(public_id) + elif path == "/health": + self.handle_health() + else: + self.send_error(404, "Not Found") + + def handle_knowledge_base(self, public_id): + """Fetch and return knowledge base data.""" + url = f"{self.backend_url}/knowledge_bases/public/{public_id}" + + # Validate URL scheme for security + parsed_url = urlparse(url) + if parsed_url.scheme not in ("http", "https"): + self.send_response(400) + self.send_header("Content-Type", "application/json") + self.end_headers() + error_msg = json.dumps({"error": "Invalid URL scheme. Only HTTP and HTTPS are allowed."}) + self.wfile.write(error_msg.encode("utf-8")) + return + + headers = {"Authorization": f"Bearer {self.token}", "X-Tenant": self.instance_name, "Content-Type": "application/json"} + + req = Request(url, headers=headers) # noqa: S310 + + try: + # URL scheme validated above - only HTTP/HTTPS allowed + with urlopen(req, timeout=30) as response: # noqa: S310 + data = response.read() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(data) + except HTTPError as e: + error_body = e.read() + error_data = error_body.decode("utf-8") if error_body else '{"error": "HTTP Error"}' + self.send_response(e.code) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(error_data.encode("utf-8")) + except URLError as e: + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.end_headers() + error_msg = json.dumps({"error": str(e)}) + self.wfile.write(error_msg.encode("utf-8")) + + def handle_health(self): + """Handle health check endpoint.""" + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(json.dumps({"status": "ok"}).encode("utf-8")) + + def log_message(self, format, *args): + """Override to use click.echo for logging.""" + click.echo(f"{self.address_string()} - {format % args}") From 79efc660a6e99b57173a6f8dcc3c8dd3fd732463 Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Wed, 11 Jun 2025 18:01:19 +0800 Subject: [PATCH 25/26] style: reorder imports and clean up whitespace --- src/datapilot/core/knowledge/cli.py | 4 +++- src/datapilot/core/knowledge/server.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/datapilot/core/knowledge/cli.py b/src/datapilot/core/knowledge/cli.py index a65335b..2fe30f8 100644 --- a/src/datapilot/core/knowledge/cli.py +++ b/src/datapilot/core/knowledge/cli.py @@ -1,7 +1,9 @@ from http.server import HTTPServer -from .server import KnowledgeBaseHandler + import click +from .server import KnowledgeBaseHandler + @click.group(name="knowledge") def cli(): diff --git a/src/datapilot/core/knowledge/server.py b/src/datapilot/core/knowledge/server.py index 21fecf8..bd15323 100644 --- a/src/datapilot/core/knowledge/server.py +++ b/src/datapilot/core/knowledge/server.py @@ -9,9 +9,10 @@ import click + class KnowledgeBaseHandler(BaseHTTPRequestHandler): """HTTP request handler for serving knowledge bases and health checks.""" - + token: str = "" instance_name: str = "" backend_url: str = "" From 5be69f773ad0c06f69fe9e67c8d4a9b810bb7e6b Mon Sep 17 00:00:00 2001 From: Michiel De Smet Date: Thu, 12 Jun 2025 10:02:17 +0800 Subject: [PATCH 26/26] refactor: update knowledge base endpoint paths --- src/datapilot/core/knowledge/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datapilot/core/knowledge/server.py b/src/datapilot/core/knowledge/server.py index bd15323..7830152 100644 --- a/src/datapilot/core/knowledge/server.py +++ b/src/datapilot/core/knowledge/server.py @@ -22,7 +22,7 @@ def do_GET(self): path = urlparse(self.path).path # Match /knowledge_bases/{uuid} pattern - match = re.match(r"^/knowledge_bases/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$", path) + match = re.match(r"^/kb/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$", path) if match: public_id = match.group(1) @@ -34,7 +34,7 @@ def do_GET(self): def handle_knowledge_base(self, public_id): """Fetch and return knowledge base data.""" - url = f"{self.backend_url}/knowledge_bases/public/{public_id}" + url = f"{self.backend_url}/knowledge_bases/private/{public_id}" # Validate URL scheme for security parsed_url = urlparse(url)