From ca083dc60f6da8885ebb541f6fc2b59a600c6f49 Mon Sep 17 00:00:00 2001
From: Jaideep
Date: Sat, 14 Mar 2026 14:35:51 +0530
Subject: [PATCH 1/5] feat: add CLI config lint command

---
 scripts/lint_configs.py | 56 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 scripts/lint_configs.py

diff --git a/scripts/lint_configs.py b/scripts/lint_configs.py
new file mode 100644
index 0000000..b64111e
--- /dev/null
+++ b/scripts/lint_configs.py
@@ -0,0 +1,56 @@
+import os
+import argparse
+import yaml
+
+def lint_directory(config_dir):
+    print(f"🔍 Linting YAML files in '{config_dir}/'...\n")
+
+    error_count = 0
+    file_count = 0
+
+    # 1. Crawl: Walk through the given directory finding all files
+    for root, _, files in os.walk(config_dir):
+        for file in files:
+            # Only check YAML files
+            if file.endswith((".yaml", ".yml")):
+                file_count += 1
+                filepath = os.path.join(root, file)
+
+                # 2. Read: Open the file
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    try:
+                        # 3. Parse: Try to load the YAML
+                        yaml.safe_load(f)
+                    except yaml.YAMLError as exc:
+                        # 4. Report: Catch the error and print an actionable hint
+                        error_count += 1
+                        print(f"❌ Error found in: {filepath}")
+
+                        # PyYAML errors usually tell us the exact line number!
+                        if hasattr(exc, 'problem_mark'):
+                            mark = exc.problem_mark
+                            print(f"   Hint: Check line {mark.line + 1}, column {mark.column + 1}.")
+                            print(f"   Details: {exc.problem}\n")
+                        else:
+                            print(f"   Hint: {exc}\n")
+
+    # Final summary
+    if error_count == 0:
+        print(f"✅ Success! Checked {file_count} files and found no errors.")
+    else:
+        print(f"🚨 Failed: Found {error_count} broken config file(s).")
+        # Exit with code 1 so automated systems know the command failed
+        exit(1)
+
+if __name__ == "__main__":
+    # Set up the CLI command using standard Python
+    parser = argparse.ArgumentParser(description="Lint YAML configuration files for syntax errors.")
+    parser.add_argument(
+        "--path",
+        type=str,
+        default="config",
+        help="Path to the config directory (default is 'config')"
+    )
+
+    args = parser.parse_args()
+    lint_directory(args.path)
\ No newline at end of file

From 0ec3e083ca23046337ff3f01141a1e87014a3a0f Mon Sep 17 00:00:00 2001
From: Jaideep
Date: Sun, 15 Mar 2026 08:31:35 +0530
Subject: [PATCH 2/5] fix: add path validation, IO handling, and tests

---
 scripts/lint_configs.py    | 52 ++++++++++++++++----------------
 tests/test_lint_configs.py | 17 +++++++++++++
 2 files changed, 39 insertions(+), 30 deletions(-)
 create mode 100644 tests/test_lint_configs.py

diff --git a/scripts/lint_configs.py b/scripts/lint_configs.py
index b64111e..991e200 100644
--- a/scripts/lint_configs.py
+++ b/scripts/lint_configs.py
@@ -3,54 +3,46 @@
 import yaml
 
 def lint_directory(config_dir):
+    # --- FIX 1: Path Validation ---
+    if not os.path.isdir(config_dir):
+        print(f"🚨 Error: The path '{config_dir}' does not exist or is not a directory.")
+        exit(1)
+
     print(f"🔍 Linting YAML files in '{config_dir}/'...\n")
 
     error_count = 0
     file_count = 0
 
-    # 1. Crawl: Walk through the given directory finding all files
     for root, _, files in os.walk(config_dir):
         for file in files:
-            # Only check YAML files
             if file.endswith((".yaml", ".yml")):
                 file_count += 1
                 filepath = os.path.join(root, file)
 
-                # 2. Read: Open the file
-                with open(filepath, 'r', encoding='utf-8') as f:
-                    try:
-                        # 3. Parse: Try to load the YAML
+                # --- FIX 2: File Read Robustness ---
+                try:
+                    with open(filepath, 'r', encoding='utf-8') as f:
                         yaml.safe_load(f)
-                    except yaml.YAMLError as exc:
-                        # 4. Report: Catch the error and print an actionable hint
-                        error_count += 1
-                        print(f"❌ Error found in: {filepath}")
-
-                        # PyYAML errors usually tell us the exact line number!
-                        if hasattr(exc, 'problem_mark'):
-                            mark = exc.problem_mark
-                            print(f"   Hint: Check line {mark.line + 1}, column {mark.column + 1}.")
-                            print(f"   Details: {exc.problem}\n")
-                        else:
-                            print(f"   Hint: {exc}\n")
+                except OSError as e:
+                    error_count += 1
+                    print(f"❌ IO Error in: {filepath}\n   Details: {e}\n")
+                except yaml.YAMLError as exc:
+                    error_count += 1
+                    print(f"❌ Syntax Error in: {filepath}")
+                    if hasattr(exc, 'problem_mark'):
+                        mark = exc.problem_mark
+                        print(f"   Hint: Check line {mark.line + 1}, column {mark.column + 1}.\n")
+                    else:
+                        print(f"   Details: {exc}\n")
 
-    # Final summary
     if error_count == 0:
         print(f"✅ Success! Checked {file_count} files and found no errors.")
     else:
-        print(f"🚨 Failed: Found {error_count} broken config file(s).")
-        # Exit with code 1 so automated systems know the command failed
+        print(f"🚨 Failed: Found {error_count} error(s).")
         exit(1)
 
 if __name__ == "__main__":
-    # Set up the CLI command using standard Python
-    parser = argparse.ArgumentParser(description="Lint YAML configuration files for syntax errors.")
-    parser.add_argument(
-        "--path",
-        type=str,
-        default="config",
-        help="Path to the config directory (default is 'config')"
-    )
-
+    parser = argparse.ArgumentParser(description="Lint YAML configuration files.")
+    parser.add_argument("--path", type=str, default="config", help="Path to config directory")
     args = parser.parse_args()
     lint_directory(args.path)
\ No newline at end of file
diff --git a/tests/test_lint_configs.py b/tests/test_lint_configs.py
new file mode 100644
index 0000000..a2f2ab8
--- /dev/null
+++ b/tests/test_lint_configs.py
@@ -0,0 +1,17 @@
+import pytest
+import subprocess
+import os
+
+def test_lint_success():
+    # Tests a valid directory (the default 'config' folder)
+    result = subprocess.run(["python", "scripts/lint_configs.py", "--path", "config"], capture_output=True, text=True)
+    assert result.returncode == 0
+    assert "Success" in result.stdout
+
+def test_invalid_path():
+    # Tests a non-existent directory
+    result = subprocess.run(["python", "scripts/lint_configs.py", "--path", "does-not-exist"], capture_output=True, text=True)
+    assert result.returncode == 1
+    assert "Error: The path" in result.stdout
+
+# You can add more complex tests here later, but this covers the 'fail fast' requirement!
\ No newline at end of file From db7b207a2a428f364cd4d1b5d306dbc84feda797 Mon Sep 17 00:00:00 2001 From: Mohamed Jadla Date: Sun, 15 Mar 2026 04:26:38 +0100 Subject: [PATCH 3/5] docs: small README improvements --- README.md | 141 +++++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index 820cb00..6dfe109 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ + # DataHelm DataHelm is a data engineering framework focused on the following: -- source ingestion and orchestration +- Source ingestion and orchestration - dbt transformation workflows -- notebook-based dashboard execution -- reusable provider connectors (SharePoint, GCS, S3, and BigQuery) -- optional local LLM analytics query scaffolding +- Notebook-based dashboard execution +- Reusable provider connectors (SharePoint, GCS, S3, and BigQuery) +- Optional local LLM analytics query scaffolding ![DataHelm Architecture](https://github.com/DevStrikerTech/datahelm/blob/master/docs/architecture.png?raw=true) @@ -53,146 +54,132 @@ ingestion/ tests/ scripts/ docs/ -``` - ## Local Setup ### Prerequisites -- Python 3.12+ -- PostgreSQL (accessible from the local environment) -- Optional: Docker, local Ollama, dbt CLI +Python 3.12+ + +PostgreSQL (accessible from the local environment) +Optional: Docker, local Ollama, dbt CLI ### Installation -```bash +Run the following commands to set up the local environment: + python3 -m venv .venv source .venv/bin/activate pip install --upgrade pip pip install -e . -``` - ### Environment Variables -Create a `.env` file in the repository root with the required values, for example: +Create a file named `.env` in the root of the repository with the required values, for example: -```env -DB_HOST=${DB_HOST} -DB_PORT=${DB_PORT} -DB_USER=${DB_USER} -DB_PASSWORD=${DB_PASSWORD} -DB_NAME=${DB_NAME} +DB_HOST=${DB_HOST} +DB_PORT=${DB_PORT} +DB_USER=${DB_USER} +DB_PASSWORD=${DB_PASSWORD} +DB_NAME=${DB_NAME} CLASHOFCLANS_API_TOKEN=${CLASHOFCLANS_API_TOKEN} -``` - ### Run Dagster Locally -```bash +To start Dagster locally, run: + python scripts/run_dagster_dev.py -``` -Useful option for quick verification: +For a quick verification without executing jobs, run: -```bash python scripts/run_dagster_dev.py --print-only -``` - ## Configuration Model -### Ingestion Config (`config/api/*.yaml`) +### Ingestion Config (config/api/*.yaml) Defines source-level extraction, publish targets, schedules, and column mapping. -Example currently included: +Example included: -- `CLASHOFCLANS_PLAYER_STATS` +CLASHOFCLANS_PLAYER_STATS -### dbt Config (`config/dbt/projects.yaml`) +### dbt Config (config/dbt/projects.yaml) Defines dbt units, selection/exclusion rules, vars, and schedules. -### Dashboard Config (`config/dashboard/projects.yaml`) +### Dashboard Config (config/dashboard/projects.yaml) Defines notebook path, source table mapping, chart columns, and cadence. -### Analytics Semantic Config (`config/analytics/semantic_catalog.yaml`) +### Analytics Semantic Config (config/analytics/semantic_catalog.yaml) Defines dataset metadata for the isolated NL-to-SQL module. 
- ## Reusable Connectors -The repository includes reusable connector classes under `handlers/`: +The repository includes reusable connector classes under handlers/: -- `handlers/sharepoint/sharepoint.py` - - Microsoft Graph auth + site/file access helpers -- `handlers/gcs/gcs.py` - - upload/download/list/delete/signed URL helpers -- `handlers/s3/s3.py` - - upload/download/list/delete/presigned URL helpers -- `handlers/bigquery/bigquery.py` - - query, row fetch, dataframe load, schema helpers +handlers/sharepoint/sharepoint.py + Microsoft Graph auth + site/file access helpers -## Local LLM Analytics Module +handlers/gcs/gcs.py + Upload/download/list/delete/signed URL helpers + +handlers/s3/s3.py + Upload/download/list/delete/presigned URL helpers -`analytics/nl_query/` is an isolated module for natural-language-to-SQL generation using local Ollama: +handlers/bigquery/bigquery.py + Query, row fetch, dataframe load, schema helpers +## Local LLM Analytics Module -- semantic catalog loader -- SQL read-only safety guard -- Ollama client wrapper -- orchestration service +analytics/nl_query/ is an isolated module for natural-language-to-SQL generation using local Ollama: +Semantic catalog loader +SQL read-only safety guard +Ollama client wrapper +Orchestration service ## Testing -Run all tests: +Run all tests with the following command: -```bash .venv/bin/python -m pytest -q -``` The current test suite includes coverage for: -- ingestion and handler behavior -- analytics factory and runner logic -- connector modules (SharePoint, GCS, S3, BigQuery) -- script behavior -- NL-query safety and service paths - +Ingestion and handler behavior +Analytics factory and runner logic +Connector modules (SharePoint, GCS, S3, BigQuery) +Script behavior +NL-query safety and service paths ## CI/CD and Branching -- `dev`: integration branch -- `master`: release/production branch +dev: integration branch +master: release/production branch Workflows: -- **CI**: tests on development and PR flows -- **Docker Release**: image build/publish on `master` -- **Deploy Release**: workflow_run/manual deployment orchestration - +CI: tests on development and PR flows +Docker Release: image build/publish on master +Deploy Release: workflow_run/manual deployment orchestration ## Containerization -Container image is defined via `Dockerfile`. +Container image is defined via Dockerfile. Default runtime command starts the Dagster gRPC server: -```bash python -m dagster api grpc -m dagster_op.repository -``` - -## Deployment +## Containerization -Deployment flow is workflow-based: +Container image is defined via Dockerfile. 
-- production auto-path after successful Docker release -- manual staging/production dispatch path +Default runtime command starts the Dagster gRPC server: -## Contributing and Governance +python -m dagster api grpc -m dagster_op.repository +## Deployment -- Contribution guide: `CONTRIBUTING.md` -- Code of conduct: `CODE_OF_CONDUCT.md` -- Security reporting: `SECURITY.md` +Deployment flow is workflow-based: +Production auto-path after successful Docker release +Manual staging/production dispatch path ## Detailed Technical Documentation For complete, long-form project documentation (operations, architecture, and runbook-style details), see: -- `docs/document.md` +docs/document.md + From 3425f156f8c905eaebaf3f0622358e56955c1c78 Mon Sep 17 00:00:00 2001 From: Mohamed Jadla Date: Sun, 15 Mar 2026 04:30:19 +0100 Subject: [PATCH 4/5] docs: small README improvements From 1516e1e088352866e10ec39114dd39e685525b42 Mon Sep 17 00:00:00 2001 From: Mohamed Jadla Date: Sun, 15 Mar 2026 04:50:06 +0100 Subject: [PATCH 5/5] docs: small README improvements --- README.md | 94 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 6dfe109..cc2f0cf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - # DataHelm DataHelm is a data engineering framework focused on the following: @@ -54,51 +53,60 @@ ingestion/ tests/ scripts/ docs/ +```` + ## Local Setup ### Prerequisites Python 3.12+ - PostgreSQL (accessible from the local environment) - Optional: Docker, local Ollama, dbt CLI + ### Installation Run the following commands to set up the local environment: +```bash python3 -m venv .venv source .venv/bin/activate pip install --upgrade pip pip install -e . +``` + ### Environment Variables Create a file named `.env` in the root of the repository with the required values, for example: -DB_HOST=${DB_HOST} -DB_PORT=${DB_PORT} -DB_USER=${DB_USER} -DB_PASSWORD=${DB_PASSWORD} -DB_NAME=${DB_NAME} +```text +DB_HOST=${DB_HOST} +DB_PORT=${DB_PORT} +DB_USER=${DB_USER} +DB_PASSWORD=${DB_PASSWORD} +DB_NAME=${DB_NAME} CLASHOFCLANS_API_TOKEN=${CLASHOFCLANS_API_TOKEN} +``` + ### Run Dagster Locally To start Dagster locally, run: +```bash python scripts/run_dagster_dev.py +``` For a quick verification without executing jobs, run: +```bash python scripts/run_dagster_dev.py --print-only +``` + ## Configuration Model ### Ingestion Config (config/api/*.yaml) Defines source-level extraction, publish targets, schedules, and column mapping. - -Example included: - -CLASHOFCLANS_PLAYER_STATS +Example included: CLASHOFCLANS_PLAYER_STATS ### dbt Config (config/dbt/projects.yaml) @@ -111,75 +119,71 @@ Defines notebook path, source table mapping, chart columns, and cadence. ### Analytics Semantic Config (config/analytics/semantic_catalog.yaml) Defines dataset metadata for the isolated NL-to-SQL module. 
+ ## Reusable Connectors The repository includes reusable connector classes under handlers/: -handlers/sharepoint/sharepoint.py - Microsoft Graph auth + site/file access helpers - -handlers/gcs/gcs.py - Upload/download/list/delete/signed URL helpers +handlers/sharepoint/sharepoint.py – Microsoft Graph auth + site/file access helpers +handlers/gcs/gcs.py – Upload/download/list/delete/signed URL helpers +handlers/s3/s3.py – Upload/download/list/delete/presigned URL helpers +handlers/bigquery/bigquery.py – Query, row fetch, dataframe load, schema helpers -handlers/s3/s3.py - Upload/download/list/delete/presigned URL helpers - -handlers/bigquery/bigquery.py - Query, row fetch, dataframe load, schema helpers ## Local LLM Analytics Module analytics/nl_query/ is an isolated module for natural-language-to-SQL generation using local Ollama: -Semantic catalog loader -SQL read-only safety guard -Ollama client wrapper -Orchestration service +* Semantic catalog loader +* SQL read-only safety guard +* Ollama client wrapper +* Orchestration service + ## Testing Run all tests with the following command: +```bash .venv/bin/python -m pytest -q +``` The current test suite includes coverage for: -Ingestion and handler behavior -Analytics factory and runner logic -Connector modules (SharePoint, GCS, S3, BigQuery) -Script behavior -NL-query safety and service paths +* Ingestion and handler behavior +* Analytics factory and runner logic +* Connector modules (SharePoint, GCS, S3, BigQuery) +* Script behavior +* NL-query safety and service paths + ## CI/CD and Branching -dev: integration branch -master: release/production branch +* dev: integration branch +* master: release/production branch Workflows: -CI: tests on development and PR flows -Docker Release: image build/publish on master -Deploy Release: workflow_run/manual deployment orchestration -## Containerization - -Container image is defined via Dockerfile. - -Default runtime command starts the Dagster gRPC server: +* CI: tests on development and PR flows +* Docker Release: image build/publish on master +* Deploy Release: workflow_run/manual deployment orchestration -python -m dagster api grpc -m dagster_op.repository ## Containerization Container image is defined via Dockerfile. Default runtime command starts the Dagster gRPC server: +```bash python -m dagster api grpc -m dagster_op.repository +``` + ## Deployment Deployment flow is workflow-based: -Production auto-path after successful Docker release -Manual staging/production dispatch path +* Production auto-path after successful Docker release +* Manual staging/production dispatch path + ## Detailed Technical Documentation For complete, long-form project documentation (operations, architecture, and runbook-style details), see: docs/document.md -
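
A possible follow-up to the test file introduced in [PATCH 2/5]: its closing comment says more complex tests can be added later, and the case most worth covering next is a config directory that contains genuinely broken YAML. The sketch below is illustrative and not part of the patch series; the test name, the sample YAML content, and the assumption that pytest runs from the repository root (as the existing subprocess-based tests already assume) are additions of this write-up, not taken from the patches.

```python
# Hypothetical follow-up test for tests/test_lint_configs.py (not part of the series).
import subprocess


def test_broken_yaml_reported(tmp_path):
    # Write a deliberately invalid YAML file into a temporary config directory.
    bad_file = tmp_path / "broken.yaml"
    bad_file.write_text("key: [unclosed, list\n", encoding="utf-8")

    # Run the linter the same way the existing tests do (from the repository root).
    result = subprocess.run(
        ["python", "scripts/lint_configs.py", "--path", str(tmp_path)],
        capture_output=True,
        text=True,
    )

    # The script should exit non-zero and name the offending file with a syntax hint.
    assert result.returncode == 1
    assert "Syntax Error" in result.stdout
    assert "broken.yaml" in result.stdout
```

Driving the script through subprocess keeps this sketch consistent with the existing black-box test style and exercises the same exit code that an automated pipeline would observe.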