From ca083dc60f6da8885ebb541f6fc2b59a600c6f49 Mon Sep 17 00:00:00 2001
From: Jaideep
Date: Sat, 14 Mar 2026 14:35:51 +0530
Subject: [PATCH 1/5] feat: add CLI config lint command

---
 scripts/lint_configs.py | 56 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 scripts/lint_configs.py

diff --git a/scripts/lint_configs.py b/scripts/lint_configs.py
new file mode 100644
index 0000000..b64111e
--- /dev/null
+++ b/scripts/lint_configs.py
@@ -0,0 +1,56 @@
+import os
+import argparse
+import yaml
+
+def lint_directory(config_dir):
+    print(f"🔍 Linting YAML files in '{config_dir}/'...\n")
+
+    error_count = 0
+    file_count = 0
+
+    # 1. Crawl: Walk through the given directory finding all files
+    for root, _, files in os.walk(config_dir):
+        for file in files:
+            # Only check YAML files
+            if file.endswith((".yaml", ".yml")):
+                file_count += 1
+                filepath = os.path.join(root, file)
+
+                # 2. Read: Open the file
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    try:
+                        # 3. Parse: Try to load the YAML
+                        yaml.safe_load(f)
+                    except yaml.YAMLError as exc:
+                        # 4. Report: Catch the error and print an actionable hint
+                        error_count += 1
+                        print(f"❌ Error found in: {filepath}")
+
+                        # PyYAML errors usually tell us the exact line number!
+                        if hasattr(exc, 'problem_mark'):
+                            mark = exc.problem_mark
+                            print(f"   Hint: Check line {mark.line + 1}, column {mark.column + 1}.")
+                            print(f"   Details: {exc.problem}\n")
+                        else:
+                            print(f"   Hint: {exc}\n")
+
+    # Final summary
+    if error_count == 0:
+        print(f"✅ Success! Checked {file_count} files and found no errors.")
+    else:
+        print(f"🚨 Failed: Found {error_count} broken config file(s).")
+        # Exit with code 1 so automated systems know the command failed
+        exit(1)
+
+if __name__ == "__main__":
+    # Set up the CLI command using standard Python
+    parser = argparse.ArgumentParser(description="Lint YAML configuration files for syntax errors.")
+    parser.add_argument(
+        "--path",
+        type=str,
+        default="config",
+        help="Path to the config directory (default is 'config')"
+    )
+
+    args = parser.parse_args()
+    lint_directory(args.path)
\ No newline at end of file

From 0ec3e083ca23046337ff3f01141a1e87014a3a0f Mon Sep 17 00:00:00 2001
From: Jaideep
Date: Sun, 15 Mar 2026 08:31:35 +0530
Subject: [PATCH 2/5] fix: add path validation, IO handling, and tests

---
 scripts/lint_configs.py    | 52 ++++++++++++++++----------------
 tests/test_lint_configs.py | 17 +++++++++++++
 2 files changed, 39 insertions(+), 30 deletions(-)
 create mode 100644 tests/test_lint_configs.py

diff --git a/scripts/lint_configs.py b/scripts/lint_configs.py
index b64111e..991e200 100644
--- a/scripts/lint_configs.py
+++ b/scripts/lint_configs.py
@@ -3,54 +3,46 @@
 import yaml
 
 def lint_directory(config_dir):
+    # --- FIX 1: Path Validation ---
+    if not os.path.isdir(config_dir):
+        print(f"🚨 Error: The path '{config_dir}' does not exist or is not a directory.")
+        exit(1)
+
     print(f"🔍 Linting YAML files in '{config_dir}/'...\n")
 
     error_count = 0
     file_count = 0
 
-    # 1. Crawl: Walk through the given directory finding all files
     for root, _, files in os.walk(config_dir):
         for file in files:
-            # Only check YAML files
             if file.endswith((".yaml", ".yml")):
                 file_count += 1
                 filepath = os.path.join(root, file)
 
-                # 2. Read: Open the file
-                with open(filepath, 'r', encoding='utf-8') as f:
-                    try:
-                        # 3. Parse: Try to load the YAML
+                # --- FIX 2: File Read Robustness ---
+                try:
+                    with open(filepath, 'r', encoding='utf-8') as f:
                         yaml.safe_load(f)
-                    except yaml.YAMLError as exc:
-                        # 4. Report: Catch the error and print an actionable hint
-                        error_count += 1
-                        print(f"❌ Error found in: {filepath}")
-
-                        # PyYAML errors usually tell us the exact line number!
-                        if hasattr(exc, 'problem_mark'):
-                            mark = exc.problem_mark
-                            print(f"   Hint: Check line {mark.line + 1}, column {mark.column + 1}.")
-                            print(f"   Details: {exc.problem}\n")
-                        else:
-                            print(f"   Hint: {exc}\n")
+                except OSError as e:
+                    error_count += 1
+                    print(f"❌ IO Error in: {filepath}\n   Details: {e}\n")
+                except yaml.YAMLError as exc:
+                    error_count += 1
+                    print(f"❌ Syntax Error in: {filepath}")
+                    if hasattr(exc, 'problem_mark'):
+                        mark = exc.problem_mark
+                        print(f"   Hint: Check line {mark.line + 1}, column {mark.column + 1}.\n")
+                    else:
+                        print(f"   Details: {exc}\n")
 
-    # Final summary
     if error_count == 0:
         print(f"✅ Success! Checked {file_count} files and found no errors.")
     else:
-        print(f"🚨 Failed: Found {error_count} broken config file(s).")
-        # Exit with code 1 so automated systems know the command failed
+        print(f"🚨 Failed: Found {error_count} error(s).")
         exit(1)
 
 if __name__ == "__main__":
-    # Set up the CLI command using standard Python
-    parser = argparse.ArgumentParser(description="Lint YAML configuration files for syntax errors.")
-    parser.add_argument(
-        "--path",
-        type=str,
-        default="config",
-        help="Path to the config directory (default is 'config')"
-    )
-
+    parser = argparse.ArgumentParser(description="Lint YAML configuration files.")
+    parser.add_argument("--path", type=str, default="config", help="Path to config directory")
     args = parser.parse_args()
     lint_directory(args.path)
\ No newline at end of file
diff --git a/tests/test_lint_configs.py b/tests/test_lint_configs.py
new file mode 100644
index 0000000..a2f2ab8
--- /dev/null
+++ b/tests/test_lint_configs.py
@@ -0,0 +1,17 @@
+import pytest
+import subprocess
+import os
+
+def test_lint_success():
+    # Tests a valid directory (the default 'config' folder)
+    result = subprocess.run(["python", "scripts/lint_configs.py", "--path", "config"], capture_output=True, text=True)
+    assert result.returncode == 0
+    assert "Success" in result.stdout
+
+def test_invalid_path():
+    # Tests a non-existent directory
+    result = subprocess.run(["python", "scripts/lint_configs.py", "--path", "does-not-exist"], capture_output=True, text=True)
+    assert result.returncode == 1
+    assert "Error: The path" in result.stdout
+
+# You can add more complex tests here later, but this covers the 'fail fast' requirement!
\ No newline at end of file From db7b207a2a428f364cd4d1b5d306dbc84feda797 Mon Sep 17 00:00:00 2001 From: Mohamed Jadla Date: Sun, 15 Mar 2026 04:26:38 +0100 Subject: [PATCH 3/5] docs: small README improvements --- README.md | 141 +++++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index 820cb00..6dfe109 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ + # DataHelm DataHelm is a data engineering framework focused on the following: -- source ingestion and orchestration +- Source ingestion and orchestration - dbt transformation workflows -- notebook-based dashboard execution -- reusable provider connectors (SharePoint, GCS, S3, and BigQuery) -- optional local LLM analytics query scaffolding +- Notebook-based dashboard execution +- Reusable provider connectors (SharePoint, GCS, S3, and BigQuery) +- Optional local LLM analytics query scaffolding ![DataHelm Architecture](https://github.com/DevStrikerTech/datahelm/blob/master/docs/architecture.png?raw=true) @@ -53,146 +54,132 @@ ingestion/ tests/ scripts/ docs/ -``` - ## Local Setup ### Prerequisites -- Python 3.12+ -- PostgreSQL (accessible from the local environment) -- Optional: Docker, local Ollama, dbt CLI +Python 3.12+ + +PostgreSQL (accessible from the local environment) +Optional: Docker, local Ollama, dbt CLI ### Installation -```bash +Run the following commands to set up the local environment: + python3 -m venv .venv source .venv/bin/activate pip install --upgrade pip pip install -e . -``` - ### Environment Variables -Create a `.env` file in the repository root with the required values, for example: +Create a file named `.env` in the root of the repository with the required values, for example: -```env -DB_HOST=${DB_HOST} -DB_PORT=${DB_PORT} -DB_USER=${DB_USER} -DB_PASSWORD=${DB_PASSWORD} -DB_NAME=${DB_NAME} +DB_HOST=${DB_HOST} +DB_PORT=${DB_PORT} +DB_USER=${DB_USER} +DB_PASSWORD=${DB_PASSWORD} +DB_NAME=${DB_NAME} CLASHOFCLANS_API_TOKEN=${CLASHOFCLANS_API_TOKEN} -``` - ### Run Dagster Locally -```bash +To start Dagster locally, run: + python scripts/run_dagster_dev.py -``` -Useful option for quick verification: +For a quick verification without executing jobs, run: -```bash python scripts/run_dagster_dev.py --print-only -``` - ## Configuration Model -### Ingestion Config (`config/api/*.yaml`) +### Ingestion Config (config/api/*.yaml) Defines source-level extraction, publish targets, schedules, and column mapping. -Example currently included: +Example included: -- `CLASHOFCLANS_PLAYER_STATS` +CLASHOFCLANS_PLAYER_STATS -### dbt Config (`config/dbt/projects.yaml`) +### dbt Config (config/dbt/projects.yaml) Defines dbt units, selection/exclusion rules, vars, and schedules. -### Dashboard Config (`config/dashboard/projects.yaml`) +### Dashboard Config (config/dashboard/projects.yaml) Defines notebook path, source table mapping, chart columns, and cadence. -### Analytics Semantic Config (`config/analytics/semantic_catalog.yaml`) +### Analytics Semantic Config (config/analytics/semantic_catalog.yaml) Defines dataset metadata for the isolated NL-to-SQL module. 
- ## Reusable Connectors -The repository includes reusable connector classes under `handlers/`: +The repository includes reusable connector classes under handlers/: -- `handlers/sharepoint/sharepoint.py` - - Microsoft Graph auth + site/file access helpers -- `handlers/gcs/gcs.py` - - upload/download/list/delete/signed URL helpers -- `handlers/s3/s3.py` - - upload/download/list/delete/presigned URL helpers -- `handlers/bigquery/bigquery.py` - - query, row fetch, dataframe load, schema helpers +handlers/sharepoint/sharepoint.py + Microsoft Graph auth + site/file access helpers -## Local LLM Analytics Module +handlers/gcs/gcs.py + Upload/download/list/delete/signed URL helpers + +handlers/s3/s3.py + Upload/download/list/delete/presigned URL helpers -`analytics/nl_query/` is an isolated module for natural-language-to-SQL generation using local Ollama: +handlers/bigquery/bigquery.py + Query, row fetch, dataframe load, schema helpers +## Local LLM Analytics Module -- semantic catalog loader -- SQL read-only safety guard -- Ollama client wrapper -- orchestration service +analytics/nl_query/ is an isolated module for natural-language-to-SQL generation using local Ollama: +Semantic catalog loader +SQL read-only safety guard +Ollama client wrapper +Orchestration service ## Testing -Run all tests: +Run all tests with the following command: -```bash .venv/bin/python -m pytest -q -``` The current test suite includes coverage for: -- ingestion and handler behavior -- analytics factory and runner logic -- connector modules (SharePoint, GCS, S3, BigQuery) -- script behavior -- NL-query safety and service paths - +Ingestion and handler behavior +Analytics factory and runner logic +Connector modules (SharePoint, GCS, S3, BigQuery) +Script behavior +NL-query safety and service paths ## CI/CD and Branching -- `dev`: integration branch -- `master`: release/production branch +dev: integration branch +master: release/production branch Workflows: -- **CI**: tests on development and PR flows -- **Docker Release**: image build/publish on `master` -- **Deploy Release**: workflow_run/manual deployment orchestration - +CI: tests on development and PR flows +Docker Release: image build/publish on master +Deploy Release: workflow_run/manual deployment orchestration ## Containerization -Container image is defined via `Dockerfile`. +Container image is defined via Dockerfile. Default runtime command starts the Dagster gRPC server: -```bash python -m dagster api grpc -m dagster_op.repository -``` - -## Deployment +## Containerization -Deployment flow is workflow-based: +Container image is defined via Dockerfile. 
-- production auto-path after successful Docker release -- manual staging/production dispatch path +Default runtime command starts the Dagster gRPC server: -## Contributing and Governance +python -m dagster api grpc -m dagster_op.repository +## Deployment -- Contribution guide: `CONTRIBUTING.md` -- Code of conduct: `CODE_OF_CONDUCT.md` -- Security reporting: `SECURITY.md` +Deployment flow is workflow-based: +Production auto-path after successful Docker release +Manual staging/production dispatch path ## Detailed Technical Documentation For complete, long-form project documentation (operations, architecture, and runbook-style details), see: -- `docs/document.md` +docs/document.md + From 3425f156f8c905eaebaf3f0622358e56955c1c78 Mon Sep 17 00:00:00 2001 From: Mohamed Jadla Date: Sun, 15 Mar 2026 04:30:19 +0100 Subject: [PATCH 4/5] docs: small README improvements From 1516e1e088352866e10ec39114dd39e685525b42 Mon Sep 17 00:00:00 2001 From: Mohamed Jadla Date: Sun, 15 Mar 2026 04:50:06 +0100 Subject: [PATCH 5/5] docs: small README improvements --- README.md | 94 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 6dfe109..cc2f0cf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - # DataHelm DataHelm is a data engineering framework focused on the following: @@ -54,51 +53,60 @@ ingestion/ tests/ scripts/ docs/ +```` + ## Local Setup ### Prerequisites Python 3.12+ - PostgreSQL (accessible from the local environment) - Optional: Docker, local Ollama, dbt CLI + ### Installation Run the following commands to set up the local environment: +```bash python3 -m venv .venv source .venv/bin/activate pip install --upgrade pip pip install -e . +``` + ### Environment Variables Create a file named `.env` in the root of the repository with the required values, for example: -DB_HOST=${DB_HOST} -DB_PORT=${DB_PORT} -DB_USER=${DB_USER} -DB_PASSWORD=${DB_PASSWORD} -DB_NAME=${DB_NAME} +```text +DB_HOST=${DB_HOST} +DB_PORT=${DB_PORT} +DB_USER=${DB_USER} +DB_PASSWORD=${DB_PASSWORD} +DB_NAME=${DB_NAME} CLASHOFCLANS_API_TOKEN=${CLASHOFCLANS_API_TOKEN} +``` + ### Run Dagster Locally To start Dagster locally, run: +```bash python scripts/run_dagster_dev.py +``` For a quick verification without executing jobs, run: +```bash python scripts/run_dagster_dev.py --print-only +``` + ## Configuration Model ### Ingestion Config (config/api/*.yaml) Defines source-level extraction, publish targets, schedules, and column mapping. - -Example included: - -CLASHOFCLANS_PLAYER_STATS +Example included: CLASHOFCLANS_PLAYER_STATS ### dbt Config (config/dbt/projects.yaml) @@ -111,75 +119,71 @@ Defines notebook path, source table mapping, chart columns, and cadence. ### Analytics Semantic Config (config/analytics/semantic_catalog.yaml) Defines dataset metadata for the isolated NL-to-SQL module. 
+ ## Reusable Connectors The repository includes reusable connector classes under handlers/: -handlers/sharepoint/sharepoint.py - Microsoft Graph auth + site/file access helpers - -handlers/gcs/gcs.py - Upload/download/list/delete/signed URL helpers +handlers/sharepoint/sharepoint.py – Microsoft Graph auth + site/file access helpers +handlers/gcs/gcs.py – Upload/download/list/delete/signed URL helpers +handlers/s3/s3.py – Upload/download/list/delete/presigned URL helpers +handlers/bigquery/bigquery.py – Query, row fetch, dataframe load, schema helpers -handlers/s3/s3.py - Upload/download/list/delete/presigned URL helpers - -handlers/bigquery/bigquery.py - Query, row fetch, dataframe load, schema helpers ## Local LLM Analytics Module analytics/nl_query/ is an isolated module for natural-language-to-SQL generation using local Ollama: -Semantic catalog loader -SQL read-only safety guard -Ollama client wrapper -Orchestration service +* Semantic catalog loader +* SQL read-only safety guard +* Ollama client wrapper +* Orchestration service + ## Testing Run all tests with the following command: +```bash .venv/bin/python -m pytest -q +``` The current test suite includes coverage for: -Ingestion and handler behavior -Analytics factory and runner logic -Connector modules (SharePoint, GCS, S3, BigQuery) -Script behavior -NL-query safety and service paths +* Ingestion and handler behavior +* Analytics factory and runner logic +* Connector modules (SharePoint, GCS, S3, BigQuery) +* Script behavior +* NL-query safety and service paths + ## CI/CD and Branching -dev: integration branch -master: release/production branch +* dev: integration branch +* master: release/production branch Workflows: -CI: tests on development and PR flows -Docker Release: image build/publish on master -Deploy Release: workflow_run/manual deployment orchestration -## Containerization - -Container image is defined via Dockerfile. - -Default runtime command starts the Dagster gRPC server: +* CI: tests on development and PR flows +* Docker Release: image build/publish on master +* Deploy Release: workflow_run/manual deployment orchestration -python -m dagster api grpc -m dagster_op.repository ## Containerization Container image is defined via Dockerfile. Default runtime command starts the Dagster gRPC server: +```bash python -m dagster api grpc -m dagster_op.repository +``` + ## Deployment Deployment flow is workflow-based: -Production auto-path after successful Docker release -Manual staging/production dispatch path +* Production auto-path after successful Docker release +* Manual staging/production dispatch path + ## Detailed Technical Documentation For complete, long-form project documentation (operations, architecture, and runbook-style details), see: docs/document.md -
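
A possible follow-up to the test file introduced in [PATCH 2/5]: its closing comment says more complex tests can be added later, and the case most worth covering next is a config directory that contains genuinely broken YAML. The sketch below is illustrative and not part of the patch series; the test name, the sample YAML content, and the assumption that pytest runs from the repository root (as the existing subprocess-based tests already assume) are additions of this write-up, not taken from the patches.

```python
# Hypothetical follow-up test for tests/test_lint_configs.py (not part of the series).
import subprocess


def test_broken_yaml_reported(tmp_path):
    # Write a deliberately invalid YAML file into a temporary config directory.
    bad_file = tmp_path / "broken.yaml"
    bad_file.write_text("key: [unclosed, list\n", encoding="utf-8")

    # Run the linter the same way the existing tests do (from the repository root).
    result = subprocess.run(
        ["python", "scripts/lint_configs.py", "--path", str(tmp_path)],
        capture_output=True,
        text=True,
    )

    # The script should exit non-zero and name the offending file with a syntax hint.
    assert result.returncode == 1
    assert "Syntax Error" in result.stdout
    assert "broken.yaml" in result.stdout
```

Driving the script through subprocess keeps this sketch consistent with the existing black-box test style and exercises the same exit code that an automated pipeline would observe.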