From 7081923e4f0c59b340849f85e9e1759bc352a655 Mon Sep 17 00:00:00 2001
From: Lasse Benninga <devops.pipeline@example.com>
Date: Wed, 24 Jun 2026 23:42:40 +0200
Subject: [PATCH 1/2] feat: fill Week 9 SQL-for-Analytics assignment scaffold

Replace the boilerplate template with the real Week 9 assignment:
- README with the scenario, the five tasks, the deliverables table, and
  the grading model (completeness auto-grade + teacher rubric review).
- Starter deliverables matching the curriculum chapter: validation_queries.sql,
  schema_setup.sql (CREATE OR REPLACE VIEW skeletons), verification_results.sql,
  data_dictionary.md, AI_ASSIST.md, and assets/ for the borough screenshot.
- .hyf/test.sh as a completeness smoke check (no live SQL): it gates each task
  on the starter TODOs being filled in, so the untouched scaffold scores 0/fail
  and a complete submission scores 100/pass. Final grade is teacher review.
- Remove the placeholder task-1/ and task-2/ directories.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .hyf/test.sh             | 69 ++++++++++++++++++++++++++++++++++++----
 AI_ASSIST.md             | 21 ++++++++++++
 README.md                | 43 +++++++++++++++++++------
 assets/.gitkeep          |  1 +
 data_dictionary.md       | 17 ++++++++++
 schema_setup.sql         | 26 +++++++++++++++
 task-1/task 1 files      |  0
 task-2/task 2 files      |  0
 validation_queries.sql   | 20 ++++++++++++
 verification_results.sql | 22 +++++++++++++
 10 files changed, 202 insertions(+), 17 deletions(-)
 mode change 100644 => 100755 .hyf/test.sh
 create mode 100644 AI_ASSIST.md
 create mode 100644 assets/.gitkeep
 create mode 100644 data_dictionary.md
 create mode 100644 schema_setup.sql
 delete mode 100644 task-1/task 1 files
 delete mode 100644 task-2/task 2 files
 create mode 100644 validation_queries.sql
 create mode 100644 verification_results.sql

diff --git a/.hyf/test.sh b/.hyf/test.sh
old mode 100644
new mode 100755
index ee037fc..5aa9fd7
--- a/.hyf/test.sh
+++ b/.hyf/test.sh
@@ -1,13 +1,68 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-# Run your test scripts here.
-# Auto grade tool will execute this file within the .hyf working directory.
-# The result should be stored in score.json file with the format shown below.
-cat << EOF > score.json
+# Week 9 is a SQL assignment, graded by teacher review against the rubric.
+# This auto-grade is a COMPLETENESS smoke check only: it confirms every required
+# deliverable exists, is non-empty, and has had its TODO placeholders filled in.
+# It does NOT run SQL against a database, and it is NOT the final grade.
+#
+# The tool runs this script from the .hyf working directory and reads .hyf/score.json,
+# so we resolve the repo root explicitly and write score.json next to this script.
+
+HERE="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$HERE/.." && pwd)"
+score=0
+
+# A deliverable counts as "done" only when it exists, is non-empty, and has no TODO left.
+# This is what makes the untouched scaffold score 0: every starter file is full of TODOs.
+done_file() {
+  local f="$ROOT/$1"
+  [ -s "$f" ] && ! grep -qiE "todo" "$f"
+}
+
+# Task 1 (20): validation_queries.sql filled, with the expected check patterns.
+if done_file validation_queries.sql; then
+  score=$((score + 8))
+  grep -qiE "having[[:space:]]+count" "$ROOT/validation_queries.sql" && score=$((score + 4))
+  grep -qiE "is[[:space:]]+null"      "$ROOT/validation_queries.sql" && score=$((score + 4))
+  grep -qiE "min\(|max\("             "$ROOT/validation_queries.sql" && score=$((score + 4))
+fi
+
+# Task 2 (30): schema_setup.sql creates both views and references fares.
+if done_file schema_setup.sql; then
+  score=$((score + 6))
+  grep -qiE "view[[:space:]]+vw_dim_zones"  "$ROOT/schema_setup.sql" && score=$((score + 8))
+  grep -qiE "view[[:space:]]+vw_fact_trips" "$ROOT/schema_setup.sql" && score=$((score + 8))
+  grep -qiE "fare_amount" "$ROOT/schema_setup.sql" && score=$((score + 8))
+fi
+
+# Task 3 (20): data_dictionary.md filled and states a grain.
+if done_file data_dictionary.md; then
+  score=$((score + 14))
+  grep -qiE "grain" "$ROOT/data_dictionary.md" && score=$((score + 6))
+fi
+
+# Task 4 (20): verification_results.sql filled + borough screenshot present.
+if done_file verification_results.sql; then
+  score=$((score + 10))
+  grep -qiE "borough" "$ROOT/verification_results.sql" && score=$((score + 5))
+fi
+[ -f "$ROOT/assets/borough_count.png" ] && score=$((score + 5))
+
+# Task 5 (10): AI_ASSIST.md filled.
+if done_file AI_ASSIST.md; then
+  score=$((score + 10))
+fi
+
+[ "$score" -gt 100 ] && score=100
+if [ "$score" -ge 60 ]; then pass=true; else pass=false; fi
+
+cat > "$HERE/score.json" <<EOF
 {
-  "score": 0,
-  "pass": true,
-  "passingScore": 0
+  "score": ${score},
+  "pass": ${pass},
+  "passingScore": 60
 }
 EOF
+
+echo "Completeness score: ${score}/100 (pass=${pass}). Final grade is teacher review against the rubric."
diff --git a/AI_ASSIST.md b/AI_ASSIST.md
new file mode 100644
index 0000000..07efd00
--- /dev/null
+++ b/AI_ASSIST.md
@@ -0,0 +1,21 @@
+# AI Assistance Log
+
+Document one session where you used an LLM to help with a query or a design decision while completing Tasks 1-4. Replace every TODO.
+
+> ⚠️ Never paste real customer data or PII into an LLM. The NYC taxi dataset used here is public, so sample rows are safe to share.
+
+## The problem
+
+TODO: What were you trying to solve? Paste the relevant SQL or schema fragment.
+
+## The prompt
+
+TODO: What did you ask the AI? Include the context you provided.
+
+## The response
+
+TODO: What did it suggest? Did it work first try?
+
+## Reflection
+
+TODO: Did you understand *why* the suggestion worked, or did you accept it blindly?
diff --git a/README.md b/README.md
index 96ce7bc..65e861c 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,40 @@
-# [Track] week X assignment
-HackYourFuture <Track> week X assignment
-The Week X assignment for the HackYourFuture <TRACK> can be found at the following link: [TODO: Assignment url in the learning platform]
+# Data Track Week 9 Assignment: SQL for Analytics
 
+HackYourFuture Data Track, Week 9. The full brief (scenario, tasks, and grading) lives in the curriculum: **Week 9 → Assignment** in the HackYourFuture learning platform. This repo holds the starter files you fill in.
 
-## Implementation Instructions
+You audit the raw NYC taxi data, model it as a star schema of SQL **views**, and document it. Run every query against **your own assigned schema** on the shared Azure PostgreSQL instance, not the shared `public` schema. The data is two tables: `raw_trips` (~57K green-taxi trips, January 2024) and `raw_zones` (265 location lookups).
 
-Provide clear instructions on how trainees should implement the tasks.
+## What you submit
 
-### Task 1
-Instructions for Task 1
+Fill in these files (starters are provided). Keep them at the repo root and do not rename them.
 
-### Task 2
-Instructions for Task 2
+| File | Task | What it holds |
+|---|---|---|
+| `validation_queries.sql` | Task 1 | Data-quality audit: duplicates, nulls, range, orphaned keys |
+| `schema_setup.sql` | Task 2 | `CREATE OR REPLACE VIEW vw_dim_zones` and `vw_fact_trips` |
+| `data_dictionary.md` | Task 3 | Grain, keys, and measures for both views |
+| `verification_results.sql` | Task 4 | Verification queries (volume, revenue, geospatial, time patterns) |
+| `assets/borough_count.png` | Task 4 | Screenshot of the per-borough row-count result |
+| `AI_ASSIST.md` | Task 5 | One documented LLM session |
 
-...
+## Tasks (summary)
 
+1. **Data Quality Audit** (`validation_queries.sql`): find duplicate trips, count NULL pickup/dropoff location IDs, check the `fare_amount` range for negatives, and find `pickup_location_id` values not present in `raw_zones`.
+2. **Star Schema Views** (`schema_setup.sql`): `vw_dim_zones` (one row per `location_id`, the primary key) and `vw_fact_trips` (one row per trip; exclude `fare_amount < 0`; cast `pickup_datetime` to `TIMESTAMP`; keep the location IDs so it joins to `vw_dim_zones`).
+3. **Data Dictionary** (`data_dictionary.md`): state each view's grain in one sentence, identify keys, list measures.
+4. **Verification Queries** (`verification_results.sql`): query the views for volume, revenue, geospatial, and time-pattern questions, joining through `vw_dim_zones` for any borough/zone name. Save a screenshot of the per-borough counts to `assets/borough_count.png`.
+5. **AI Assistance Log** (`AI_ASSIST.md`): document one LLM session honestly.
+
+## How you are graded
+
+- **Auto-grade (on PR creation):** a **completeness** smoke check confirms every required deliverable exists, is non-empty, and contains the expected views and checks. It does **not** run SQL against a database and is **not** your final grade.
+- **Teacher review:** your teacher grades correctness against the rubric: do the queries run, do findings match the real data, does `vw_fact_trips` filter negatives and join cleanly, is the grain stated precisely.
+
+## Submit
+
+1. Work on a branch in your copy of this repo.
+2. Fill in each deliverable file.
+3. Commit, push, and open a Pull Request against `main`. The auto-grade runs on PR creation and posts a completeness score.
+4. Share the PR URL with your teacher.
+
+> ⚠️ Never paste real customer data or PII into an LLM. The NYC taxi dataset used here is public and safe to share.
diff --git a/assets/.gitkeep b/assets/.gitkeep
new file mode 100644
index 0000000..3e4d424
--- /dev/null
+++ b/assets/.gitkeep
@@ -0,0 +1 @@
+# Save your Task 4 screenshot here as borough_count.png
diff --git a/data_dictionary.md b/data_dictionary.md
new file mode 100644
index 0000000..5e44612
--- /dev/null
+++ b/data_dictionary.md
@@ -0,0 +1,17 @@
+# Data Dictionary
+
+Document both views. State the grain in one sentence, identify the keys, and list the measures (the columns you can aggregate). Replace every TODO.
+
+## vw_fact_trips
+
+- **Grain:** TODO (one sentence, e.g. "One row per ...")
+- **Primary key:** TODO
+- **Foreign keys:** TODO
+- **Measures:** TODO (columns you would SUM or AVG)
+
+## vw_dim_zones
+
+- **Grain:** TODO
+- **Primary key:** TODO
+- **Foreign keys:** TODO (or "none")
+- **Measures:** TODO (or "none, descriptive attributes only")
diff --git a/schema_setup.sql b/schema_setup.sql
new file mode 100644
index 0000000..6818dcb
--- /dev/null
+++ b/schema_setup.sql
@@ -0,0 +1,26 @@
+-- Task 2: Star Schema Views (create these in YOUR OWN schema, not public).
+-- CREATE OR REPLACE VIEW lets you re-run this script while you iterate.
+
+-- Dimension: one row per location_id. Treat location_id as the primary key.
+-- TODO: complete the SELECT (location_id, zone, borough).
+CREATE OR REPLACE VIEW vw_dim_zones AS
+SELECT
+    -- TODO
+FROM raw_zones;
+
+-- Fact: one row per taxi trip.
+--   - Exclude rows where fare_amount is less than 0.
+--   - Cast pickup_datetime to TIMESTAMP.
+--   - Keep the location IDs so the view can join to vw_dim_zones.
+-- TODO: complete the SELECT and the WHERE.
+CREATE OR REPLACE VIEW vw_fact_trips AS
+SELECT
+    -- TODO
+FROM raw_trips
+-- TODO: WHERE fare_amount >= 0
+;
+
+-- Join-readiness test (run after creating the views; it must run without error
+-- and return a count close to the vw_fact_trips row count):
+-- SELECT COUNT(*) FROM vw_fact_trips f
+-- JOIN vw_dim_zones d ON f.pickup_location_id = d.location_id;
diff --git a/task-1/task 1 files b/task-1/task 1 files
deleted file mode 100644
index e69de29..0000000
diff --git a/task-2/task 2 files b/task-2/task 2 files
deleted file mode 100644
index e69de29..0000000
diff --git a/validation_queries.sql b/validation_queries.sql
new file mode 100644
index 0000000..fc470cd
--- /dev/null
+++ b/validation_queries.sql
@@ -0,0 +1,20 @@
+-- Task 1: Data Quality Audit
+-- Run every query against raw_trips / raw_zones in YOUR OWN schema (not public).
+-- The shared pattern is a query that returns the bad rows (or a count).
+-- Zero rows back means the check passed.
+
+-- 1. Duplicate check: are there rows with the same vendor_id, pickup_datetime, dropoff_datetime?
+-- TODO: GROUP BY the three columns and keep only groups with HAVING COUNT(*) > 1.
+
+
+-- 2. Null integrity: how many rows have a NULL pickup_location_id or dropoff_location_id?
+-- TODO: count the NULLs (COUNT(*) FILTER (WHERE ... IS NULL) is handy for several columns at once).
+
+
+-- 3. Range validation: what are the min and max fare_amount? Are there negative values?
+-- TODO: SELECT MIN(fare_amount), MAX(fare_amount), and a count of rows where fare_amount < 0.
+
+
+-- 4. Relationship check: which pickup_location_id values in raw_trips do NOT exist in raw_zones?
+-- TODO: LEFT JOIN raw_zones ... WHERE z.location_id IS NULL  (or NOT EXISTS).
+-- Do NOT use NOT IN: a single NULL in the subquery hides every orphan.
diff --git a/verification_results.sql b/verification_results.sql
new file mode 100644
index 0000000..68a8d26
--- /dev/null
+++ b/verification_results.sql
@@ -0,0 +1,22 @@
+-- Task 4: Verification Queries.
+-- Query your views and label each query with the question it answers.
+-- Borough and zone names live in vw_dim_zones, so join on pickup_location_id = location_id.
+
+-- 1. Volume: how many total rows in vw_fact_trips? How many rows per borough?
+--    What is the most common pickup/dropoff location combination?
+-- TODO
+-- (Take a screenshot of the per-borough counts and save it as assets/borough_count.png.)
+
+
+-- 2. Revenue: which pickup zone (name, not ID) generated the highest total fare_amount?
+--    Which pickup zone collected the highest total fare_amount on any single day?
+-- TODO
+
+
+-- 3. Geospatial: total number of trips and average trip_distance for each borough.
+-- TODO
+
+
+-- 4. Time patterns: which day of the week had the highest total tip_amount?
+--    What hour of the day has the highest average tip?
+-- TODO

From f62a12083345b747e9a97ef2aeecefa5aaae1854 Mon Sep 17 00:00:00 2001
From: Lasse Benninga <devops.pipeline@example.com>
Date: Thu, 25 Jun 2026 09:27:21 +0200
Subject: [PATCH 2/2] fix: qualify raw tables as nyc_taxi.raw_trips/raw_zones

The shared class DB moved raw_trips/raw_zones from public to the nyc_taxi
schema. Update the scaffold SQL, README, and data dictionary so students
read from nyc_taxi.*, matching the Week 9 chapters. Autograder patterns
are unaffected (they key on HAVING/IS NULL/vw_ names/fare_amount, not the
source schema); scaffold still scores 0/100 until TODOs are completed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md              | 4 ++--
 schema_setup.sql       | 4 ++--
 validation_queries.sql | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 65e861c..ae7ba18 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 HackYourFuture Data Track, Week 9. The full brief (scenario, tasks, and grading) lives in the curriculum: **Week 9 → Assignment** in the HackYourFuture learning platform. This repo holds the starter files you fill in.
 
-You audit the raw NYC taxi data, model it as a star schema of SQL **views**, and document it. Run every query against **your own assigned schema** on the shared Azure PostgreSQL instance, not the shared `public` schema. The data is two tables: `raw_trips` (~57K green-taxi trips, January 2024) and `raw_zones` (265 location lookups).
+You audit the raw NYC taxi data, model it as a star schema of SQL **views**, and document it. Run every query against **your own assigned schema** on the shared Azure PostgreSQL instance, not the shared `public` schema. The data is two tables: `nyc_taxi.raw_trips` (~57K green-taxi trips, January 2024) and `nyc_taxi.raw_zones` (265 location lookups).
 
 ## What you submit
 
@@ -19,7 +19,7 @@ Fill in these files (starters are provided). Keep them at the repo root and do n
 
 ## Tasks (summary)
 
-1. **Data Quality Audit** (`validation_queries.sql`): find duplicate trips, count NULL pickup/dropoff location IDs, check the `fare_amount` range for negatives, and find `pickup_location_id` values not present in `raw_zones`.
+1. **Data Quality Audit** (`validation_queries.sql`): find duplicate trips, count NULL pickup/dropoff location IDs, check the `fare_amount` range for negatives, and find `pickup_location_id` values not present in `nyc_taxi.raw_zones`.
 2. **Star Schema Views** (`schema_setup.sql`): `vw_dim_zones` (one row per `location_id`, the primary key) and `vw_fact_trips` (one row per trip; exclude `fare_amount < 0`; cast `pickup_datetime` to `TIMESTAMP`; keep the location IDs so it joins to `vw_dim_zones`).
 3. **Data Dictionary** (`data_dictionary.md`): state each view's grain in one sentence, identify keys, list measures.
 4. **Verification Queries** (`verification_results.sql`): query the views for volume, revenue, geospatial, and time-pattern questions, joining through `vw_dim_zones` for any borough/zone name. Save a screenshot of the per-borough counts to `assets/borough_count.png`.
diff --git a/schema_setup.sql b/schema_setup.sql
index 6818dcb..a7ae1ad 100644
--- a/schema_setup.sql
+++ b/schema_setup.sql
@@ -6,7 +6,7 @@
 CREATE OR REPLACE VIEW vw_dim_zones AS
 SELECT
     -- TODO
-FROM raw_zones;
+FROM nyc_taxi.raw_zones;
 
 -- Fact: one row per taxi trip.
 --   - Exclude rows where fare_amount is less than 0.
@@ -16,7 +16,7 @@ FROM raw_zones;
 CREATE OR REPLACE VIEW vw_fact_trips AS
 SELECT
     -- TODO
-FROM raw_trips
+FROM nyc_taxi.raw_trips
 -- TODO: WHERE fare_amount >= 0
 ;
 
diff --git a/validation_queries.sql b/validation_queries.sql
index fc470cd..301b194 100644
--- a/validation_queries.sql
+++ b/validation_queries.sql
@@ -1,5 +1,5 @@
 -- Task 1: Data Quality Audit
--- Run every query against raw_trips / raw_zones in YOUR OWN schema (not public).
+-- Run every query against nyc_taxi.raw_trips / nyc_taxi.raw_zones in YOUR OWN schema (not public).
 -- The shared pattern is a query that returns the bad rows (or a count).
 -- Zero rows back means the check passed.
 
@@ -15,6 +15,6 @@
 -- TODO: SELECT MIN(fare_amount), MAX(fare_amount), and a count of rows where fare_amount < 0.
 
 
--- 4. Relationship check: which pickup_location_id values in raw_trips do NOT exist in raw_zones?
--- TODO: LEFT JOIN raw_zones ... WHERE z.location_id IS NULL  (or NOT EXISTS).
+-- 4. Relationship check: which pickup_location_id values in nyc_taxi.raw_trips do NOT exist in nyc_taxi.raw_zones?
+-- TODO: LEFT JOIN nyc_taxi.raw_zones ... WHERE z.location_id IS NULL  (or NOT EXISTS).
 -- Do NOT use NOT IN: a single NULL in the subquery hides every orphan.