From 7081923e4f0c59b340849f85e9e1759bc352a655 Mon Sep 17 00:00:00 2001 From: Lasse Benninga Date: Wed, 24 Jun 2026 23:42:40 +0200 Subject: [PATCH 1/2] feat: fill Week 9 SQL-for-Analytics assignment scaffold Replace the boilerplate template with the real Week 9 assignment: - README with the scenario, the five tasks, the deliverables table, and the grading model (completeness auto-grade + teacher rubric review). - Starter deliverables matching the curriculum chapter: validation_queries.sql, schema_setup.sql (CREATE OR REPLACE VIEW skeletons), verification_results.sql, data_dictionary.md, AI_ASSIST.md, and assets/ for the borough screenshot. - .hyf/test.sh as a completeness smoke check (no live SQL): it gates each task on the starter TODOs being filled in, so the untouched scaffold scores 0/fail and a complete submission scores 100/pass. Final grade is teacher review. - Remove the placeholder task-1/ and task-2/ directories. Co-Authored-By: Claude Opus 4.8 (1M context) --- .hyf/test.sh | 69 ++++++++++++++++++++++++++++++++++++---- AI_ASSIST.md | 21 ++++++++++++ README.md | 43 +++++++++++++++++++------ assets/.gitkeep | 1 + data_dictionary.md | 17 ++++++++++ schema_setup.sql | 26 +++++++++++++++ task-1/task 1 files | 0 task-2/task 2 files | 0 validation_queries.sql | 20 ++++++++++++ verification_results.sql | 22 +++++++++++++ 10 files changed, 202 insertions(+), 17 deletions(-) mode change 100644 => 100755 .hyf/test.sh create mode 100644 AI_ASSIST.md create mode 100644 assets/.gitkeep create mode 100644 data_dictionary.md create mode 100644 schema_setup.sql delete mode 100644 task-1/task 1 files delete mode 100644 task-2/task 2 files create mode 100644 validation_queries.sql create mode 100644 verification_results.sql diff --git a/.hyf/test.sh b/.hyf/test.sh old mode 100644 new mode 100755 index ee037fc..5aa9fd7 --- a/.hyf/test.sh +++ b/.hyf/test.sh @@ -1,13 +1,68 @@ #!/usr/bin/env bash set -euo pipefail -# Run your test scripts here. 
-# Auto grade tool will execute this file within the .hyf working directory. -# The result should be stored in score.json file with the format shown below. -cat << EOF > score.json +# Week 9 is a SQL assignment, graded by teacher review against the rubric. +# This auto-grade is a COMPLETENESS smoke check only: it confirms every required +# deliverable exists, is non-empty, and has had its TODO placeholders filled in. +# It does NOT run SQL against a database, and it is NOT the final grade. +# +# The tool runs this script from the .hyf working directory and reads .hyf/score.json, +# so we resolve the repo root explicitly and write score.json next to this script. + +HERE="$(cd "$(dirname "$0")" && pwd)" +ROOT="$(cd "$HERE/.." && pwd)" +score=0 + +# A deliverable counts as "done" only when it exists, is non-empty, and has no TODO left. +# This is what makes the untouched scaffold score 0: every starter file is full of TODOs. +done_file() { + local f="$ROOT/$1" + [ -s "$f" ] && ! grep -qiE "todo" "$f" +} + +# Task 1 (20): validation_queries.sql filled, with the expected check patterns. +if done_file validation_queries.sql; then + score=$((score + 8)) + grep -qiE "having[[:space:]]+count" "$ROOT/validation_queries.sql" && score=$((score + 4)) + grep -qiE "is[[:space:]]+null" "$ROOT/validation_queries.sql" && score=$((score + 4)) + grep -qiE "min\(|max\(" "$ROOT/validation_queries.sql" && score=$((score + 4)) +fi + +# Task 2 (30): schema_setup.sql creates both views and references fares. +if done_file schema_setup.sql; then + score=$((score + 6)) + grep -qiE "view[[:space:]]+vw_dim_zones" "$ROOT/schema_setup.sql" && score=$((score + 8)) + grep -qiE "view[[:space:]]+vw_fact_trips" "$ROOT/schema_setup.sql" && score=$((score + 8)) + grep -qiE "fare_amount" "$ROOT/schema_setup.sql" && score=$((score + 8)) +fi + +# Task 3 (20): data_dictionary.md filled and states a grain. 
+if done_file data_dictionary.md; then + score=$((score + 14)) + grep -qiE "grain" "$ROOT/data_dictionary.md" && score=$((score + 6)) +fi + +# Task 4 (20): verification_results.sql filled + borough screenshot present. +if done_file verification_results.sql; then + score=$((score + 10)) + grep -qiE "borough" "$ROOT/verification_results.sql" && score=$((score + 5)) +fi +[ -f "$ROOT/assets/borough_count.png" ] && score=$((score + 5)) + +# Task 5 (10): AI_ASSIST.md filled. +if done_file AI_ASSIST.md; then + score=$((score + 10)) +fi + +[ "$score" -gt 100 ] && score=100 +if [ "$score" -ge 60 ]; then pass=true; else pass=false; fi + +cat > "$HERE/score.json" < ⚠️ Never paste real customer data or PII into an LLM. The NYC taxi dataset used here is public, so sample rows are safe to share. + +## The problem + +TODO: What were you trying to solve? Paste the relevant SQL or schema fragment. + +## The prompt + +TODO: What did you ask the AI? Include the context you provided. + +## The response + +TODO: What did it suggest? Did it work first try? + +## Reflection + +TODO: Did you understand *why* the suggestion worked, or did you accept it blindly? diff --git a/README.md b/README.md index 96ce7bc..65e861c 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,40 @@ -# [Track] week X assignment -HackYourFuture week X assignment -The Week X assignment for the HackYourFuture can be found at the following link: [TODO: Assignment url in the learning platform] +# Data Track Week 9 Assignment: SQL for Analytics +HackYourFuture Data Track, Week 9. The full brief (scenario, tasks, and grading) lives in the curriculum: **Week 9 → Assignment** in the HackYourFuture learning platform. This repo holds the starter files you fill in. -## Implementation Instructions +You audit the raw NYC taxi data, model it as a star schema of SQL **views**, and document it. Run every query against **your own assigned schema** on the shared Azure PostgreSQL instance, not the shared `public` schema. 
The data is two tables: `raw_trips` (~57K green-taxi trips, January 2024) and `raw_zones` (265 location lookups). -Provide clear instructions on how trainees should implement the tasks. +## What you submit -### Task 1 -Instructions for Task 1 +Fill in these files (starters are provided). Keep them at the repo root and do not rename them. -### Task 2 -Instructions for Task 2 +| File | Task | What it holds | +|---|---|---| +| `validation_queries.sql` | Task 1 | Data-quality audit: duplicates, nulls, range, orphaned keys | +| `schema_setup.sql` | Task 2 | `CREATE OR REPLACE VIEW vw_dim_zones` and `vw_fact_trips` | +| `data_dictionary.md` | Task 3 | Grain, keys, and measures for both views | +| `verification_results.sql` | Task 4 | Verification queries (volume, revenue, geospatial, time patterns) | +| `assets/borough_count.png` | Task 4 | Screenshot of the per-borough row-count result | +| `AI_ASSIST.md` | Task 5 | One documented LLM session | -... +## Tasks (summary) +1. **Data Quality Audit** (`validation_queries.sql`): find duplicate trips, count NULL pickup/dropoff location IDs, check the `fare_amount` range for negatives, and find `pickup_location_id` values not present in `raw_zones`. +2. **Star Schema Views** (`schema_setup.sql`): `vw_dim_zones` (one row per `location_id`, the primary key) and `vw_fact_trips` (one row per trip; exclude `fare_amount < 0`; cast `pickup_datetime` to `TIMESTAMP`; keep the location IDs so it joins to `vw_dim_zones`). +3. **Data Dictionary** (`data_dictionary.md`): state each view's grain in one sentence, identify keys, list measures. +4. **Verification Queries** (`verification_results.sql`): query the views for volume, revenue, geospatial, and time-pattern questions, joining through `vw_dim_zones` for any borough/zone name. Save a screenshot of the per-borough counts to `assets/borough_count.png`. +5. **AI Assistance Log** (`AI_ASSIST.md`): document one LLM session honestly. 
+ +## How you are graded + +- **Auto-grade (on PR creation):** a **completeness** smoke check confirms every required deliverable exists, is non-empty, and contains the expected views and checks. It does **not** run SQL against a database and is **not** your final grade. +- **Teacher review:** your teacher grades correctness against the rubric: do the queries run, do findings match the real data, does `vw_fact_trips` filter negatives and join cleanly, is the grain stated precisely. + +## Submit + +1. Work on a branch in your copy of this repo. +2. Fill in each deliverable file. +3. Commit, push, and open a Pull Request against `main`. The auto-grade runs on PR creation and posts a completeness score. +4. Share the PR URL with your teacher. + +> ⚠️ Never paste real customer data or PII into an LLM. The NYC taxi dataset used here is public and safe to share. diff --git a/assets/.gitkeep b/assets/.gitkeep new file mode 100644 index 0000000..3e4d424 --- /dev/null +++ b/assets/.gitkeep @@ -0,0 +1 @@ +# Save your Task 4 screenshot here as borough_count.png diff --git a/data_dictionary.md b/data_dictionary.md new file mode 100644 index 0000000..5e44612 --- /dev/null +++ b/data_dictionary.md @@ -0,0 +1,17 @@ +# Data Dictionary + +Document both views. State the grain in one sentence, identify the keys, and list the measures (the columns you can aggregate). Replace every TODO. + +## vw_fact_trips + +- **Grain:** TODO (one sentence, e.g. "One row per ...") +- **Primary key:** TODO +- **Foreign keys:** TODO +- **Measures:** TODO (columns you would SUM or AVG) + +## vw_dim_zones + +- **Grain:** TODO +- **Primary key:** TODO +- **Foreign keys:** TODO (or "none") +- **Measures:** TODO (or "none, descriptive attributes only") diff --git a/schema_setup.sql b/schema_setup.sql new file mode 100644 index 0000000..6818dcb --- /dev/null +++ b/schema_setup.sql @@ -0,0 +1,26 @@ +-- Task 2: Star Schema Views (create these in YOUR OWN schema, not public). 
+-- CREATE OR REPLACE VIEW lets you re-run this script while you iterate. + +-- Dimension: one row per location_id. Treat location_id as the primary key. +-- TODO: complete the SELECT (location_id, zone, borough). +CREATE OR REPLACE VIEW vw_dim_zones AS +SELECT + -- TODO +FROM raw_zones; + +-- Fact: one row per taxi trip. +-- - Exclude rows where fare_amount is less than 0. +-- - Cast pickup_datetime to TIMESTAMP. +-- - Keep the location IDs so the view can join to vw_dim_zones. +-- TODO: complete the SELECT and the WHERE. +CREATE OR REPLACE VIEW vw_fact_trips AS +SELECT + -- TODO +FROM raw_trips +-- TODO: WHERE fare_amount >= 0 +; + +-- Join-readiness test (run after creating the views; it must run without error +-- and return a count close to the vw_fact_trips row count): +-- SELECT COUNT(*) FROM vw_fact_trips f +-- JOIN vw_dim_zones d ON f.pickup_location_id = d.location_id; diff --git a/task-1/task 1 files b/task-1/task 1 files deleted file mode 100644 index e69de29..0000000 diff --git a/task-2/task 2 files b/task-2/task 2 files deleted file mode 100644 index e69de29..0000000 diff --git a/validation_queries.sql b/validation_queries.sql new file mode 100644 index 0000000..fc470cd --- /dev/null +++ b/validation_queries.sql @@ -0,0 +1,20 @@ +-- Task 1: Data Quality Audit +-- Run every query against raw_trips / raw_zones in YOUR OWN schema (not public). +-- The shared pattern is a query that returns the bad rows (or a count). +-- Zero rows back means the check passed. + +-- 1. Duplicate check: are there rows with the same vendor_id, pickup_datetime, dropoff_datetime? +-- TODO: GROUP BY the three columns and keep only groups with HAVING COUNT(*) > 1. + + +-- 2. Null integrity: how many rows have a NULL pickup_location_id or dropoff_location_id? +-- TODO: count the NULLs (COUNT(*) FILTER (WHERE ... IS NULL) is handy for several columns at once). + + +-- 3. Range validation: what are the min and max fare_amount? Are there negative values? 
+-- TODO: SELECT MIN(fare_amount), MAX(fare_amount), and a count of rows where fare_amount < 0. + + +-- 4. Relationship check: which pickup_location_id values in raw_trips do NOT exist in raw_zones? +-- TODO: LEFT JOIN raw_zones ... WHERE z.location_id IS NULL (or NOT EXISTS). +-- Do NOT use NOT IN: a single NULL in the subquery hides every orphan. diff --git a/verification_results.sql b/verification_results.sql new file mode 100644 index 0000000..68a8d26 --- /dev/null +++ b/verification_results.sql @@ -0,0 +1,22 @@ +-- Task 4: Verification Queries. +-- Query your views and label each query with the question it answers. +-- Borough and zone names live in vw_dim_zones, so join on pickup_location_id = location_id. + +-- 1. Volume: how many total rows in vw_fact_trips? How many rows per borough? +-- What is the most common pickup/dropoff location combination? +-- TODO +-- (Take a screenshot of the per-borough counts and save it as assets/borough_count.png.) + + +-- 2. Revenue: which pickup zone (name, not ID) generated the highest total fare_amount? +-- Which pickup zone collected the highest total fare_amount on any single day? +-- TODO + + +-- 3. Geospatial: total number of trips and average trip_distance for each borough. +-- TODO + + +-- 4. Time patterns: which day of the week had the highest total tip_amount? +-- What hour of the day has the highest average tip? +-- TODO From f62a12083345b747e9a97ef2aeecefa5aaae1854 Mon Sep 17 00:00:00 2001 From: Lasse Benninga Date: Thu, 25 Jun 2026 09:27:21 +0200 Subject: [PATCH 2/2] fix: qualify raw tables as nyc_taxi.raw_trips/raw_zones The shared class DB moved raw_trips/raw_zones from public to the nyc_taxi schema. Update the scaffold SQL, README, and data dictionary so students read from nyc_taxi.*, matching the Week 9 chapters. Autograder patterns are unaffected (they key on HAVING/IS NULL/vw_ names/fare_amount, not the source schema); scaffold still scores 0/100 until TODOs are completed. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 4 ++-- schema_setup.sql | 4 ++-- validation_queries.sql | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 65e861c..ae7ba18 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ HackYourFuture Data Track, Week 9. The full brief (scenario, tasks, and grading) lives in the curriculum: **Week 9 → Assignment** in the HackYourFuture learning platform. This repo holds the starter files you fill in. -You audit the raw NYC taxi data, model it as a star schema of SQL **views**, and document it. Run every query against **your own assigned schema** on the shared Azure PostgreSQL instance, not the shared `public` schema. The data is two tables: `raw_trips` (~57K green-taxi trips, January 2024) and `raw_zones` (265 location lookups). +You audit the raw NYC taxi data, model it as a star schema of SQL **views**, and document it. Create your views in **your own assigned schema** on the shared Azure PostgreSQL instance, not the shared `public` schema, and read the source data from the `nyc_taxi` schema. The data is two tables: `nyc_taxi.raw_trips` (~57K green-taxi trips, January 2024) and `nyc_taxi.raw_zones` (265 location lookups). ## What you submit @@ -19,7 +19,7 @@ Fill in these files (starters are provided). Keep them at the repo root and do n ## Tasks (summary) -1. **Data Quality Audit** (`validation_queries.sql`): find duplicate trips, count NULL pickup/dropoff location IDs, check the `fare_amount` range for negatives, and find `pickup_location_id` values not present in `raw_zones`. +1. **Data Quality Audit** (`validation_queries.sql`): find duplicate trips, count NULL pickup/dropoff location IDs, check the `fare_amount` range for negatives, and find `pickup_location_id` values not present in `nyc_taxi.raw_zones`. 2. 
**Star Schema Views** (`schema_setup.sql`): `vw_dim_zones` (one row per `location_id`, the primary key) and `vw_fact_trips` (one row per trip; exclude `fare_amount < 0`; cast `pickup_datetime` to `TIMESTAMP`; keep the location IDs so it joins to `vw_dim_zones`). 3. **Data Dictionary** (`data_dictionary.md`): state each view's grain in one sentence, identify keys, list measures. 4. **Verification Queries** (`verification_results.sql`): query the views for volume, revenue, geospatial, and time-pattern questions, joining through `vw_dim_zones` for any borough/zone name. Save a screenshot of the per-borough counts to `assets/borough_count.png`. diff --git a/schema_setup.sql b/schema_setup.sql index 6818dcb..a7ae1ad 100644 --- a/schema_setup.sql +++ b/schema_setup.sql @@ -6,7 +6,7 @@ CREATE OR REPLACE VIEW vw_dim_zones AS SELECT -- TODO -FROM raw_zones; +FROM nyc_taxi.raw_zones; -- Fact: one row per taxi trip. -- - Exclude rows where fare_amount is less than 0. @@ -16,7 +16,7 @@ FROM raw_zones; CREATE OR REPLACE VIEW vw_fact_trips AS SELECT -- TODO -FROM raw_trips +FROM nyc_taxi.raw_trips -- TODO: WHERE fare_amount >= 0 ; diff --git a/validation_queries.sql b/validation_queries.sql index fc470cd..301b194 100644 --- a/validation_queries.sql +++ b/validation_queries.sql @@ -1,5 +1,5 @@ -- Task 1: Data Quality Audit --- Run every query against raw_trips / raw_zones in YOUR OWN schema (not public). +-- Run every query against nyc_taxi.raw_trips / nyc_taxi.raw_zones (the shared nyc_taxi schema, not public); anything you create belongs in YOUR OWN schema. -- The shared pattern is a query that returns the bad rows (or a count). -- Zero rows back means the check passed. @@ -15,6 +15,6 @@ -- TODO: SELECT MIN(fare_amount), MAX(fare_amount), and a count of rows where fare_amount < 0. --- 4. 
Relationship check: which pickup_location_id values in nyc_taxi.raw_trips do NOT exist in nyc_taxi.raw_zones? +-- TODO: LEFT JOIN nyc_taxi.raw_zones ... WHERE z.location_id IS NULL (or NOT EXISTS). -- Do NOT use NOT IN: a single NULL in the subquery hides every orphan.