From 68e5e6b1a20821a65bb64baf1be99b06b471cb8a Mon Sep 17 00:00:00 2001 From: Lasse Benninga Date: Wed, 3 Jun 2026 14:01:58 +0200 Subject: [PATCH 1/2] feat: scaffold Week 6 assignment (Deploy to Azure) Replaces the empty template stubs with a real scaffold derived from `Data Track/Week 6/week_6__8_assignment.md`. Students implement a Container App Job pipeline that uploads raw JSON to Azure Blob Storage and upserts rows into Azure Database for PostgreSQL. Layout - `src/pipeline.py`: `raise NotImplementedError` stubs for the three pipeline functions (config, blob upload, Postgres upsert) plus a `run()` orchestrator. - `Dockerfile`: cache-friendly starter with TODO comments for the layer order students must complete. - `requirements.txt`: TODO entries for `azure-storage-blob` and `psycopg2-binary` pins. - `AI_ASSIST.md`: section headers + TODO placeholders for Task 7. - `.env.example`: connection-string template with placeholders only. - `docs/`: target folder for the Task 5 Execution-history screenshot. Autograder (`.hyf/test.sh`) Static-analysis only. Azure deployment is unverifiable in CI (no credentials), so the grader checks code shape, not live deployment: required files, pinned deps, Dockerfile layer order, env-var reads, `contextlib.closing()`, azure SDK logger silencing, `ON CONFLICT ... DO UPDATE` upsert with `%s` placeholders, `sslmode=require`, `BlobServiceClient` import, AI report fill-in, README `## Verification` heading + embedded `docs/` image, screenshot presence. Verified locally - Bare scaffold: 13/100, pass=false. - Working solution (clone at `~/Documents/github/hyf/data-assignment-week-6-test`): 100/100, pass=true. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .devcontainer/devcontainer.json | 15 ++ .env.example | 11 + .gitignore | 161 ++------------ .hyf/grader_lib.sh | 250 ++++++++++++++++++++++ .hyf/test.sh | 285 ++++++++++++++++++++++++- AI_ASSIST.md | 25 +++ Dockerfile | 21 ++ README.md | 113 +++++++++- docs/.gitkeep | 2 + requirements.txt | 13 ++ task-1/task 1 files => src/__init__.py | 0 src/pipeline.py | 103 +++++++++ task-2/task 2 files | 0 13 files changed, 836 insertions(+), 163 deletions(-) create mode 100644 .devcontainer/devcontainer.json create mode 100644 .env.example create mode 100644 .hyf/grader_lib.sh mode change 100644 => 100755 .hyf/test.sh create mode 100644 AI_ASSIST.md create mode 100644 Dockerfile create mode 100644 docs/.gitkeep create mode 100644 requirements.txt rename task-1/task 1 files => src/__init__.py (100%) create mode 100644 src/pipeline.py delete mode 100644 task-2/task 2 files diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..3184532 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,15 @@ +{ + "name": "Week 6: Deploy to Azure", + "image": "mcr.microsoft.com/devcontainers/python:3.11", + "features": { + "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + "ghcr.io/devcontainers/features/azure-cli:1": {} + }, + "postCreateCommand": "pip install -r requirements.txt", + "remoteUser": "vscode", + "customizations": { + "vscode": { + "extensions": ["ms-python.python", "ms-python.vscode-pylance"] + } + } +} diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..755ba01 --- /dev/null +++ b/.env.example @@ -0,0 +1,11 @@ +# Copy to .env (which is gitignored) and fill in. Never commit real values. +# +# Your teacher will provide these connection strings, or you can pull them +# from Azure Key Vault as described in Chapter 5 (Getting your connection +# strings). 
Check Chapter 4 for the full Postgres connection-string format +# Azure requires (host suffix and ssl flag). + +POSTGRES_URL=postgresql://:@.postgres.database.azure.com:5432/? +AZURE_STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net +SOURCE_NAME=weather +LOG_LEVEL=INFO diff --git a/.gitignore b/.gitignore index 2b76d7c..f6a80c0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,156 +3,31 @@ Thumbs.db [Dd]esktop.ini -# hyf +# HYF auto-grader output .hyf/score.json -# Editor and IDE settings +# Editor / IDE .vscode/ .idea/ -*.iml *.code-workspace -*.sublime-project -*.sublime-workspace -.history/ -.ionide/ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -lerna-debug.log* - -# Diagnostic reports (https://nodejs.org/api/report.html) -report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json - -# Runtime data -pids -*.pid -*.seed -*.pid.lock - -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov - -# Coverage directory used by tools like istanbul -coverage -*.lcov - -# nyc test coverage -.nyc_output - -# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) -.grunt - -# Bower dependency directory (https://bower.io/) -bower_components - -# node-waf configuration -.lock-wscript - -# Compiled binary addons (https://nodejs.org/api/addons.html) -build/Release - -# Dependency directories -node_modules/ -jspm_packages/ - -# Snowpack dependency directory (https://snowpack.dev/) -web_modules/ - -# TypeScript cache -*.tsbuildinfo - -# Optional npm cache directory -.npm - -# Optional eslint cache -.eslintcache - -# Optional stylelint cache -.stylelintcache - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variable files +# Python +__pycache__/ +*.pyc +*.py[cod] +*.pyo +.Python +.venv/ +venv/ +*.egg-info/ +dist/ +build/ + +# Environments and secrets .env .env.* 
!.env.example -# parcel-bundler cache (https://parceljs.org/) -.cache -.parcel-cache - -# Next.js build output -.next -out - -# Nuxt.js build / generate output -.nuxt -dist - -# Gatsby files -.cache/ -# Comment in the public line in if your project uses Gatsby and not Next.js -# https://nextjs.org/blog/next-9-1#public-directory-support -# public - -# vuepress build output -.vuepress/dist - -# vuepress v2.x temp and cache directory -.temp -.cache - -# Sveltekit cache directory -.svelte-kit/ - -# vitepress build output -**/.vitepress/dist - -# vitepress cache directory -**/.vitepress/cache - -# Docusaurus cache and generated files -.docusaurus - -# Serverless directories -.serverless/ - -# FuseBox cache -.fusebox/ - -# DynamoDB Local files -.dynamodb/ - -# Firebase cache directory -.firebase/ - -# TernJS port file -.tern-port - -# Stores VSCode versions used for testing VSCode extensions -.vscode-test - -# yarn v3 -.pnp.* -.yarn/* -!.yarn/patches -!.yarn/plugins -!.yarn/releases -!.yarn/sdks -!.yarn/versions - -# Vite logs files -vite.config.js.timestamp-* -vite.config.ts.timestamp-* - +# Local runtime output (the cloud copies are the source of truth) +output/ +*.log diff --git a/.hyf/grader_lib.sh b/.hyf/grader_lib.sh new file mode 100644 index 0000000..d383a4a --- /dev/null +++ b/.hyf/grader_lib.sh @@ -0,0 +1,250 @@ +#!/usr/bin/env bash +# grader_lib.sh — shared helpers for HYF Data Track autograders. +# Source this at the top of test.sh: +# source "$(dirname "$0")/grader_lib.sh" +# +# Provides: pass(), fail(), warn(), print_results(), write_score(), +# and a set of common static-analysis checks derived from recurring +# PR review patterns across cohort c55. 
+ +_grader_details=() + +pass() { _grader_details+=("✓ PASS $1"); } +fail() { _grader_details+=("✗ FAIL $1"); } +warn() { _grader_details+=("⚠ WARN $1"); } + +print_results() { + local header="${1:-Autograder Results}" + echo "" + echo "=== $header ===" + for line in "${_grader_details[@]}"; do echo " $line"; done + echo "" +} + +write_score() { + # write_score <score> <passing> [<outfile>] + local score="$1" + local passing="$2" + local outfile="${3:-$(dirname "${BASH_SOURCE[0]}")/score.json}" + local pass_flag="false" + [[ "$score" -ge "$passing" ]] && pass_flag="true" + cat > "$outfile" << JSON +{ + "score": $score, + "pass": $pass_flag, + "passingScore": $passing +} +JSON + echo "Score: $score / 100 (passing: $passing) pass=$pass_flag" +} + +# ── Common static-analysis checks ──────────────────────────────────────────── +# Each function: returns 0 on pass, 1 on fail/warn (for caller logic). +# All feedback goes through pass()/fail()/warn() so it appears in print_results. + +check_no_print_statements() { + # Usage: check_no_print_statements <dir> [label] + # Flags bare print() calls that should be logging calls. + local dir="${1:-.}" + local label="${2:-$dir}" + local found + found=$(grep -rn "^[[:space:]]*print(" "$dir" --include="*.py" 2>/dev/null | grep -v "# noqa" || true) + if [[ -n "$found" ]]; then + local count + count=$(echo "$found" | wc -l | tr -d ' ') + warn "$label: $count print() call(s) found — use logging.info/warning/error instead (see Week 1 Ch1)" + return 1 + fi + return 0 +} + +check_no_notimplemented() { + # Usage: check_no_notimplemented <dir> [label] + # Flags NotImplementedError stubs left in after implementation. 
+ local dir="${1:-.}" + local label="${2:-$dir}" + local found + found=$(grep -rn "raise NotImplementedError" "$dir" --include="*.py" 2>/dev/null || true) + if [[ -n "$found" ]]; then + fail "$label: raise NotImplementedError still present — remove stubs before submitting" + return 1 + fi + return 0 +} + +check_no_relative_imports() { + # Usage: check_no_relative_imports <dir> [label] + # Flags `from .module import x` in scripts not inside a proper package. + # Relative imports break the grader: python3 src/cleaner.py fails with + # "attempted relative import with no known parent package". + local dir="${1:-.}" + local label="${2:-$dir}" + local found + found=$(grep -rn "^from \." "$dir" --include="*.py" 2>/dev/null || true) + if [[ -n "$found" ]]; then + fail "$label: relative import found (from .module) — use absolute: 'from src.module import x'" + return 1 + fi + return 0 +} + +check_no_logging_in_utils() { + # Usage: check_no_logging_in_utils <file> + # utils.py should be pure helpers; logging config belongs in the entry point. + local file="${1:-task-1/src/utils.py}" + if [[ ! -f "$file" ]]; then return 0; fi + if grep -qE "logging\.basicConfig|logging\.getLogger" "$file"; then + warn "$file: logging.basicConfig/getLogger found — logging setup belongs in cleaner.py or the entry-point, not in utils" + return 1 + fi + return 0 +} + +check_gitignore_python() { + # Usage: check_gitignore_python [<gitignore_path>] + # Warns when Python cache patterns are absent from .gitignore. + local gi="${1:-.gitignore}" + if [[ ! -f "$gi" ]]; then + warn ".gitignore is missing — add one so __pycache__/ and *.pyc are not committed" + return 1 + fi + local ok=true + if ! grep -q "__pycache__" "$gi"; then + warn ".gitignore missing __pycache__/ — Python bytecode cache dirs should not be committed" + ok=false + fi + if ! grep -qE "^\*\.pyc$|^.*\*\.pyc" "$gi"; then + warn ".gitignore missing *.pyc — compiled Python files should not be committed" + ok=false + fi + if ! 
grep -qE "^\.env$|^\.env\b" "$gi"; then + warn ".gitignore missing .env — secret files should not be committed" + ok=false + fi + if [[ "$ok" = true ]]; then pass ".gitignore correctly excludes __pycache__/, *.pyc, and .env"; fi +} + +check_screenshot_is_png() { + # Usage: check_screenshot_is_png <expected_png> + # Awards full credit for .png, warns (and still credits) for .jpg/.jpeg, + # zero for missing. Matches the pattern flagged in c55 PR reviews. + local expected_png="$1" + local dir + dir="$(dirname "$expected_png")" + local base + base="$(basename "$expected_png" .png)" + + if [[ -s "$expected_png" ]]; then + pass "screenshot is $expected_png (.png format ✓)" + return 0 + fi + for ext in jpg jpeg; do + if [[ -s "$dir/$base.$ext" ]]; then + warn "screenshot is .$ext but should be .png — rename to $base.png (partial credit still given)" + return 1 + fi + done + fail "screenshot missing: $expected_png not found" + return 2 +} + +check_silent_zero_in_except() { + # Usage: check_silent_zero_in_except <file> + # Detects the pattern: try: x = compute() / except: x = 0 + # which silently corrupts data instead of skipping or raising. + local file="$1" + if [[ ! 
-f "$file" ]]; then return 0; fi + local found + found=$(python3 - "$file" 2>/dev/null << 'PY' +import ast, sys +try: + tree = ast.parse(open(sys.argv[1]).read()) +except SyntaxError: + sys.exit(0) +for node in ast.walk(tree): + if isinstance(node, ast.ExceptHandler): + for stmt in node.body: + if isinstance(stmt, ast.Assign): + if isinstance(stmt.value, ast.Constant) and stmt.value.value == 0: + print(f"line {stmt.lineno}: '{ast.unparse(stmt)}' — sets field to 0 in except block (silent data corruption)") +PY +) + if [[ -n "$found" ]]; then + warn "$file: silent 0-assignment in except block — skip the row or raise instead of setting to 0:\n $found" + return 1 + fi + return 0 +} + +check_exception_logged() { + # Usage: check_exception_logged <dir> + # Warns when except blocks log/print a message but don't include the + # exception variable (e, err, exc), meaning the error type is lost. + local dir="${1:-.}" + local found + found=$(python3 - "$dir" 2>/dev/null << 'PY' +import ast, os, sys +issues = [] +for root, _, files in os.walk(sys.argv[1]): + for fname in files: + if not fname.endswith(".py"): + continue + path = os.path.join(root, fname) + try: + tree = ast.parse(open(path).read()) + except SyntaxError: + continue + for node in ast.walk(tree): + if not isinstance(node, ast.ExceptHandler): + continue + exc_var = node.name # e.g. "e" in `except ValueError as e` + if not exc_var: + continue + for stmt in node.body: + for call in ast.walk(stmt): + if not isinstance(call, ast.Call): + continue + # Is it a logging.* or print call? + func = call.func + is_log = (isinstance(func, ast.Attribute) and + isinstance(func.value, ast.Name) and + func.value.id == "logging") + is_print = isinstance(func, ast.Name) and func.id == "print" + if not (is_log or is_print): + continue + # Does the call reference the exception variable? 
+ src = ast.unparse(call) + if exc_var not in src: + issues.append(f"{path}:{call.lineno}: log message doesn't include exception variable '{exc_var}' — add it for easier debugging") +if issues: + for i in issues[:3]: # cap at 3 to keep output readable + print(i) +PY +) + if [[ -n "$found" ]]; then + warn "exception variable not included in log message (harder to debug):\n $found" + return 1 + fi + return 0 +} + +check_ruff() { + # Usage: check_ruff [