From 9d9874e7d0ab261b8386e9d1c41e3adec2401862 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Mon, 29 Dec 2025 12:05:37 -0500 Subject: [PATCH 1/3] update docs and source data for CEQR schools --- dcpy/library/templates/doe_lcgms.yml | 32 ++++++++++++++++ dcpy/library/templates/sca_bluebook.yml | 5 ++- .../sca_capacity_projects_current.yml | 3 +- dcpy/library/templates/sca_e_pct.yml | 3 +- dcpy/library/templates/sca_e_projections.yml | 3 +- ingest_templates/doe_lcgms.yml | 5 ++- products/ceqr/ceqr_app/README.md | 37 +++++++++++++++++++ 7 files changed, 82 insertions(+), 6 deletions(-) create mode 100644 dcpy/library/templates/doe_lcgms.yml diff --git a/dcpy/library/templates/doe_lcgms.yml b/dcpy/library/templates/doe_lcgms.yml new file mode 100644 index 0000000000..1cc0866f18 --- /dev/null +++ b/dcpy/library/templates/doe_lcgms.yml @@ -0,0 +1,32 @@ +dataset: + name: doe_lcgms + acl: public-read + source: + url: + path: s3://edm-recipes/inbox/sca/{{ version }}/doe_lcgms.csv + options: + - AUTODETECT_TYPE=NO + - EMPTY_STRING_AS_NULL=YES + geometry: + SRS: null + type: NONE + + destination: + geometry: + SRS: null + type: NONE + options: + - OVERWRITE=YES + - PRECISION=NO + fields: [] + sql: null + + info: + description: | + Provided by DCP Capital Planning team as an excel file + with a name like "LCGMS_SchoolData". + + This is only needed for the legacy CEQR schools dataset ceqr_school_buildings + and is different from the doe_lcgms ingest source data used in FacDB. + url: "" + dependents: [] diff --git a/dcpy/library/templates/sca_bluebook.yml b/dcpy/library/templates/sca_bluebook.yml index 168977cf77..fc51e652e1 100644 --- a/dcpy/library/templates/sca_bluebook.yml +++ b/dcpy/library/templates/sca_bluebook.yml @@ -24,6 +24,9 @@ dataset: info: description: | ### NYC School Construction Authority - Capacity Projects in Progress - Provided by DCP Capital Planning team as an excel file. 
This is the SCA's “Enrollment, Capacity, Utilization Report,” known as the “Blue Book”. + Provided by DCP Capital Planning team as an excel file + with a name like "20XX - 20XX Blue Book" and a sheet name like "XX-XX by Org". + + This is the SCA's “Enrollment, Capacity, Utilization Report,” known as the “Blue Book”. url: "" dependents: [] diff --git a/dcpy/library/templates/sca_capacity_projects_current.yml b/dcpy/library/templates/sca_capacity_projects_current.yml index 5c523d42f5..4faf62270b 100644 --- a/dcpy/library/templates/sca_capacity_projects_current.yml +++ b/dcpy/library/templates/sca_capacity_projects_current.yml @@ -24,6 +24,7 @@ dataset: info: description: | ### NYC School Construction Authority - Capacity Projects in Progress - Provided by DCP Capital Planning team as an excel file. + Provided by DCP Capital Planning team as an excel file + with a name like "Section 6 Capacity Projects in Process". url: "" dependents: [] diff --git a/dcpy/library/templates/sca_e_pct.yml b/dcpy/library/templates/sca_e_pct.yml index f91d531c60..2c913736bf 100644 --- a/dcpy/library/templates/sca_e_pct.yml +++ b/dcpy/library/templates/sca_e_pct.yml @@ -24,6 +24,7 @@ dataset: info: description: | ### NYC School Construction Authority - Enrollment Percentages by Zone - Provided by DCP Capital Planning team as an excel file. + Provided by DCP Capital Planning team as an excel file + with a name like "20XX ENROLLMENT _ by Zone". url: "" dependents: [] diff --git a/dcpy/library/templates/sca_e_projections.yml b/dcpy/library/templates/sca_e_projections.yml index a002c9d87e..e2f81accb1 100644 --- a/dcpy/library/templates/sca_e_projections.yml +++ b/dcpy/library/templates/sca_e_projections.yml @@ -24,6 +24,7 @@ dataset: info: description: | ### NYC School Construction Authority - Enrollment Projections by Grade - Provided by DCP Capital Planning team as an excel file. 
+ Provided by DCP Capital Planning team as an excel file + with a name like "20XX-20XX Enrollment Projection By Grade". url: "" dependents: [] diff --git a/ingest_templates/doe_lcgms.yml b/ingest_templates/doe_lcgms.yml index f921e08355..ee9f140de0 100644 --- a/ingest_templates/doe_lcgms.yml +++ b/ingest_templates/doe_lcgms.yml @@ -18,8 +18,9 @@ attributes: ingestion: source: - type: local_file - path: ./LCGMS_SchoolData.xls + type: s3 + bucket: edm-recipes + key: inbox/doe/{{ version }}/LCGMS_SchoolData.xls file_format: type: html kwargs: diff --git a/products/ceqr/ceqr_app/README.md b/products/ceqr/ceqr_app/README.md index d38013e5b8..d91e1d1002 100644 --- a/products/ceqr/ceqr_app/README.md +++ b/products/ceqr/ceqr_app/README.md @@ -28,6 +28,43 @@ This then gets passed to the EDM production database using `create.sql`, where f ## Build instructions +> [!IMPORTANT] +> This codebase is currently only used to build CEQR Schools datasets which are distributed to the Capital Planning and Support (CAPS) team and used in DCP's Schools Model excel workbook. This section is focused on building those datasets. + +All source data comes from the CAPS team and must be archived using `library` with the output as postgres. For example: + +```bash +library archive --name sca_capacity_projects_current --version 20251120 --latest --output-format postgres --postgres-url $RECIPE_ENGINE +``` + +These are the four CEQR school datasets and their source data. See each source dataset's `library` template for details. + +`sca_capacity_projects` + +- `sca_capacity_projects_current` + +`sca_e_projections_by_boro` + +- `sca_e_projections` + +`sca_e_projections_by_sd` + +- `sca_e_pct` +- `sca_e_projections` + +`ceqr_school_buildings` + +- `doe_lcgms` +- `sca_bluebook` + +Outputs must be distributed to S3 file storage at `edm-publishing/ceqr-app-data-staging/`. Each dataset has its own folder containing all of its versions. 
Versions are based on the day the build was run and the `latest` folder has the latest version. + +### Diagram of legacy CEQR app data flow + +![Diagram of legacy CEQR app data flow](/docs/diagrams/dataflow_ceqr.drawio.png) + +## DEPRECATED BUILD NOTES + ### To build using github (NYCPlanning Members Only) Running a recipe using github actions is easy! Simply open an From 235cc02ddc3187f135c48409a79433ab18e3c4da Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Mon, 29 Dec 2025 12:06:52 -0500 Subject: [PATCH 2/3] improve library docs and logging --- dcpy/library/archive.py | 2 +- dcpy/library/cli.py | 2 +- dcpy/library/ingest.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dcpy/library/archive.py b/dcpy/library/archive.py index 14f29b97fc..63749d24ca 100644 --- a/dcpy/library/archive.py +++ b/dcpy/library/archive.py @@ -34,7 +34,7 @@ def __call__( Parameters ---------- path: path to the configutation file - output_format: currently supported formats: `'csv'`, `'geojson'`, `'shapefile'`, `'postgres'` + output_format: see ingest.Ingestor translator methods for currently supported formats push: if `True` then push to s3 clean: if `True`, the temporary files created under `.library` will be removed latest: if `True` then tag this current version we are processing to be the `latest` diff --git a/dcpy/library/cli.py b/dcpy/library/cli.py index c967f5c39a..1c357d8ecd 100644 --- a/dcpy/library/cli.py +++ b/dcpy/library/cli.py @@ -19,7 +19,7 @@ @app.command() def archive( path: str = typer.Option(None, "--path", "-f", help="Path to config yml"), - output_formats: list[str] = typer.Option(["pgdump", "parquet", "csv"], "--output-format", "-o", help="csv, geojson, shapefile, pgdump and parquet"), + output_formats: list[str] = typer.Option(["pgdump", "parquet", "csv", "shapefile", "postgres"], "--output-format", "-o", help="csv, geojson, shapefile, pgdump and parquet"), push: bool = typer.Option(False, "--s3", "-s", help="Push to s3"), clean: bool 
= typer.Option(False, "--clean", "-c", help="Remove temporary files"), latest: bool = typer.Option(False, "--latest", "-l", help="Tag with latest"), diff --git a/dcpy/library/ingest.py b/dcpy/library/ingest.py index 201b5c1ed2..a337fc4922 100644 --- a/dcpy/library/ingest.py +++ b/dcpy/library/ingest.py @@ -65,7 +65,7 @@ def format_field_names( else: geom_clause = "" query = f"""SELECT\n\t{select}{geom_clause}\nFROM {layer_name}""" - print(query) + print(f"Formatting field names in layer '{layer_name}' using SQL query:\n{query}") if not sql: return query else: @@ -191,6 +191,7 @@ def wrapper(self: Ingestor, *args, **kwargs) -> tuple[list[str], library.Config] layerName = dataset.name # Initiate vector translate + print("Initiating vector translate ...") with Progress( SpinnerColumn(spinner_name="earth"), TextColumn("[progress.description]{task.description}"), From 5b050588ceec1d942b1ff4ac5d93c214b7977973 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Sun, 28 Dec 2025 23:45:22 -0500 Subject: [PATCH 3/3] add action to build CEQR schools datasets --- .github/workflows/ceqr_schools.yml | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/ceqr_schools.yml diff --git a/.github/workflows/ceqr_schools.yml b/.github/workflows/ceqr_schools.yml new file mode 100644 index 0000000000..6fd23a50b4 --- /dev/null +++ b/.github/workflows/ceqr_schools.yml @@ -0,0 +1,46 @@ +name: CEQR - Schools +on: + workflow_dispatch: + inputs: + dataset: + description: "Dataset to build" + type: choice + required: true + options: + - sca_capacity_projects + - sca_e_projections_by_boro + - sca_e_projections_by_sd + - ceqr_school_buildings + +jobs: + build: + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash + working-directory: products/ceqr/ceqr_app + container: + image: nycplanning/build-geosupport:${{ inputs.image_tag || 'latest' }} + steps: + - uses: actions/checkout@v4 + + - name: Load Secrets + uses: 
1password/load-secrets-action@v1 + with: + export-env: true + env: + OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }} + BUILD_ENGINE_SERVER: "op://Data Engineering/EDM_DATA/server_url" + AWS_S3_ENDPOINT: "op://Data Engineering/DO_keys/AWS_S3_ENDPOINT" + AWS_SECRET_ACCESS_KEY: "op://Data Engineering/DO_keys/AWS_SECRET_ACCESS_KEY" + AWS_ACCESS_KEY_ID: "op://Data Engineering/DO_keys/AWS_ACCESS_KEY_ID" + + - name: Setup build environment + working-directory: ./ + run: ./bash/docker_container_setup.sh + + - name: Run recipe + run: | + export RECIPE_ENGINE=$BUILD_ENGINE_SERVER/recipe + export EDM_DATA=$BUILD_ENGINE_SERVER/defaultdb + ./ceqr run recipe ${{ inputs.dataset }}