From f99304e39af7cf8e3e11ceda645f519f68437711 Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Thu, 12 Mar 2026 14:55:10 -0400 Subject: [PATCH 1/5] zip-to-county and county-to-zip data --- src/kintsugi/_data.py | 20 ++ src/kintsugi/crosswalk.py | 430 ++++++++++++++++++++++++++++++++++++++ tests/crosswalk_test.py | 60 ++++++ 3 files changed, 510 insertions(+) create mode 100644 src/kintsugi/crosswalk.py create mode 100644 tests/crosswalk_test.py diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py index 69d334e..21ccc09 100644 --- a/src/kintsugi/_data.py +++ b/src/kintsugi/_data.py @@ -18,6 +18,26 @@ "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843", "county_neighbors/county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816", "county_neighbors/county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575", + "crosswalk/county_to_zip/county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad", + "crosswalk/county_to_zip/county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b", + "crosswalk/county_to_zip/county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba", + "crosswalk/county_to_zip/county_to_zip_2019.parquet": "3a06ca1a4d21638a4d014cbb4bc29b7d6b254a937dab1536058152d226073d62", + "crosswalk/county_to_zip/county_to_zip_2020.parquet": "06cb2bfa55085d9c5f0f652628a61d1bf64fe14ea190df5bdc361844f5204250", + "crosswalk/county_to_zip/county_to_zip_2021.parquet": "fdd59a6b8ab06a534d1c9c4765101105ce8bc08d9685eb9441c58701bffd96ae", + "crosswalk/county_to_zip/county_to_zip_2022.parquet": "c3875e66bff03ff71ad73b89277fc892a294a5d71be08b4eb4ac2c431d715b83", + "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a", + "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0", + "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239", + "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1", + "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb", + "crosswalk/zip_to_county/zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942", + "crosswalk/zip_to_county/zip_to_county_2019.parquet": "78ac9cdda7c5e00b82315ceaece4f3722da905e2ec4f68774a135d327c257435", + "crosswalk/zip_to_county/zip_to_county_2020.parquet": "17f518b2a6de5ee58b134c0dc5fbf10f6d48c45b14fdbfcfe58a4269e08eaee2", + "crosswalk/zip_to_county/zip_to_county_2021.parquet": "655c6ce8b257e942ac1c7dfa068d93ad2a1f139c9cd0c110efbce752ab27cea4", + "crosswalk/zip_to_county/zip_to_county_2022.parquet": "9f125642aeb5aba3c8f2bdf616c186f975fc6afc1b5543d55204ad9b2a10b252", + "crosswalk/zip_to_county/zip_to_county_2023.parquet": "89f50d28acb226a5777ededc1f8301dc1f028c65dd04b860355d23efe24b89c5", + "crosswalk/zip_to_county/zip_to_county_2024.parquet": "8d76e09fa52923c0901c0afad26ba2d32075210bb0ab5b82caa14ce5465b3d80", + "crosswalk/zip_to_county/zip_to_county_2025.parquet": "2683272f9759646d0a6234de06076f274852b85fcd2fd9e3d2dfcb3aa10def80", "geo/cb_2020_us_county_5m.zip": "187e7118304428e5450083beb375e67c2c516c58a01ce52db95aaf24f18df3ba", "geo/cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3", "geo/cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b", diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py new file mode 100644 index 0000000..8586209 --- /dev/null +++ b/src/kintsugi/crosswalk.py @@ -0,0 +1,430 @@ +from typing import Literal, overload + +import pandas as pd +import polars as pl + +from ._data import get_dataset + +num_county_subs = 169 + + +# @overload +# def crosswalk_CT_counties(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... +# +# +# @overload +# def crosswalk_CT_counties(as_pandas: Literal[True]) -> pd.DataFrame: ... +# +# +# def crosswalk_CT_counties(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: +# """ +# Crosswalk CT counties between pre and post-2022 changes. Weights calculated based on county subdivision populations. +# +# See: +# - FIPS code changes: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.html +# - CT specific: https://www2.census.gov/geo/pdfs/reference/ct_county_equiv_change.pdf +# - CT specific: https://www.federalregister.gov/documents/2022/06/06/2022-12063/change-to-county-equivalents-in-the-state-of-connecticut +# +# Crosswalk: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.January_2020.html +# Source: https://www2.census.gov/geo/docs/reference/ct_change/ct_cou_to_cousub_crosswalk.txt +# +# CT county subdivision populations source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html +# FTP: https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/cities/totals/sub-est2024_9.csv +# +# """ +# crosswalk = ( +# pl.scan_csv( +# CROSSWALK_DATA / "county/ct_cou_to_cousub_crosswalk.txt", +# separator="|", +# n_rows=174, +# infer_schema=False, +# ) +# .select( +# "STATEFP\n(INCITS38)", +# "OLD_COUNTYFP\n(INCITS31)", +# "OLD_COUNTY_NAMELSAD", +# "NEW_COUNTYFP\n(INCITS31)", +# "NEW_COUNTY_NAMELSAD", +# "COUSUBFP", +# "OLD_COUSUB_GEOID", +# "NEW_COUSUB_GEOID", +# "COUSUB_NAMELSAD", +# ) +# .rename( +# { +# "STATEFP\n(INCITS38)": "state_fips", +# "OLD_COUNTYFP\n(INCITS31)": "county_fips_old", +# "OLD_COUNTY_NAMELSAD": "county_name_old", +# "NEW_COUNTYFP\n(INCITS31)": "county_fips_new", +# "NEW_COUNTY_NAMELSAD": "county_name_new", +# "COUSUBFP": "county_sub_fips", +# "OLD_COUSUB_GEOID": "county_sub_geoid_old", +# "NEW_COUSUB_GEOID": "county_sub_geoid_new", +# "COUSUB_NAMELSAD": "county_sub_name", +# } +# ) +# .filter( +# # NOTE: 5 rows labeled with "County subdivisions not defined" but their GEOID/FIPS +# # doesn't make sense anyway +# pl.col("county_sub_fips") != "00000" +# ) +# .with_columns( +# (pl.col("state_fips") + pl.col(col)).alias(col) +# for col in ["county_fips_old", "county_fips_new"] +# ) +# .drop("state_fips") +# ) +# assert crosswalk.select(pl.len()).collect().item() == num_county_subs +# +# subcounty = ( +# pl.scan_csv( +# CROSSWALK_DATA / "county/sub-est2024_9.csv", +# schema_overrides={ +# "SUMLEV": pl.String, +# "STATE": pl.String, +# "COUNTY": pl.String, +# "COUSUB": pl.String, +# "NAME": pl.String, +# "POPESTIMATE2024": pl.Int64, +# }, +# ) +# .select( +# "SUMLEV", +# "STATE", +# "COUNTY", +# "COUSUB", +# "NAME", +# "POPESTIMATE2024", +# ) +# .rename( +# { +# "SUMLEV": "sumlev", +# "STATE": "state_fips", +# "COUNTY": "county_fips", +# "COUSUB": "county_sub_fips", +# "NAME": "county_sub_name", +# "POPESTIMATE2024": "pop_2024", +# } +# ) +# .filter( +# # only minor civil divisions +# pl.col("sumlev") == "061" +# ) +# .with_columns(county_fips=pl.col("state_fips") + pl.col("county_fips")) +# .drop("state_fips", "county_fips", "sumlev") +# .sort("county_sub_fips") +# ) +# assert subcounty.select(pl.len()).collect().item() == num_county_subs +# +# lf = ( +# crosswalk.join( +# subcounty, +# on="county_sub_fips", +# how="inner", +# validate="1:1", +# ) +# .select( +# "county_sub_fips", +# "county_sub_name", +# "county_fips_new", +# "county_name_new", +# "county_fips_old", +# "county_name_old", +# "pop_2024", +# ) +# .with_columns( +# pop_old=pl.col("pop_2024").sum().over("county_fips_old"), +# pop_new=pl.col("pop_2024").sum().over("county_fips_new"), +# ) +# .group_by( +# [ +# "county_fips_new", +# "county_name_new", +# "county_fips_old", +# "county_name_old", +# "pop_old", +# "pop_new", +# ] +# ) +# .agg(pop_agg=pl.col("pop_2024").sum()) # agg county sub to county pairs +# # NOTE: want weights to be expected prop. of origin FIPS that are located in dest. FIPS +# # Aka prop. of origin FIPS that is located in dest. FIPS +# # Ex: wt_new_to_old should give expected prop. of new FIPS that is located in old FIPS. +# .with_columns( +# wt_new_to_old=pl.col("pop_agg") / pl.col("pop_new"), +# wt_old_to_new=pl.col("pop_agg") / pl.col("pop_old"), +# ) +# .select( +# "county_fips_old", +# "county_name_old", +# "county_fips_new", +# "county_name_new", +# "wt_new_to_old", +# "wt_old_to_new", +# ) +# .sort("county_fips_old") +# ) +# assert ( +# lf.select(pl.len()).collect().item() +# == lf.unique(["county_fips_old", "county_fips_new"]) +# .select(pl.len()) +# .collect() +# .item() +# ) +# +# if as_pandas: +# return lf.collect().to_pandas() +# +# return lf + + +# @overload +# def crosswalk_puma_versions() -> pl.LazyFrame: ... +# +# +# @overload +# def crosswalk_puma_versions(as_pandas: Literal[False]) -> pl.LazyFrame: ... +# +# +# @overload +# def crosswalk_puma_versions(as_pandas: Literal[True]) -> pd.DataFrame: ... +# +# +# def crosswalk_puma_versions(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: +# """ +# Crosswalk data between 2010 (effective 2012) and 2020 (effective 2022) PUMAs. +# +# Sourced from Missouri Census Data Center (MCDC). Alternative source exists from +# Integrated Public Use Microdata Series (IPUMS). +# +# - Source (MCDC): https://mcdc.missouri.edu/geography/PUMAs.html +# - See: https://mcdc.missouri.edu/data/corrlst/puma2010-to-puma2020.csv +# - Source (IPUMS): https://usa.ipums.org/usa/volii/pumas20.shtml +# - See: https://usa.ipums.org/usa/resources/volii/PUMA2010_PUMA2020_crosswalk.xls +# """ +# lf = ( +# pl.scan_csv( +# CROSSWALK_DATA / "PUMA/puma2010-to-puma2020.csv", +# skip_rows_after_header=1, +# schema_overrides={ +# "state": pl.String, +# "puma12": pl.String, +# "puma22": pl.String, +# "afact": pl.Float64, +# "AFACT2": pl.Float64, +# }, +# ) +# .select( +# "state", +# "puma22", +# "puma12", +# "afact", # portion of earlier PUMA's population living in later PUMA +# "AFACT2", # portion of later PUMA's population living in the earlier PUMA +# ) +# .rename( +# { +# "afact": "wt_PUMA_2010_to_2020_MCDC", +# "AFACT2": "wt_PUMA_2020_to_2010_MCDC", +# } +# ) +# .with_columns( +# puma_geoid_2020=pl.col("state").str.zfill(2) +# + pl.col("puma22").str.zfill(5), +# puma_geoid_2010=pl.col("state").str.zfill(2) +# + pl.col("puma12").str.zfill(5), +# ) +# .select( +# "puma_geoid_2010", +# "puma_geoid_2020", +# "wt_PUMA_2010_to_2020_MCDC", +# "wt_PUMA_2020_to_2010_MCDC", +# ) +# .filter( +# pl.col("puma_geoid_2010") +# .str.slice(0, 2) +# .is_between(pl.lit("01"), pl.lit("56")), +# pl.col("puma_geoid_2020") +# .str.slice(0, 2) +# .is_between(pl.lit("01"), pl.lit("56")), +# # pl.col("wt_PUMA_2020_to_2010") != 0.0, +# ) +# .sort("puma_geoid_2010", "puma_geoid_2020") +# ) +# +# # lf_IPUMS = ( +# # pl.read_excel( +# # PUMS_DATA / "PUMA2010_PUMA2020_crosswalk.xls", +# # # NOTE: pPUMA20_Pop20 = "Estimated percent of the 2020 PUMA's 2020 population that lies in the area of intersection" +# # columns=[ +# # "GEOID10", +# # "GEOID20", +# # "pPUMA20_Pop20", +# # "pPUMA10_Pop20", +# # ], +# # ) +# # .lazy() +# # .rename( +# # { +# # "GEOID20": "puma_geoid_2020", +# # "GEOID10": "puma_geoid_2010", +# # "pPUMA20_Pop20": "wt_PUMA_2020_to_2010", +# # "pPUMA10_Pop20": "wt_PUMA_2010_to_2020", +# # } +# # ) +# # .with_columns( +# # (pl.col(col) / 100.0).alias(col) +# # for col in ["wt_PUMA_2020_to_2010", "wt_PUMA_2010_to_2020"] +# # ) +# # .select( +# # "puma_geoid_2020", +# # "puma_geoid_2010", +# # "wt_PUMA_2020_to_2010", +# # "wt_PUMA_2010_to_2020", +# # ) +# # .sort("puma_geoid_2020", "puma_geoid_2010") +# # ) +# +# if as_pandas: +# return lf.collect().to_pandas() +# +# return lf + + +# @overload +# def crosswalk_puma_2010_county_2020() -> pl.LazyFrame: ... +# +# +# @overload +# def crosswalk_puma_2010_county_2020(as_pandas: Literal[False]) -> pl.LazyFrame: ... +# +# +# @overload +# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... +# +# +# def crosswalk_puma_2010_county_2020( +# as_pandas: bool = False, +# ) -> pl.LazyFrame | pd.DataFrame: +# """ +# Crosswalk data between 2010 PUMAs and 2020 counties. +# +# Note: uses new CT counties +# +# Source: MCDC 2022 Geocorr +# - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname= +# """ +# lf = ( +# pl.read_csv( +# CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv", +# encoding="iso-8859-1", +# skip_rows_after_header=1, +# columns=[ +# "state", +# "puma12", +# "county", +# "afact", +# "afact2", +# ], +# schema_overrides={ +# "state": pl.String, +# "puma12": pl.String, +# "county": pl.String, +# "afact": pl.String, +# "afact2": pl.String, +# }, +# ) +# .lazy() +# .rename({"county": "county_fips"}) +# .with_columns( +# puma_geoid=pl.col("state") + pl.col("puma12"), +# wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64), +# wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64), +# ) +# .filter( +# pl.col("state").is_between(pl.lit("01"), pl.lit("56")), +# # pl.col("wt_PUMA_2010_to_county") != 0 +# ) +# .select( +# "puma_geoid", +# "county_fips", +# "wt_PUMA_2010_to_county", +# "wt_county_to_PUMA_2010", +# ) +# .sort("puma_geoid", "county_fips") +# ) +# +# if as_pandas: +# return lf.collect().to_pandas() +# +# return lf + + +# TODO: need to settle on some convention for the files: +# e.g., always use Q4 for each year? Q3 used because that was latest at the time when I pulled data +# Available on a quarterly release cycle from HUD: +# - 2010-Q1 - 2011-Q4 data use 2000 Census geographies +# - 2012-Q1 - 2022-Q4 data use 2010 Census geographies +# - 2023-Q1 - present data use 2020 Census geographies +# TODO: note that I can only find one instance of using old version of this function +# in other proj. Although it used zip-to-county file instead of county-to-zip file, +# the weights weren't even used (only used to map zip to county) + + +@overload +def county_to_zip(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def county_to_zip(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def county_to_zip(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + County-to-zip residential ratio weights. + + Use these weights to crosswalk zip-to-county via a weighted mean. + 2012-2022 data use 2010 Census geographies. 2023-present data use + 2020 Census geographies. All years use quarter 4 data. + + Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html + """ + if not (2016 <= year <= 2025): + raise ValueError("Must choose a year between 2016 and 2025") + + data = get_dataset(f"crosswalk/county_to_zip/county_to_zip_{year}.parquet") + lf = pl.scan_parquet(data) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def zip_to_county(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def zip_to_county(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + Zip-to-county residential ratio weights. + + Use these weights to crosswalk counts data from zip-to-county + 2012-2022 data use 2010 Census geographies. 2023-present data use + 2020 Census geographies. All years use quarter 4 data. + + Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html + """ + if not (2016 <= year <= 2025): + raise ValueError("Must choose a year between 2016 and 2025") + + data = get_dataset(f"crosswalk/zip_to_county/zip_to_county_{year}.parquet") + lf = pl.scan_parquet(data) + + if as_pandas: + return lf.collect().to_pandas() + + return lf diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py new file mode 100644 index 0000000..4602438 --- /dev/null +++ b/tests/crosswalk_test.py @@ -0,0 +1,60 @@ +import pandera.polars as pa +import polars as pl +import pytest +from pandas import DataFrame + +from kintsugi.crosswalk import county_to_zip, zip_to_county + +from .models import BasePolarsModel + + +class ZipCountyCrosswalk(BasePolarsModel): + zip_code: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + res_ratio: pl.Float64 = pa.Field(ge=0) # pyright: ignore [reportAny] + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_county_to_zip(year: int) -> None: + county_to_zip(year).collect().pipe(ZipCountyCrosswalk.validate, lazy=True) + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_county_to_zip_as_pandas(year: int) -> None: + df = county_to_zip(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_to_zip_year_exception() -> None: + with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"): + county_to_zip(2010) + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_zip_to_county(year: int) -> None: + zip_to_county(year).collect().pipe(ZipCountyCrosswalk.validate, lazy=True) + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_zip_to_county_as_pandas(year: int) -> None: + df = zip_to_county(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_zip_to_county_year_exception() -> None: + with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"): + zip_to_county(2010) From f797f21d98afba8b97191198c445b286460eb1a9 Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Thu, 12 Mar 2026 17:40:05 -0400 Subject: [PATCH 2/5] PUMA version crosswalk --- pyproject.toml | 2 +- src/kintsugi/_data.py | 3 +- src/kintsugi/crosswalk.py | 489 ++++++++++++++++++-------------------- tests/crosswalk_test.py | 25 +- uv.lock | 2 +- 5 files changed, 264 insertions(+), 257 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bd65ad4..b2c378d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kintsugi" -version = "0.8.0" +version = "0.9.0" description = "Commonly used datasets and functions" readme = "README.md" authors = [ diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py index 21ccc09..7868053 100644 --- a/src/kintsugi/_data.py +++ b/src/kintsugi/_data.py @@ -12,7 +12,7 @@ logger = logging.getLogger("kintsugi") logger.addHandler(logging.NullHandler()) -BASE_URL = "https://raw.githubusercontent.com/winter-again/kintsugi-data/main/data" +BASE_URL = "https://raw.githubusercontent.com/bansallab/kintsugi-data/main/data" DATASETS = { "county_neighbors/county_adjacency2010.txt": "7edda309ad38a4dfc6a6c6c30e1753e5490b9c8a3aa4563188841989b4fe9a96", "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843", @@ -28,6 +28,7 @@ "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a", "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0", "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239", + "crosswalk/PUMA/puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666", "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1", "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb", "crosswalk/zip_to_county/zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942", diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py index 8586209..88106c7 100644 --- a/src/kintsugi/crosswalk.py +++ b/src/kintsugi/crosswalk.py @@ -5,7 +5,242 @@ from ._data import get_dataset -num_county_subs = 169 +# num_county_subs = 169 + + +@overload +def puma_2010_2020(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def puma_2010_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def puma_2010_2020(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + Crosswalk data between 2010 (effective 2012) and 2020 (effective 2022) PUMAs. + + `wt_PUMA_2010_to_2020_MCDC` describes the proportion of the 2010 PUMA's population + that lives in the 2020 PUMA. Similarly, `wt_PUMA_2020_to_2010_MCDC` describes the + proportion of the 2020 PUMA's population that lives in the 2010 PUMA. + + Sourced from Missouri Census Data Center (MCDC). Alternative source exists from + Integrated Public Use Microdata Series (IPUMS). + + - Source (MCDC): https://mcdc.missouri.edu/geography/PUMAs.html + - See: https://mcdc.missouri.edu/data/corrlst/puma2010-to-puma2020.csv + - Source (IPUMS): https://usa.ipums.org/usa/volii/pumas20.shtml + - See: https://usa.ipums.org/usa/resources/volii/PUMA2010_PUMA2020_crosswalk.xls + """ + data = get_dataset("crosswalk/PUMA/puma2010-to-puma2020.csv") + lf = ( + pl.scan_csv( + data, + skip_rows_after_header=1, + schema_overrides={ + "state": pl.String, + "puma12": pl.String, + "puma22": pl.String, + "afact": pl.Float64, + "AFACT2": pl.Float64, + }, + ) + .select( + "state", + "puma22", + "puma12", + "afact", # portion of earlier PUMA's population living in later PUMA + "AFACT2", # portion of later PUMA's population living in the earlier PUMA + ) + .rename( + { + "afact": "wt_PUMA_2010_to_2020_MCDC", + "AFACT2": "wt_PUMA_2020_to_2010_MCDC", + } + ) + .with_columns( + (pl.col(col).str.zfill(5).alias(col) for col in ["puma12", "puma22"]), + state=pl.col("state").str.zfill(2), + ) + .filter(pl.col("state").is_between(pl.lit("01"), pl.lit("56"))) + .with_columns( + puma_geoid_2020=pl.col("state") + pl.col("puma22"), + puma_geoid_2010=pl.col("state") + pl.col("puma12"), + ) + .select( + "puma_geoid_2010", + "puma_geoid_2020", + "wt_PUMA_2010_to_2020_MCDC", + "wt_PUMA_2020_to_2010_MCDC", + ) + .sort("puma_geoid_2010", "puma_geoid_2020") + ) + + # NOTE: implementation using alternate data source + # lf_IPUMS = ( + # pl.read_excel( + # PUMS_DATA / "PUMA2010_PUMA2020_crosswalk.xls", + # # NOTE: pPUMA20_Pop20 = "Estimated percent of the 2020 PUMA's 2020 population that lies in the area of intersection" + # columns=[ + # "GEOID10", + # "GEOID20", + # "pPUMA20_Pop20", + # "pPUMA10_Pop20", + # ], + # ) + # .lazy() + # .rename( + # { + # "GEOID20": "puma_geoid_2020", + # "GEOID10": "puma_geoid_2010", + # "pPUMA20_Pop20": "wt_PUMA_2020_to_2010", + # "pPUMA10_Pop20": "wt_PUMA_2010_to_2020", + # } + # ) + # .with_columns( + # (pl.col(col) / 100.0).alias(col) + # for col in ["wt_PUMA_2020_to_2010", "wt_PUMA_2010_to_2020"] + # ) + # .select( + # "puma_geoid_2020", + # "puma_geoid_2010", + # "wt_PUMA_2020_to_2010", + # "wt_PUMA_2010_to_2020", + # ) + # .sort("puma_geoid_2020", "puma_geoid_2010") + # ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +# @overload +# def crosswalk_puma_2010_county_2020( +# as_pandas: Literal[False] = ..., +# ) -> pl.LazyFrame: ... +# +# +# @overload +# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... +# +# +# def crosswalk_puma_2010_county_2020( +# as_pandas: bool = False, +# ) -> pl.LazyFrame | pd.DataFrame: +# """ +# Crosswalk data between 2010 PUMAs and 2020 counties. +# +# Note: uses new CT counties +# +# Source: MCDC 2022 Geocorr +# - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname= +# """ +# lf = ( +# pl.read_csv( +# CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv", +# encoding="iso-8859-1", +# skip_rows_after_header=1, +# columns=[ +# "state", +# "puma12", +# "county", +# "afact", +# "afact2", +# ], +# schema_overrides={ +# "state": pl.String, +# "puma12": pl.String, +# "county": pl.String, +# "afact": pl.String, +# "afact2": pl.String, +# }, +# ) +# .lazy() +# .rename({"county": "county_fips"}) +# .with_columns( +# puma_geoid=pl.col("state") + pl.col("puma12"), +# wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64), +# wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64), +# ) +# .filter( +# pl.col("state").is_between(pl.lit("01"), pl.lit("56")), +# # pl.col("wt_PUMA_2010_to_county") != 0 +# ) +# .select( +# "puma_geoid", +# "county_fips", +# "wt_PUMA_2010_to_county", +# "wt_county_to_PUMA_2010", +# ) +# .sort("puma_geoid", "county_fips") +# ) +# +# if as_pandas: +# return lf.collect().to_pandas() +# +# return lf + + +@overload +def county_to_zip(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def county_to_zip(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def county_to_zip(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + County-to-zip residential ratio weights. + + Use these weights to crosswalk zip-to-county via a weighted mean. + 2012-2022 data use 2010 Census geographies. 2023-present data use + 2020 Census geographies. All years use quarter 4 data. + + Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html + """ + if not (2016 <= year <= 2025): + raise ValueError("Must choose a year between 2016 and 2025") + + data = get_dataset(f"crosswalk/county_to_zip/county_to_zip_{year}.parquet") + lf = pl.scan_parquet(data) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def zip_to_county(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def zip_to_county(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + Zip-to-county residential ratio weights. + + Use these weights to crosswalk counts data from zip-to-county + 2012-2022 data use 2010 Census geographies. 2023-present data use + 2020 Census geographies. All years use quarter 4 data. + + Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html + """ + if not (2016 <= year <= 2025): + raise ValueError("Must choose a year between 2016 and 2025") + + data = get_dataset(f"crosswalk/zip_to_county/zip_to_county_{year}.parquet") + lf = pl.scan_parquet(data) + + if as_pandas: + return lf.collect().to_pandas() + + return lf # @overload @@ -176,255 +411,3 @@ # return lf.collect().to_pandas() # # return lf - - -# @overload -# def crosswalk_puma_versions() -> pl.LazyFrame: ... -# -# -# @overload -# def crosswalk_puma_versions(as_pandas: Literal[False]) -> pl.LazyFrame: ... -# -# -# @overload -# def crosswalk_puma_versions(as_pandas: Literal[True]) -> pd.DataFrame: ... -# -# -# def crosswalk_puma_versions(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: -# """ -# Crosswalk data between 2010 (effective 2012) and 2020 (effective 2022) PUMAs. -# -# Sourced from Missouri Census Data Center (MCDC). Alternative source exists from -# Integrated Public Use Microdata Series (IPUMS). -# -# - Source (MCDC): https://mcdc.missouri.edu/geography/PUMAs.html -# - See: https://mcdc.missouri.edu/data/corrlst/puma2010-to-puma2020.csv -# - Source (IPUMS): https://usa.ipums.org/usa/volii/pumas20.shtml -# - See: https://usa.ipums.org/usa/resources/volii/PUMA2010_PUMA2020_crosswalk.xls -# """ -# lf = ( -# pl.scan_csv( -# CROSSWALK_DATA / "PUMA/puma2010-to-puma2020.csv", -# skip_rows_after_header=1, -# schema_overrides={ -# "state": pl.String, -# "puma12": pl.String, -# "puma22": pl.String, -# "afact": pl.Float64, -# "AFACT2": pl.Float64, -# }, -# ) -# .select( -# "state", -# "puma22", -# "puma12", -# "afact", # portion of earlier PUMA's population living in later PUMA -# "AFACT2", # portion of later PUMA's population living in the earlier PUMA -# ) -# .rename( -# { -# "afact": "wt_PUMA_2010_to_2020_MCDC", -# "AFACT2": "wt_PUMA_2020_to_2010_MCDC", -# } -# ) -# .with_columns( -# puma_geoid_2020=pl.col("state").str.zfill(2) -# + pl.col("puma22").str.zfill(5), -# puma_geoid_2010=pl.col("state").str.zfill(2) -# + pl.col("puma12").str.zfill(5), -# ) -# .select( -# "puma_geoid_2010", -# "puma_geoid_2020", -# "wt_PUMA_2010_to_2020_MCDC", -# "wt_PUMA_2020_to_2010_MCDC", -# ) -# .filter( -# pl.col("puma_geoid_2010") -# .str.slice(0, 2) -# .is_between(pl.lit("01"), pl.lit("56")), -# pl.col("puma_geoid_2020") -# .str.slice(0, 2) -# .is_between(pl.lit("01"), pl.lit("56")), -# # pl.col("wt_PUMA_2020_to_2010") != 0.0, -# ) -# .sort("puma_geoid_2010", "puma_geoid_2020") -# ) -# -# # lf_IPUMS = ( -# # pl.read_excel( -# # PUMS_DATA / "PUMA2010_PUMA2020_crosswalk.xls", -# # # NOTE: pPUMA20_Pop20 = "Estimated percent of the 2020 PUMA's 2020 population that lies in the area of intersection" -# # columns=[ -# # "GEOID10", -# # "GEOID20", -# # "pPUMA20_Pop20", -# # "pPUMA10_Pop20", -# # ], -# # ) -# # .lazy() -# # .rename( -# # { -# # "GEOID20": "puma_geoid_2020", -# # "GEOID10": "puma_geoid_2010", -# # "pPUMA20_Pop20": "wt_PUMA_2020_to_2010", -# # "pPUMA10_Pop20": "wt_PUMA_2010_to_2020", -# # } -# # ) -# # .with_columns( -# # (pl.col(col) / 100.0).alias(col) -# # for col in ["wt_PUMA_2020_to_2010", "wt_PUMA_2010_to_2020"] -# # ) -# # .select( -# # "puma_geoid_2020", -# # "puma_geoid_2010", -# # "wt_PUMA_2020_to_2010", -# # "wt_PUMA_2010_to_2020", -# # ) -# # .sort("puma_geoid_2020", "puma_geoid_2010") -# # ) -# -# if as_pandas: -# return lf.collect().to_pandas() -# -# return lf - - -# @overload -# def crosswalk_puma_2010_county_2020() -> pl.LazyFrame: ... -# -# -# @overload -# def crosswalk_puma_2010_county_2020(as_pandas: Literal[False]) -> pl.LazyFrame: ... -# -# -# @overload -# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... -# -# -# def crosswalk_puma_2010_county_2020( -# as_pandas: bool = False, -# ) -> pl.LazyFrame | pd.DataFrame: -# """ -# Crosswalk data between 2010 PUMAs and 2020 counties. -# -# Note: uses new CT counties -# -# Source: MCDC 2022 Geocorr -# - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname= -# """ -# lf = ( -# pl.read_csv( -# CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv", -# encoding="iso-8859-1", -# skip_rows_after_header=1, -# columns=[ -# "state", -# "puma12", -# "county", -# "afact", -# "afact2", -# ], -# schema_overrides={ -# "state": pl.String, -# "puma12": pl.String, -# "county": pl.String, -# "afact": pl.String, -# "afact2": pl.String, -# }, -# ) -# .lazy() -# .rename({"county": "county_fips"}) -# .with_columns( -# puma_geoid=pl.col("state") + pl.col("puma12"), -# wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64), -# wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64), -# ) -# .filter( -# pl.col("state").is_between(pl.lit("01"), pl.lit("56")), -# # pl.col("wt_PUMA_2010_to_county") != 0 -# ) -# .select( -# "puma_geoid", -# "county_fips", -# "wt_PUMA_2010_to_county", -# "wt_county_to_PUMA_2010", -# ) -# .sort("puma_geoid", "county_fips") -# ) -# -# if as_pandas: -# return lf.collect().to_pandas() -# -# return lf - - -# TODO: need to settle on some convention for the files: -# e.g., always use Q4 for each year? Q3 used because that was latest at the time when I pulled data -# Available on a quarterly release cycle from HUD: -# - 2010-Q1 - 2011-Q4 data use 2000 Census geographies -# - 2012-Q1 - 2022-Q4 data use 2010 Census geographies -# - 2023-Q1 - present data use 2020 Census geographies -# TODO: note that I can only find one instance of using old version of this function -# in other proj. Although it used zip-to-county file instead of county-to-zip file, -# the weights weren't even used (only used to map zip to county) - - -@overload -def county_to_zip(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... - - -@overload -def county_to_zip(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... - - -def county_to_zip(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: - """ - County-to-zip residential ratio weights. - - Use these weights to crosswalk zip-to-county via a weighted mean. - 2012-2022 data use 2010 Census geographies. 2023-present data use - 2020 Census geographies. All years use quarter 4 data. - - Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html - """ - if not (2016 <= year <= 2025): - raise ValueError("Must choose a year between 2016 and 2025") - - data = get_dataset(f"crosswalk/county_to_zip/county_to_zip_{year}.parquet") - lf = pl.scan_parquet(data) - - if as_pandas: - return lf.collect().to_pandas() - - return lf - - -@overload -def zip_to_county(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... - - -@overload -def zip_to_county(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... - - -def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: - """ - Zip-to-county residential ratio weights. - - Use these weights to crosswalk counts data from zip-to-county - 2012-2022 data use 2010 Census geographies. 2023-present data use - 2020 Census geographies. All years use quarter 4 data. - - Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html - """ - if not (2016 <= year <= 2025): - raise ValueError("Must choose a year between 2016 and 2025") - - data = get_dataset(f"crosswalk/zip_to_county/zip_to_county_{year}.parquet") - lf = pl.scan_parquet(data) - - if as_pandas: - return lf.collect().to_pandas() - - return lf diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py index 4602438..3705208 100644 --- a/tests/crosswalk_test.py +++ b/tests/crosswalk_test.py @@ -3,16 +3,39 @@ import pytest from pandas import DataFrame -from kintsugi.crosswalk import county_to_zip, zip_to_county +from kintsugi.crosswalk import county_to_zip, puma_2010_2020, zip_to_county from .models import BasePolarsModel +class PUMAVersionCrosswalk(BasePolarsModel): + puma_geoid_2010: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + puma_geoid_2020: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + wt_PUMA_2010_to_2020_MCDC: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] + wt_PUMA_2020_to_2010_MCDC: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["puma_geoid_2010", "puma_geoid_2020"] + + +def test_puma_2010_2020() -> None: + puma_2010_2020().collect().pipe(PUMAVersionCrosswalk.validate, lazy=True) + + +def test_puma_2010_2020_as_pandas() -> None: + df = puma_2010_2020(as_pandas=True) + + assert isinstance(df, DataFrame) + + class ZipCountyCrosswalk(BasePolarsModel): zip_code: pl.String # pyright: ignore [reportUninitializedInstanceVariable] county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] res_ratio: pl.Float64 = pa.Field(ge=0) # pyright: ignore [reportAny] + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["zip_code", "county_fips"] + @pytest.mark.parametrize( ("year"), diff --git a/uv.lock b/uv.lock index df92410..9ec42ad 100644 --- a/uv.lock +++ b/uv.lock @@ -200,7 +200,7 @@ wheels = [ [[package]] name = "kintsugi" -version = "0.8.0" +version = "0.9.0" source = { editable = "." } dependencies = [ { name = "pandas" }, From b5125b80ed10b83fb4ad6483af18f3369f1aefc9 Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Fri, 13 Mar 2026 09:17:53 -0400 Subject: [PATCH 3/5] PUMA-county crosswalk --- src/kintsugi/_data.py | 1 + src/kintsugi/crosswalk.py | 128 ++++++++++++++++++-------------------- tests/crosswalk_test.py | 74 +++++++++++++++++++++- 3 files changed, 134 insertions(+), 69 deletions(-) diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py index 7868053..c0ff179 100644 --- a/src/kintsugi/_data.py +++ b/src/kintsugi/_data.py @@ -28,6 +28,7 @@ "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a", "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0", "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239", + "crosswalk/PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv": "b66f23aa27be3daf1f6456607478848df3383d4c7111d6f935f792fc9542b807", "crosswalk/PUMA/puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666", "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1", "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb", diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py index 88106c7..b5593bd 100644 --- a/src/kintsugi/crosswalk.py +++ b/src/kintsugi/crosswalk.py @@ -5,8 +5,6 @@ from ._data import get_dataset -# num_county_subs = 169 - @overload def puma_2010_2020(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... @@ -116,71 +114,64 @@ def puma_2010_2020(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: return lf -# @overload -# def crosswalk_puma_2010_county_2020( -# as_pandas: Literal[False] = ..., -# ) -> pl.LazyFrame: ... -# -# -# @overload -# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... -# -# -# def crosswalk_puma_2010_county_2020( -# as_pandas: bool = False, -# ) -> pl.LazyFrame | pd.DataFrame: -# """ -# Crosswalk data between 2010 PUMAs and 2020 counties. -# -# Note: uses new CT counties -# -# Source: MCDC 2022 Geocorr -# - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname= -# """ -# lf = ( -# pl.read_csv( -# CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv", -# encoding="iso-8859-1", -# skip_rows_after_header=1, -# columns=[ -# "state", -# "puma12", -# "county", -# "afact", -# "afact2", -# ], -# schema_overrides={ -# "state": pl.String, -# "puma12": pl.String, -# "county": pl.String, -# "afact": pl.String, -# "afact2": pl.String, -# }, -# ) -# .lazy() -# .rename({"county": "county_fips"}) -# .with_columns( -# puma_geoid=pl.col("state") + pl.col("puma12"), -# wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64), -# wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64), -# ) -# .filter( -# pl.col("state").is_between(pl.lit("01"), pl.lit("56")), -# # pl.col("wt_PUMA_2010_to_county") != 0 -# ) -# .select( -# "puma_geoid", -# "county_fips", -# "wt_PUMA_2010_to_county", -# "wt_county_to_PUMA_2010", -# ) -# .sort("puma_geoid", "county_fips") -# ) -# -# if as_pandas: -# return lf.collect().to_pandas() -# -# return lf +@overload +def puma_2010_county_2020( + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def puma_2010_county_2020( + as_pandas: bool = False, +) -> pl.LazyFrame | pd.DataFrame: + """ + Crosswalk data between 2010 PUMAs (effective 2012) and 2020 (effective 2022) counties. + + Note: uses new CT counties despite counties being labeled as 2020 + + Source: MCDC 2022 Geocorr + PUMA page: https://mcdc.missouri.edu/geography/PUMAs.html + Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname= + """ + data = get_dataset( + "crosswalk/PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv" + ) + lf = ( + pl.scan_csv( + data, + encoding="utf8-lossy", + skip_rows_after_header=1, + schema_overrides={ + "state": pl.String, + "puma12": pl.String, + "county": pl.String, + "afact": pl.String, + "afact2": pl.String, + }, + ) + .filter(pl.col("state").is_between(pl.lit("01"), pl.lit("56"))) + .rename({"county": "county_fips"}) + .with_columns( + puma_geoid_2010=pl.col("state") + pl.col("puma12"), + wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64), + wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64), + ) + .select( + "puma_geoid_2010", + "county_fips", + "wt_PUMA_2010_to_county", + "wt_county_to_PUMA_2010", + ) + .sort("puma_geoid_2010", "county_fips") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf @overload @@ -311,6 +302,7 @@ def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataF # ) # assert crosswalk.select(pl.len()).collect().item() == num_county_subs # +# num_county_subs = 169 # subcounty = ( # pl.scan_csv( # CROSSWALK_DATA / "county/sub-est2024_9.csv", @@ -349,7 +341,7 @@ def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataF # .drop("state_fips", "county_fips", "sumlev") # .sort("county_sub_fips") # ) -# assert subcounty.select(pl.len()).collect().item() == num_county_subs +# assert subcounty.select(pl.len()).collect().item() == 169 # # lf = ( # crosswalk.join( diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py index 3705208..a1efa29 100644 --- a/tests/crosswalk_test.py +++ b/tests/crosswalk_test.py @@ -2,8 +2,14 @@ import polars as pl import pytest from pandas import DataFrame +from pandera.polars import PolarsData -from kintsugi.crosswalk import county_to_zip, puma_2010_2020, zip_to_county +from kintsugi.crosswalk import ( + county_to_zip, + puma_2010_2020, + puma_2010_county_2020, + zip_to_county, +) from .models import BasePolarsModel @@ -17,6 +23,24 @@ class PUMAVersionCrosswalk(BasePolarsModel): class Config: # pyright: ignore [reportIncompatibleVariableOverride] unique: list[str] = ["puma_geoid_2010", "puma_geoid_2020"] + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.all_horizontal( + pl.col("puma_geoid_2010") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + pl.col("puma_geoid_2020") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + ).all() + ) + .collect() + .item() + is True + ) + def test_puma_2010_2020() -> None: puma_2010_2020().collect().pipe(PUMAVersionCrosswalk.validate, lazy=True) @@ -28,6 +52,40 @@ def test_puma_2010_2020_as_pandas() -> None: assert isinstance(df, DataFrame) +class PUMACountyCrosswalk(BasePolarsModel): + puma_geoid_2010: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + wt_PUMA_2010_to_county: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] + wt_county_to_PUMA_2010: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["puma_geoid_2010", "county_fips"] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +def test_puma_2010_county_2020() -> None: + puma_2010_county_2020().collect().pipe(PUMACountyCrosswalk.validate, lazy=True) + + +def test_puma_2010_county_2020_as_pandas() -> None: + df = puma_2010_county_2020(as_pandas=True) + + assert isinstance(df, DataFrame) + + class ZipCountyCrosswalk(BasePolarsModel): zip_code: pl.String # pyright: ignore [reportUninitializedInstanceVariable] county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] @@ -36,6 +94,20 @@ class ZipCountyCrosswalk(BasePolarsModel): class Config: # pyright: ignore [reportIncompatibleVariableOverride] unique: list[str] = ["zip_code", "county_fips"] + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + @pytest.mark.parametrize( ("year"), From 71c88ac9ee4e95fd4e25102d958e73f9c993a6d9 Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Fri, 13 Mar 2026 11:16:09 -0400 Subject: [PATCH 4/5] Crosswalk CT counties --- src/kintsugi/_data.py | 2 + src/kintsugi/crosswalk.py | 336 +++++++++++++++++++------------------- tests/crosswalk_test.py | 51 +++++- 3 files changed, 215 insertions(+), 174 deletions(-) diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py index c0ff179..210c3be 100644 --- a/src/kintsugi/_data.py +++ b/src/kintsugi/_data.py @@ -18,6 +18,8 @@ "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843", "county_neighbors/county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816", "county_neighbors/county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575", + "crosswalk/county/ct_cou_to_cousub_crosswalk.txt": "13ad002564e30e6dcd1df112fdaa985f2cd0cdb908f7f3017cd2723db294cff8", + "crosswalk/county/sub-est2024_9.csv": "e5bd2cb1b10cf12d741572eef9e8eff19a1f6c5e08eb910d6f44f299c7ca83df", "crosswalk/county_to_zip/county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad", "crosswalk/county_to_zip/county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b", "crosswalk/county_to_zip/county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba", diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py index b5593bd..9d9954b 100644 --- a/src/kintsugi/crosswalk.py +++ b/src/kintsugi/crosswalk.py @@ -234,172 +234,170 @@ def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataF return lf -# @overload -# def crosswalk_CT_counties(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... -# -# -# @overload -# def crosswalk_CT_counties(as_pandas: Literal[True]) -> pd.DataFrame: ... -# -# -# def crosswalk_CT_counties(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: -# """ -# Crosswalk CT counties between pre and post-2022 changes. Weights calculated based on county subdivision populations. -# -# See: -# - FIPS code changes: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.html -# - CT specific: https://www2.census.gov/geo/pdfs/reference/ct_county_equiv_change.pdf -# - CT specific: https://www.federalregister.gov/documents/2022/06/06/2022-12063/change-to-county-equivalents-in-the-state-of-connecticut -# -# Crosswalk: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.January_2020.html -# Source: https://www2.census.gov/geo/docs/reference/ct_change/ct_cou_to_cousub_crosswalk.txt -# -# CT county subdivision populations source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html -# FTP: https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/cities/totals/sub-est2024_9.csv -# -# """ -# crosswalk = ( -# pl.scan_csv( -# CROSSWALK_DATA / "county/ct_cou_to_cousub_crosswalk.txt", -# separator="|", -# n_rows=174, -# infer_schema=False, -# ) -# .select( -# "STATEFP\n(INCITS38)", -# "OLD_COUNTYFP\n(INCITS31)", -# "OLD_COUNTY_NAMELSAD", -# "NEW_COUNTYFP\n(INCITS31)", -# "NEW_COUNTY_NAMELSAD", -# "COUSUBFP", -# "OLD_COUSUB_GEOID", -# "NEW_COUSUB_GEOID", -# "COUSUB_NAMELSAD", -# ) -# .rename( -# { -# "STATEFP\n(INCITS38)": "state_fips", -# "OLD_COUNTYFP\n(INCITS31)": "county_fips_old", -# "OLD_COUNTY_NAMELSAD": "county_name_old", -# "NEW_COUNTYFP\n(INCITS31)": "county_fips_new", -# "NEW_COUNTY_NAMELSAD": "county_name_new", -# "COUSUBFP": "county_sub_fips", -# "OLD_COUSUB_GEOID": "county_sub_geoid_old", -# "NEW_COUSUB_GEOID": "county_sub_geoid_new", -# "COUSUB_NAMELSAD": "county_sub_name", -# } -# ) -# .filter( -# # NOTE: 5 rows labeled with "County subdivisions not defined" but their GEOID/FIPS -# # doesn't make sense anyway -# pl.col("county_sub_fips") != "00000" -# ) -# .with_columns( -# (pl.col("state_fips") + pl.col(col)).alias(col) -# for col in ["county_fips_old", "county_fips_new"] -# ) -# .drop("state_fips") -# ) -# assert crosswalk.select(pl.len()).collect().item() == num_county_subs -# -# num_county_subs = 169 -# subcounty = ( -# pl.scan_csv( -# CROSSWALK_DATA / "county/sub-est2024_9.csv", -# schema_overrides={ -# "SUMLEV": pl.String, -# "STATE": pl.String, -# "COUNTY": pl.String, -# "COUSUB": pl.String, -# "NAME": pl.String, -# "POPESTIMATE2024": pl.Int64, -# }, -# ) -# .select( -# "SUMLEV", -# "STATE", -# "COUNTY", -# "COUSUB", -# "NAME", -# "POPESTIMATE2024", -# ) -# .rename( -# { -# "SUMLEV": "sumlev", -# "STATE": "state_fips", -# "COUNTY": "county_fips", -# "COUSUB": "county_sub_fips", -# "NAME": "county_sub_name", -# "POPESTIMATE2024": "pop_2024", -# } -# ) -# .filter( -# # only minor civil divisions -# pl.col("sumlev") == "061" -# ) -# .with_columns(county_fips=pl.col("state_fips") + pl.col("county_fips")) -# .drop("state_fips", "county_fips", "sumlev") -# .sort("county_sub_fips") -# ) -# assert subcounty.select(pl.len()).collect().item() == 169 -# -# lf = ( -# crosswalk.join( -# subcounty, -# on="county_sub_fips", -# how="inner", -# validate="1:1", -# ) -# .select( -# "county_sub_fips", -# "county_sub_name", -# "county_fips_new", -# "county_name_new", -# "county_fips_old", -# "county_name_old", -# "pop_2024", -# ) -# .with_columns( -# pop_old=pl.col("pop_2024").sum().over("county_fips_old"), -# pop_new=pl.col("pop_2024").sum().over("county_fips_new"), -# ) -# .group_by( -# [ -# "county_fips_new", -# "county_name_new", -# "county_fips_old", -# "county_name_old", -# "pop_old", -# "pop_new", -# ] -# ) -# .agg(pop_agg=pl.col("pop_2024").sum()) # agg county sub to county pairs -# # NOTE: want weights to be expected prop. of origin FIPS that are located in dest. FIPS -# # Aka prop. of origin FIPS that is located in dest. FIPS -# # Ex: wt_new_to_old should give expected prop. of new FIPS that is located in old FIPS. -# .with_columns( -# wt_new_to_old=pl.col("pop_agg") / pl.col("pop_new"), -# wt_old_to_new=pl.col("pop_agg") / pl.col("pop_old"), -# ) -# .select( -# "county_fips_old", -# "county_name_old", -# "county_fips_new", -# "county_name_new", -# "wt_new_to_old", -# "wt_old_to_new", -# ) -# .sort("county_fips_old") -# ) -# assert ( -# lf.select(pl.len()).collect().item() -# == lf.unique(["county_fips_old", "county_fips_new"]) -# .select(pl.len()) -# .collect() -# .item() -# ) -# -# if as_pandas: -# return lf.collect().to_pandas() -# -# return lf +@overload +def counties_CT(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def counties_CT(as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def counties_CT(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + Crosswalk CT counties between pre and post-2022 changes. + + Uses county subdivisions as a more accurate intermediary. + Weights calculated based on county subdivision 2024 populations. + + See: + - https://www2.census.gov/geo/pdfs/reference/ct_county_equiv_change.pdf + - https://www.federalregister.gov/documents/2022/06/06/2022-12063/change-to-county-equivalents-in-the-state-of-connecticut + + Crosswalk data: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.January_2020.html + Source: https://www2.census.gov/geo/docs/reference/ct_change/ct_cou_to_cousub_crosswalk.txt + + CT county subdivision populations source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html + FTP: https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/cities/totals/sub-est2024_9.csv + + """ + num_county_subs = 169 + data_county_to_sub = get_dataset("crosswalk/county/ct_cou_to_cousub_crosswalk.txt") + # NOTE: provides mapping of county subdivisions between old and new CT counties + crosswalk = ( + pl.scan_csv( + data_county_to_sub, + separator="|", + n_rows=174, + infer_schema=False, + ) + .rename(lambda col: col.strip().lower().split("(")[0].strip()) + .rename( + { + "statefp": "state_fips", + "old_countyfp": "county_fips_old", + "old_county_namelsad": "county_name_old", + "new_countyfp": "county_fips_new", + "new_county_namelsad": "county_name_new", + "cousubfp": "county_sub_fips", + "old_cousub_geoid": "county_sub_geoid_old", + "new_cousub_geoid": "county_sub_geoid_new", + "cousub_namelsad": "county_sub_name", + } + ) + .filter( + # NOTE: 5 rows labeled with "County subdivisions not defined" but their GEOID/FIPS + # doesn't make sense anyway + pl.col("county_sub_fips") != "00000" + ) + .with_columns( + (pl.col("state_fips") + pl.col(col)).alias(col) + for col in ["county_fips_old", "county_fips_new"] + ) + .select( + "county_sub_fips", + "county_fips_new", + "county_name_new", + "county_fips_old", + "county_name_old", + ) + ) + assert crosswalk.select(pl.len()).collect().item() == num_county_subs + + data_county_sub = get_dataset("crosswalk/county/sub-est2024_9.csv") + # NOTE: provides county subdivision population counts + county_sub = ( + pl.scan_csv( + data_county_sub, + schema_overrides={ + "SUMLEV": pl.String, + "STATE": pl.String, + "COUNTY": pl.String, + "COUSUB": pl.String, + "NAME": pl.String, + "POPESTIMATE2024": pl.Int64, + }, + ) + .rename( + { + "SUMLEV": "sumlev", + "COUSUB": "county_sub_fips", + "NAME": "county_sub_name", + "POPESTIMATE2024": "pop_2024", + } + ) + .select( + "sumlev", + "county_sub_fips", + "county_sub_name", + "pop_2024", + ) + .filter( + # 061 apparently considered part of 060 (county subdivisions/minor civil divisions) + # Use because it's one level below county (050) + pl.col("sumlev") == "061" + ) + .drop("sumlev") + .sort("county_sub_fips") + ) + assert county_sub.select(pl.len()).collect().item() == 169 + + print(crosswalk.collect()) + print(county_sub.collect()) + + lf = ( + crosswalk.join( + county_sub, + on="county_sub_fips", + how="inner", + validate="1:1", + ) + .select( + "county_sub_fips", + "county_sub_name", + "county_fips_new", + "county_name_new", + "county_fips_old", + "county_name_old", + "pop_2024", + ) + .with_columns( + pop_old=pl.col("pop_2024").sum().over("county_fips_old"), + pop_new=pl.col("pop_2024").sum().over("county_fips_new"), + ) + .group_by( + [ + "county_fips_new", + "county_name_new", + "county_fips_old", + "county_name_old", + "pop_old", + "pop_new", + ] + ) + .agg( + pop_intersect=pl.col( + "pop_2024" + ).sum() # sum county subdivision population by new-county-old-county pairs + ) + # NOTE: want weights to be expected prop. of intersection that is located in dest. FIPS + # Aka prop. of origin FIPS that is located in dest. FIPS + # Ex: wt_new_to_old should give expected prop. of new FIPS that is located in old FIPS. + .with_columns( + wt_new_to_old=pl.col("pop_intersect") / pl.col("pop_new"), + wt_old_to_new=pl.col("pop_intersect") / pl.col("pop_old"), + ) + .select( + "county_fips_old", + "county_name_old", + "county_fips_new", + "county_name_new", + "wt_new_to_old", + "wt_old_to_new", + ) + .sort("county_fips_old") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py index a1efa29..2276817 100644 --- a/tests/crosswalk_test.py +++ b/tests/crosswalk_test.py @@ -5,6 +5,7 @@ from pandera.polars import PolarsData from kintsugi.crosswalk import ( + counties_CT, county_to_zip, puma_2010_2020, puma_2010_county_2020, @@ -17,8 +18,8 @@ class PUMAVersionCrosswalk(BasePolarsModel): puma_geoid_2010: pl.String # pyright: ignore [reportUninitializedInstanceVariable] puma_geoid_2020: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - wt_PUMA_2010_to_2020_MCDC: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] - wt_PUMA_2020_to_2010_MCDC: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] + wt_PUMA_2010_to_2020_MCDC: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + wt_PUMA_2020_to_2010_MCDC: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] unique: list[str] = ["puma_geoid_2010", "puma_geoid_2020"] @@ -55,8 +56,8 @@ def test_puma_2010_2020_as_pandas() -> None: class PUMACountyCrosswalk(BasePolarsModel): puma_geoid_2010: pl.String # pyright: ignore [reportUninitializedInstanceVariable] county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - wt_PUMA_2010_to_county: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] - wt_county_to_PUMA_2010: pl.Float64 # pyright: ignore [reportUninitializedInstanceVariable] + wt_PUMA_2010_to_county: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + wt_county_to_PUMA_2010: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] unique: list[str] = ["puma_geoid_2010", "county_fips"] @@ -89,7 +90,7 @@ def test_puma_2010_county_2020_as_pandas() -> None: class ZipCountyCrosswalk(BasePolarsModel): zip_code: pl.String # pyright: ignore [reportUninitializedInstanceVariable] county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] - res_ratio: pl.Float64 = pa.Field(ge=0) # pyright: ignore [reportAny] + res_ratio: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] class Config: # pyright: ignore [reportIncompatibleVariableOverride] unique: list[str] = ["zip_code", "county_fips"] @@ -153,3 +154,43 @@ def test_zip_to_county_as_pandas(year: int) -> None: def test_zip_to_county_year_exception() -> None: with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"): zip_to_county(2010) + + +class CountiesCT(BasePolarsModel): + county_fips_old: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name_old: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips_new: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name_new: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + wt_new_to_old: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + wt_old_to_new: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["county_fips_old", "county_fips_new"] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.all_horizontal( + pl.col("county_fips_old") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + pl.col("county_fips_new") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + ).all() + ) + .collect() + .item() + is True + ) + + +def test_counties_CT() -> None: + counties_CT().collect().pipe(CountiesCT.validate, lazy=True) + + +def test_counties_CT_as_pandas() -> None: + df = counties_CT(as_pandas=True) + + assert isinstance(df, DataFrame) From d4613b0cf44eae3a5e05082aec5d091e90b9f015 Mon Sep 17 00:00:00 2001 From: winter-again <63322884+winter-again@users.noreply.github.com> Date: Fri, 13 Mar 2026 11:41:15 -0400 Subject: [PATCH 5/5] Bump version and add crosswalk example to README --- README.md | 10 +++++++++- pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 32d0384..7c8a0e8 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install git+ssh://git@github.com/winter-again/kintsugi ## Datasets -Currently supported datasets: +Currently supported datasets. Where appropriate, you can pass `as_pandas=True` to get a pandas dataframe back: County neighbors @@ -71,3 +71,11 @@ from kintsugi.metadata import counties lf_counties = counties(2020) ``` + +Crosswalk 2010 PUMAs to 2020 counties + +```python +from kintsugi.crosswalk import puma_2010_county_2020 + +crosswalk = puma_2010_county_2020() +``` diff --git a/pyproject.toml b/pyproject.toml index b2c378d..3382a06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kintsugi" -version = "0.9.0" +version = "0.10.0" description = "Commonly used datasets and functions" readme = "README.md" authors = [ diff --git a/uv.lock b/uv.lock index 9ec42ad..b3323e7 100644 --- a/uv.lock +++ b/uv.lock @@ -200,7 +200,7 @@ wheels = [ [[package]] name = "kintsugi" -version = "0.9.0" +version = "0.10.0" source = { editable = "." } dependencies = [ { name = "pandas" },