diff --git a/README.md b/README.md index 32d0384..7c8a0e8 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install git+ssh://git@github.com/winter-again/kintsugi ## Datasets -Currently supported datasets: +Currently supported datasets. Where appropriate, you can pass `as_pandas=True` to get a pandas dataframe back: County neighbors @@ -71,3 +71,11 @@ from kintsugi.metadata import counties lf_counties = counties(2020) ``` + +Crosswalk 2010 PUMAs to 2020 counties + +```python +from kintsugi.crosswalk import puma_2010_county_2020 + +crosswalk = puma_2010_county_2020() +``` diff --git a/pyproject.toml b/pyproject.toml index bd65ad4..3382a06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kintsugi" -version = "0.8.0" +version = "0.10.0" description = "Commonly used datasets and functions" readme = "README.md" authors = [ diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py index 69d334e..210c3be 100644 --- a/src/kintsugi/_data.py +++ b/src/kintsugi/_data.py @@ -12,12 +12,36 @@ logger = logging.getLogger("kintsugi") logger.addHandler(logging.NullHandler()) -BASE_URL = "https://raw.githubusercontent.com/winter-again/kintsugi-data/main/data" +BASE_URL = "https://raw.githubusercontent.com/bansallab/kintsugi-data/main/data" DATASETS = { "county_neighbors/county_adjacency2010.txt": "7edda309ad38a4dfc6a6c6c30e1753e5490b9c8a3aa4563188841989b4fe9a96", "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843", "county_neighbors/county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816", "county_neighbors/county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575", + "crosswalk/county/ct_cou_to_cousub_crosswalk.txt": "13ad002564e30e6dcd1df112fdaa985f2cd0cdb908f7f3017cd2723db294cff8", + "crosswalk/county/sub-est2024_9.csv": "e5bd2cb1b10cf12d741572eef9e8eff19a1f6c5e08eb910d6f44f299c7ca83df", + 
"crosswalk/county_to_zip/county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad", + "crosswalk/county_to_zip/county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b", + "crosswalk/county_to_zip/county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba", + "crosswalk/county_to_zip/county_to_zip_2019.parquet": "3a06ca1a4d21638a4d014cbb4bc29b7d6b254a937dab1536058152d226073d62", + "crosswalk/county_to_zip/county_to_zip_2020.parquet": "06cb2bfa55085d9c5f0f652628a61d1bf64fe14ea190df5bdc361844f5204250", + "crosswalk/county_to_zip/county_to_zip_2021.parquet": "fdd59a6b8ab06a534d1c9c4765101105ce8bc08d9685eb9441c58701bffd96ae", + "crosswalk/county_to_zip/county_to_zip_2022.parquet": "c3875e66bff03ff71ad73b89277fc892a294a5d71be08b4eb4ac2c431d715b83", + "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a", + "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0", + "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239", + "crosswalk/PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv": "b66f23aa27be3daf1f6456607478848df3383d4c7111d6f935f792fc9542b807", + "crosswalk/PUMA/puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666", + "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1", + "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb", + "crosswalk/zip_to_county/zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942", + "crosswalk/zip_to_county/zip_to_county_2019.parquet": "78ac9cdda7c5e00b82315ceaece4f3722da905e2ec4f68774a135d327c257435", + 
"crosswalk/zip_to_county/zip_to_county_2020.parquet": "17f518b2a6de5ee58b134c0dc5fbf10f6d48c45b14fdbfcfe58a4269e08eaee2", + "crosswalk/zip_to_county/zip_to_county_2021.parquet": "655c6ce8b257e942ac1c7dfa068d93ad2a1f139c9cd0c110efbce752ab27cea4", + "crosswalk/zip_to_county/zip_to_county_2022.parquet": "9f125642aeb5aba3c8f2bdf616c186f975fc6afc1b5543d55204ad9b2a10b252", + "crosswalk/zip_to_county/zip_to_county_2023.parquet": "89f50d28acb226a5777ededc1f8301dc1f028c65dd04b860355d23efe24b89c5", + "crosswalk/zip_to_county/zip_to_county_2024.parquet": "8d76e09fa52923c0901c0afad26ba2d32075210bb0ab5b82caa14ce5465b3d80", + "crosswalk/zip_to_county/zip_to_county_2025.parquet": "2683272f9759646d0a6234de06076f274852b85fcd2fd9e3d2dfcb3aa10def80", "geo/cb_2020_us_county_5m.zip": "187e7118304428e5450083beb375e67c2c516c58a01ce52db95aaf24f18df3ba", "geo/cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3", "geo/cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b", diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py new file mode 100644 index 0000000..9d9954b --- /dev/null +++ b/src/kintsugi/crosswalk.py @@ -0,0 +1,403 @@ +from typing import Literal, overload + +import pandas as pd +import polars as pl + +from ._data import get_dataset + + +@overload +def puma_2010_2020(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def puma_2010_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def puma_2010_2020(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + Crosswalk data between 2010 (effective 2012) and 2020 (effective 2022) PUMAs. + + `wt_PUMA_2010_to_2020_MCDC` describes the proportion of the 2010 PUMA's population + that lives in the 2020 PUMA. Similarly, `wt_PUMA_2020_to_2010_MCDC` describes the + proportion of the 2020 PUMA's population that lives in the 2010 PUMA. + + Sourced from Missouri Census Data Center (MCDC). 
Alternative source exists from + Integrated Public Use Microdata Series (IPUMS). + + - Source (MCDC): https://mcdc.missouri.edu/geography/PUMAs.html + - See: https://mcdc.missouri.edu/data/corrlst/puma2010-to-puma2020.csv + - Source (IPUMS): https://usa.ipums.org/usa/volii/pumas20.shtml + - See: https://usa.ipums.org/usa/resources/volii/PUMA2010_PUMA2020_crosswalk.xls + """ + data = get_dataset("crosswalk/PUMA/puma2010-to-puma2020.csv") + lf = ( + pl.scan_csv( + data, + skip_rows_after_header=1, + schema_overrides={ + "state": pl.String, + "puma12": pl.String, + "puma22": pl.String, + "afact": pl.Float64, + "AFACT2": pl.Float64, + }, + ) + .select( + "state", + "puma22", + "puma12", + "afact", # portion of earlier PUMA's population living in later PUMA + "AFACT2", # portion of later PUMA's population living in the earlier PUMA + ) + .rename( + { + "afact": "wt_PUMA_2010_to_2020_MCDC", + "AFACT2": "wt_PUMA_2020_to_2010_MCDC", + } + ) + .with_columns( + (pl.col(col).str.zfill(5).alias(col) for col in ["puma12", "puma22"]), + state=pl.col("state").str.zfill(2), + ) + .filter(pl.col("state").is_between(pl.lit("01"), pl.lit("56"))) + .with_columns( + puma_geoid_2020=pl.col("state") + pl.col("puma22"), + puma_geoid_2010=pl.col("state") + pl.col("puma12"), + ) + .select( + "puma_geoid_2010", + "puma_geoid_2020", + "wt_PUMA_2010_to_2020_MCDC", + "wt_PUMA_2020_to_2010_MCDC", + ) + .sort("puma_geoid_2010", "puma_geoid_2020") + ) + + # NOTE: implementation using alternate data source + # lf_IPUMS = ( + # pl.read_excel( + # PUMS_DATA / "PUMA2010_PUMA2020_crosswalk.xls", + # # NOTE: pPUMA20_Pop20 = "Estimated percent of the 2020 PUMA's 2020 population that lies in the area of intersection" + # columns=[ + # "GEOID10", + # "GEOID20", + # "pPUMA20_Pop20", + # "pPUMA10_Pop20", + # ], + # ) + # .lazy() + # .rename( + # { + # "GEOID20": "puma_geoid_2020", + # "GEOID10": "puma_geoid_2010", + # "pPUMA20_Pop20": "wt_PUMA_2020_to_2010", + # "pPUMA10_Pop20": "wt_PUMA_2010_to_2020", + # } 
+ # ) + # .with_columns( + # (pl.col(col) / 100.0).alias(col) + # for col in ["wt_PUMA_2020_to_2010", "wt_PUMA_2010_to_2020"] + # ) + # .select( + # "puma_geoid_2020", + # "puma_geoid_2010", + # "wt_PUMA_2020_to_2010", + # "wt_PUMA_2010_to_2020", + # ) + # .sort("puma_geoid_2020", "puma_geoid_2010") + # ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def puma_2010_county_2020( + as_pandas: Literal[False] = ..., +) -> pl.LazyFrame: ... + + +@overload +def puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def puma_2010_county_2020( + as_pandas: bool = False, +) -> pl.LazyFrame | pd.DataFrame: + """ + Crosswalk data between 2010 PUMAs (effective 2012) and 2020 (effective 2022) counties. + + Note: uses new CT counties despite counties being labeled as 2020 + + Source: MCDC 2022 Geocorr + PUMA page: https://mcdc.missouri.edu/geography/PUMAs.html + Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname= + """ + data = get_dataset( + "crosswalk/PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv" + ) + lf = ( + pl.scan_csv( + data, + encoding="utf8-lossy", + skip_rows_after_header=1, + schema_overrides={ + "state": 
pl.String, + "puma12": pl.String, + "county": pl.String, + "afact": pl.String, + "afact2": pl.String, + }, + ) + .filter(pl.col("state").is_between(pl.lit("01"), pl.lit("56"))) + .rename({"county": "county_fips"}) + .with_columns( + puma_geoid_2010=pl.col("state") + pl.col("puma12"), + wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64), + wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64), + ) + .select( + "puma_geoid_2010", + "county_fips", + "wt_PUMA_2010_to_county", + "wt_county_to_PUMA_2010", + ) + .sort("puma_geoid_2010", "county_fips") + ) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def county_to_zip(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def county_to_zip(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def county_to_zip(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + County-to-zip residential ratio weights. + + Use these weights to crosswalk zip-to-county via a weighted mean. + 2012-2022 data use 2010 Census geographies. 2023-present data use + 2020 Census geographies. All years use quarter 4 data. + + Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html + """ + if not (2016 <= year <= 2025): + raise ValueError("Must choose a year between 2016 and 2025") + + data = get_dataset(f"crosswalk/county_to_zip/county_to_zip_{year}.parquet") + lf = pl.scan_parquet(data) + + if as_pandas: + return lf.collect().to_pandas() + + return lf + + +@overload +def zip_to_county(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ... + + +@overload +def zip_to_county(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ... + + +def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame: + """ + Zip-to-county residential ratio weights. + + Use these weights to crosswalk counts data from zip-to-county + 2012-2022 data use 2010 Census geographies. 
2023-present data use
+    2020 Census geographies. All years use quarter 4 data.
+
+    Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html
+    """
+    if not (2016 <= year <= 2025):
+        raise ValueError("Must choose a year between 2016 and 2025")
+
+    data = get_dataset(f"crosswalk/zip_to_county/zip_to_county_{year}.parquet")
+    lf = pl.scan_parquet(data)
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
+
+
+@overload
+def counties_CT(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+
+
+@overload
+def counties_CT(as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def counties_CT(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+    """
+    Crosswalk CT counties between pre and post-2022 changes.
+
+    Uses county subdivisions as a more accurate intermediary.
+    Weights calculated based on county subdivision 2024 populations.
+
+    Returns one row per old/new county pair. Each weight is the share of the
+    origin county's 2024 population located in the destination county.
+
+    See:
+    - https://www2.census.gov/geo/pdfs/reference/ct_county_equiv_change.pdf
+    - https://www.federalregister.gov/documents/2022/06/06/2022-12063/change-to-county-equivalents-in-the-state-of-connecticut
+
+    Crosswalk data: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.January_2020.html
+    Source: https://www2.census.gov/geo/docs/reference/ct_change/ct_cou_to_cousub_crosswalk.txt
+
+    CT county subdivision populations source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html
+    FTP: https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/cities/totals/sub-est2024_9.csv
+
+    """
+    num_county_subs = 169
+    data_county_to_sub = get_dataset("crosswalk/county/ct_cou_to_cousub_crosswalk.txt")
+    # NOTE: provides mapping of county subdivisions between old and new CT counties
+    crosswalk = (
+        pl.scan_csv(
+            data_county_to_sub,
+            separator="|",
+            n_rows=174,
+            infer_schema=False,
+        )
+        .rename(lambda col: col.strip().lower().split("(")[0].strip())
+        .rename(
+            {
+                "statefp": "state_fips",
+                "old_countyfp": "county_fips_old",
+                "old_county_namelsad": "county_name_old",
+                "new_countyfp": "county_fips_new",
+                "new_county_namelsad": "county_name_new",
+                "cousubfp": "county_sub_fips",
+                "old_cousub_geoid": "county_sub_geoid_old",
+                "new_cousub_geoid": "county_sub_geoid_new",
+                "cousub_namelsad": "county_sub_name",
+            }
+        )
+        .filter(
+            # NOTE: 5 rows labeled with "County subdivisions not defined" but their GEOID/FIPS
+            # doesn't make sense anyway
+            pl.col("county_sub_fips") != "00000"
+        )
+        .with_columns(
+            (pl.col("state_fips") + pl.col(col)).alias(col)
+            for col in ["county_fips_old", "county_fips_new"]
+        )
+        .select(
+            "county_sub_fips",
+            "county_fips_new",
+            "county_name_new",
+            "county_fips_old",
+            "county_name_old",
+        )
+    )
+    assert crosswalk.select(pl.len()).collect().item() == num_county_subs
+
+    data_county_sub = get_dataset("crosswalk/county/sub-est2024_9.csv")
+    # NOTE: provides county subdivision population counts
+    county_sub = (
+        pl.scan_csv(
+            data_county_sub,
+            schema_overrides={
+                "SUMLEV": pl.String,
+                "STATE": pl.String,
+                "COUNTY": pl.String,
+                "COUSUB": pl.String,
+                "NAME": pl.String,
+                "POPESTIMATE2024": pl.Int64,
+            },
+        )
+        .rename(
+            {
+                "SUMLEV": "sumlev",
+                "COUSUB": "county_sub_fips",
+                "NAME": "county_sub_name",
+                "POPESTIMATE2024": "pop_2024",
+            }
+        )
+        .select(
+            "sumlev",
+            "county_sub_fips",
+            "county_sub_name",
+            "pop_2024",
+        )
+        .filter(
+            # 061 apparently considered part of 060 (county subdivisions/minor civil divisions)
+            # Use because it's one level below county (050)
+            pl.col("sumlev") == "061"
+        )
+        .drop("sumlev")
+        .sort("county_sub_fips")
+    )
+    assert county_sub.select(pl.len()).collect().item() == num_county_subs
+
+    lf = (
+        crosswalk.join(
+            county_sub,
+            on="county_sub_fips",
+            how="inner",
+            validate="1:1",
+        )
+        .select(
+            "county_sub_fips",
+            "county_sub_name",
+            "county_fips_new",
+            "county_name_new",
+            "county_fips_old",
+            "county_name_old",
+            "pop_2024",
+        )
+        .with_columns(
+            pop_old=pl.col("pop_2024").sum().over("county_fips_old"),
+            pop_new=pl.col("pop_2024").sum().over("county_fips_new"),
+        )
+        .group_by(
+            [
+                "county_fips_new",
+                "county_name_new",
+                "county_fips_old",
+                "county_name_old",
+                "pop_old",
+                "pop_new",
+            ]
+        )
+        .agg(
+            pop_intersect=pl.col(
+                "pop_2024"
+            ).sum()  # sum county subdivision population by new-county-old-county pairs
+        )
+        # NOTE: want weights to be expected prop. of intersection that is located in dest. FIPS
+        # Aka prop. of origin FIPS that is located in dest. FIPS
+        # Ex: wt_new_to_old should give expected prop. of new FIPS that is located in old FIPS.
+        .with_columns(
+            wt_new_to_old=pl.col("pop_intersect") / pl.col("pop_new"),
+            wt_old_to_new=pl.col("pop_intersect") / pl.col("pop_old"),
+        )
+        .select(
+            "county_fips_old",
+            "county_name_old",
+            "county_fips_new",
+            "county_name_new",
+            "wt_new_to_old",
+            "wt_old_to_new",
+        )
+        .sort("county_fips_old")
+    )
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py
new file mode 100644
index 0000000..2276817
--- /dev/null
+++ b/tests/crosswalk_test.py
@@ -0,0 +1,196 @@
+import pandera.polars as pa
+import polars as pl
+import pytest
+from pandas import DataFrame
+from pandera.polars import PolarsData
+
+from kintsugi.crosswalk import (
+    counties_CT,
+    county_to_zip,
+    puma_2010_2020,
+    puma_2010_county_2020,
+    zip_to_county,
+)
+
+from .models import BasePolarsModel
+
+
+class PUMAVersionCrosswalk(BasePolarsModel):
+    puma_geoid_2010: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    puma_geoid_2020: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_PUMA_2010_to_2020_MCDC: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
+    wt_PUMA_2020_to_2010_MCDC: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
+
+    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
+        unique: list[str] = ["puma_geoid_2010",
"puma_geoid_2020"] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.all_horizontal( + pl.col("puma_geoid_2010") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + pl.col("puma_geoid_2020") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + ).all() + ) + .collect() + .item() + is True + ) + + +def test_puma_2010_2020() -> None: + puma_2010_2020().collect().pipe(PUMAVersionCrosswalk.validate, lazy=True) + + +def test_puma_2010_2020_as_pandas() -> None: + df = puma_2010_2020(as_pandas=True) + + assert isinstance(df, DataFrame) + + +class PUMACountyCrosswalk(BasePolarsModel): + puma_geoid_2010: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + wt_PUMA_2010_to_county: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + wt_county_to_PUMA_2010: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["puma_geoid_2010", "county_fips"] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +def test_puma_2010_county_2020() -> None: + puma_2010_county_2020().collect().pipe(PUMACountyCrosswalk.validate, lazy=True) + + +def test_puma_2010_county_2020_as_pandas() -> None: + df = puma_2010_county_2020(as_pandas=True) + + assert isinstance(df, DataFrame) + + +class ZipCountyCrosswalk(BasePolarsModel): + zip_code: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + res_ratio: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + + class Config: # pyright: ignore 
[reportIncompatibleVariableOverride] + unique: list[str] = ["zip_code", "county_fips"] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.col("county_fips") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")) + .all() + ) + .collect() + .item() + is True + ) + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_county_to_zip(year: int) -> None: + county_to_zip(year).collect().pipe(ZipCountyCrosswalk.validate, lazy=True) + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_county_to_zip_as_pandas(year: int) -> None: + df = county_to_zip(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_county_to_zip_year_exception() -> None: + with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"): + county_to_zip(2010) + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_zip_to_county(year: int) -> None: + zip_to_county(year).collect().pipe(ZipCountyCrosswalk.validate, lazy=True) + + +@pytest.mark.parametrize( + ("year"), + range(2016, 2026), +) +def test_zip_to_county_as_pandas(year: int) -> None: + df = zip_to_county(year, as_pandas=True) + + assert isinstance(df, DataFrame) + + +def test_zip_to_county_year_exception() -> None: + with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"): + zip_to_county(2010) + + +class CountiesCT(BasePolarsModel): + county_fips_old: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name_old: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_fips_new: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + county_name_new: pl.String # pyright: ignore [reportUninitializedInstanceVariable] + wt_new_to_old: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + wt_old_to_new: pl.Float64 = pa.Field(ge=0, le=1.0) # pyright: ignore [reportAny] + + class 
Config: # pyright: ignore [reportIncompatibleVariableOverride] + unique: list[str] = ["county_fips_old", "county_fips_new"] + + @pa.dataframe_check + def has_correct_states(cls, data: PolarsData) -> bool: + return ( + data.lazyframe.select( + pl.all_horizontal( + pl.col("county_fips_old") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + pl.col("county_fips_new") + .str.slice(0, 2) + .is_between(pl.lit("01"), pl.lit("56")), + ).all() + ) + .collect() + .item() + is True + ) + + +def test_counties_CT() -> None: + counties_CT().collect().pipe(CountiesCT.validate, lazy=True) + + +def test_counties_CT_as_pandas() -> None: + df = counties_CT(as_pandas=True) + + assert isinstance(df, DataFrame) diff --git a/uv.lock b/uv.lock index df92410..b3323e7 100644 --- a/uv.lock +++ b/uv.lock @@ -200,7 +200,7 @@ wheels = [ [[package]] name = "kintsugi" -version = "0.8.0" +version = "0.10.0" source = { editable = "." } dependencies = [ { name = "pandas" },