From f99304e39af7cf8e3e11ceda645f519f68437711 Mon Sep 17 00:00:00 2001
From: winter-again <63322884+winter-again@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:55:10 -0400
Subject: [PATCH 1/5] zip-to-county and county-to-zip data

---
 src/kintsugi/_data.py     |  20 ++
 src/kintsugi/crosswalk.py | 430 ++++++++++++++++++++++++++++++++++++++
 tests/crosswalk_test.py   |  60 ++++++
 3 files changed, 510 insertions(+)
 create mode 100644 src/kintsugi/crosswalk.py
 create mode 100644 tests/crosswalk_test.py

diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py
index 69d334e..21ccc09 100644
--- a/src/kintsugi/_data.py
+++ b/src/kintsugi/_data.py
@@ -18,6 +18,26 @@
     "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843",
     "county_neighbors/county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816",
     "county_neighbors/county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575",
+    "crosswalk/county_to_zip/county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad",
+    "crosswalk/county_to_zip/county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b",
+    "crosswalk/county_to_zip/county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba",
+    "crosswalk/county_to_zip/county_to_zip_2019.parquet": "3a06ca1a4d21638a4d014cbb4bc29b7d6b254a937dab1536058152d226073d62",
+    "crosswalk/county_to_zip/county_to_zip_2020.parquet": "06cb2bfa55085d9c5f0f652628a61d1bf64fe14ea190df5bdc361844f5204250",
+    "crosswalk/county_to_zip/county_to_zip_2021.parquet": "fdd59a6b8ab06a534d1c9c4765101105ce8bc08d9685eb9441c58701bffd96ae",
+    "crosswalk/county_to_zip/county_to_zip_2022.parquet": "c3875e66bff03ff71ad73b89277fc892a294a5d71be08b4eb4ac2c431d715b83",
+    "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a",
+    "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0",
+    "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239",
+    "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1",
+    "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb",
+    "crosswalk/zip_to_county/zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942",
+    "crosswalk/zip_to_county/zip_to_county_2019.parquet": "78ac9cdda7c5e00b82315ceaece4f3722da905e2ec4f68774a135d327c257435",
+    "crosswalk/zip_to_county/zip_to_county_2020.parquet": "17f518b2a6de5ee58b134c0dc5fbf10f6d48c45b14fdbfcfe58a4269e08eaee2",
+    "crosswalk/zip_to_county/zip_to_county_2021.parquet": "655c6ce8b257e942ac1c7dfa068d93ad2a1f139c9cd0c110efbce752ab27cea4",
+    "crosswalk/zip_to_county/zip_to_county_2022.parquet": "9f125642aeb5aba3c8f2bdf616c186f975fc6afc1b5543d55204ad9b2a10b252",
+    "crosswalk/zip_to_county/zip_to_county_2023.parquet": "89f50d28acb226a5777ededc1f8301dc1f028c65dd04b860355d23efe24b89c5",
+    "crosswalk/zip_to_county/zip_to_county_2024.parquet": "8d76e09fa52923c0901c0afad26ba2d32075210bb0ab5b82caa14ce5465b3d80",
+    "crosswalk/zip_to_county/zip_to_county_2025.parquet": "2683272f9759646d0a6234de06076f274852b85fcd2fd9e3d2dfcb3aa10def80",
     "geo/cb_2020_us_county_5m.zip": "187e7118304428e5450083beb375e67c2c516c58a01ce52db95aaf24f18df3ba",
     "geo/cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3",
     "geo/cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b",
diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py
new file mode 100644
index 0000000..8586209
--- /dev/null
+++ b/src/kintsugi/crosswalk.py
@@ -0,0 +1,430 @@
+from typing import Literal, overload
+
+import pandas as pd
+import polars as pl
+
+from ._data import get_dataset
+
+num_county_subs = 169
+
+
+# @overload
+# def crosswalk_CT_counties(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+#
+#
+# @overload
+# def crosswalk_CT_counties(as_pandas: Literal[True]) -> pd.DataFrame: ...
+#
+#
+# def crosswalk_CT_counties(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+#     """
+#     Crosswalk CT counties between pre and post-2022 changes. Weights calculated based on county subdivision populations.
+#
+#     See:
+#         - FIPS code changes: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.html
+#         - CT specific: https://www2.census.gov/geo/pdfs/reference/ct_county_equiv_change.pdf
+#         - CT specific: https://www.federalregister.gov/documents/2022/06/06/2022-12063/change-to-county-equivalents-in-the-state-of-connecticut
+#
+#     Crosswalk: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.January_2020.html
+#     Source: https://www2.census.gov/geo/docs/reference/ct_change/ct_cou_to_cousub_crosswalk.txt
+#
+#     CT county subdivision populations source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html
+#     FTP: https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/cities/totals/sub-est2024_9.csv
+#
+#     """
+#     crosswalk = (
+#         pl.scan_csv(
+#             CROSSWALK_DATA / "county/ct_cou_to_cousub_crosswalk.txt",
+#             separator="|",
+#             n_rows=174,
+#             infer_schema=False,
+#         )
+#         .select(
+#             "STATEFP\n(INCITS38)",
+#             "OLD_COUNTYFP\n(INCITS31)",
+#             "OLD_COUNTY_NAMELSAD",
+#             "NEW_COUNTYFP\n(INCITS31)",
+#             "NEW_COUNTY_NAMELSAD",
+#             "COUSUBFP",
+#             "OLD_COUSUB_GEOID",
+#             "NEW_COUSUB_GEOID",
+#             "COUSUB_NAMELSAD",
+#         )
+#         .rename(
+#             {
+#                 "STATEFP\n(INCITS38)": "state_fips",
+#                 "OLD_COUNTYFP\n(INCITS31)": "county_fips_old",
+#                 "OLD_COUNTY_NAMELSAD": "county_name_old",
+#                 "NEW_COUNTYFP\n(INCITS31)": "county_fips_new",
+#                 "NEW_COUNTY_NAMELSAD": "county_name_new",
+#                 "COUSUBFP": "county_sub_fips",
+#                 "OLD_COUSUB_GEOID": "county_sub_geoid_old",
+#                 "NEW_COUSUB_GEOID": "county_sub_geoid_new",
+#                 "COUSUB_NAMELSAD": "county_sub_name",
+#             }
+#         )
+#         .filter(
+#             # NOTE: 5 rows labeled with "County subdivisions not defined" but their GEOID/FIPS
+#             # doesn't make sense anyway
+#             pl.col("county_sub_fips") != "00000"
+#         )
+#         .with_columns(
+#             (pl.col("state_fips") + pl.col(col)).alias(col)
+#             for col in ["county_fips_old", "county_fips_new"]
+#         )
+#         .drop("state_fips")
+#     )
+#     assert crosswalk.select(pl.len()).collect().item() == num_county_subs
+#
+#     subcounty = (
+#         pl.scan_csv(
+#             CROSSWALK_DATA / "county/sub-est2024_9.csv",
+#             schema_overrides={
+#                 "SUMLEV": pl.String,
+#                 "STATE": pl.String,
+#                 "COUNTY": pl.String,
+#                 "COUSUB": pl.String,
+#                 "NAME": pl.String,
+#                 "POPESTIMATE2024": pl.Int64,
+#             },
+#         )
+#         .select(
+#             "SUMLEV",
+#             "STATE",
+#             "COUNTY",
+#             "COUSUB",
+#             "NAME",
+#             "POPESTIMATE2024",
+#         )
+#         .rename(
+#             {
+#                 "SUMLEV": "sumlev",
+#                 "STATE": "state_fips",
+#                 "COUNTY": "county_fips",
+#                 "COUSUB": "county_sub_fips",
+#                 "NAME": "county_sub_name",
+#                 "POPESTIMATE2024": "pop_2024",
+#             }
+#         )
+#         .filter(
+#             # only minor civil divisions
+#             pl.col("sumlev") == "061"
+#         )
+#         .with_columns(county_fips=pl.col("state_fips") + pl.col("county_fips"))
+#         .drop("state_fips", "county_fips", "sumlev")
+#         .sort("county_sub_fips")
+#     )
+#     assert subcounty.select(pl.len()).collect().item() == num_county_subs
+#
+#     lf = (
+#         crosswalk.join(
+#             subcounty,
+#             on="county_sub_fips",
+#             how="inner",
+#             validate="1:1",
+#         )
+#         .select(
+#             "county_sub_fips",
+#             "county_sub_name",
+#             "county_fips_new",
+#             "county_name_new",
+#             "county_fips_old",
+#             "county_name_old",
+#             "pop_2024",
+#         )
+#         .with_columns(
+#             pop_old=pl.col("pop_2024").sum().over("county_fips_old"),
+#             pop_new=pl.col("pop_2024").sum().over("county_fips_new"),
+#         )
+#         .group_by(
+#             [
+#                 "county_fips_new",
+#                 "county_name_new",
+#                 "county_fips_old",
+#                 "county_name_old",
+#                 "pop_old",
+#                 "pop_new",
+#             ]
+#         )
+#         .agg(pop_agg=pl.col("pop_2024").sum())  # agg county sub to county pairs
+#         # NOTE: want weights to be expected prop. of origin FIPS that are located in dest. FIPS
+#         # Aka prop. of origin FIPS that is located in dest. FIPS
+#         # Ex: wt_new_to_old should give expected prop. of new FIPS that is located in old FIPS.
+#         .with_columns(
+#             wt_new_to_old=pl.col("pop_agg") / pl.col("pop_new"),
+#             wt_old_to_new=pl.col("pop_agg") / pl.col("pop_old"),
+#         )
+#         .select(
+#             "county_fips_old",
+#             "county_name_old",
+#             "county_fips_new",
+#             "county_name_new",
+#             "wt_new_to_old",
+#             "wt_old_to_new",
+#         )
+#         .sort("county_fips_old")
+#     )
+#     assert (
+#         lf.select(pl.len()).collect().item()
+#         == lf.unique(["county_fips_old", "county_fips_new"])
+#         .select(pl.len())
+#         .collect()
+#         .item()
+#     )
+#
+#     if as_pandas:
+#         return lf.collect().to_pandas()
+#
+#     return lf
+
+
+# @overload
+# def crosswalk_puma_versions() -> pl.LazyFrame: ...
+#
+#
+# @overload
+# def crosswalk_puma_versions(as_pandas: Literal[False]) -> pl.LazyFrame: ...
+#
+#
+# @overload
+# def crosswalk_puma_versions(as_pandas: Literal[True]) -> pd.DataFrame: ...
+#
+#
+# def crosswalk_puma_versions(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+#     """
+#     Crosswalk data between 2010 (effective 2012) and 2020 (effective 2022) PUMAs.
+#
+#     Sourced from Missouri Census Data Center (MCDC). Alternative source exists from
+#     Integrated Public Use Microdata Series (IPUMS).
+#
+#     - Source (MCDC): https://mcdc.missouri.edu/geography/PUMAs.html
+#         - See: https://mcdc.missouri.edu/data/corrlst/puma2010-to-puma2020.csv
+#     - Source (IPUMS): https://usa.ipums.org/usa/volii/pumas20.shtml
+#         - See: https://usa.ipums.org/usa/resources/volii/PUMA2010_PUMA2020_crosswalk.xls
+#     """
+#     lf = (
+#         pl.scan_csv(
+#             CROSSWALK_DATA / "PUMA/puma2010-to-puma2020.csv",
+#             skip_rows_after_header=1,
+#             schema_overrides={
+#                 "state": pl.String,
+#                 "puma12": pl.String,
+#                 "puma22": pl.String,
+#                 "afact": pl.Float64,
+#                 "AFACT2": pl.Float64,
+#             },
+#         )
+#         .select(
+#             "state",
+#             "puma22",
+#             "puma12",
+#             "afact",  # portion of earlier PUMA's population living in later PUMA
+#             "AFACT2",  # portion of later PUMA's population living in the earlier PUMA
+#         )
+#         .rename(
+#             {
+#                 "afact": "wt_PUMA_2010_to_2020_MCDC",
+#                 "AFACT2": "wt_PUMA_2020_to_2010_MCDC",
+#             }
+#         )
+#         .with_columns(
+#             puma_geoid_2020=pl.col("state").str.zfill(2)
+#             + pl.col("puma22").str.zfill(5),
+#             puma_geoid_2010=pl.col("state").str.zfill(2)
+#             + pl.col("puma12").str.zfill(5),
+#         )
+#         .select(
+#             "puma_geoid_2010",
+#             "puma_geoid_2020",
+#             "wt_PUMA_2010_to_2020_MCDC",
+#             "wt_PUMA_2020_to_2010_MCDC",
+#         )
+#         .filter(
+#             pl.col("puma_geoid_2010")
+#             .str.slice(0, 2)
+#             .is_between(pl.lit("01"), pl.lit("56")),
+#             pl.col("puma_geoid_2020")
+#             .str.slice(0, 2)
+#             .is_between(pl.lit("01"), pl.lit("56")),
+#             # pl.col("wt_PUMA_2020_to_2010") != 0.0,
+#         )
+#         .sort("puma_geoid_2010", "puma_geoid_2020")
+#     )
+#
+#     # lf_IPUMS = (
+#     #     pl.read_excel(
+#     #         PUMS_DATA / "PUMA2010_PUMA2020_crosswalk.xls",
+#     #         # NOTE: pPUMA20_Pop20 = "Estimated percent of the 2020 PUMA's 2020 population that lies in the area of intersection"
+#     #         columns=[
+#     #             "GEOID10",
+#     #             "GEOID20",
+#     #             "pPUMA20_Pop20",
+#     #             "pPUMA10_Pop20",
+#     #         ],
+#     #     )
+#     #     .lazy()
+#     #     .rename(
+#     #         {
+#     #             "GEOID20": "puma_geoid_2020",
+#     #             "GEOID10": "puma_geoid_2010",
+#     #             "pPUMA20_Pop20": "wt_PUMA_2020_to_2010",
+#     #             "pPUMA10_Pop20": "wt_PUMA_2010_to_2020",
+#     #         }
+#     #     )
+#     #     .with_columns(
+#     #         (pl.col(col) / 100.0).alias(col)
+#     #         for col in ["wt_PUMA_2020_to_2010", "wt_PUMA_2010_to_2020"]
+#     #     )
+#     #     .select(
+#     #         "puma_geoid_2020",
+#     #         "puma_geoid_2010",
+#     #         "wt_PUMA_2020_to_2010",
+#     #         "wt_PUMA_2010_to_2020",
+#     #     )
+#     #     .sort("puma_geoid_2020", "puma_geoid_2010")
+#     # )
+#
+#     if as_pandas:
+#         return lf.collect().to_pandas()
+#
+#     return lf
+
+
+# @overload
+# def crosswalk_puma_2010_county_2020() -> pl.LazyFrame: ...
+#
+#
+# @overload
+# def crosswalk_puma_2010_county_2020(as_pandas: Literal[False]) -> pl.LazyFrame: ...
+#
+#
+# @overload
+# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ...
+#
+#
+# def crosswalk_puma_2010_county_2020(
+#     as_pandas: bool = False,
+# ) -> pl.LazyFrame | pd.DataFrame:
+#     """
+#     Crosswalk data between 2010 PUMAs and 2020 counties.
+#
+#     Note: uses new CT counties
+#
+#     Source: MCDC 2022 Geocorr
+#         - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname=
+#     """
+#     lf = (
+#         pl.read_csv(
+#             CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv",
+#             encoding="iso-8859-1",
+#             skip_rows_after_header=1,
+#             columns=[
+#                 "state",
+#                 "puma12",
+#                 "county",
+#                 "afact",
+#                 "afact2",
+#             ],
+#             schema_overrides={
+#                 "state": pl.String,
+#                 "puma12": pl.String,
+#                 "county": pl.String,
+#                 "afact": pl.String,
+#                 "afact2": pl.String,
+#             },
+#         )
+#         .lazy()
+#         .rename({"county": "county_fips"})
+#         .with_columns(
+#             puma_geoid=pl.col("state") + pl.col("puma12"),
+#             wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64),
+#             wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64),
+#         )
+#         .filter(
+#             pl.col("state").is_between(pl.lit("01"), pl.lit("56")),
+#             # pl.col("wt_PUMA_2010_to_county") != 0
+#         )
+#         .select(
+#             "puma_geoid",
+#             "county_fips",
+#             "wt_PUMA_2010_to_county",
+#             "wt_county_to_PUMA_2010",
+#         )
+#         .sort("puma_geoid", "county_fips")
+#     )
+#
+#     if as_pandas:
+#         return lf.collect().to_pandas()
+#
+#     return lf
+
+
+# TODO: need to settle on some convention for the files:
+# e.g., always use Q4 for each year? Q3 used because that was latest at the time when I pulled data
+# Available on a quarterly release cycle from HUD:
+# - 2010-Q1 - 2011-Q4 data use 2000 Census geographies
+# - 2012-Q1 - 2022-Q4 data use 2010 Census geographies
+# - 2023-Q1 - present data use 2020 Census geographies
+# TODO: note that I can only find one instance of using old version of this function
+# in other proj. Although it used zip-to-county file instead of county-to-zip file,
+# the weights weren't even used (only used to map zip to county)
+
+
+@overload
+def county_to_zip(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+
+
+@overload
+def county_to_zip(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def county_to_zip(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+    """
+    County-to-zip residential ratio weights.
+
+    Use these weights to crosswalk zip-to-county via a weighted mean.
+    2012-2022 data use 2010 Census geographies. 2023-present data use
+    2020 Census geographies. All years use quarter 4 data.
+
+    Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html
+    """
+    if not (2016 <= year <= 2025):
+        raise ValueError("Must choose a year between 2016 and 2025")
+
+    data = get_dataset(f"crosswalk/county_to_zip/county_to_zip_{year}.parquet")
+    lf = pl.scan_parquet(data)
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
+
+
+@overload
+def zip_to_county(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+
+
+@overload
+def zip_to_county(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+    """
+    Zip-to-county residential ratio weights.
+
+    Use these weights to crosswalk counts data from zip-to-county
+    2012-2022 data use 2010 Census geographies. 2023-present data use
+    2020 Census geographies. All years use quarter 4 data.
+
+    Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html
+    """
+    if not (2016 <= year <= 2025):
+        raise ValueError("Must choose a year between 2016 and 2025")
+
+    data = get_dataset(f"crosswalk/zip_to_county/zip_to_county_{year}.parquet")
+    lf = pl.scan_parquet(data)
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py
new file mode 100644
index 0000000..4602438
--- /dev/null
+++ b/tests/crosswalk_test.py
@@ -0,0 +1,60 @@
+import pandera.polars as pa
+import polars as pl
+import pytest
+from pandas import DataFrame
+
+from kintsugi.crosswalk import county_to_zip, zip_to_county
+
+from .models import BasePolarsModel
+
+
+class ZipCountyCrosswalk(BasePolarsModel):
+    zip_code: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    res_ratio: pl.Float64 = pa.Field(ge=0)  # pyright: ignore [reportAny]
+
+
+@pytest.mark.parametrize(
+    ("year"),
+    range(2016, 2026),
+)
+def test_county_to_zip(year: int) -> None:
+    county_to_zip(year).collect().pipe(ZipCountyCrosswalk.validate, lazy=True)
+
+
+@pytest.mark.parametrize(
+    ("year"),
+    range(2016, 2026),
+)
+def test_county_to_zip_as_pandas(year: int) -> None:
+    df = county_to_zip(year, as_pandas=True)
+
+    assert isinstance(df, DataFrame)
+
+
+def test_county_to_zip_year_exception() -> None:
+    with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"):
+        county_to_zip(2010)
+
+
+@pytest.mark.parametrize(
+    ("year"),
+    range(2016, 2026),
+)
+def test_zip_to_county(year: int) -> None:
+    zip_to_county(year).collect().pipe(ZipCountyCrosswalk.validate, lazy=True)
+
+
+@pytest.mark.parametrize(
+    ("year"),
+    range(2016, 2026),
+)
+def test_zip_to_county_as_pandas(year: int) -> None:
+    df = zip_to_county(year, as_pandas=True)
+
+    assert isinstance(df, DataFrame)
+
+
+def test_zip_to_county_year_exception() -> None:
+    with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"):
+        zip_to_county(2010)

From f797f21d98afba8b97191198c445b286460eb1a9 Mon Sep 17 00:00:00 2001
From: winter-again <63322884+winter-again@users.noreply.github.com>
Date: Thu, 12 Mar 2026 17:40:05 -0400
Subject: [PATCH 2/5] PUMA version crosswalk

---
 pyproject.toml            |   2 +-
 src/kintsugi/_data.py     |   3 +-
 src/kintsugi/crosswalk.py | 489 ++++++++++++++++++--------------------
 tests/crosswalk_test.py   |  25 +-
 uv.lock                   |   2 +-
 5 files changed, 264 insertions(+), 257 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bd65ad4..b2c378d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "kintsugi"
-version = "0.8.0"
+version = "0.9.0"
 description = "Commonly used datasets and functions"
 readme = "README.md"
 authors = [
diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py
index 21ccc09..7868053 100644
--- a/src/kintsugi/_data.py
+++ b/src/kintsugi/_data.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger("kintsugi")
 logger.addHandler(logging.NullHandler())
 
-BASE_URL = "https://raw.githubusercontent.com/winter-again/kintsugi-data/main/data"
+BASE_URL = "https://raw.githubusercontent.com/bansallab/kintsugi-data/main/data"
 DATASETS = {
     "county_neighbors/county_adjacency2010.txt": "7edda309ad38a4dfc6a6c6c30e1753e5490b9c8a3aa4563188841989b4fe9a96",
     "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843",
@@ -28,6 +28,7 @@
     "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a",
     "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0",
     "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239",
+    "crosswalk/PUMA/puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666",
     "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1",
     "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb",
     "crosswalk/zip_to_county/zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942",
diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py
index 8586209..88106c7 100644
--- a/src/kintsugi/crosswalk.py
+++ b/src/kintsugi/crosswalk.py
@@ -5,7 +5,242 @@
 
 from ._data import get_dataset
 
-num_county_subs = 169
+# num_county_subs = 169
+
+
+@overload
+def puma_2010_2020(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+
+
+@overload
+def puma_2010_2020(as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def puma_2010_2020(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+    """
+    Crosswalk data between 2010 (effective 2012) and 2020 (effective 2022) PUMAs.
+
+    `wt_PUMA_2010_to_2020_MCDC` describes the proportion of the 2010 PUMA's population
+    that lives in the 2020 PUMA. Similarly, `wt_PUMA_2020_to_2010_MCDC` describes the
+    proportion of the 2020 PUMA's population that lives in the 2010 PUMA.
+
+    Sourced from Missouri Census Data Center (MCDC), keeping state FIPS 01-56 only.
+    An alternative source is Integrated Public Use Microdata Series (IPUMS).
+
+    - Source (MCDC): https://mcdc.missouri.edu/geography/PUMAs.html
+        - See: https://mcdc.missouri.edu/data/corrlst/puma2010-to-puma2020.csv
+    - Source (IPUMS): https://usa.ipums.org/usa/volii/pumas20.shtml
+        - See: https://usa.ipums.org/usa/resources/volii/PUMA2010_PUMA2020_crosswalk.xls
+    """
+    data = get_dataset("crosswalk/PUMA/puma2010-to-puma2020.csv")
+    lf = (
+        pl.scan_csv(
+            data,
+            skip_rows_after_header=1,
+            schema_overrides={
+                "state": pl.String,
+                "puma12": pl.String,
+                "puma22": pl.String,
+                "afact": pl.Float64,
+                "AFACT2": pl.Float64,
+            },
+        )
+        .select(
+            "state",
+            "puma22",
+            "puma12",
+            "afact",  # portion of earlier PUMA's population living in later PUMA
+            "AFACT2",  # portion of later PUMA's population living in the earlier PUMA
+        )
+        .rename(
+            {
+                "afact": "wt_PUMA_2010_to_2020_MCDC",
+                "AFACT2": "wt_PUMA_2020_to_2010_MCDC",
+            }
+        )
+        .with_columns(
+            (pl.col(col).str.zfill(5).alias(col) for col in ["puma12", "puma22"]),
+            state=pl.col("state").str.zfill(2),
+        )
+        .filter(pl.col("state").is_between(pl.lit("01"), pl.lit("56")))
+        .with_columns(
+            puma_geoid_2020=pl.col("state") + pl.col("puma22"),
+            puma_geoid_2010=pl.col("state") + pl.col("puma12"),
+        )
+        .select(
+            "puma_geoid_2010",
+            "puma_geoid_2020",
+            "wt_PUMA_2010_to_2020_MCDC",
+            "wt_PUMA_2020_to_2010_MCDC",
+        )
+        .sort("puma_geoid_2010", "puma_geoid_2020")
+    )
+
+    # NOTE: implementation using alternate data source
+    # lf_IPUMS = (
+    #     pl.read_excel(
+    #         PUMS_DATA / "PUMA2010_PUMA2020_crosswalk.xls",
+    #         # NOTE: pPUMA20_Pop20 = "Estimated percent of the 2020 PUMA's 2020 population that lies in the area of intersection"
+    #         columns=[
+    #             "GEOID10",
+    #             "GEOID20",
+    #             "pPUMA20_Pop20",
+    #             "pPUMA10_Pop20",
+    #         ],
+    #     )
+    #     .lazy()
+    #     .rename(
+    #         {
+    #             "GEOID20": "puma_geoid_2020",
+    #             "GEOID10": "puma_geoid_2010",
+    #             "pPUMA20_Pop20": "wt_PUMA_2020_to_2010",
+    #             "pPUMA10_Pop20": "wt_PUMA_2010_to_2020",
+    #         }
+    #     )
+    #     .with_columns(
+    #         (pl.col(col) / 100.0).alias(col)
+    #         for col in ["wt_PUMA_2020_to_2010", "wt_PUMA_2010_to_2020"]
+    #     )
+    #     .select(
+    #         "puma_geoid_2020",
+    #         "puma_geoid_2010",
+    #         "wt_PUMA_2020_to_2010",
+    #         "wt_PUMA_2010_to_2020",
+    #     )
+    #     .sort("puma_geoid_2020", "puma_geoid_2010")
+    # )
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
+
+
+# @overload
+# def crosswalk_puma_2010_county_2020(
+#     as_pandas: Literal[False] = ...,
+# ) -> pl.LazyFrame: ...
+#
+#
+# @overload
+# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ...
+#
+#
+# def crosswalk_puma_2010_county_2020(
+#     as_pandas: bool = False,
+# ) -> pl.LazyFrame | pd.DataFrame:
+#     """
+#     Crosswalk data between 2010 PUMAs and 2020 counties.
+#
+#     Note: uses new CT counties
+#
+#     Source: MCDC 2022 Geocorr
+#         - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname=
+#     """
+#     lf = (
+#         pl.read_csv(
+#             CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv",
+#             encoding="iso-8859-1",
+#             skip_rows_after_header=1,
+#             columns=[
+#                 "state",
+#                 "puma12",
+#                 "county",
+#                 "afact",
+#                 "afact2",
+#             ],
+#             schema_overrides={
+#                 "state": pl.String,
+#                 "puma12": pl.String,
+#                 "county": pl.String,
+#                 "afact": pl.String,
+#                 "afact2": pl.String,
+#             },
+#         )
+#         .lazy()
+#         .rename({"county": "county_fips"})
+#         .with_columns(
+#             puma_geoid=pl.col("state") + pl.col("puma12"),
+#             wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64),
+#             wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64),
+#         )
+#         .filter(
+#             pl.col("state").is_between(pl.lit("01"), pl.lit("56")),
+#             # pl.col("wt_PUMA_2010_to_county") != 0
+#         )
+#         .select(
+#             "puma_geoid",
+#             "county_fips",
+#             "wt_PUMA_2010_to_county",
+#             "wt_county_to_PUMA_2010",
+#         )
+#         .sort("puma_geoid", "county_fips")
+#     )
+#
+#     if as_pandas:
+#         return lf.collect().to_pandas()
+#
+#     return lf
+
+
+@overload
+def county_to_zip(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+
+
+@overload
+def county_to_zip(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def county_to_zip(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+    """
+    County-to-zip residential ratio weights for one year in 2016-2025.
+
+    Use these weights to crosswalk zip-to-county via a weighted mean.
+    2012-2022 data use 2010 Census geographies. 2023-present data use
+    2020 Census geographies. All years use quarter 4 data.
+
+    Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html
+    """
+    if not (2016 <= year <= 2025):
+        raise ValueError("Must choose a year between 2016 and 2025")
+
+    data = get_dataset(f"crosswalk/county_to_zip/county_to_zip_{year}.parquet")
+    lf = pl.scan_parquet(data)
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
+
+
+@overload
+def zip_to_county(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+
+
+@overload
+def zip_to_county(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+    """
+    Zip-to-county residential ratio weights for one year in 2016-2025.
+
+    Use these weights to crosswalk counts data from zip-to-county.
+    2012-2022 data use 2010 Census geographies. 2023-present data use
+    2020 Census geographies. All years use quarter 4 data.
+
+    Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html
+    """
+    if not (2016 <= year <= 2025):
+        raise ValueError("Must choose a year between 2016 and 2025")
+
+    data = get_dataset(f"crosswalk/zip_to_county/zip_to_county_{year}.parquet")
+    lf = pl.scan_parquet(data)
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
 
 
 # @overload
@@ -176,255 +411,3 @@
 #         return lf.collect().to_pandas()
 #
 #     return lf
-
-
-# @overload
-# def crosswalk_puma_versions() -> pl.LazyFrame: ...
-#
-#
-# @overload
-# def crosswalk_puma_versions(as_pandas: Literal[False]) -> pl.LazyFrame: ...
-#
-#
-# @overload
-# def crosswalk_puma_versions(as_pandas: Literal[True]) -> pd.DataFrame: ...
-#
-#
-# def crosswalk_puma_versions(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
-#     """
-#     Crosswalk data between 2010 (effective 2012) and 2020 (effective 2022) PUMAs.
-#
-#     Sourced from Missouri Census Data Center (MCDC). Alternative source exists from
-#     Integrated Public Use Microdata Series (IPUMS).
-#
-#     - Source (MCDC): https://mcdc.missouri.edu/geography/PUMAs.html
-#         - See: https://mcdc.missouri.edu/data/corrlst/puma2010-to-puma2020.csv
-#     - Source (IPUMS): https://usa.ipums.org/usa/volii/pumas20.shtml
-#         - See: https://usa.ipums.org/usa/resources/volii/PUMA2010_PUMA2020_crosswalk.xls
-#     """
-#     lf = (
-#         pl.scan_csv(
-#             CROSSWALK_DATA / "PUMA/puma2010-to-puma2020.csv",
-#             skip_rows_after_header=1,
-#             schema_overrides={
-#                 "state": pl.String,
-#                 "puma12": pl.String,
-#                 "puma22": pl.String,
-#                 "afact": pl.Float64,
-#                 "AFACT2": pl.Float64,
-#             },
-#         )
-#         .select(
-#             "state",
-#             "puma22",
-#             "puma12",
-#             "afact",  # portion of earlier PUMA's population living in later PUMA
-#             "AFACT2",  # portion of later PUMA's population living in the earlier PUMA
-#         )
-#         .rename(
-#             {
-#                 "afact": "wt_PUMA_2010_to_2020_MCDC",
-#                 "AFACT2": "wt_PUMA_2020_to_2010_MCDC",
-#             }
-#         )
-#         .with_columns(
-#             puma_geoid_2020=pl.col("state").str.zfill(2)
-#             + pl.col("puma22").str.zfill(5),
-#             puma_geoid_2010=pl.col("state").str.zfill(2)
-#             + pl.col("puma12").str.zfill(5),
-#         )
-#         .select(
-#             "puma_geoid_2010",
-#             "puma_geoid_2020",
-#             "wt_PUMA_2010_to_2020_MCDC",
-#             "wt_PUMA_2020_to_2010_MCDC",
-#         )
-#         .filter(
-#             pl.col("puma_geoid_2010")
-#             .str.slice(0, 2)
-#             .is_between(pl.lit("01"), pl.lit("56")),
-#             pl.col("puma_geoid_2020")
-#             .str.slice(0, 2)
-#             .is_between(pl.lit("01"), pl.lit("56")),
-#             # pl.col("wt_PUMA_2020_to_2010") != 0.0,
-#         )
-#         .sort("puma_geoid_2010", "puma_geoid_2020")
-#     )
-#
-#     # lf_IPUMS = (
-#     #     pl.read_excel(
-#     #         PUMS_DATA / "PUMA2010_PUMA2020_crosswalk.xls",
-#     #         # NOTE: pPUMA20_Pop20 = "Estimated percent of the 2020 PUMA's 2020 population that lies in the area of intersection"
-#     #         columns=[
-#     #             "GEOID10",
-#     #             "GEOID20",
-#     #             "pPUMA20_Pop20",
-#     #             "pPUMA10_Pop20",
-#     #         ],
-#     #     )
-#     #     .lazy()
-#     #     .rename(
-#     #         {
-#     #             "GEOID20": "puma_geoid_2020",
-#     #             "GEOID10": "puma_geoid_2010",
-#     #             "pPUMA20_Pop20": "wt_PUMA_2020_to_2010",
-#     #             "pPUMA10_Pop20": "wt_PUMA_2010_to_2020",
-#     #         }
-#     #     )
-#     #     .with_columns(
-#     #         (pl.col(col) / 100.0).alias(col)
-#     #         for col in ["wt_PUMA_2020_to_2010", "wt_PUMA_2010_to_2020"]
-#     #     )
-#     #     .select(
-#     #         "puma_geoid_2020",
-#     #         "puma_geoid_2010",
-#     #         "wt_PUMA_2020_to_2010",
-#     #         "wt_PUMA_2010_to_2020",
-#     #     )
-#     #     .sort("puma_geoid_2020", "puma_geoid_2010")
-#     # )
-#
-#     if as_pandas:
-#         return lf.collect().to_pandas()
-#
-#     return lf
-
-
-# @overload
-# def crosswalk_puma_2010_county_2020() -> pl.LazyFrame: ...
-#
-#
-# @overload
-# def crosswalk_puma_2010_county_2020(as_pandas: Literal[False]) -> pl.LazyFrame: ...
-#
-#
-# @overload
-# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ...
-#
-#
-# def crosswalk_puma_2010_county_2020(
-#     as_pandas: bool = False,
-# ) -> pl.LazyFrame | pd.DataFrame:
-#     """
-#     Crosswalk data between 2010 PUMAs and 2020 counties.
-#
-#     Note: uses new CT counties
-#
-#     Source: MCDC 2022 Geocorr
-#         - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname=
-#     """
-#     lf = (
-#         pl.read_csv(
-#             CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv",
-#             encoding="iso-8859-1",
-#             skip_rows_after_header=1,
-#             columns=[
-#                 "state",
-#                 "puma12",
-#                 "county",
-#                 "afact",
-#                 "afact2",
-#             ],
-#             schema_overrides={
-#                 "state": pl.String,
-#                 "puma12": pl.String,
-#                 "county": pl.String,
-#                 "afact": pl.String,
-#                 "afact2": pl.String,
-#             },
-#         )
-#         .lazy()
-#         .rename({"county": "county_fips"})
-#         .with_columns(
-#             puma_geoid=pl.col("state") + pl.col("puma12"),
-#             wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64),
-#             wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64),
-#         )
-#         .filter(
-#             pl.col("state").is_between(pl.lit("01"), pl.lit("56")),
-#             # pl.col("wt_PUMA_2010_to_county") != 0
-#         )
-#         .select(
-#             "puma_geoid",
-#             "county_fips",
-#             "wt_PUMA_2010_to_county",
-#             "wt_county_to_PUMA_2010",
-#         )
-#         .sort("puma_geoid", "county_fips")
-#     )
-#
-#     if as_pandas:
-#         return lf.collect().to_pandas()
-#
-#     return lf
-
-
-# TODO: need to settle on some convention for the files:
-# e.g., always use Q4 for each year? Q3 used because that was latest at the time when I pulled data
-# Available on a quarterly release cycle from HUD:
-# - 2010-Q1 - 2011-Q4 data use 2000 Census geographies
-# - 2012-Q1 - 2022-Q4 data use 2010 Census geographies
-# - 2023-Q1 - present data use 2020 Census geographies
-# TODO: note that I can only find one instance of using old version of this function
-# in other proj. Although it used zip-to-county file instead of county-to-zip file,
-# the weights weren't even used (only used to map zip to county)
-
-
-@overload
-def county_to_zip(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
-
-
-@overload
-def county_to_zip(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ...
-
-
-def county_to_zip(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
-    """
-    County-to-zip residential ratio weights.
-
-    Use these weights to crosswalk zip-to-county via a weighted mean.
-    2012-2022 data use 2010 Census geographies. 2023-present data use
-    2020 Census geographies. All years use quarter 4 data.
-
-    Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html
-    """
-    if not (2016 <= year <= 2025):
-        raise ValueError("Must choose a year between 2016 and 2025")
-
-    data = get_dataset(f"crosswalk/county_to_zip/county_to_zip_{year}.parquet")
-    lf = pl.scan_parquet(data)
-
-    if as_pandas:
-        return lf.collect().to_pandas()
-
-    return lf
-
-
-@overload
-def zip_to_county(year: int, as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
-
-
-@overload
-def zip_to_county(year: int, as_pandas: Literal[True]) -> pd.DataFrame: ...
-
-
-def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
-    """
-    Zip-to-county residential ratio weights.
-
-    Use these weights to crosswalk counts data from zip-to-county
-    2012-2022 data use 2010 Census geographies. 2023-present data use
-    2020 Census geographies. All years use quarter 4 data.
-
-    Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html
-    """
-    if not (2016 <= year <= 2025):
-        raise ValueError("Must choose a year between 2016 and 2025")
-
-    data = get_dataset(f"crosswalk/zip_to_county/zip_to_county_{year}.parquet")
-    lf = pl.scan_parquet(data)
-
-    if as_pandas:
-        return lf.collect().to_pandas()
-
-    return lf
diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py
index 4602438..3705208 100644
--- a/tests/crosswalk_test.py
+++ b/tests/crosswalk_test.py
@@ -3,16 +3,39 @@
 import pytest
 from pandas import DataFrame
 
-from kintsugi.crosswalk import county_to_zip, zip_to_county
+from kintsugi.crosswalk import county_to_zip, puma_2010_2020, zip_to_county
 
 from .models import BasePolarsModel
 
 
+class PUMAVersionCrosswalk(BasePolarsModel):
+    puma_geoid_2010: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    puma_geoid_2020: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_PUMA_2010_to_2020_MCDC: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_PUMA_2020_to_2010_MCDC: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
+
+    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
+        unique: list[str] = ["puma_geoid_2010", "puma_geoid_2020"]
+
+
+def test_puma_2010_2020() -> None:
+    puma_2010_2020().collect().pipe(PUMAVersionCrosswalk.validate, lazy=True)
+
+
+def test_puma_2010_2020_as_pandas() -> None:
+    df = puma_2010_2020(as_pandas=True)
+
+    assert isinstance(df, DataFrame)
+
+
 class ZipCountyCrosswalk(BasePolarsModel):
     zip_code: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
     county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
     res_ratio: pl.Float64 = pa.Field(ge=0)  # pyright: ignore [reportAny]
 
+    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
+        unique: list[str] = ["zip_code", "county_fips"]
+
 
 @pytest.mark.parametrize(
     ("year"),
diff --git a/uv.lock b/uv.lock
index df92410..9ec42ad 100644
--- a/uv.lock
+++ b/uv.lock
@@ -200,7 +200,7 @@ wheels = [
 
 [[package]]
 name = "kintsugi"
-version = "0.8.0"
+version = "0.9.0"
 source = { editable = "." }
 dependencies = [
     { name = "pandas" },

From b5125b80ed10b83fb4ad6483af18f3369f1aefc9 Mon Sep 17 00:00:00 2001
From: winter-again <63322884+winter-again@users.noreply.github.com>
Date: Fri, 13 Mar 2026 09:17:53 -0400
Subject: [PATCH 3/5] PUMA-county crosswalk

---
 src/kintsugi/_data.py     |   1 +
 src/kintsugi/crosswalk.py | 128 ++++++++++++++++++--------------------
 tests/crosswalk_test.py   |  74 +++++++++++++++++++++-
 3 files changed, 134 insertions(+), 69 deletions(-)

diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py
index 7868053..c0ff179 100644
--- a/src/kintsugi/_data.py
+++ b/src/kintsugi/_data.py
@@ -28,6 +28,7 @@
     "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a",
     "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0",
     "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239",
+    "crosswalk/PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv": "b66f23aa27be3daf1f6456607478848df3383d4c7111d6f935f792fc9542b807",
     "crosswalk/PUMA/puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666",
     "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1",
     "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb",
diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py
index 88106c7..b5593bd 100644
--- a/src/kintsugi/crosswalk.py
+++ b/src/kintsugi/crosswalk.py
@@ -5,8 +5,6 @@
 
 from ._data import get_dataset
 
-# num_county_subs = 169
-
 
 @overload
 def puma_2010_2020(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
@@ -116,71 +114,64 @@ def puma_2010_2020(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
     return lf
 
 
-# @overload
-# def crosswalk_puma_2010_county_2020(
-#     as_pandas: Literal[False] = ...,
-# ) -> pl.LazyFrame: ...
-#
-#
-# @overload
-# def crosswalk_puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ...
-#
-#
-# def crosswalk_puma_2010_county_2020(
-#     as_pandas: bool = False,
-# ) -> pl.LazyFrame | pd.DataFrame:
-#     """
-#     Crosswalk data between 2010 PUMAs and 2020 counties.
-#
-#     Note: uses new CT counties
-#
-#     Source: MCDC 2022 Geocorr
-#         - Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname=
-#     """
-#     lf = (
-#         pl.read_csv(
-#             CROSSWALK_DATA / "PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv",
-#             encoding="iso-8859-1",
-#             skip_rows_after_header=1,
-#             columns=[
-#                 "state",
-#                 "puma12",
-#                 "county",
-#                 "afact",
-#                 "afact2",
-#             ],
-#             schema_overrides={
-#                 "state": pl.String,
-#                 "puma12": pl.String,
-#                 "county": pl.String,
-#                 "afact": pl.String,
-#                 "afact2": pl.String,
-#             },
-#         )
-#         .lazy()
-#         .rename({"county": "county_fips"})
-#         .with_columns(
-#             puma_geoid=pl.col("state") + pl.col("puma12"),
-#             wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64),
-#             wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64),
-#         )
-#         .filter(
-#             pl.col("state").is_between(pl.lit("01"), pl.lit("56")),
-#             # pl.col("wt_PUMA_2010_to_county") != 0
-#         )
-#         .select(
-#             "puma_geoid",
-#             "county_fips",
-#             "wt_PUMA_2010_to_county",
-#             "wt_county_to_PUMA_2010",
-#         )
-#         .sort("puma_geoid", "county_fips")
-#     )
-#
-#     if as_pandas:
-#         return lf.collect().to_pandas()
-#
-#     return lf
+@overload
+def puma_2010_county_2020(
+    as_pandas: Literal[False] = ...,
+) -> pl.LazyFrame: ...
+
+
+@overload
+def puma_2010_county_2020(as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def puma_2010_county_2020(
+    as_pandas: bool = False,
+) -> pl.LazyFrame | pd.DataFrame:
+    """
+    Crosswalk data between 2010 PUMAs (effective 2012) and 2020 counties (effective 2022).
+
+    Note: uses new CT counties despite counties being labeled as 2020
+
+    Source: MCDC 2022 Geocorr
+    PUMA page: https://mcdc.missouri.edu/geography/PUMAs.html
+    Form query: https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=apps.geocorr2022.sas&_SERVICE=MCDC_long&_debug=0&state=Mo29&state=Al01&state=Ak02&state=Az04&state=Ar05&state=Ca06&state=Co08&state=Ct09&state=De10&state=Dc11&state=Fl12&state=Ga13&state=Hi15&state=Id16&state=Il17&state=In18&state=Ia19&state=Ks20&state=Ky21&state=La22&state=Me23&state=Md24&state=Ma25&state=Mi26&state=Mn27&state=Ms28&state=Mt30&state=Ne31&state=Nv32&state=Nh33&state=Nj34&state=Nm35&state=Ny36&state=Nc37&state=Nd38&state=Oh39&state=Ok40&state=Or41&state=Pa42&state=Pr72&state=Ri44&state=Sc45&state=Sd46&state=Tn47&state=Tx48&state=Ut49&state=Vt50&state=Va51&state=Wa53&state=Wv54&state=Wi55&state=Wy56&g1_=puma12&g2_=county&wtvar=pop20&nozerob=1&fileout=1&filefmt=csv&lstfmt=html&title=&afacts2=on&counties=&metros=&places=&oropt=&latitude=&longitude=&distance=&kiloms=0&locname=
+    """
+    data = get_dataset(
+        "crosswalk/PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv"
+    )
+    lf = (
+        pl.scan_csv(
+            data,
+            encoding="utf8-lossy",
+            skip_rows_after_header=1,
+            schema_overrides={
+                "state": pl.String,
+                "puma12": pl.String,
+                "county": pl.String,
+                "afact": pl.String,
+                "afact2": pl.String,
+            },
+        )
+        .filter(pl.col("state").is_between(pl.lit("01"), pl.lit("56")))
+        .rename({"county": "county_fips"})
+        .with_columns(
+            puma_geoid_2010=pl.col("state") + pl.col("puma12"),
+            wt_PUMA_2010_to_county=pl.col("afact").str.strip_chars().cast(pl.Float64),
+            wt_county_to_PUMA_2010=pl.col("afact2").str.strip_chars().cast(pl.Float64),
+        )
+        .select(
+            "puma_geoid_2010",
+            "county_fips",
+            "wt_PUMA_2010_to_county",
+            "wt_county_to_PUMA_2010",
+        )
+        .sort("puma_geoid_2010", "county_fips")
+    )
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
 
 
 @overload
@@ -311,6 +302,7 @@ def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataF
 #     )
 #     assert crosswalk.select(pl.len()).collect().item() == num_county_subs
 #
+#     num_county_subs = 169
 #     subcounty = (
 #         pl.scan_csv(
 #             CROSSWALK_DATA / "county/sub-est2024_9.csv",
@@ -349,7 +341,7 @@ def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataF
 #         .drop("state_fips", "county_fips", "sumlev")
 #         .sort("county_sub_fips")
 #     )
-#     assert subcounty.select(pl.len()).collect().item() == num_county_subs
+#     assert subcounty.select(pl.len()).collect().item() == 169
 #
 #     lf = (
 #         crosswalk.join(
diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py
index 3705208..a1efa29 100644
--- a/tests/crosswalk_test.py
+++ b/tests/crosswalk_test.py
@@ -2,8 +2,14 @@
 import polars as pl
 import pytest
 from pandas import DataFrame
+from pandera.polars import PolarsData
 
-from kintsugi.crosswalk import county_to_zip, puma_2010_2020, zip_to_county
+from kintsugi.crosswalk import (
+    county_to_zip,
+    puma_2010_2020,
+    puma_2010_county_2020,
+    zip_to_county,
+)
 
 from .models import BasePolarsModel
 
@@ -17,6 +23,24 @@ class PUMAVersionCrosswalk(BasePolarsModel):
     class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
         unique: list[str] = ["puma_geoid_2010", "puma_geoid_2020"]
 
+    @pa.dataframe_check
+    def has_correct_states(cls, data: PolarsData) -> bool:
+        return (
+            data.lazyframe.select(
+                pl.all_horizontal(
+                    pl.col("puma_geoid_2010")
+                    .str.slice(0, 2)
+                    .is_between(pl.lit("01"), pl.lit("56")),
+                    pl.col("puma_geoid_2020")
+                    .str.slice(0, 2)
+                    .is_between(pl.lit("01"), pl.lit("56")),
+                ).all()
+            )
+            .collect()
+            .item()
+            is True
+        )
+
 
 def test_puma_2010_2020() -> None:
     puma_2010_2020().collect().pipe(PUMAVersionCrosswalk.validate, lazy=True)
@@ -28,6 +52,40 @@ def test_puma_2010_2020_as_pandas() -> None:
     assert isinstance(df, DataFrame)
 
 
+class PUMACountyCrosswalk(BasePolarsModel):
+    puma_geoid_2010: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_PUMA_2010_to_county: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_county_to_PUMA_2010: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
+
+    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
+        unique: list[str] = ["puma_geoid_2010", "county_fips"]
+
+    @pa.dataframe_check
+    def has_correct_states(cls, data: PolarsData) -> bool:
+        return (
+            data.lazyframe.select(
+                pl.col("county_fips")
+                .str.slice(0, 2)
+                .is_between(pl.lit("01"), pl.lit("56"))
+                .all()
+            )
+            .collect()
+            .item()
+            is True
+        )
+
+
+def test_puma_2010_county_2020() -> None:
+    puma_2010_county_2020().collect().pipe(PUMACountyCrosswalk.validate, lazy=True)
+
+
+def test_puma_2010_county_2020_as_pandas() -> None:
+    df = puma_2010_county_2020(as_pandas=True)
+
+    assert isinstance(df, DataFrame)
+
+
 class ZipCountyCrosswalk(BasePolarsModel):
     zip_code: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
     county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
@@ -36,6 +94,20 @@ class ZipCountyCrosswalk(BasePolarsModel):
     class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
         unique: list[str] = ["zip_code", "county_fips"]
 
+    @pa.dataframe_check
+    def has_correct_states(cls, data: PolarsData) -> bool:
+        return (
+            data.lazyframe.select(
+                pl.col("county_fips")
+                .str.slice(0, 2)
+                .is_between(pl.lit("01"), pl.lit("56"))
+                .all()
+            )
+            .collect()
+            .item()
+            is True
+        )
+
 
 @pytest.mark.parametrize(
     ("year"),

From 71c88ac9ee4e95fd4e25102d958e73f9c993a6d9 Mon Sep 17 00:00:00 2001
From: winter-again <63322884+winter-again@users.noreply.github.com>
Date: Fri, 13 Mar 2026 11:16:09 -0400
Subject: [PATCH 4/5] Crosswalk CT counties

---
 src/kintsugi/_data.py     |   2 +
 src/kintsugi/crosswalk.py | 336 +++++++++++++++++++-------------------
 tests/crosswalk_test.py   |  51 +++++-
 3 files changed, 215 insertions(+), 174 deletions(-)

diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py
index c0ff179..210c3be 100644
--- a/src/kintsugi/_data.py
+++ b/src/kintsugi/_data.py
@@ -18,6 +18,8 @@
     "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843",
     "county_neighbors/county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816",
     "county_neighbors/county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575",
+    "crosswalk/county/ct_cou_to_cousub_crosswalk.txt": "13ad002564e30e6dcd1df112fdaa985f2cd0cdb908f7f3017cd2723db294cff8",
+    "crosswalk/county/sub-est2024_9.csv": "e5bd2cb1b10cf12d741572eef9e8eff19a1f6c5e08eb910d6f44f299c7ca83df",
     "crosswalk/county_to_zip/county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad",
     "crosswalk/county_to_zip/county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b",
     "crosswalk/county_to_zip/county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba",
diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py
index b5593bd..9d9954b 100644
--- a/src/kintsugi/crosswalk.py
+++ b/src/kintsugi/crosswalk.py
@@ -234,172 +234,170 @@ def zip_to_county(year: int, as_pandas: bool = False) -> pl.LazyFrame | pd.DataF
     return lf
 
 
-# @overload
-# def crosswalk_CT_counties(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
-#
-#
-# @overload
-# def crosswalk_CT_counties(as_pandas: Literal[True]) -> pd.DataFrame: ...
-#
-#
-# def crosswalk_CT_counties(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
-#     """
-#     Crosswalk CT counties between pre and post-2022 changes. Weights calculated based on county subdivision populations.
-#
-#     See:
-#         - FIPS code changes: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.html
-#         - CT specific: https://www2.census.gov/geo/pdfs/reference/ct_county_equiv_change.pdf
-#         - CT specific: https://www.federalregister.gov/documents/2022/06/06/2022-12063/change-to-county-equivalents-in-the-state-of-connecticut
-#
-#     Crosswalk: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.January_2020.html
-#     Source: https://www2.census.gov/geo/docs/reference/ct_change/ct_cou_to_cousub_crosswalk.txt
-#
-#     CT county subdivision populations source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html
-#     FTP: https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/cities/totals/sub-est2024_9.csv
-#
-#     """
-#     crosswalk = (
-#         pl.scan_csv(
-#             CROSSWALK_DATA / "county/ct_cou_to_cousub_crosswalk.txt",
-#             separator="|",
-#             n_rows=174,
-#             infer_schema=False,
-#         )
-#         .select(
-#             "STATEFP\n(INCITS38)",
-#             "OLD_COUNTYFP\n(INCITS31)",
-#             "OLD_COUNTY_NAMELSAD",
-#             "NEW_COUNTYFP\n(INCITS31)",
-#             "NEW_COUNTY_NAMELSAD",
-#             "COUSUBFP",
-#             "OLD_COUSUB_GEOID",
-#             "NEW_COUSUB_GEOID",
-#             "COUSUB_NAMELSAD",
-#         )
-#         .rename(
-#             {
-#                 "STATEFP\n(INCITS38)": "state_fips",
-#                 "OLD_COUNTYFP\n(INCITS31)": "county_fips_old",
-#                 "OLD_COUNTY_NAMELSAD": "county_name_old",
-#                 "NEW_COUNTYFP\n(INCITS31)": "county_fips_new",
-#                 "NEW_COUNTY_NAMELSAD": "county_name_new",
-#                 "COUSUBFP": "county_sub_fips",
-#                 "OLD_COUSUB_GEOID": "county_sub_geoid_old",
-#                 "NEW_COUSUB_GEOID": "county_sub_geoid_new",
-#                 "COUSUB_NAMELSAD": "county_sub_name",
-#             }
-#         )
-#         .filter(
-#             # NOTE: 5 rows labeled with "County subdivisions not defined" but their GEOID/FIPS
-#             # doesn't make sense anyway
-#             pl.col("county_sub_fips") != "00000"
-#         )
-#         .with_columns(
-#             (pl.col("state_fips") + pl.col(col)).alias(col)
-#             for col in ["county_fips_old", "county_fips_new"]
-#         )
-#         .drop("state_fips")
-#     )
-#     assert crosswalk.select(pl.len()).collect().item() == num_county_subs
-#
-#     num_county_subs = 169
-#     subcounty = (
-#         pl.scan_csv(
-#             CROSSWALK_DATA / "county/sub-est2024_9.csv",
-#             schema_overrides={
-#                 "SUMLEV": pl.String,
-#                 "STATE": pl.String,
-#                 "COUNTY": pl.String,
-#                 "COUSUB": pl.String,
-#                 "NAME": pl.String,
-#                 "POPESTIMATE2024": pl.Int64,
-#             },
-#         )
-#         .select(
-#             "SUMLEV",
-#             "STATE",
-#             "COUNTY",
-#             "COUSUB",
-#             "NAME",
-#             "POPESTIMATE2024",
-#         )
-#         .rename(
-#             {
-#                 "SUMLEV": "sumlev",
-#                 "STATE": "state_fips",
-#                 "COUNTY": "county_fips",
-#                 "COUSUB": "county_sub_fips",
-#                 "NAME": "county_sub_name",
-#                 "POPESTIMATE2024": "pop_2024",
-#             }
-#         )
-#         .filter(
-#             # only minor civil divisions
-#             pl.col("sumlev") == "061"
-#         )
-#         .with_columns(county_fips=pl.col("state_fips") + pl.col("county_fips"))
-#         .drop("state_fips", "county_fips", "sumlev")
-#         .sort("county_sub_fips")
-#     )
-#     assert subcounty.select(pl.len()).collect().item() == 169
-#
-#     lf = (
-#         crosswalk.join(
-#             subcounty,
-#             on="county_sub_fips",
-#             how="inner",
-#             validate="1:1",
-#         )
-#         .select(
-#             "county_sub_fips",
-#             "county_sub_name",
-#             "county_fips_new",
-#             "county_name_new",
-#             "county_fips_old",
-#             "county_name_old",
-#             "pop_2024",
-#         )
-#         .with_columns(
-#             pop_old=pl.col("pop_2024").sum().over("county_fips_old"),
-#             pop_new=pl.col("pop_2024").sum().over("county_fips_new"),
-#         )
-#         .group_by(
-#             [
-#                 "county_fips_new",
-#                 "county_name_new",
-#                 "county_fips_old",
-#                 "county_name_old",
-#                 "pop_old",
-#                 "pop_new",
-#             ]
-#         )
-#         .agg(pop_agg=pl.col("pop_2024").sum())  # agg county sub to county pairs
-#         # NOTE: want weights to be expected prop. of origin FIPS that are located in dest. FIPS
-#         # Aka prop. of origin FIPS that is located in dest. FIPS
-#         # Ex: wt_new_to_old should give expected prop. of new FIPS that is located in old FIPS.
-#         .with_columns(
-#             wt_new_to_old=pl.col("pop_agg") / pl.col("pop_new"),
-#             wt_old_to_new=pl.col("pop_agg") / pl.col("pop_old"),
-#         )
-#         .select(
-#             "county_fips_old",
-#             "county_name_old",
-#             "county_fips_new",
-#             "county_name_new",
-#             "wt_new_to_old",
-#             "wt_old_to_new",
-#         )
-#         .sort("county_fips_old")
-#     )
-#     assert (
-#         lf.select(pl.len()).collect().item()
-#         == lf.unique(["county_fips_old", "county_fips_new"])
-#         .select(pl.len())
-#         .collect()
-#         .item()
-#     )
-#
-#     if as_pandas:
-#         return lf.collect().to_pandas()
-#
-#     return lf
+@overload
+def counties_CT(as_pandas: Literal[False] = ...) -> pl.LazyFrame: ...
+
+
+@overload
+def counties_CT(as_pandas: Literal[True]) -> pd.DataFrame: ...
+
+
+def counties_CT(as_pandas: bool = False) -> pl.LazyFrame | pd.DataFrame:
+    """
+    Crosswalk CT counties between pre and post-2022 changes.
+
+    Uses county subdivisions as a more accurate intermediary.
+    Weights calculated based on county subdivision 2024 populations.
+
+    Returns a LazyFrame with one row per old/new county pair, sorted by
+    `county_fips_old`; with `as_pandas=True` it is collected into a pandas DataFrame.
+
+    See:
+        - https://www2.census.gov/geo/pdfs/reference/ct_county_equiv_change.pdf
+        - https://www.federalregister.gov/documents/2022/06/06/2022-12063/change-to-county-equivalents-in-the-state-of-connecticut
+
+    Crosswalk data: https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.January_2020.html
+    Source: https://www2.census.gov/geo/docs/reference/ct_change/ct_cou_to_cousub_crosswalk.txt
+
+    CT county subdivision populations source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-total-cities-and-towns.html
+    FTP: https://www2.census.gov/programs-surveys/popest/datasets/2020-2024/cities/totals/sub-est2024_9.csv
+
+    """
+    num_county_subs = 169
+    data_county_to_sub = get_dataset("crosswalk/county/ct_cou_to_cousub_crosswalk.txt")
+    # NOTE: provides mapping of county subdivisions between old and new CT counties
+    crosswalk = (
+        pl.scan_csv(
+            data_county_to_sub,
+            separator="|",
+            n_rows=174,  # 169 county subdivisions + 5 "not defined" rows
+            infer_schema=False,
+        )
+        .rename(lambda col: col.strip().lower().split("(")[0].strip())
+        .rename(
+            {
+                "statefp": "state_fips",
+                "old_countyfp": "county_fips_old",
+                "old_county_namelsad": "county_name_old",
+                "new_countyfp": "county_fips_new",
+                "new_county_namelsad": "county_name_new",
+                "cousubfp": "county_sub_fips",
+                "old_cousub_geoid": "county_sub_geoid_old",
+                "new_cousub_geoid": "county_sub_geoid_new",
+                "cousub_namelsad": "county_sub_name",
+            }
+        )
+        .filter(
+            # NOTE: 5 rows labeled with "County subdivisions not defined" but their GEOID/FIPS
+            # doesn't make sense anyway
+            pl.col("county_sub_fips") != "00000"
+        )
+        .with_columns(
+            (pl.col("state_fips") + pl.col(col)).alias(col)
+            for col in ["county_fips_old", "county_fips_new"]
+        )
+        .select(
+            "county_sub_fips",
+            "county_fips_new",
+            "county_name_new",
+            "county_fips_old",
+            "county_name_old",
+        )
+    )
+    assert crosswalk.select(pl.len()).collect().item() == num_county_subs
+
+    data_county_sub = get_dataset("crosswalk/county/sub-est2024_9.csv")
+    # NOTE: provides county subdivision population counts
+    county_sub = (
+        pl.scan_csv(
+            data_county_sub,
+            schema_overrides={
+                "SUMLEV": pl.String,
+                "STATE": pl.String,
+                "COUNTY": pl.String,
+                "COUSUB": pl.String,
+                "NAME": pl.String,
+                "POPESTIMATE2024": pl.Int64,
+            },
+        )
+        .rename(
+            {
+                "SUMLEV": "sumlev",
+                "COUSUB": "county_sub_fips",
+                "NAME": "county_sub_name",
+                "POPESTIMATE2024": "pop_2024",
+            }
+        )
+        .select(
+            "sumlev",
+            "county_sub_fips",
+            "county_sub_name",
+            "pop_2024",
+        )
+        .filter(
+            # 061 apparently considered part of 060 (county subdivisions/minor civil divisions)
+            # Use because it's one level below county (050)
+            pl.col("sumlev") == "061"
+        )
+        .drop("sumlev")
+        .sort("county_sub_fips")
+    )
+    assert county_sub.select(pl.len()).collect().item() == num_county_subs
+
+    lf = (
+        crosswalk.join(
+            county_sub,
+            on="county_sub_fips",
+            how="inner",
+            validate="1:1",
+        )
+        .select(
+            "county_sub_fips",
+            "county_sub_name",
+            "county_fips_new",
+            "county_name_new",
+            "county_fips_old",
+            "county_name_old",
+            "pop_2024",
+        )
+        # NOTE: take whole-county totals before group_by collapses the subdivision rows
+        .with_columns(
+            pop_old=pl.col("pop_2024").sum().over("county_fips_old"),
+            pop_new=pl.col("pop_2024").sum().over("county_fips_new"),
+        )
+        .group_by(
+            [
+                "county_fips_new",
+                "county_name_new",
+                "county_fips_old",
+                "county_name_old",
+                "pop_old",
+                "pop_new",
+            ]
+        )
+        .agg(
+            # sum county subdivision population by new-county-old-county pairs
+            pop_intersect=pl.col("pop_2024").sum()
+        )
+        # NOTE: each weight is the share of the origin FIPS population that falls in the
+        # intersection with the dest. FIPS, i.e. pop_intersect / origin population.
+        # Ex: wt_new_to_old should give expected prop. of new FIPS that is located in old FIPS.
+        .with_columns(
+            wt_new_to_old=pl.col("pop_intersect") / pl.col("pop_new"),
+            wt_old_to_new=pl.col("pop_intersect") / pl.col("pop_old"),
+        )
+        .select(
+            "county_fips_old",
+            "county_name_old",
+            "county_fips_new",
+            "county_name_new",
+            "wt_new_to_old",
+            "wt_old_to_new",
+        )
+        .sort("county_fips_old")
+    )
+
+    if as_pandas:
+        return lf.collect().to_pandas()
+
+    return lf
diff --git a/tests/crosswalk_test.py b/tests/crosswalk_test.py
index a1efa29..2276817 100644
--- a/tests/crosswalk_test.py
+++ b/tests/crosswalk_test.py
@@ -5,6 +5,7 @@
 from pandera.polars import PolarsData
 
 from kintsugi.crosswalk import (
+    counties_CT,
     county_to_zip,
     puma_2010_2020,
     puma_2010_county_2020,
@@ -17,8 +18,8 @@
 class PUMAVersionCrosswalk(BasePolarsModel):
     puma_geoid_2010: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
     puma_geoid_2020: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
-    wt_PUMA_2010_to_2020_MCDC: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
-    wt_PUMA_2020_to_2010_MCDC: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_PUMA_2010_to_2020_MCDC: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
+    wt_PUMA_2020_to_2010_MCDC: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
 
     class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
         unique: list[str] = ["puma_geoid_2010", "puma_geoid_2020"]
@@ -55,8 +56,8 @@ def test_puma_2010_2020_as_pandas() -> None:
 class PUMACountyCrosswalk(BasePolarsModel):
     puma_geoid_2010: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
     county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
-    wt_PUMA_2010_to_county: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
-    wt_county_to_PUMA_2010: pl.Float64  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_PUMA_2010_to_county: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
+    wt_county_to_PUMA_2010: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
 
     class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
         unique: list[str] = ["puma_geoid_2010", "county_fips"]
@@ -89,7 +90,7 @@ def test_puma_2010_county_2020_as_pandas() -> None:
 class ZipCountyCrosswalk(BasePolarsModel):
     zip_code: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
     county_fips: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
-    res_ratio: pl.Float64 = pa.Field(ge=0)  # pyright: ignore [reportAny]
+    res_ratio: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
 
     class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
         unique: list[str] = ["zip_code", "county_fips"]
@@ -153,3 +154,43 @@ def test_zip_to_county_as_pandas(year: int) -> None:
 def test_zip_to_county_year_exception() -> None:
     with pytest.raises(ValueError, match="Must choose a year between 2016 and 2025"):
         zip_to_county(2010)
+
+
+class CountiesCT(BasePolarsModel):
+    """Schema used to validate the old/new CT county crosswalk in these tests."""
+
+    county_fips_old: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    county_name_old: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    county_fips_new: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    county_name_new: pl.String  # pyright: ignore [reportUninitializedInstanceVariable]
+    wt_new_to_old: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
+    wt_old_to_new: pl.Float64 = pa.Field(ge=0, le=1.0)  # pyright: ignore [reportAny]
+
+    class Config:  # pyright: ignore [reportIncompatibleVariableOverride]
+        # Each old/new county pair may appear only once
+        unique: list[str] = ["county_fips_old", "county_fips_new"]
+
+    @pa.dataframe_check
+    def has_correct_states(cls, data: PolarsData) -> bool:
+        """Check that both FIPS columns start with a 2-char prefix within "01".."56"."""
+        # String comparison on the first two characters of each FIPS code
+        state_checks = [
+            pl.col(fips_col).str.slice(0, 2).is_between(pl.lit("01"), pl.lit("56"))
+            for fips_col in ("county_fips_old", "county_fips_new")
+        ]
+        verdict = (
+            data.lazyframe.select(pl.all_horizontal(*state_checks).all())
+            .collect()
+            .item()
+        )
+        return verdict is True
+
+
+def test_counties_CT() -> None:
+    CountiesCT.validate(counties_CT().collect(), lazy=True)  # lazy: gather all errors
+
+
+def test_counties_CT_as_pandas() -> None:
+    """`as_pandas=True` should return a `DataFrame` rather than a LazyFrame."""
+    result = counties_CT(as_pandas=True)
+    assert isinstance(result, DataFrame)

From d4613b0cf44eae3a5e05082aec5d091e90b9f015 Mon Sep 17 00:00:00 2001
From: winter-again <63322884+winter-again@users.noreply.github.com>
Date: Fri, 13 Mar 2026 11:41:15 -0400
Subject: [PATCH 5/5] Bump version and add crosswalk example to README

---
 README.md      | 10 +++++++++-
 pyproject.toml |  2 +-
 uv.lock        |  2 +-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 32d0384..7c8a0e8 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ pip install git+ssh://git@github.com/winter-again/kintsugi
 
 ## Datasets
 
-Currently supported datasets:
+Currently supported datasets. Where supported, pass `as_pandas=True` to get a pandas `DataFrame` back:
 
 County neighbors
 
@@ -71,3 +71,11 @@ from kintsugi.metadata import counties
 
 lf_counties = counties(2020)
 ```
+
+Crosswalk 2010 PUMAs to 2020 counties
+
+```python
+from kintsugi.crosswalk import puma_2010_county_2020
+
+crosswalk = puma_2010_county_2020()
+```
diff --git a/pyproject.toml b/pyproject.toml
index b2c378d..3382a06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "kintsugi"
-version = "0.9.0"
+version = "0.10.0"
 description = "Commonly used datasets and functions"
 readme = "README.md"
 authors = [
diff --git a/uv.lock b/uv.lock
index 9ec42ad..b3323e7 100644
--- a/uv.lock
+++ b/uv.lock
@@ -200,7 +200,7 @@ wheels = [
 
 [[package]]
 name = "kintsugi"
-version = "0.9.0"
+version = "0.10.0"
 source = { editable = "." }
 dependencies = [
     { name = "pandas" },