diff --git a/pyproject.toml b/pyproject.toml index b2c378d..3382a06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kintsugi" -version = "0.9.0" +version = "0.10.0" description = "Commonly used datasets and functions" readme = "README.md" authors = [ diff --git a/src/kintsugi/_data.py b/src/kintsugi/_data.py deleted file mode 100644 index 210c3be..0000000 --- a/src/kintsugi/_data.py +++ /dev/null @@ -1,164 +0,0 @@ -import hashlib -import logging -import os -import shutil -import tempfile -from pathlib import Path - -import requests -from platformdirs import user_cache_path -from requests.adapters import HTTPAdapter, Retry - -logger = logging.getLogger("kintsugi") -logger.addHandler(logging.NullHandler()) - -BASE_URL = "https://raw.githubusercontent.com/bansallab/kintsugi-data/main/data" -DATASETS = { - "county_neighbors/county_adjacency2010.txt": "7edda309ad38a4dfc6a6c6c30e1753e5490b9c8a3aa4563188841989b4fe9a96", - "county_neighbors/county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843", - "county_neighbors/county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816", - "county_neighbors/county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575", - "crosswalk/county/ct_cou_to_cousub_crosswalk.txt": "13ad002564e30e6dcd1df112fdaa985f2cd0cdb908f7f3017cd2723db294cff8", - "crosswalk/county/sub-est2024_9.csv": "e5bd2cb1b10cf12d741572eef9e8eff19a1f6c5e08eb910d6f44f299c7ca83df", - "crosswalk/county_to_zip/county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad", - "crosswalk/county_to_zip/county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b", - "crosswalk/county_to_zip/county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba", - "crosswalk/county_to_zip/county_to_zip_2019.parquet": 
"3a06ca1a4d21638a4d014cbb4bc29b7d6b254a937dab1536058152d226073d62", - "crosswalk/county_to_zip/county_to_zip_2020.parquet": "06cb2bfa55085d9c5f0f652628a61d1bf64fe14ea190df5bdc361844f5204250", - "crosswalk/county_to_zip/county_to_zip_2021.parquet": "fdd59a6b8ab06a534d1c9c4765101105ce8bc08d9685eb9441c58701bffd96ae", - "crosswalk/county_to_zip/county_to_zip_2022.parquet": "c3875e66bff03ff71ad73b89277fc892a294a5d71be08b4eb4ac2c431d715b83", - "crosswalk/county_to_zip/county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a", - "crosswalk/county_to_zip/county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0", - "crosswalk/county_to_zip/county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239", - "crosswalk/PUMA/geocorr_puma_2010_to_county_2020_with_afact2.csv": "b66f23aa27be3daf1f6456607478848df3383d4c7111d6f935f792fc9542b807", - "crosswalk/PUMA/puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666", - "crosswalk/zip_to_county/zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1", - "crosswalk/zip_to_county/zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb", - "crosswalk/zip_to_county/zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942", - "crosswalk/zip_to_county/zip_to_county_2019.parquet": "78ac9cdda7c5e00b82315ceaece4f3722da905e2ec4f68774a135d327c257435", - "crosswalk/zip_to_county/zip_to_county_2020.parquet": "17f518b2a6de5ee58b134c0dc5fbf10f6d48c45b14fdbfcfe58a4269e08eaee2", - "crosswalk/zip_to_county/zip_to_county_2021.parquet": "655c6ce8b257e942ac1c7dfa068d93ad2a1f139c9cd0c110efbce752ab27cea4", - "crosswalk/zip_to_county/zip_to_county_2022.parquet": "9f125642aeb5aba3c8f2bdf616c186f975fc6afc1b5543d55204ad9b2a10b252", - "crosswalk/zip_to_county/zip_to_county_2023.parquet": 
"89f50d28acb226a5777ededc1f8301dc1f028c65dd04b860355d23efe24b89c5", - "crosswalk/zip_to_county/zip_to_county_2024.parquet": "8d76e09fa52923c0901c0afad26ba2d32075210bb0ab5b82caa14ce5465b3d80", - "crosswalk/zip_to_county/zip_to_county_2025.parquet": "2683272f9759646d0a6234de06076f274852b85fcd2fd9e3d2dfcb3aa10def80", - "geo/cb_2020_us_county_5m.zip": "187e7118304428e5450083beb375e67c2c516c58a01ce52db95aaf24f18df3ba", - "geo/cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3", - "geo/cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b", - "geo/cb_2024_us_state_5m.zip": "c9db0e395c11a1f94a8017fde4f4c7cbee1dca6eb37ba8f1ccaab927df70885f", - "pop/county_cc/county_pop_2016.parquet": "74caad19bf5eed856ad9b6f63c65f7fceca612dec680d0768890de2265116607", - "pop/county_cc/county_pop_2017.parquet": "d93d027929861e115cf34b15f1ff7c697c8eaa327b73cd8132710a11860a63d5", - "pop/county_cc/county_pop_2018.parquet": "be3d3bab642a9f6f111c792a431f940b1753373194993885e4d47c136feed91a", - "pop/county_cc/county_pop_2019.parquet": "98801f118cd795c026a8269d5ac6674f98b9d47e0207c6a2721a5b7f4b6e5c08", - "pop/county_cc/county_pop_2020.parquet": "f1e4f282d297dc5498b6f839412c0815ca6f9e0a15d83d5d3867f2d70aa8413d", - "pop/county_cc/county_pop_2021.parquet": "3af369564ebb0e1fda25b440e5bf133ecb2d2eab60ab40f5db1f0a0955db713b", - "pop/county_cc/county_pop_2022.parquet": "977856eb5fffd508442ccedaa54c92e338b037135e5a9be55a03c7132863d9ca", - "pop/county_cc/county_pop_2023.parquet": "a4d66c302a557c1565ec9f43bad5ea9d4267576d1fbd17d8939e5a858a3d73e7", - "pop/county_cc/county_pop_2024.parquet": "12b16c7c20329a3df2f4120f6ec9a9a7313147fad0fd03bc360b1de5769c8abd", - "pop/state/state_pop_2016.parquet": "bac51c5ba4a9ff7305e92b3b2804c854fc20b9cbcf01156e5439d92668c0c81e", - "pop/state/state_pop_2017.parquet": "6fb950b1b78409af8130317b08b437b742c0906ff9d5c38655c1189103b8dddc", - "pop/state/state_pop_2018.parquet": 
"913fca35299028a842325000e58e33cd3912c1e900d480f00b468095398e57f8", - "pop/state/state_pop_2019.parquet": "7ca2c87065f24857178bb33a7512cb799a92890596bac6fff1cbeb3c69f6fc36", - "pop/state/state_pop_2020.parquet": "275b861e07f1c2327fb5382a28e84a5fb7ac4f896ae9f91b06612f6197af9611", - "pop/state/state_pop_2021.parquet": "8b47a5c9fdca838954c8ddac8265ad00d590281c7b444019070c81b9942a727e", - "pop/state/state_pop_2022.parquet": "ea113b3766c44bbf250e01b0b9509e810590119b3b9470b13dc347d43aed042b", - "pop/state/state_pop_2023.parquet": "e96a982342510fe6a1ba90fc85a9bd6fbdd8687bceaf76e6e117606429d2d160", - "pop/state/state_pop_2024.parquet": "b79bca471a68b8c3742ec30d41a2b65ab1227152e81239faf00763188752c6ff", - "county_groups.parquet": "7d7c150b5efd5596e0eaaed27abd6dc86137f08ff677c2606d402b9d165b87fa", - "state.txt": "bea4e03f71a1fa0045ae732aabad11fa541e5932b071c2369bb0d325e8cba5a0", -} - - -class GetDatasetError(Exception): - pass - - -def get_dataset(file_name: str) -> Path: - """ - Ensure valid dataset file is present in cache and return its path - """ - if file_name not in DATASETS: - raise ValueError(f"{file_name} not in dataset") - - file_cached = get_cache_dir() / file_name - if not file_cached.is_file() or not file_valid(file_name, file_cached): - logger.debug( - f"{file_name} not in cache or cached file is invalid. Getting file from kintsugi-data repository." 
- ) - try: - download_dataset(file_name, file_cached) - except GetDatasetError as err: - logger.exception(f"Unable to get dataset: {file_name}", exc_info=err) - raise - else: - logger.debug(f"{file_name} already exists in cache") - - return file_cached - - -def get_cache_dir() -> Path: - """ - Ensure cache directory exists and return its absolute path - """ - cache_dir = ( - Path(os.getenv("KINTSUGI_CACHE", user_cache_path("kintsugi-data"))) - .expanduser() - .resolve() - ) - try: - cache_dir.mkdir(parents=True, exist_ok=True) - except (OSError, PermissionError) as err: - logger.exception( - f"Error while setting up cache directory at {cache_dir}", exc_info=err - ) - raise - - return cache_dir - - -def download_dataset(file_name: str, file_cached: Path) -> None: - """ - Download dataset file, verify integrity via checksum, and save to cache - """ - url = f"{BASE_URL}/{file_name}" - # 5 total retries, sleep for [0.0, 0.2, 0.4, 0.8, ...] seconds between retries after second try - retries = Retry( - total=5, - backoff_factor=0.1, - status_forcelist=[500, 502, 503, 504], - raise_on_status=True, - ) - adapter = HTTPAdapter(max_retries=retries) - - with requests.Session() as s: - s.mount("https://raw.githubusercontent.com/", adapter) - try: - res = s.get(url) - except requests.exceptions.RetryError as err: - raise GetDatasetError( - f"Error getting data file {file_name} despite retry strategy" - ) from err - - with tempfile.NamedTemporaryFile("wb", delete_on_close=False) as fp: - fp.write(res.content) - fp.close() - - tmp_file = Path(fp.name) - if not file_valid(file_name, tmp_file): - raise GetDatasetError( - f"Checksum for {file_name} is invalid. File not copied to cache" - ) - - logger.info(f"File {file_name} valid. 
Copying to cache") - file_cached.parent.mkdir(parents=True, exist_ok=True) - shutil.copyfile(tmp_file, file_cached) - - -def file_valid(file_name: str, file: Path) -> bool: - """ - Validate dataset file via sha256 checksum - """ - with open(file, "rb") as f: - digest = hashlib.file_digest(f, "sha256") - - return digest.hexdigest() == DATASETS[file_name] diff --git a/src/kintsugi/_data/__init__.py b/src/kintsugi/_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/kintsugi/_data/checksums.json b/src/kintsugi/_data/checksums.json new file mode 100644 index 0000000..235adcb --- /dev/null +++ b/src/kintsugi/_data/checksums.json @@ -0,0 +1,74 @@ +{ + "county_neighbors": { + "county_adjacency2010.txt": "7edda309ad38a4dfc6a6c6c30e1753e5490b9c8a3aa4563188841989b4fe9a96", + "county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843", + "county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816", + "county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575" + }, + "crosswalk": { + "county": { + "ct_cou_to_cousub_crosswalk.txt": "13ad002564e30e6dcd1df112fdaa985f2cd0cdb908f7f3017cd2723db294cff8", + "sub-est2024_9.csv": "e5bd2cb1b10cf12d741572eef9e8eff19a1f6c5e08eb910d6f44f299c7ca83df" + }, + "county_to_zip": { + "county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad", + "county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b", + "county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba", + "county_to_zip_2019.parquet": "3a06ca1a4d21638a4d014cbb4bc29b7d6b254a937dab1536058152d226073d62", + "county_to_zip_2020.parquet": "06cb2bfa55085d9c5f0f652628a61d1bf64fe14ea190df5bdc361844f5204250", + "county_to_zip_2021.parquet": "fdd59a6b8ab06a534d1c9c4765101105ce8bc08d9685eb9441c58701bffd96ae", + "county_to_zip_2022.parquet": 
"c3875e66bff03ff71ad73b89277fc892a294a5d71be08b4eb4ac2c431d715b83", + "county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a", + "county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0", + "county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239" + }, + "PUMA": { + "geocorr_puma_2010_to_county_2020_with_afact2.csv": "b66f23aa27be3daf1f6456607478848df3383d4c7111d6f935f792fc9542b807", + "puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666" + }, + "zip_to_county": { + "zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1", + "zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb", + "zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942", + "zip_to_county_2019.parquet": "78ac9cdda7c5e00b82315ceaece4f3722da905e2ec4f68774a135d327c257435", + "zip_to_county_2020.parquet": "17f518b2a6de5ee58b134c0dc5fbf10f6d48c45b14fdbfcfe58a4269e08eaee2", + "zip_to_county_2021.parquet": "655c6ce8b257e942ac1c7dfa068d93ad2a1f139c9cd0c110efbce752ab27cea4", + "zip_to_county_2022.parquet": "9f125642aeb5aba3c8f2bdf616c186f975fc6afc1b5543d55204ad9b2a10b252", + "zip_to_county_2023.parquet": "89f50d28acb226a5777ededc1f8301dc1f028c65dd04b860355d23efe24b89c5", + "zip_to_county_2024.parquet": "8d76e09fa52923c0901c0afad26ba2d32075210bb0ab5b82caa14ce5465b3d80", + "zip_to_county_2025.parquet": "2683272f9759646d0a6234de06076f274852b85fcd2fd9e3d2dfcb3aa10def80" + } + }, + "geo": { + "cb_2020_us_county_5m.zip": "187e7118304428e5450083beb375e67c2c516c58a01ce52db95aaf24f18df3ba", + "cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3", + "cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b", + "cb_2024_us_state_5m.zip": 
"c9db0e395c11a1f94a8017fde4f4c7cbee1dca6eb37ba8f1ccaab927df70885f" + }, + "pop": { + "county_cc": { + "county_pop_2016.parquet": "74caad19bf5eed856ad9b6f63c65f7fceca612dec680d0768890de2265116607", + "county_pop_2017.parquet": "d93d027929861e115cf34b15f1ff7c697c8eaa327b73cd8132710a11860a63d5", + "county_pop_2018.parquet": "be3d3bab642a9f6f111c792a431f940b1753373194993885e4d47c136feed91a", + "county_pop_2019.parquet": "98801f118cd795c026a8269d5ac6674f98b9d47e0207c6a2721a5b7f4b6e5c08", + "county_pop_2020.parquet": "f1e4f282d297dc5498b6f839412c0815ca6f9e0a15d83d5d3867f2d70aa8413d", + "county_pop_2021.parquet": "3af369564ebb0e1fda25b440e5bf133ecb2d2eab60ab40f5db1f0a0955db713b", + "county_pop_2022.parquet": "977856eb5fffd508442ccedaa54c92e338b037135e5a9be55a03c7132863d9ca", + "county_pop_2023.parquet": "a4d66c302a557c1565ec9f43bad5ea9d4267576d1fbd17d8939e5a858a3d73e7", + "county_pop_2024.parquet": "12b16c7c20329a3df2f4120f6ec9a9a7313147fad0fd03bc360b1de5769c8abd" + }, + "state": { + "state_pop_2016.parquet": "bac51c5ba4a9ff7305e92b3b2804c854fc20b9cbcf01156e5439d92668c0c81e", + "state_pop_2017.parquet": "6fb950b1b78409af8130317b08b437b742c0906ff9d5c38655c1189103b8dddc", + "state_pop_2018.parquet": "913fca35299028a842325000e58e33cd3912c1e900d480f00b468095398e57f8", + "state_pop_2019.parquet": "7ca2c87065f24857178bb33a7512cb799a92890596bac6fff1cbeb3c69f6fc36", + "state_pop_2020.parquet": "275b861e07f1c2327fb5382a28e84a5fb7ac4f896ae9f91b06612f6197af9611", + "state_pop_2021.parquet": "8b47a5c9fdca838954c8ddac8265ad00d590281c7b444019070c81b9942a727e", + "state_pop_2022.parquet": "ea113b3766c44bbf250e01b0b9509e810590119b3b9470b13dc347d43aed042b", + "state_pop_2023.parquet": "e96a982342510fe6a1ba90fc85a9bd6fbdd8687bceaf76e6e117606429d2d160", + "state_pop_2024.parquet": "b79bca471a68b8c3742ec30d41a2b65ab1227152e81239faf00763188752c6ff" + } + }, + "county_groups.parquet": "7d7c150b5efd5596e0eaaed27abd6dc86137f08ff677c2606d402b9d165b87fa", + "state.txt": 
"bea4e03f71a1fa0045ae732aabad11fa541e5932b071c2369bb0d325e8cba5a0"
+}
diff --git a/src/kintsugi/_data/data.py b/src/kintsugi/_data/data.py
new file mode 100644
index 0000000..8dfca39
--- /dev/null
+++ b/src/kintsugi/_data/data.py
@@ -0,0 +1,154 @@
+import hashlib
+import importlib.resources
+import json
+import logging
+import os
+import shutil
+import tempfile
+from functools import lru_cache
+from pathlib import Path
+from typing import cast
+
+import requests
+from platformdirs import user_cache_path
+from requests.adapters import HTTPAdapter, Retry
+
+logger = logging.getLogger("kintsugi")
+logger.addHandler(logging.NullHandler())
+
+BASE_URL = "https://raw.githubusercontent.com/winter-again/kintsugi-data/main/data"
+
+
+class GetDatasetError(Exception):
+    pass
+
+
+# Nested mapping mirroring the data directory layout: directory names map to
+# nested tables and file names map to their sha256 hex digest
+type ChecksumTable = dict[str, str | ChecksumTable]
+
+
+@lru_cache(maxsize=1)
+def load_checksums() -> ChecksumTable:
+    """
+    Load the packaged checksum table (parsed once, then cached)
+    """
+    with (importlib.resources.files("kintsugi") / "_data/checksums.json").open() as f:
+        checksums = cast(ChecksumTable, json.load(f))
+
+    return checksums
+
+
+def get_dataset(file_name: str) -> Path:
+    """
+    Ensure valid dataset file is present in cache and return its path
+
+    Raises ValueError if file_name is not a file in the checksum table
+    """
+    # Validate before touching the cache. Going through get_checksum() also
+    # rejects directory names and paths that continue past a file entry
+    get_checksum(file_name)
+
+    file_cached = get_cache_dir() / file_name
+    if not file_cached.is_file() or not file_valid(file_name, file_cached):
+        logger.debug(
+            f"{file_name} not in cache or cached file is invalid. Getting file from kintsugi-data repository."
+        )
+        try:
+            download_dataset(file_name, file_cached)
+        except GetDatasetError as err:
+            logger.exception(f"Unable to get dataset: {file_name}", exc_info=err)
+            raise
+    else:
+        logger.debug(f"{file_name} already exists in cache")
+
+    return file_cached
+
+
+def get_cache_dir() -> Path:
+    """
+    Ensure cache directory exists and return its absolute path
+    """
+    cache_dir = (
+        Path(os.getenv("KINTSUGI_CACHE", user_cache_path("kintsugi-data")))
+        .expanduser()
+        .resolve()
+    )
+    try:
+        cache_dir.mkdir(parents=True, exist_ok=True)
+    except (OSError, PermissionError) as err:
+        logger.exception(
+            f"Error while setting up cache directory at {cache_dir}", exc_info=err
+        )
+        raise
+
+    return cache_dir
+
+
+def download_dataset(file_name: str, file_cached: Path) -> None:
+    """
+    Download dataset file, verify integrity via checksum, and save to cache
+    """
+    url = f"{BASE_URL}/{file_name}"
+    # 5 total retries, sleep for [0.0, 0.2, 0.4, 0.8, ...] seconds between retries after second try
+    retries = Retry(
+        total=5,
+        backoff_factor=0.1,
+        status_forcelist=[500, 502, 503, 504],
+        raise_on_status=True,
+    )
+    adapter = HTTPAdapter(max_retries=retries)
+
+    with requests.Session() as s:
+        s.mount("https://raw.githubusercontent.com/", adapter)
+        try:
+            res = s.get(url)
+        except requests.exceptions.RetryError as err:
+            raise GetDatasetError(
+                f"Error getting data file {file_name} despite retry strategy"
+            ) from err
+
+    with tempfile.NamedTemporaryFile("wb", delete_on_close=False) as fp:
+        fp.write(res.content)
+        fp.close()
+
+        tmp_file = Path(fp.name)
+        if not file_valid(file_name, tmp_file):
+            raise GetDatasetError(
+                f"Checksum for {file_name} is invalid. File not copied to cache"
+            )
+
+        logger.info(f"File {file_name} valid. Copying to cache")
+        file_cached.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copyfile(tmp_file, file_cached)
+
+
+def file_valid(file_name: str, file: Path) -> bool:
+    """
+    Validate dataset file via sha256 checksum
+    """
+    with open(file, "rb") as f:
+        digest = hashlib.file_digest(f, "sha256")
+
+    checksum = get_checksum(file_name)
+    return digest.hexdigest() == checksum
+
+
+def get_checksum(file_name: str) -> str:
+    """
+    Return the expected sha256 checksum for a dataset file
+
+    Raises ValueError if file_name is not a file in the checksum table
+    """
+    node: str | ChecksumTable = load_checksums()
+    for key in file_name.split("/"):
+        # A str node here means the path continues past a file entry
+        if not isinstance(node, dict) or key not in node:
+            raise ValueError(f"{file_name} not in dataset")
+        node = node[key]
+
+    # A dict node here means file_name names a directory, not a file
+    if not isinstance(node, str):
+        raise ValueError(f"{file_name} not in dataset")
+
+    return node
diff --git a/src/kintsugi/county_groups.py b/src/kintsugi/county_groups.py
index f8a9a42..bc52dd8 100644
--- a/src/kintsugi/county_groups.py
+++ b/src/kintsugi/county_groups.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import polars as pl
 
-from ._data import get_dataset
+from ._data.data import get_dataset
 from .population import county_pop
 
 
diff --git a/src/kintsugi/crosswalk.py b/src/kintsugi/crosswalk.py
index 6c92501..b6e4774 100644
--- a/src/kintsugi/crosswalk.py
+++ b/src/kintsugi/crosswalk.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import polars as pl
 
-from ._data import get_dataset
+from ._data.data import get_dataset
 
 
 @overload
diff --git a/src/kintsugi/geo.py b/src/kintsugi/geo.py
index 3075786..bd1fc53 100644
--- a/src/kintsugi/geo.py
+++ b/src/kintsugi/geo.py
@@ -2,7 +2,7 @@
 
 import geopandas as gpd
 
-from ._data import get_dataset
+from ._data.data import get_dataset
 
 type ShapefileYear = Literal[2020, 2024]
 
diff --git a/src/kintsugi/metadata.py b/src/kintsugi/metadata.py
index b15962a..828bbcc 100644
--- a/src/kintsugi/metadata.py
+++ b/src/kintsugi/metadata.py
@@ -3,7 +3,7 @@
 import pandas as pd
 import polars as pl
 
-from ._data import get_dataset
+from ._data.data import get_dataset
 
 
 @overload
diff --git a/src/kintsugi/neighbors.py b/src/kintsugi/neighbors.py
index 8bc9842..13fde41 100644
--- a/src/kintsugi/neighbors.py +++ b/src/kintsugi/neighbors.py @@ -5,7 +5,7 @@ import pandas as pd import polars as pl -from ._data import get_dataset +from ._data.data import get_dataset from .geo import ShapefileYear, county_geo type NeighborsYear = Literal[2010, 2023, 2024, 2025] diff --git a/src/kintsugi/population.py b/src/kintsugi/population.py index 558201c..5d5b57b 100644 --- a/src/kintsugi/population.py +++ b/src/kintsugi/population.py @@ -3,7 +3,7 @@ import pandas as pd import polars as pl -from ._data import get_dataset +from ._data.data import get_dataset type VintageYear = Literal[ 2016, diff --git a/tests/_data_test.py b/tests/_data/data_test.py similarity index 56% rename from tests/_data_test.py rename to tests/_data/data_test.py index 4578c50..563809b 100644 --- a/tests/_data_test.py +++ b/tests/_data/data_test.py @@ -7,13 +7,15 @@ import requests from platformdirs import user_cache_path -from kintsugi._data import ( - DATASETS, +from kintsugi._data.data import ( + ChecksumTable, GetDatasetError, download_dataset, file_valid, get_cache_dir, + get_checksum, get_dataset, + load_checksums, ) @@ -25,6 +27,25 @@ def _get_cache_dir() -> Path: yield _get_cache_dir +def test_get_checksum() -> None: + with patch("kintsugi._data.data.load_checksums") as mock_load_checksums: + mock_load_checksums.return_value = { + "test": { + "diamonds.csv": "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" + } + } + + assert ( + get_checksum("test/diamonds.csv") + == "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" + ) + + +def test_get_checksum_not_exists() -> None: + with pytest.raises(ValueError, match="not in dataset$"): + get_checksum("foo/bar.csv") + + def test_get_dataset(mock_get_cache_dir: Callable[[], Path]) -> None: """ Test basic dataset fetch. Cached file doesn't exist. 
@@ -34,14 +55,15 @@ def test_get_dataset(mock_get_cache_dir: Callable[[], Path]) -> None: assert not file_cached.is_file() with ( - patch.dict( - "kintsugi._data.DATASETS", - { - file_name: "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" - }, - ), - patch("kintsugi._data.get_cache_dir", mock_get_cache_dir), + patch("kintsugi._data.data.load_checksums") as mock_load_checksums, + patch("kintsugi._data.data.get_cache_dir", mock_get_cache_dir), ): + mock_load_checksums.return_value = { + "test": { + "diamonds.csv": "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" + } + } + assert get_dataset(file_name) == file_cached assert file_cached.is_file() @@ -50,8 +72,24 @@ def test_get_all_datasets(mock_get_cache_dir: Callable[[], Path]) -> None: """ Test fetching all real datasets. Cached files don't exist. """ - with patch("kintsugi._data.get_cache_dir", mock_get_cache_dir): - for file_name in DATASETS.keys(): + + def flatten_keys( + d: ChecksumTable, k_parent: str = "", sep: str = "/" + ) -> dict[str, str]: + items: list[tuple[str, str]] = [] + for k, v in d.items(): + k_next = k_parent + sep + k if k_parent else k + if isinstance(v, str): + items.append((k_next, v)) + else: + items.extend(flatten_keys(v, k_next, sep).items()) + + return dict(items) + + datasets = flatten_keys(load_checksums()) + + with patch("kintsugi._data.data.get_cache_dir", mock_get_cache_dir): + for file_name in datasets.keys(): file_cached = mock_get_cache_dir() / file_name assert not file_cached.is_file() assert get_dataset(file_name) == file_cached @@ -66,15 +104,16 @@ def test_get_dataset_no_cached(mock_get_cache_dir: Callable[[], Path]) -> None: file_cached = mock_get_cache_dir() / file_name with ( - patch.dict( - "kintsugi._data.DATASETS", - { - file_name: "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" - }, - ), - patch("kintsugi._data.get_cache_dir", mock_get_cache_dir), - patch("kintsugi._data.download_dataset") as mock_download_dataset, 
+ patch("kintsugi._data.data.load_checksums") as mock_load_checksums, + patch("kintsugi._data.data.get_cache_dir", mock_get_cache_dir), + patch("kintsugi._data.data.download_dataset") as mock_download_dataset, ): + mock_load_checksums.return_value = { + "test": { + "diamonds.csv": "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" + } + } + get_dataset(file_name) mock_download_dataset.assert_called_once_with(file_name, file_cached) @@ -92,15 +131,16 @@ def test_get_dataset_cache_invalid(mock_get_cache_dir: Callable[[], Path]) -> No assert file_cached.is_file() with ( - patch.dict( - "kintsugi._data.DATASETS", - { - file_name: "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" - }, - ), - patch("kintsugi._data.get_cache_dir", mock_get_cache_dir), - patch("kintsugi._data.download_dataset") as mock_download_dataset, + patch("kintsugi._data.data.load_checksums") as mock_load_checksums, + patch("kintsugi._data.data.get_cache_dir", mock_get_cache_dir), + patch("kintsugi._data.data.download_dataset") as mock_download_dataset, ): + mock_load_checksums.return_value = { + "test": { + "diamonds.csv": "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" + } + } + get_dataset(file_name) mock_download_dataset.assert_called_once_with(file_name, file_cached) @@ -134,14 +174,15 @@ def test_download_dataset(mock_get_cache_dir: Callable[[], Path]) -> None: file_cached = mock_get_cache_dir() / file_name with ( - patch.dict( - "kintsugi._data.DATASETS", - { - file_name: "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" - }, - ), - patch("kintsugi._data.get_cache_dir", mock_get_cache_dir), + patch("kintsugi._data.data.load_checksums") as mock_load_checksums, + patch("kintsugi._data.data.get_cache_dir", mock_get_cache_dir), ): + mock_load_checksums.return_value = { + "test": { + "diamonds.csv": "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" + } + } + download_dataset(file_name, file_cached) assert 
file_cached.is_file() @@ -151,9 +192,9 @@ def test_download_dataset_retry_error(mock_get_cache_dir: Callable[[], Path]) -> Test that RetryError during download_dataset() leads to GetDatasetError """ with ( - patch("kintsugi._data.get_cache_dir", mock_get_cache_dir), + patch("kintsugi._data.data.get_cache_dir", mock_get_cache_dir), patch( - "kintsugi._data.requests.Session.get", + "kintsugi._data.data.requests.Session.get", side_effect=requests.exceptions.RetryError, ), pytest.raises(GetDatasetError, match="^Error getting data file"), @@ -171,10 +212,11 @@ def test_file_valid_content_correct(mock_get_cache_dir: Callable[[], Path]) -> N with open(file_cached, "w") as f: f.write("Valid content for test.txt") - with patch.dict( - "kintsugi._data.DATASETS", - {file_name: "9227f3934df8fd3b9cde4a201195921c90b3d15d174af1a6831b11cdc78ee5b8"}, - ): + with patch("kintsugi._data.data.load_checksums") as mock_load_checksum: + mock_load_checksum.return_value = { + file_name: "9227f3934df8fd3b9cde4a201195921c90b3d15d174af1a6831b11cdc78ee5b8" + } + assert file_valid(file_name, file_cached) is True @@ -188,8 +230,11 @@ def test_file_valid_content_incorrect(mock_get_cache_dir: Callable[[], Path]) -> with open(file_cached, "w") as f: f.write("Invalid content for diamonds.csv") - with patch.dict( - "kintsugi._data.DATASETS", - {file_name: "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4"}, - ): + with patch("kintsugi._data.data.load_checksums") as mock_load_checksums: + mock_load_checksums.return_value = { + "test": { + "diamonds.csv": "9574730b03aba241d899c4a97511c5061b19358fab89510774fb6c24168345c4" + } + } + assert file_valid(file_name, file_cached) is False diff --git a/uv.lock b/uv.lock index 9ec42ad..b3323e7 100644 --- a/uv.lock +++ b/uv.lock @@ -200,7 +200,7 @@ wheels = [ [[package]] name = "kintsugi" -version = "0.9.0" +version = "0.10.0" source = { editable = "." } dependencies = [ { name = "pandas" },