Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "kintsugi"
version = "0.9.0"
version = "0.10.0"
description = "Commonly used datasets and functions"
readme = "README.md"
authors = [
Expand Down
164 changes: 0 additions & 164 deletions src/kintsugi/_data.py

This file was deleted.

Empty file added src/kintsugi/_data/__init__.py
Empty file.
74 changes: 74 additions & 0 deletions src/kintsugi/_data/checksums.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"county_neighbors": {
"county_adjacency2010.txt": "7edda309ad38a4dfc6a6c6c30e1753e5490b9c8a3aa4563188841989b4fe9a96",
"county_adjacency2023.txt": "2dbf9a8bae1b7c50db3a9db4864b073d2423c3e6e63518c97d649453e2809843",
"county_adjacency2024.txt": "20cffeb48ba46972fb949c453d3fbf62620115039c45c88bc80c094885650816",
"county_adjacency2025.txt": "27046a5f09f66205fd9869afbfb6dcae744e1c9b85cb19dd767c580211fb4575"
},
"crosswalk": {
"county": {
"ct_cou_to_cousub_crosswalk.txt": "13ad002564e30e6dcd1df112fdaa985f2cd0cdb908f7f3017cd2723db294cff8",
"sub-est2024_9.csv": "e5bd2cb1b10cf12d741572eef9e8eff19a1f6c5e08eb910d6f44f299c7ca83df"
},
"county_to_zip": {
"county_to_zip_2016.parquet": "f13d79b059b272c9ac4ff02dbe4e0e26d56acffad837adeb0f2d86d3101fd9ad",
"county_to_zip_2017.parquet": "009141988a3b902179d6beed80260faf2474a4cda8d6af0667d88ab4f0140b0b",
"county_to_zip_2018.parquet": "697425d6ffba4c8f95c80a961814d2291f61507336010349a4d78c10ce5cfdba",
"county_to_zip_2019.parquet": "3a06ca1a4d21638a4d014cbb4bc29b7d6b254a937dab1536058152d226073d62",
"county_to_zip_2020.parquet": "06cb2bfa55085d9c5f0f652628a61d1bf64fe14ea190df5bdc361844f5204250",
"county_to_zip_2021.parquet": "fdd59a6b8ab06a534d1c9c4765101105ce8bc08d9685eb9441c58701bffd96ae",
"county_to_zip_2022.parquet": "c3875e66bff03ff71ad73b89277fc892a294a5d71be08b4eb4ac2c431d715b83",
"county_to_zip_2023.parquet": "2f7c7564092c43d3a964ecf76bc61083916186d140c3cabed63aba5f91b56f1a",
"county_to_zip_2024.parquet": "907321abfa45437a13d4cfa9047b687456e883a096b863c8874d168fb8cc57c0",
"county_to_zip_2025.parquet": "e5008518558a4aca7ae2d40226f56bdcdcdf01bdb3879241a0b8d69a14097239"
},
"PUMA": {
"geocorr_puma_2010_to_county_2020_with_afact2.csv": "b66f23aa27be3daf1f6456607478848df3383d4c7111d6f935f792fc9542b807",
"puma2010-to-puma2020.csv": "bc6d0116aa39ea3b2af85095b49bac00f4612f591d2f4684cc2cc872c210a666"
},
"zip_to_county": {
"zip_to_county_2016.parquet": "982913988b4915e9c4787d4f639985635cae938b447944e0ef04868bb5c784d1",
"zip_to_county_2017.parquet": "e8dd16dcee07b3a6b52a30c49157d707a1f1e8f1cabf7da126cb094e88cfd0eb",
"zip_to_county_2018.parquet": "8ab399f97bb8d81eaabca7adf512e801c8958745fe6f0e82992313fbcfaaa942",
"zip_to_county_2019.parquet": "78ac9cdda7c5e00b82315ceaece4f3722da905e2ec4f68774a135d327c257435",
"zip_to_county_2020.parquet": "17f518b2a6de5ee58b134c0dc5fbf10f6d48c45b14fdbfcfe58a4269e08eaee2",
"zip_to_county_2021.parquet": "655c6ce8b257e942ac1c7dfa068d93ad2a1f139c9cd0c110efbce752ab27cea4",
"zip_to_county_2022.parquet": "9f125642aeb5aba3c8f2bdf616c186f975fc6afc1b5543d55204ad9b2a10b252",
"zip_to_county_2023.parquet": "89f50d28acb226a5777ededc1f8301dc1f028c65dd04b860355d23efe24b89c5",
"zip_to_county_2024.parquet": "8d76e09fa52923c0901c0afad26ba2d32075210bb0ab5b82caa14ce5465b3d80",
"zip_to_county_2025.parquet": "2683272f9759646d0a6234de06076f274852b85fcd2fd9e3d2dfcb3aa10def80"
}
},
"geo": {
"cb_2020_us_county_5m.zip": "187e7118304428e5450083beb375e67c2c516c58a01ce52db95aaf24f18df3ba",
"cb_2020_us_state_5m.zip": "aedc60e0d1924a9030ee6d39ff0ed27ad7d1b0bc86807ea809391a6b9008ffb3",
"cb_2024_us_county_5m.zip": "a867f8734059b45d1d54a0ba56189dd7e73c42eb451418fa56de44c35232614b",
"cb_2024_us_state_5m.zip": "c9db0e395c11a1f94a8017fde4f4c7cbee1dca6eb37ba8f1ccaab927df70885f"
},
"pop": {
"county_cc": {
"county_pop_2016.parquet": "74caad19bf5eed856ad9b6f63c65f7fceca612dec680d0768890de2265116607",
"county_pop_2017.parquet": "d93d027929861e115cf34b15f1ff7c697c8eaa327b73cd8132710a11860a63d5",
"county_pop_2018.parquet": "be3d3bab642a9f6f111c792a431f940b1753373194993885e4d47c136feed91a",
"county_pop_2019.parquet": "98801f118cd795c026a8269d5ac6674f98b9d47e0207c6a2721a5b7f4b6e5c08",
"county_pop_2020.parquet": "f1e4f282d297dc5498b6f839412c0815ca6f9e0a15d83d5d3867f2d70aa8413d",
"county_pop_2021.parquet": "3af369564ebb0e1fda25b440e5bf133ecb2d2eab60ab40f5db1f0a0955db713b",
"county_pop_2022.parquet": "977856eb5fffd508442ccedaa54c92e338b037135e5a9be55a03c7132863d9ca",
"county_pop_2023.parquet": "a4d66c302a557c1565ec9f43bad5ea9d4267576d1fbd17d8939e5a858a3d73e7",
"county_pop_2024.parquet": "12b16c7c20329a3df2f4120f6ec9a9a7313147fad0fd03bc360b1de5769c8abd"
},
"state": {
"state_pop_2016.parquet": "bac51c5ba4a9ff7305e92b3b2804c854fc20b9cbcf01156e5439d92668c0c81e",
"state_pop_2017.parquet": "6fb950b1b78409af8130317b08b437b742c0906ff9d5c38655c1189103b8dddc",
"state_pop_2018.parquet": "913fca35299028a842325000e58e33cd3912c1e900d480f00b468095398e57f8",
"state_pop_2019.parquet": "7ca2c87065f24857178bb33a7512cb799a92890596bac6fff1cbeb3c69f6fc36",
"state_pop_2020.parquet": "275b861e07f1c2327fb5382a28e84a5fb7ac4f896ae9f91b06612f6197af9611",
"state_pop_2021.parquet": "8b47a5c9fdca838954c8ddac8265ad00d590281c7b444019070c81b9942a727e",
"state_pop_2022.parquet": "ea113b3766c44bbf250e01b0b9509e810590119b3b9470b13dc347d43aed042b",
"state_pop_2023.parquet": "e96a982342510fe6a1ba90fc85a9bd6fbdd8687bceaf76e6e117606429d2d160",
"state_pop_2024.parquet": "b79bca471a68b8c3742ec30d41a2b65ab1227152e81239faf00763188752c6ff"
}
},
"county_groups.parquet": "7d7c150b5efd5596e0eaaed27abd6dc86137f08ff677c2606d402b9d165b87fa",
"state.txt": "bea4e03f71a1fa0045ae732aabad11fa541e5932b071c2369bb0d325e8cba5a0"
}
144 changes: 144 additions & 0 deletions src/kintsugi/_data/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import hashlib
import importlib.resources
import json
import logging
import os
import shutil
import tempfile
from functools import lru_cache
from pathlib import Path
from typing import cast

import requests
from platformdirs import user_cache_path
from requests.adapters import HTTPAdapter, Retry

# Package-level logger with a NullHandler so the library stays silent unless
# the consuming application configures logging (standard library convention).
logger = logging.getLogger("kintsugi")
logger.addHandler(logging.NullHandler())

# Root URL of the raw-file view of the companion kintsugi-data repository;
# dataset paths from checksums.json are appended to this.
BASE_URL = "https://raw.githubusercontent.com/winter-again/kintsugi-data/main/data"


class GetDatasetError(Exception):
    """Raised when a dataset file cannot be downloaded or fails checksum validation."""


type ChecksumTable = dict[str, ChecksumTable]


@lru_cache(maxsize=1)
def load_checksums() -> ChecksumTable:
    """
    Load the bundled checksum table from checksums.json, memoizing the result
    so the packaged resource is read at most once per process.
    """
    resource = importlib.resources.files("kintsugi") / "_data" / "checksums.json"
    with resource.open() as f:
        return cast(ChecksumTable, json.load(f))


def get_dataset(file_name: str) -> Path:
    """
    Ensure a valid dataset file is present in the cache and return its path.

    Parameters
    ----------
    file_name: slash-separated path of the dataset file within the data
        repository, e.g. "geo/cb_2020_us_county_5m.zip"

    Raises
    ------
    ValueError: if file_name is not a known dataset
    GetDatasetError: if the file cannot be downloaded or validated
    """
    # Validate the name up front. get_checksum walks the checksum table and
    # raises ValueError for unknown files, so we no longer duplicate the
    # traversal loop that used to live here.
    get_checksum(file_name)

    file_cached = get_cache_dir() / file_name
    if file_cached.is_file() and file_valid(file_name, file_cached):
        logger.debug(f"{file_name} already exists in cache")
        return file_cached

    logger.debug(
        f"{file_name} not in cache or cached file is invalid. Getting file from kintsugi-data repository."
    )
    try:
        download_dataset(file_name, file_cached)
    except GetDatasetError as err:
        logger.exception(f"Unable to get dataset: {file_name}", exc_info=err)
        raise

    return file_cached


def get_cache_dir() -> Path:
    """
    Ensure the cache directory exists and return its absolute path.

    The location defaults to the platform-appropriate user cache directory for
    "kintsugi-data" and can be overridden with the KINTSUGI_CACHE environment
    variable.

    Raises
    ------
    OSError: if the directory cannot be created
    """
    # os.getenv(name, default) evaluates the default eagerly, which called
    # user_cache_path() even when the override was set — resolve lazily instead.
    override = os.getenv("KINTSUGI_CACHE")
    base = override if override is not None else user_cache_path("kintsugi-data")
    cache_dir = Path(base).expanduser().resolve()
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
    except OSError as err:  # PermissionError is a subclass of OSError
        logger.exception(
            f"Error while setting up cache directory at {cache_dir}", exc_info=err
        )
        raise

    return cache_dir


def download_dataset(file_name: str, file_cached: Path) -> None:
    """
    Download a dataset file, verify its integrity via sha256 checksum, and
    copy it into the cache at file_cached.

    Parameters
    ----------
    file_name: slash-separated path of the file within the data repository
    file_cached: destination path inside the cache directory

    Raises
    ------
    GetDatasetError: if the download fails (after retries, on HTTP error, or
        on network error) or the downloaded file's checksum is invalid
    """
    url = f"{BASE_URL}/{file_name}"
    # 5 total retries with exponential backoff on transient server errors
    retries = Retry(
        total=5,
        backoff_factor=0.1,
        status_forcelist=[500, 502, 503, 504],
        raise_on_status=True,
    )
    adapter = HTTPAdapter(max_retries=retries)

    with requests.Session() as s:
        s.mount("https://raw.githubusercontent.com/", adapter)
        try:
            # requests has no default timeout; without one a stalled
            # connection would hang forever
            res = s.get(url, timeout=30)
            # Surface non-retried HTTP errors (e.g. 404) immediately instead
            # of writing an error body to disk and failing later with a
            # misleading checksum error
            res.raise_for_status()
        except requests.exceptions.RetryError as err:
            raise GetDatasetError(
                f"Error getting data file {file_name} despite retry strategy"
            ) from err
        except requests.exceptions.RequestException as err:
            raise GetDatasetError(
                f"Error getting data file {file_name}: {err}"
            ) from err

        with tempfile.NamedTemporaryFile("wb", delete_on_close=False) as fp:
            fp.write(res.content)
            fp.close()

            tmp_file = Path(fp.name)
            if not file_valid(file_name, tmp_file):
                raise GetDatasetError(
                    f"Checksum for {file_name} is invalid. File not copied to cache"
                )

            logger.info(f"File {file_name} valid. Copying to cache")
            file_cached.parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(tmp_file, file_cached)


def file_valid(file_name: str, file: Path) -> bool:
    """
    Check a dataset file on disk against its expected sha256 checksum.
    """
    with file.open("rb") as f:
        actual = hashlib.file_digest(f, "sha256")

    return actual.hexdigest() == get_checksum(file_name)


def get_checksum(file_name: str) -> str:
    """
    Return the expected sha256 hex digest for a dataset file.

    Parameters
    ----------
    file_name: slash-separated path of the dataset file, e.g.
        "pop/state/state_pop_2024.parquet"

    Raises
    ------
    ValueError: if file_name does not name a file in the checksum table
        (unknown segment, path into a leaf, or a path that stops at a group)
    """
    node: ChecksumTable | str = load_checksums()
    for key in file_name.split("/"):
        # A str node means the path descends past a leaf; a missing key means
        # an unknown segment. Both previously raised the wrong exception type
        # (TypeError / chained KeyError).
        if not isinstance(node, dict) or key not in node:
            raise ValueError(f"{file_name} not in dataset")
        node = node[key]

    # A dict here means file_name named a group, not a concrete file.
    # Raise explicitly rather than via assert, which is stripped under -O.
    if not isinstance(node, str):
        raise ValueError(f"{file_name} not in dataset")
    return node
Loading