-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_preprocessing_llm.py
More file actions
54 lines (46 loc) · 1.76 KB
/
run_preprocessing_llm.py
File metadata and controls
54 lines (46 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Preprocessing driver for LLM-assisted geocoding batches.

This script converts the initial CSV batch files of LLM-geocoded data into
new GPKG batch files. Each GPKG batch file is specific to one type of
LLM-geocoded geometry: GADM, OSM, or Wikidata.
"""
import tomllib
import logging
from pathlib import Path
from validation import preprocessing as pp
from validation.io import load_emdat_archive
# Read the run configuration; tomllib requires a binary file handle.
with open("config.toml", "rb") as config_file:
    config = tomllib.load(config_file)

# Configure the root logger from the [logging] table of the configuration.
logging.basicConfig(
    **{
        option: config["logging"][option]
        for option in (
            "level", "filename", "filemode", "style", "format", "datefmt"
        )
    }
)
def list_disno_with_gaul(emdat_archive_path: Path) -> list[str]:
    """List the ``DisNo.`` values with GAUL admin units in the EM-DAT archive.

    The archive is loaded through ``load_emdat_archive`` with
    ``geocoded_only=True`` and restricted to the ``DisNo.`` column; that
    column is returned as a plain Python list.
    """
    archive = load_emdat_archive(
        emdat_archive_path,
        use_columns=["DisNo."],
        geocoded_only=True,
    )
    return archive['DisNo.'].to_list()
def main(csv_file_dir: str = 'Q:/Data/emdat_geocoding/GEOEMDAT') -> None:
    """Create the LLM-geocoded batch files configured in ``config.toml``.

    Args:
        csv_file_dir: Directory handed to ``pp.make_llm_geocoded_batches``
            as ``csv_file_dir``. The default is the path that was
            previously fixed inside this function, so calling ``main()``
            without arguments behaves exactly as before.
    """
    # DisNo. values from the archive, forwarded below as ``keep_disno``.
    disno_with_gaul = list_disno_with_gaul(config["path"]["emdat_archive_path"])

    # Create the output directory (and any missing parents) up front.
    output_dir = Path(config["path"]["batch_dir"])
    output_dir.mkdir(parents=True, exist_ok=True)

    pp.make_llm_geocoded_batches(
        csv_file_dir=csv_file_dir,
        columns_to_keep=config["index"]["llm_columns_to_keep"],
        batch_numbers=config["index"]["batch_numbers"],
        keep_disno=disno_with_gaul,
        output_dir=output_dir,
        geometry_columns=config["index"]["llm_geom_columns"],
    )
if __name__ == '__main__':
    logging.info("Running preprocessing script...".upper())
    try:
        main()
    except Exception as e:
        # Write the full traceback to the configured log, then exit with a
        # non-zero status: swallowing the exception here would leave the
        # exit status at 0 and hide the failure from shells and schedulers.
        logging.exception("Exception occurred: %s", e)
        raise SystemExit(1) from e