Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ dmypy.json
*.zip
# Test data
test_wildfire_vector
test_wildfire_classes_vector

# Ruff
.ruff_cache
Expand Down
74 changes: 51 additions & 23 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
{
"hashed_secret": "5204df45fc8c724684bbc61cd4107a726a6b9204",
"is_verified": false,
"line_number": 96,
"line_number": 95,
"type": "Secret Keyword",
"verified_result": null
},
Expand Down Expand Up @@ -153,72 +153,100 @@
],
"docs/examples/labels_to_data.ipynb": [
{
"hashed_secret": "ea7bf9657ae460e16f1f5a09be13331ef369e647",
"hashed_secret": "b4c98c8116583474b0ce624687462f16212d3650",
"is_verified": false,
"line_number": 155,
"line_number": 159,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "ccfcfcee2a2f7039e65ad59276d2df5daff216f3",
"hashed_secret": "2d55e648f84ed1a7ebb0049f13aaaa8feac4c656",
"is_verified": false,
"line_number": 654,
"line_number": 699,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "040bd74da546d56544a69f11bb291e0911b32415",
"hashed_secret": "f01d3fe6a4fac4d8acdd27cf69de63dfb3cb90ae",
"is_verified": false,
"line_number": 673,
"line_number": 1075,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "0af104c47bac851d6aea383735fb00c038075527",
"hashed_secret": "5c3f245e2a875b184c38437fc79312a460e46f64",
"is_verified": false,
"line_number": 759,
"line_number": 1094,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "28e34a8419ecb7d29f43f0bbc270b16d52e24318",
"hashed_secret": "6eb312da2c4cd2a6af47cfbfced9f7537d29e3ca",
"is_verified": false,
"line_number": 769,
"line_number": 1188,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "78084293061a8f6970060ebca17f3c888aa6145f",
"hashed_secret": "7a115f1ced67eb4bdb8c61bfa843ff6805e83483",
"is_verified": false,
"line_number": 968,
"line_number": 1276,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "21b0b4bef5092046ae7550022244e107908c622f",
"hashed_secret": "8eacdf4b2e1225fde9c0e61e379b6e64dc64f910",
"is_verified": false,
"line_number": 1033,
"line_number": 1286,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "9822c5a483cf4b897a24d2002c20eaf371177428",
"hashed_secret": "86cee4c3f71528e41a7ecdf652934f07d50b64a3",
"is_verified": false,
"line_number": 1052,
"line_number": 1388,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "c1dea1d5fe85fbf50d35ca3274348a534d0075e3",
"hashed_secret": "22c4272dc742fbf283fe572cd686439c29a6af06",
"is_verified": false,
"line_number": 1112,
"line_number": 1398,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "793b0144acd585c78b9f9ccd53855b47e199e1c0",
"hashed_secret": "2ef95f955e08b6e0cd9384f27bedac942b9bc98e",
"is_verified": false,
"line_number": 1122,
"line_number": 1644,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "dcad3d7e0c43d75e00654218eeddf560c94e0d05",
"is_verified": false,
"line_number": 1709,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "9030aa541bbe3cd57e8121bc4df512b24e28addf",
"is_verified": false,
"line_number": 1728,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "719cffec4ca2382c1ec69049456d9114dd50ac84",
"is_verified": false,
"line_number": 1788,
"type": "Base64 High Entropy String",
"verified_result": null
},
{
"hashed_secret": "cff83a50da163ee1acd619111e345870745574d9",
"is_verified": false,
"line_number": 1798,
"type": "Base64 High Entropy String",
"verified_result": null
}
Expand Down Expand Up @@ -280,7 +308,7 @@
{
"hashed_secret": "34fd2a7f5faa004cd1b9e4a22aa09b16d521661b",
"is_verified": false,
"line_number": 227,
"line_number": 288,
"type": "Hex High Entropy String",
"verified_result": null
}
Expand Down Expand Up @@ -314,7 +342,7 @@
{
"hashed_secret": "34fd2a7f5faa004cd1b9e4a22aa09b16d521661b",
"is_verified": false,
"line_number": 62,
"line_number": 68,
"type": "Hex High Entropy String",
"verified_result": null
}
Expand Down
17 changes: 17 additions & 0 deletions docs/download_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,23 @@ The shapefile `{dataset_name}_labels.shp` must contain a `datetime` field and `g
### Keep files: `keep_files`
Flag to preserve shapefiles in the working directory once they have been used by the download data step. Downloaded files will not be removed. Set to `True` to ensure shapefiles remain in place.

### Set No Data: `set_no_data`
Controls how label rasterization handles the background (no-data) pixels. When set to `True`, background pixels are assigned a no-data value (-1), allowing label class 0 to be used for actual labels. When set to `False` (default), background pixels are assigned value 0, which means label classes must start from 1 to avoid conflicts.

**Important:** If your labels use class 0 and `set_no_data=False`, TerraKit will raise a `TerrakitValueError` because class 0 would conflict with the background class. In this case, you must either:
- Set `set_no_data=True` to use -1 for background pixels, or
- Ensure your label classes start from 1 instead of 0

Example with multi-class labels using class 0:
```python
queried_data = download_data(
data_sources=config["download"]["data_sources"],
date_allowance=config["download"]["date_allowance"],
set_no_data=True, # Required when using class 0
transform=config["download"]["transform"],
)
```

## Data Connectors
Data connectors are classes which enable a user to search for data and query data from a particular data source using a common set of functions. Check out the [TerraKit Data Connectors](#data-connectors) section for more information.

Expand Down
974 changes: 825 additions & 149 deletions docs/examples/labels_to_data.ipynb

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion docs/process_labels.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,17 @@ EMSR801_AOI01_DEL_MONIT02_observedEventA_v1.json,2025-04-23
TerraKit will look for a file called `metadata.csv` in the `labels_folder`.

### label_type
`label_type`: Set to either `raster` or `vector`. TerraKit expects label data in either vector or raster format.
`label_type`: Set to either `raster` or `vector`. TerraKit expects label data in either vector or raster format.

### Multi-class Labels
For multi-class label datasets, TerraKit supports automatic class detection through filename patterns. Include `_CLASS_<number>_` in your label filenames to specify the class:

```
EMSR801_AOI01_DEL_MONIT02_CLASS_0_observedEventA_v1_2025-04-23.json
EMSR801_AOI01_DEL_MONIT02_CLASS_1_observedEventA_v1_2025-04-23.json
```

The class number will be extracted from the filename and used during rasterization. If no `_CLASS_` pattern is found, the label defaults to class 1. This enables visualization with distinct colors for each class and proper handling of multi-class segmentation tasks.

## Download example labels
To download a set of example labels, use the `rapid_mapping_geojson_downloader` function to get started:
Expand Down
2 changes: 1 addition & 1 deletion terrakit/chip/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
# SPDX-License-Identifier: Apache-2.0


from terrakit.chip import tiling
from terrakit.chip import tiling as tiling
69 changes: 60 additions & 9 deletions terrakit/download/download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class DownloadCls:
active (bool): Flag to activate/deactivate data download.
max_cloud_cover (int): Maximum cloud cover percentage for data selection.
keep_files (bool): Flag to keep shapefiles once they have been used. Downloaded files will not be removed.
set_no_data (bool): Flag to set non-labeled data as no-data. Default False
datetime_bbox_shp_file (str): Path to shapefile containing datetime and bounding boxes to be downloaded.
labels_shp_file (str): Path to shapefile containing labels.

Expand Down Expand Up @@ -107,6 +108,7 @@ def __init__(
active: bool = True,
max_cloud_cover: int = 80,
keep_files: bool = False,
set_no_data: bool = False,
datetime_bbox_shp_file: str = "./tmp/terrakit_curated_dataset_all_bboxes.shp",
labels_shp_file: str = "./tmp/terrakit_curated_dataset_labels.shp",
):
Expand All @@ -122,6 +124,7 @@ def __init__(
active (bool): Flag to activate/deactivate data download.
max_cloud_cover (int): Maximum cloud cover percentage for data selection.
keep_files (bool): Flag to keep shapefiles once they have been used. Downloaded files will not be removed.
set_no_data (bool): Flag to set non-labeled data as no-data. Default False
datetime_bbox_shp_file (str): Path to shapefile containing datetime bounding boxes.
labels_shp_file (str): Path to shapefile containing labels.
"""
Expand All @@ -132,6 +135,7 @@ def __init__(
self.active = active
self.max_cloud_cover = max_cloud_cover
self.keep_files = keep_files
self.set_no_data = set_no_data
self.datetime_bbox_shp_file = datetime_bbox_shp_file
self.labels_shp_file = labels_shp_file
self.data_sources = data_sources
Expand Down Expand Up @@ -226,9 +230,15 @@ def find_and_query_data_for_matching_dates(
)
grouped_bbox_gdf = self._read_shp_file(bbox_shp_file)

# Deduplicate by datetime and geometry to avoid downloading same tile multiple times
# This happens when multiple label classes exist for the same date/location
grouped_bbox_gdf_unique = grouped_bbox_gdf.drop_duplicates(
subset=["datetime", "geometry"], keep="first"
).reset_index(drop=True)

queried_data = []
for li in range(0, len(grouped_bbox_gdf)):
l = grouped_bbox_gdf.loc[li] # noqa
for li in range(0, len(grouped_bbox_gdf_unique)):
l = grouped_bbox_gdf_unique.loc[li] # noqa

from_date = (
datetime.strptime(l.datetime, "%Y-%m-%d")
Expand Down Expand Up @@ -301,9 +311,6 @@ def find_and_query_data_for_matching_dates(
f"Error while transforming data... {e}"
) from e

for t in da.time.values: # type: ignore[union-attr]
date = t.astype(str)[:10]

for i, t in enumerate(da.time.values): # type: ignore[union-attr]
date = t.astype(str)[:10]
queried_data.append(
Expand All @@ -316,7 +323,9 @@ def find_and_query_data_for_matching_dates(
logging.info(f"Queried data: {queried_data}")
return queried_data

def rasterize_vectors_to_the_queried_data(self, queried_data: list) -> int:
def rasterize_vectors_to_the_queried_data(
self, queried_data: list, set_no_data: bool
) -> int:
"""
Rasterize vector data to the queried raster data.

Expand All @@ -332,23 +341,58 @@ def rasterize_vectors_to_the_queried_data(self, queried_data: list) -> int:
label_gdf = self._read_shp_file(labels_shp_file)

logging.info("Rasterizing vectors to the queried data")

# Verify label classes
if "labelclass" in label_gdf.columns:
label_classes = np.sort(label_gdf["labelclass"].unique())
logger.info(f"Label classes being used: {label_classes}")
if not set_no_data and 0 in label_classes:
raise TerrakitValueError(
"Labels are using class 0 which conflicts with the background class. "
"Either use set_no_data=True or ensure label classes start from 1.",
details={
"label_classes": label_classes.tolist(),
"set_no_data": set_no_data,
},
)

start_index = 0 if set_no_data else 1
# Check if continuous and otherwise provide a warning
if not (
start_index in label_classes
and label_classes[-1] == start_index + len(label_classes) - 1
):
logger.warning(
"Label classes are not a continuous list of indices, is this correct?"
)

background_value = -1 if set_no_data else 0 # 0 is rasterize default
file_save_count = 0
for q in queried_data:
with rasterio.open(q, "r") as src:
out_meta = src.meta
out_meta.update({"count": 1})
label_column = label_gdf.get(
"labelclass", [1] * len(label_gdf)
) # Default 1 if not set
image = rasterio.features.rasterize(
((g, 1) for g in label_gdf.geometry),
(
(g, class_id)
for g, class_id in zip(label_gdf.geometry, label_column)
),
out_shape=src.shape,
transform=src.transform,
fill=background_value,
)
if set_no_data:
out_meta.update({"nodata": -1})
# Write the burned image to geotiff
logging.info(f"Writing to {q.replace('.tif', '')}_labels.tif")
with rasterio.open(
f"{q.replace('.tif', '')}_labels.tif", "w", **out_meta
) as dst:
dst.write(image, indexes=1)
file_save_count = +1
file_save_count += 1
return file_save_count


Expand All @@ -362,6 +406,7 @@ def download_validation(
datetime_bbox_shp_file: str = "./tmp/terrakit_curated_dataset_all_bboxes.shp",
labels_shp_file: str = "./tmp/terrakit_curated_dataset_labels.shp",
keep_files: bool = False,
set_no_data: bool = False,
) -> tuple[DownloadCls, DownloadModel]:
"""
Validate and initialize the download process.
Expand All @@ -376,6 +421,7 @@ def download_validation(
datetime_bbox_shp_file (str): Path to shapefile containing datetime bounding boxes.
labels_shp_file (str): Path to shapefile containing labels.
keep_files (bool): Flag to keep shapefiles once they have been used. Downloaded files will not be removed.
set_no_data (bool): Flag to set non-labeled data as no-data. Default False.

Returns:
DownloadCls: Initialized DownloadCls object.
Expand Down Expand Up @@ -445,6 +491,7 @@ def download_validation(
max_cloud_cover=max_cloud_cover,
datetime_bbox_shp_file=datetime_bbox_shp_file,
keep_files=keep_files,
set_no_data=set_no_data,
data_sources=data_source_list,
date_allowance=date_allowance,
labels_shp_file=labels_shp_file,
Expand Down Expand Up @@ -473,6 +520,7 @@ def download_data(
datetime_bbox_shp_file: str = "./tmp/terrakit_curated_dataset_all_bboxes.shp",
labels_shp_file: str = "./tmp/terrakit_curated_dataset_labels.shp",
keep_files: bool = False,
set_no_data: bool = False,
) -> list:
"""
Download and preprocess geospatial data.
Expand All @@ -488,6 +536,7 @@ def download_data(
datetime_bbox_shp_file (str): Path to shapefile containing datetime bounding boxes.
labels_shp_file (str): Path to shapefile containing labels.
keep_files (bool): Flag to keep shapefiles once they have been used. Downloaded files will not be removed.
set_no_data (bool): Flag to set non-labeled data as no-data. Default False

Returns:
list: List of queried data file paths.
Expand Down Expand Up @@ -551,6 +600,7 @@ def download_data(
datetime_bbox_shp_file=datetime_bbox_shp_file,
labels_shp_file=labels_shp_file,
keep_files=keep_files,
set_no_data=set_no_data,
)

logging.info("Listing collections..")
Expand All @@ -573,7 +623,8 @@ def download_data(

# Rasterize
file_save_count = download.rasterize_vectors_to_the_queried_data(
queried_data=queried_data
queried_data=queried_data,
set_no_data=set_no_data,
)

if file_save_count > 0:
Expand Down
Loading