WUR-AI · vdplasthijs · Mar 23, 2026 · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026
diff --git a/.env.example b/.env.example
@@ -10,6 +10,12 @@ TRAINER_PROFILE="gpu" # cpu/gpu/mps/ddp
 HF_HOME="${PROJECT_ROOT}/.cache/huggingface/" # set or will default to './.cache/huggingface/'
 DATA_DIR="${PROJECT_ROOT}/data/"  # set to your local data folder (for aether), or will default to '${PROJECT_ROOT}/data/'
 
+# Base cache directory for TESSERA.
+# GeoTessera registry/metadata is stored here; large raw source tiles go in the
+# raw/ subfolder. This folder can get very large — point it at an external drive
+# if needed.
+TESSERA_EMBEDDINGS_DIR="${PROJECT_ROOT}/data/cache/tessera/"
+
 # Working directories
 # STORAGE_MODE=# or "shared"
 # SHARED_CACHE=# or "/path/to/shared/.cache"

diff --git a/.gitignore b/.gitignore
@@ -228,3 +228,4 @@ notebooks/01-TvdP-tmp.ipynb
 */source/*
 *.tif # for now
 ..env.swp
+/data/yield_africa/
diff --git a/configs/callbacks/default.yaml b/configs/callbacks/default.yaml
@@ -19,4 +19,4 @@ early_stopping:
   mode: "min"
 
 model_summary:
-  max_depth: 2
+  max_depth: 1
diff --git a/configs/data/butterfly_coords_text.yaml b/configs/data/butterfly_coords_text.yaml
@@ -14,6 +14,7 @@ dataset:
 caption_builder:
   _target_: src.data.butterfly_caption_builder.ButterflyCaptionBuilder
   templates_fname: v3.json
+  concepts_fname: v1.json
   data_dir: ${paths.data_dir}/s2bms
   seed: ${seed}
 

diff --git a/configs/data/butterfly_full_param_example.yaml b/configs/data/butterfly_full_param_example.yaml
@@ -22,7 +22,8 @@ dataset:
 
 caption_builder:
   _target_: src.data.butterfly_caption_builder.ButterflyCaptionBuilder
-  templates_fname: caption_templates.json
+  templates_fname: v3.json
+  concepts_fname: v1.json
   data_dir: ${paths.data_dir}/s2bms
   seed: ${seed}
 

diff --git a/configs/data/yield_africa_all.yaml b/configs/data/yield_africa_all.yaml
@@ -0,0 +1,33 @@
+_target_: src.data.base_datamodule.BaseDataModule
+
+dataset:
+  _target_: src.data.yield_africa_dataset.YieldAfricaDataset
+  data_dir: ${paths.data_dir}
+  modalities:
+    coords: {}
+  use_target_data: true
+  use_features: true
+  use_aux_data: none
+  seed: ${seed}
+  cache_dir: ${paths.cache_dir}
+  # Country/year filters — set to a list to restrict, null to include all.
+  # countries and years select only the listed values;
+  # exclude_countries and exclude_years drop the listed values.
+  countries: ["BF", "BUR", "ETH", "KEN", "MAL", "RWA", "TAN", "ZAM"]
+  years: [2014, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024]
+  exclude_countries: null
+  exclude_years: null
+
+batch_size: 64
+num_workers: 0
+pin_memory: false
+
+# todo - use spatial split (pre-calculate and then load from file)
+#      - hold out country/year block for validation
+#      - or leave one country out for validation
+#      - normalize data by country (after filtering)
+
+split_mode: "random"
+train_val_test_split: [0.7, 0.15, 0.15]
+save_split: false
+seed: ${seed}
diff --git a/configs/data/yield_africa_loco.yaml b/configs/data/yield_africa_loco.yaml
@@ -0,0 +1,33 @@
+_target_: src.data.base_datamodule.BaseDataModule
+
+dataset:
+  _target_: src.data.yield_africa_dataset.YieldAfricaDataset
+  data_dir: ${paths.data_dir}
+  modalities:
+    coords: {}
+  use_target_data: true
+  use_features: true
+  use_aux_data: none
+  seed: ${seed}
+  cache_dir: ${paths.cache_dir}
+  # Include all countries and years so the split file determines the partition.
+  countries: ["BF", "BUR", "ETH", "KEN", "MAL", "RWA", "TAN", "ZAM"]
+  years: [2014, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024]
+  exclude_countries: null
+  exclude_years: null
+
+batch_size: 64
+num_workers: 0
+pin_memory: false
+
+# Leave-one-country-out split loaded from a pre-generated file.
+# Generate split files first:
+#   python src/data_preprocessing/yield_africa_loco_splits.py --data_dir <data_dir>
+#
+# Override saved_split_file_name at the command line to change the held-out country:
+#   python src/train.py experiment=yield_africa_tabular_loco \
+#     data.saved_split_file_name=split_loco_RWA.pth
+split_mode: "from_file"
+saved_split_file_name: "split_loco_KEN.pth"
+save_split: false
+seed: ${seed}
diff --git a/configs/data/yield_africa_spatial.yaml b/configs/data/yield_africa_spatial.yaml
@@ -0,0 +1,33 @@
+_target_: src.data.base_datamodule.BaseDataModule
+
+dataset:
+  _target_: src.data.yield_africa_dataset.YieldAfricaDataset
+  data_dir: ${paths.data_dir}
+  modalities:
+    coords: {}
+  use_target_data: true
+  use_features: true
+  use_aux_data: none
+  seed: ${seed}
+  cache_dir: ${paths.cache_dir}
+  # Include all countries and years so the split file determines the partition.
+  countries: ["BF", "BUR", "ETH", "KEN", "MAL", "RWA", "TAN", "ZAM"]
+  years: [2014, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024]
+  exclude_countries: null
+  exclude_years: null
+
+batch_size: 64
+num_workers: 0
+pin_memory: false
+
+# Spatial-cluster split loaded from a pre-generated file.
+# Generate split files first (produces 10 km, 25 km, and 50 km variants):
+#   python src/data_preprocessing/yield_africa_spatial_splits.py --data_dir <data_dir>
+#
+# Override saved_split_file_name at the command line to change the cluster distance:
+#   python src/train.py experiment=yield_africa_tabular_spatial \
+#     data.saved_split_file_name=split_spatial_10km.pth
+split_mode: "from_file"
+saved_split_file_name: "split_spatial_25km.pth"
+save_split: false
+seed: ${seed}
diff --git a/configs/data/yield_africa_tessera.yaml b/configs/data/yield_africa_tessera.yaml
@@ -0,0 +1,31 @@
+_target_: src.data.base_datamodule.BaseDataModule
+
+dataset:
+  _target_: src.data.yield_africa_dataset.YieldAfricaDataset
+  data_dir: ${paths.data_dir}
+  modalities:
+    tessera:
+      # size must match the tile_size used when running the preprocessing script.
+      # Default: 9 pixels (set by yield_africa_tessera_preprocess.py --tile_size).
+      size: 9
+      format: npy
+      # year is intentionally omitted: yield_africa fetches per-record year tiles
+      # via the preprocessing script rather than a single bulk-year download.
+  use_target_data: true
+  use_features: true
+  use_aux_data: none
+  seed: ${seed}
+  cache_dir: ${paths.cache_dir}
+  countries: ["BF", "BUR", "ETH", "KEN", "MAL", "RWA", "TAN", "ZAM"]
+  years: [2014, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024]
+  exclude_countries: null
+  exclude_years: null
+
+batch_size: 64
+num_workers: 0
+pin_memory: false
+
+split_mode: "random"
+train_val_test_split: [0.7, 0.15, 0.15]
+save_split: false
+seed: ${seed}
diff --git a/configs/data/yield_africa_tessera_loco.yaml b/configs/data/yield_africa_tessera_loco.yaml
@@ -0,0 +1,39 @@
+_target_: src.data.base_datamodule.BaseDataModule
+
+dataset:
+  _target_: src.data.yield_africa_dataset.YieldAfricaDataset
+  data_dir: ${paths.data_dir}
+  modalities:
+    tessera:
+      # size must match the tile_size used when running the preprocessing script.
+      # Default: 9 pixels (set by yield_africa_tessera_preprocess.py --tile_size).
+      size: 9
+      format: npy
+      # year is intentionally omitted: yield_africa fetches per-record year tiles
+      # via the preprocessing script rather than a single bulk-year download.
+  use_target_data: true
+  use_features: true
+  use_aux_data: none
+  seed: ${seed}
+  cache_dir: ${paths.cache_dir}
+  # Include all countries and years so the split file determines the partition.
+  countries: ["BF", "BUR", "ETH", "KEN", "MAL", "RWA", "TAN", "ZAM"]
+  years: [2014, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024]
+  exclude_countries: null
+  exclude_years: null
+
+batch_size: 64
+num_workers: 0
+pin_memory: false
+
+# Leave-one-country-out split loaded from a pre-generated file.
+# Generate split files first:
+#   python src/data_preprocessing/yield_africa_loco_splits.py --data_dir <data_dir>
+#
+# Override saved_split_file_name at the command line to change the held-out country:
+#   python src/train.py experiment=yield_africa_tessera_fusion_loco \
+#     data.saved_split_file_name=split_loco_RWA.pth
+split_mode: "from_file"
+saved_split_file_name: "split_loco_KEN.pth"
+save_split: false
+seed: ${seed}
diff --git a/configs/data/yield_africa_tessera_spatial.yaml b/configs/data/yield_africa_tessera_spatial.yaml
@@ -0,0 +1,39 @@
+_target_: src.data.base_datamodule.BaseDataModule
+
+dataset:
+  _target_: src.data.yield_africa_dataset.YieldAfricaDataset
+  data_dir: ${paths.data_dir}
+  modalities:
+    tessera:
+      # size must match the tile_size used when running the preprocessing script.
+      # Default: 9 pixels (set by yield_africa_tessera_preprocess.py --tile_size).
+      size: 9
+      format: npy
+      # year is intentionally omitted: yield_africa fetches per-record year tiles
+      # via the preprocessing script rather than a single bulk-year download.
+  use_target_data: true
+  use_features: true
+  use_aux_data: none
+  seed: ${seed}
+  cache_dir: ${paths.cache_dir}
+  # Include all countries and years so the split file determines the partition.
+  countries: ["BF", "BUR", "ETH", "KEN", "MAL", "RWA", "TAN", "ZAM"]
+  years: [2014, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024]
+  exclude_countries: null
+  exclude_years: null
+
+batch_size: 64
+num_workers: 0
+pin_memory: false
+
+# Spatial-cluster split loaded from a pre-generated file.
+# Generate split files first (produces 10 km, 25 km, and 50 km variants):
+#   python src/data_preprocessing/yield_africa_spatial_splits.py --data_dir <data_dir>
+#
+# Override saved_split_file_name at the command line to change the cluster distance:
+#   python src/train.py experiment=yield_africa_tessera_fusion_spatial \
+#     data.saved_split_file_name=split_spatial_10km.pth
+split_mode: "from_file"
+saved_split_file_name: "split_spatial_25km.pth"
+save_split: false
+seed: ${seed}
diff --git a/configs/experiment/yield_africa_coords_reg.yaml b/configs/experiment/yield_africa_coords_reg.yaml
@@ -0,0 +1,25 @@
+# @package _global_
+# configs/experiment/yield_africa_coords_reg.yaml
+# Variant: coordinates only (yield_geoclip_reg model), full dataset
+
+defaults:
+  - override /model: yield_geoclip_reg
+  - override /data: yield_africa_all
+  - override /metrics: yield_africa_regression
+
+tags: ["yield_africa", "coords_only", "regression"]
+seed: 12345
+
+trainer:
+  min_epochs: 1
+  max_epochs: 150
+
+data:
+  batch_size: 64
+
+logger:
+  wandb:
+    tags: ${tags}
+    group: "yield_africa"
+  aim:
+    experiment: "yield_africa"
diff --git a/configs/experiment/yield_africa_fusion_loco.yaml b/configs/experiment/yield_africa_fusion_loco.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+# configs/experiment/yield_africa_fusion_loco.yaml
+# GeoClip + tabular fusion model evaluated with leave-one-country-out split.
+# Default held-out country: KEN (largest, most representative test set).
+#
+# Generate split files first:
+#   python src/data_preprocessing/yield_africa_loco_splits.py --data_dir <data_dir>
+#
+# To evaluate on a different held-out country:
+#   python src/train.py experiment=yield_africa_fusion_loco \
+#     data.saved_split_file_name=split_loco_RWA.pth
+
+defaults:
+  - override /model: yield_fusion_reg
+  - override /data: yield_africa_loco
+  - override /metrics: yield_africa_regression
+
+tags: ["yield_africa", "fusion", "regression", "loco"]
+seed: 12345
+
+trainer:
+  min_epochs: 1
+  max_epochs: 150
+
+data:
+  batch_size: 64
+
+logger:
+  wandb:
+    tags: ${tags}
+    group: "yield_africa"
+  aim:
+    experiment: "yield_africa"
diff --git a/configs/experiment/yield_africa_fusion_reg.yaml b/configs/experiment/yield_africa_fusion_reg.yaml
@@ -0,0 +1,25 @@
+# @package _global_
+# configs/experiment/yield_africa_fusion_reg.yaml
+# Variant C: GeoClip + tabular fusion
+
+defaults:
+  - override /model: yield_fusion_reg
+  - override /data: yield_africa_all
+  - override /metrics: yield_africa_regression
+
+tags: ["yield_africa", "fusion", "regression"]
+seed: 12345
+
+trainer:
+  min_epochs: 1
+  max_epochs: 150
+
+data:
+  batch_size: 64
+
+logger:
+  wandb:
+    tags: ${tags}
+    group: "yield_africa"
+  aim:
+    experiment: "yield_africa"
diff --git a/configs/experiment/yield_africa_fusion_spatial.yaml b/configs/experiment/yield_africa_fusion_spatial.yaml
@@ -0,0 +1,33 @@
+# @package _global_
+# configs/experiment/yield_africa_fusion_spatial.yaml
+# GeoClip + tabular fusion model evaluated with a spatial-cluster split.
+# Default cluster distance: 25 km (split_spatial_25km.pth).
+#
+# Generate split files first:
+#   python src/data_preprocessing/yield_africa_spatial_splits.py --data_dir <data_dir>
+#
+# To evaluate at a different cluster distance:
+#   python src/train.py experiment=yield_africa_fusion_spatial \
+#     data.saved_split_file_name=split_spatial_10km.pth
+
+defaults:
+  - override /model: yield_fusion_reg
+  - override /data: yield_africa_spatial
+  - override /metrics: yield_africa_regression
+
+tags: ["yield_africa", "fusion", "regression", "spatial"]
+seed: 12345
+
+trainer:
+  min_epochs: 1
+  max_epochs: 150
+
+data:
+  batch_size: 64
+
+logger:
+  wandb:
+    tags: ${tags}
+    group: "yield_africa"
+  aim:
+    experiment: "yield_africa"