Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ requires-python = ">=3.10"
dependencies = [
"cffi",
"metkitlib",
"findlibs"
"findlibs",
"pyyaml",
"requests",
"platformdirs",
]

[tool.setuptools.dynamic]
Expand Down
2 changes: 2 additions & 0 deletions python/pymetkit/src/pymetkit/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from .pymetkit import *
from .pymetkit import ParamDB
from .models import ParameterEntry
246 changes: 246 additions & 0 deletions python/pymetkit/src/pymetkit/generate_parameter_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
"""
Standalone script to generate:
- parameter_metadata.yaml — one entry per ECMWF parameter
- unit_metadata.yaml — one entry per ECMWF unit

Usage
-----
python -m pymetkit.generate_parameter_metadata
# or directly:
python generate_parameter_metadata.py
"""

import json
import requests
import yaml
from pathlib import Path

# Endpoints of the ECMWF parameter database API (v1) queried by this script.
PARAM_URL = "https://codes.ecmwf.int/parameter-database/api/v1/param/"
UNIT_URL = "https://codes.ecmwf.int/parameter-database/api/v1/unit/"
ORIGIN_URL = "https://codes.ecmwf.int/parameter-database/api/v1/origin/"

# Output paths: the canonical location is share/metkit/ at the repo root.
# Path(__file__).parents[0] is this module's directory, so parents[4] is four
# levels above that directory:
#   python/pymetkit/src/pymetkit/ -> python/pymetkit/src/ -> python/pymetkit/
#   -> python/ -> <repo_root>
# NOTE(review): this presumably only resolves to the repo root when the module
# lives in a source checkout with this exact layout -- confirm the script is
# never run from an installed copy.
_REPO_ROOT = Path(__file__).parents[4]
PARAM_OUTPUT = _REPO_ROOT / "share" / "metkit" / "parameter_metadata.yaml"
UNIT_OUTPUT = _REPO_ROOT / "share" / "metkit" / "unit_metadata.yaml"
SCHEMA_OUTPUT = _REPO_ROOT / "share" / "metkit" / "parameter_entry_schema.json"

# Timeout in seconds applied to every HTTP request made to the ECMWF
# parameter database API, so a stalled endpoint cannot block the script forever.
REQUEST_TIMEOUT = 30

Comment on lines +18 to +33
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The generator writes parameter_metadata.yaml and unit_metadata.yaml next to the Python module (Path(__file__).parent), but the PR adds the YAML under share/metkit/ and ParamDB’s fallback search also expects share/metkit/parameter_metadata.yaml. Regenerating with this script will therefore write to a different location than the committed data. Align the output paths with the repository’s canonical YAML location (or update the rest of the codebase to consume the module-adjacent files).

Copilot uses AI. Check for mistakes.

# ---------------------------------------------------------------------------
# Units
# ---------------------------------------------------------------------------


def fetch_units(url: str = UNIT_URL) -> tuple[list[dict], dict[int, str]]:
"""
Fetch all units from the ECMWF parameter database API.

Returns
-------
units : list[dict]
Normalised unit records ready to be written to unit_metadata.yaml.
unit_map : dict[int, str]
Mapping of unit id -> unit name string for use in parameter enrichment.
"""
print(f"Fetching units from {url} ...")
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
raw_units = response.json()
Comment on lines +51 to +54
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both API calls use requests.get(...) without a timeout. If the endpoint stalls, this script can block indefinitely. Consider providing a default timeout (and possibly a retry strategy) to make regeneration more robust.

Copilot uses AI. Check for mistakes.
print(f" Received {len(raw_units)} units.")

units = []
unit_map: dict[int, str] = {}

for raw in raw_units:
uid = int(raw["id"])
# The API may use 'name', 'symbol', or 'label' for the unit string
name = raw.get("name") or raw.get("symbol") or raw.get("label") or ""

entry = {"id": uid}
# Preserve all fields the API returns, but ensure id comes first
for key, value in raw.items():
if key == "id":
continue
entry[key] = value
# Always emit a canonical 'name' field so unit_metadata.yaml has a
# stable schema regardless of which key the API uses (name/symbol/label)
entry["name"] = name

Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fetch_units() computes a normalised unit string (name = raw.get("name") or raw.get("symbol") ...) for unit_map, but the YAML output preserves the raw keys and does not ensure there is a canonical name field. If the API returns symbol/label instead of name, unit_metadata.yaml will lack the expected key. Consider explicitly setting entry["name"] = name (and/or dropping the alternate keys) to keep the output schema stable.

Suggested change
# Always emit a canonical name field so unit_metadata.yaml has a stable schema
entry["name"] = name

Copilot uses AI. Check for mistakes.
units.append(entry)
unit_map[uid] = name

units.sort(key=lambda e: e["id"])
return units, unit_map


def write_unit_yaml(units: list[dict], output_path: Path = UNIT_OUTPUT) -> None:
    """Write the unit list to a YAML file.

    Parameters
    ----------
    units:
        Unit records to serialise; they are written in the given order and
        the key order inside each record is preserved.
    output_path:
        Destination file. It is overwritten if it already exists.
    """
    # Open explicitly as UTF-8: ``allow_unicode=True`` makes PyYAML emit
    # non-ASCII characters verbatim, so relying on the platform default
    # encoding could fail or produce a file that is not valid UTF-8.
    with output_path.open("w", encoding="utf-8") as fh:
        yaml.dump(
            units,
            fh,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
    print(f"Written {len(units)} units to {output_path}")


# ---------------------------------------------------------------------------
# Origins
# ---------------------------------------------------------------------------


def fetch_origin_map(
    origin_url: str = ORIGIN_URL,
    param_url: str = PARAM_URL,
) -> tuple[dict[int, dict], dict[int, list[int]]]:
    """Fetch all origins and build a reverse map of param_id -> [origin_ids].

    The ``/param/`` endpoint does not include an ``origin`` field in its
    response, so we derive the mapping by querying each origin's filtered
    parameter list via ``/param/?origin=<id>``.

    Parameters
    ----------
    origin_url:
        Endpoint returning the list of origins.
    param_url:
        Parameter endpoint; queried once per origin with ``origin=<id>``.

    Returns
    -------
    origins : dict[int, dict]
        Mapping of origin_id -> origin record as returned by the API.
    param_origin_map : dict[int, list[int]]
        Mapping of param_id -> sorted list of origin_ids that include it.

    Raises
    ------
    requests.HTTPError
        If any of the requests returns an error status.
    """
    print(f"Fetching origins from {origin_url} ...")
    response = requests.get(origin_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    raw_origins = response.json()
    print(f" Received {len(raw_origins)} origins.")

    # Coerce ids to int so the keys match the annotated types even if the
    # API serialises them as strings.
    origins: dict[int, dict] = {int(o["id"]): o for o in raw_origins}
    param_origin_map: dict[int, list[int]] = {}

    for origin in raw_origins:
        oid = int(origin["id"])
        abbr = origin.get("abbreviation", str(oid))
        print(f" Fetching params for origin={oid} ({abbr}) ...")
        r = requests.get(
            param_url, params={"origin": oid}, timeout=REQUEST_TIMEOUT
        )
        r.raise_for_status()
        origin_params = r.json()
        print(f" {len(origin_params)} params.")
        for p in origin_params:
            param_origin_map.setdefault(int(p["id"]), []).append(oid)

    # Sort each origin list for deterministic output
    for origin_ids in param_origin_map.values():
        origin_ids.sort()

    return origins, param_origin_map


# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------


def fetch_parameters(
    url: str = PARAM_URL,
    unit_map: "dict[int, str] | None" = None,
    param_origin_map: "dict[int, list[int]] | None" = None,
) -> list[dict]:
    """Fetch all parameters from the ECMWF parameter database API.

    Parameters
    ----------
    url:
        The parameter API endpoint.
    unit_map:
        Mapping of unit_id -> unit name string, used to resolve the
        ``units`` field. When ``None`` the units field is left empty.
    param_origin_map:
        Mapping of param_id -> list of origin_ids, built by
        :func:`fetch_origin_map`. When provided, each entry gains an
        ``origin_ids`` field containing the sorted list of WMO originating
        centre IDs that include this parameter. When ``None`` the field
        is omitted.

    Returns
    -------
    list[dict]
        One normalised record per parameter, sorted by ``id``.

    Raises
    ------
    requests.HTTPError
        If the request returns an error status.
    """
    print(f"Fetching parameters from {url} ...")
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    params = response.json()
    print(f" Received {len(params)} parameters.")

    result = []
    for raw in params:
        # Resolve short name (API may return 'shortName', 'short_name', or 'shortname')
        shortname = (
            raw.get("shortname") or raw.get("shortName") or raw.get("short_name") or ""
        )

        # Resolve units via unit_map if available
        unit_id = raw.get("unit_id")
        if unit_map and unit_id is not None:
            units = unit_map.get(int(unit_id), "")
        else:
            units = ""

        pid = int(raw["id"])

        # ``dict.get(key, default)`` only covers a missing key; use ``or`` so
        # an explicit JSON null is normalised to the empty value as well.
        entry = {
            "id": pid,
            "shortname": shortname,
            "longname": raw.get("name") or "",
            "units": units,
            "description": raw.get("description") or "",
            # access_ids indicates dissemination availability; preserve as-is.
            "access_ids": raw.get("access_ids") or [],
        }

        # Attach origin_ids derived from the per-origin filtered queries.
        if param_origin_map is not None:
            entry["origin_ids"] = param_origin_map.get(pid, [])

        result.append(entry)

    result.sort(key=lambda e: e["id"])
    return result


def write_param_yaml(params: list[dict], output_path: Path = PARAM_OUTPUT) -> None:
    """Write the parameter list to a YAML file.

    Parameters
    ----------
    params:
        Parameter records to serialise; they are written in the given order
        and the key order inside each record is preserved.
    output_path:
        Destination file. It is overwritten if it already exists.
    """
    # Open explicitly as UTF-8: ``allow_unicode=True`` makes PyYAML emit
    # non-ASCII characters verbatim, so relying on the platform default
    # encoding could fail or produce a file that is not valid UTF-8.
    with output_path.open("w", encoding="utf-8") as fh:
        yaml.dump(
            params,
            fh,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
    print(f"Written {len(params)} parameters to {output_path}")


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Resolve ParameterEntry first so an import problem surfaces before any
    # network traffic or file writes. The relative form only works when the
    # module runs as part of the package
    # (``python -m pymetkit.generate_parameter_metadata``); when the file is
    # executed directly there is no parent package, so import the sibling
    # ``models`` module from the script's own directory instead.
    # The import is kept local to the entry point so that merely importing
    # this module does not load the model.
    if __package__:
        from .models import ParameterEntry
    else:
        from models import ParameterEntry

    units, unit_map = fetch_units()
    write_unit_yaml(units)

    # Only the param_id -> origin_ids map is needed here; the origin records
    # themselves are discarded.
    _, param_origin_map = fetch_origin_map()

    parameters = fetch_parameters(unit_map=unit_map, param_origin_map=param_origin_map)
    write_param_yaml(parameters)

    # Write the JSON schema for ParameterEntry so downstream tools can validate YAML.
    schema = ParameterEntry.model_json_schema()
    SCHEMA_OUTPUT.write_text(json.dumps(schema, indent=2), encoding="utf-8")
    print(f"Written JSON schema to {SCHEMA_OUTPUT}")
117 changes: 117 additions & 0 deletions python/pymetkit/src/pymetkit/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
Pydantic models for pymetkit parameter metadata.
"""

from __future__ import annotations

from typing import Annotated, Any

from pydantic import BaseModel, Field, field_validator, model_validator


class ParameterEntry(BaseModel):
    """
    A single entry from the ECMWF parameter database.

    Accepts both the canonical field names (``shortname``, ``longname``) and
    the raw aliases that may appear in YAML or API responses; the aliases are
    mapped onto the canonical names by :meth:`_normalise_aliases`.  Keys that
    are not declared as fields are kept on the model (``extra="allow"``).
    """

    model_config = {"populate_by_name": True, "extra": "allow"}

    id: int = Field(..., description="Numeric ECMWF/GRIB parameter ID")

    shortname: str = Field(
        ...,
        alias="shortname",
        description="Short name (e.g. 't', 'tp')",
    )

    longname: str = Field(
        ...,
        alias="longname",
        description="Human-readable long name (e.g. 'Temperature')",
    )

    units: str = Field(
        default="unknown",
        description="Physical units string (e.g. 'K', 'm s**-1')",
    )

    origin_ids: list[int] = Field(
        default_factory=list,
        description="WMO originating centre IDs associated with this parameter",
    )

    access_ids: list[str] = Field(
        default_factory=list,
        description="Access category tags (e.g. 'dissemination', 'research')",
    )

    # ------------------------------------------------------------------
    # Validators
    # ------------------------------------------------------------------

    @field_validator("id", mode="before")
    @classmethod
    def coerce_id_to_int(cls, v: Any) -> int:
        """Convert the raw id to ``int``, rejecting values that cannot be converted."""
        try:
            return int(v)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"'id' must be convertible to int, got {v!r}") from exc

    @field_validator("shortname", mode="before")
    @classmethod
    def normalise_shortname_key(cls, v: Any) -> str:
        """Return the short name as ``str``; ``None`` or blank input is rejected."""
        if v is None or str(v).strip() == "":
            raise ValueError("'shortname' must be a non-empty string")
        return str(v)

    @field_validator("longname", mode="before")
    @classmethod
    def normalise_longname_key(cls, v: Any) -> str:
        """Return the long name as ``str``; ``None`` or blank input is rejected."""
        if v is None or str(v).strip() == "":
            raise ValueError("'longname' must be a non-empty string")
        return str(v)

    @field_validator("units", mode="before")
    @classmethod
    def default_empty_units(cls, v: Any) -> str:
        """Return the units as ``str``, mapping ``None`` or blank input to ``"unknown"``."""
        if v is None or str(v).strip() == "":
            return "unknown"
        return str(v)

    @field_validator("origin_ids", mode="before")
    @classmethod
    def coerce_origin_ids(cls, v: Any) -> list[int]:
        """Return the origin ids as a list of ``int``; ``None`` becomes ``[]``."""
        if v is None:
            return []
        # A bare scalar (e.g. ``origin_ids: 98``) is treated as a one-element
        # list; in particular a string must not be iterated character by
        # character, which would turn "98" into [9, 8].
        if isinstance(v, (str, bytes)) or not hasattr(v, "__iter__"):
            v = [v]
        return [int(x) for x in v]

    @field_validator("access_ids", mode="before")
    @classmethod
    def coerce_access_ids(cls, v: Any) -> list[str]:
        """Return the access tags as a list of ``str``; ``None`` becomes ``[]``."""
        if v is None:
            return []
        # A bare scalar (e.g. ``access_ids: dissemination``) is treated as a
        # one-element list rather than being split into single characters.
        if isinstance(v, (str, bytes)) or not hasattr(v, "__iter__"):
            v = [v]
        return [str(x) for x in v]

    @model_validator(mode="before")
    @classmethod
    def _normalise_aliases(cls, data: Any) -> Any:
        """Accept legacy key spellings from raw YAML / API payloads.

        Works on a shallow copy so the caller's mapping is not modified.
        Non-dict input is returned unchanged.  A canonical key that is
        already present wins; otherwise the first matching alias is renamed
        to the canonical key.
        """
        if not isinstance(data, dict):
            return data
        d = dict(data)
        # shortname aliases
        if "shortname" not in d:
            for alias in ("shortName", "short_name"):
                if alias in d:
                    d["shortname"] = d.pop(alias)
                    break
        # longname aliases
        if "longname" not in d:
            for alias in ("longName", "long_name", "name"):
                if alias in d:
                    d["longname"] = d.pop(alias)
                    break
        return d
Loading
Loading