diff --git a/backend/app/features/ingestion/ingest.py b/backend/app/features/ingestion/ingest.py index d69cc72a..d9a8a15b 100644 --- a/backend/app/features/ingestion/ingest.py +++ b/backend/app/features/ingestion/ingest.py @@ -5,6 +5,7 @@ - A run is "successful" only if all required metadata files are present. - case_name (from timing files) is the identity for Case grouping. - The first successful run per case is the reference simulation. + - CASE_HASH is stored as execution metadata and used for drift warnings. - Each run creates a Simulation linked to a Case via case_id. - Reference simulations have run_config_deltas = None. - Non-reference runs store config differences vs the reference. @@ -99,6 +100,7 @@ class SimulationCreateDraft: last_updated_by: UUID | None hpc_username: str | None run_config_deltas: dict[str, dict[str, str | None]] | None = None + case_hash: str | None = None def ingest_archive( @@ -172,6 +174,8 @@ def ingest_archive( errors: list[dict[str, str]] = [] reference_cache: dict[str, SimulationConfigSnapshot] = {} persisted_reference_cache: dict[UUID, SimulationConfigSnapshot | None] = {} + case_hash_cache: dict[str, str] = {} + persisted_case_hash_cache: dict[UUID, str | None] = {} for parsed_simulation in parsed_simulations: try: @@ -180,6 +184,8 @@ def ingest_archive( db=db, reference_cache=reference_cache, persisted_reference_cache=persisted_reference_cache, + case_hash_cache=case_hash_cache, + persisted_case_hash_cache=persisted_case_hash_cache, ) if is_duplicate: @@ -221,6 +227,8 @@ def _process_simulation_for_ingest( db: Session, reference_cache: dict[str, SimulationConfigSnapshot], persisted_reference_cache: dict[UUID, SimulationConfigSnapshot | None], + case_hash_cache: dict[str, str], + persisted_case_hash_cache: dict[UUID, str | None], ) -> tuple[SimulationCreate | None, bool]: """Process one parsed simulation entry. @@ -254,6 +262,13 @@ def _process_simulation_for_ingest( prevalidated_draft = _prevalidate_simulation_create(parsed_simulation, machine_id) case = _resolve_case(parsed_simulation, case_name, db) + _track_case_hash_observation( + parsed_simulation=parsed_simulation, + case=case, + case_hash_cache=case_hash_cache, + persisted_case_hash_cache=persisted_case_hash_cache, + db=db, + ) simulation = _build_simulation_create( parsed_simulation=parsed_simulation, @@ -267,6 +282,70 @@ def _process_simulation_for_ingest( return simulation, False +def _track_case_hash_observation( + parsed_simulation: ParsedSimulation, + case: Case, + case_hash_cache: dict[str, str], + persisted_case_hash_cache: dict[UUID, str | None], + db: Session, +) -> None: + """Track CASE_HASH drift without changing case-name grouping.""" + current_hash = parsed_simulation.case_hash + if not current_hash: + return + + baseline_hash = _get_case_hash_baseline( + case=case, + case_hash_cache=case_hash_cache, + persisted_case_hash_cache=persisted_case_hash_cache, + db=db, + ) + if baseline_hash is None: + case_hash_cache.setdefault(case.name, current_hash) + return + + case_hash_cache.setdefault(case.name, baseline_hash) + if baseline_hash == current_hash: + return + + logger.warning( + "Observed CASE_HASH drift for case '%s': baseline='%s', current='%s' " + "from %s. Retaining case-name grouping.", + case.name, + baseline_hash, + current_hash, + parsed_simulation.execution_dir, + ) + + +def _get_case_hash_baseline( + case: Case, + case_hash_cache: dict[str, str], + persisted_case_hash_cache: dict[UUID, str | None], + db: Session, +) -> str | None: + """Return the CASE_HASH baseline for drift detection.""" + if case.reference_simulation_id is not None: + if case.id not in persisted_case_hash_cache: + reference_simulation = ( + db.query(Simulation) + .filter(Simulation.id == case.reference_simulation_id) + .first() + ) + baseline_hash = ( + reference_simulation.case_hash if reference_simulation else None + ) + persisted_case_hash_cache[case.id] = baseline_hash + if baseline_hash is not None: + case_hash_cache.setdefault(case.name, baseline_hash) + + baseline_hash = persisted_case_hash_cache[case.id] + if baseline_hash is not None: + return baseline_hash + + return case_hash_cache.get(case.name) + + def _require_case_name(parsed_simulation: ParsedSimulation) -> str: """Return case_name from metadata or raise a descriptive error.""" case_name = parsed_simulation.case_name @@ -719,6 +798,7 @@ def _build_simulation_create_draft( last_updated_by=None, hpc_username=parsed_simulation.hpc_username, run_config_deltas=run_config_deltas, + case_hash=parsed_simulation.case_hash, ) return simulation_draft diff --git a/backend/app/features/ingestion/parsers/case_docs.py b/backend/app/features/ingestion/parsers/case_docs.py index 9e5ec964..28cb4894 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -23,6 +23,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: Dictionary with case metadata (values are str or None), including: - ``case_name``: Case name (``CASE``) + - ``case_hash``: Case hash (``CASE_HASH``) - ``case_group``: Case group (``CASE_GROUP``) - ``machine``: Machine name (``MACH``) - ``user``: Real user (``REALUSER``) @@ -34,6 +35,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: env_case_path = Path(env_case_path) case_name = _extract_value_from_file(env_case_path, "CASE") + case_hash = _extract_value_from_file(env_case_path, "CASE_HASH") case_group = _extract_value_from_file(env_case_path, "CASE_GROUP") machine = _extract_value_from_file(env_case_path, "MACH") user = _extract_value_from_file(env_case_path, "REALUSER") @@ -44,6 +46,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: return { "case_name": case_name, + "case_hash": case_hash, "case_group": case_group, "machine": machine, "user": user, diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index 718b1bae..665d2045 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -571,6 +571,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul git_tag=metadata.get("git_tag"), git_commit_hash=metadata.get("git_commit_hash"), status=metadata.get("status") or SimulationStatus.UNKNOWN.value, + case_hash=metadata.get("case_hash"), ) diff --git a/backend/app/features/ingestion/parsers/types.py b/backend/app/features/ingestion/parsers/types.py index cc7d43b9..ea3bd05b 100644 --- a/backend/app/features/ingestion/parsers/types.py +++ b/backend/app/features/ingestion/parsers/types.py @@ -28,3 +28,4 @@ class ParsedSimulation: git_tag: str | None git_commit_hash: str | None status: str | None + case_hash: str | None = None diff --git a/backend/app/features/simulation/models.py b/backend/app/features/simulation/models.py index 453450d2..9c32a9d5 100644 --- a/backend/app/features/simulation/models.py +++ b/backend/app/features/simulation/models.py @@ -71,6 +71,7 @@ class Simulation(Base, IDMixin, TimestampMixin): execution_id: Mapped[str] = mapped_column( Text, unique=True, index=True, nullable=False ) + case_hash: Mapped[str | None] = mapped_column(String(64), nullable=True) description: Mapped[str | None] = mapped_column(Text, nullable=True) compset: Mapped[str] = mapped_column(String(120)) compset_alias: Mapped[str] = mapped_column(Text) diff --git a/backend/app/features/simulation/schemas.py b/backend/app/features/simulation/schemas.py index 8861ab1a..de2e0329 100644 --- a/backend/app/features/simulation/schemas.py +++ b/backend/app/features/simulation/schemas.py @@ -114,6 +114,16 @@ class SimulationCreate(CamelInBaseModel): ), ), ] + case_hash: Annotated[ + str | None, + Field( + None, + description=( + "Optional CASE_HASH parsed from env_case.xml. Informational for " + "ingestion consistency checks; not currently the grouping key." + ), + ), + ] description: Annotated[ str | None, Field(None, description="Optional description of the simulation") ] @@ -404,6 +414,16 @@ class SimulationOut(CamelOutBaseModel): ), ), ] + case_hash: Annotated[ + str | None, + Field( + None, + description=( + "Optional CASE_HASH parsed from env_case.xml. Informational for " + "ingestion consistency checks; not currently the grouping key." + ), + ), + ] is_reference: Annotated[ bool, Field( diff --git a/backend/migrations/versions/20260414_000000_add_case_hash_to_simulations.py b/backend/migrations/versions/20260414_000000_add_case_hash_to_simulations.py new file mode 100644 index 00000000..6dd35cfc --- /dev/null +++ b/backend/migrations/versions/20260414_000000_add_case_hash_to_simulations.py @@ -0,0 +1,30 @@ +"""Add case_hash column to simulations. + +Revision ID: 20260414_000000 +Revises: 20260331_000000 +Create Date: 2026-03-30 00:00:00.000000 +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "20260414_000000" +down_revision: Union[str, Sequence[str], None] = "20260331_000000" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Add nullable CASE_HASH storage to simulations.""" + op.add_column( + "simulations", + sa.Column("case_hash", sa.String(length=64), nullable=True), + ) + + +def downgrade() -> None: + """Remove CASE_HASH storage from simulations.""" + op.drop_column("simulations", "case_hash") diff --git a/backend/tests/features/ingestion/parsers/test_case_docs.py b/backend/tests/features/ingestion/parsers/test_case_docs.py index e804b5ff..42b138f8 100644 --- a/backend/tests/features/ingestion/parsers/test_case_docs.py +++ b/backend/tests/features/ingestion/parsers/test_case_docs.py @@ -59,6 +59,7 @@ def test_campaign_and_experiment_type_are_derived(self, tmp_path): xml_case = """ + """ tmp_case = tmp_path / "env_case_campaign.xml" @@ -68,6 +69,7 @@ def test_campaign_and_experiment_type_are_derived(self, tmp_path): assert result["campaign"] == "v3.LR.historical" assert result["experiment_type"] == "historical" + assert result["case_hash"] == "abc123def456" def test_non_dot_case_name_does_not_set_campaign(self, tmp_path): xml_case = """ @@ -91,6 +93,7 @@ def test_read_error_returns_none(self): result = parse_env_case(Path("/tmp/missing.xml")) assert result["case_name"] is None + assert result["case_hash"] is None assert result["case_group"] is None diff --git a/backend/tests/features/ingestion/test_ingest.py b/backend/tests/features/ingestion/test_ingest.py index e0fecb3e..63bbd73c 100644 --- a/backend/tests/features/ingestion/test_ingest.py +++ b/backend/tests/features/ingestion/test_ingest.py @@ -11,12 +11,14 @@ SimulationCreateDraft, _build_config_snapshot, _build_simulation_create_draft, + _get_case_hash_baseline, _get_or_create_case, _get_reference_metadata_for_case, _normalize_git_url, _normalize_simulation_status, _normalize_simulation_type, _stringify_config_value, + _track_case_hash_observation, _validate_simulation_create, ingest_archive, ) @@ -65,6 +67,7 @@ def _parsed_simulations_from_mapping( git_tag=metadata.get("git_tag"), git_commit_hash=metadata.get("git_commit_hash"), status=metadata.get("status"), + case_hash=metadata.get("case_hash"), ) ) @@ -1861,7 +1864,7 @@ def test_incremental_ingestion_persisted_simulation_type_adds_delta( } def test_same_case_name_groups_to_same_case(self, db: Session) -> None: - """Runs with the same case_name belong to the same Case.""" + """Runs with the same case_name belong to the same Case without CASE_HASH.""" self._create_machine(db, "test-machine") mock_simulations = { @@ -1890,6 +1893,107 @@ def test_same_case_name_groups_to_same_case(self, db: Session) -> None: case = db.query(Case).filter(Case.name == "case1").first() assert case is not None + def test_case_hash_is_persisted_on_created_simulation(self, db: Session) -> None: + """Parsed CASE_HASH should be preserved on created simulations.""" + self._create_machine(db, "test-machine") + + mock_simulations = { + "/path/to/1081196.251218-200956": self._make_metadata( + execution_id="1081196.251218-200956", + case_name="case1", + case_hash="ea56b83457fa9e775be77c500bef13533bf675cee8f662f6ce218c2e53b7c357", + ), + } + + with patch( + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), + ): + result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) + + assert result.created_count == 1 + assert result.simulations[0].case_hash == ( + "ea56b83457fa9e775be77c500bef13533bf675cee8f662f6ce218c2e53b7c357" + ) + + def test_case_hash_drift_logs_warning_and_retains_case_name_grouping( + self, db: Session + ) -> None: + """Conflicting CASE_HASH values warn but do not split the Case.""" + machine = self._create_machine(db, "test-machine") + + user = User( + email="test@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + case = Case(name="case1") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1081192.251218-200952", + case_hash="ea56b83457fa9e775be77c500bef13533bf675cee8f662f6ce218c2e53b7c357", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.UNKNOWN, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + ) + db.add(sim) + db.flush() + + assert sim.id is not None + case.reference_simulation_id = sim.id + db.commit() + + mock_simulations = { + "/path/to/1081193.251218-200953": self._make_metadata( + execution_id="1081193.251218-200953", + case_name="case1", + simulation_start_date="2020-06-01", + case_hash="162f93c8f9ac9296efe7160d1807e41d8c2a6da1cbc77c54dd976665e10818fb", + ), + } + + with ( + patch( + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), + ), + patch("app.features.ingestion.ingest.logger.warning") as mock_warning, + ): + result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) + + assert result.created_count == 1 + assert len({s.case_id for s in result.simulations}) == 1 + assert result.simulations[0].case_id == case.id + assert result.simulations[0].case_hash == ( + "162f93c8f9ac9296efe7160d1807e41d8c2a6da1cbc77c54dd976665e10818fb" + ) + mock_warning.assert_called_once() + def test_different_case_name_creates_separate_cases(self, db: Session) -> None: """Runs with different case_name values create separate Cases.""" self._create_machine(db, "test-machine") @@ -2047,6 +2151,171 @@ def test_get_reference_metadata_uses_persisted_cache_on_second_lookup( ) assert second == first + def test_get_case_hash_baseline_uses_persisted_cache_on_second_lookup( + self, + ) -> None: + case = MagicMock(spec=Case) + case.id = uuid4() + case.name = "case_hash_cache_case" + case.reference_simulation_id = uuid4() + + db = MagicMock(spec=Session) + case_hash_cache: dict[str, str] = {} + persisted_case_hash_cache: dict[UUID, str | None] = {case.id: "baseline-hash"} + + with patch.object(db, "query", side_effect=AssertionError): + result = _get_case_hash_baseline( + case=case, + case_hash_cache=case_hash_cache, + persisted_case_hash_cache=persisted_case_hash_cache, + db=db, + ) + + assert result == "baseline-hash" + + def test_track_case_hash_observation_skips_warning_for_matching_baseline( + self, + ) -> None: + parsed = ParsedSimulation( + execution_dir="/path/to/1082002.260305-120002", + execution_id="1082002.260305-120002", + case_name="case_hash_case", + case_group=None, + machine="machine", + hpc_username=None, + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign=None, + experiment_type=None, + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler=None, + git_repository_url=None, + git_branch=None, + git_tag=None, + git_commit_hash=None, + status=None, + case_hash="matching-hash", + ) + case = MagicMock(spec=Case) + case.name = "case_hash_case" + + case_hash_cache: dict[str, str] = {} + + with ( + patch( + "app.features.ingestion.ingest._get_case_hash_baseline", + return_value="matching-hash", + ), + patch("app.features.ingestion.ingest.logger.warning") as mock_warning, + ): + _track_case_hash_observation( + parsed_simulation=parsed, + case=case, + case_hash_cache=case_hash_cache, + persisted_case_hash_cache={}, + db=MagicMock(spec=Session), + ) + + assert case_hash_cache == {"case_hash_case": "matching-hash"} + mock_warning.assert_not_called() + + def test_track_case_hash_observation_uses_in_batch_baseline_when_reference_hash_missing( + self, + ) -> None: + first_parsed = ParsedSimulation( + execution_dir="/path/to/1082003.260305-120003", + execution_id="1082003.260305-120003", + case_name="case_hash_case", + case_group=None, + machine="machine", + hpc_username=None, + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign=None, + experiment_type=None, + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler=None, + git_repository_url=None, + git_branch=None, + git_tag=None, + git_commit_hash=None, + status=None, + case_hash="first-hash", + ) + second_parsed = ParsedSimulation( + execution_dir="/path/to/1082004.260305-120004", + execution_id="1082004.260305-120004", + case_name="case_hash_case", + case_group=None, + machine="machine", + hpc_username=None, + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign=None, + experiment_type=None, + initialization_type="test", + simulation_start_date="2020-06-01", + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler=None, + git_repository_url=None, + git_branch=None, + git_tag=None, + git_commit_hash=None, + status=None, + case_hash="second-hash", + ) + case = MagicMock(spec=Case) + case.id = uuid4() + case.name = "case_hash_case" + case.reference_simulation_id = uuid4() + + reference_simulation = MagicMock(spec=Simulation) + reference_simulation.case_hash = None + + db = MagicMock(spec=Session) + db.query.return_value.filter.return_value.first.return_value = ( + reference_simulation + ) + + case_hash_cache: dict[str, str] = {} + persisted_case_hash_cache: dict[UUID, str | None] = {} + + with patch("app.features.ingestion.ingest.logger.warning") as mock_warning: + _track_case_hash_observation( + parsed_simulation=first_parsed, + case=case, + case_hash_cache=case_hash_cache, + persisted_case_hash_cache=persisted_case_hash_cache, + db=db, + ) + _track_case_hash_observation( + parsed_simulation=second_parsed, + case=case, + case_hash_cache=case_hash_cache, + persisted_case_hash_cache=persisted_case_hash_cache, + db=db, + ) + + assert case_hash_cache == {"case_hash_case": "first-hash"} + assert persisted_case_hash_cache == {case.id: None} + mock_warning.assert_called_once() + def test_parsed_snapshot_defaults_simulation_type_to_unknown(self) -> None: parsed = ParsedSimulation( execution_dir="/path/to/1082001.260305-120001", @@ -2296,6 +2565,7 @@ def test_simulation_create_draft_validates_by_field_name(self) -> None: last_updated_by=None, hpc_username="test-user", run_config_deltas=None, + case_hash="abc123", ) schema = _validate_simulation_create(draft) @@ -2305,6 +2575,7 @@ def test_simulation_create_draft_validates_by_field_name(self) -> None: assert schema.extra == {} assert schema.artifacts == [] assert schema.links == [] + assert schema.case_hash == "abc123" assert schema.run_config_deltas is None def test_build_simulation_create_draft_normalizes_values(self) -> None: @@ -2332,6 +2603,7 @@ def test_build_simulation_create_draft_normalizes_values(self) -> None: git_tag="v1.0.0", git_commit_hash="abc123", status="completed", + case_hash="hash123", ) draft = _build_simulation_create_draft( @@ -2343,5 +2615,6 @@ def test_build_simulation_create_draft_normalizes_values(self) -> None: assert draft.simulation_type == SimulationType.UNKNOWN assert draft.status == SimulationStatus.COMPLETED assert draft.git_repository_url == "https://github.com/E3SM-Project/E3SM.git" + assert draft.case_hash == "hash123" assert draft.simulation_start_date is not None assert draft.run_start_date is not None diff --git a/backend/tests/features/simulation/test_api.py b/backend/tests/features/simulation/test_api.py index 4c3ce917..9ced9a42 100644 --- a/backend/tests/features/simulation/test_api.py +++ b/backend/tests/features/simulation/test_api.py @@ -645,6 +645,7 @@ def test_endpoint_succeeds_with_valid_id( sim = Simulation( case_id=case.id, execution_id="get-test-exec-1", + case_hash="abc123casehash", compset="AQUAPLANET", compset_alias="QPC4", grid_name="f19_f19", @@ -669,6 +670,7 @@ def test_endpoint_succeeds_with_valid_id( data = res.json() assert data["caseName"] == "test_case_get" assert data["executionId"] == "get-test-exec-1" + assert data["caseHash"] == "abc123casehash" def test_endpoint_raises_404_if_simulation_not_found(self, client): res = client.get(f"{API_BASE}/simulations/{uuid4()}") diff --git a/backend/tests/features/simulation/test_schemas.py b/backend/tests/features/simulation/test_schemas.py index ebb0358c..73baaeef 100644 --- a/backend/tests/features/simulation/test_schemas.py +++ b/backend/tests/features/simulation/test_schemas.py @@ -23,6 +23,7 @@ def test_valid_simulation_create_required_fields(self): payload = { "caseId": uuid4(), "executionId": "1081156.251218-200923", + "caseHash": "abc123", "compset": "AQUAPLANET", "compsetAlias": "QPC4", "gridName": "f19_f19", @@ -113,6 +114,7 @@ def test_valid_simulation_out_required_fields(self): "case_id": case_id, "case_name": "test_case", "execution_id": "1081156.251218-200923", + "case_hash": "abc123", "is_reference": True, "change_count": 0, "compset": "AQUAPLANET", @@ -192,6 +194,7 @@ def test_valid_simulation_out_optional_fields(self): "case_id": case_id, "case_name": "test_case", "execution_id": "1081156.251218-200923", + "case_hash": "abc123", "is_reference": False, "change_count": 2, "compset": "AQUAPLANET",