diff --git a/geonode/harvesting/tasks.py b/geonode/harvesting/tasks.py index 05d7c2081ce..6b55f3e47ff 100644 --- a/geonode/harvesting/tasks.py +++ b/geonode/harvesting/tasks.py @@ -31,7 +31,7 @@ from django.db.models.functions import Concat from django.utils import timezone from django.conf import settings -from django.db import transaction +from django.db import transaction, IntegrityError from geonode.resource.models import ExecutionRequest from geonode.resource.enumerator import ExecutionRequestAction @@ -741,16 +741,30 @@ def _update_harvestable_resources_batch(self, refresh_session_id: int, page: int else: processed = 0 for remote_resource in found_resources: - resource, created = models.HarvestableResource.objects.get_or_create( - harvester=harvester, - unique_identifier=remote_resource.unique_identifier, - title=remote_resource.title, - defaults={ - "should_be_harvested": harvester.harvest_new_resources_by_default, - "remote_resource_type": remote_resource.resource_type, - "last_refreshed": timezone.now(), - }, - ) + try: + resource, created = models.HarvestableResource.objects.get_or_create( + harvester=harvester, + unique_identifier=remote_resource.unique_identifier, + defaults={ + "title": remote_resource.title, + "should_be_harvested": harvester.harvest_new_resources_by_default, + "remote_resource_type": remote_resource.resource_type, + "last_refreshed": timezone.now(), + }, + ) + except IntegrityError: + # RACE CONDITION: Another worker created this between our SELECT and INSERT. + # We catch the error and simply fetch the one they created. + resource = models.HarvestableResource.objects.get( + harvester=harvester, unique_identifier=remote_resource.unique_identifier + ) + created = False + # If the resource wasn't just created, check if the title changed + # (e.g. from 'copy title' to '28409') and update it. + if not created: + resource.title = remote_resource.title + resource.remote_resource_type = remote_resource.resource_type + processed += 1 # NOTE: make sure to save the resource because we need to have its # `last_updated` property be refreshed - this is done in order to be able diff --git a/geonode/harvesting/tests/test_tasks.py b/geonode/harvesting/tests/test_tasks.py index ffc6f8b13a5..926f161ba26 100644 --- a/geonode/harvesting/tests/test_tasks.py +++ b/geonode/harvesting/tests/test_tasks.py @@ -251,6 +251,45 @@ def test_update_harvestable_resources_sends_batched_requests( # Assert chord was called with expiration mock_chord.return_value.apply_async.assert_called_once_with(args=(), expires=123) + @mock.patch("geonode.harvesting.models.Harvester.get_harvester_worker") + def test_update_batch_corrects_title_mismatch(self, mock_get_worker): + """ + Verify that if a remote resource title changes, the existing + HarvestableResource is updated instead of causing an IntegrityError. + """ + # Setup the session to be "ON_GOING" so the task doesn't skip it + self.harvesting_session.status = models.AsynchronousHarvestingSession.STATUS_ON_GOING + self.harvesting_session.save() + + # Pick one of the resources created in setUpTestData + # Let's say we target 'fake-identifier-0' which currently has 'fake-title-0' + target_uid = "fake-identifier-0" + new_remote_title = "COMPLETELY_NEW_TITLE_FROM_WMS" + + # Mock the worker to return the resource with the new title + mock_remote_resource = mock.MagicMock() + mock_remote_resource.unique_identifier = target_uid + mock_remote_resource.title = new_remote_title + mock_remote_resource.resource_type = "fake-remote-resource-type" + + mock_worker = mock.MagicMock() + mock_worker.list_resources.return_value = [mock_remote_resource] + mock_get_worker.return_value = mock_worker + + # Run the batch task using the session ID from setUpTestData + # We process page 0 with page_size 1 to just handle our mocked resource + tasks._update_harvestable_resources_batch(self.harvesting_session.pk, page=0, page_size=1) + + # ASSERTIONS + # Fetch the resource from the DB to see if it updated + resource = models.HarvestableResource.objects.get(unique_identifier=target_uid) + + # Verify the title was updated correctly + self.assertEqual(resource.title, new_remote_title) + + # Verify no new records were created (count should still be 3 from setUpTestData) + self.assertEqual(models.HarvestableResource.objects.count(), 3) + def test_harvesting_scheduler(self): mock_harvester = mock.MagicMock(spec=models.Harvester).return_value mock_harvester.scheduling_enabled = True