From 64719a640fb066c3a896a6a12e43227d2cd22aed Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 14 May 2026 16:48:19 +1200 Subject: [PATCH 01/26] feat: Add DiscoveryConfig APIs --- datamasque/client/__init__.py | 3 + datamasque/client/discovery_configs.py | 137 ++++++ datamasque/client/dmclient.py | 2 + datamasque/client/models/discovery_config.py | 18 + tests/test_discovery_configs.py | 430 +++++++++++++++++++ 5 files changed, 590 insertions(+) create mode 100644 datamasque/client/discovery_configs.py create mode 100644 datamasque/client/models/discovery_config.py create mode 100644 tests/test_discovery_configs.py diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index c4f007d..814ea56 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -70,6 +70,7 @@ SchemaDiscoveryResult, TableConstraints, ) +from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId from datamasque.client.models.dm_instance import DataMasqueInstanceConfig from datamasque.client.models.files import ( DataMasqueFile, @@ -130,6 +131,8 @@ "DatabaseConnectionConfig", "DatabaseType", "DatabricksConnectionConfig", + "DiscoveryConfig", + "DiscoveryConfigId", "DiscoveryMatch", "DynamoConnectionConfig", "FailedToStartError", diff --git a/datamasque/client/discovery_configs.py b/datamasque/client/discovery_configs.py new file mode 100644 index 0000000..617f367 --- /dev/null +++ b/datamasque/client/discovery_configs.py @@ -0,0 +1,137 @@ +import logging +from typing import Iterator, Optional + +from datamasque.client.base import BaseClient +from datamasque.client.exceptions import DataMasqueApiError, DataMasqueException +from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId +from datamasque.client.models.pagination import Page + +logger = logging.getLogger(__name__) + + +class DiscoveryConfigClient(BaseClient): + """Discovery config CRUD API methods. Mixed into `DataMasqueClient`.""" + + def iter_discovery_configs(self) -> Iterator[DiscoveryConfig]: + """Lazily iterate all discovery configs via the paginated endpoint.""" + + return self._iter_paginated("/api/discovery/configs/", model=DiscoveryConfig) + + def list_discovery_configs(self) -> list[DiscoveryConfig]: + """ + Lists all (non-archived) discovery configs. + + Note: the YAML content is not included in the list response for performance. + Use `get_discovery_config` to retrieve the full config with its YAML body. + """ + + return list(self.iter_discovery_configs()) + + def get_discovery_config(self, config_id: DiscoveryConfigId) -> DiscoveryConfig: + """Retrieves a single discovery config by ID.""" + + response = self.make_request("GET", f"/api/discovery/configs/{config_id}/") + return DiscoveryConfig.model_validate(response.json()) + + def get_discovery_config_by_name(self, name: str) -> Optional[DiscoveryConfig]: + """ + Looks for a discovery config matching the given name (case-sensitive, exact match). + + Returns it if found, otherwise `None`. + """ + + response = self.make_request( + "GET", + "/api/discovery/configs/", + params={"name_exact": name, "limit": 1}, + ) + page = Page[DiscoveryConfig].model_validate(response.json()) + if not page.results: + return None + + config_id = page.results[0].id + if config_id is None: + raise DataMasqueApiError( + "Server returned a discovery config list entry without an `id`.", + response=response, + ) + + return self.get_discovery_config(config_id) + + def create_discovery_config(self, config: DiscoveryConfig) -> DiscoveryConfig: + """ + Creates a new discovery config on the server. + + Sets the config's server-assigned fields (`id`, `created`, `modified`) and returns the config. + """ + + data = config.model_dump(exclude_none=True, by_alias=True, mode="json") + response = self.make_request("POST", "/api/discovery/configs/", data=data) + created = DiscoveryConfig.model_validate(response.json()) + config.id = created.id + config.created = created.created + config.modified = created.modified + logger.info('Creation of discovery config "%s" successful', config.name) + return config + + def update_discovery_config(self, config: DiscoveryConfig) -> DiscoveryConfig: + """ + Performs a full update of the discovery config. + + The config must have its `id` set + (i.e., it must have been previously created or retrieved from the server). + """ + + if config.id is None: + raise ValueError("Cannot update a discovery config that has not been created yet (id is None)") + + data = config.model_dump(exclude_none=True, by_alias=True, mode="json") + response = self.make_request("PUT", f"/api/discovery/configs/{config.id}/", data=data) + updated = DiscoveryConfig.model_validate(response.json()) + config.modified = updated.modified + logger.debug('Update of discovery config "%s" successful', config.name) + return config + + def create_or_update_discovery_config(self, config: DiscoveryConfig) -> DiscoveryConfig: + """ + Creates the config if it doesn't exist, or updates it if one with the same name already exists. + + Sets the config's `id` property. + """ + + existing = self.get_discovery_config_by_name(config.name) + if existing is not None: + config.id = existing.id + return self.update_discovery_config(config) + + return self.create_discovery_config(config) + + def delete_discovery_config_by_id_if_exists(self, config_id: DiscoveryConfigId) -> None: + """ + Archives the discovery config with the given ID. + + No-op if the config does not exist. + """ + + self._delete_if_exists(f"/api/discovery/configs/{config_id}/") + + def delete_discovery_config_by_name_if_exists(self, name: str) -> None: + """ + Archives the discovery config with the given name. + + No-op if the config does not exist. + """ + + all_configs = self.list_discovery_configs() + matching = [config for config in all_configs if config.name == name] + for config in matching: + if config.id is None: + raise DataMasqueException(f'Server returned a discovery config named "{config.name}" without an `id`.') + + self.delete_discovery_config_by_id_if_exists(config.id) + + def get_default_discovery_config_yaml(self) -> str: + """Returns the server's built-in default discovery configuration as a YAML string.""" + + response = self.make_request("GET", "/api/discovery/configs/defaults/") + return response.content.decode("utf-8") diff --git a/datamasque/client/dmclient.py b/datamasque/client/dmclient.py index 55cc6f3..25efcc6 100644 --- a/datamasque/client/dmclient.py +++ b/datamasque/client/dmclient.py @@ -1,6 +1,7 @@ from datamasque.client.base import FileOrContent, UploadFile from datamasque.client.connections import ConnectionClient from datamasque.client.discovery import DiscoveryClient +from datamasque.client.discovery_configs import DiscoveryConfigClient from datamasque.client.files import FileClient from datamasque.client.license import LicenseClient from datamasque.client.ruleset_libraries import RulesetLibraryClient @@ -20,6 +21,7 @@ class DataMasqueClient( FileClient, RunClient, DiscoveryClient, + DiscoveryConfigClient, UserClient, SettingsClient, ): diff --git a/datamasque/client/models/discovery_config.py b/datamasque/client/models/discovery_config.py new file mode 100644 index 0000000..8c580b3 --- /dev/null +++ b/datamasque/client/models/discovery_config.py @@ -0,0 +1,18 @@ +from datetime import datetime +from typing import NewType, Optional + +from pydantic import BaseModel, ConfigDict, Field + +DiscoveryConfigId = NewType("DiscoveryConfigId", str) + + +class DiscoveryConfig(BaseModel): + """Represents a named, persisted YAML discovery configuration.""" + + model_config = ConfigDict(extra="allow", populate_by_name=True) + + name: str + yaml: Optional[str] = Field(default=None, alias="config_yaml") + id: Optional[DiscoveryConfigId] = None + created: Optional[datetime] = None + modified: Optional[datetime] = None diff --git a/tests/test_discovery_configs.py b/tests/test_discovery_configs.py new file mode 100644 index 0000000..2c9e71c --- /dev/null +++ b/tests/test_discovery_configs.py @@ -0,0 +1,430 @@ +"""Tests for discovery-config support in the DataMasque client.""" + +from datetime import datetime +from typing import Any + +import pytest +import requests_mock + +from datamasque.client import DataMasqueClient +from datamasque.client.exceptions import DataMasqueApiError, DataMasqueException +from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId + +CONFIG_ID_1 = "aaaaaaaa-1111-2222-3333-444444444444" +CONFIG_ID_2 = "bbbbbbbb-1111-2222-3333-444444444444" + + +@pytest.fixture +def sample_config_list_response() -> dict[str, Any]: + return { + "count": 2, + "next": None, + "previous": None, + "results": [ + { + "id": CONFIG_ID_1, + "name": "my_config", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-02T12:00:00Z", + }, + { + "id": CONFIG_ID_2, + "name": "another_config", + "archived": False, + "created": "2025-02-01T12:00:00Z", + "modified": "2025-02-02T12:00:00Z", + }, + ], + } + + +@pytest.fixture +def sample_config_detail_response() -> dict[str, Any]: + return { + "id": CONFIG_ID_1, + "name": "my_config", + "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-02T12:00:00Z", + } + + +@pytest.fixture +def discovery_config() -> DiscoveryConfig: + return DiscoveryConfig( + name="test_config", + yaml="labels: []\nmetadata_rules: []\nidd_rules: []\n", + ) + + +def test_list_discovery_configs(client: DataMasqueClient, sample_config_list_response: dict[str, Any]) -> None: + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/configs/", + json=sample_config_list_response, + status_code=200, + ) + configs = client.list_discovery_configs() + + assert len(configs) == 2 + assert configs[0].id == DiscoveryConfigId(CONFIG_ID_1) + assert configs[0].name == "my_config" + assert configs[0].yaml is None + assert configs[1].name == "another_config" + + +def test_list_discovery_configs_pagination(client: DataMasqueClient) -> None: + page1 = { + "count": 3, + "next": "http://test-server/api/discovery/configs/?limit=2&offset=2", + "previous": None, + "results": [ + { + "id": CONFIG_ID_1, + "name": "c1", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-01T12:00:00Z", + }, + { + "id": CONFIG_ID_2, + "name": "c2", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-01T12:00:00Z", + }, + ], + } + page2 = { + "count": 3, + "next": None, + "previous": "http://test-server/api/discovery/configs/?limit=2", + "results": [ + { + "id": "cccccccc-1111-2222-3333-444444444444", + "name": "c3", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-01T12:00:00Z", + }, + ], + } + + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/configs/", + [{"json": page1, "status_code": 200}, {"json": page2, "status_code": 200}], + ) + configs = client.list_discovery_configs() + + assert [c.name for c in configs] == ["c1", "c2", "c3"] + + +def test_list_discovery_configs_empty(client: DataMasqueClient) -> None: + empty_response = {"count": 0, "next": None, "previous": None, "results": []} + + with requests_mock.Mocker() as m: + m.get("http://test-server/api/discovery/configs/", json=empty_response, status_code=200) + configs = client.list_discovery_configs() + + assert configs == [] + + +def test_get_discovery_config(client: DataMasqueClient, sample_config_detail_response: dict[str, Any]) -> None: + with requests_mock.Mocker() as m: + m.get( + f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", + json=sample_config_detail_response, + status_code=200, + ) + config = client.get_discovery_config(DiscoveryConfigId(CONFIG_ID_1)) + + assert config.id == DiscoveryConfigId(CONFIG_ID_1) + assert config.name == "my_config" + assert config.yaml == "labels: []\nmetadata_rules: []\nidd_rules: []\n" + + +def test_get_discovery_config_by_name_found( + client: DataMasqueClient, sample_config_detail_response: dict[str, Any] +) -> None: + list_response = { + "count": 1, + "next": None, + "previous": None, + "results": [ + { + "id": CONFIG_ID_1, + "name": "my_config", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-02T12:00:00Z", + }, + ], + } + + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/configs/", + json=list_response, + status_code=200, + ) + m.get( + f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", + json=sample_config_detail_response, + status_code=200, + ) + config = client.get_discovery_config_by_name("my_config") + + assert config is not None + assert config.name == "my_config" + assert "name_exact=my_config" in m.request_history[0].url + + +def test_get_discovery_config_by_name_not_found(client: DataMasqueClient) -> None: + empty_response = {"count": 0, "next": None, "previous": None, "results": []} + + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/configs/", + json=empty_response, + status_code=200, + ) + config = client.get_discovery_config_by_name("nonexistent") + + assert config is None + + +def test_get_discovery_config_by_name_raises_when_server_omits_id(client: DataMasqueClient) -> None: + list_response_without_id = { + "count": 1, + "next": None, + "previous": None, + "results": [ + { + "name": "my_config", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-02T12:00:00Z", + }, + ], + } + + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/configs/", + json=list_response_without_id, + status_code=200, + ) + with pytest.raises(DataMasqueApiError, match="without an `id`"): + client.get_discovery_config_by_name("my_config") + + +def test_create_discovery_config(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: + create_response = { + "id": CONFIG_ID_1, + "name": "test_config", + "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "archived": False, + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-01T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/discovery/configs/", + json=create_response, + status_code=201, + ) + result = client.create_discovery_config(discovery_config) + + assert result is discovery_config + assert result.id == DiscoveryConfigId(CONFIG_ID_1) + assert result.created == datetime.fromisoformat("2025-06-01T10:00:00+00:00") + assert result.modified == datetime.fromisoformat("2025-06-01T10:00:00+00:00") + + request_body = m.last_request.json() + assert request_body["name"] == "test_config" + assert request_body["config_yaml"] == "labels: []\nmetadata_rules: []\nidd_rules: []\n" + + +def test_update_discovery_config(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: + discovery_config.id = DiscoveryConfigId(CONFIG_ID_1) + + update_response = { + "id": CONFIG_ID_1, + "name": "test_config", + "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "archived": False, + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-02T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.put( + f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", + json=update_response, + status_code=200, + ) + result = client.update_discovery_config(discovery_config) + + assert result is discovery_config + assert result.modified == datetime.fromisoformat("2025-06-02T10:00:00+00:00") + + request_body = m.last_request.json() + assert request_body["name"] == "test_config" + assert request_body["config_yaml"] == "labels: []\nmetadata_rules: []\nidd_rules: []\n" + + +def test_update_discovery_config_no_id_raises(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: + with pytest.raises(ValueError, match="id is None"): + client.update_discovery_config(discovery_config) + + +def test_create_or_update_discovery_config_create(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: + empty_list = {"count": 0, "next": None, "previous": None, "results": []} + create_response = { + "id": CONFIG_ID_1, + "name": "test_config", + "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "archived": False, + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-01T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.get("http://test-server/api/discovery/configs/", json=empty_list, status_code=200) + m.post("http://test-server/api/discovery/configs/", json=create_response, status_code=201) + result = client.create_or_update_discovery_config(discovery_config) + + assert result.id == DiscoveryConfigId(CONFIG_ID_1) + assert m.request_history[0].method == "GET" + assert m.request_history[1].method == "POST" + + +def test_create_or_update_discovery_config_update(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: + list_response = { + "count": 1, + "next": None, + "previous": None, + "results": [ + { + "id": CONFIG_ID_1, + "name": "test_config", + "archived": False, + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-01T10:00:00Z", + }, + ], + } + detail_response = { + "id": CONFIG_ID_1, + "name": "test_config", + "config_yaml": "labels: []", + "archived": False, + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-01T10:00:00Z", + } + update_response = { + "id": CONFIG_ID_1, + "name": "test_config", + "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "archived": False, + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-02T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.get("http://test-server/api/discovery/configs/", json=list_response, status_code=200) + m.get( + f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", + json=detail_response, + status_code=200, + ) + m.put( + f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", + json=update_response, + status_code=200, + ) + result = client.create_or_update_discovery_config(discovery_config) + + assert result.id == DiscoveryConfigId(CONFIG_ID_1) + assert m.request_history[0].method == "GET" + assert m.request_history[1].method == "GET" + assert m.request_history[2].method == "PUT" + + +def test_delete_discovery_config_by_id(client: DataMasqueClient) -> None: + with requests_mock.Mocker() as m: + m.delete(f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", status_code=204) + client.delete_discovery_config_by_id_if_exists(DiscoveryConfigId(CONFIG_ID_1)) + + assert m.call_count == 1 + + +def test_delete_discovery_config_by_id_not_found(client: DataMasqueClient) -> None: + with requests_mock.Mocker() as m: + m.delete(f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", status_code=404) + client.delete_discovery_config_by_id_if_exists(DiscoveryConfigId(CONFIG_ID_1)) + + +def test_delete_discovery_config_by_name(client: DataMasqueClient, sample_config_list_response: dict[str, Any]) -> None: + with requests_mock.Mocker() as m: + m.get("http://test-server/api/discovery/configs/", json=sample_config_list_response, status_code=200) + m.delete(f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", status_code=204) + client.delete_discovery_config_by_name_if_exists("my_config") + + assert m.request_history[0].method == "GET" + assert m.request_history[1].method == "DELETE" + + +def test_delete_discovery_config_by_name_not_found( + client: DataMasqueClient, sample_config_list_response: dict[str, Any] +) -> None: + with requests_mock.Mocker() as m: + m.get("http://test-server/api/discovery/configs/", json=sample_config_list_response, status_code=200) + client.delete_discovery_config_by_name_if_exists("nonexistent") + + assert m.call_count == 1 + assert m.request_history[0].method == "GET" + + +def test_delete_discovery_config_by_name_raises_when_server_omits_id(client: DataMasqueClient) -> None: + list_response_without_id = { + "count": 1, + "next": None, + "previous": None, + "results": [ + { + "name": "my_config", + "archived": False, + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-02T12:00:00Z", + }, + ], + } + + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/configs/", + json=list_response_without_id, + status_code=200, + ) + with pytest.raises(DataMasqueException, match="without an `id`"): + client.delete_discovery_config_by_name_if_exists("my_config") + + +def test_get_default_discovery_config_yaml(client: DataMasqueClient) -> None: + yaml_body = "labels: []\nmetadata_rules: []\nidd_rules: []\n" + + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/configs/defaults/", + text=yaml_body, + status_code=200, + headers={"Content-Type": "application/x-yaml"}, + ) + result = client.get_default_discovery_config_yaml() + + assert result == yaml_body From b129ec721320c63da82cc9c5a96ee32297f53a9c Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 21 May 2026 10:23:50 +1200 Subject: [PATCH 02/26] test: Remove trivial tests that only exercise Pydantic --- tests/test_ruleset_library.py | 55 ----------------------------------- 1 file changed, 55 deletions(-) diff --git a/tests/test_ruleset_library.py b/tests/test_ruleset_library.py index 3e2e476..1ff939b 100644 --- a/tests/test_ruleset_library.py +++ b/tests/test_ruleset_library.py @@ -70,61 +70,6 @@ def ruleset_library(): ) -def test_ruleset_library_model_dump() -> None: - lib = RulesetLibrary(name="lib1", namespace="ns", yaml="content: true") - assert lib.model_dump(exclude_none=True, by_alias=True, mode="json") == { - "name": "lib1", - "namespace": "ns", - "config_yaml": "content: true", - } - - -def test_ruleset_library_model_dump_no_yaml() -> None: - lib = RulesetLibrary(name="lib1", namespace="ns") - api_dict = lib.model_dump(exclude_none=True, by_alias=True, mode="json") - assert "config_yaml" not in api_dict - assert api_dict == {"name": "lib1", "namespace": "ns"} - - -def test_ruleset_library_model_validate() -> None: - response = { - "id": LIBRARY_ID_1, - "name": "my_library", - "namespace": "org", - "config_yaml": "version: '1.0'", - "is_valid": "valid", - "created": "2025-01-01T12:00:00Z", - "modified": "2025-01-02T12:00:00Z", - } - lib = RulesetLibrary.model_validate(response) - assert lib.id == RulesetLibraryId(LIBRARY_ID_1) - assert lib.name == "my_library" - assert lib.namespace == "org" - assert lib.yaml == "version: '1.0'" - assert lib.is_valid is ValidationStatus.valid - assert lib.created == datetime.fromisoformat("2025-01-01T12:00:00+00:00") - assert lib.modified == datetime.fromisoformat("2025-01-02T12:00:00+00:00") - - -def test_ruleset_library_model_validate_list( - sample_library_list_response: dict[str, Any], -) -> None: - """List responses omit config_yaml, so yaml should be None.""" - result = sample_library_list_response["results"][0] - lib = RulesetLibrary.model_validate(result) - assert lib.yaml is None - assert lib.is_valid is ValidationStatus.valid - - -def test_ruleset_library_equality() -> None: - """Pydantic structural equality compares all fields.""" - lib1 = RulesetLibrary(name="lib", namespace="ns", yaml="content") - lib2 = RulesetLibrary(name="lib", namespace="ns", yaml="content") - lib3 = RulesetLibrary(name="lib", namespace="other", yaml="content") - assert lib1 == lib2 - assert lib1 != lib3 - - def test_list_ruleset_libraries(client: DataMasqueClient, sample_library_list_response: dict[str, Any]) -> None: with requests_mock.Mocker() as m: m.get( From c67c530efba6ae1c0fbb8c3a93188cf96975ff51 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 28 May 2026 10:26:09 +1200 Subject: [PATCH 03/26] feat: Extend discovery-config support across SD/FDD APIs - Add `discovery_config` field to `SchemaDiscoveryRequest` - Add `FileDataDiscoveryRequest` + `FileDataDiscoveryOptions` models - Add `start_file_data_discovery_run()` for the file-data-discovery trigger --- datamasque/client/__init__.py | 4 + datamasque/client/discovery.py | 35 ++++++ datamasque/client/models/discovery.py | 56 +++++++++ datamasque/client/models/discovery_config.py | 27 ++++- tests/test_discovery.py | 121 +++++++++++++++++++ tests/test_discovery_configs.py | 56 ++++++++- 6 files changed, 297 insertions(+), 2 deletions(-) diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index 814ea56..bb6875a 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -54,6 +54,8 @@ from datamasque.client.models.discovery import ( ConstraintColumns, DiscoveryMatch, + FileDataDiscoveryOptions, + FileDataDiscoveryRequest, FileDiscoveryFile, FileDiscoveryLocatorResult, FileDiscoveryMatch, @@ -137,6 +139,8 @@ "DynamoConnectionConfig", "FailedToStartError", "FileConnectionConfig", + "FileDataDiscoveryOptions", + "FileDataDiscoveryRequest", "FileDiscoveryFile", "FileDiscoveryLocatorResult", "FileDiscoveryMatch", diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 19b89f0..e6ba406 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -17,6 +17,7 @@ SelectedFileData, ) from datamasque.client.models.discovery import ( + FileDataDiscoveryRequest, FileDiscoveryResult, FileRulesetGenerationRequest, RulesetGenerationRequest, @@ -230,6 +231,40 @@ def start_schema_discovery_run(self, discovery_config: SchemaDiscoveryRequest) - response=response, ) + def start_file_data_discovery_run(self, request: FileDataDiscoveryRequest) -> RunId: + """ + Starts a file data discovery run with the given configuration. + + Args: + request: A `FileDataDiscoveryRequest` with connection and optional settings. + + Returns: + RunId: The ID of the started discovery run + + Raises: + FailedToStartError: If run fails to start + """ + + data = request.model_dump(exclude_none=True, mode="json") + response = self.make_request( + "POST", + "/api/run-file-data-discovery/", + data=data, + require_status_check=False, + ) + run_data = response.json() + + if response.status_code == 201: + logger.info("File data discovery run %s started successfully", run_data["id"]) + return RunId(run_data["id"]) + + logger.error("File data discovery run failed to start: %s", run_data) + raise FailedToStartError( + f"File data discovery run failed to start " + f"(server responded with status {response.status_code}: {response.text}).", + response=response, + ) + def iter_schema_discovery_results(self, run_id: RunId) -> Iterator[SchemaDiscoveryResult]: """Lazily iterate all schema discovery results for a run via the paginated v2 endpoint.""" diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 7532526..d87a87f 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -6,6 +6,7 @@ from datamasque.client.models.connection import ConnectionConfig, ConnectionId, unwrap_connection_id from datamasque.client.models.data_selection import HashColumnsTableConfig, Locator, UserSelection +from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId, unwrap_discovery_config_id from datamasque.client.models.pagination import Page @@ -35,12 +36,15 @@ class SchemaDiscoveryRequest(BaseModel): Request body for `POST /api/schema-discovery/`. `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. + `discovery_config` (optional) accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`; + when omitted the server's built-in default discovery config is used. Every other field uses the server's default value when omitted. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] + discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None custom_keywords: list[str] = Field(default_factory=list) ignored_keywords: list[str] = Field(default_factory=list) schemas: list[str] = Field(default_factory=list) @@ -54,6 +58,11 @@ class SchemaDiscoveryRequest(BaseModel): def _unwrap_connection(cls, value: Any) -> Any: return unwrap_connection_id(value) + @field_validator("discovery_config", mode="before") + @classmethod + def _unwrap_discovery_config(cls, value: Any) -> Any: + return unwrap_discovery_config_id(value) + class RulesetGenerationRequest(BaseModel): """ @@ -77,6 +86,53 @@ def _unwrap_connection(cls, value: Any) -> Any: return unwrap_connection_id(value) +class FileDataDiscoveryOptions(BaseModel): + """Run options nested under `FileDataDiscoveryRequest.options`.""" + + model_config = ConfigDict(extra="forbid") + + dry_run: Optional[bool] = None + diagnostic_logging: Optional[bool] = None + + +class FileDataDiscoveryRequest(BaseModel): + """ + Request body for `POST /api/run-file-data-discovery/`. + + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. + `discovery_config` (optional) accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`; + when omitted the server's built-in default discovery config is used. + All other fields use the server's default value when omitted. + """ + + model_config = ConfigDict(extra="forbid") + + connection: Union[ConnectionId, ConnectionConfig] + discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None + options: Optional[FileDataDiscoveryOptions] = None + custom_keywords: list[str] = Field(default_factory=list) + ignored_keywords: list[str] = Field(default_factory=list) + disable_built_in_keywords: bool = False + disable_global_custom_keywords: Optional[bool] = None + disable_global_ignored_keywords: Optional[bool] = None + in_data_discovery: Optional[InDataDiscoveryConfig] = None + recurse: Optional[bool] = None + include: Optional[list[str]] = None + skip: Optional[list[str]] = None + encoding: Optional[str] = None + workers: Optional[int] = None + + @field_validator("connection", mode="before") + @classmethod + def _unwrap_connection(cls, value: Any) -> Any: + return unwrap_connection_id(value) + + @field_validator("discovery_config", mode="before") + @classmethod + def _unwrap_discovery_config(cls, value: Any) -> Any: + return unwrap_discovery_config_id(value) + + class FileRulesetGenerationRequest(BaseModel): """ Request body for `POST /api/generate-file-ruleset/`. diff --git a/datamasque/client/models/discovery_config.py b/datamasque/client/models/discovery_config.py index 8c580b3..f0a6b3c 100644 --- a/datamasque/client/models/discovery_config.py +++ b/datamasque/client/models/discovery_config.py @@ -1,11 +1,31 @@ from datetime import datetime -from typing import NewType, Optional +from typing import Any, NewType, Optional from pydantic import BaseModel, ConfigDict, Field +from datamasque.client.models.status import ValidationStatus + DiscoveryConfigId = NewType("DiscoveryConfigId", str) +def unwrap_discovery_config_id(value: Any) -> Any: + """ + Coerce a `DiscoveryConfig` to its `id`; pass other values through unchanged. + + Used by request-model validators that accept either a `DiscoveryConfigId` + or a full `DiscoveryConfig` for user convenience. + Raises `ValueError` if the config has no `id` + (i.e. the caller hasn't yet created it on the server). + """ + + if isinstance(value, DiscoveryConfig): + if value.id is None: + raise ValueError("Discovery config has not been created yet (id is None)") + return value.id + + return value + + class DiscoveryConfig(BaseModel): """Represents a named, persisted YAML discovery configuration.""" @@ -14,5 +34,10 @@ class DiscoveryConfig(BaseModel): name: str yaml: Optional[str] = Field(default=None, alias="config_yaml") id: Optional[DiscoveryConfigId] = None + # Server-managed validation surface, populated by the DataMasque server. + # `is_valid` may be `in_progress` immediately after creating a large config, + # transitioning to `valid` or `invalid` once the server finishes validating. + is_valid: Optional[ValidationStatus] = None + validation_error: Optional[str] = None created: Optional[datetime] = None modified: Optional[datetime] = None diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 2830287..14a2a0b 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -9,6 +9,10 @@ from datamasque.client import ( DataMasqueClient, + DiscoveryConfig, + DiscoveryConfigId, + FileDataDiscoveryOptions, + FileDataDiscoveryRequest, FileRulesetGenerationRequest, RulesetGenerationRequest, RunId, @@ -27,6 +31,8 @@ from datamasque.client.models.status import AsyncRulesetGenerationTaskStatus from tests.helpers import parse_multipart_form +DISCOVERY_CONFIG_ID = "aaaaaaaa-1111-2222-3333-444444444444" + def test_generate_ruleset(client): req = RulesetGenerationRequest(connection="conn-1", selected_columns={"public": {"users": ["email"]}}) @@ -727,3 +733,118 @@ def test_start_schema_discovery_run_raises_on_non_201(client): ) with pytest.raises(FailedToStartError, match="Schema discovery run failed to start"): client.start_schema_discovery_run(SchemaDiscoveryRequest(connection="nope")) + + +def test_schema_discovery_request_accepts_discovery_config_id(): + """A `DiscoveryConfigId` string passes through unchanged in the request body.""" + req = SchemaDiscoveryRequest( + connection="conn-1", + discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), + ) + dumped = req.model_dump(exclude_none=True, mode="json") + assert dumped["discovery_config"] == DISCOVERY_CONFIG_ID + + +def test_schema_discovery_request_unwraps_discovery_config_model(): + """Passing a full `DiscoveryConfig` object substitutes its `id` for the wire payload.""" + config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + req = SchemaDiscoveryRequest(connection="conn-1", discovery_config=config) + assert req.model_dump(exclude_none=True, mode="json")["discovery_config"] == DISCOVERY_CONFIG_ID + + +def test_schema_discovery_request_rejects_unsaved_discovery_config(): + """A `DiscoveryConfig` without an `id` cannot be used yet — raises immediately.""" + config = DiscoveryConfig(name="my_cfg") + with pytest.raises(ValueError, match="id is None"): + SchemaDiscoveryRequest(connection="conn-1", discovery_config=config) + + +def test_start_schema_discovery_run_sends_discovery_config(client): + """`start_schema_discovery_run` posts the discovery_config id to the server.""" + req = SchemaDiscoveryRequest( + connection="conn-1", + discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), + ) + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/schema-discovery/", + json={"id": 11}, + status_code=201, + ) + assert client.start_schema_discovery_run(req) == 11 + + assert m.last_request.json()["discovery_config"] == DISCOVERY_CONFIG_ID + + +def test_start_file_data_discovery_run_minimal(client): + """A minimal FDD request — only `connection` set — round-trips through the server.""" + req = FileDataDiscoveryRequest(connection="conn-1") + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/run-file-data-discovery/", + json={"id": 42}, + status_code=201, + ) + run_id = client.start_file_data_discovery_run(req) + + assert run_id == 42 + body = m.last_request.json() + assert body["connection"] == "conn-1" + assert "discovery_config" not in body + + +def test_start_file_data_discovery_run_full(client): + """All FDD request fields populate the wire payload and pass through unwrap helpers.""" + config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + req = FileDataDiscoveryRequest( + connection="conn-1", + discovery_config=config, + options=FileDataDiscoveryOptions(dry_run=True, diagnostic_logging=True), + custom_keywords=["foo"], + ignored_keywords=["bar"], + disable_built_in_keywords=True, + disable_global_custom_keywords=True, + disable_global_ignored_keywords=False, + in_data_discovery={"enabled": True, "row_sample_size": 50}, + recurse=True, + include=["*.csv"], + skip=["**/tmp/**"], + encoding="utf-8", + workers=4, + ) + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/run-file-data-discovery/", + json={"id": 99}, + status_code=201, + ) + assert client.start_file_data_discovery_run(req) == 99 + + body = m.last_request.json() + assert body == { + "connection": "conn-1", + "discovery_config": DISCOVERY_CONFIG_ID, + "options": {"dry_run": True, "diagnostic_logging": True}, + "custom_keywords": ["foo"], + "ignored_keywords": ["bar"], + "disable_built_in_keywords": True, + "disable_global_custom_keywords": True, + "disable_global_ignored_keywords": False, + "in_data_discovery": {"enabled": True, "row_sample_size": 50}, + "recurse": True, + "include": ["*.csv"], + "skip": ["**/tmp/**"], + "encoding": "utf-8", + "workers": 4, + } + + +def test_start_file_data_discovery_run_raises_on_non_201(client): + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/run-file-data-discovery/", + json={"detail": "connection not found"}, + status_code=400, + ) + with pytest.raises(FailedToStartError, match="File data discovery run failed to start"): + client.start_file_data_discovery_run(FileDataDiscoveryRequest(connection="nope")) diff --git a/tests/test_discovery_configs.py b/tests/test_discovery_configs.py index 2c9e71c..f98893e 100644 --- a/tests/test_discovery_configs.py +++ b/tests/test_discovery_configs.py @@ -8,7 +8,12 @@ from datamasque.client import DataMasqueClient from datamasque.client.exceptions import DataMasqueApiError, DataMasqueException -from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId +from datamasque.client.models.discovery_config import ( + DiscoveryConfig, + DiscoveryConfigId, + unwrap_discovery_config_id, +) +from datamasque.client.models.status import ValidationStatus CONFIG_ID_1 = "aaaaaaaa-1111-2222-3333-444444444444" CONFIG_ID_2 = "bbbbbbbb-1111-2222-3333-444444444444" @@ -428,3 +433,52 @@ def test_get_default_discovery_config_yaml(client: DataMasqueClient) -> None: result = client.get_default_discovery_config_yaml() assert result == yaml_body + + +def test_discovery_config_parses_validation_fields() -> None: + """`is_valid` and `validation_error` round-trip from API responses.""" + config = DiscoveryConfig.model_validate( + { + "id": CONFIG_ID_1, + "name": "my_config", + "config_yaml": "labels: []", + "is_valid": "invalid", + "validation_error": "bad shape on line 3", + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-02T12:00:00Z", + } + ) + + assert config.is_valid is ValidationStatus.invalid + assert config.validation_error == "bad shape on line 3" + + +def test_discovery_config_validation_fields_optional() -> None: + """Older / lighter API responses without validation fields still parse.""" + config = DiscoveryConfig.model_validate( + { + "id": CONFIG_ID_1, + "name": "my_config", + "created": "2025-01-01T12:00:00Z", + "modified": "2025-01-02T12:00:00Z", + } + ) + + assert config.is_valid is None + assert config.validation_error is None + + +def test_unwrap_discovery_config_id_passes_through_strings() -> None: + assert unwrap_discovery_config_id(CONFIG_ID_1) == CONFIG_ID_1 + assert unwrap_discovery_config_id(None) is None + + +def test_unwrap_discovery_config_id_extracts_id_from_model() -> None: + config = DiscoveryConfig(name="x", id=DiscoveryConfigId(CONFIG_ID_1)) + assert unwrap_discovery_config_id(config) == CONFIG_ID_1 + + +def test_unwrap_discovery_config_id_raises_without_id() -> None: + config = DiscoveryConfig(name="x") + with pytest.raises(ValueError, match="id is None"): + unwrap_discovery_config_id(config) From 0f386c2e3be7fdb91609bba42bc6a91848a0a61f Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 28 May 2026 11:07:55 +1200 Subject: [PATCH 04/26] ci: Add ref and dev_version inputs to TestPyPI release workflow --- .github/workflows/release-testpypi.yml | 43 ++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-testpypi.yml b/.github/workflows/release-testpypi.yml index 4935a55..9bff756 100644 --- a/.github/workflows/release-testpypi.yml +++ b/.github/workflows/release-testpypi.yml @@ -2,6 +2,17 @@ name: Release (TestPyPI) on: workflow_dispatch: + inputs: + ref: + description: 'Branch, tag, or commit SHA to build from' + required: true + default: 'main' + dev_version: + description: >- + Optional PEP 440 pre/dev version override (e.g. 1.1.0.dev1, 1.1.0a2, 1.1.0rc1). + Leave blank to use the version in pyproject.toml as-is. + required: false + default: '' jobs: build: @@ -9,6 +20,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} - name: Set up uv uses: astral-sh/setup-uv@v5 @@ -18,11 +31,37 @@ jobs: - name: Set up Python run: uv python install 3.12 + - name: Apply dev_version override + if: inputs.dev_version != '' + # Pass the input through the environment, never interpolated into the script body: + # `${{ }}` is expanded into the script text before the shell runs, so splicing it inline + # would let a crafted dev_version inject shell commands before the validation below could reject it. + env: + DEV_VERSION: ${{ inputs.dev_version }} + run: | + # Reject anything that isn't a pre-release / dev version — final releases must go through release.yml. + if ! printf '%s' "${DEV_VERSION}" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+(a|b|rc|\.dev)[0-9]+$'; then + echo "::error::dev_version '${DEV_VERSION}' is not a PEP 440 pre/dev version (must end in aN, bN, rcN, or .devN)" + exit 1 + fi + uv run python -c " + import os, re, pathlib + version = os.environ['DEV_VERSION'] + path = pathlib.Path('pyproject.toml') + text = path.read_text() + new = re.sub(r'^version\s*=\s*\".*\"', f'version = \"{version}\"', text, count=1, flags=re.M) + if new == text: + raise SystemExit('Failed to locate version line in pyproject.toml') + path.write_text(new) + " + - name: Show package version + env: + REF: ${{ inputs.ref }} run: | VERSION="$(uv run python -c 'import tomllib; print(tomllib.loads(open("pyproject.toml","rb").read().decode())["project"]["version"])')" - echo "Publishing version: ${VERSION}" - echo "::notice title=TestPyPI version::${VERSION}" + echo "Publishing version: ${VERSION} (from ref ${REF})" + echo "::notice title=TestPyPI version::${VERSION} from ${REF}" - name: Build run: uv build From 4edf1fcb0c7005ce63cf5faaf7bed0e6a3e4e2cf Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 28 May 2026 11:08:05 +1200 Subject: [PATCH 05/26] chore: Bump version to 1.1.0.dev0 for epic dev publishes --- pyproject.toml | 2 +- setup.cfg | 2 +- uv.lock | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5fdc7d0..7ba57f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "datamasque-python" -version = "1.0.5" +version = "1.1.0.dev0" description = "Official Python client for the DataMasque data-masking API." authors = [ { name = "DataMasque Ltd" }, diff --git a/setup.cfg b/setup.cfg index f04494b..24d750a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.4 +current_version = 1.1.0.dev0 commit = True tag = True diff --git a/uv.lock b/uv.lock index 6fe9d57..31e21ef 100644 --- a/uv.lock +++ b/uv.lock @@ -428,7 +428,7 @@ toml = [ [[package]] name = "datamasque-python" -version = "1.0.3" +version = "1.1.0.dev0" source = { editable = "." } dependencies = [ { name = "pydantic" }, From 6f77b0ce3a88094a8c9593d520aa59721410abdf Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 28 May 2026 11:47:57 +1200 Subject: [PATCH 06/26] feat: Raise InvalidDiscoveryConfigError on SD/FDD start failures The server returns a 400 keyed under `discovery_config` when the referenced config is missing, archived, or not in `valid` validation state. Surface that as a specific exception subclass. --- datamasque/client/__init__.py | 2 + datamasque/client/discovery.py | 30 +++++++++++++-- datamasque/client/exceptions.py | 8 ++++ tests/test_discovery.py | 68 +++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 4 deletions(-) diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index bb6875a..2678e54 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -19,6 +19,7 @@ DataMasqueUserError, FailedToStartError, IfmAuthError, + InvalidDiscoveryConfigError, InvalidLibraryError, InvalidRulesetError, RunNotCancellableError, @@ -158,6 +159,7 @@ "IfmTokenInfo", "InDataDiscoveryConfig", "InDataDiscoveryRule", + "InvalidDiscoveryConfigError", "InvalidLibraryError", "InvalidRulesetError", "JsonPath", diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index e6ba406..8e30d7c 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -4,11 +4,14 @@ from pathlib import Path from typing import Iterator, Optional, Union +from requests import Response + from datamasque.client.base import BaseClient, UploadFile from datamasque.client.exceptions import ( AsyncRulesetGenerationInProgressError, DataMasqueException, FailedToStartError, + InvalidDiscoveryConfigError, ) from datamasque.client.models.connection import ConnectionId from datamasque.client.models.data_selection import ( @@ -208,7 +211,9 @@ def start_schema_discovery_run(self, discovery_config: SchemaDiscoveryRequest) - RunId: The ID of the started discovery run Raises: - FailedToStartError: If run fails to start + InvalidDiscoveryConfigError: the run failed to start because the referenced + discovery config is missing, archived, or not in a `valid` validation state. + FailedToStartError: the run failed to start for any other reason. """ data = discovery_config.model_dump(exclude_none=True, mode="json") @@ -218,13 +223,14 @@ def start_schema_discovery_run(self, discovery_config: SchemaDiscoveryRequest) - data=data, require_status_check=False, ) - run_data = response.json() + run_data = response.json() if response.content else {} if response.status_code == 201: logger.info("Schema discovery run %s started successfully", run_data["id"]) return RunId(run_data["id"]) logger.error("Schema discovery run failed to start: %s", run_data) + self._maybe_raise_discovery_config_error(run_data, response, "Schema discovery") raise FailedToStartError( f"Schema discovery run failed to start " f"(server responded with status {response.status_code}: {response.text}).", @@ -242,7 +248,9 @@ def start_file_data_discovery_run(self, request: FileDataDiscoveryRequest) -> Ru RunId: The ID of the started discovery run Raises: - FailedToStartError: If run fails to start + InvalidDiscoveryConfigError: the run failed to start because the referenced + discovery config is missing, archived, or not in a `valid` validation state. + FailedToStartError: the run failed to start for any other reason. """ data = request.model_dump(exclude_none=True, mode="json") @@ -252,19 +260,33 @@ def start_file_data_discovery_run(self, request: FileDataDiscoveryRequest) -> Ru data=data, require_status_check=False, ) - run_data = response.json() + run_data = response.json() if response.content else {} if response.status_code == 201: logger.info("File data discovery run %s started successfully", run_data["id"]) return RunId(run_data["id"]) logger.error("File data discovery run failed to start: %s", run_data) + self._maybe_raise_discovery_config_error(run_data, response, "File data discovery") raise FailedToStartError( f"File data discovery run failed to start " f"(server responded with status {response.status_code}: {response.text}).", response=response, ) + @staticmethod + def _maybe_raise_discovery_config_error(run_data: object, response: Response, run_kind: str) -> None: + """Raise `InvalidDiscoveryConfigError` if the server's 400 body cites `discovery_config`.""" + if not isinstance(run_data, dict) or "discovery_config" not in run_data: + return + + errors = run_data["discovery_config"] + detail = errors[0] if isinstance(errors, list) and errors else str(errors) + raise InvalidDiscoveryConfigError( + f"{run_kind} run failed to start due to discovery config error: {detail}", + response=response, + ) + def iter_schema_discovery_results(self, run_id: RunId) -> Iterator[SchemaDiscoveryResult]: """Lazily iterate all schema discovery results for a run via the paginated v2 endpoint.""" diff --git a/datamasque/client/exceptions.py b/datamasque/client/exceptions.py index 05fe339..f034c4d 100644 --- a/datamasque/client/exceptions.py +++ b/datamasque/client/exceptions.py @@ -41,6 +41,14 @@ class InvalidLibraryError(FailedToStartError): """Specific error for when runs fail to start due to having an invalid ruleset library.""" +class InvalidDiscoveryConfigError(FailedToStartError): + """ + Raised when a discovery run fails to start due to an unusable discovery config. + + The referenced config is missing, archived, or not in a `valid` validation state. + """ + + class DataMasqueTransportError(DataMasqueException): """ Raised when a request to the DataMasque server fails before any response is received. diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 14a2a0b..0cab7f9 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -25,6 +25,7 @@ DataMasqueApiError, DataMasqueException, FailedToStartError, + InvalidDiscoveryConfigError, ) from datamasque.client.models.connection import ConnectionId, DatabaseConnectionConfig, DatabaseType from datamasque.client.models.data_selection import SelectedColumns, SelectedFileData, UserSelection @@ -848,3 +849,70 @@ def test_start_file_data_discovery_run_raises_on_non_201(client): ) with pytest.raises(FailedToStartError, match="File data discovery run failed to start"): client.start_file_data_discovery_run(FileDataDiscoveryRequest(connection="nope")) + + +def test_start_schema_discovery_raises_invalid_discovery_config_when_not_valid(client): + """A 400 with a `discovery_config` validation message raises the specific subclass.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/schema-discovery/", + json={ + "discovery_config": ['Discovery config "my_cfg" cannot be used: validation status is `invalid`.'], + }, + status_code=400, + ) + with pytest.raises( + InvalidDiscoveryConfigError, + match=r'Schema discovery run failed to start due to discovery config error: .*"my_cfg".*invalid', + ): + client.start_schema_discovery_run( + SchemaDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + ) + + +def test_start_schema_discovery_raises_invalid_discovery_config_when_missing(client): + """A 400 from DRF's PrimaryKeyRelatedField (config not found / archived) is also classified.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/schema-discovery/", + json={"discovery_config": [f'Invalid pk "{DISCOVERY_CONFIG_ID}" - object does not exist.']}, + status_code=400, + ) + with pytest.raises(InvalidDiscoveryConfigError, match="object does not exist"): + client.start_schema_discovery_run( + SchemaDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + ) + + +def test_start_file_data_discovery_raises_invalid_discovery_config(client): + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/run-file-data-discovery/", + json={ + "discovery_config": ['Discovery config "my_cfg" cannot be used: validation status is `in_progress`.'], + }, + status_code=400, + ) + with pytest.raises( + InvalidDiscoveryConfigError, + match=r"File data discovery run failed to start due to discovery config error: .*in_progress", + ): + client.start_file_data_discovery_run( + FileDataDiscoveryRequest( + connection="conn-1", + discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), + ), + ) + + +def test_start_schema_discovery_non_discovery_config_400_still_raises_generic_error(client): + """Other 400s (e.g. unknown connection) keep raising the base FailedToStartError.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/schema-discovery/", + json={"connection": ['Invalid pk "nope" - object does not exist.']}, + status_code=400, + ) + with pytest.raises(FailedToStartError) as exc_info: + client.start_schema_discovery_run(SchemaDiscoveryRequest(connection="nope")) + assert not isinstance(exc_info.value, InvalidDiscoveryConfigError) From 8646c123742753a88721d6b962f5612159872534 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 28 May 2026 16:39:29 +1200 Subject: [PATCH 07/26] fix: Implement correct file filter syntax and ignore_rules for FDD --- datamasque/client/__init__.py | 4 +++ datamasque/client/models/discovery.py | 37 ++++++++++++++++++-- tests/test_discovery.py | 49 ++++++++++++++++++++++++--- 3 files changed, 83 insertions(+), 7 deletions(-) diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index 2678e54..2f261f4 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -61,6 +61,8 @@ FileDiscoveryLocatorResult, FileDiscoveryMatch, FileDiscoveryResult, + FileFilter, + FileFilterMatchAgainst, FileRulesetGenerationRequest, ForeignKeyRef, InDataDiscoveryConfig, @@ -146,6 +148,8 @@ "FileDiscoveryLocatorResult", "FileDiscoveryMatch", "FileDiscoveryResult", + "FileFilter", + "FileFilterMatchAgainst", "FileId", "FileOrContent", "FileRulesetGenerationRequest", diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index d87a87f..d249cf2 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -1,8 +1,9 @@ """Typed request and response shapes for schema-discovery and ruleset-generation endpoints.""" +from enum import Enum from typing import Any, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from datamasque.client.models.connection import ConnectionConfig, ConnectionId, unwrap_connection_id from datamasque.client.models.data_selection import HashColumnsTableConfig, Locator, UserSelection @@ -28,6 +29,7 @@ class InDataDiscoveryConfig(BaseModel): row_sample_size: Optional[int] = None custom_rules: Optional[list[InDataDiscoveryRule]] = None non_sensitive_rules: Optional[list[InDataDiscoveryRule]] = None + ignore_rules: Optional[list[InDataDiscoveryRule]] = None force: Optional[bool] = None @@ -86,6 +88,35 @@ def _unwrap_connection(cls, value: Any) -> Any: return unwrap_connection_id(value) +class FileFilterMatchAgainst(Enum): + """Which part of a file's path an `include`/`skip` filter is matched against.""" + + path = "path" + filename = "filename" + + +class FileFilter(BaseModel): + """ + A single `include` or `skip` filter for file data discovery. + + Exactly one of `glob` or `regex` must be set. + `match_against` selects whether the pattern is applied to the full path or just the filename + (defaults to the full path when omitted). + """ + + model_config = ConfigDict(extra="forbid") + + glob: Optional[str] = Field(default=None, min_length=1) + regex: Optional[str] = Field(default=None, min_length=1) + match_against: Optional[FileFilterMatchAgainst] = None + + @model_validator(mode="after") + def _check_glob_xor_regex(self) -> "FileFilter": + if (self.glob is None) == (self.regex is None): + raise ValueError("A `FileFilter` must set exactly one of `glob` or `regex`.") + return self + + class FileDataDiscoveryOptions(BaseModel): """Run options nested under `FileDataDiscoveryRequest.options`.""" @@ -117,8 +148,8 @@ class FileDataDiscoveryRequest(BaseModel): disable_global_ignored_keywords: Optional[bool] = None in_data_discovery: Optional[InDataDiscoveryConfig] = None recurse: Optional[bool] = None - include: Optional[list[str]] = None - skip: Optional[list[str]] = None + include: Optional[list[FileFilter]] = None + skip: Optional[list[FileFilter]] = None encoding: Optional[str] = None workers: Optional[int] = None diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 0cab7f9..7c36ce8 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -6,6 +6,7 @@ import pytest import requests_mock +from pydantic import ValidationError from datamasque.client import ( DataMasqueClient, @@ -13,7 +14,10 @@ DiscoveryConfigId, FileDataDiscoveryOptions, FileDataDiscoveryRequest, + FileFilter, + FileFilterMatchAgainst, FileRulesetGenerationRequest, + InDataDiscoveryConfig, RulesetGenerationRequest, RunId, SchemaDiscoveryPage, @@ -808,8 +812,8 @@ def test_start_file_data_discovery_run_full(client): disable_global_ignored_keywords=False, in_data_discovery={"enabled": True, "row_sample_size": 50}, recurse=True, - include=["*.csv"], - skip=["**/tmp/**"], + include=[{"glob": "*.csv"}], + skip=[{"regex": r".*/tmp/.*", "match_against": "path"}], encoding="utf-8", workers=4, ) @@ -833,13 +837,50 @@ def test_start_file_data_discovery_run_full(client): "disable_global_ignored_keywords": False, "in_data_discovery": {"enabled": True, "row_sample_size": 50}, "recurse": True, - "include": ["*.csv"], - "skip": ["**/tmp/**"], + "include": [{"glob": "*.csv"}], + "skip": [{"regex": r".*/tmp/.*", "match_against": "path"}], "encoding": "utf-8", "workers": 4, } +def test_file_filter_requires_exactly_one_of_glob_or_regex(): + """A `FileFilter` with neither, or both, of `glob`/`regex` is rejected.""" + FileFilter(glob="*.csv") + FileFilter(regex=r".*\.csv") + FileFilter(glob="*.csv", match_against=FileFilterMatchAgainst.filename) + + with pytest.raises(ValidationError, match="exactly one of `glob` or `regex`"): + FileFilter() + + with pytest.raises(ValidationError, match="exactly one of `glob` or `regex`"): + FileFilter(glob="*.csv", regex=r".*\.csv") + + +def test_file_filter_rejects_empty_pattern(): + """An empty `glob`/`regex` is rejected rather than silently treated as unset.""" + with pytest.raises(ValidationError): + FileFilter(glob="") + + with pytest.raises(ValidationError): + FileFilter(regex="", glob="*.csv") + + +def test_file_data_discovery_ignore_rules_serialize(): + """`in_data_discovery.ignore_rules` round-trips into the wire payload.""" + req = FileDataDiscoveryRequest( + connection="conn-1", + in_data_discovery=InDataDiscoveryConfig( + enabled=True, + custom_rules=[{"name": "cc", "pattern": r"^1234"}], + non_sensitive_rules=[{"pattern": r"^5555"}], + ignore_rules=[{"pattern": r"^4321"}], + ), + ) + dumped = req.model_dump(exclude_none=True, mode="json") + assert dumped["in_data_discovery"]["ignore_rules"] == [{"pattern": r"^4321"}] + + def test_start_file_data_discovery_run_raises_on_non_201(client): with requests_mock.Mocker() as m: m.post( From 38827c4e8ae266c1f156445e52c2cd60f8d0280f Mon Sep 17 00:00:00 2001 From: Andrew Chester Date: Tue, 9 Jun 2026 17:27:22 +1200 Subject: [PATCH 08/26] feat: Add v2 saved-config discovery endpoints, keep v1 keyword-only --- HISTORY.rst | 8 ++ datamasque/client/__init__.py | 4 + datamasque/client/discovery.py | 91 +++++++++++++- datamasque/client/models/discovery.py | 56 ++++++++- tests/test_discovery.py | 170 +++++++++++++++++--------- 5 files changed, 262 insertions(+), 67 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 73bb66e..a0655fc 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,14 @@ History ======= +1.1.0 (unreleased) +------------------ + +* Added discovery-config management APIs on ``DataMasqueClient`` (``list_discovery_configs``, ``get_discovery_config``, ``create_discovery_config``, ``update_discovery_config``, and friends), backed by the ``DiscoveryConfig`` / ``DiscoveryConfigId`` models. +* Added ``start_schema_discovery_run_v2`` and ``start_file_data_discovery_run_v2`` for starting discovery runs from a saved discovery config, via the ``/api/schema-discovery/v2/`` and ``/api/run-file-data-discovery/v2/`` endpoints. The discovery config is a full override of all detection options, so the v2 request models (``SchemaDiscoveryV2Request``, ``FileDataDiscoveryV2Request``) reject the legacy keyword/schema/in-data-discovery fields. +* The v1 ``start_schema_discovery_run`` and ``start_file_data_discovery_run`` endpoints are unchanged and do not accept a discovery config; ``start_file_data_discovery_run`` (with ``FileDataDiscoveryRequest``, the ``FileFilter`` include/skip model, and ``FileDataDiscoveryOptions``) is newly exposed for ad-hoc file data discovery. +* Added ``InvalidDiscoveryConfigError``, raised by the v2 run-start methods when the referenced config is missing, archived, or not in a ``valid`` validation state. + 1.0.5 (2026-06-18) ------------------ diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index 2f261f4..b9a4b85 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -57,6 +57,7 @@ DiscoveryMatch, FileDataDiscoveryOptions, FileDataDiscoveryRequest, + FileDataDiscoveryV2Request, FileDiscoveryFile, FileDiscoveryLocatorResult, FileDiscoveryMatch, @@ -73,6 +74,7 @@ SchemaDiscoveryPage, SchemaDiscoveryRequest, SchemaDiscoveryResult, + SchemaDiscoveryV2Request, TableConstraints, ) from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId @@ -144,6 +146,7 @@ "FileConnectionConfig", "FileDataDiscoveryOptions", "FileDataDiscoveryRequest", + "FileDataDiscoveryV2Request", "FileDiscoveryFile", "FileDiscoveryLocatorResult", "FileDiscoveryMatch", @@ -198,6 +201,7 @@ "SchemaDiscoveryPage", "SchemaDiscoveryRequest", "SchemaDiscoveryResult", + "SchemaDiscoveryV2Request", "SeedFile", "SelectedColumns", "SelectedData", diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 8e30d7c..ca2a8ff 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -21,12 +21,14 @@ ) from datamasque.client.models.discovery import ( FileDataDiscoveryRequest, + FileDataDiscoveryV2Request, FileDiscoveryResult, FileRulesetGenerationRequest, RulesetGenerationRequest, SchemaDiscoveryPage, SchemaDiscoveryRequest, SchemaDiscoveryResult, + SchemaDiscoveryV2Request, ) from datamasque.client.models.ruleset import Ruleset from datamasque.client.models.runs import RunId @@ -211,9 +213,7 @@ def start_schema_discovery_run(self, discovery_config: SchemaDiscoveryRequest) - RunId: The ID of the started discovery run Raises: - InvalidDiscoveryConfigError: the run failed to start because the referenced - discovery config is missing, archived, or not in a `valid` validation state. - FailedToStartError: the run failed to start for any other reason. + FailedToStartError: If run fails to start """ data = discovery_config.model_dump(exclude_none=True, mode="json") @@ -223,14 +223,13 @@ def start_schema_discovery_run(self, discovery_config: SchemaDiscoveryRequest) - data=data, require_status_check=False, ) - run_data = response.json() if response.content else {} + run_data = response.json() if response.status_code == 201: logger.info("Schema discovery run %s started successfully", run_data["id"]) return RunId(run_data["id"]) logger.error("Schema discovery run failed to start: %s", run_data) - self._maybe_raise_discovery_config_error(run_data, response, "Schema discovery") raise FailedToStartError( f"Schema discovery run failed to start " f"(server responded with status {response.status_code}: {response.text}).", @@ -247,6 +246,44 @@ def start_file_data_discovery_run(self, request: FileDataDiscoveryRequest) -> Ru Returns: RunId: The ID of the started discovery run + Raises: + FailedToStartError: If run fails to start + """ + + data = request.model_dump(exclude_none=True, mode="json") + response = self.make_request( + "POST", + "/api/run-file-data-discovery/", + data=data, + require_status_check=False, + ) + run_data = response.json() + + if response.status_code == 201: + logger.info("File data discovery run %s started successfully", run_data["id"]) + return RunId(run_data["id"]) + + logger.error("File data discovery run failed to start: %s", run_data) + raise FailedToStartError( + f"File data discovery run failed to start " + f"(server responded with status {response.status_code}: {response.text}).", + response=response, + ) + + def start_schema_discovery_run_v2(self, request: SchemaDiscoveryV2Request) -> RunId: + """ + Starts a schema discovery run from a saved discovery config (v2 API). + + All detection options come from the saved discovery config referenced by `request.discovery_config`; + the legacy keyword/schema/in-data-discovery options accepted by `start_schema_discovery_run` + are not accepted here. + + Args: + request: A `SchemaDiscoveryV2Request` with the connection and saved discovery-config. + + Returns: + RunId: The ID of the started discovery run + Raises: InvalidDiscoveryConfigError: the run failed to start because the referenced discovery config is missing, archived, or not in a `valid` validation state. @@ -256,7 +293,49 @@ def start_file_data_discovery_run(self, request: FileDataDiscoveryRequest) -> Ru data = request.model_dump(exclude_none=True, mode="json") response = self.make_request( "POST", - "/api/run-file-data-discovery/", + "/api/schema-discovery/v2/", + data=data, + require_status_check=False, + ) + run_data = response.json() if response.content else {} + + if response.status_code == 201: + logger.info("Schema discovery run %s started successfully", run_data["id"]) + return RunId(run_data["id"]) + + logger.error("Schema discovery run failed to start: %s", run_data) + self._maybe_raise_discovery_config_error(run_data, response, "Schema discovery") + raise FailedToStartError( + f"Schema discovery run failed to start " + f"(server responded with status {response.status_code}: {response.text}).", + response=response, + ) + + def start_file_data_discovery_run_v2(self, request: FileDataDiscoveryV2Request) -> RunId: + """ + Starts a file data discovery run from a saved discovery config (v2 API). + + Detection options come from the saved discovery config referenced by `request.discovery_config`; + the file-handling parameters (`recurse`, `include`, `skip`, `encoding`, `workers`) and run + `options` are still accepted, since the discovery config does not own them. + + Args: + request: A `FileDataDiscoveryV2Request` with the connection, saved discovery-config, and + optional file-handling parameters. + + Returns: + RunId: The ID of the started discovery run + + Raises: + InvalidDiscoveryConfigError: the run failed to start because the referenced + discovery config is missing, archived, or not in a `valid` validation state. + FailedToStartError: the run failed to start for any other reason. + """ + + data = request.model_dump(exclude_none=True, mode="json") + response = self.make_request( + "POST", + "/api/run-file-data-discovery/v2/", data=data, require_status_check=False, ) diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index d249cf2..267f145 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -38,15 +38,12 @@ class SchemaDiscoveryRequest(BaseModel): Request body for `POST /api/schema-discovery/`. `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - `discovery_config` (optional) accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`; - when omitted the server's built-in default discovery config is used. Every other field uses the server's default value when omitted. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] - discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None custom_keywords: list[str] = Field(default_factory=list) ignored_keywords: list[str] = Field(default_factory=list) schemas: list[str] = Field(default_factory=list) @@ -60,6 +57,27 @@ class SchemaDiscoveryRequest(BaseModel): def _unwrap_connection(cls, value: Any) -> Any: return unwrap_connection_id(value) + +class SchemaDiscoveryV2Request(BaseModel): + """ + Request body for `POST /api/schema-discovery/v2/` (start a run from a saved discovery config). + + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. + `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, and is a full + override: it supplies every detection option, so the legacy keyword/schema/in-data-discovery fields + accepted by `SchemaDiscoveryRequest` are rejected here. + """ + + model_config = ConfigDict(extra="forbid") + + connection: Union[ConnectionId, ConnectionConfig] + discovery_config: Union[DiscoveryConfigId, DiscoveryConfig] + + @field_validator("connection", mode="before") + @classmethod + def _unwrap_connection(cls, value: Any) -> Any: + return unwrap_connection_id(value) + @field_validator("discovery_config", mode="before") @classmethod def _unwrap_discovery_config(cls, value: Any) -> Any: @@ -131,15 +149,12 @@ class FileDataDiscoveryRequest(BaseModel): Request body for `POST /api/run-file-data-discovery/`. `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - `discovery_config` (optional) accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`; - when omitted the server's built-in default discovery config is used. All other fields use the server's default value when omitted. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] - discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None options: Optional[FileDataDiscoveryOptions] = None custom_keywords: list[str] = Field(default_factory=list) ignored_keywords: list[str] = Field(default_factory=list) @@ -158,6 +173,35 @@ class FileDataDiscoveryRequest(BaseModel): def _unwrap_connection(cls, value: Any) -> Any: return unwrap_connection_id(value) + +class FileDataDiscoveryV2Request(BaseModel): + """ + Request body for `POST /api/run-file-data-discovery/v2/` (start a run from a saved discovery config). + + `discovery_config` is a full override of the detection options, so the legacy keyword and + in-data-discovery fields accepted by `FileDataDiscoveryRequest` are rejected here. + The file-handling parameters (`recurse`, `include`, `skip`, `encoding`, `workers`) and run + `options` remain accepted, since the discovery config does not own them. + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig`, and `discovery_config` + accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`. + """ + + model_config = ConfigDict(extra="forbid") + + connection: Union[ConnectionId, ConnectionConfig] + discovery_config: Union[DiscoveryConfigId, DiscoveryConfig] + options: Optional[FileDataDiscoveryOptions] = None + recurse: Optional[bool] = None + include: Optional[list[FileFilter]] = None + skip: Optional[list[FileFilter]] = None + encoding: Optional[str] = None + workers: Optional[int] = None + + @field_validator("connection", mode="before") + @classmethod + def _unwrap_connection(cls, value: Any) -> Any: + return unwrap_connection_id(value) + @field_validator("discovery_config", mode="before") @classmethod def _unwrap_discovery_config(cls, value: Any) -> Any: diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 7c36ce8..6eb2643 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -14,6 +14,7 @@ DiscoveryConfigId, FileDataDiscoveryOptions, FileDataDiscoveryRequest, + FileDataDiscoveryV2Request, FileFilter, FileFilterMatchAgainst, FileRulesetGenerationRequest, @@ -23,6 +24,7 @@ SchemaDiscoveryPage, SchemaDiscoveryRequest, SchemaDiscoveryResult, + SchemaDiscoveryV2Request, ) from datamasque.client.exceptions import ( AsyncRulesetGenerationInProgressError, @@ -740,45 +742,19 @@ def test_start_schema_discovery_run_raises_on_non_201(client): client.start_schema_discovery_run(SchemaDiscoveryRequest(connection="nope")) -def test_schema_discovery_request_accepts_discovery_config_id(): - """A `DiscoveryConfigId` string passes through unchanged in the request body.""" - req = SchemaDiscoveryRequest( - connection="conn-1", - discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), - ) - dumped = req.model_dump(exclude_none=True, mode="json") - assert dumped["discovery_config"] == DISCOVERY_CONFIG_ID - +# --- v1 (legacy) discovery: no discovery_config accepted --- -def test_schema_discovery_request_unwraps_discovery_config_model(): - """Passing a full `DiscoveryConfig` object substitutes its `id` for the wire payload.""" - config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) - req = SchemaDiscoveryRequest(connection="conn-1", discovery_config=config) - assert req.model_dump(exclude_none=True, mode="json")["discovery_config"] == DISCOVERY_CONFIG_ID - - -def test_schema_discovery_request_rejects_unsaved_discovery_config(): - """A `DiscoveryConfig` without an `id` cannot be used yet — raises immediately.""" - config = DiscoveryConfig(name="my_cfg") - with pytest.raises(ValueError, match="id is None"): - SchemaDiscoveryRequest(connection="conn-1", discovery_config=config) +def test_schema_discovery_request_rejects_discovery_config(): + """The v1 schema-discovery request no longer accepts `discovery_config` (it is v2-only).""" + with pytest.raises(ValidationError): + SchemaDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) -def test_start_schema_discovery_run_sends_discovery_config(client): - """`start_schema_discovery_run` posts the discovery_config id to the server.""" - req = SchemaDiscoveryRequest( - connection="conn-1", - discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), - ) - with requests_mock.Mocker() as m: - m.post( - "http://test-server/api/schema-discovery/", - json={"id": 11}, - status_code=201, - ) - assert client.start_schema_discovery_run(req) == 11 - assert m.last_request.json()["discovery_config"] == DISCOVERY_CONFIG_ID +def test_file_data_discovery_request_rejects_discovery_config(): + """The v1 file-data-discovery request no longer accepts `discovery_config` (it is v2-only).""" + with pytest.raises(ValidationError): + FileDataDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) def test_start_file_data_discovery_run_minimal(client): @@ -799,11 +775,9 @@ def test_start_file_data_discovery_run_minimal(client): def test_start_file_data_discovery_run_full(client): - """All FDD request fields populate the wire payload and pass through unwrap helpers.""" - config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + """All legacy FDD request fields populate the wire payload and pass through unwrap helpers.""" req = FileDataDiscoveryRequest( connection="conn-1", - discovery_config=config, options=FileDataDiscoveryOptions(dry_run=True, diagnostic_logging=True), custom_keywords=["foo"], ignored_keywords=["bar"], @@ -828,7 +802,6 @@ def test_start_file_data_discovery_run_full(client): body = m.last_request.json() assert body == { "connection": "conn-1", - "discovery_config": DISCOVERY_CONFIG_ID, "options": {"dry_run": True, "diagnostic_logging": True}, "custom_keywords": ["foo"], "ignored_keywords": ["bar"], @@ -892,11 +865,96 @@ def test_start_file_data_discovery_run_raises_on_non_201(client): client.start_file_data_discovery_run(FileDataDiscoveryRequest(connection="nope")) -def test_start_schema_discovery_raises_invalid_discovery_config_when_not_valid(client): +# --- v2 discovery: discovery_config only, posted to the /v2/ endpoints --- + + +def test_schema_discovery_v2_request_accepts_discovery_config_id(): + """A `DiscoveryConfigId` string passes through unchanged in the v2 request body.""" + req = SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + assert req.model_dump(exclude_none=True, mode="json") == { + "connection": "conn-1", + "discovery_config": DISCOVERY_CONFIG_ID, + } + + +def test_schema_discovery_v2_request_unwraps_discovery_config_model(): + """Passing a full `DiscoveryConfig` object substitutes its `id` for the wire payload.""" + config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + req = SchemaDiscoveryV2Request(connection="conn-1", discovery_config=config) + assert req.model_dump(exclude_none=True, mode="json")["discovery_config"] == DISCOVERY_CONFIG_ID + + +def test_schema_discovery_v2_request_rejects_unsaved_discovery_config(): + """A `DiscoveryConfig` without an `id` cannot be used yet — raises immediately.""" + config = DiscoveryConfig(name="my_cfg") + with pytest.raises(ValueError, match="id is None"): + SchemaDiscoveryV2Request(connection="conn-1", discovery_config=config) + + +def test_schema_discovery_v2_request_rejects_legacy_fields(): + """`config_id` is a full override — the v2 request rejects any legacy detection options.""" + with pytest.raises(ValidationError): + SchemaDiscoveryV2Request( + connection="conn-1", + discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), + schemas=["public"], + ) + + +def test_file_data_discovery_v2_request_rejects_legacy_detection_fields(): + """The v2 FDD request rejects detection options owned by the config, while keeping file-handling params.""" + with pytest.raises(ValidationError): + FileDataDiscoveryV2Request( + connection="conn-1", + discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), + custom_keywords=["foo"], + ) + + +def test_start_schema_discovery_run_v2_sends_discovery_config(client): + """`start_schema_discovery_run_v2` posts the discovery_config id to the v2 endpoint.""" + req = SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + with requests_mock.Mocker() as m: + m.post("http://test-server/api/schema-discovery/v2/", json={"id": 11}, status_code=201) + assert client.start_schema_discovery_run_v2(req) == 11 + + assert m.last_request.json() == {"connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID} + + +def test_start_file_data_discovery_run_v2_full(client): + """The v2 FDD request carries the config plus the file-handling params the config does not own.""" + config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + req = FileDataDiscoveryV2Request( + connection="conn-1", + discovery_config=config, + options=FileDataDiscoveryOptions(dry_run=True, diagnostic_logging=True), + recurse=True, + include=[{"glob": "*.csv"}], + skip=[{"regex": r".*/tmp/.*", "match_against": "path"}], + encoding="utf-8", + workers=4, + ) + with requests_mock.Mocker() as m: + m.post("http://test-server/api/run-file-data-discovery/v2/", json={"id": 99}, status_code=201) + assert client.start_file_data_discovery_run_v2(req) == 99 + + assert m.last_request.json() == { + "connection": "conn-1", + "discovery_config": DISCOVERY_CONFIG_ID, + "options": {"dry_run": True, "diagnostic_logging": True}, + "recurse": True, + "include": [{"glob": "*.csv"}], + "skip": [{"regex": r".*/tmp/.*", "match_against": "path"}], + "encoding": "utf-8", + "workers": 4, + } + + +def test_start_schema_discovery_run_v2_raises_invalid_discovery_config_when_not_valid(client): """A 400 with a `discovery_config` validation message raises the specific subclass.""" with requests_mock.Mocker() as m: m.post( - "http://test-server/api/schema-discovery/", + "http://test-server/api/schema-discovery/v2/", json={ "discovery_config": ['Discovery config "my_cfg" cannot be used: validation status is `invalid`.'], }, @@ -906,29 +964,29 @@ def test_start_schema_discovery_raises_invalid_discovery_config_when_not_valid(c InvalidDiscoveryConfigError, match=r'Schema discovery run failed to start due to discovery config error: .*"my_cfg".*invalid', ): - client.start_schema_discovery_run( - SchemaDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + client.start_schema_discovery_run_v2( + SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), ) -def test_start_schema_discovery_raises_invalid_discovery_config_when_missing(client): +def test_start_schema_discovery_run_v2_raises_invalid_discovery_config_when_missing(client): """A 400 from DRF's PrimaryKeyRelatedField (config not found / archived) is also classified.""" with requests_mock.Mocker() as m: m.post( - "http://test-server/api/schema-discovery/", + "http://test-server/api/schema-discovery/v2/", json={"discovery_config": [f'Invalid pk "{DISCOVERY_CONFIG_ID}" - object does not exist.']}, status_code=400, ) with pytest.raises(InvalidDiscoveryConfigError, match="object does not exist"): - client.start_schema_discovery_run( - SchemaDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + client.start_schema_discovery_run_v2( + SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), ) -def test_start_file_data_discovery_raises_invalid_discovery_config(client): +def test_start_file_data_discovery_run_v2_raises_invalid_discovery_config(client): with requests_mock.Mocker() as m: m.post( - "http://test-server/api/run-file-data-discovery/", + "http://test-server/api/run-file-data-discovery/v2/", json={ "discovery_config": ['Discovery config "my_cfg" cannot be used: validation status is `in_progress`.'], }, @@ -938,22 +996,24 @@ def test_start_file_data_discovery_raises_invalid_discovery_config(client): InvalidDiscoveryConfigError, match=r"File data discovery run failed to start due to discovery config error: .*in_progress", ): - client.start_file_data_discovery_run( - FileDataDiscoveryRequest( + client.start_file_data_discovery_run_v2( + FileDataDiscoveryV2Request( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), ), ) -def test_start_schema_discovery_non_discovery_config_400_still_raises_generic_error(client): - """Other 400s (e.g. unknown connection) keep raising the base FailedToStartError.""" +def test_start_schema_discovery_run_v2_non_discovery_config_400_still_raises_generic_error(client): + """Other 400s (e.g. unknown connection) keep raising the base FailedToStartError, not the config subclass.""" with requests_mock.Mocker() as m: m.post( - "http://test-server/api/schema-discovery/", + "http://test-server/api/schema-discovery/v2/", json={"connection": ['Invalid pk "nope" - object does not exist.']}, status_code=400, ) with pytest.raises(FailedToStartError) as exc_info: - client.start_schema_discovery_run(SchemaDiscoveryRequest(connection="nope")) + client.start_schema_discovery_run_v2( + SchemaDiscoveryV2Request(connection="nope", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + ) assert not isinstance(exc_info.value, InvalidDiscoveryConfigError) From 567e6721011856a84659b7589ec0c3dd8303eb5d Mon Sep 17 00:00:00 2001 From: Andrew Chester Date: Wed, 10 Jun 2026 13:49:20 +1200 Subject: [PATCH 09/26] feat: Reject discovery_config on v1 discovery requests with a v2 redirect --- datamasque/client/models/discovery.py | 34 +++++++++++++++++++++++---- tests/test_discovery.py | 8 +++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 267f145..37393de 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -35,10 +35,12 @@ class InDataDiscoveryConfig(BaseModel): class SchemaDiscoveryRequest(BaseModel): """ - Request body for `POST /api/schema-discovery/`. + Request body for `POST /api/schema-discovery/` (the keyword-driven schema-discovery trigger). `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - Every other field uses the server's default value when omitted. + This request does not accept a `discovery_config`; to run from a saved discovery config use + `start_schema_discovery_run_v2` with a `SchemaDiscoveryV2Request`. Every other field uses the + server's default value when omitted. """ model_config = ConfigDict(extra="forbid") @@ -57,6 +59,17 @@ class SchemaDiscoveryRequest(BaseModel): def _unwrap_connection(cls, value: Any) -> Any: return unwrap_connection_id(value) + @model_validator(mode="before") + @classmethod + def _reject_discovery_config(cls, data: Any) -> Any: + if isinstance(data, dict) and "discovery_config" in data: + raise ValueError( + "`discovery_config` is not accepted by the keyword-driven schema-discovery request; " + "use `start_schema_discovery_run_v2` with a `SchemaDiscoveryV2Request` to run from a saved " + "discovery config." + ) + return data + class SchemaDiscoveryV2Request(BaseModel): """ @@ -146,10 +159,12 @@ class FileDataDiscoveryOptions(BaseModel): class FileDataDiscoveryRequest(BaseModel): """ - Request body for `POST /api/run-file-data-discovery/`. + Request body for `POST /api/run-file-data-discovery/` (the keyword-driven file-data-discovery trigger). `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - All other fields use the server's default value when omitted. + This request does not accept a `discovery_config`; to run from a saved discovery config use + `start_file_data_discovery_run_v2` with a `FileDataDiscoveryV2Request`. All other fields use the + server's default value when omitted. """ model_config = ConfigDict(extra="forbid") @@ -173,6 +188,17 @@ class FileDataDiscoveryRequest(BaseModel): def _unwrap_connection(cls, value: Any) -> Any: return unwrap_connection_id(value) + @model_validator(mode="before") + @classmethod + def _reject_discovery_config(cls, data: Any) -> Any: + if isinstance(data, dict) and "discovery_config" in data: + raise ValueError( + "`discovery_config` is not accepted by the keyword-driven file-data-discovery request; " + "use `start_file_data_discovery_run_v2` with a `FileDataDiscoveryV2Request` to run from a saved " + "discovery config." + ) + return data + class FileDataDiscoveryV2Request(BaseModel): """ diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 6eb2643..08b911b 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -746,14 +746,14 @@ def test_start_schema_discovery_run_raises_on_non_201(client): def test_schema_discovery_request_rejects_discovery_config(): - """The v1 schema-discovery request no longer accepts `discovery_config` (it is v2-only).""" - with pytest.raises(ValidationError): + """The v1 schema-discovery request rejects `discovery_config` and points the user at the v2 method.""" + with pytest.raises(ValidationError, match="start_schema_discovery_run_v2"): SchemaDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) def test_file_data_discovery_request_rejects_discovery_config(): - """The v1 file-data-discovery request no longer accepts `discovery_config` (it is v2-only).""" - with pytest.raises(ValidationError): + """The v1 file-data-discovery request rejects `discovery_config` and points the user at the v2 method.""" + with pytest.raises(ValidationError, match="start_file_data_discovery_run_v2"): FileDataDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) From a6e1736735e664323374838ceba9eea4efde4b1e Mon Sep 17 00:00:00 2001 From: Andrew Chester Date: Wed, 10 Jun 2026 15:55:33 +1200 Subject: [PATCH 10/26] feat: Restrict v2 file discovery request to connection and discovery_config --- HISTORY.rst | 2 +- datamasque/client/discovery.py | 8 ++--- datamasque/client/models/discovery.py | 17 +++------- tests/test_discovery.py | 45 ++++++++++++--------------- 4 files changed, 29 insertions(+), 43 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index a0655fc..1a345ed 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -6,7 +6,7 @@ History ------------------ * Added discovery-config management APIs on ``DataMasqueClient`` (``list_discovery_configs``, ``get_discovery_config``, ``create_discovery_config``, ``update_discovery_config``, and friends), backed by the ``DiscoveryConfig`` / ``DiscoveryConfigId`` models. -* Added ``start_schema_discovery_run_v2`` and ``start_file_data_discovery_run_v2`` for starting discovery runs from a saved discovery config, via the ``/api/schema-discovery/v2/`` and ``/api/run-file-data-discovery/v2/`` endpoints. The discovery config is a full override of all detection options, so the v2 request models (``SchemaDiscoveryV2Request``, ``FileDataDiscoveryV2Request``) reject the legacy keyword/schema/in-data-discovery fields. +* Added ``start_schema_discovery_run_v2`` and ``start_file_data_discovery_run_v2`` for starting discovery runs from a saved discovery config, via the ``/api/schema-discovery/v2/`` and ``/api/run-file-data-discovery/v2/`` endpoints. The discovery config is a full override, so the v2 request models (``SchemaDiscoveryV2Request``, ``FileDataDiscoveryV2Request``) accept only the connection and discovery config: they reject the legacy keyword/schema/in-data-discovery fields, and ``FileDataDiscoveryV2Request`` additionally rejects the file-handling parameters (``recurse``, ``include``, ``skip``, ``encoding``, ``workers``) and run ``options``, which now live in the config. * The v1 ``start_schema_discovery_run`` and ``start_file_data_discovery_run`` endpoints are unchanged and do not accept a discovery config; ``start_file_data_discovery_run`` (with ``FileDataDiscoveryRequest``, the ``FileFilter`` include/skip model, and ``FileDataDiscoveryOptions``) is newly exposed for ad-hoc file data discovery. * Added ``InvalidDiscoveryConfigError``, raised by the v2 run-start methods when the referenced config is missing, archived, or not in a ``valid`` validation state. diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index ca2a8ff..183ab43 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -315,13 +315,11 @@ def start_file_data_discovery_run_v2(self, request: FileDataDiscoveryV2Request) """ Starts a file data discovery run from a saved discovery config (v2 API). - Detection options come from the saved discovery config referenced by `request.discovery_config`; - the file-handling parameters (`recurse`, `include`, `skip`, `encoding`, `workers`) and run - `options` are still accepted, since the discovery config does not own them. + All detection options, file-handling parameters, and run options come from the saved discovery + config referenced by `request.discovery_config`; no per-run overrides are accepted. Args: - request: A `FileDataDiscoveryV2Request` with the connection, saved discovery-config, and - optional file-handling parameters. + request: A `FileDataDiscoveryV2Request` with the connection and saved discovery-config. Returns: RunId: The ID of the started discovery run diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 37393de..5619258 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -204,24 +204,17 @@ class FileDataDiscoveryV2Request(BaseModel): """ Request body for `POST /api/run-file-data-discovery/v2/` (start a run from a saved discovery config). - `discovery_config` is a full override of the detection options, so the legacy keyword and - in-data-discovery fields accepted by `FileDataDiscoveryRequest` are rejected here. - The file-handling parameters (`recurse`, `include`, `skip`, `encoding`, `workers`) and run - `options` remain accepted, since the discovery config does not own them. - `connection` accepts either a `ConnectionId` or a full `ConnectionConfig`, and `discovery_config` - accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`. + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. + `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, and is a full + override: it supplies every detection option as well as the file-handling and run options, so the + legacy keyword/in-data-discovery fields and the file-handling parameters (`recurse`, `include`, `skip`, + `encoding`, `workers`) accepted by `FileDataDiscoveryRequest` are rejected here. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] discovery_config: Union[DiscoveryConfigId, DiscoveryConfig] - options: Optional[FileDataDiscoveryOptions] = None - recurse: Optional[bool] = None - include: Optional[list[FileFilter]] = None - skip: Optional[list[FileFilter]] = None - encoding: Optional[str] = None - workers: Optional[int] = None @field_validator("connection", mode="before") @classmethod diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 08b911b..32de2f7 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -901,13 +901,26 @@ def test_schema_discovery_v2_request_rejects_legacy_fields(): ) -def test_file_data_discovery_v2_request_rejects_legacy_detection_fields(): - """The v2 FDD request rejects detection options owned by the config, while keeping file-handling params.""" +@pytest.mark.parametrize( + "extra_field", + [ + {"custom_keywords": ["foo"]}, + {"in_data_discovery": {"enabled": True}}, + {"options": {"dry_run": True}}, + {"recurse": True}, + {"include": [{"glob": "*.csv"}]}, + {"skip": [{"glob": "*.tmp"}]}, + {"encoding": "utf-8"}, + {"workers": 4}, + ], +) +def test_file_data_discovery_v2_request_rejects_non_config_fields(extra_field): + """The config is a full override: detection options, file-handling params, and run options are all rejected.""" with pytest.raises(ValidationError): FileDataDiscoveryV2Request( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), - custom_keywords=["foo"], + **extra_field, ) @@ -921,33 +934,15 @@ def test_start_schema_discovery_run_v2_sends_discovery_config(client): assert m.last_request.json() == {"connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID} -def test_start_file_data_discovery_run_v2_full(client): - """The v2 FDD request carries the config plus the file-handling params the config does not own.""" +def test_start_file_data_discovery_run_v2_sends_discovery_config(client): + """`start_file_data_discovery_run_v2` posts only the connection and discovery_config id to the v2 endpoint.""" config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) - req = FileDataDiscoveryV2Request( - connection="conn-1", - discovery_config=config, - options=FileDataDiscoveryOptions(dry_run=True, diagnostic_logging=True), - recurse=True, - include=[{"glob": "*.csv"}], - skip=[{"regex": r".*/tmp/.*", "match_against": "path"}], - encoding="utf-8", - workers=4, - ) + req = FileDataDiscoveryV2Request(connection="conn-1", discovery_config=config) with requests_mock.Mocker() as m: m.post("http://test-server/api/run-file-data-discovery/v2/", json={"id": 99}, status_code=201) assert client.start_file_data_discovery_run_v2(req) == 99 - assert m.last_request.json() == { - "connection": "conn-1", - "discovery_config": DISCOVERY_CONFIG_ID, - "options": {"dry_run": True, "diagnostic_logging": True}, - "recurse": True, - "include": [{"glob": "*.csv"}], - "skip": [{"regex": r".*/tmp/.*", "match_against": "path"}], - "encoding": "utf-8", - "workers": 4, - } + assert m.last_request.json() == {"connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID} def test_start_schema_discovery_run_v2_raises_invalid_discovery_config_when_not_valid(client): From f18a2e4a15b595da0fb1b8fcbcdee9da2cde81ac Mon Sep 17 00:00:00 2001 From: Andrew Chester Date: Thu, 11 Jun 2026 11:54:08 +1200 Subject: [PATCH 11/26] feat: Rename discovery v2 endpoints to from_config; allow optional config and run options --- HISTORY.rst | 4 +- datamasque/client/__init__.py | 8 +- datamasque/client/discovery.py | 24 ++--- datamasque/client/models/discovery.py | 50 +++++----- tests/test_discovery.py | 133 +++++++++++++++++--------- 5 files changed, 128 insertions(+), 91 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 1a345ed..2cbee8b 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -6,9 +6,9 @@ History ------------------ * Added discovery-config management APIs on ``DataMasqueClient`` (``list_discovery_configs``, ``get_discovery_config``, ``create_discovery_config``, ``update_discovery_config``, and friends), backed by the ``DiscoveryConfig`` / ``DiscoveryConfigId`` models. -* Added ``start_schema_discovery_run_v2`` and ``start_file_data_discovery_run_v2`` for starting discovery runs from a saved discovery config, via the ``/api/schema-discovery/v2/`` and ``/api/run-file-data-discovery/v2/`` endpoints. The discovery config is a full override, so the v2 request models (``SchemaDiscoveryV2Request``, ``FileDataDiscoveryV2Request``) accept only the connection and discovery config: they reject the legacy keyword/schema/in-data-discovery fields, and ``FileDataDiscoveryV2Request`` additionally rejects the file-handling parameters (``recurse``, ``include``, ``skip``, ``encoding``, ``workers``) and run ``options``, which now live in the config. +* Added ``start_schema_discovery_run_from_config`` and ``start_file_data_discovery_run_from_config`` for starting discovery runs from a saved discovery config, via the ``/api/schema-discovery/v2/`` and ``/api/run-file-data-discovery/v2/`` endpoints. Their request models (``SchemaDiscoveryFromConfigRequest``, ``FileDataDiscoveryFromConfigRequest``) take a ``connection`` and an optional ``discovery_config`` (``None`` runs with the server's default discovery options); the legacy keyword/schema/in-data-discovery fields and the file-handling parameters (``recurse``, ``include``, ``skip``, ``encoding``, ``workers``) are rejected because the config owns them. ``FileDataDiscoveryFromConfigRequest`` also accepts run ``options`` (``dry_run``, ``diagnostic_logging``). * The v1 ``start_schema_discovery_run`` and ``start_file_data_discovery_run`` endpoints are unchanged and do not accept a discovery config; ``start_file_data_discovery_run`` (with ``FileDataDiscoveryRequest``, the ``FileFilter`` include/skip model, and ``FileDataDiscoveryOptions``) is newly exposed for ad-hoc file data discovery. -* Added ``InvalidDiscoveryConfigError``, raised by the v2 run-start methods when the referenced config is missing, archived, or not in a ``valid`` validation state. +* Added ``InvalidDiscoveryConfigError``, raised by the from-config run-start methods when the referenced config is missing, archived, or not in a ``valid`` validation state. 1.0.5 (2026-06-18) ------------------ diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index b9a4b85..304864f 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -55,9 +55,9 @@ from datamasque.client.models.discovery import ( ConstraintColumns, DiscoveryMatch, + FileDataDiscoveryFromConfigRequest, FileDataDiscoveryOptions, FileDataDiscoveryRequest, - FileDataDiscoveryV2Request, FileDiscoveryFile, FileDiscoveryLocatorResult, FileDiscoveryMatch, @@ -71,10 +71,10 @@ ReferencingForeignKey, RulesetGenerationRequest, SchemaDiscoveryColumn, + SchemaDiscoveryFromConfigRequest, SchemaDiscoveryPage, SchemaDiscoveryRequest, SchemaDiscoveryResult, - SchemaDiscoveryV2Request, TableConstraints, ) from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId @@ -144,9 +144,9 @@ "DynamoConnectionConfig", "FailedToStartError", "FileConnectionConfig", + "FileDataDiscoveryFromConfigRequest", "FileDataDiscoveryOptions", "FileDataDiscoveryRequest", - "FileDataDiscoveryV2Request", "FileDiscoveryFile", "FileDiscoveryLocatorResult", "FileDiscoveryMatch", @@ -198,10 +198,10 @@ "RunNotCancellableError", "S3ConnectionConfig", "SchemaDiscoveryColumn", + "SchemaDiscoveryFromConfigRequest", "SchemaDiscoveryPage", "SchemaDiscoveryRequest", "SchemaDiscoveryResult", - "SchemaDiscoveryV2Request", "SeedFile", "SelectedColumns", "SelectedData", diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 183ab43..a1db3f5 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -20,15 +20,15 @@ SelectedFileData, ) from datamasque.client.models.discovery import ( + FileDataDiscoveryFromConfigRequest, FileDataDiscoveryRequest, - FileDataDiscoveryV2Request, FileDiscoveryResult, FileRulesetGenerationRequest, RulesetGenerationRequest, + SchemaDiscoveryFromConfigRequest, SchemaDiscoveryPage, SchemaDiscoveryRequest, SchemaDiscoveryResult, - SchemaDiscoveryV2Request, ) from datamasque.client.models.ruleset import Ruleset from datamasque.client.models.runs import RunId @@ -270,16 +270,12 @@ def start_file_data_discovery_run(self, request: FileDataDiscoveryRequest) -> Ru response=response, ) - def start_schema_discovery_run_v2(self, request: SchemaDiscoveryV2Request) -> RunId: + def start_schema_discovery_run_from_config(self, request: SchemaDiscoveryFromConfigRequest) -> RunId: """ - Starts a schema discovery run from a saved discovery config (v2 API). - - All detection options come from the saved discovery config referenced by `request.discovery_config`; - the legacy keyword/schema/in-data-discovery options accepted by `start_schema_discovery_run` - are not accepted here. + Starts a schema discovery run from a saved discovery config. Args: - request: A `SchemaDiscoveryV2Request` with the connection and saved discovery-config. + request: A `SchemaDiscoveryFromConfigRequest` with the connection and an optional saved discovery-config. Returns: RunId: The ID of the started discovery run @@ -311,15 +307,13 @@ def start_schema_discovery_run_v2(self, request: SchemaDiscoveryV2Request) -> Ru response=response, ) - def start_file_data_discovery_run_v2(self, request: FileDataDiscoveryV2Request) -> RunId: + def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFromConfigRequest) -> RunId: """ - Starts a file data discovery run from a saved discovery config (v2 API). - - All detection options, file-handling parameters, and run options come from the saved discovery - config referenced by `request.discovery_config`; no per-run overrides are accepted. + Starts a file data discovery run from a saved discovery config. Args: - request: A `FileDataDiscoveryV2Request` with the connection and saved discovery-config. + request: A `FileDataDiscoveryFromConfigRequest` with the connection, + an optional saved discovery-config, and optional run `options`. Returns: RunId: The ID of the started discovery run diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 5619258..eb75282 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -37,10 +37,9 @@ class SchemaDiscoveryRequest(BaseModel): """ Request body for `POST /api/schema-discovery/` (the keyword-driven schema-discovery trigger). - `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - This request does not accept a `discovery_config`; to run from a saved discovery config use - `start_schema_discovery_run_v2` with a `SchemaDiscoveryV2Request`. Every other field uses the - server's default value when omitted. + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` + returned by an earlier client call. + This request does not accept a `discovery_config` """ model_config = ConfigDict(extra="forbid") @@ -65,26 +64,26 @@ def _reject_discovery_config(cls, data: Any) -> Any: if isinstance(data, dict) and "discovery_config" in data: raise ValueError( "`discovery_config` is not accepted by the keyword-driven schema-discovery request; " - "use `start_schema_discovery_run_v2` with a `SchemaDiscoveryV2Request` to run from a saved " - "discovery config." + "use `start_schema_discovery_run_from_config` with a `SchemaDiscoveryFromConfigRequest` " + "to run from a saved discovery config." ) return data -class SchemaDiscoveryV2Request(BaseModel): +class SchemaDiscoveryFromConfigRequest(BaseModel): """ Request body for `POST /api/schema-discovery/v2/` (start a run from a saved discovery config). - `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, and is a full - override: it supplies every detection option, so the legacy keyword/schema/in-data-discovery fields - accepted by `SchemaDiscoveryRequest` are rejected here. + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` + returned by an earlier client call. + `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, + or `None` to run with the server's default discovery options. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] - discovery_config: Union[DiscoveryConfigId, DiscoveryConfig] + discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None @field_validator("connection", mode="before") @classmethod @@ -161,10 +160,9 @@ class FileDataDiscoveryRequest(BaseModel): """ Request body for `POST /api/run-file-data-discovery/` (the keyword-driven file-data-discovery trigger). - `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - This request does not accept a `discovery_config`; to run from a saved discovery config use - `start_file_data_discovery_run_v2` with a `FileDataDiscoveryV2Request`. All other fields use the - server's default value when omitted. + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` + returned by an earlier client call. + This request does not accept a `discovery_config`. """ model_config = ConfigDict(extra="forbid") @@ -194,27 +192,29 @@ def _reject_discovery_config(cls, data: Any) -> Any: if isinstance(data, dict) and "discovery_config" in data: raise ValueError( "`discovery_config` is not accepted by the keyword-driven file-data-discovery request; " - "use `start_file_data_discovery_run_v2` with a `FileDataDiscoveryV2Request` to run from a saved " - "discovery config." + "use `start_file_data_discovery_run_from_config` with a `FileDataDiscoveryFromConfigRequest` " + "to run from a saved discovery config." ) return data -class FileDataDiscoveryV2Request(BaseModel): +class FileDataDiscoveryFromConfigRequest(BaseModel): """ Request body for `POST /api/run-file-data-discovery/v2/` (start a run from a saved discovery config). - `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, and is a full - override: it supplies every detection option as well as the file-handling and run options, so the - legacy keyword/in-data-discovery fields and the file-handling parameters (`recurse`, `include`, `skip`, - `encoding`, `workers`) accepted by `FileDataDiscoveryRequest` are rejected here. + `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` + returned by an earlier client call. + `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, + or `None` to run with the server's default discovery options. + `options` carries run-time toggles (`dry_run`, `diagnostic_logging`); + detection and file-handling settings come from the discovery config, not the request. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] - discovery_config: Union[DiscoveryConfigId, DiscoveryConfig] + discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None + options: Optional[FileDataDiscoveryOptions] = None @field_validator("connection", mode="before") @classmethod diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 32de2f7..c5f1fd8 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -12,19 +12,19 @@ DataMasqueClient, DiscoveryConfig, DiscoveryConfigId, + FileDataDiscoveryFromConfigRequest, FileDataDiscoveryOptions, FileDataDiscoveryRequest, - FileDataDiscoveryV2Request, FileFilter, FileFilterMatchAgainst, FileRulesetGenerationRequest, InDataDiscoveryConfig, RulesetGenerationRequest, RunId, + SchemaDiscoveryFromConfigRequest, SchemaDiscoveryPage, SchemaDiscoveryRequest, SchemaDiscoveryResult, - SchemaDiscoveryV2Request, ) from datamasque.client.exceptions import ( AsyncRulesetGenerationInProgressError, @@ -742,18 +742,15 @@ def test_start_schema_discovery_run_raises_on_non_201(client): client.start_schema_discovery_run(SchemaDiscoveryRequest(connection="nope")) -# --- v1 (legacy) discovery: no discovery_config accepted --- - - def test_schema_discovery_request_rejects_discovery_config(): - """The v1 schema-discovery request rejects `discovery_config` and points the user at the v2 method.""" - with pytest.raises(ValidationError, match="start_schema_discovery_run_v2"): + """The v1 schema-discovery request rejects `discovery_config` and points the user at the from-config method.""" + with pytest.raises(ValidationError, match="start_schema_discovery_run_from_config"): SchemaDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) def test_file_data_discovery_request_rejects_discovery_config(): - """The v1 file-data-discovery request rejects `discovery_config` and points the user at the v2 method.""" - with pytest.raises(ValidationError, match="start_file_data_discovery_run_v2"): + """The v1 file-data-discovery request rejects `discovery_config` and points the user at the from-config method.""" + with pytest.raises(ValidationError, match="start_file_data_discovery_run_from_config"): FileDataDiscoveryRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) @@ -865,36 +862,33 @@ def test_start_file_data_discovery_run_raises_on_non_201(client): client.start_file_data_discovery_run(FileDataDiscoveryRequest(connection="nope")) -# --- v2 discovery: discovery_config only, posted to the /v2/ endpoints --- - - -def test_schema_discovery_v2_request_accepts_discovery_config_id(): +def test_schema_discovery_from_config_request_accepts_discovery_config_id(): """A `DiscoveryConfigId` string passes through unchanged in the v2 request body.""" - req = SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + req = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) assert req.model_dump(exclude_none=True, mode="json") == { "connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID, } -def test_schema_discovery_v2_request_unwraps_discovery_config_model(): +def test_schema_discovery_from_config_request_unwraps_discovery_config_model(): """Passing a full `DiscoveryConfig` object substitutes its `id` for the wire payload.""" config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) - req = SchemaDiscoveryV2Request(connection="conn-1", discovery_config=config) + req = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=config) assert req.model_dump(exclude_none=True, mode="json")["discovery_config"] == DISCOVERY_CONFIG_ID -def test_schema_discovery_v2_request_rejects_unsaved_discovery_config(): +def test_schema_discovery_from_config_request_rejects_unsaved_discovery_config(): """A `DiscoveryConfig` without an `id` cannot be used yet — raises immediately.""" config = DiscoveryConfig(name="my_cfg") with pytest.raises(ValueError, match="id is None"): - SchemaDiscoveryV2Request(connection="conn-1", discovery_config=config) + SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=config) -def test_schema_discovery_v2_request_rejects_legacy_fields(): - """`config_id` is a full override — the v2 request rejects any legacy detection options.""" +def test_schema_discovery_from_config_request_rejects_legacy_fields(): + """The saved-config request rejects any legacy detection options — they live in the config.""" with pytest.raises(ValidationError): - SchemaDiscoveryV2Request( + SchemaDiscoveryFromConfigRequest( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), schemas=["public"], @@ -906,7 +900,6 @@ def test_schema_discovery_v2_request_rejects_legacy_fields(): [ {"custom_keywords": ["foo"]}, {"in_data_discovery": {"enabled": True}}, - {"options": {"dry_run": True}}, {"recurse": True}, {"include": [{"glob": "*.csv"}]}, {"skip": [{"glob": "*.tmp"}]}, @@ -914,38 +907,82 @@ def test_schema_discovery_v2_request_rejects_legacy_fields(): {"workers": 4}, ], ) -def test_file_data_discovery_v2_request_rejects_non_config_fields(extra_field): - """The config is a full override: detection options, file-handling params, and run options are all rejected.""" +def test_file_data_discovery_from_config_request_rejects_non_config_fields(extra_field): + """Detection options and file-handling params are rejected — they live in the config, not the request.""" with pytest.raises(ValidationError): - FileDataDiscoveryV2Request( + FileDataDiscoveryFromConfigRequest( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), **extra_field, ) -def test_start_schema_discovery_run_v2_sends_discovery_config(client): - """`start_schema_discovery_run_v2` posts the discovery_config id to the v2 endpoint.""" - req = SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) +def test_schema_discovery_from_config_request_allows_none_discovery_config(): + """`discovery_config` may be omitted or None; it is dropped from the payload so the server uses its defaults.""" + omitted = SchemaDiscoveryFromConfigRequest(connection="conn-1") + explicit_none = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) + assert omitted.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} + assert explicit_none.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} + + +def test_file_data_discovery_from_config_request_allows_none_discovery_config(): + """`discovery_config` may be omitted or None; it is dropped from the payload so the server uses its defaults.""" + omitted = FileDataDiscoveryFromConfigRequest(connection="conn-1") + explicit_none = FileDataDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) + assert omitted.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} + assert explicit_none.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} + + +def test_start_schema_discovery_run_from_config_sends_discovery_config(client): + """`start_schema_discovery_run_from_config` posts the discovery_config id to the saved-config endpoint.""" + req = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) with requests_mock.Mocker() as m: m.post("http://test-server/api/schema-discovery/v2/", json={"id": 11}, status_code=201) - assert client.start_schema_discovery_run_v2(req) == 11 + assert client.start_schema_discovery_run_from_config(req) == 11 assert m.last_request.json() == {"connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID} -def test_start_file_data_discovery_run_v2_sends_discovery_config(client): - """`start_file_data_discovery_run_v2` posts only the connection and discovery_config id to the v2 endpoint.""" +def test_start_file_data_discovery_run_from_config_sends_discovery_config(client): + """`start_file_data_discovery_run_from_config` posts only the connection and discovery_config id.""" config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) - req = FileDataDiscoveryV2Request(connection="conn-1", discovery_config=config) + req = FileDataDiscoveryFromConfigRequest(connection="conn-1", discovery_config=config) with requests_mock.Mocker() as m: m.post("http://test-server/api/run-file-data-discovery/v2/", json={"id": 99}, status_code=201) - assert client.start_file_data_discovery_run_v2(req) == 99 + assert client.start_file_data_discovery_run_from_config(req) == 99 assert m.last_request.json() == {"connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID} -def test_start_schema_discovery_run_v2_raises_invalid_discovery_config_when_not_valid(client): +def test_start_file_data_discovery_run_from_config_sends_options(client): + """`options` (dry_run / diagnostic_logging) is posted alongside the config on the file from-config trigger.""" + req = FileDataDiscoveryFromConfigRequest( + connection="conn-1", + discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), + options=FileDataDiscoveryOptions(dry_run=True, diagnostic_logging=True), + ) + with requests_mock.Mocker() as m: + m.post("http://test-server/api/run-file-data-discovery/v2/", json={"id": 77}, status_code=201) + assert client.start_file_data_discovery_run_from_config(req) == 77 + + assert m.last_request.json() == { + "connection": "conn-1", + "discovery_config": DISCOVERY_CONFIG_ID, + "options": {"dry_run": True, "diagnostic_logging": True}, + } + + +def test_start_schema_discovery_run_from_config_none_omits_discovery_config(client): + """With `discovery_config=None` the client posts only the connection; the server falls back to its defaults.""" + req = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) + with requests_mock.Mocker() as m: + m.post("http://test-server/api/schema-discovery/v2/", json={"id": 13}, status_code=201) + assert client.start_schema_discovery_run_from_config(req) == 13 + + assert m.last_request.json() == {"connection": "conn-1"} + + +def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_when_not_valid(client): """A 400 with a `discovery_config` validation message raises the specific subclass.""" with requests_mock.Mocker() as m: m.post( @@ -959,12 +996,14 @@ def test_start_schema_discovery_run_v2_raises_invalid_discovery_config_when_not_ InvalidDiscoveryConfigError, match=r'Schema discovery run failed to start due to discovery config error: .*"my_cfg".*invalid', ): - client.start_schema_discovery_run_v2( - SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + client.start_schema_discovery_run_from_config( + SchemaDiscoveryFromConfigRequest( + connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID) + ), ) -def test_start_schema_discovery_run_v2_raises_invalid_discovery_config_when_missing(client): +def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_when_missing(client): """A 400 from DRF's PrimaryKeyRelatedField (config not found / archived) is also classified.""" with requests_mock.Mocker() as m: m.post( @@ -973,12 +1012,14 @@ def test_start_schema_discovery_run_v2_raises_invalid_discovery_config_when_miss status_code=400, ) with pytest.raises(InvalidDiscoveryConfigError, match="object does not exist"): - client.start_schema_discovery_run_v2( - SchemaDiscoveryV2Request(connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + client.start_schema_discovery_run_from_config( + SchemaDiscoveryFromConfigRequest( + connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID) + ), ) -def test_start_file_data_discovery_run_v2_raises_invalid_discovery_config(client): +def test_start_file_data_discovery_run_from_config_raises_invalid_discovery_config(client): with requests_mock.Mocker() as m: m.post( "http://test-server/api/run-file-data-discovery/v2/", @@ -991,15 +1032,15 @@ def test_start_file_data_discovery_run_v2_raises_invalid_discovery_config(client InvalidDiscoveryConfigError, match=r"File data discovery run failed to start due to discovery config error: .*in_progress", ): - client.start_file_data_discovery_run_v2( - FileDataDiscoveryV2Request( + client.start_file_data_discovery_run_from_config( + FileDataDiscoveryFromConfigRequest( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), ), ) -def test_start_schema_discovery_run_v2_non_discovery_config_400_still_raises_generic_error(client): +def test_start_schema_discovery_run_from_config_non_discovery_config_400_still_raises_generic_error(client): """Other 400s (e.g. unknown connection) keep raising the base FailedToStartError, not the config subclass.""" with requests_mock.Mocker() as m: m.post( @@ -1008,7 +1049,9 @@ def test_start_schema_discovery_run_v2_non_discovery_config_400_still_raises_gen status_code=400, ) with pytest.raises(FailedToStartError) as exc_info: - client.start_schema_discovery_run_v2( - SchemaDiscoveryV2Request(connection="nope", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID)), + client.start_schema_discovery_run_from_config( + SchemaDiscoveryFromConfigRequest( + connection="nope", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID) + ), ) assert not isinstance(exc_info.value, InvalidDiscoveryConfigError) From ee52e054b147bdea9d71f60ee407684978439b12 Mon Sep 17 00:00:00 2001 From: Andrew Chester Date: Thu, 11 Jun 2026 19:02:00 +1200 Subject: [PATCH 12/26] refactor: Drop unused dry_run from file discovery options and condense the changelog --- HISTORY.rst | 10 ++++++---- datamasque/client/models/discovery.py | 5 ++--- tests/test_discovery.py | 10 +++++----- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 2cbee8b..584d294 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -5,10 +5,12 @@ History 1.1.0 (unreleased) ------------------ -* Added discovery-config management APIs on ``DataMasqueClient`` (``list_discovery_configs``, ``get_discovery_config``, ``create_discovery_config``, ``update_discovery_config``, and friends), backed by the ``DiscoveryConfig`` / ``DiscoveryConfigId`` models. -* Added ``start_schema_discovery_run_from_config`` and ``start_file_data_discovery_run_from_config`` for starting discovery runs from a saved discovery config, via the ``/api/schema-discovery/v2/`` and ``/api/run-file-data-discovery/v2/`` endpoints. Their request models (``SchemaDiscoveryFromConfigRequest``, ``FileDataDiscoveryFromConfigRequest``) take a ``connection`` and an optional ``discovery_config`` (``None`` runs with the server's default discovery options); the legacy keyword/schema/in-data-discovery fields and the file-handling parameters (``recurse``, ``include``, ``skip``, ``encoding``, ``workers``) are rejected because the config owns them. ``FileDataDiscoveryFromConfigRequest`` also accepts run ``options`` (``dry_run``, ``diagnostic_logging``). -* The v1 ``start_schema_discovery_run`` and ``start_file_data_discovery_run`` endpoints are unchanged and do not accept a discovery config; ``start_file_data_discovery_run`` (with ``FileDataDiscoveryRequest``, the ``FileFilter`` include/skip model, and ``FileDataDiscoveryOptions``) is newly exposed for ad-hoc file data discovery. -* Added ``InvalidDiscoveryConfigError``, raised by the from-config run-start methods when the referenced config is missing, archived, or not in a ``valid`` validation state. +* Added discovery-config management APIs (``list_discovery_configs``, ``create_discovery_config``, and friends). +* Added schema-discovery and file-data-discovery APIs that take a saved discovery configuration + (``start_schema_discovery_run_from_config`` / ``start_file_data_discovery_run_from_config``). + Adoption is recommended; the older APIs that take individual options will be deprecated in a future release. +* Corrected the file-data-discovery ``include``/``skip`` filter syntax and added ``ignore_rules`` support. +* Added ``InvalidDiscoveryConfigError``, raised when a discovery run can't start due to an invalid discovery config. 1.0.5 (2026-06-18) ------------------ diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index eb75282..86215ab 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -39,7 +39,7 @@ class SchemaDiscoveryRequest(BaseModel): `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - This request does not accept a `discovery_config` + This request does not accept a `discovery_config`. """ model_config = ConfigDict(extra="forbid") @@ -152,7 +152,6 @@ class FileDataDiscoveryOptions(BaseModel): model_config = ConfigDict(extra="forbid") - dry_run: Optional[bool] = None diagnostic_logging: Optional[bool] = None @@ -206,7 +205,7 @@ class FileDataDiscoveryFromConfigRequest(BaseModel): returned by an earlier client call. `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, or `None` to run with the server's default discovery options. - `options` carries run-time toggles (`dry_run`, `diagnostic_logging`); + `options` carries the `diagnostic_logging` run-time toggle; detection and file-handling settings come from the discovery config, not the request. """ diff --git a/tests/test_discovery.py b/tests/test_discovery.py index c5f1fd8..7305d26 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -775,7 +775,7 @@ def test_start_file_data_discovery_run_full(client): """All legacy FDD request fields populate the wire payload and pass through unwrap helpers.""" req = FileDataDiscoveryRequest( connection="conn-1", - options=FileDataDiscoveryOptions(dry_run=True, diagnostic_logging=True), + options=FileDataDiscoveryOptions(diagnostic_logging=True), custom_keywords=["foo"], ignored_keywords=["bar"], disable_built_in_keywords=True, @@ -799,7 +799,7 @@ def test_start_file_data_discovery_run_full(client): body = m.last_request.json() assert body == { "connection": "conn-1", - "options": {"dry_run": True, "diagnostic_logging": True}, + "options": {"diagnostic_logging": True}, "custom_keywords": ["foo"], "ignored_keywords": ["bar"], "disable_built_in_keywords": True, @@ -955,11 +955,11 @@ def test_start_file_data_discovery_run_from_config_sends_discovery_config(client def test_start_file_data_discovery_run_from_config_sends_options(client): - """`options` (dry_run / diagnostic_logging) is posted alongside the config on the file from-config trigger.""" + """`options` (diagnostic_logging) is posted alongside the config on the file from-config trigger.""" req = FileDataDiscoveryFromConfigRequest( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), - options=FileDataDiscoveryOptions(dry_run=True, diagnostic_logging=True), + options=FileDataDiscoveryOptions(diagnostic_logging=True), ) with requests_mock.Mocker() as m: m.post("http://test-server/api/run-file-data-discovery/v2/", json={"id": 77}, status_code=201) @@ -968,7 +968,7 @@ def test_start_file_data_discovery_run_from_config_sends_options(client): assert m.last_request.json() == { "connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID, - "options": {"dry_run": True, "diagnostic_logging": True}, + "options": {"diagnostic_logging": True}, } From 1f2745805d5a2cb4c095d8465ff8f744ee06897f Mon Sep 17 00:00:00 2001 From: Andrew Chester Date: Fri, 12 Jun 2026 16:47:30 +1200 Subject: [PATCH 13/26] feat(discovery): DM-3562 send explicit null discovery_config from saved-config triggers --- datamasque/client/discovery.py | 14 ++++++-- datamasque/client/models/discovery.py | 8 ++--- tests/test_discovery.py | 48 ++++++++++++++++++--------- 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index a1db3f5..ca27772 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -275,7 +275,8 @@ def start_schema_discovery_run_from_config(self, request: SchemaDiscoveryFromCon Starts a schema discovery run from a saved discovery config. Args: - request: A `SchemaDiscoveryFromConfigRequest` with the connection and an optional saved discovery-config. + request: A `SchemaDiscoveryFromConfigRequest` with the `connection` and a required `discovery_config` + (a saved config, or `None` for the server's defaults). Returns: RunId: The ID of the started discovery run @@ -287,6 +288,9 @@ def start_schema_discovery_run_from_config(self, request: SchemaDiscoveryFromCon """ data = request.model_dump(exclude_none=True, mode="json") + # The server requires `discovery_config` to be present; a null selects its built-in defaults, + # so send it explicitly rather than letting `exclude_none` drop a None. + data.setdefault("discovery_config", None) response = self.make_request( "POST", "/api/schema-discovery/v2/", @@ -312,8 +316,9 @@ def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFr Starts a file data discovery run from a saved discovery config. Args: - request: A `FileDataDiscoveryFromConfigRequest` with the connection, - an optional saved discovery-config, and optional run `options`. + request: A `FileDataDiscoveryFromConfigRequest` with the `connection`, + a required `discovery_config` (a saved config, or `None` for the server's defaults), + and optional run `options`. Returns: RunId: The ID of the started discovery run @@ -325,6 +330,9 @@ def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFr """ data = request.model_dump(exclude_none=True, mode="json") + # The server requires `discovery_config` to be present; a null selects its built-in defaults, + # so send it explicitly rather than letting `exclude_none` drop a None. + data.setdefault("discovery_config", None) response = self.make_request( "POST", "/api/run-file-data-discovery/v2/", diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 86215ab..0fce513 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -76,14 +76,14 @@ class SchemaDiscoveryFromConfigRequest(BaseModel): `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, + `discovery_config` is required: pass a `DiscoveryConfigId`, a full `DiscoveryConfig`, or `None` to run with the server's default discovery options. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] - discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None + discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] @field_validator("connection", mode="before") @classmethod @@ -203,7 +203,7 @@ class FileDataDiscoveryFromConfigRequest(BaseModel): `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. - `discovery_config` accepts either a `DiscoveryConfigId` or a full `DiscoveryConfig`, + `discovery_config` is required: pass a `DiscoveryConfigId`, a full `DiscoveryConfig`, or `None` to run with the server's default discovery options. `options` carries the `diagnostic_logging` run-time toggle; detection and file-handling settings come from the discovery config, not the request. @@ -212,7 +212,7 @@ class FileDataDiscoveryFromConfigRequest(BaseModel): model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] - discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] = None + discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] options: Optional[FileDataDiscoveryOptions] = None @field_validator("connection", mode="before") diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 7305d26..dcf736a 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -917,20 +917,28 @@ def test_file_data_discovery_from_config_request_rejects_non_config_fields(extra ) -def test_schema_discovery_from_config_request_allows_none_discovery_config(): - """`discovery_config` may be omitted or None; it is dropped from the payload so the server uses its defaults.""" - omitted = SchemaDiscoveryFromConfigRequest(connection="conn-1") - explicit_none = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) - assert omitted.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} - assert explicit_none.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} +def test_schema_discovery_from_config_request_requires_discovery_config(): + """`discovery_config` must be set explicitly; omitting it is a validation error (it may be None, but not absent).""" + with pytest.raises(ValidationError, match="discovery_config"): + SchemaDiscoveryFromConfigRequest(connection="conn-1") -def test_file_data_discovery_from_config_request_allows_none_discovery_config(): - """`discovery_config` may be omitted or None; it is dropped from the payload so the server uses its defaults.""" - omitted = FileDataDiscoveryFromConfigRequest(connection="conn-1") - explicit_none = FileDataDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) - assert omitted.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} - assert explicit_none.model_dump(exclude_none=True, mode="json") == {"connection": "conn-1"} +def test_schema_discovery_from_config_request_accepts_none_discovery_config(): + """An explicit None is accepted and means the server uses its built-in defaults.""" + req = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) + assert req.discovery_config is None + + +def test_file_data_discovery_from_config_request_requires_discovery_config(): + """`discovery_config` must be set explicitly; omitting it is a validation error (it may be None, but not absent).""" + with pytest.raises(ValidationError, match="discovery_config"): + FileDataDiscoveryFromConfigRequest(connection="conn-1") + + +def test_file_data_discovery_from_config_request_accepts_none_discovery_config(): + """An explicit None is accepted and means the server uses its built-in defaults.""" + req = FileDataDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) + assert req.discovery_config is None def test_start_schema_discovery_run_from_config_sends_discovery_config(client): @@ -972,14 +980,24 @@ def test_start_file_data_discovery_run_from_config_sends_options(client): } -def test_start_schema_discovery_run_from_config_none_omits_discovery_config(client): - """With `discovery_config=None` the client posts only the connection; the server falls back to its defaults.""" +def test_start_schema_discovery_run_from_config_none_sends_null_discovery_config(client): + """With `discovery_config=None` the client posts an explicit null so the server applies its defaults.""" req = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) with requests_mock.Mocker() as m: m.post("http://test-server/api/schema-discovery/v2/", json={"id": 13}, status_code=201) assert client.start_schema_discovery_run_from_config(req) == 13 - assert m.last_request.json() == {"connection": "conn-1"} + assert m.last_request.json() == {"connection": "conn-1", "discovery_config": None} + + +def test_start_file_data_discovery_run_from_config_none_sends_null_discovery_config(client): + """With `discovery_config=None` the file-data trigger posts an explicit null so the server applies its defaults.""" + req = FileDataDiscoveryFromConfigRequest(connection="conn-1", discovery_config=None) + with requests_mock.Mocker() as m: + m.post("http://test-server/api/run-file-data-discovery/v2/", json={"id": 21}, status_code=201) + assert client.start_file_data_discovery_run_from_config(req) == 21 + + assert m.last_request.json() == {"connection": "conn-1", "discovery_config": None} def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_when_not_valid(client): From 76a3921f9b06a68e3b04b1965234df075f3237a1 Mon Sep 17 00:00:00 2001 From: Andrew Chester Date: Mon, 15 Jun 2026 10:19:39 +1200 Subject: [PATCH 14/26] feat(discovery): DM-3562 accept schemas on saved-config schema discovery --- datamasque/client/models/discovery.py | 4 +++- tests/test_discovery.py | 34 +++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 0fce513..6a41b1c 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -77,13 +77,15 @@ class SchemaDiscoveryFromConfigRequest(BaseModel): `connection` accepts either a `ConnectionId` or a full `ConnectionConfig` returned by an earlier client call. `discovery_config` is required: pass a `DiscoveryConfigId`, a full `DiscoveryConfig`, - or `None` to run with the server's default discovery options. + or `None` to run with the default discovery options. + `schemas` optionally scopes the run to specific schemas; omit it to scan the connection's default schema. """ model_config = ConfigDict(extra="forbid") connection: Union[ConnectionId, ConnectionConfig] discovery_config: Optional[Union[DiscoveryConfigId, DiscoveryConfig]] + schemas: Optional[list[str]] = None @field_validator("connection", mode="before") @classmethod diff --git a/tests/test_discovery.py b/tests/test_discovery.py index dcf736a..0e14e57 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -886,15 +886,29 @@ def test_schema_discovery_from_config_request_rejects_unsaved_discovery_config() def test_schema_discovery_from_config_request_rejects_legacy_fields(): - """The saved-config request rejects any legacy detection options — they live in the config.""" + """The saved-config request rejects legacy detection options — they live in the config.""" with pytest.raises(ValidationError): SchemaDiscoveryFromConfigRequest( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), - schemas=["public"], + custom_keywords=["foo"], ) +def test_schema_discovery_from_config_request_accepts_schemas(): + """`schemas` scopes the saved-config schema run to specific schemas and is forwarded as-is.""" + req = SchemaDiscoveryFromConfigRequest( + connection="conn-1", + discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), + schemas=["public", "sales"], + ) + assert req.model_dump(exclude_none=True, mode="json") == { + "connection": "conn-1", + "discovery_config": DISCOVERY_CONFIG_ID, + "schemas": ["public", "sales"], + } + + @pytest.mark.parametrize( "extra_field", [ @@ -951,6 +965,22 @@ def test_start_schema_discovery_run_from_config_sends_discovery_config(client): assert m.last_request.json() == {"connection": "conn-1", "discovery_config": DISCOVERY_CONFIG_ID} +def test_start_schema_discovery_run_from_config_sends_schemas(client): + """`start_schema_discovery_run_from_config` forwards a `schemas` scope to the saved-config endpoint.""" + req = SchemaDiscoveryFromConfigRequest( + connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID), schemas=["public"] + ) + with requests_mock.Mocker() as m: + m.post("http://test-server/api/schema-discovery/v2/", json={"id": 12}, status_code=201) + assert client.start_schema_discovery_run_from_config(req) == 12 + + assert m.last_request.json() == { + "connection": "conn-1", + "discovery_config": DISCOVERY_CONFIG_ID, + "schemas": ["public"], + } + + def test_start_file_data_discovery_run_from_config_sends_discovery_config(client): """`start_file_data_discovery_run_from_config` posts only the connection and discovery_config id.""" config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) From 11b75e4a734a1eba1d2a4fb7eabb3eacce4886c5 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Tue, 16 Jun 2026 10:49:04 +1200 Subject: [PATCH 15/26] feat: DM-3400: add missing client features for the release-testing migration - Add `is_user_subscribed` to `MaskingRunRequest` (subscribe the requesting user to run notifications). - Add `auto_pull` / `auto_pull_branch` to `MaskingRunOptions` (refresh the ruleset from git before a run). - Surface `validation_error` (and `validation_error_type` for rulesets) on `Ruleset` and `RulesetLibrary`. - Expose git provenance as a nested `git` (`GitSnapshot`) field on both, collapsed from the flat `git_*` fields. - Exclude read-only fields (`id`, `is_valid`, `validation_error`, `git`, ...) from create/update request bodies. - Fix `SslZipFile` uploads to send the required `database_type=mysql` form field. --- HISTORY.rst | 10 ++ datamasque/client/__init__.py | 10 +- datamasque/client/files.py | 2 +- datamasque/client/models/files.py | 11 ++ datamasque/client/models/git.py | 60 ++++++++ datamasque/client/models/ruleset.py | 19 ++- datamasque/client/models/ruleset_library.py | 19 +-- datamasque/client/models/runs.py | 9 ++ datamasque/client/models/status.py | 9 ++ datamasque/client/ruleset_libraries.py | 7 +- datamasque/client/rulesets.py | 16 +-- tests/test_files.py | 39 +++++ tests/test_ruleset_library.py | 120 ++++++++++++++++ tests/test_rulesets.py | 151 +++++++++++++++++++- tests/test_runs.py | 47 ++++++ 15 files changed, 500 insertions(+), 29 deletions(-) create mode 100644 datamasque/client/models/git.py diff --git a/HISTORY.rst b/HISTORY.rst index 584d294..67055e4 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -11,6 +11,16 @@ History Adoption is recommended; the older APIs that take individual options will be deprecated in a future release. * Corrected the file-data-discovery ``include``/``skip`` filter syntax and added ``ignore_rules`` support. * Added ``InvalidDiscoveryConfigError``, raised when a discovery run can't start due to an invalid discovery config. +* Added ``is_user_subscribed`` to ``MaskingRunRequest`` + to subscribe the requesting user to a run's email notifications. +* Added ``auto_pull`` / ``auto_pull_branch`` to ``MaskingRunOptions`` + to refresh the run's ruleset from git before starting. +* Added ``validation_error`` (and ``validation_error_type`` for rulesets) + to ``Ruleset`` and ``RulesetLibrary``. +* Exposed git provenance on ``Ruleset`` and ``RulesetLibrary`` as a nested ``git`` field (``GitSnapshot``). +* Read-only fields (``id``, ``is_valid``, ``validation_error``, etc.) + are no longer echoed back in ``Ruleset`` / ``RulesetLibrary`` create/update request bodies. +* Fixed ``SslZipFile`` uploads to send the required ``database_type=mysql`` form field. 1.0.5 (2026-06-18) ------------------ diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index 304864f..7908151 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -87,6 +87,7 @@ SnowflakeKeyFile, SslZipFile, ) +from datamasque.client.models.git import GitSnapshot from datamasque.client.models.ifm import ( DataMasqueIfmInstanceConfig, IfmLog, @@ -112,7 +113,12 @@ RunInfo, UnfinishedRun, ) -from datamasque.client.models.status import AsyncRulesetGenerationTaskStatus, MaskingRunStatus, ValidationStatus +from datamasque.client.models.status import ( + AsyncRulesetGenerationTaskStatus, + MaskingRunStatus, + ValidationErrorType, + ValidationStatus, +) from datamasque.client.models.user import User, UserId, UserRole __version__ = version("datamasque-python") @@ -157,6 +163,7 @@ "FileOrContent", "FileRulesetGenerationRequest", "ForeignKeyRef", + "GitSnapshot", "HashColumnsTableConfig", "IfmAuthError", "IfmLog", @@ -219,5 +226,6 @@ "UserId", "UserRole", "UserSelection", + "ValidationErrorType", "ValidationStatus", ] diff --git a/datamasque/client/files.py b/datamasque/client/files.py index 2f476e6..89d1a67 100644 --- a/datamasque/client/files.py +++ b/datamasque/client/files.py @@ -30,7 +30,7 @@ def upload_file( response = self.make_request( "POST", file_type.get_url(), - data={"name": file_name}, + data={"name": file_name, **file_type.get_extra_form_data()}, files=[ UploadFile( field_name=file_type.get_content_param_name(), diff --git a/datamasque/client/models/files.py b/datamasque/client/models/files.py index 59bfb7f..13c9846 100644 --- a/datamasque/client/models/files.py +++ b/datamasque/client/models/files.py @@ -40,6 +40,12 @@ def get_content_param_name(cls) -> str: raise NotImplementedError # pragma: no cover + @classmethod + def get_extra_form_data(cls) -> dict[str, str]: + """Extra multipart form fields to send alongside the file on upload. Empty by default.""" + + return {} + class SeedFile(DataMasqueFile): """Represents a seed file (CSV file).""" @@ -76,6 +82,11 @@ def get_url(cls) -> str: def get_content_param_name(cls) -> str: return "zip_archive" + @classmethod + def get_extra_form_data(cls) -> dict[str, str]: + # The connection-filesets endpoint requires a database_type; SSL filesets are MySQL-only today. + return {"database_type": "mysql"} + class SnowflakeKeyFile(DataMasqueFile): """Represents a private SSH key file for Snowflake connections.""" diff --git a/datamasque/client/models/git.py b/datamasque/client/models/git.py new file mode 100644 index 0000000..df8e71a --- /dev/null +++ b/datamasque/client/models/git.py @@ -0,0 +1,60 @@ +from collections.abc import Mapping +from datetime import datetime +from typing import Any, Optional + +from pydantic import BaseModel, ConfigDict, Field, model_validator + + +class GitSnapshot(BaseModel): + """ + Git provenance for a ruleset or ruleset library. + + Identifies the commit the entity's contents came from — + `commit_sha` on `branch` in `repo_url`, as of `synced_at`. + """ + + branch: str + commit_sha: str + repo_url: str + synced_at: datetime + + +GIT_RESPONSE_FIELDS = ("git_branch", "git_commit_sha", "git_repo_url", "git_synced_at") + + +def git_snapshot_from_response(data: Mapping[str, Any]) -> Optional[GitSnapshot]: + """Build a `GitSnapshot` from the server's flat `git_*` fields, or `None` when not git-synced.""" + if data.get("git_branch") is None: + return None + + return GitSnapshot( + branch=data["git_branch"], + commit_sha=data["git_commit_sha"], + repo_url=data["git_repo_url"], + synced_at=data["git_synced_at"], + ) + + +class GitTrackedEntity(BaseModel): + """Base class for rulesets and ruleset libraries that carry git provenance.""" + + model_config = ConfigDict(extra="allow", populate_by_name=True) + + git: Optional[GitSnapshot] = Field(default=None, exclude=True) + """Git provenance, or `None` when the entity is not currently in sync with a git commit.""" + + @model_validator(mode="before") + @classmethod + def _collapse_git_fields(cls, data: Any) -> Any: + if not isinstance(data, dict) or "git" in data: + return data + + data = dict(data) + snapshot = git_snapshot_from_response(data) + for field in GIT_RESPONSE_FIELDS: + data.pop(field, None) + + if snapshot is not None: + data["git"] = snapshot + + return data diff --git a/datamasque/client/models/ruleset.py b/datamasque/client/models/ruleset.py index 564b7c1..362fc8b 100644 --- a/datamasque/client/models/ruleset.py +++ b/datamasque/client/models/ruleset.py @@ -1,9 +1,10 @@ import enum from typing import Any, NewType, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import Field -from datamasque.client.models.status import ValidationStatus +from datamasque.client.models.git import GitTrackedEntity +from datamasque.client.models.status import ValidationErrorType, ValidationStatus RulesetId = NewType("RulesetId", str) @@ -33,13 +34,17 @@ class RulesetType(enum.Enum): database = "database" -class Ruleset(BaseModel): +class Ruleset(GitTrackedEntity): """Represents a ruleset.""" - model_config = ConfigDict(extra="allow", populate_by_name=True) - name: str yaml: str = Field(default="", alias="config_yaml") ruleset_type: RulesetType = Field(default=RulesetType.database, alias="mask_type") - id: Optional[RulesetId] = None - is_valid: Optional[ValidationStatus] = None + + # Server-populated read-only fields, excluded from request bodies. + id: Optional[RulesetId] = Field(default=None, exclude=True) + is_valid: Optional[ValidationStatus] = Field(default=None, exclude=True) + validation_error: Optional[str] = Field(default=None, exclude=True) + """Human-readable validation error, or `None` when valid.""" + validation_error_type: Optional[ValidationErrorType] = Field(default=None, exclude=True) + """Category of the validation failure, or `None` when valid.""" diff --git a/datamasque/client/models/ruleset_library.py b/datamasque/client/models/ruleset_library.py index 1ebe789..d1e1d3b 100644 --- a/datamasque/client/models/ruleset_library.py +++ b/datamasque/client/models/ruleset_library.py @@ -1,22 +1,25 @@ from datetime import datetime from typing import NewType, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import Field +from datamasque.client.models.git import GitTrackedEntity from datamasque.client.models.status import ValidationStatus RulesetLibraryId = NewType("RulesetLibraryId", str) -class RulesetLibrary(BaseModel): +class RulesetLibrary(GitTrackedEntity): """Represents a ruleset library.""" - model_config = ConfigDict(extra="allow", populate_by_name=True) - name: str namespace: str = "" yaml: Optional[str] = Field(default=None, alias="config_yaml") - id: Optional[RulesetLibraryId] = None - is_valid: Optional[ValidationStatus] = None - created: Optional[datetime] = None - modified: Optional[datetime] = None + + # Server-populated read-only fields, excluded from request bodies. + id: Optional[RulesetLibraryId] = Field(default=None, exclude=True) + is_valid: Optional[ValidationStatus] = Field(default=None, exclude=True) + validation_error: Optional[str] = Field(default=None, exclude=True) + """Human-readable validation error, or `None` when valid.""" + created: Optional[datetime] = Field(default=None, exclude=True) + modified: Optional[datetime] = Field(default=None, exclude=True) diff --git a/datamasque/client/models/runs.py b/datamasque/client/models/runs.py index 379f82a..dc15663 100644 --- a/datamasque/client/models/runs.py +++ b/datamasque/client/models/runs.py @@ -30,6 +30,7 @@ class MaskingRunOptions(BaseModel): if supplied, must be 16–256 characters and is used as the per-run encryption key; the server auto-generates one when omitted. + Set `auto_pull` to refresh the run's ruleset from git before the run starts. """ model_config = ConfigDict(extra="forbid") @@ -41,6 +42,10 @@ class MaskingRunOptions(BaseModel): diagnostic_logging: Optional[bool] = None run_secret: Optional[str] = Field(default=None, min_length=16, max_length=256) disable_instance_secret: Optional[bool] = None + auto_pull: Optional[bool] = None + """When `True`, pull the run's ruleset from git before starting.""" + auto_pull_branch: Optional[str] = None + """Branch to auto-pull from; omitted or empty uses the instance's configured branch.""" class MaskingRunRequest(BaseModel): @@ -50,6 +55,9 @@ class MaskingRunRequest(BaseModel): `connection`, `destination_connection`, and `ruleset` accept either the server-assigned ID or the corresponding object returned by an earlier client call (e.g. a `ConnectionConfig` or `Ruleset`); the object's `id` is extracted at construction time. + + Set `is_user_subscribed=True` to subscribe the requesting user to the run's email notifications; + when omitted the server applies its default (no subscription). """ model_config = ConfigDict(extra="forbid") @@ -60,6 +68,7 @@ class MaskingRunRequest(BaseModel): destination_connection: Optional[Union[ConnectionId, ConnectionConfig]] = None options: MaskingRunOptions = Field(default_factory=MaskingRunOptions) name: Optional[str] = None + is_user_subscribed: Optional[bool] = None @field_validator("connection", "destination_connection", mode="before") @classmethod diff --git a/datamasque/client/models/status.py b/datamasque/client/models/status.py index b09e18f..f470fd0 100644 --- a/datamasque/client/models/status.py +++ b/datamasque/client/models/status.py @@ -10,6 +10,15 @@ class ValidationStatus(enum.Enum): unknown = "unknown" +class ValidationErrorType(enum.Enum): + """Categorises why a ruleset failed validation (see `Ruleset.validation_error_type`).""" + + ruleset = "ruleset" + library_missing = "library_missing" + library_invalid = "library_invalid" + expansion = "expansion" # The ruleset is not valid once its library references are expanded. + + class MaskingRunStatus(enum.Enum): """List of valid masking run statuses.""" diff --git a/datamasque/client/ruleset_libraries.py b/datamasque/client/ruleset_libraries.py index 81acb8f..8943553 100644 --- a/datamasque/client/ruleset_libraries.py +++ b/datamasque/client/ruleset_libraries.py @@ -63,7 +63,8 @@ def create_ruleset_library(self, library: RulesetLibrary) -> RulesetLibrary: """ Creates a new ruleset library on the server. - Sets the library's server-assigned fields (`id`, `is_valid`, `created`, `modified`) and returns the library. + Sets the library's server-assigned fields + (`id`, `is_valid`, `validation_error`, `git`, `created`, `modified`) and returns the library. """ data = library.model_dump(exclude_none=True, by_alias=True, mode="json") @@ -71,6 +72,8 @@ def create_ruleset_library(self, library: RulesetLibrary) -> RulesetLibrary: created_library = RulesetLibrary.model_validate(response.json()) library.id = created_library.id library.is_valid = created_library.is_valid + library.validation_error = created_library.validation_error + library.git = created_library.git library.created = created_library.created library.modified = created_library.modified logger.info('Creation of ruleset library "%s" successful', library.name) @@ -90,6 +93,8 @@ def update_ruleset_library(self, library: RulesetLibrary) -> RulesetLibrary: response = self.make_request("PUT", f"/api/ruleset-libraries/{library.id}/", data=data) updated_library = RulesetLibrary.model_validate(response.json()) library.is_valid = updated_library.is_valid + library.validation_error = updated_library.validation_error + library.git = updated_library.git library.modified = updated_library.modified logger.debug('Update of ruleset library "%s" successful', library.name) return library diff --git a/datamasque/client/rulesets.py b/datamasque/client/rulesets.py index 4b9e038..77ab54d 100644 --- a/datamasque/client/rulesets.py +++ b/datamasque/client/rulesets.py @@ -3,7 +3,6 @@ from datamasque.client.base import BaseClient from datamasque.client.exceptions import DataMasqueException from datamasque.client.models.ruleset import Ruleset, RulesetId -from datamasque.client.models.status import ValidationStatus logger = logging.getLogger(__name__) @@ -21,17 +20,18 @@ def create_or_update_ruleset(self, ruleset: Ruleset) -> Ruleset: """ Creates or updates a ruleset. - Populates the given ruleset's `id` and `is_valid` fields from the server response, - and returns the same ruleset instance for convenience. + Populates the given ruleset's `id`, `is_valid`, `validation_error`, `validation_error_type`, + and `git` fields from the server response, and returns the same ruleset instance for convenience. """ data = ruleset.model_dump(exclude_none=True, by_alias=True, mode="json") response = self.make_request("POST", "/api/rulesets/", data=data, params={"upsert": "true"}) - response_data = response.json() - ruleset.id = RulesetId(response_data["id"]) - is_valid = response_data.get("is_valid") - if is_valid is not None: - ruleset.is_valid = ValidationStatus(is_valid) + created = Ruleset.model_validate(response.json()) + ruleset.id = created.id + ruleset.is_valid = created.is_valid + ruleset.validation_error = created.validation_error + ruleset.validation_error_type = created.validation_error_type + ruleset.git = created.git if response.status_code == 201: logger.info('Creation of ruleset "%s" successful', ruleset.name) diff --git a/tests/test_files.py b/tests/test_files.py index 5f5574c..3b8f99b 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -271,3 +271,42 @@ def test_upload_file_retries_on_401(config): second_form = parse_multipart_form(m_request.request_history[1]) assert first_form["seed_file"]["content"] == b"seed content" assert second_form["seed_file"]["content"] == b"seed content" + + +def test_upload_ssl_zip_file_sends_mysql_database_type(client): + """`SslZipFile` uploads must include `database_type=mysql`; the connection-filesets endpoint requires it.""" + with requests_mock.Mocker() as m_request: + m_request.post( + f"http://test-server/{SslZipFile.get_url()}", + status_code=201, + json={ + "id": str(uuid.uuid4()), + "name": "certs.zip", + "created_date": "2024-01-01T00:00:00.000000Z", + "modified_date": "2024-01-01T00:00:00.000000Z", + }, + ) + client.upload_file(SslZipFile, "certs.zip", b"zip bytes") + + form = parse_multipart_form(m_request.request_history[0]) + assert form["database_type"] == "mysql" + + +@pytest.mark.parametrize("file_type", [SeedFile, OracleWalletFile, SnowflakeKeyFile]) +def test_upload_non_ssl_zip_file_omits_database_type(client, file_type): + """Only `SslZipFile` carries the `database_type` form field; other file types must not send it.""" + with requests_mock.Mocker() as m_request: + m_request.post( + f"http://test-server/{file_type.get_url()}", + status_code=201, + json={ + "id": str(uuid.uuid4()), + "name": "file.bin", + "created_date": "2024-01-01T00:00:00.000000Z", + "modified_date": "2024-01-01T00:00:00.000000Z", + }, + ) + client.upload_file(file_type, "file.bin", b"content") + + form = parse_multipart_form(m_request.request_history[0]) + assert "database_type" not in form diff --git a/tests/test_ruleset_library.py b/tests/test_ruleset_library.py index 1ff939b..6698531 100644 --- a/tests/test_ruleset_library.py +++ b/tests/test_ruleset_library.py @@ -616,3 +616,123 @@ def test_list_rulesets_using_library_pagination(client: DataMasqueClient) -> Non assert rulesets[0].name == "r1" assert rulesets[1].name == "r2" assert rulesets[2].name == "r3" + + +def test_create_ruleset_library_populates_validation_error( + client: DataMasqueClient, ruleset_library: RulesetLibrary +) -> None: + """The server's `validation_error` string is surfaced on the returned library.""" + create_response = { + "id": LIBRARY_ID_1, + "name": "test_library", + "namespace": "test_ns", + "config_yaml": "version: '1.0'\nfunctions: []", + "is_valid": "invalid", + "validation_error": "Invalid function definition", + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-01T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/ruleset-libraries/", + json=create_response, + status_code=201, + ) + result = client.create_ruleset_library(ruleset_library) + + assert result.validation_error == "Invalid function definition" + + +def test_update_ruleset_library_populates_validation_error( + client: DataMasqueClient, ruleset_library: RulesetLibrary +) -> None: + """A full update also surfaces the server's `validation_error` string.""" + ruleset_library.id = RulesetLibraryId(LIBRARY_ID_1) + update_response = { + "id": LIBRARY_ID_1, + "name": "test_library", + "namespace": "test_ns", + "config_yaml": "version: '1.0'\nfunctions: []", + "is_valid": "invalid", + "validation_error": "Invalid function definition", + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-02T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.put( + f"http://test-server/api/ruleset-libraries/{LIBRARY_ID_1}/", + json=update_response, + status_code=200, + ) + result = client.update_ruleset_library(ruleset_library) + + assert result.validation_error == "Invalid function definition" + + +def test_create_ruleset_library_does_not_send_read_only_fields( + client: DataMasqueClient, ruleset_library: RulesetLibrary +) -> None: + """Read-only server fields must never appear in the create request body.""" + ruleset_library.id = RulesetLibraryId(LIBRARY_ID_1) + ruleset_library.is_valid = ValidationStatus.invalid + ruleset_library.validation_error = "stale error" + create_response = { + "id": LIBRARY_ID_1, + "name": "test_library", + "namespace": "test_ns", + "config_yaml": "version: '1.0'\nfunctions: []", + "is_valid": "invalid", + "validation_error": "Invalid function definition", + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-01T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/ruleset-libraries/", + json=create_response, + status_code=201, + ) + client.create_ruleset_library(ruleset_library) + + body = m.last_request.json() + for read_only_field in ("id", "is_valid", "validation_error", "created", "modified"): + assert read_only_field not in body + # Input fields are still present. + assert body["name"] == "test_library" + assert body["config_yaml"] == "version: '1.0'\nfunctions: []" + + +def test_create_ruleset_library_collapses_git_snapshot( + client: DataMasqueClient, ruleset_library: RulesetLibrary +) -> None: + """The server's flat `git_*` fields are collapsed into a nested `git` snapshot on the library.""" + create_response = { + "id": LIBRARY_ID_1, + "name": "test_library", + "namespace": "test_ns", + "config_yaml": "version: '1.0'\nfunctions: []", + "is_valid": "valid", + "git_branch": "main", + "git_commit_sha": "abc123", + "git_repo_url": "https://git.example.com/repo.git", + "git_synced_at": "2025-06-01T10:00:00Z", + "created": "2025-06-01T10:00:00Z", + "modified": "2025-06-01T10:00:00Z", + } + + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/ruleset-libraries/", + json=create_response, + status_code=201, + ) + result = client.create_ruleset_library(ruleset_library) + + assert result.git is not None + assert result.git.branch == "main" + assert result.git.commit_sha == "abc123" + assert result.git.repo_url == "https://git.example.com/repo.git" + assert result.git.synced_at == datetime.fromisoformat("2025-06-01T10:00:00+00:00") diff --git a/tests/test_rulesets.py b/tests/test_rulesets.py index ae97b3a..aff5f32 100644 --- a/tests/test_rulesets.py +++ b/tests/test_rulesets.py @@ -1,11 +1,13 @@ """Tests for `RulesetClient`.""" +from datetime import datetime + import pytest import requests_mock from datamasque.client.exceptions import DataMasqueApiError from datamasque.client.models.ruleset import RulesetType -from datamasque.client.models.status import ValidationStatus +from datamasque.client.models.status import ValidationErrorType, ValidationStatus def test_list_rulesets(client, existing_rulesets_json): @@ -32,7 +34,7 @@ def test_create_or_update_ruleset_create(client, ruleset): # Test creating a new ruleset with upsert m.post( "http://test-server/api/rulesets/?upsert=true", - json={"id": "2", "is_valid": "in_progress"}, + json={"id": "2", "name": "test_ruleset", "is_valid": "in_progress"}, status_code=201, ) @@ -62,7 +64,7 @@ def test_create_or_update_ruleset_update(client, ruleset): # Test updating an existing ruleset with upsert (returns 200 status for update) m.post( "http://test-server/api/rulesets/?upsert=true", - json={"id": "1", "is_valid": "valid"}, + json={"id": "1", "name": "test_ruleset", "is_valid": "valid"}, status_code=200, ) @@ -117,3 +119,146 @@ def test_delete_ruleset_that_does_not_exist(client, existing_rulesets_json): assert m.call_count == 1 assert m.request_history[0].method == "GET" + + +def test_create_or_update_ruleset_populates_validation_error(client, ruleset): + """The server's `validation_error` string and `validation_error_type` are surfaced on the returned ruleset.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/rulesets/?upsert=true", + json={ + "id": "2", + "name": "test_ruleset", + "is_valid": "invalid", + "validation_error": "Ruleset error: Missing required field: table", + "validation_error_type": "ruleset", + }, + status_code=201, + ) + result = client.create_or_update_ruleset(ruleset) + + assert result.validation_error == "Ruleset error: Missing required field: table" + assert result.validation_error_type is ValidationErrorType.ruleset + + +def test_create_or_update_ruleset_validation_error_none_when_absent(client, ruleset): + """`validation_error` and `validation_error_type` are `None` when the server omits them (a valid ruleset).""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/rulesets/?upsert=true", + json={"id": "2", "name": "test_ruleset", "is_valid": "valid"}, + status_code=201, + ) + result = client.create_or_update_ruleset(ruleset) + + assert result.validation_error is None + assert result.validation_error_type is None + + +def test_create_or_update_ruleset_does_not_send_read_only_fields(client, ruleset): + """Read-only server fields must never be echoed back into a re-submit's request body.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/rulesets/?upsert=true", + json={ + "id": "2", + "name": "test_ruleset", + "is_valid": "invalid", + "validation_error": "Ruleset error: bad", + "validation_error_type": "ruleset", + }, + status_code=201, + ) + client.create_or_update_ruleset(ruleset) + + # Re-submit the same object (read-only fields now populated). + client.create_or_update_ruleset(ruleset) + + body = m.last_request.json() + for read_only_field in ("id", "is_valid", "validation_error", "validation_error_type"): + assert read_only_field not in body + # Input fields are still present. + assert body["config_yaml"] == "version: '1.0'\ntasks: []" + + +def test_create_or_update_ruleset_collapses_git_snapshot(client, ruleset): + """The server's flat `git_*` fields are collapsed into a nested `git` snapshot.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/rulesets/?upsert=true", + json={ + "id": "2", + "name": "test_ruleset", + "is_valid": "valid", + "git_branch": "main", + "git_commit_sha": "abc123", + "git_repo_url": "https://git.example.com/repo.git", + "git_synced_at": "2025-06-01T10:00:00Z", + }, + status_code=201, + ) + result = client.create_or_update_ruleset(ruleset) + + assert result.git is not None + assert result.git.branch == "main" + assert result.git.commit_sha == "abc123" + assert result.git.repo_url == "https://git.example.com/repo.git" + assert result.git.synced_at == datetime.fromisoformat("2025-06-01T10:00:00+00:00") + + +def test_create_or_update_ruleset_git_none_when_not_synced(client, ruleset): + """`git` is `None` when the server reports no git provenance.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/rulesets/?upsert=true", + json={ + "id": "2", + "name": "test_ruleset", + "is_valid": "valid", + "git_branch": None, + "git_commit_sha": None, + "git_repo_url": None, + "git_synced_at": None, + }, + status_code=201, + ) + result = client.create_or_update_ruleset(ruleset) + + assert result.git is None + + +def test_list_rulesets_collapses_git_snapshot(client): + """`git_*` fields are collapsed (and consumed, not left as extras) when listing rulesets.""" + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/v2/rulesets/", + json=[ + { + "id": "1", + "name": "synced_ruleset", + "mask_type": "database", + "config_yaml": "version: '1.0'", + "is_valid": "valid", + "git_branch": "main", + "git_commit_sha": "abc123", + "git_repo_url": "https://git.example.com/repo.git", + "git_synced_at": "2025-06-01T10:00:00Z", + }, + { + "id": "2", + "name": "local_ruleset", + "mask_type": "database", + "config_yaml": "version: '1.0'", + "is_valid": "valid", + }, + ], + status_code=200, + ) + rulesets = client.list_rulesets() + + assert rulesets[0].git is not None + assert rulesets[0].git.branch == "main" + assert rulesets[1].git is None + # The flat keys were consumed, not retained as extras that would leak back on re-submit. + assert "git_branch" not in rulesets[0].model_dump(by_alias=True) + assert "git" not in rulesets[0].model_dump(by_alias=True) diff --git a/tests/test_runs.py b/tests/test_runs.py index 3360093..24cbcd6 100644 --- a/tests/test_runs.py +++ b/tests/test_runs.py @@ -161,6 +161,53 @@ def test_start_masking_run(client): assert client.start_masking_run(MaskingRunRequest(connection="1", ruleset="rs-1", name=fake.word())) == "1" +def test_start_masking_run_sends_is_user_subscribed_when_set(client): + """`is_user_subscribed=True` is sent top-level so the server subscribes the requesting user to the run.""" + with requests_mock.Mocker() as m: + m.post("http://test-server/api/runs/", json={"id": "1", "name": "r"}, status_code=201) + client.start_masking_run(MaskingRunRequest(connection="1", ruleset="rs-1", name="r", is_user_subscribed=True)) + + assert m.last_request.json()["is_user_subscribed"] is True + + +def test_start_masking_run_omits_is_user_subscribed_by_default(client): + """An unset `is_user_subscribed` is excluded from the request body rather than sent as null.""" + with requests_mock.Mocker() as m: + m.post("http://test-server/api/runs/", json={"id": "1", "name": "r"}, status_code=201) + client.start_masking_run(MaskingRunRequest(connection="1", ruleset="rs-1", name="r")) + + assert "is_user_subscribed" not in m.last_request.json() + + +def test_start_masking_run_sends_auto_pull_options(client): + """`auto_pull` / `auto_pull_branch` are sent inside `options` so the server refreshes the ruleset from git.""" + with requests_mock.Mocker() as m: + m.post("http://test-server/api/runs/", json={"id": "1", "name": "r"}, status_code=201) + client.start_masking_run( + MaskingRunRequest( + connection="1", + ruleset="rs-1", + name="r", + options=MaskingRunOptions(auto_pull=True, auto_pull_branch="main"), + ) + ) + + options = m.last_request.json()["options"] + assert options["auto_pull"] is True + assert options["auto_pull_branch"] == "main" + + +def test_start_masking_run_omits_auto_pull_options_by_default(client): + """Unset auto-pull options are excluded from the request body rather than sent as null.""" + with requests_mock.Mocker() as m: + m.post("http://test-server/api/runs/", json={"id": "1", "name": "r"}, status_code=201) + client.start_masking_run(MaskingRunRequest(connection="1", ruleset="rs-1", name="r")) + + options = m.last_request.json()["options"] + assert "auto_pull" not in options + assert "auto_pull_branch" not in options + + def test_start_masking_run_invalid_ruleset(client): with requests_mock.Mocker() as m: m.post( From 2e07560a3f2d11aaef4217405c8dada2bc56ec7f Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 17 Jun 2026 15:09:28 +1200 Subject: [PATCH 16/26] fix: Handle dict-shaped config validation errors on the discovery run API --- datamasque/client/discovery.py | 43 ++++++++++++++++++++++++---------- tests/test_discovery.py | 28 ++++++++++++++++++++++ 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index ca27772..0180036 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -283,7 +283,8 @@ def start_schema_discovery_run_from_config(self, request: SchemaDiscoveryFromCon Raises: InvalidDiscoveryConfigError: the run failed to start because the referenced - discovery config is missing, archived, or not in a `valid` validation state. + discovery config is missing, archived, not in a `valid` validation state, + or carries YAML the server rejects at trigger time. FailedToStartError: the run failed to start for any other reason. """ @@ -325,7 +326,8 @@ def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFr Raises: InvalidDiscoveryConfigError: the run failed to start because the referenced - discovery config is missing, archived, or not in a `valid` validation state. + discovery config is missing, archived, not in a `valid` validation state, + or carries YAML the server rejects at trigger time. FailedToStartError: the run failed to start for any other reason. """ @@ -353,18 +355,35 @@ def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFr response=response, ) - @staticmethod - def _maybe_raise_discovery_config_error(run_data: object, response: Response, run_kind: str) -> None: - """Raise `InvalidDiscoveryConfigError` if the server's 400 body cites `discovery_config`.""" - if not isinstance(run_data, dict) or "discovery_config" not in run_data: + # Server key for a 400 that means the discovery config itself is unusable. + # `discovery_config` covers a missing/archived config or a non-`valid` validation state + # (string messages), + # and re-validation of broken saved-config YAML at trigger time + # (a `{"message", "line_number", "column_number"}` dict per error). + DISCOVERY_CONFIG_ERROR_FIELDS = ("discovery_config",) + + @classmethod + def _maybe_raise_discovery_config_error(cls, run_data: object, response: Response, run_kind: str) -> None: + """Raise `InvalidDiscoveryConfigError` if the server's 400 body cites the discovery config.""" + if not isinstance(run_data, dict): return - errors = run_data["discovery_config"] - detail = errors[0] if isinstance(errors, list) and errors else str(errors) - raise InvalidDiscoveryConfigError( - f"{run_kind} run failed to start due to discovery config error: {detail}", - response=response, - ) + for field in cls.DISCOVERY_CONFIG_ERROR_FIELDS: + if field in run_data: + detail = cls._format_discovery_config_error(run_data[field]) + raise InvalidDiscoveryConfigError( + f"{run_kind} run failed to start due to discovery config error: {detail}", + response=response, + ) + + @staticmethod + def _format_discovery_config_error(errors: object) -> str: + """Render the first server error, handling both string and `{message, ...}` dict items.""" + first = errors[0] if isinstance(errors, list) and errors else errors + if isinstance(first, dict) and "message" in first: + return str(first["message"]) + + return str(first) def iter_schema_discovery_results(self, run_id: RunId) -> Iterator[SchemaDiscoveryResult]: """Lazily iterate all schema discovery results for a run via the paginated v2 endpoint.""" diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 0e14e57..7a2fec8 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -1067,6 +1067,34 @@ def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_ ) +def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_on_broken_yaml(client): + """ + A 400 carrying trigger-time re-validation of broken saved YAML is classified. + + The server reports these under `discovery_config` as `{"message", "line_number", "column_number"}` dicts, + not strings. + """ + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/schema-discovery/v2/", + json={ + "discovery_config": [ + {"message": "Unknown mask 'no_such_mask'.", "line_number": 4, "column_number": 7}, + ], + }, + status_code=400, + ) + with pytest.raises( + InvalidDiscoveryConfigError, + match=r"Schema discovery run failed to start due to discovery config error: Unknown mask 'no_such_mask'\.", + ): + client.start_schema_discovery_run_from_config( + SchemaDiscoveryFromConfigRequest( + connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID) + ), + ) + + def test_start_file_data_discovery_run_from_config_raises_invalid_discovery_config(client): with requests_mock.Mocker() as m: m.post( From bf1cb054d13fec85551a77e2a2fa3d94d09a3120 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Sun, 21 Jun 2026 11:01:20 +1200 Subject: [PATCH 17/26] feat: Add DiscoveryRunType and DiscoveryConfigNotFoundError --- datamasque/client/__init__.py | 5 +- datamasque/client/discovery.py | 52 ++++++++++++-------- datamasque/client/exceptions.py | 13 ++++- datamasque/client/models/discovery_config.py | 26 +++++++--- tests/test_discovery.py | 37 +++++++++++--- tests/test_discovery_configs.py | 23 ++++++++- 6 files changed, 117 insertions(+), 39 deletions(-) diff --git a/datamasque/client/__init__.py b/datamasque/client/__init__.py index 7908151..0872d95 100644 --- a/datamasque/client/__init__.py +++ b/datamasque/client/__init__.py @@ -17,6 +17,7 @@ DataMasqueNotReadyError, DataMasqueTransportError, DataMasqueUserError, + DiscoveryConfigNotFoundError, FailedToStartError, IfmAuthError, InvalidDiscoveryConfigError, @@ -77,7 +78,7 @@ SchemaDiscoveryResult, TableConstraints, ) -from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId +from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId, DiscoveryConfigType from datamasque.client.models.dm_instance import DataMasqueInstanceConfig from datamasque.client.models.files import ( DataMasqueFile, @@ -146,6 +147,8 @@ "DatabricksConnectionConfig", "DiscoveryConfig", "DiscoveryConfigId", + "DiscoveryConfigNotFoundError", + "DiscoveryConfigType", "DiscoveryMatch", "DynamoConnectionConfig", "FailedToStartError", diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 0180036..392c32e 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -10,6 +10,7 @@ from datamasque.client.exceptions import ( AsyncRulesetGenerationInProgressError, DataMasqueException, + DiscoveryConfigNotFoundError, FailedToStartError, InvalidDiscoveryConfigError, ) @@ -282,9 +283,10 @@ def start_schema_discovery_run_from_config(self, request: SchemaDiscoveryFromCon RunId: The ID of the started discovery run Raises: - InvalidDiscoveryConfigError: the run failed to start because the referenced - discovery config is missing, archived, not in a `valid` validation state, - or carries YAML the server rejects at trigger time. + DiscoveryConfigNotFoundError: the referenced discovery config cannot be found + (it does not exist or is the wrong type for the run). + InvalidDiscoveryConfigError: the config is present but not in a `valid` validation state, + or its YAML is rejected when the run starts. FailedToStartError: the run failed to start for any other reason. """ @@ -325,9 +327,10 @@ def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFr RunId: The ID of the started discovery run Raises: - InvalidDiscoveryConfigError: the run failed to start because the referenced - discovery config is missing, archived, not in a `valid` validation state, - or carries YAML the server rejects at trigger time. + DiscoveryConfigNotFoundError: the referenced discovery config cannot be found + (it does not exist or is the wrong type for the run). + InvalidDiscoveryConfigError: the config is present but not in a `valid` validation state, + or its YAML is rejected when the run starts. FailedToStartError: the run failed to start for any other reason. """ @@ -355,26 +358,35 @@ def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFr response=response, ) - # Server key for a 400 that means the discovery config itself is unusable. - # `discovery_config` covers a missing/archived config or a non-`valid` validation state - # (string messages), - # and re-validation of broken saved-config YAML at trigger time - # (a `{"message", "line_number", "column_number"}` dict per error). - DISCOVERY_CONFIG_ERROR_FIELDS = ("discovery_config",) + # Server key for a 400 that means the discovery config itself is unusable: + # a missing or wrong-type config, or one not in a `valid` validation state (string messages), + # or re-validation of broken saved-config YAML when the run starts + # (a `{"message", "line_number", "column_number"}` dict). + DISCOVERY_CONFIG_ERROR_FIELD = "discovery_config" + + # The phrase the server uses when the config id cannot be resolved (a missing or wrong-type config). + MISSING_DISCOVERY_CONFIG_SIGNATURE = "object does not exist" @classmethod def _maybe_raise_discovery_config_error(cls, run_data: object, response: Response, run_kind: str) -> None: - """Raise `InvalidDiscoveryConfigError` if the server's 400 body cites the discovery config.""" + """Raise a discovery-config error if the server's 400 body cites the discovery config.""" if not isinstance(run_data, dict): return - for field in cls.DISCOVERY_CONFIG_ERROR_FIELDS: - if field in run_data: - detail = cls._format_discovery_config_error(run_data[field]) - raise InvalidDiscoveryConfigError( - f"{run_kind} run failed to start due to discovery config error: {detail}", - response=response, - ) + if not (errors := run_data.get(cls.DISCOVERY_CONFIG_ERROR_FIELD)): + return + + detail = cls._format_discovery_config_error(errors) + if cls.MISSING_DISCOVERY_CONFIG_SIGNATURE in detail: + raise DiscoveryConfigNotFoundError( + f"{run_kind} run failed to start: the referenced discovery config could not be found: {detail}", + response=response, + ) + + raise InvalidDiscoveryConfigError( + f"{run_kind} run failed to start due to discovery config error: {detail}", + response=response, + ) @staticmethod def _format_discovery_config_error(errors: object) -> str: diff --git a/datamasque/client/exceptions.py b/datamasque/client/exceptions.py index f034c4d..39c90fa 100644 --- a/datamasque/client/exceptions.py +++ b/datamasque/client/exceptions.py @@ -43,9 +43,18 @@ class InvalidLibraryError(FailedToStartError): class InvalidDiscoveryConfigError(FailedToStartError): """ - Raised when a discovery run fails to start due to an unusable discovery config. + Raised when a discovery run fails to start because the referenced config is present but unusable. - The referenced config is missing, archived, or not in a `valid` validation state. + The config exists but is not in a `valid` validation state, or its YAML is rejected when the run starts. + A config that cannot be found raises `DiscoveryConfigNotFoundError` instead. + """ + + +class DiscoveryConfigNotFoundError(FailedToStartError): + """ + Raised when a discovery run references a discovery config that cannot be found. + + The config does not exist or is the wrong type for the run. """ diff --git a/datamasque/client/models/discovery_config.py b/datamasque/client/models/discovery_config.py index f0a6b3c..09b123f 100644 --- a/datamasque/client/models/discovery_config.py +++ b/datamasque/client/models/discovery_config.py @@ -1,3 +1,4 @@ +import enum from datetime import datetime from typing import Any, NewType, Optional @@ -8,6 +9,13 @@ DiscoveryConfigId = NewType("DiscoveryConfigId", str) +class DiscoveryConfigType(enum.Enum): + """Which discovery config variant a config targets: database (qualified columns) or file (locators).""" + + database = "database" + file = "file" + + def unwrap_discovery_config_id(value: Any) -> Any: """ Coerce a `DiscoveryConfig` to its `id`; pass other values through unchanged. @@ -33,11 +41,13 @@ class DiscoveryConfig(BaseModel): name: str yaml: Optional[str] = Field(default=None, alias="config_yaml") - id: Optional[DiscoveryConfigId] = None - # Server-managed validation surface, populated by the DataMasque server. - # `is_valid` may be `in_progress` immediately after creating a large config, - # transitioning to `valid` or `invalid` once the server finishes validating. - is_valid: Optional[ValidationStatus] = None - validation_error: Optional[str] = None - created: Optional[datetime] = None - modified: Optional[datetime] = None + config_type: DiscoveryConfigType + + # Server-populated read-only fields, excluded from request bodies. + id: Optional[DiscoveryConfigId] = Field(default=None, exclude=True) + is_valid: Optional[ValidationStatus] = Field(default=None, exclude=True) + """Validation status; may be `in_progress` briefly after creating a large config.""" + validation_error: Optional[str] = Field(default=None, exclude=True) + """Human-readable validation error, or `None` when valid.""" + created: Optional[datetime] = Field(default=None, exclude=True) + modified: Optional[datetime] = Field(default=None, exclude=True) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 7a2fec8..8605474 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -30,6 +30,7 @@ AsyncRulesetGenerationInProgressError, DataMasqueApiError, DataMasqueException, + DiscoveryConfigNotFoundError, FailedToStartError, InvalidDiscoveryConfigError, ) @@ -873,14 +874,14 @@ def test_schema_discovery_from_config_request_accepts_discovery_config_id(): def test_schema_discovery_from_config_request_unwraps_discovery_config_model(): """Passing a full `DiscoveryConfig` object substitutes its `id` for the wire payload.""" - config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + config = DiscoveryConfig(name="my_cfg", config_type="database", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) req = SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=config) assert req.model_dump(exclude_none=True, mode="json")["discovery_config"] == DISCOVERY_CONFIG_ID def test_schema_discovery_from_config_request_rejects_unsaved_discovery_config(): """A `DiscoveryConfig` without an `id` cannot be used yet — raises immediately.""" - config = DiscoveryConfig(name="my_cfg") + config = DiscoveryConfig(name="my_cfg", config_type="database") with pytest.raises(ValueError, match="id is None"): SchemaDiscoveryFromConfigRequest(connection="conn-1", discovery_config=config) @@ -983,7 +984,7 @@ def test_start_schema_discovery_run_from_config_sends_schemas(client): def test_start_file_data_discovery_run_from_config_sends_discovery_config(client): """`start_file_data_discovery_run_from_config` posts only the connection and discovery_config id.""" - config = DiscoveryConfig(name="my_cfg", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) + config = DiscoveryConfig(name="my_cfg", config_type="file", id=DiscoveryConfigId(DISCOVERY_CONFIG_ID)) req = FileDataDiscoveryFromConfigRequest(connection="conn-1", discovery_config=config) with requests_mock.Mocker() as m: m.post("http://test-server/api/run-file-data-discovery/v2/", json={"id": 99}, status_code=201) @@ -1051,21 +1052,45 @@ def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_ ) -def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_when_missing(client): - """A 400 from DRF's PrimaryKeyRelatedField (config not found / archived) is also classified.""" +def test_start_schema_discovery_run_from_config_raises_not_found_when_missing(client): + """ + A 400 for a config that cannot be found raises `DiscoveryConfigNotFoundError`. + + This is a bad reference rather than an unusable-but-present config, + so it must not be conflated with `InvalidDiscoveryConfigError`. + The not-found subclass still inherits `FailedToStartError` for callers that catch the base. + """ with requests_mock.Mocker() as m: m.post( "http://test-server/api/schema-discovery/v2/", json={"discovery_config": [f'Invalid pk "{DISCOVERY_CONFIG_ID}" - object does not exist.']}, status_code=400, ) - with pytest.raises(InvalidDiscoveryConfigError, match="object does not exist"): + with pytest.raises(DiscoveryConfigNotFoundError, match="object does not exist") as exc_info: client.start_schema_discovery_run_from_config( SchemaDiscoveryFromConfigRequest( connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID) ), ) + assert isinstance(exc_info.value, FailedToStartError) + + +def test_start_file_data_discovery_run_from_config_raises_not_found_when_missing(client): + """A not-found saved config on the file-data trigger also raises `DiscoveryConfigNotFoundError`.""" + with requests_mock.Mocker() as m: + m.post( + "http://test-server/api/run-file-data-discovery/v2/", + json={"discovery_config": [f'Invalid pk "{DISCOVERY_CONFIG_ID}" - object does not exist.']}, + status_code=400, + ) + with pytest.raises(DiscoveryConfigNotFoundError, match="object does not exist"): + client.start_file_data_discovery_run_from_config( + FileDataDiscoveryFromConfigRequest( + connection="conn-1", discovery_config=DiscoveryConfigId(DISCOVERY_CONFIG_ID) + ), + ) + def test_start_schema_discovery_run_from_config_raises_invalid_discovery_config_on_broken_yaml(client): """ diff --git a/tests/test_discovery_configs.py b/tests/test_discovery_configs.py index f98893e..7ea2b67 100644 --- a/tests/test_discovery_configs.py +++ b/tests/test_discovery_configs.py @@ -29,6 +29,7 @@ def sample_config_list_response() -> dict[str, Any]: { "id": CONFIG_ID_1, "name": "my_config", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-02T12:00:00Z", @@ -36,6 +37,7 @@ def sample_config_list_response() -> dict[str, Any]: { "id": CONFIG_ID_2, "name": "another_config", + "config_type": "database", "archived": False, "created": "2025-02-01T12:00:00Z", "modified": "2025-02-02T12:00:00Z", @@ -50,6 +52,7 @@ def sample_config_detail_response() -> dict[str, Any]: "id": CONFIG_ID_1, "name": "my_config", "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-02T12:00:00Z", @@ -61,6 +64,7 @@ def discovery_config() -> DiscoveryConfig: return DiscoveryConfig( name="test_config", yaml="labels: []\nmetadata_rules: []\nidd_rules: []\n", + config_type="database", ) @@ -89,6 +93,7 @@ def test_list_discovery_configs_pagination(client: DataMasqueClient) -> None: { "id": CONFIG_ID_1, "name": "c1", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-01T12:00:00Z", @@ -96,6 +101,7 @@ def test_list_discovery_configs_pagination(client: DataMasqueClient) -> None: { "id": CONFIG_ID_2, "name": "c2", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-01T12:00:00Z", @@ -110,6 +116,7 @@ def test_list_discovery_configs_pagination(client: DataMasqueClient) -> None: { "id": "cccccccc-1111-2222-3333-444444444444", "name": "c3", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-01T12:00:00Z", @@ -162,6 +169,7 @@ def test_get_discovery_config_by_name_found( { "id": CONFIG_ID_1, "name": "my_config", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-02T12:00:00Z", @@ -209,6 +217,7 @@ def test_get_discovery_config_by_name_raises_when_server_omits_id(client: DataMa "results": [ { "name": "my_config", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-02T12:00:00Z", @@ -231,6 +240,7 @@ def test_create_discovery_config(client: DataMasqueClient, discovery_config: Dis "id": CONFIG_ID_1, "name": "test_config", "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "config_type": "database", "archived": False, "created": "2025-06-01T10:00:00Z", "modified": "2025-06-01T10:00:00Z", @@ -252,6 +262,7 @@ def test_create_discovery_config(client: DataMasqueClient, discovery_config: Dis request_body = m.last_request.json() assert request_body["name"] == "test_config" assert request_body["config_yaml"] == "labels: []\nmetadata_rules: []\nidd_rules: []\n" + assert request_body["config_type"] == "database" def test_update_discovery_config(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: @@ -261,6 +272,7 @@ def test_update_discovery_config(client: DataMasqueClient, discovery_config: Dis "id": CONFIG_ID_1, "name": "test_config", "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "config_type": "database", "archived": False, "created": "2025-06-01T10:00:00Z", "modified": "2025-06-02T10:00:00Z", @@ -293,6 +305,7 @@ def test_create_or_update_discovery_config_create(client: DataMasqueClient, disc "id": CONFIG_ID_1, "name": "test_config", "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "config_type": "database", "archived": False, "created": "2025-06-01T10:00:00Z", "modified": "2025-06-01T10:00:00Z", @@ -317,6 +330,7 @@ def test_create_or_update_discovery_config_update(client: DataMasqueClient, disc { "id": CONFIG_ID_1, "name": "test_config", + "config_type": "database", "archived": False, "created": "2025-06-01T10:00:00Z", "modified": "2025-06-01T10:00:00Z", @@ -327,6 +341,7 @@ def test_create_or_update_discovery_config_update(client: DataMasqueClient, disc "id": CONFIG_ID_1, "name": "test_config", "config_yaml": "labels: []", + "config_type": "database", "archived": False, "created": "2025-06-01T10:00:00Z", "modified": "2025-06-01T10:00:00Z", @@ -335,6 +350,7 @@ def test_create_or_update_discovery_config_update(client: DataMasqueClient, disc "id": CONFIG_ID_1, "name": "test_config", "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", + "config_type": "database", "archived": False, "created": "2025-06-01T10:00:00Z", "modified": "2025-06-02T10:00:00Z", @@ -403,6 +419,7 @@ def test_delete_discovery_config_by_name_raises_when_server_omits_id(client: Dat "results": [ { "name": "my_config", + "config_type": "database", "archived": False, "created": "2025-01-01T12:00:00Z", "modified": "2025-01-02T12:00:00Z", @@ -441,6 +458,7 @@ def test_discovery_config_parses_validation_fields() -> None: { "id": CONFIG_ID_1, "name": "my_config", + "config_type": "database", "config_yaml": "labels: []", "is_valid": "invalid", "validation_error": "bad shape on line 3", @@ -459,6 +477,7 @@ def test_discovery_config_validation_fields_optional() -> None: { "id": CONFIG_ID_1, "name": "my_config", + "config_type": "database", "created": "2025-01-01T12:00:00Z", "modified": "2025-01-02T12:00:00Z", } @@ -474,11 +493,11 @@ def test_unwrap_discovery_config_id_passes_through_strings() -> None: def test_unwrap_discovery_config_id_extracts_id_from_model() -> None: - config = DiscoveryConfig(name="x", id=DiscoveryConfigId(CONFIG_ID_1)) + config = DiscoveryConfig(name="x", config_type="database", id=DiscoveryConfigId(CONFIG_ID_1)) assert unwrap_discovery_config_id(config) == CONFIG_ID_1 def test_unwrap_discovery_config_id_raises_without_id() -> None: - config = DiscoveryConfig(name="x") + config = DiscoveryConfig(name="x", config_type="database") with pytest.raises(ValueError, match="id is None"): unwrap_discovery_config_id(config) From 39fa29f574451caf25a618e93c6fb9397921d6a6 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 10:26:33 +1200 Subject: [PATCH 18/26] feat!: require type for discovery-config and ruleset lookup/delete Names are unique only per type, so the by-name lookups need the type to identify a single config. --- HISTORY.rst | 2 ++ datamasque/client/discovery_configs.py | 25 ++++++++++-------- datamasque/client/rulesets.py | 20 ++++++++++----- tests/test_discovery_configs.py | 35 +++++++++++++++++++++----- tests/test_rulesets.py | 17 +++++++++++-- 5 files changed, 75 insertions(+), 24 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 67055e4..e1803c0 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -21,6 +21,8 @@ History * Read-only fields (``id``, ``is_valid``, ``validation_error``, etc.) are no longer echoed back in ``Ruleset`` / ``RulesetLibrary`` create/update request bodies. * Fixed ``SslZipFile`` uploads to send the required ``database_type=mysql`` form field. +* **Breaking:** ``delete_ruleset_by_name_if_exists`` now requires a ``ruleset_type`` argument, + since ruleset names are unique only per type. 1.0.5 (2026-06-18) ------------------ diff --git a/datamasque/client/discovery_configs.py b/datamasque/client/discovery_configs.py index 617f367..d5e1a32 100644 --- a/datamasque/client/discovery_configs.py +++ b/datamasque/client/discovery_configs.py @@ -3,7 +3,7 @@ from datamasque.client.base import BaseClient from datamasque.client.exceptions import DataMasqueApiError, DataMasqueException -from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId +from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId, DiscoveryConfigType from datamasque.client.models.pagination import Page logger = logging.getLogger(__name__) @@ -33,17 +33,18 @@ def get_discovery_config(self, config_id: DiscoveryConfigId) -> DiscoveryConfig: response = self.make_request("GET", f"/api/discovery/configs/{config_id}/") return DiscoveryConfig.model_validate(response.json()) - def get_discovery_config_by_name(self, name: str) -> Optional[DiscoveryConfig]: + def get_discovery_config_by_name(self, name: str, config_type: DiscoveryConfigType) -> Optional[DiscoveryConfig]: """ - Looks for a discovery config matching the given name (case-sensitive, exact match). + Looks for a discovery config matching the given name and type (case-sensitive, exact match). + Config names are unique per type, so a type is required to identify a single config. Returns it if found, otherwise `None`. """ response = self.make_request( "GET", "/api/discovery/configs/", - params={"name_exact": name, "limit": 1}, + params={"name_exact": name, "config_type": config_type.value, "limit": 1}, ) page = Page[DiscoveryConfig].model_validate(response.json()) if not page.results: @@ -99,7 +100,7 @@ def create_or_update_discovery_config(self, config: DiscoveryConfig) -> Discover Sets the config's `id` property. """ - existing = self.get_discovery_config_by_name(config.name) + existing = self.get_discovery_config_by_name(config.name, config.config_type) if existing is not None: config.id = existing.id return self.update_discovery_config(config) @@ -115,15 +116,19 @@ def delete_discovery_config_by_id_if_exists(self, config_id: DiscoveryConfigId) self._delete_if_exists(f"/api/discovery/configs/{config_id}/") - def delete_discovery_config_by_name_if_exists(self, name: str) -> None: + def delete_discovery_config_by_name_if_exists(self, name: str, config_type: DiscoveryConfigType) -> None: """ - Archives the discovery config with the given name. + Archives the discovery config with the given name and type. - No-op if the config does not exist. + Config names are unique per type, so a type is required to identify a single config. + No-op if no such config exists. """ - all_configs = self.list_discovery_configs() - matching = [config for config in all_configs if config.name == name] + matching = [ + config + for config in self.list_discovery_configs() + if config.name == name and config.config_type is config_type + ] for config in matching: if config.id is None: raise DataMasqueException(f'Server returned a discovery config named "{config.name}" without an `id`.') diff --git a/datamasque/client/rulesets.py b/datamasque/client/rulesets.py index 77ab54d..5ea0ce9 100644 --- a/datamasque/client/rulesets.py +++ b/datamasque/client/rulesets.py @@ -2,7 +2,7 @@ from datamasque.client.base import BaseClient from datamasque.client.exceptions import DataMasqueException -from datamasque.client.models.ruleset import Ruleset, RulesetId +from datamasque.client.models.ruleset import Ruleset, RulesetId, RulesetType logger = logging.getLogger(__name__) @@ -45,12 +45,20 @@ def delete_ruleset_by_id_if_exists(self, ruleset_id: RulesetId) -> None: self._delete_if_exists(f"/api/rulesets/{ruleset_id}/") - def delete_ruleset_by_name_if_exists(self, ruleset_name: str) -> None: - """Deletes the ruleset with the given name. No-op if the ruleset does not exist.""" + def delete_ruleset_by_name_if_exists(self, ruleset_name: str, ruleset_type: RulesetType) -> None: + """ + Deletes the ruleset with the given name and type. + + Ruleset names are unique per type, so a type is required to identify a single ruleset. + No-op if no such ruleset exists. + """ - all_rulesets = self.list_rulesets() - rulesets_matching_name = [ruleset for ruleset in all_rulesets if ruleset.name == ruleset_name] - for ruleset in rulesets_matching_name: + matching = [ + ruleset + for ruleset in self.list_rulesets() + if ruleset.name == ruleset_name and ruleset.ruleset_type is ruleset_type + ] + for ruleset in matching: if ruleset.id is None: raise DataMasqueException(f'Server returned a ruleset named "{ruleset.name}" without an `id`.') diff --git a/tests/test_discovery_configs.py b/tests/test_discovery_configs.py index 7ea2b67..d8bdf68 100644 --- a/tests/test_discovery_configs.py +++ b/tests/test_discovery_configs.py @@ -11,6 +11,7 @@ from datamasque.client.models.discovery_config import ( DiscoveryConfig, DiscoveryConfigId, + DiscoveryConfigType, unwrap_discovery_config_id, ) from datamasque.client.models.status import ValidationStatus @@ -188,11 +189,12 @@ def test_get_discovery_config_by_name_found( json=sample_config_detail_response, status_code=200, ) - config = client.get_discovery_config_by_name("my_config") + config = client.get_discovery_config_by_name("my_config", DiscoveryConfigType.database) assert config is not None assert config.name == "my_config" assert "name_exact=my_config" in m.request_history[0].url + assert "config_type=database" in m.request_history[0].url def test_get_discovery_config_by_name_not_found(client: DataMasqueClient) -> None: @@ -204,7 +206,7 @@ def test_get_discovery_config_by_name_not_found(client: DataMasqueClient) -> Non json=empty_response, status_code=200, ) - config = client.get_discovery_config_by_name("nonexistent") + config = client.get_discovery_config_by_name("nonexistent", DiscoveryConfigType.database) assert config is None @@ -232,7 +234,7 @@ def test_get_discovery_config_by_name_raises_when_server_omits_id(client: DataMa status_code=200, ) with pytest.raises(DataMasqueApiError, match="without an `id`"): - client.get_discovery_config_by_name("my_config") + client.get_discovery_config_by_name("my_config", DiscoveryConfigType.database) def test_create_discovery_config(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: @@ -394,7 +396,7 @@ def test_delete_discovery_config_by_name(client: DataMasqueClient, sample_config with requests_mock.Mocker() as m: m.get("http://test-server/api/discovery/configs/", json=sample_config_list_response, status_code=200) m.delete(f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", status_code=204) - client.delete_discovery_config_by_name_if_exists("my_config") + client.delete_discovery_config_by_name_if_exists("my_config", DiscoveryConfigType.database) assert m.request_history[0].method == "GET" assert m.request_history[1].method == "DELETE" @@ -405,7 +407,7 @@ def test_delete_discovery_config_by_name_not_found( ) -> None: with requests_mock.Mocker() as m: m.get("http://test-server/api/discovery/configs/", json=sample_config_list_response, status_code=200) - client.delete_discovery_config_by_name_if_exists("nonexistent") + client.delete_discovery_config_by_name_if_exists("nonexistent", DiscoveryConfigType.database) assert m.call_count == 1 assert m.request_history[0].method == "GET" @@ -434,7 +436,28 @@ def test_delete_discovery_config_by_name_raises_when_server_omits_id(client: Dat status_code=200, ) with pytest.raises(DataMasqueException, match="without an `id`"): - client.delete_discovery_config_by_name_if_exists("my_config") + client.delete_discovery_config_by_name_if_exists("my_config", DiscoveryConfigType.database) + + +def test_delete_discovery_config_by_name_only_deletes_matching_type(client: DataMasqueClient) -> None: + shared_name_response = { + "count": 2, + "next": None, + "previous": None, + "results": [ + {"id": CONFIG_ID_1, "name": "shared", "config_type": "database", "archived": False}, + {"id": CONFIG_ID_2, "name": "shared", "config_type": "file", "archived": False}, + ], + } + + with requests_mock.Mocker() as m: + m.get("http://test-server/api/discovery/configs/", json=shared_name_response, status_code=200) + m.delete(f"http://test-server/api/discovery/configs/{CONFIG_ID_2}/", status_code=204) + client.delete_discovery_config_by_name_if_exists("shared", DiscoveryConfigType.file) + + assert m.request_history[-1].method == "DELETE" + assert CONFIG_ID_2 in m.request_history[-1].url + assert CONFIG_ID_1 not in m.request_history[-1].url def test_get_default_discovery_config_yaml(client: DataMasqueClient) -> None: diff --git a/tests/test_rulesets.py b/tests/test_rulesets.py index aff5f32..1200a68 100644 --- a/tests/test_rulesets.py +++ b/tests/test_rulesets.py @@ -101,13 +101,26 @@ def test_delete_ruleset_by_name(client, existing_rulesets_json): status_code=200, ) m.delete("http://test-server/api/rulesets/2/", status_code=204) - client.delete_ruleset_by_name_if_exists("file_masking_ruleset") + client.delete_ruleset_by_name_if_exists("file_masking_ruleset", RulesetType.file) assert m.call_count == 2 assert m.request_history[0].method == "GET" assert m.request_history[1].method == "DELETE" +def test_delete_ruleset_by_name_skips_other_type(client, existing_rulesets_json): + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/v2/rulesets/", + json=existing_rulesets_json, + status_code=200, + ) + client.delete_ruleset_by_name_if_exists("file_masking_ruleset", RulesetType.database) + + assert m.call_count == 1 + assert m.request_history[0].method == "GET" + + def test_delete_ruleset_that_does_not_exist(client, existing_rulesets_json): with requests_mock.Mocker() as m: m.get( @@ -115,7 +128,7 @@ def test_delete_ruleset_that_does_not_exist(client, existing_rulesets_json): json=existing_rulesets_json, status_code=200, ) - client.delete_ruleset_by_name_if_exists("not_a_ruleset") + client.delete_ruleset_by_name_if_exists("not_a_ruleset", RulesetType.database) assert m.call_count == 1 assert m.request_history[0].method == "GET" From db41755a75c18342296ea292b12e578c8641a9e0 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 13:02:19 +1200 Subject: [PATCH 19/26] fix: populate validation status on discovery-config create/update Copy `is_valid`/`validation_error` back from the server response, matching the ruleset-library client. --- datamasque/client/discovery_configs.py | 38 ++++++++++++++++++-------- tests/test_discovery_configs.py | 28 ++++++++----------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/datamasque/client/discovery_configs.py b/datamasque/client/discovery_configs.py index d5e1a32..b8cb5d5 100644 --- a/datamasque/client/discovery_configs.py +++ b/datamasque/client/discovery_configs.py @@ -33,13 +33,10 @@ def get_discovery_config(self, config_id: DiscoveryConfigId) -> DiscoveryConfig: response = self.make_request("GET", f"/api/discovery/configs/{config_id}/") return DiscoveryConfig.model_validate(response.json()) - def get_discovery_config_by_name(self, name: str, config_type: DiscoveryConfigType) -> Optional[DiscoveryConfig]: - """ - Looks for a discovery config matching the given name and type (case-sensitive, exact match). - - Config names are unique per type, so a type is required to identify a single config. - Returns it if found, otherwise `None`. - """ + def _get_discovery_config_id_by_name( + self, name: str, config_type: DiscoveryConfigType + ) -> Optional[DiscoveryConfigId]: + """Return the id of the config matching name and type via a single list request, or `None`.""" response = self.make_request( "GET", @@ -57,19 +54,36 @@ def get_discovery_config_by_name(self, name: str, config_type: DiscoveryConfigTy response=response, ) + return config_id + + def get_discovery_config_by_name(self, name: str, config_type: DiscoveryConfigType) -> Optional[DiscoveryConfig]: + """ + Looks for a discovery config matching the given name and type (case-sensitive, exact match). + + Config names are unique per type, so a type is required to identify a single config. + Returns it if found, otherwise `None`. + """ + + config_id = self._get_discovery_config_id_by_name(name, config_type) + if config_id is None: + return None + return self.get_discovery_config(config_id) def create_discovery_config(self, config: DiscoveryConfig) -> DiscoveryConfig: """ Creates a new discovery config on the server. - Sets the config's server-assigned fields (`id`, `created`, `modified`) and returns the config. + Sets the config's server-assigned fields + (`id`, `is_valid`, `validation_error`, `created`, `modified`) and returns the config. """ data = config.model_dump(exclude_none=True, by_alias=True, mode="json") response = self.make_request("POST", "/api/discovery/configs/", data=data) created = DiscoveryConfig.model_validate(response.json()) config.id = created.id + config.is_valid = created.is_valid + config.validation_error = created.validation_error config.created = created.created config.modified = created.modified logger.info('Creation of discovery config "%s" successful', config.name) @@ -89,6 +103,8 @@ def update_discovery_config(self, config: DiscoveryConfig) -> DiscoveryConfig: data = config.model_dump(exclude_none=True, by_alias=True, mode="json") response = self.make_request("PUT", f"/api/discovery/configs/{config.id}/", data=data) updated = DiscoveryConfig.model_validate(response.json()) + config.is_valid = updated.is_valid + config.validation_error = updated.validation_error config.modified = updated.modified logger.debug('Update of discovery config "%s" successful', config.name) return config @@ -100,9 +116,9 @@ def create_or_update_discovery_config(self, config: DiscoveryConfig) -> Discover Sets the config's `id` property. """ - existing = self.get_discovery_config_by_name(config.name, config.config_type) - if existing is not None: - config.id = existing.id + existing_id = self._get_discovery_config_id_by_name(config.name, config.config_type) + if existing_id is not None: + config.id = existing_id return self.update_discovery_config(config) return self.create_discovery_config(config) diff --git a/tests/test_discovery_configs.py b/tests/test_discovery_configs.py index d8bdf68..caa4b31 100644 --- a/tests/test_discovery_configs.py +++ b/tests/test_discovery_configs.py @@ -243,6 +243,7 @@ def test_create_discovery_config(client: DataMasqueClient, discovery_config: Dis "name": "test_config", "config_yaml": "labels: []\nmetadata_rules: []\nidd_rules: []\n", "config_type": "database", + "is_valid": "in_progress", "archived": False, "created": "2025-06-01T10:00:00Z", "modified": "2025-06-01T10:00:00Z", @@ -258,6 +259,7 @@ def test_create_discovery_config(client: DataMasqueClient, discovery_config: Dis assert result is discovery_config assert result.id == DiscoveryConfigId(CONFIG_ID_1) + assert result.is_valid is ValidationStatus.in_progress assert result.created == datetime.fromisoformat("2025-06-01T10:00:00+00:00") assert result.modified == datetime.fromisoformat("2025-06-01T10:00:00+00:00") @@ -265,6 +267,11 @@ def test_create_discovery_config(client: DataMasqueClient, discovery_config: Dis assert request_body["name"] == "test_config" assert request_body["config_yaml"] == "labels: []\nmetadata_rules: []\nidd_rules: []\n" assert request_body["config_type"] == "database" + # Server-managed read-only fields must not be echoed back in the request body. + assert "id" not in request_body + assert "is_valid" not in request_body + assert "created" not in request_body + assert "modified" not in request_body def test_update_discovery_config(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: @@ -294,6 +301,8 @@ def test_update_discovery_config(client: DataMasqueClient, discovery_config: Dis request_body = m.last_request.json() assert request_body["name"] == "test_config" assert request_body["config_yaml"] == "labels: []\nmetadata_rules: []\nidd_rules: []\n" + # The id identifies the config in the URL, not the request body. + assert "id" not in request_body def test_update_discovery_config_no_id_raises(client: DataMasqueClient, discovery_config: DiscoveryConfig) -> None: @@ -339,15 +348,6 @@ def test_create_or_update_discovery_config_update(client: DataMasqueClient, disc }, ], } - detail_response = { - "id": CONFIG_ID_1, - "name": "test_config", - "config_yaml": "labels: []", - "config_type": "database", - "archived": False, - "created": "2025-06-01T10:00:00Z", - "modified": "2025-06-01T10:00:00Z", - } update_response = { "id": CONFIG_ID_1, "name": "test_config", @@ -360,11 +360,6 @@ def test_create_or_update_discovery_config_update(client: DataMasqueClient, disc with requests_mock.Mocker() as m: m.get("http://test-server/api/discovery/configs/", json=list_response, status_code=200) - m.get( - f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", - json=detail_response, - status_code=200, - ) m.put( f"http://test-server/api/discovery/configs/{CONFIG_ID_1}/", json=update_response, @@ -373,9 +368,8 @@ def test_create_or_update_discovery_config_update(client: DataMasqueClient, disc result = client.create_or_update_discovery_config(discovery_config) assert result.id == DiscoveryConfigId(CONFIG_ID_1) - assert m.request_history[0].method == "GET" - assert m.request_history[1].method == "GET" - assert m.request_history[2].method == "PUT" + # The upsert resolves the id from the list response alone — no detail GET before the PUT. + assert [r.method for r in m.request_history] == ["GET", "PUT"] def test_delete_discovery_config_by_id(client: DataMasqueClient) -> None: From f1464c722f377ea38e25a02459b86a92c6effc90 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 13:02:19 +1200 Subject: [PATCH 20/26] fix: tighten file-discovery result models Require the fields the server always returns; type `connection` as `RunConnectionRef`. --- datamasque/client/models/discovery.py | 31 ++++++++++++++------------- tests/test_discovery.py | 30 ++++++++++++++++++++++++++ tests/test_runs.py | 11 +++++++++- 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 6a41b1c..7bccefa 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -9,6 +9,7 @@ from datamasque.client.models.data_selection import HashColumnsTableConfig, Locator, UserSelection from datamasque.client.models.discovery_config import DiscoveryConfig, DiscoveryConfigId, unwrap_discovery_config_id from datamasque.client.models.pagination import Page +from datamasque.client.models.runs import RunConnectionRef class InDataDiscoveryRule(BaseModel): @@ -340,11 +341,11 @@ class FileDiscoveryMatch(BaseModel): model_config = ConfigDict(extra="allow") - categories: Optional[list[str]] = None - flagged_by: Optional[str] = None - description: Optional[str] = None - label: Optional[str] = None - hit_ratio: Optional[int] = None + flagged_by: str + description: str + label: Optional[str] = None # Omitted for non-sensitive and ignored matches. + categories: Optional[list[str]] = None # Omitted for ignored matches. + hit_ratio: Optional[int] = None # None for metadata matches, percentage 0-100 for IDD matches. class FileDiscoveryLocatorResult(BaseModel): @@ -352,9 +353,9 @@ class FileDiscoveryLocatorResult(BaseModel): model_config = ConfigDict(extra="allow") - locator: Optional[Locator] = None - matches: Optional[list[FileDiscoveryMatch]] = None - data_types: Optional[list[str]] = None + locator: Locator + matches: list[FileDiscoveryMatch] + data_types: list[str] class FileDiscoveryFile(BaseModel): @@ -362,8 +363,8 @@ class FileDiscoveryFile(BaseModel): model_config = ConfigDict(extra="allow") - path: Optional[str] = None - file_type: Optional[str] = None + path: str + file_type: str delimiter: Optional[str] = None encoding: Optional[str] = None @@ -373,8 +374,8 @@ class FileDiscoveryResult(BaseModel): model_config = ConfigDict(extra="allow") - id: Optional[int] = None - connection: Optional[Any] = None - file_type: Optional[str] = None - files: Optional[list[FileDiscoveryFile]] = None - results: Optional[list[FileDiscoveryLocatorResult]] = None + id: int + connection: RunConnectionRef + file_type: str + files: list[FileDiscoveryFile] + results: list[FileDiscoveryLocatorResult] diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 8605474..a5ede9b 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -15,6 +15,7 @@ FileDataDiscoveryFromConfigRequest, FileDataDiscoveryOptions, FileDataDiscoveryRequest, + FileDiscoveryResult, FileFilter, FileFilterMatchAgainst, FileRulesetGenerationRequest, @@ -837,6 +838,35 @@ def test_file_filter_rejects_empty_pattern(): FileFilter(regex="", glob="*.csv") +def test_file_discovery_result_parses_server_response(): + """A file-discovery-results record parses; match label/categories stay optional for non-sensitive matches.""" + result = FileDiscoveryResult.model_validate( + { + "id": 7, + "connection": {"id": "conn-1", "name": "my files"}, + "file_type": "csv", + "files": [{"path": "data/people.csv", "file_type": "csv", "delimiter": ",", "encoding": "utf-8"}], + "results": [ + { + "locator": "email", + "data_types": ["string"], + "matches": [ + {"flagged_by": "MDD", "description": "Email", "label": "email", "categories": ["PII"]}, + {"flagged_by": "MDD", "description": "Not sensitive"}, + ], + } + ], + } + ) + + assert result.id == 7 + assert result.connection.name == "my files" + assert result.files[0].path == "data/people.csv" + non_sensitive_match = result.results[0].matches[1] + assert non_sensitive_match.label is None + assert non_sensitive_match.categories is None + + def test_file_data_discovery_ignore_rules_serialize(): """`in_data_discovery.ignore_rules` round-trips into the wire payload.""" req = FileDataDiscoveryRequest( diff --git a/tests/test_runs.py b/tests/test_runs.py index 24cbcd6..d4be2e8 100644 --- a/tests/test_runs.py +++ b/tests/test_runs.py @@ -48,12 +48,21 @@ def test_get_file_data_discovery_report(client): with requests_mock.Mocker() as m: m.get( "http://test-server/api/runs/1/file-discovery-results/", - json=[{"id": 1, "file_type": "csv", "files": [], "results": []}], + json=[ + { + "id": 1, + "connection": {"id": "conn-1", "name": "files"}, + "file_type": "csv", + "files": [], + "results": [], + } + ], status_code=200, ) results = client.get_file_data_discovery_report(RunId(1)) assert len(results) == 1 assert results[0].id == 1 + assert results[0].connection.name == "files" def test_unfinished_run_str_with_destination(): From eb5e26ef81cf6015f029624ccdccfd19840df485 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 13:02:19 +1200 Subject: [PATCH 21/26] fix: raise when a generated-ruleset archive contains no rulesets --- datamasque/client/discovery.py | 7 +++++++ tests/test_discovery.py | 27 +++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 392c32e..94de3cb 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -192,6 +192,13 @@ def get_generated_rulesets(self, connection_id: ConnectionId) -> list[Ruleset]: with zip_file.open(file_info) as file: yaml_content = file.read().decode("utf-8") rulesets.append(Ruleset(name=Path(file_info.filename).stem, yaml=yaml_content)) + + if not rulesets: + raise DataMasqueException( + f"Ruleset generation for connection {connection_id} reported `finished` " + f"but the downloaded archive contained no rulesets." + ) + return rulesets generated = response.json().get("generated_ruleset") diff --git a/tests/test_discovery.py b/tests/test_discovery.py index a5ede9b..298f160 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -193,6 +193,33 @@ def test_get_generated_rulesets_success(client): assert rulesets[1].yaml == yaml_content_2.decode("utf-8") +def test_get_generated_rulesets_empty_archive_raises(client): + """A finished task whose download archive contains no ruleset files raises a clear error.""" + connection_id = ConnectionId("1") + + with requests_mock.Mocker() as m: + m.get( + f"http://test-server/api/async-generate-ruleset/{connection_id}/", + json={"status": "finished"}, + status_code=200, + ) + + zip_buffer = BytesIO() + with zipfile.ZipFile(zip_buffer, "w") as zip_file: + zip_file.writestr("README.txt", "no rulesets here") + zip_buffer.seek(0) + + m.get( + f"http://test-server/api/async-generate-ruleset/{connection_id}/download-rulesets/", + content=zip_buffer.getvalue(), + headers={"Content-Disposition": 'attachment; filename="rulesets.zip"'}, + status_code=200, + ) + + with pytest.raises(DataMasqueException, match="contained no rulesets"): + client.get_generated_rulesets(connection_id) + + def test_get_generated_rulesets_from_selection_success(client): """Non-CSV async RG: server 303s to the task-status endpoint, whose JSON body carries `generated_ruleset`.""" connection_id = ConnectionId("1") From 45877a7c8d4382552f92584a335f088da2606958 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 13:02:19 +1200 Subject: [PATCH 22/26] docs: tidy the unreleased changelog --- HISTORY.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index e1803c0..7f43206 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -5,18 +5,17 @@ History 1.1.0 (unreleased) ------------------ -* Added discovery-config management APIs (``list_discovery_configs``, ``create_discovery_config``, and friends). +* Added discovery configuration models and management APIs. * Added schema-discovery and file-data-discovery APIs that take a saved discovery configuration (``start_schema_discovery_run_from_config`` / ``start_file_data_discovery_run_from_config``). Adoption is recommended; the older APIs that take individual options will be deprecated in a future release. * Corrected the file-data-discovery ``include``/``skip`` filter syntax and added ``ignore_rules`` support. -* Added ``InvalidDiscoveryConfigError``, raised when a discovery run can't start due to an invalid discovery config. -* Added ``is_user_subscribed`` to ``MaskingRunRequest`` - to subscribe the requesting user to a run's email notifications. +* Added ``InvalidDiscoveryConfigError`` and ``DiscoveryConfigNotFoundError``, + raised when a discovery run can't start due to an unusable or missing discovery config. +* Added ``is_user_subscribed`` to ``MaskingRunRequest`` to subscribe the requesting user to a run's email notifications. * Added ``auto_pull`` / ``auto_pull_branch`` to ``MaskingRunOptions`` to refresh the run's ruleset from git before starting. -* Added ``validation_error`` (and ``validation_error_type`` for rulesets) - to ``Ruleset`` and ``RulesetLibrary``. +* Added ``validation_error`` (and ``validation_error_type`` for rulesets) to ``Ruleset`` and ``RulesetLibrary``. * Exposed git provenance on ``Ruleset`` and ``RulesetLibrary`` as a nested ``git`` field (``GitSnapshot``). * Read-only fields (``id``, ``is_valid``, ``validation_error``, etc.) are no longer echoed back in ``Ruleset`` / ``RulesetLibrary`` create/update request bodies. From cb865d16d4c1f31b92f6c9dfe9663b6968f001ec Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 13:49:43 +1200 Subject: [PATCH 23/26] docs: drop references to "archive" from docstrings --- datamasque/client/discovery_configs.py | 6 +++--- datamasque/client/ruleset_libraries.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datamasque/client/discovery_configs.py b/datamasque/client/discovery_configs.py index b8cb5d5..60f448d 100644 --- a/datamasque/client/discovery_configs.py +++ b/datamasque/client/discovery_configs.py @@ -19,7 +19,7 @@ def iter_discovery_configs(self) -> Iterator[DiscoveryConfig]: def list_discovery_configs(self) -> list[DiscoveryConfig]: """ - Lists all (non-archived) discovery configs. + Lists all discovery configs. Note: the YAML content is not included in the list response for performance. Use `get_discovery_config` to retrieve the full config with its YAML body. @@ -125,7 +125,7 @@ def create_or_update_discovery_config(self, config: DiscoveryConfig) -> Discover def delete_discovery_config_by_id_if_exists(self, config_id: DiscoveryConfigId) -> None: """ - Archives the discovery config with the given ID. + Deletes the discovery config with the given ID. No-op if the config does not exist. """ @@ -134,7 +134,7 @@ def delete_discovery_config_by_id_if_exists(self, config_id: DiscoveryConfigId) def delete_discovery_config_by_name_if_exists(self, name: str, config_type: DiscoveryConfigType) -> None: """ - Archives the discovery config with the given name and type. + Deletes the discovery config with the given name and type. Config names are unique per type, so a type is required to identify a single config. No-op if no such config exists. diff --git a/datamasque/client/ruleset_libraries.py b/datamasque/client/ruleset_libraries.py index 8943553..e734bc2 100644 --- a/datamasque/client/ruleset_libraries.py +++ b/datamasque/client/ruleset_libraries.py @@ -115,7 +115,7 @@ def create_or_update_ruleset_library(self, library: RulesetLibrary) -> RulesetLi def delete_ruleset_library_by_id_if_exists(self, library_id: RulesetLibraryId, *, force: bool = False) -> None: """ - Deletes (archives) the ruleset library with the given ID. + Deletes the ruleset library with the given ID. No-op if the library does not exist. @@ -144,13 +144,13 @@ def delete_ruleset_library_by_name_if_exists( self.delete_ruleset_library_by_id_if_exists(lib.id, force=force) def iter_rulesets_using_library(self, library_id: RulesetLibraryId) -> Iterator[Ruleset]: - """Lazily iterate non-archived rulesets that import the given library.""" + """Lazily iterate rulesets that import the given library.""" return self._iter_paginated(f"/api/ruleset-libraries/{library_id}/rulesets/", model=Ruleset) def list_rulesets_using_library(self, library_id: RulesetLibraryId) -> list[Ruleset]: """ - Lists non-archived rulesets that import the given library. + Lists rulesets that import the given library. Note: The YAML content is not included in the response for performance. Each returned Ruleset will have an empty string for `yaml`. From edfa6e049779813346963b05e2bc68300283e5f6 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 14:20:36 +1200 Subject: [PATCH 24/26] refactor: collapse duplicated saved-config discovery start methods --- datamasque/client/discovery.py | 50 +++++++++++----------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 94de3cb..0d2155b 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -297,29 +297,7 @@ def start_schema_discovery_run_from_config(self, request: SchemaDiscoveryFromCon FailedToStartError: the run failed to start for any other reason. """ - data = request.model_dump(exclude_none=True, mode="json") - # The server requires `discovery_config` to be present; a null selects its built-in defaults, - # so send it explicitly rather than letting `exclude_none` drop a None. - data.setdefault("discovery_config", None) - response = self.make_request( - "POST", - "/api/schema-discovery/v2/", - data=data, - require_status_check=False, - ) - run_data = response.json() if response.content else {} - - if response.status_code == 201: - logger.info("Schema discovery run %s started successfully", run_data["id"]) - return RunId(run_data["id"]) - - logger.error("Schema discovery run failed to start: %s", run_data) - self._maybe_raise_discovery_config_error(run_data, response, "Schema discovery") - raise FailedToStartError( - f"Schema discovery run failed to start " - f"(server responded with status {response.status_code}: {response.text}).", - response=response, - ) + return self._start_discovery_run_from_config(request, "/api/schema-discovery/v2/", "Schema discovery") def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFromConfigRequest) -> RunId: """ @@ -341,27 +319,31 @@ def start_file_data_discovery_run_from_config(self, request: FileDataDiscoveryFr FailedToStartError: the run failed to start for any other reason. """ + return self._start_discovery_run_from_config(request, "/api/run-file-data-discovery/v2/", "File data discovery") + + def _start_discovery_run_from_config( + self, + request: Union[SchemaDiscoveryFromConfigRequest, FileDataDiscoveryFromConfigRequest], + path: str, + run_kind: str, + ) -> RunId: + """Post a saved-config discovery request and return its run id, classifying config errors on failure.""" + data = request.model_dump(exclude_none=True, mode="json") # The server requires `discovery_config` to be present; a null selects its built-in defaults, # so send it explicitly rather than letting `exclude_none` drop a None. data.setdefault("discovery_config", None) - response = self.make_request( - "POST", - "/api/run-file-data-discovery/v2/", - data=data, - require_status_check=False, - ) + response = self.make_request("POST", path, data=data, require_status_check=False) run_data = response.json() if response.content else {} if response.status_code == 201: - logger.info("File data discovery run %s started successfully", run_data["id"]) + logger.info("%s run %s started successfully", run_kind, run_data["id"]) return RunId(run_data["id"]) - logger.error("File data discovery run failed to start: %s", run_data) - self._maybe_raise_discovery_config_error(run_data, response, "File data discovery") + logger.error("%s run failed to start: %s", run_kind, run_data) + self._maybe_raise_discovery_config_error(run_data, response, run_kind) raise FailedToStartError( - f"File data discovery run failed to start " - f"(server responded with status {response.status_code}: {response.text}).", + f"{run_kind} run failed to start (server responded with status {response.status_code}: {response.text}).", response=response, ) From 55c1dc800c5bd02b7ce7755c6c5085689f3f5393 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 14:46:18 +1200 Subject: [PATCH 25/26] feat: add get_discovery_run_config_snapshot_yaml --- HISTORY.rst | 2 ++ datamasque/client/discovery.py | 19 +++++++++++++ tests/test_discovery.py | 50 ++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) diff --git a/HISTORY.rst b/HISTORY.rst index 7f43206..e239d5e 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -12,6 +12,8 @@ History * Corrected the file-data-discovery ``include``/``skip`` filter syntax and added ``ignore_rules`` support. * Added ``InvalidDiscoveryConfigError`` and ``DiscoveryConfigNotFoundError``, raised when a discovery run can't start due to an unusable or missing discovery config. +* Added ``get_discovery_run_config_snapshot_yaml`` to retrieve the discovery-config YAML + that was effective at the start of a given discovery run. * Added ``is_user_subscribed`` to ``MaskingRunRequest`` to subscribe the requesting user to a run's email notifications. * Added ``auto_pull`` / ``auto_pull_branch`` to ``MaskingRunOptions`` to refresh the run's ruleset from git before starting. diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 0d2155b..f6f3850 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -440,3 +440,22 @@ def get_file_data_discovery_report(self, run_id: RunId) -> list[FileDiscoveryRes response = self.make_request("GET", f"api/runs/{run_id}/file-discovery-results/") return [FileDiscoveryResult.model_validate(d) for d in response.json()] + + def get_discovery_run_config_snapshot_yaml(self, run_id: RunId, *, timezone: Optional[str] = None) -> str: + """ + Returns the discovery-config YAML that was effective at the start of the given discovery run. + + The YAML is prefixed with a commented provenance header naming the saved config + (or the built-in defaults) the run used, and whether it has since been modified or deleted. + `timezone`, a `±HH:MM` UTC offset, sets the timezone of the header timestamp; the server defaults to UTC. + """ + + params = {"timezone": timezone} if timezone is not None else None + response = self.make_request("GET", f"/api/discovery/runs/{run_id}/config-snapshot/", params=params) + with zipfile.ZipFile(BytesIO(response.content)) as zip_file: + names = zip_file.namelist() + if not names: + raise DataMasqueException(f"Discovery run {run_id} config snapshot archive contained no files.") + + with zip_file.open(names[0]) as snapshot_file: + return snapshot_file.read().decode("utf-8") diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 298f160..42debfd 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -1213,3 +1213,53 @@ def test_start_schema_discovery_run_from_config_non_discovery_config_400_still_r ), ) assert not isinstance(exc_info.value, InvalidDiscoveryConfigError) + + +def _zip_bytes(*members: tuple[str, str]) -> bytes: + buffer = BytesIO() + with zipfile.ZipFile(buffer, "w") as zip_file: + for name, content in members: + zip_file.writestr(name, content) + + return buffer.getvalue() + + +def test_get_discovery_run_config_snapshot_yaml_returns_yaml(client): + """The single zipped snapshot member is unzipped and returned as a YAML string.""" + snapshot = "# Discovery configuration: my_cfg.\nlabels: []\nmetadata_rules: []\n" + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/runs/7/config-snapshot/", + content=_zip_bytes(("config-7.yaml", snapshot)), + headers={"Content-Disposition": 'attachment; filename="config-7.yaml.zip"'}, + status_code=200, + ) + result = client.get_discovery_run_config_snapshot_yaml(RunId(7)) + + assert result == snapshot + assert "timezone" not in (m.last_request.qs or {}) + + +def test_get_discovery_run_config_snapshot_yaml_forwards_timezone(client): + """An explicit `timezone` is forwarded as a query param for the provenance-header timestamp.""" + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/runs/7/config-snapshot/", + content=_zip_bytes(("config-7.yaml", "labels: []\n")), + status_code=200, + ) + client.get_discovery_run_config_snapshot_yaml(RunId(7), timezone="+12:00") + + assert m.last_request.qs["timezone"] == ["+12:00"] + + +def test_get_discovery_run_config_snapshot_yaml_empty_archive_raises(client): + """An archive with no members raises a clear error rather than an IndexError.""" + with requests_mock.Mocker() as m: + m.get( + "http://test-server/api/discovery/runs/7/config-snapshot/", + content=_zip_bytes(), + status_code=200, + ) + with pytest.raises(DataMasqueException, match="contained no files"): + client.get_discovery_run_config_snapshot_yaml(RunId(7)) From d0c59d5508775b6a206f2f220466bc0c37bb20ea Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Wed, 24 Jun 2026 15:02:32 +1200 Subject: [PATCH 26/26] chore: release 1.1.0 --- HISTORY.rst | 2 +- pyproject.toml | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index e239d5e..dc72f9a 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,7 +2,7 @@ History ======= -1.1.0 (unreleased) +1.1.0 (2026-06-24) ------------------ * Added discovery configuration models and management APIs. diff --git a/pyproject.toml b/pyproject.toml index 7ba57f5..1d226e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "datamasque-python" -version = "1.1.0.dev0" +version = "1.1.0" description = "Official Python client for the DataMasque data-masking API." authors = [ { name = "DataMasque Ltd" }, diff --git a/setup.cfg b/setup.cfg index 24d750a..26b71a7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.1.0.dev0 +current_version = 1.1.0 commit = True tag = True