From 6968215ba76ec6afa0a02e232e1f3c246c49917f Mon Sep 17 00:00:00 2001 From: jiatolentino Date: Tue, 16 Jun 2026 12:09:54 +0800 Subject: [PATCH] feat: return split db-discovery reports as zip bytes --- datamasque/client/discovery.py | 10 ++++++++-- datamasque/client/runs.py | 11 +++++++++-- tests/test_discovery.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index 19b89f0..e54a042 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -98,10 +98,15 @@ def start_async_ruleset_generation_from_csv( """ content: BufferedIOBase + filename = "ruleset.csv" + content_type = "text/csv" if isinstance(csv_content, str): content = BytesIO(csv_content.encode()) elif isinstance(csv_content, bytes): content = BytesIO(csv_content) + if csv_content[:4] == b"PK\x03\x04": + filename = "ruleset.zip" + content_type = "application/zip" elif isinstance(csv_content, TextIOBase): content = BytesIO(csv_content.read().encode()) else: @@ -110,11 +115,12 @@ def start_async_ruleset_generation_from_csv( files = [ UploadFile( field_name="csv_or_zip_file", - filename="ruleset.csv", + filename=filename, content=content, - content_type="text/csv", + content_type=content_type, ), ] + self.make_request( method="POST", path=f"/api/async-generate-ruleset/{connection_id}/from-csv/", diff --git a/datamasque/client/runs.py b/datamasque/client/runs.py index 6b9a827..c25317e 100644 --- a/datamasque/client/runs.py +++ b/datamasque/client/runs.py @@ -1,5 +1,6 @@ import logging import re +from typing import Union from datamasque.client.base import BaseClient from datamasque.client.exceptions import ( @@ -43,9 +44,12 @@ def get_run_report(self, run_id: RunId) -> str: response = self.make_request("GET", f"api/runs/{run_id}/run-report/") return response.text - def get_db_discovery_result_report(self, run_id: RunId, include_selection_column: bool = True) -> str: + def get_db_discovery_result_report(self, run_id: RunId, include_selection_column: bool = True) -> Union[str, bytes]: """ - Returns the database-discovery result report for the specified run as CSV. + Returns the database-discovery result report for the specified run. + + Returns CSV text (`str`), or a zip of numbered CSV parts as `bytes` when the report is + large enough that the server splits it. When `include_selection_column` is true (the default), the CSV includes a `selected` column suitable for feeding back into ruleset generation. @@ -54,6 +58,9 @@ def get_db_discovery_result_report(self, run_id: RunId, include_selection_column url = f"api/runs/{run_id}/db-discovery-results/report/" params = None if include_selection_column else {"include_selection_column": "false"} response = self.make_request("GET", url, params=params) + + if response.headers.get("X-DM-Download-Format") == "zip": + return response.content return response.text def get_unfinished_runs(self) -> dict[str, UnfinishedRun]: diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 2830287..d8a0b3d 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -93,6 +93,17 @@ def test_get_db_discovery_result_report(client): assert result == "db discovery report without selection column" +def test_get_db_discovery_result_report_returns_zip_bytes_when_split(client): + run_id = RunId(1) + zip_bytes = b"PK\x03\x04 split report zip bytes" + with requests_mock.Mocker() as m: + url = f"http://test-server/api/runs/{run_id}/db-discovery-results/report/" + m.get(url, content=zip_bytes, headers={"X-DM-Download-Format": "zip"}, status_code=200) + result = client.get_db_discovery_result_report(run_id) + assert result == zip_bytes + assert isinstance(result, bytes) + + def test_poll_async_ruleset_generation(client): connection_id = ConnectionId("1") with requests_mock.Mocker() as m: @@ -421,6 +432,24 @@ def test_start_async_ruleset_generation_from_csv_success(client, csv_content): assert form_data["csv_or_zip_file"]["content"] == b"schema,table,column,selected\npublic,users,email,true" +def test_start_async_ruleset_generation_from_csv_uploads_zip_as_zip(client): + """A split report (zip bytes) is uploaded with a .zip filename and zip content-type.""" + connection_id = ConnectionId("1") + zip_content = b"PK\x03\x04 zipped discovery report" + + with requests_mock.Mocker() as m: + m.post( + f"http://test-server/api/async-generate-ruleset/{connection_id}/from-csv/", + status_code=201, + ) + client.start_async_ruleset_generation_from_csv(connection_id, zip_content) + + form_data = parse_multipart_form(m.last_request) + assert form_data["csv_or_zip_file"]["filename"] == "ruleset.zip" + assert form_data["csv_or_zip_file"]["content_type"] == "application/zip" + assert form_data["csv_or_zip_file"]["content"] == zip_content + + def test_start_async_ruleset_generation_from_csv_with_target_size(client): """Test async ruleset generation from CSV with target_size_bytes parameter.""" connection_id = ConnectionId("1")