diff --git a/d3b_api_client_cli/cli/__init__.py b/d3b_api_client_cli/cli/__init__.py index bc2d419..e268080 100644 --- a/d3b_api_client_cli/cli/__init__.py +++ b/d3b_api_client_cli/cli/__init__.py @@ -7,6 +7,14 @@ import click from d3b_api_client_cli.cli.dewrangle import * from d3b_api_client_cli.cli.postgres import * +from d3b_api_client_cli.cli.faker import * + + +@click.group() +def faker(): + """ + Group of lower level CLI commands related to generating fake data + """ @click.group() @@ -35,6 +43,9 @@ def main(): """ +# Fake data commands +faker.add_command(generate_global_id_file) + # Postgres API commands postgres.add_command(save_file_to_db) @@ -57,7 +68,12 @@ def main(): dewrangle.add_command(create_billing_group) dewrangle.add_command(delete_billing_group) dewrangle.add_command(read_billing_groups) +dewrangle.add_command(upsert_global_descriptors) +dewrangle.add_command(download_global_descriptors) +dewrangle.add_command(upsert_and_download_global_descriptors) +dewrangle.add_command(upsert_and_download_global_descriptor) # Add command groups to the root CLI main.add_command(dewrangle) main.add_command(postgres) +main.add_command(faker) diff --git a/d3b_api_client_cli/cli/dewrangle/__init__.py b/d3b_api_client_cli/cli/dewrangle/__init__.py index 4921f23..2c7febd 100644 --- a/d3b_api_client_cli/cli/dewrangle/__init__.py +++ b/d3b_api_client_cli/cli/dewrangle/__init__.py @@ -10,3 +10,4 @@ from d3b_api_client_cli.cli.dewrangle.volume_commands import * from d3b_api_client_cli.cli.dewrangle.job_commands import * from d3b_api_client_cli.cli.dewrangle.billing_group_commands import * +from d3b_api_client_cli.cli.dewrangle.global_id_commands import * diff --git a/d3b_api_client_cli/cli/dewrangle/global_id_commands.py b/d3b_api_client_cli/cli/dewrangle/global_id_commands.py new file mode 100644 index 0000000..654cd90 --- /dev/null +++ b/d3b_api_client_cli/cli/dewrangle/global_id_commands.py @@ -0,0 +1,285 @@ +""" +All CLI commands related to creating, updating, and 
import os
import logging
import click

from d3b_api_client_cli.config import log, FHIR_RESOURCE_TYPES
from d3b_api_client_cli.dewrangle.global_id import GlobalIdDescriptorOptions
from d3b_api_client_cli.dewrangle.global_id import (
    upsert_global_descriptors as _upsert_global_descriptors,
    download_global_descriptors as _download_global_descriptors,
    upsert_and_download_global_descriptors as _upsert_and_download_global_descriptors,
    upsert_and_download_global_descriptor as _upsert_and_download_global_descriptor,
)

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Options shared by several commands.
#
# click.option() returns a decorator that builds a new Option each time it is
# applied, so one decorator object can be reused on many commands. The names
# are underscore-prefixed so `from ... import *` does not re-export them.
# ---------------------------------------------------------------------------
_output_filepath_option = click.option(
    "--output-filepath",
    type=click.Path(exists=False, file_okay=True, dir_okay=False),
    help="If provided, download the file to this path. This takes "
    "precedence over the --output-dir option",
)
_output_dir_option = click.option(
    "--output-dir",
    # Pass the callable (not its result) so the working directory is resolved
    # when the command runs rather than when this module is imported
    default=os.getcwd,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    help="If provided, download the file with the default file name into "
    "this directory",
)
_download_all_option = click.option(
    "--download-all",
    is_flag=True,
    help="What descriptor(s) for each global ID to download. Either download"
    " all descriptors for each global ID or just the most recent",
)
_study_global_id_option = click.option(
    "--study-global-id",
    help="The global ID of the study in Dewrangle. You must provide either "
    "the global ID of the study OR the GraphQL ID of the study but not both",
)
_study_id_option = click.option(
    "--study-id",
    help="The GraphQL ID of the study in Dewrangle. You must provide either "
    "the global ID of the study OR the GraphQL ID of the study but not both",
)


def _require_study_identifier(study_id, study_global_id):
    """
    Abort the command unless at least one study identifier was supplied.

    Raises:
        click.BadParameter: if both study_id and study_global_id are empty
    """
    if (not study_id) and (not study_global_id):
        raise click.BadParameter(
            "❌ You must provide either the study's global ID in Dewrangle OR "
            "the study's GraphQL ID in Dewrangle"
        )


@click.command()
@_output_filepath_option
@_output_dir_option
@_download_all_option
@_study_global_id_option
@_study_id_option
@click.option(
    "--global-id",
    help="Global ID associated with this descriptor."
    " If this is provided, and the descriptor is new, then Dewrangle"
    " will append the descriptor to this global ID's descriptor list",
)
@click.option(
    "--fhir-resource-type",
    type=click.Choice(list(FHIR_RESOURCE_TYPES)),
    required=True,
    help="Value of the fhirResourceType column for this descriptor",
)
@click.option(
    "--descriptor",
    required=True,
    help="The descriptor to upsert",
)
def upsert_and_download_global_descriptor(
    descriptor,
    fhir_resource_type,
    global_id,
    study_id,
    study_global_id,
    download_all,
    output_dir,
    output_filepath,
):
    """
    Send request to upsert one global ID descriptor in Dewrangle and
    download the resulting global ID descriptors.

    \b
    In order to create a new global ID provide:
        --descriptor, --fhir-resource-type

    \b
    In order to update an existing global ID provide:
        --descriptor, --fhir-resource-type, --global-id
    """

    log.init_logger()
    _require_study_identifier(study_id, study_global_id)

    return _upsert_and_download_global_descriptor(
        descriptor,
        fhir_resource_type,
        global_id=global_id,
        study_global_id=study_global_id,
        dewrangle_study_id=study_id,
        download_all=download_all,
        output_dir=output_dir,
        output_filepath=output_filepath,
    )


@click.command()
@_output_filepath_option
@_output_dir_option
@_download_all_option
@_study_global_id_option
@_study_id_option
@click.argument(
    "input_filepath",
    type=click.Path(exists=False, file_okay=True, dir_okay=False),
)
def upsert_and_download_global_descriptors(
    input_filepath,
    study_id,
    study_global_id,
    download_all,
    output_dir,
    output_filepath,
):
    """
    Send request to upsert global ID descriptors in Dewrangle and
    download the resulting global ID descriptors.

    In order to create new global IDs provide a CSV file with the columns:
        descriptor, fhirResourceType

    In order to update existing global IDs provide a CSV file with the columns:
        descriptor, fhirResourceType, globalId

    \b
    Arguments:
        \b
        input_filepath - Path to the file with global IDs and descriptors
    """

    log.init_logger()
    _require_study_identifier(study_id, study_global_id)

    return _upsert_and_download_global_descriptors(
        input_filepath,
        study_global_id=study_global_id,
        dewrangle_study_id=study_id,
        download_all=download_all,
        output_dir=output_dir,
        output_filepath=output_filepath,
    )


@click.command()
@_study_global_id_option
@_study_id_option
@click.argument(
    "filepath",
    type=click.Path(exists=False, file_okay=True, dir_okay=False),
)
def upsert_global_descriptors(filepath, study_id, study_global_id):
    """
    Upsert global ID descriptors in Dewrangle for a study.

    In order to create new global IDs provide a CSV file with the columns:
        descriptor, fhirResourceType

    In order to update existing global IDs provide a CSV file with the columns:
        descriptor, fhirResourceType, globalId

    \b
    Arguments:
        \b
        filepath - Path to the file with global IDs and descriptors
    """

    log.init_logger()
    _require_study_identifier(study_id, study_global_id)

    # Keywords make it impossible to swap the two kinds of study identifier
    return _upsert_global_descriptors(
        filepath,
        study_global_id=study_global_id,
        dewrangle_study_id=study_id,
    )


@click.command()
@_output_dir_option
@_download_all_option
@click.option(
    "--job-id", help="Dewrangle job id from the upsert_global_descriptors cmd"
)
@_study_global_id_option
@_study_id_option
@click.option(
    "--filepath",
    type=click.Path(exists=False, file_okay=True, dir_okay=False),
    help="If provided, download the file to this filepath. This takes "
    "precedence over --output-dir",
)
def download_global_descriptors(
    filepath, study_id, study_global_id, job_id, download_all, output_dir
):
    """
    Download global ID descriptors in Dewrangle for a study.
    """

    log.init_logger()
    _require_study_identifier(study_id, study_global_id)

    return _download_global_descriptors(
        dewrangle_study_id=study_id,
        study_global_id=study_global_id,
        filepath=filepath,
        job_id=job_id,
        download_all=download_all,
        output_dir=output_dir,
    )
type=click.Path(exists=True, file_okay=False, dir_okay=True), + help="Where the output file will be written", +) +@click.option( + "--fhir-resource-type", + default=DEFAULT_FHIR_RESOURCE_TYPE.resource_type, + type=click.Choice([rt for rt in FHIR_RESOURCE_TYPES.keys()]), + help="What the fhirResourceType column will be populated with", +) +@click.option( + "--with-global-ids", + is_flag=True, + help="Whether or not to generate a globalId column", +) +@click.option( + "--starting-index", + type=int, + default=0, + help="Determines what index the sequential descriptors start at", +) +@click.option( + "--total-rows", + type=int, + default=10, + help="Total number of rows to generate", +) +def generate_global_id_file( + total_rows, starting_index, with_global_ids, fhir_resource_type, output_dir +): + """ + Generate a csv file with global IDs and descriptors. + + \b + Descriptors are formatted like: + \b + - <2 char prefix for resource type>-000 + - Example: For a DocumentReference FHIR resource type the + descriptors would look like `dr-1000` + + \b + When starting_index is supplied it will be added to the row index. + \b + - Example: row 0, starting_index=255, descriptor = dr-25500 + - Example: row 1, starting_index=255, descriptor = dr-25600 + + \b + The starting_index allows a developer to have some control over the + descriptors that get generated so they can test create, replace, and + append functions for global IDs. 
+ """ + + log.init_logger() + + return _generate_global_id_file( + fhir_resource_type, + total_rows=total_rows, + starting_index=starting_index, + with_global_ids=with_global_ids, + output_dir=output_dir, + ) diff --git a/d3b_api_client_cli/config/__init__.py b/d3b_api_client_cli/config/__init__.py index 74e7a76..0c95cbd 100644 --- a/d3b_api_client_cli/config/__init__.py +++ b/d3b_api_client_cli/config/__init__.py @@ -3,12 +3,14 @@ """ import os +from dataclasses import dataclass from dotenv import find_dotenv, load_dotenv # File paths and directories ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname((__file__)))) ROOT_DATA_DIR = os.path.join(ROOT_DIR, "data") +ROOT_FAKE_DATA_DIR = os.path.join(ROOT_DATA_DIR, "fake_data") LOG_DIR = os.path.join(ROOT_DATA_DIR, "logs") DOTENV_PATH = find_dotenv() @@ -27,6 +29,23 @@ DB_USER_PW = os.environ.get("DB_USER_PW") +@dataclass +class FhirResourceType: + """ + Wrapper class to define a FHIR resource type along with a global ID + prefix + """ + + resource_type: str + id_prefix: str + + +FHIR_RESOURCE_TYPES: dict = { + resource_type: FhirResourceType(resource_type, prefix) + for resource_type, prefix in [("DocumentReference", "dr")] +} + + class SECRETS: """ Used in logger initialization to obfuscate sensitive env variables @@ -61,7 +80,8 @@ def check_dewrangle_http_config(): "endpoints": { "graphql": "/api/graphql", "rest": { - "hash_report": "/api/rest/jobs/{job_id}/report/volume-hash", + "study_file": "api/rest/studies/{dewrangle_study_id}/files/{filename}", + "global_id": "api/rest/studies/{dewrangle_study_id}/global-descriptors", "job_errors": "/api/rest/jobs/{job_id}/errors", }, }, @@ -69,6 +89,7 @@ def check_dewrangle_http_config(): "credential_type": "AWS", "billing_group_id": os.environ.get("CAVATICA_BILLING_GROUP_ID"), }, + "faker": {"global_id": {"fhir_resource_types": FHIR_RESOURCE_TYPES}}, "aws": { "region": os.environ.get("AWS_DEFAULT_REGION") or "us-east-1", "s3": { diff --git 
"""
Dewrangle functions to create, update, remove global descriptors in Dewrangle
"""

from enum import Enum
from typing import Optional, Union
from pprint import pformat
import logging
import os

import pandas

from d3b_api_client_cli.dewrangle.graphql import study as study_api
from d3b_api_client_cli.dewrangle.rest.files import download_file

from d3b_api_client_cli.config import config, ROOT_DATA_DIR, FhirResourceType
from d3b_api_client_cli.dewrangle.rest import (
    upload_study_file,
)
from d3b_api_client_cli.utils import timestamp

logger = logging.getLogger(__name__)

CSV_CONTENT_TYPE = "text/csv"
DEWRANGLE_BASE_URL = config["dewrangle"]["base_url"].rstrip("/")
DEFAULT_FILENAME = f"dewrangle-file-{timestamp()}.csv"


class GlobalIdDescriptorOptions(Enum):
    """
    Values of the `descriptors` query parameter sent by
    download_global_descriptors
    """

    DOWNLOAD_ALL_DESC = "all"
    DOWNLOAD_MOST_RECENT = "most-recent"


def _rest_url(endpoint_name: str, **path_params) -> str:
    """
    Build the full URL of a Dewrangle REST endpoint declared in the config.

    The base URL has its trailing slash stripped and the endpoint its leading
    slash stripped, so exactly one slash joins them however either is
    configured.
    """
    template = config["dewrangle"]["endpoints"]["rest"][endpoint_name]
    endpoint = template.format(**path_params)
    return f"{DEWRANGLE_BASE_URL}/{endpoint.lstrip('/')}"


def _find_study(
    dewrangle_study_id: Optional[str], study_global_id: Optional[str]
) -> dict:
    """
    Look up a study by GraphQL ID when given, otherwise by global ID.

    Raises:
        ValueError if the study does not exist in Dewrangle
    """
    if dewrangle_study_id:
        study = study_api.read_study(dewrangle_study_id)
    else:
        study = study_api.find_study(study_global_id)

    if not study:
        raise ValueError(
            f"❌ Study "
            f"{study_global_id if study_global_id else dewrangle_study_id}"
            " does not exist in Dewrangle. Aborting"
        )
    return study


def upsert_and_download_global_descriptor(
    descriptor: str,
    fhir_resource_type: Union[str, FhirResourceType],
    global_id: Optional[str] = None,
    study_global_id: Optional[str] = None,
    dewrangle_study_id: Optional[str] = None,
    skip_unavailable_descriptors: Optional[bool] = True,
    download_all: Optional[bool] = True,
    output_dir: Optional[str] = None,
    output_filepath: Optional[str] = None,
) -> str:
    """
    Upsert a single global descriptor and download created/updated
    global descriptors and ID from Dewrangle

    The descriptor is written to a one-row CSV file in output_dir which is
    then passed to upsert_and_download_global_descriptors.

    Args:
        - descriptor: the descriptor to upsert
        - fhir_resource_type: the FHIR resource type name, or a
          FhirResourceType whose resource_type is used

    Options:
        - global_id: if provided, written to the globalId column
        - output_dir: defaults to ROOT_DATA_DIR
        - See upsert_global_descriptors and download_global_descriptors
          for the remaining options

    Returns:
        filepath: path to downloaded global ID descriptors
    """
    if not output_dir:
        output_dir = ROOT_DATA_DIR
    os.makedirs(output_dir, exist_ok=True)

    s_id = study_global_id or dewrangle_study_id
    filepath = os.path.join(output_dir, f"global-descriptors-{s_id}.csv")

    logger.info("✏️ Preparing to upsert single global descriptor ...")
    logger.info("Writing parameters to file %s", filepath)

    # The CSV must contain the resource type name. Accept the dataclass too
    # so its repr never ends up in the file.
    resource_type = getattr(
        fhir_resource_type, "resource_type", fhir_resource_type
    )
    row = {"descriptor": descriptor, "fhirResourceType": resource_type}
    if global_id:
        row["globalId"] = global_id

    pandas.DataFrame([row]).to_csv(filepath, index=False)

    return upsert_and_download_global_descriptors(
        filepath,
        study_global_id=study_global_id,
        dewrangle_study_id=dewrangle_study_id,
        skip_unavailable_descriptors=skip_unavailable_descriptors,
        download_all=download_all,
        output_dir=output_dir,
        output_filepath=output_filepath,
    )


def upsert_and_download_global_descriptors(
    input_filepath: str,
    study_global_id: Optional[str] = None,
    dewrangle_study_id: Optional[str] = None,
    skip_unavailable_descriptors: Optional[bool] = True,
    download_all: Optional[bool] = True,
    output_dir: Optional[str] = None,
    output_filepath: Optional[str] = None,
) -> str:
    """
    Send request to upsert global descriptors and download created/updated
    global descriptors and ID from Dewrangle

    Only the descriptors belonging to the job started by the upsert are
    downloaded.

    Args:
        - input_filepath: path to the CSV file of descriptors to upsert

    Options:
        - output_dir: defaults to ROOT_DATA_DIR
        - See upsert_global_descriptors and download_global_descriptors
          for the remaining options

    Returns:
        filepath: path to downloaded global ID descriptors
    """
    if not output_dir:
        output_dir = ROOT_DATA_DIR
    os.makedirs(output_dir, exist_ok=True)

    result = upsert_global_descriptors(
        input_filepath,
        study_global_id=study_global_id,
        dewrangle_study_id=dewrangle_study_id,
        skip_unavailable_descriptors=skip_unavailable_descriptors,
    )

    return download_global_descriptors(
        dewrangle_study_id=result["study_id"],
        job_id=result["job"]["id"],
        download_all=download_all,
        filepath=output_filepath,
        output_dir=output_dir,
    )


def upsert_global_descriptors(
    filepath: str,
    study_global_id: Optional[str],
    dewrangle_study_id: Optional[str],
    skip_unavailable_descriptors: Optional[bool] = True,
) -> dict:
    """
    Upsert global descriptors to Dewrangle

    This happens in two steps:
    1. Upload the global descriptor csv file to the study file endpoint
    2. Invoke the graphQL mutation to upsert global descriptors

    Args:
        - filepath: path to the CSV file of descriptors to upsert
        - skip_unavailable_descriptors (bool): If true any errors due to a
          descriptor already having a global ID assigned will be ignored

    Options:
        - study_global_id - Provide this when you don't know the study's
          GraphQL ID in Dewrangle.
        - dewrangle_study_id - Study GraphQL ID in Dewrangle. Used for the
          lookup when both identifiers are given.

    Returns:
        The `globalDescriptorUpsert` payload of the mutation with the keys
        `study_global_id` and `study_id` added

    Raise:
        ValueError if the study does not exist in Dewrangle, or if the
        mutation response does not contain a job
    """
    study = _find_study(dewrangle_study_id, study_global_id)
    study_global_id = study["globalId"]
    dewrangle_study_id = study["id"]

    logger.info(
        "🛸 Upsert global IDs in %s to Dewrangle for study %s",
        filepath,
        study_global_id,
    )

    filepath = os.path.abspath(filepath)
    # Only logged here; upload_study_file builds the URL it actually uses
    url = _rest_url(
        "study_file",
        dewrangle_study_id=dewrangle_study_id,
        filename=os.path.basename(filepath),
    )
    logger.info("🛸 POST global IDs file %s to Dewrangle %s", filepath, url)

    upload = upload_study_file(dewrangle_study_id, filepath=filepath)
    study_file_id = upload["id"]

    # Trigger global descriptor upsert mutation
    resp = study_api.upsert_global_descriptors(
        study_file_id, skip_unavailable_descriptors=skip_unavailable_descriptors
    )
    result = resp.get("globalDescriptorUpsert") or {}
    job_id = (result.get("job") or {}).get("id")
    if not job_id:
        # Without a job there is nothing to download later. Fail here with
        # the reported errors instead of a TypeError on result["job"]["id"].
        raise ValueError(
            "❌ Dewrangle did not return a job for the global descriptor "
            f"upsert. Errors:\n{pformat(result.get('errors'))}"
        )

    result["study_global_id"] = study_global_id
    result["study_id"] = dewrangle_study_id

    logger.info(
        "✅ Completed request to upsert global descriptors. Job ID: %s", job_id
    )

    return result


def download_global_descriptors(
    dewrangle_study_id: Optional[str] = None,
    study_global_id: Optional[str] = None,
    job_id: Optional[str] = None,
    download_all: Optional[bool] = True,
    filepath: Optional[str] = None,
    output_dir: Optional[str] = None,
) -> str:
    """
    Download study's global IDs from Dewrangle

    Args:
        - dewrangle_study_id: GraphQL ID of study in Dewrangle
        - study_global_id: Global ID of study in Dewrangle. Only used for
          the lookup when dewrangle_study_id is not given

    Options:
        - job_id: The job ID returned from the upsert_global_descriptors
          method. If this is provided, only global IDs from that
          job will be returned.

        - download_all: Determines how many descriptors
          will be returned for the global ID.

          If True, return all descriptors associated
          with the global ID

          If False, return the most recent
          descriptor associated with the global ID

        - filepath: If filepath is provided, download content to that filepath

        - output_dir: Directory to download into when filepath is not
          provided. See download_file for how the file name is chosen

    Returns:
        The value returned by download_file (path of the downloaded file)

    Raise:
        ValueError if the study does not exist in Dewrangle
    """
    study = _find_study(dewrangle_study_id, study_global_id)
    study_global_id = study["globalId"]
    dewrangle_study_id = study["id"]

    option = (
        GlobalIdDescriptorOptions.DOWNLOAD_ALL_DESC
        if download_all
        else GlobalIdDescriptorOptions.DOWNLOAD_MOST_RECENT
    )
    url = _rest_url("global_id", dewrangle_study_id=dewrangle_study_id)

    params = {}
    # Download global IDs associated with this job only
    if job_id:
        params["job"] = job_id
    # How many descriptors to return for each affected global ID
    params["descriptors"] = option.value

    logger.info(
        "🛸 Start download of global IDs for study %s from Dewrangle: %s"
        " Params: %s",
        study_global_id,
        url,
        pformat(params),
    )

    filepath = download_file(
        url, output_dir=output_dir, filepath=filepath, params=params
    )

    logger.info("✅ Completed download of global IDs: %s", filepath)

    return filepath
def upsert_global_descriptors(
    study_file_id: str, skip_unavailable_descriptors: Optional[bool] = True
) -> dict:
    """
    Trigger the operation to upsert global descriptors in Dewrangle

    Mutation errors and job errors are logged, not raised; the caller
    receives the raw response either way.

    Args:
        - study_file_id: ID of the uploaded study file holding the
          descriptors
        - skip_unavailable_descriptors: If true any errors due to a descriptor
          already having a global ID assigned will be ignored

    Returns:
        The response of the globalDescriptorUpsert mutation
    """
    logger.info(
        "🛸 Upsert global descriptors for study file: %s", study_file_id
    )
    variables = {
        "input": {
            "studyFileId": study_file_id,
            "skipUnavailableDescriptors": skip_unavailable_descriptors,
        }
    }
    resp = exec_query(mutations.upsert_global_descriptors, variables=variables)

    key = "globalDescriptorUpsert"
    # Any level may be present but null, and `.get(name, {})` only covers a
    # missing key, so coalesce each level before descending
    payload = resp.get(key) or {}
    mutation_errors = payload.get("errors")
    job = payload.get("job") or {}
    job_errors = (job.get("errors") or {}).get("edges") or []

    if mutation_errors or job_errors:
        logger.error("❌ %s for study failed", key)
        if mutation_errors:
            logger.error("❌ Mutation Errors:\n%s", pformat(mutation_errors))
        if job_errors:
            logger.error("❌ Job Errors:\n%s", pformat(job_errors))
    else:
        logger.info("✅ %s for study succeeded:\n%s", key, pformat(resp))

    return resp
on MutationError { + __typename + message + field + } + } + job { + id + completedAt + globalDescriptors { + totalCount + edges { + node { + descriptor + globalId + fhirResourceType + } + } + } + errors { + totalCount + edges { + node { + name + message + isFatal + } + } + } + } + } + } + """ +) diff --git a/d3b_api_client_cli/dewrangle/rest/files.py b/d3b_api_client_cli/dewrangle/rest/files.py index 977e069..eb082eb 100644 --- a/d3b_api_client_cli/dewrangle/rest/files.py +++ b/d3b_api_client_cli/dewrangle/rest/files.py @@ -3,14 +3,17 @@ """ from typing import Optional +from pprint import pformat, pprint import logging import os import cgi + from d3b_api_client_cli.config import ( DEWRANGLE_DEV_PAT, config, check_dewrangle_http_config, + ROOT_DATA_DIR, ) from d3b_api_client_cli.utils import send_request, timestamp @@ -30,13 +33,37 @@ def _filename_from_headers(headers: dict) -> str: return params.get("filename") +def upload_file(url: str, filepath: str, params: Optional[dict] = None): + """ + Upload a file to Dewrangle + """ + logger.info("🛸 Starting upload of %s to %s", filepath, url) + with open(filepath, "rb") as file_to_upload: + headers = {"x-api-key": DEWRANGLE_DEV_PAT} + resp = send_request( + "post", + url, + headers=headers, + data=file_to_upload, + params=params, + # Set timeout to infinity so that uploads don't timeout + timeout=-1, + ) + + logger.info("✅ Completed upload: %s", os.path.split(filepath)[-1]) + logger.info(pformat(resp.json())) + + return resp.json() + + def download_file( url: str, output_dir: Optional[str] = None, filepath: Optional[str] = None, + params: Optional[dict] = None, ) -> str: """ - Download study's global IDs from Dewrangle + Download a file from Dewrangle If filepath is provided, download content to that filepath @@ -47,12 +74,17 @@ def download_file( filepath - if the downloaded file was not empty None - if the downloaded file was empty """ - logger.info("🛸 Start downloading file from Dewrangle ...") + logger.info("🛸 Start 
def upload_study_file(dewrangle_study_id: str, filepath: str):
    """
    Upload a CSV file to Dewrangle's study file endpoint

    The file is posted under its own base name.

    Args:
        - dewrangle_study_id: GraphQL ID of the study in Dewrangle
        - filepath: path to the file to upload

    Returns:
        The value returned by upload_file
    """
    # Ensure env vars are set
    check_dewrangle_http_config()

    filepath = os.path.abspath(filepath)
    endpoint_template = config["dewrangle"]["endpoints"]["rest"]["study_file"]
    endpoint = endpoint_template.format(
        dewrangle_study_id=dewrangle_study_id,
        filename=os.path.basename(filepath),
    )
    # Join with exactly one slash whether or not the configured base URL
    # ends with one or the endpoint template starts with one
    base_url = config["dewrangle"]["base_url"].rstrip("/")
    url = f"{base_url}/{endpoint.lstrip('/')}"

    return upload_file(url, filepath)
"""
Generate files of global ID descriptors for testing and development
"""

import os
from typing import Optional
from pprint import pformat
import logging

import pandas

from d3b_api_client_cli.config import (
    config,
    FhirResourceType,
    ROOT_FAKE_DATA_DIR,
)

FHIR_RESOURCE_TYPES: dict[str, FhirResourceType] = config["faker"]["global_id"][
    "fhir_resource_types"
]
DEFAULT_FHIR_RESOURCE_TYPE: str = "DocumentReference"

# Number of characters after the prefix in a fake global ID
_GLOBAL_ID_DIGITS = 10

logger = logging.getLogger(__name__)


def _generate_fake_global_id(
    prefix: str, starting_index: Optional[int] = 0
) -> str:
    """
    Generate a fake Dewrangle global ID

    The index is written after the prefix and padded on the right with
    zeros to 10 digits, e.g. prefix "dr" and index 255 give "dr-2550000000".

    Raises:
        ValueError if the index is not made of digits only (this includes
        negative numbers) or has more than 10 digits
    """
    digits = str(starting_index)

    if not digits.isdigit():
        raise ValueError("Starting index must contain only digits.")

    if len(digits) > _GLOBAL_ID_DIGITS:
        raise ValueError("Starting index cannot be longer than 10 digits.")

    # NOTE(review): right padding maps 1, 10 and 100 to the same ID, so files
    # whose indices span a power of ten contain duplicates. Left padding
    # would be unique but changes the documented format - confirm before
    # changing.
    return f"{prefix}-{digits.ljust(_GLOBAL_ID_DIGITS, '0')}"


def generate_global_id_file(
    fhir_resource_type: Optional[str] = DEFAULT_FHIR_RESOURCE_TYPE,
    with_global_ids: Optional[bool] = False,
    total_rows: Optional[int] = 10,
    starting_index: Optional[int] = 0,
    output_dir: Optional[str] = None,
) -> str:
    """
    Generate a csv file with global IDs and descriptors.

    Each row gets the index `starting_index + row number`, padded on the
    right with zeros to 10 digits:

    - descriptor: <FHIR resource type>-<padded index>
    - globalId:   <prefix of the resource type>-<padded index>

    - Example: row 0, starting_index=255, DocumentReference:
        descriptor = DocumentReference-2550000000
        globalId = dr-2550000000
    - Example: row 1, starting_index=255, DocumentReference:
        descriptor = DocumentReference-2560000000
        globalId = dr-2560000000

    The starting_index allows a developer to have some control over the
    descriptors that get generated so they can test create, replace, and
    append functions for descriptors.

    Options:
        - fhir_resource_type: name of the FHIR resource type to populate the
        file with. Must be a key of FHIR_RESOURCE_TYPES

        - with_global_ids: Whether or not to include a column for global IDs
        if global IDs are not included and this file is used in
        upsert_global_descriptors, then new global IDs will be created by
        Dewrangle

        - total_rows: Number of rows to generate

        - starting_index: Used in generating sequential descriptors.

        - output_dir: Directory to write the file to. Defaults to
        ROOT_FAKE_DATA_DIR

    Returns:
        Path to file

    Raises:
        ValueError if fhir_resource_type is not a known resource type
    """
    logger.info(
        "🏭 Generating %s rows for fake global ID descriptors file", total_rows
    )
    resource = FHIR_RESOURCE_TYPES.get(fhir_resource_type)
    if resource is None:
        raise ValueError(
            f"Unknown FHIR resource type {fhir_resource_type!r}. "
            f"Expected one of: {', '.join(FHIR_RESOURCE_TYPES)}"
        )

    if not output_dir:
        output_dir = ROOT_FAKE_DATA_DIR
    os.makedirs(output_dir, exist_ok=True)

    columns = ["fhirResourceType", "descriptor"]
    if with_global_ids:
        columns.append("globalId")

    data = []
    for i in range(total_rows):
        global_id = _generate_fake_global_id(
            resource.id_prefix, starting_index + i
        )
        descriptor_suffix = global_id.split("-")[-1]
        row = {
            "fhirResourceType": resource.resource_type,
            "descriptor": f"{resource.resource_type}-{descriptor_suffix}",
        }
        if with_global_ids:
            row["globalId"] = global_id
        data.append(row)

        logger.info("Wrote %s to file", pformat(row))

    # Explicit columns keep the header row when no rows were generated
    df = pandas.DataFrame(data, columns=columns)

    filepath = os.path.join(output_dir, "fake_global_descriptors.csv")
    df.to_csv(filepath, index=False)

    logger.info("✅ Completed writing global ID descriptors to %s", filepath)

    return filepath
+++ b/tests/conftest.py @@ -22,6 +22,7 @@ AWS_BUCKET_DATA_TRANSFER_TEST = config["aws"]["s3"]["test_bucket_name"] POSTGRES_DB_IMAGE = "postgres:16-alpine" +ORG_NAME = "Integration Tests d3b-api-client-cli" @pytest.fixture(scope="session") @@ -30,7 +31,7 @@ def organization_file(tmp_path_factory): Write the inputs to create a Dewrangle Organization to file """ - def create_and_write_org(org_name="TestOrg"): + def create_and_write_org(org_name=ORG_NAME): data_dir = tmp_path_factory.mktemp("data") org_filepath = os.path.join(data_dir, "Organization.json") org = { @@ -72,7 +73,7 @@ def dewrangle_org(organization_file): """ Upsert an Organization in Dewrangle for other tests to use """ - fp = organization_file(org_name="Integration Tests") + fp = organization_file() runner = CliRunner() result = runner.invoke(upsert_organization, [fp], standalone_mode=False) assert result.exit_code == 0 diff --git a/tests/integration/dewrangle/test_crud_organization.py b/tests/integration/dewrangle/test_crud_organization.py index 039425d..1a511d0 100644 --- a/tests/integration/dewrangle/test_crud_organization.py +++ b/tests/integration/dewrangle/test_crud_organization.py @@ -10,6 +10,9 @@ from d3b_api_client_cli.utils import read_json, write_json from d3b_api_client_cli.cli import * from d3b_api_client_cli.dewrangle.graphql import organization +from tests.conftest import ORG_NAME + +TEST_ORG_NAME = ORG_NAME + " for orgs" def test_upsert_organization(tmp_path, organization_file): @@ -17,7 +20,7 @@ def test_upsert_organization(tmp_path, organization_file): Test `d3b-clients dewrangle upsert-organization` command """ # Create - fp = organization_file() + fp = organization_file(org_name=TEST_ORG_NAME) organization = read_json(fp) runner = CliRunner() result = runner.invoke(upsert_organization, [fp], standalone_mode=False) @@ -71,7 +74,7 @@ def test_delete_organization_safety_check_on(): runner = CliRunner() result = runner.invoke( delete_organization, - ["--dewrangle-org-name", "TestOrg"], 
+ ["--dewrangle-org-name", TEST_ORG_NAME], standalone_mode=False, ) assert result.exit_code == 1 @@ -81,7 +84,7 @@ def test_delete_organization_safety_check_on(): found_org = None if orgs: for org in orgs: - if org["name"] == "TestOrg": + if org["name"] == TEST_ORG_NAME: found_org = org break assert found_org @@ -98,7 +101,7 @@ def test_delete_organization_safety_check_off(): dwid = None for org in orgs: - if org["name"] == "TestOrg": + if org["name"] == TEST_ORG_NAME: dwid = org["id"] break @@ -117,4 +120,4 @@ def test_delete_organization_safety_check_off(): orgs = organization.read_organizations() if orgs: - assert all([org["name"] != "TestOrg" for org in orgs]) + assert all([org["name"] != TEST_ORG_NAME for org in orgs]) diff --git a/tests/integration/dewrangle/test_global_ids.py b/tests/integration/dewrangle/test_global_ids.py new file mode 100644 index 0000000..22b73ab --- /dev/null +++ b/tests/integration/dewrangle/test_global_ids.py @@ -0,0 +1,185 @@ +""" +Test Dewrangle global ID commands +""" + +import os + +import pytest +from click.testing import CliRunner +import pandas + +from d3b_api_client_cli.cli.dewrangle.global_id_commands import ( + upsert_global_descriptors, + download_global_descriptors, + upsert_and_download_global_descriptors, + upsert_and_download_global_descriptor, +) +from d3b_api_client_cli.dewrangle.global_id import ( + upsert_global_descriptors as _upsert_global_descriptors, +) +from d3b_api_client_cli.faker.global_id import ( + generate_global_id_file, +) + + +@pytest.fixture(scope="session") +def upserted_global_descriptors(dewrangle_study): + """ + Upsert global descriptors + """ + study, fp = dewrangle_study + output_dir = os.path.dirname(fp) + + filepath = generate_global_id_file(output_dir=output_dir) + + runner = CliRunner() + result = runner.invoke( + upsert_global_descriptors, + [filepath, "--study-id", study["id"]], + standalone_mode=False, + ) + assert result.exit_code == 0 + assert result.return_value + + return 
result.return_value, filepath + + +@pytest.fixture(scope="session") +def downloaded_global_descriptors(upserted_global_descriptors): + """ + Download newly created global descriptors + """ + result, filepath = upserted_global_descriptors + output_dir = os.path.dirname(filepath) + study_id = result["study_id"] + job_id = result["job"]["id"] + + runner = CliRunner() + + result = runner.invoke( + download_global_descriptors, + [ + "--study-id", + study_id, + "--job-id", + job_id, + "--output-dir", + output_dir, + ], + standalone_mode=False, + ) + assert result.exit_code == 0 + filepath = result.return_value + + return study_id, filepath + + +def test_upsert_global_descriptors(upserted_global_descriptors): + """ + Test d3b-clients dewrangle upsert-global-descriptors + """ + upserted_global_descriptors + + +def test_download_global_descriptors(downloaded_global_descriptors): + """ + Test d3b-clients dewrangle download-global-descriptors + """ + _, filepath = downloaded_global_descriptors + df = pandas.read_csv(filepath) + assert df.shape[0] == 10 + + +def test_upsert_and_download_global_descriptors(downloaded_global_descriptors): + """ + Test d3b-clients dewrangle upsert-and-download-global-descriptors + """ + study_id, filepath = downloaded_global_descriptors + output_dir = os.path.dirname(filepath) + + # Update the descriptors + df = pandas.read_csv(filepath) + df = df[[c for c in ("fhirResourceType", "descriptor", "globalId")]] + df["descriptor"] = df["descriptor"].apply(lambda d: d + "1") + df.to_csv(filepath, index=False) + + runner = CliRunner() + + # Upsert and download the descriptors + result = runner.invoke( + upsert_and_download_global_descriptors, + [filepath, "--study-id", study_id, "--output-dir", output_dir], + standalone_mode=False, + ) + assert result.exit_code == 0 + filepath = result.return_value + + df = pandas.read_csv(filepath) + assert df.shape[0] == 10 + + +def test_download_all_descriptors(dewrangle_study): + """ + Test d3b-clients dewrangle 
download-global-descriptors for all ids + """ + study, filepath = dewrangle_study + output_dir = os.path.dirname(filepath) + + runner = CliRunner() + result = runner.invoke( + download_global_descriptors, + [ + "--study-id", + study["id"], + "--download-all", + "--output-dir", + output_dir, + ], + standalone_mode=False, + ) + assert result.exit_code == 0 + filepath = result.return_value + + df = pandas.read_csv(filepath) + + # Should have double the descriptors plus one for the study + assert df.shape[0] == 21 + + +def test_one_upsert_and_download_global_descriptor( + downloaded_global_descriptors, +): + """ + Test d3b-clients dewrangle upsert-and-download-global-descriptor + """ + study_id, filepath = downloaded_global_descriptors + output_dir = os.path.dirname(filepath) + + # Get an existing global ID + df = pandas.read_csv(filepath) + row = df.to_dict(orient="records")[0] + + runner = CliRunner() + + # Upsert and download the descriptors + result = runner.invoke( + upsert_and_download_global_descriptor, + [ + "--descriptor", + "foo", + "--fhir-resource-type", + row["fhirResourceType"], + "--global-id", + row["globalId"], + "--study-id", + study_id, + "--output-dir", + output_dir, + ], + standalone_mode=False, + ) + assert result.exit_code == 0 + filepath = result.return_value + + df = pandas.read_csv(filepath) + assert df.shape[0] == 1 diff --git a/tests/unit/dewrangle/test_download.py b/tests/unit/dewrangle/test_download.py index d01c07d..a0e26d5 100644 --- a/tests/unit/dewrangle/test_download.py +++ b/tests/unit/dewrangle/test_download.py @@ -1,5 +1,5 @@ """ -Test downloading volume hash files (error, hash report) from Dewrangle +Test downloading volume hash files (job errors) from Dewrangle """ import os @@ -84,28 +84,7 @@ def test_download_job_errors(mocker): ) -def test_download_hash_report(mocker): - """ - Test download Dewrangle volume hash report - """ - mock_download_file = mocker.patch( - "d3b_api_client_cli.dewrangle.rest.files.download_file" - ) - - 
files.download_hash_report("job-id", output_dir="output") - - endpoint_template = config["dewrangle"]["endpoints"]["rest"]["hash_report"] - endpoint = endpoint_template.format(job_id="job-id") - url = f"{DEWRANGLE_BASE_URL.rstrip('/')}/{endpoint.lstrip('/')}" - - mock_download_file.assert_called_with( - url, output_dir="output", filepath=None - ) - - -@pytest.mark.parametrize( - "download_method", [files.download_job_errors, files.download_hash_report] -) +@pytest.mark.parametrize("download_method", [files.download_job_errors]) @pytest.mark.parametrize( "token,url, expected_msg", [ diff --git a/tests/unit/dewrangle/test_global_ids.py b/tests/unit/dewrangle/test_global_ids.py new file mode 100644 index 0000000..e061f03 --- /dev/null +++ b/tests/unit/dewrangle/test_global_ids.py @@ -0,0 +1,76 @@ +""" +Unit test global ID command +""" + +import pytest +from click.testing import CliRunner + +from d3b_api_client_cli.cli.dewrangle.global_id_commands import ( + upsert_global_descriptors, +) +from d3b_api_client_cli.dewrangle.global_id import ( + upsert_global_descriptors as _upsert_global_descriptors, + download_global_descriptors as _download_global_descriptors, +) + + +def test_upsert_global_descriptors_cli_errors(): + """ + Test d3b-clients dewrangle upsert-global-descriptors errors + """ + runner = CliRunner() + + result = runner.invoke( + upsert_global_descriptors, + ["global_ids.csv"], + standalone_mode=False, + ) + assert result.exit_code == 1 + assert "BadParameter" in str(result.exc_info) + assert "global ID" in str(result.exc_info) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"dewrangle_study_id": None, "study_global_id": "foo"}, + {"dewrangle_study_id": "foo", "study_global_id": None}, + ], +) +def test_upsert_global_descriptors_no_study(mocker, kwargs): + """ + Test d3b-clients dewrangle upsert-global-descriptors when study + is not found + """ + mock_study_api = mocker.patch( + "d3b_api_client_cli.dewrangle.global_id.study_api" + )
mock_study_api.read_study.return_value = {} + mock_study_api.find_study.return_value = {} + + with pytest.raises(ValueError) as e: + _upsert_global_descriptors("global_ids.csv", **kwargs) + assert "does not exist" in str(e) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"dewrangle_study_id": None, "study_global_id": "foo"}, + {"dewrangle_study_id": "foo", "study_global_id": None}, + ], +) +def test_download_global_descriptors_no_study(mocker, kwargs): + """ + Test d3b-clients dewrangle download-global-descriptors when study + is not found + """ + mock_study_api = mocker.patch( + "d3b_api_client_cli.dewrangle.global_id.study_api" + ) + mock_study_api.read_study.return_value = {} + mock_study_api.find_study.return_value = {} + + with pytest.raises(ValueError) as e: + _download_global_descriptors(**kwargs) + assert "does not exist" in str(e) diff --git a/tests/unit/faker/test_fake_global_ids.py b/tests/unit/faker/test_fake_global_ids.py new file mode 100644 index 0000000..e1594f6 --- /dev/null +++ b/tests/unit/faker/test_fake_global_ids.py @@ -0,0 +1,68 @@ +""" +Test generating fake data for global ID commands +""" + +import pytest +from click.testing import CliRunner +import pandas + +from d3b_api_client_cli.cli.faker.global_id_commands import * +from d3b_api_client_cli.faker.global_id import ( + generate_global_id_file as _generate_global_id_file, + DEFAULT_FHIR_RESOURCE_TYPE, +) + + +@pytest.mark.parametrize( + "kwargs,error_msg", [({"fhir_resource_type": "foo"}, "BadParameter")] +) +def test_generate_global_ids_errors(kwargs, error_msg): + """ + Test generate_global_id_file errors + """ + runner = CliRunner() + result = runner.invoke( + generate_global_id_file, + ["--fhir-resource-type", "foo"], + standalone_mode=False, + ) + assert result.exit_code == 1 + assert error_msg in str(result.exc_info) + + +def test_generate_global_ids(tmp_path): + """ + Test generate_global_id_file + """ + temp_dir = tmp_path / "output" + temp_dir.mkdir() + + # With global IDs + 
filepath = _generate_global_id_file( + starting_index=250, + output_dir=temp_dir, + with_global_ids=True, + ) + df = pandas.read_csv(filepath) + + for c in ["fhirResourceType", "descriptor", "globalId"]: + assert c in df.columns + + assert df["fhirResourceType"].eq(DEFAULT_FHIR_RESOURCE_TYPE).all() + assert ( + df["descriptor"].apply(lambda d: int(d.split("-")[-1])).ge(250000).all() + ) + + # Without global IDs + filepath = _generate_global_id_file( + output_dir=temp_dir, with_global_ids=False + ) + df = pandas.read_csv(filepath) + assert "globalId" not in df.columns + assert df["descriptor"].apply(lambda d: int(d.split("-")[-1])).ge(0).all() + assert ( + df["descriptor"] + .apply(lambda d: int(d.split("-")[-1])) + .le(9000000000) + .all() + )