diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 32e18abe..b715e83e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,13 +2,12 @@ image: python:3.6 stages: - dcp_wide_test - - 1000_cell_test - - 100_cell_test before_script: - apt-get -y update - apt-get -y install jq - pip install -r requirements.txt + - export CI_COMMIT_REF_NAME=integration - export DEPLOYMENT_ENV=$CI_COMMIT_REF_NAME - export SWAGGER_URL="https://dss.$DEPLOYMENT_ENV.data.humancellatlas.org/v1/swagger.json" - mkdir -p ~/.config/hca @@ -42,3 +41,10 @@ dcp_wide_test_optimus: - staging script: - python -m unittest tests.integration.test_end_to_end_dcp.TestOptimusRun.test_optimus_run + +dcp_wide_ss2_scale_test_4000: + stage: dcp_wide_test + only: + - scale-out-spreadsheet-and-files + script: + CI_COMMIT_REF_NAME=integration DEPLOYMENT_STAGE=integration python -m unittest tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles diff --git a/requirements.txt b/requirements.txt index bcc32575..92d7e704 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ iso8601 requests urllib3 hca>=4.9.0 -openpyxl +openpyxl==2.3.5 awscli hca-ingest cromwell-tools>=1.1.2 diff --git a/tests/dataset_fixture.py b/tests/dataset_fixture.py index c074ab2f..e7606839 100644 --- a/tests/dataset_fixture.py +++ b/tests/dataset_fixture.py @@ -2,8 +2,10 @@ import os import glob +import boto3 import openpyxl import requests +from hca.util.pool import ThreadPool class DatasetFixture: @@ -35,6 +37,94 @@ def __init__(self, dataset_name, deployment): self.config = json.load(json_data) self.config["spreadsheet_location"] = self.config["spreadsheet_location"].replace("DEPLOYMENT", self.deployment) self._download_spreadsheet() + if self.config.get("generate_scaled_spreadsheet"): + self._generate_scaled_spreadsheet_and_data_files() + + def _generate_scaled_spreadsheet_and_data_files(self): + self._scale_spreadsheet_cell_suspensions() + self._scale_sequence_files() + + def 
_scale_spreadsheet_cell_suspensions(self): + cell_suspension_tab = self.spreadsheet['Cell suspension'] + row_to_copy = self._fetch_row_with_headers(cell_suspension_tab, 6) + num_rows_to_copy = self.config["expected_bundle_count"] + for row_idx in range(2, num_rows_to_copy + 1): + new_row = row_to_copy.copy() + new_row["cell_suspension.biomaterial_core.biomaterial_id"] = row_to_copy["cell_suspension.biomaterial_core.biomaterial_id"].replace("1", str(row_idx)) + new_row["cell_suspension.plate_based_sequencing.plate_label"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_label"] + 1 + new_row["cell_suspension.plate_based_sequencing.well_id"] = f"A{row_idx}" + cell_suspension_tab.append(list(new_row.values())) + self.spreadsheet.save(filename=self.metadata_spreadsheet_path) + + def _scale_sequence_files(self): + sequence_file_tab = self.spreadsheet['Sequence file'] + first_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 6) + second_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 7) + num_rows_to_copy = self.config["expected_bundle_count"] + orig_filename_1 = first_file_row_to_copy['sequence_file.file_core.file_name'] + orig_filename_2 = second_file_row_to_copy['sequence_file.file_core.file_name'] + pool = ThreadPool() + pool.add_task(self._copy_sequence_file, orig_filename_1, orig_filename_1) + pool.add_task(self._copy_sequence_file, orig_filename_2, orig_filename_2) + for row_idx in range(2, num_rows_to_copy + 1): + new_first_file_row = first_file_row_to_copy.copy() + new_second_file_row = second_file_row_to_copy.copy() + new_filename_1 = f"{row_idx}_{orig_filename_1}" + new_filename_2 = f"{row_idx}_{orig_filename_2}" + new_first_file_row["sequence_file.file_core.file_name"] = new_filename_1 + new_second_file_row["sequence_file.file_core.file_name"] = new_filename_2 + new_first_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = 
first_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx)) + new_second_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = second_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx)) + new_first_file_row["process.process_core.process_id"] = row_idx + new_second_file_row["process.process_core.process_id"] = row_idx + sequence_file_tab.append(list(new_first_file_row.values())) + sequence_file_tab.append(list(new_second_file_row.values())) + pool.add_task(self._copy_sequence_file, orig_filename_1, new_filename_1) + pool.add_task(self._copy_sequence_file, orig_filename_2, new_filename_2) + pool.wait_for_completion() + self.spreadsheet.save(filename=self.metadata_spreadsheet_path) + + def _copy_sequence_file(self, source_file_name, target_file_name): + s3_client = boto3.client('s3') + source_s3_prefix = self.config["orig_copy_files_location"] + source_s3_path = f"{source_s3_prefix}{source_file_name}" + s3_path_split = source_s3_path.replace("s3://", "").split("/", 1) + source_bucket = s3_path_split[0] + source_key = s3_path_split[1] + + target_s3_prefix = self.config["data_files_location"] + target_s3_path = f"{target_s3_prefix}{target_file_name}" + s3_path_split = target_s3_path.replace("s3://", "").split("/", 1) + target_bucket = s3_path_split[0] + target_key = s3_path_split[1] + + copy_source = { + 'Bucket': source_bucket, + 'Key': source_key + } + upload_args = { + 'CopySource': copy_source, + 'Bucket': target_bucket, + 'Key': target_key + } + print(f"copying {source_s3_path} to {target_s3_path}") + s3_client.copy(**upload_args) + + def _fetch_row_with_headers(self, worksheet, row_idx): + row = {} + headers = self._fetch_row_values(worksheet, "A4:AG4") + value_idxs = f"A{row_idx}:AG{row_idx}" + values = self._fetch_row_values(worksheet, value_idxs) + for idx, val in enumerate(headers): + row[val] = values[idx] + return row + + def _fetch_row_values(self, ws, n): + values = 
[] + for row in ws.iter_rows(n): + for cell in row: + values.append(cell.value) + return values def _download_spreadsheet(self): response = requests.get(self.config["spreadsheet_location"]) @@ -63,7 +153,11 @@ def count_of_rows_in_spreadsheet_tab(self, tab_name, header_rows=5): ws = self.spreadsheet[tab_name] rows_with_content = 0 row = header_rows + 1 - while ws.cell(row=row, column=1).value: - rows_with_content += 1 + extra_rows_to_check = 10 + while extra_rows_to_check > 0: + if ws.cell(row=row, column=1).value: + rows_with_content += 1 + else: + extra_rows_to_check -= 1 row += 1 return rows_with_content diff --git a/tests/fixtures/.DS_Store b/tests/fixtures/.DS_Store index bba90c17..5008ddfc 100644 Binary files a/tests/fixtures/.DS_Store and b/tests/fixtures/.DS_Store differ diff --git a/tests/fixtures/datasets/gliob_100/README.json b/tests/fixtures/datasets/gliob_100/README.json deleted file mode 100644 index 584be5a4..00000000 --- a/tests/fixtures/datasets/gliob_100/README.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_100/", - "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_100_glioblastoma_cells.xlsx", - "expected_bundle_count": 100 -} diff --git a/tests/fixtures/datasets/gliob_1000/README.json b/tests/fixtures/datasets/gliob_1000/README.json deleted file mode 100644 index 94b621aa..00000000 --- a/tests/fixtures/datasets/gliob_1000/README.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_1000/", - "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_1000_glioblastoma_cells.xlsx", - "expected_bundle_count": 1000 -} diff --git a/tests/fixtures/datasets/gliob_200/README.json b/tests/fixtures/datasets/gliob_200/README.json deleted file mode 100644 
index 46c55b17..00000000 --- a/tests/fixtures/datasets/gliob_200/README.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_200/", - "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_200_glioblastoma_cells.xlsx", - "expected_bundle_count": 200 -} diff --git a/tests/fixtures/datasets/gliob_400/README.json b/tests/fixtures/datasets/gliob_400/README.json deleted file mode 100644 index 7757875f..00000000 --- a/tests/fixtures/datasets/gliob_400/README.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_400/", - "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_400_glioblastoma_cells.xlsx", - "expected_bundle_count": 400 -} diff --git a/tests/fixtures/datasets/ss2_100/README.json b/tests/fixtures/datasets/ss2_100/README.json new file mode 100644 index 00000000..41cb519f --- /dev/null +++ b/tests/fixtures/datasets/ss2_100/README.json @@ -0,0 +1,7 @@ +{ + "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-100/", + "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx", + "expected_bundle_count": 100, + "generate_scaled_spreadsheet": true, + "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/" +} diff --git a/tests/fixtures/datasets/ss2_1000/README.json b/tests/fixtures/datasets/ss2_1000/README.json new file mode 100644 index 00000000..126e50ab --- /dev/null +++ b/tests/fixtures/datasets/ss2_1000/README.json @@ -0,0 +1,7 @@ +{ + "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-1000/", + "spreadsheet_location": 
"https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx", + "expected_bundle_count": 1000, + "generate_scaled_spreadsheet": true, + "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/" +} diff --git a/tests/fixtures/datasets/ss2_2000/README.json b/tests/fixtures/datasets/ss2_2000/README.json new file mode 100644 index 00000000..cb2f8f1c --- /dev/null +++ b/tests/fixtures/datasets/ss2_2000/README.json @@ -0,0 +1,7 @@ +{ + "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-2000/", + "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx", + "expected_bundle_count": 2000, + "generate_scaled_spreadsheet": true, + "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/" +} diff --git a/tests/fixtures/datasets/ss2_4000/README.json b/tests/fixtures/datasets/ss2_4000/README.json new file mode 100644 index 00000000..6d7fc755 --- /dev/null +++ b/tests/fixtures/datasets/ss2_4000/README.json @@ -0,0 +1,7 @@ +{ + "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-4000/", + "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx", + "expected_bundle_count": 4000, + "generate_scaled_spreadsheet": true, + "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/" +} diff --git a/tests/scale/test_big_bundles.py b/tests/scale/test_big_bundles.py index c18f88ae..4bdef0ae 100644 --- a/tests/scale/test_big_bundles.py +++ b/tests/scale/test_big_bundles.py @@ -8,19 +8,19 @@ class TestBigBundles(unittest.TestCase): def test_one_submission_with_100_bundles(self): - self._run(fixture_name='gliob_100') + 
self._run(fixture_name='ss2_100') - def test_one_submission_with_200_bundles(self): - self._run(fixture_name='gliob_200') + def test_one_submission_with_1000_bundles(self): + self._run(fixture_name='ss2_1000') - def test_one_submission_with_400_bundles(self): - self._run(fixture_name='gliob_400') + def test_one_submission_with_2000_bundles(self): + self._run(fixture_name='ss2_2000') - def test_one_submission_with_1000_bundles(self): - self._run(fixture_name='gliob_1000') + def test_one_submission_with_4000_bundles(self): + self._run(fixture_name='ss2_4000') def test_ingest_and_upload_only_1000_bundle_submissions(self): - self._run(fixture_name='gliob_1000', export_bundles=False) + self._run(fixture_name='ss2_1000', export_bundles=False) def _run(self, fixture_name, export_bundles=True): print("")