diff --git a/docs/BACKUP.md b/docs/BACKUP.md index a878649..4421394 100644 --- a/docs/BACKUP.md +++ b/docs/BACKUP.md @@ -69,7 +69,9 @@ python scripts/backup.py input.csv conf.yaml -p path/to/profiles.yaml --profile ## Configuration file (conf) The configuration files let you define which type of storage the export tool will save the backups to, and any additional storage-specific information that might be required. Currently AWS S3 and Local storage are supported. -If you run the script with `list-of-parents` or `entire-organization`, the script will fetch the IDs of workspaces to process (either hierarchies under the specified parents or all the workspaces within the organization) in batches. As a default, the batch size is set to `100`, but you can parametrize it by setting the `api_page_size` parametter in your configuration yaml. +If you run the script with `list-of-parents` or `entire-organization`, the script will fetch the IDs of workspaces to process (either hierarchies under the specified parents or all the workspaces within the organization) in batches. As a default, the batch size is set to `100`, but you can parametrize it by setting the `api_page_size` parameter in your configuration yaml. + +The `batch_size` is an optional parameter which accepts an integer value and determines how many workspaces will be processed before saving the backups to the selected storage. As a default, the batch size is set to `100`. If you want to set a different batch size, you can specify it in the configuration yaml. 
The configuration file has the following format: ```yaml @@ -78,8 +80,10 @@ storage: arg1: foo arg2: bar api_page_size: 1000 +batch_size: 20 ``` + ### AWS S3 You can define the configuration file for S3 storage like so: @@ -96,7 +100,7 @@ Here, the meaning of different `storage` fields is as follows: - backup_path - absolute path within the S3 bucket which leads to the root directory where the backups should be saved - profile (optional) - AWS profile to be used -## Local Storage +### Local Storage ```yaml storage_type: local diff --git a/scripts/backup.py b/scripts/backup.py index 06b8ea5..06b8aea 100644 --- a/scripts/backup.py +++ b/scripts/backup.py @@ -1,57 +1,52 @@ # (C) 2025 GoodData Corporation import abc import argparse -import datetime import json import logging import os import shutil import tempfile +import time +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any, Type import boto3 # type: ignore[import] import requests import yaml -from gooddata_api_client.exceptions import NotFoundException -from gooddata_sdk import __version__ as sdk_version # type: ignore[import] -from gooddata_sdk.sdk import GoodDataSdk # type: ignore[import] +from gooddata_sdk.sdk import GoodDataSdk from utils.backup_utils.input_loader import InputLoader # type: ignore[import] +from utils.constants import ( # type: ignore[import] + BackupSettings, + DirNames, + GoodDataProfile, +) from utils.gd_api import ( # type: ignore[import] - BEARER_TKN_PREFIX, GDApi, GoodDataRestApiError, ) from utils.logger import setup_logging # type: ignore[import] +from utils.models.batch import BackupBatch, Size # type: ignore[import] setup_logging() -logger = logging.getLogger("backup") - -TIMESTAMP_SDK_FOLDER = ( - str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) - + "-" - + sdk_version.replace(".", "_") -) - -PROFILES_FILE = "profiles.yaml" -PROFILES_DIRECTORY = ".gooddata" -PROFILES_FILE_PATH = Path.home() / PROFILES_DIRECTORY / PROFILES_FILE - - 
-LAYOUTS_DIR = "gooddata_layouts" -LDM_DIR = "ldm" - -API_PAGE_SIZE = 100 +module_name = __file__.split(os.sep)[-1] +logger = logging.getLogger(module_name) # TODO: consider moving storage related logic to a separate module and reuse it in restore class BackupRestoreConfig: def __init__(self, conf_path: str): with open(conf_path, "r") as stream: - conf = yaml.safe_load(stream) - self.storage_type = conf["storage_type"] - self.storage = conf["storage"] - self.api_page_size = conf.get("api_page_size", API_PAGE_SIZE) + conf: dict = yaml.safe_load(stream) + + self.storage_type: str = conf["storage_type"] + self.storage: dict[str, str] = conf["storage"] + + page_size = conf.get("api_page_size", BackupSettings.DEFAULT_PAGE_SIZE) + self.api_page_size: Size = Size(size=page_size) + + batch_size = conf.get("batch_size", BackupSettings.DEFAULT_BATCH_SIZE) + self.batch_size: Size = Size(size=batch_size) class BackupStorage(abc.ABC): @@ -69,11 +64,13 @@ def __init__(self, conf: BackupRestoreConfig): self._config = conf.storage self._profile = self._config.get("profile", "default") self._session = self._create_boto_session(self._profile) - self._api = self._session.resource("s3") - self._bucket = self._api.Bucket(self._config["bucket"]) # type: ignore [missing library stubs] + self._resource = self._session.resource("s3") + self._bucket = self._resource.Bucket(self._config["bucket"]) # type: ignore [missing library stubs] suffix = "/" if not self._config["backup_path"].endswith("/") else "" self._backup_path = self._config["backup_path"] + suffix + self._verify_connection() + @staticmethod def _create_boto_session(profile: str) -> boto3.Session: try: @@ -85,6 +82,17 @@ def _create_boto_session(profile: str) -> boto3.Session: return boto3.Session() + def _verify_connection(self) -> None: + """ + Pings the S3 bucket to verify that the connection is working. 
+ """ + try: + self._resource.meta.client.head_bucket(Bucket=self._config["bucket"]) + except Exception as e: + raise RuntimeError( + f"Failed to connect to S3 bucket {self._config['bucket']}: {e}" + ) + def export(self, folder, org_id) -> None: """Uploads the content of the folder to S3 as backup.""" storage_path = self._config["bucket"] + "/" + self._backup_path @@ -149,9 +157,9 @@ def create_parser() -> argparse.ArgumentParser: "-p", "--profile-config", type=Path, - default=PROFILES_FILE_PATH, + default=GoodDataProfile.PROFILE_PATH, help="Optional path to GoodData profile config. " - f'If no path is provided, "{PROFILES_FILE_PATH}" is used.', + f'If no path is provided, "{GoodDataProfile.PROFILE_PATH}" is used.', ) parser.add_argument( "--profile", @@ -239,7 +247,7 @@ def get_automations_from_api(api: GDApi, ws_id: str) -> Any: response: requests.Response = requests.get( f"{api.endpoint}/entities/workspaces/{ws_id}/automations?include=ALL", headers={ - "Authorization": f"{BEARER_TKN_PREFIX} {api.api_token}", + "Authorization": f"Bearer {api.api_token}", "Content-Type": "application/vnd.gooddata.api+json", }, ) @@ -301,7 +309,9 @@ def get_workspace_export( """ exported = False for ws_id in workspaces_to_export: - export_path = Path(local_target_path, org_id, ws_id, TIMESTAMP_SDK_FOLDER) + export_path = Path( + local_target_path, org_id, ws_id, BackupSettings.TIMESTAMP_SDK_FOLDER + ) user_data_filters = get_user_data_filters(api, ws_id) if not user_data_filters: @@ -316,8 +326,8 @@ def get_workspace_export( store_user_data_filters(user_data_filters, export_path, org_id, ws_id) logger.info(f"Stored export for {ws_id}") exported = True - except NotFoundException: - logger.error(f"Workspace {ws_id} does not exist. Skipping.") + except Exception as e: + logger.error(f"Skipping {ws_id}. 
Error encountered: {e}") if not exported: raise RuntimeError( @@ -329,9 +339,9 @@ def archive_gooddata_layouts_to_zip(folder: str) -> None: """Archives the gooddata_layouts directory to a zip file.""" target_subdir = "" for subdir, dirs, files in os.walk(folder): - if LAYOUTS_DIR in dirs: + if DirNames.LAYOUTS in dirs: target_subdir = os.path.join(subdir, dirs[0]) - if LDM_DIR in dirs: + if DirNames.LDM in dirs: inner_layouts_dir = subdir + "/gooddata_layouts" os.mkdir(inner_layouts_dir) for dir in dirs: @@ -382,6 +392,75 @@ def validate_args(args: argparse.Namespace) -> None: ) +def split_to_batches( + workspaces_to_export: list[str], batch_size: Size +) -> list[BackupBatch]: + """Splits the list of workspaces into batches of the specified size. + Each batch is represented as a BackupBatch holding a list of workspace IDs. + Returns a list of batches (i.e. a list of BackupBatch objects) + """ + list_of_batches = [] + while workspaces_to_export: + batch = BackupBatch(workspaces_to_export[: batch_size.size]) + workspaces_to_export = workspaces_to_export[batch_size.size :] + list_of_batches.append(batch) + + return list_of_batches + + +def process_batch( + sdk: GoodDataSdk, + api: GDApi, + org_id: str, + storage: BackupStorage, + batch: BackupBatch, + retry_count: int = 0, +) -> None: + """Processes a single batch of workspaces for backup. + If the batch processing fails, the function will wait + and retry with exponential backoff up to BackupSettings.MAX_RETRIES. + The base wait time is defined by BackupSettings.RETRY_DELAY. + """ + try: + with tempfile.TemporaryDirectory() as tmpdir: + get_workspace_export(sdk, api, tmpdir, org_id, batch.list_of_ids) + + archive_gooddata_layouts_to_zip(str(Path(tmpdir, org_id))) + + storage.export(tmpdir, org_id) + + except Exception as e: + # Retry with exponential backoff until MAX_RETRIES, then raise the error + if retry_count < BackupSettings.MAX_RETRIES: + next_retry = retry_count + 1 + logger.info( + f"Unexpected error while processing a batch. 
Retrying {next_retry}/{BackupSettings.MAX_RETRIES}..." + ) + time.sleep(BackupSettings.RETRY_DELAY**next_retry) + process_batch(sdk, api, org_id, storage, batch, next_retry) + else: + logger.error(f"Error processing batch: {e}") + raise e + + +def process_batches_in_parallel( + sdk: GoodDataSdk, + api: GDApi, + org_id: str, + storage: BackupStorage, + batches: list[BackupBatch], +) -> None: + with ThreadPoolExecutor(max_workers=BackupSettings.MAX_WORKERS) as executor: + futures = [] + for batch in batches: + futures.append( + executor.submit(process_batch, sdk, api, org_id, storage, batch) + ) + + for future in futures: + future.result() + + def main(args: argparse.Namespace) -> None: """Main function for the backup script.""" sdk, api = create_client(args) @@ -393,24 +472,18 @@ def main(args: argparse.Namespace) -> None: storage_class: Type[BackupStorage] = get_storage(conf.storage_type) storage: BackupStorage = storage_class(conf) - # TODO: if storage set to S3, check that valid connection can be established - # currently the script would gather the exports and only then fail to upload them - loader = InputLoader(api, conf.api_page_size) workspaces_to_export: list[str] = loader.get_ids_to_backup( args.input_type, args.ws_csv ) - if not workspaces_to_export: - logger.error("No workspaces to export. Check the input file or the input type.") - return + batches = split_to_batches(workspaces_to_export, conf.batch_size) - with tempfile.TemporaryDirectory() as tmpdir: - get_workspace_export(sdk, api, tmpdir, org_id, workspaces_to_export) - - archive_gooddata_layouts_to_zip(str(Path(tmpdir, org_id))) + logger.info( + f"Exporting {len(workspaces_to_export)} workspaces in {len(batches)} batches." 
+ ) - storage.export(tmpdir, org_id) + process_batches_in_parallel(sdk, api, org_id, storage, batches) if __name__ == "__main__": @@ -421,6 +494,6 @@ def main(args: argparse.Namespace) -> None: validate_args(args) main(args) - logger.info("Backup completed.") + logger.info("Backup completed!") except Exception as e: logger.error(f"Backup failed: {e}") diff --git a/scripts/restore.py b/scripts/restore.py index 9914f59..f1e774c 100644 --- a/scripts/restore.py +++ b/scripts/restore.py @@ -5,7 +5,6 @@ import json import logging import os -import sys import tempfile import traceback import zipfile @@ -15,30 +14,27 @@ import boto3 import requests import yaml -from gooddata_sdk import ( +from gooddata_sdk.catalog.workspace.declarative_model.workspace.analytics_model.analytics_model import ( CatalogDeclarativeAnalytics, +) +from gooddata_sdk.catalog.workspace.declarative_model.workspace.automation import ( CatalogDeclarativeAutomation, - CatalogDeclarativeFilterView, +) +from gooddata_sdk.catalog.workspace.declarative_model.workspace.logical_model.ldm import ( CatalogDeclarativeModel, - GoodDataSdk, ) +from gooddata_sdk.catalog.workspace.declarative_model.workspace.workspace import ( + CatalogDeclarativeFilterView, +) +from gooddata_sdk.sdk import GoodDataSdk +from utils.constants import DirNames, GoodDataProfile # type: ignore[import] +from utils.logger import setup_logging # type: ignore[import] BEARER_TKN_PREFIX = "Bearer" -LAYOUTS_DIR = "gooddata_layouts" -AM_DIR = "analytics_model" -LDM_DIR = "ldm" -UDF_DIR = "user_data_filters" - -PROFILES_FILE = "profiles.yaml" -PROFILES_DIRECTORY = ".gooddata" -PROFILES_FILE_PATH = Path.home() / PROFILES_DIRECTORY / PROFILES_FILE -LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s" - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler(sys.stdout) -handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT)) -logger.addHandler(handler) -logger.setLevel(logging.INFO) + +setup_logging() +module_name = 
__file__.split(os.sep)[-1] +logger = logging.getLogger(module_name) GDWorkspace: TypeAlias = tuple[CatalogDeclarativeModel, CatalogDeclarativeAnalytics] @@ -72,6 +68,9 @@ class BackupStorage(abc.ABC): Implement this abstract base class for different kinds of storage providers. """ + def __init__(self, conf: BackupRestoreConfig): + return + @abc.abstractmethod def get_ws_declaration(self, target_path: str, local_target_path: Path) -> None: raise NotImplementedError @@ -120,10 +119,10 @@ def _validate_backup_path(self) -> None: if len(objects) == 0: raise RuntimeError("Provided s3 backup_path does not exist. Exiting...") - def get_ws_declaration(self, s3_target_path: str, local_target_path: Path) -> None: + def get_ws_declaration(self, target_path: str, local_target_path: Path) -> None: """Retrieves workspace declaration from S3 bucket.""" s3_backup_path = self._config.backup_path - target_s3_prefix = f"{s3_backup_path}{s3_target_path}" + target_s3_prefix = f"{s3_backup_path}{target_path}" objs_found = list(self._bucket.objects.filter(Prefix=target_s3_prefix)) @@ -141,7 +140,7 @@ def get_ws_declaration(self, s3_target_path: str, local_target_path: Path) -> No ) s3_obj = objs_found[0] - self._bucket.download_file(s3_obj.key, local_target_path) + self._bucket.download_file(s3_obj.key, str(local_target_path)) MaybeResponse: TypeAlias = Optional[requests.Response] @@ -221,9 +220,9 @@ def create_parser() -> argparse.ArgumentParser: "-p", "--profile-config", type=Path, - default=PROFILES_FILE_PATH, + default=GoodDataProfile.PROFILE_PATH, help="Optional path to GoodData profile config. 
" - f'If no path is provided, "{PROFILES_FILE_PATH}" is used.', + f'If no path is provided, "{GoodDataProfile.PROFILE_PATH}" is used.', ) parser.add_argument( "--profile", @@ -337,7 +336,7 @@ def _load_workspace_layout(self, src_path: Path) -> GDWorkspace: def _convert_udf_files_to_api_body(src_path: Path) -> dict: """Converts UDF files to API body.""" user_data_filters: dict = {"userDataFilters": []} - user_data_filters_folder = os.path.join(src_path, UDF_DIR) + user_data_filters_folder = os.path.join(src_path, DirNames.UDF) for filename in os.listdir(user_data_filters_folder): f = os.path.join(user_data_filters_folder, filename) with open(f, "r") as file: @@ -462,9 +461,9 @@ def _check_workspace_is_valid(src_path: Path) -> None: raise BackupRestoreError("Invalid source path upon load.") children = list(src_path.iterdir()) - am_path = src_path / AM_DIR - ldm_path = src_path / LDM_DIR - udf_path = src_path / UDF_DIR + am_path = src_path / DirNames.AM + ldm_path = src_path / DirNames.LDM + udf_path = src_path / DirNames.UDF if ( am_path not in children @@ -474,7 +473,7 @@ def _check_workspace_is_valid(src_path: Path) -> None: logger.error( "LDM or AM directory missing in the workspace hierarchy. " "Check if gooddata_layouts contains " - f"{AM_DIR}, {LDM_DIR} and {UDF_DIR} directories." + f"{DirNames.AM}, {DirNames.LDM} and {DirNames.UDF} directories." 
) raise BackupRestoreError("LDM or AM directory missing.") @@ -505,8 +504,8 @@ def _restore_backup(self, ws_id: str, tempdir: str) -> None: """Restores the backup of a workspace.""" ws_path = self._ws_paths[ws_id] tempdir_path = Path(tempdir) - zip_target = tempdir_path / f"{LAYOUTS_DIR}.zip" - src_path = tempdir_path / LAYOUTS_DIR + zip_target = tempdir_path / f"{DirNames.LAYOUTS}.zip" + src_path = tempdir_path / DirNames.LAYOUTS try: self._get_ws_declaration(ws_path, zip_target) @@ -591,7 +590,8 @@ def main(args): conf = BackupRestoreConfig(args.conf) - storage = get_storage(conf.storage_type)(conf) + cls_storage: type[BackupStorage] = get_storage(conf.storage_type) + storage = cls_storage(conf) ws_paths = read_targets_from_csv(args.ws_csv) validate_targets(sdk, ws_paths) diff --git a/scripts/utils/backup_utils/input_loader.py b/scripts/utils/backup_utils/input_loader.py index 393fc5b..90a12d9 100644 --- a/scripts/utils/backup_utils/input_loader.py +++ b/scripts/utils/backup_utils/input_loader.py @@ -10,6 +10,7 @@ GoodDataRestApiError, MaybeResponse, ) +from utils.models.batch import Size # type: ignore[import] from utils.models.workspace_response import ( # type: ignore[import] Workspace, WorkspaceResponse, @@ -26,9 +27,9 @@ class InputLoader: hierarchy_endpoint: str all_workspaces_endpoint: str - def __init__(self, api_client: GDApi, page_size: int) -> None: + def __init__(self, api_client: GDApi, page_size: Size) -> None: self.api_client = api_client - self.page_size = page_size + self.page_size = page_size.size self.set_endpoints() def set_endpoints(self) -> None: diff --git a/scripts/utils/constants.py b/scripts/utils/constants.py new file mode 100644 index 0000000..0dc385c --- /dev/null +++ b/scripts/utils/constants.py @@ -0,0 +1,54 @@ +import datetime +from dataclasses import dataclass +from pathlib import Path + +from gooddata_sdk._version import __version__ as sdk_version + + +@dataclass(frozen=True) +class GoodDataProfile: + """ + Default path to the 
GoodData profile file. + """ + + FILE_NAME = "profiles.yaml" + DIRECTORY = ".gooddata" + PROFILE_PATH = Path.home() / DIRECTORY / FILE_NAME + + +@dataclass(frozen=True) +class DirNames: + """ + Folder names used in the SDK backup process: + - LAYOUTS - GoodData Layouts + - LDM - Logical Data Model + - AM - Analytics Model + - UDF - User Data Filters + """ + + LAYOUTS = "gooddata_layouts" + LDM = "ldm" + AM = "analytics_model" + UDF = "user_data_filters" + + +@dataclass(frozen=True) +class ConcurrencyDefaults: + MAX_WORKERS = 2 + DEFAULT_BATCH_SIZE = 100 + + +@dataclass(frozen=True) +class ApiDefaults: + DEFAULT_PAGE_SIZE = 100 + + +@dataclass(frozen=True) +class BackupSettings(ConcurrencyDefaults, ApiDefaults): + MAX_RETRIES = 3 + RETRY_DELAY = 5 # seconds + TIMESTAMP_SDK_FOLDER = ( + str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) + + "-" + + sdk_version.replace(".", "_") + ) diff --git a/scripts/utils/gd_api.py b/scripts/utils/gd_api.py index 7be9898..393a1fa 100644 --- a/scripts/utils/gd_api.py +++ b/scripts/utils/gd_api.py @@ -9,7 +9,6 @@ logger = logging.getLogger(__name__) API_VERSION = "v1" -BEARER_TKN_PREFIX = "Bearer" MaybeResponse: TypeAlias = requests.Response | None @@ -61,7 +60,7 @@ def _prepare_request(self, path: str, params=None) -> dict[str, Any]: if params: kwargs["params"] = params if self.api_token: - kwargs["headers"]["Authorization"] = f"{BEARER_TKN_PREFIX} {self.api_token}" + kwargs["headers"]["Authorization"] = f"Bearer {self.api_token}" else: raise RuntimeError( "Token required for authentication against GD API is missing." 
diff --git a/scripts/utils/logger.py b/scripts/utils/logger.py index fe62e12..0e670fc 100644 --- a/scripts/utils/logger.py +++ b/scripts/utils/logger.py @@ -18,21 +18,38 @@ def format(self, record): class LogHandler(logging.Handler): - def __init__(self, script_name: str) -> None: + def __init__(self) -> None: super().__init__() - self.script_name: str = os.path.splitext(os.path.basename(script_name))[0] - self.file_handler: logging.FileHandler | None = None + self.script_name: str = self.get_top_level_script() + self.modules: list[str] = self.get_module_names() + self.file_handler: logging.FileHandler | None = None self.stream_handler: logging.StreamHandler = logging.StreamHandler() self.stream_handler.setFormatter(LevelFormatter()) + @staticmethod + def get_module_names() -> list[str]: + """Returns a list of module names in the scripts directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + scripts_dir = current_dir.split("scripts")[0] + "scripts" + modules = os.listdir(scripts_dir) + return modules + + @staticmethod + def get_top_level_script() -> str: + """Returns the name of the top-level script - i.e., the script that was executed.""" + if hasattr(sys, "argv") and sys.argv and sys.argv[0]: + return sys.argv[0].split(os.sep)[-1] + return "__main__" + def emit(self, record: logging.LogRecord) -> None: # Top level script name record.script = self.script_name self.stream_handler.emit(record) # Save Warnings and Errors to a file - if record.levelno >= logging.WARNING: + # Only if the script name is in the modules list (we don't need to log pytest errors etc.) 
+ if record.levelno >= logging.WARNING and self.script_name in self.modules: if self.file_handler is None: date_str = datetime.now().strftime("%Y-%m-%d") log_filename = f"{self.script_name}_{date_str}.log" @@ -41,16 +58,14 @@ def emit(self, record: logging.LogRecord) -> None: self.file_handler.emit(record) -def get_top_level_script() -> str: - """Returns the name of the top-level script.""" - if hasattr(sys, "argv") and sys.argv and sys.argv[0]: - return sys.argv[0] - return "__main__" - - def setup_logging() -> None: - """Sets up logging configuration for the root logger.""" + """ + Sets up logging configuration for the root logger. + Terminal logs will be formatted with colors based on the log level. + Warnings and errors will also be saved to a file named + `<script_name>_<YYYY-MM-DD>.log` in the current working directory. + """ root_logger = logging.getLogger() root_logger.setLevel(logging.INFO) root_logger.handlers.clear() - root_logger.addHandler(LogHandler(get_top_level_script())) + root_logger.addHandler(LogHandler()) diff --git a/scripts/utils/models/batch.py b/scripts/utils/models/batch.py new file mode 100644 index 0000000..9bca78e --- /dev/null +++ b/scripts/utils/models/batch.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass +from typing import Annotated + +from pydantic import BaseModel, Field # type: ignore[import] + + +@dataclass +class BackupBatch: + list_of_ids: list[str] + + +class Size(BaseModel): + """Model to ensure valid batch or page size, i.e., integer greater than 0.""" + + size: Annotated[int, Field(gt=0, description="Batch size must be greater than 0")] diff --git a/tests/test_backup.py b/tests/test_backup.py index 43b7b48..e78d3cb 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -20,8 +20,8 @@ from moto import mock_s3 import scripts.backup as backup +from scripts.utils.models.batch import Size -LOGGER_NAME = "scripts.backup" MOCK_DL_TARGET = Path("overlays.zip") TEST_CONF_PATH = "tests/data/backup/test_conf.yaml" TEST_LOCAL_CONF_PATH = 
"tests/data/backup/test_local_conf.yaml" @@ -347,3 +347,91 @@ def test_file_upload(s3, s3_bucket): S3_BUCKET, "some/s3/backup/path/org_id/services/wsid2/20230713-132759-1_3_1_dev5/gooddata_layouts/services/workspaces/wsid2/analytics_model/filter_contexts/id.yaml", ).load() + + +def test_split_to_batches(): + workspaces = ["ws1", "ws2", "ws3", "ws4", "ws5"] + batch_size = Size(size=2) + expected_batches = [ + backup.BackupBatch(["ws1", "ws2"]), + backup.BackupBatch(["ws3", "ws4"]), + backup.BackupBatch(["ws5"]), + ] + + result = backup.split_to_batches(workspaces, batch_size) + + for i, batch in enumerate(result): + assert isinstance(batch, backup.BackupBatch) + assert batch.list_of_ids == expected_batches[i].list_of_ids + + +@mock.patch("scripts.backup.archive_gooddata_layouts_to_zip") +@mock.patch("scripts.backup.get_workspace_export") +def test_process_batch_success(get_workspace_export_mock, archive_zip_mock): + sdk = mock.Mock() + api = mock.Mock() + org_id = "org" + storage = mock.Mock() + batch = backup.BackupBatch(["ws1", "ws2"]) + + backup.process_batch(sdk, api, org_id, storage, batch) + + get_workspace_export_mock.assert_called_once() + archive_zip_mock.assert_called_once() + storage.export.assert_called_once() + + +@mock.patch("scripts.backup.logger") +@mock.patch("scripts.backup.archive_gooddata_layouts_to_zip") +@mock.patch("scripts.backup.get_workspace_export") +def test_process_batch_retries_on_exception( + get_workspace_export_mock, _archive_zip_mock, logger_mock +): + sdk = mock.Mock() + api = mock.Mock() + org_id = "org" + storage = mock.Mock() + batch = backup.BackupBatch(["ws1"]) + # Raise exception on first call, succeed on second + call_count = {"count": 0} + + def fail_once(*args, **kwargs): + if call_count["count"] == 0: + call_count["count"] += 1 + raise Exception("fail") + return None + + get_workspace_export_mock.side_effect = fail_once + + backup.process_batch(sdk, api, org_id, storage, batch) + + assert 
get_workspace_export_mock.call_count == 2 + assert logger_mock.info.call_args_list[0][0][0].startswith( + "Unexpected error while processing a batch. Retrying" + ) + storage.export.assert_called_once() + + +@mock.patch("scripts.backup.logger") +@mock.patch("scripts.backup.archive_gooddata_layouts_to_zip") +@mock.patch("scripts.backup.get_workspace_export") +def test_process_batch_raises_after_max_retries( + get_workspace_export_mock, _archive_zip_mock, logger_mock +): + sdk = mock.Mock() + api = mock.Mock() + org_id = "org" + storage = mock.Mock() + batch = backup.BackupBatch(["ws1"]) + get_workspace_export_mock.side_effect = Exception("fail") + + with pytest.raises(Exception, match="fail"): + backup.process_batch( + sdk, + api, + org_id, + storage, + batch, + retry_count=backup.BackupSettings.MAX_RETRIES, + ) + logger_mock.error.assert_called() diff --git a/tests/test_restore.py b/tests/test_restore.py index f014ea3..dcdd6ef 100644 --- a/tests/test_restore.py +++ b/tests/test_restore.py @@ -1,20 +1,26 @@ # (C) 2025 GoodData Corporation +import os +import sys + +sys.path.insert( + 0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../scripts")) +) + import argparse import json import logging -import os import tempfile from pathlib import Path from unittest import mock import boto3 -import gooddata_sdk as gd_sdk import pytest +from gooddata_sdk.sdk import GoodDataSdk from moto import mock_s3 from scripts import restore -LOGGER_NAME = "scripts.restore" +LOGGER_NAME = "restore.py" MOCK_DL_TARGET = Path("overlays.zip") TEST_CONF_PATH = "tests/data/restore/test_conf.yaml" TEST_CSV_PATH = "tests/data/restore/test.csv" @@ -24,6 +30,20 @@ S3_BACKUP_PATH = "some/s3/backup/path/org_id/" S3_BUCKET = "some-s3-bucket" +# TODO: Verify that the tests use proper mocking - some of the tests appear to be sensitive to AWS CLI +# local settings or to be making real calls: +# - tests/test_restore.py::test_s3_storage +# - 
tests/test_restore.py::test_s3_storage_no_target_only_dir +# - tests/test_restore.py::test_s3_storage_no_target +# - tests/test_restore.py::test_incremental_restore +# - tests/test_restore.py::test_incremental_restore_different_ws_source +# - tests/test_restore.py::test_incremental_restore_one_succeeds_one_fails +# - tests/test_restore.py::test_e2e +# +# likely culprit is restore.S3Storage._validate_backup_path method +# +# As a side effect, some other tests that assert RuntimeError being raised become false positives + class MockGdWorkspace: def __init__(self, id: str) -> None: @@ -156,7 +176,6 @@ def test_get_unknown_storage_raises_error(): def test_s3_storage(create_backups_in_bucket): create_backups_in_bucket(["ws_id"]) - conf = restore.BackupRestoreConfig(TEST_CONF_PATH) storage = restore.S3Storage(conf) @@ -183,8 +202,9 @@ def test_s3_storage_no_target(s3_bucket): def test_init_ldm_with_ws_data_filter_cols(): # Regression test - this doesn't work for sdk 1.3 and lesser - sdk = gd_sdk.GoodDataSdk.create("", "") + sdk = GoodDataSdk.create("", "") model = sdk.catalog_workspace_content.load_ldm_from_disk(TEST_LDM_PATH) + assert model.ldm is not None assert len(model.ldm.datasets) == 1 diff --git a/tests/test_utils/test_backup_utils/test_input_loader.py b/tests/test_utils/test_backup_utils/test_input_loader.py index 163f4dd..bc7465f 100644 --- a/tests/test_utils/test_backup_utils/test_input_loader.py +++ b/tests/test_utils/test_backup_utils/test_input_loader.py @@ -12,6 +12,7 @@ from scripts.utils.backup_utils.input_loader import InputLoader from scripts.utils.gd_api import GDApi +from scripts.utils.models.batch import Size from scripts.utils.models.workspace_response import ( Hierarchy, Links, @@ -29,7 +30,7 @@ @pytest.fixture def input_loader(): - loader = InputLoader(MOCK_GDP_API, page_size=2) + loader = InputLoader(MOCK_GDP_API, page_size=Size(size=2)) loader.hierarchy_endpoint = "/fake/hierarchy?filter=parent.id=={parent_id}" loader.all_workspaces_endpoint = 
"/fake/all" return loader