diff --git a/retropath2_wrapper/Args.py b/retropath2_wrapper/Args.py index 1e68b6a..ee4c25d 100644 --- a/retropath2_wrapper/Args.py +++ b/retropath2_wrapper/Args.py @@ -15,9 +15,10 @@ ) DEFAULTS = { 'MSC_TIMEOUT': 10, # minutes - 'RP2_VERSION': 'r20250728', + 'RP2_VERSION': 'r20260119', 'KNIME_FOLDER': __PACKAGE_FOLDER, "STD_HYDROGEN": "auto", # How hydrogens are represented in chemical rules + "SCORE_MODE": "auto", # How to consider priorize rules according to their score } RETCODES = { 'OK': 0, @@ -95,7 +96,7 @@ def _add_arguments(parser): '--rp2_version', type=str, default=DEFAULTS['RP2_VERSION'], - choices=['v9', 'r20210127', 'r20220104', "r20220224", "r20250728"], + choices=['v9', 'r20210127', 'r20220104', "r20220224", "r20250728", "r20260119"], help=f'version of RetroPath2.0 workflow (default: {DEFAULTS["RP2_VERSION"]}).' ) @@ -116,6 +117,17 @@ def _add_arguments(parser): choices=["auto", "implicit", "explicit"], help="How hydrogens are represented in chemical rules, auto mode will try to guess from the chemical rules", ) + parser_rp.add_argument( + "--score_mode", + default=DEFAULTS["SCORE_MODE"], + choices=["auto", "maximize", "minimize"], + help=( + "How to consider priorize rules according to their score:" + " 'maximise' (higher is better), 'minimize' (lower is better)," + " 'auto' (will try to guess from the chemical rules)." 
def _collect_lines(handle, n: int) -> list[str]:
    """
    Read lines from an already opened file-like object.

    Parameters
    ----------
    handle : iterable of str or bytes
        Object yielding one line per iteration.
    n : int
        Maximum number of lines to read. If n <= 0, read all lines.

    Returns
    -------
    list[str]
        The lines, decoded as UTF-8 when needed, without line terminators.
    """
    collected = []
    for index, raw in enumerate(handle):
        if 0 < n <= index:
            break
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8", errors="ignore")
        # Strip only the line terminator: trailing tabs or spaces may be
        # meaningful (e.g. empty trailing columns in a TSV file).
        collected.append(raw.rstrip("\r\n"))
    return collected


# Function to return the first lines of a file (as a list)
def get_first_lines(path: str, n: int = 10) -> list[str]:
    """
    Return the first n lines of a file.

    The file may be plain text, a gzip compressed file ('.gz'), a tar
    archive ('.tar', '.tar.gz') or a zip archive ('.zip'). For archives,
    only the first regular file found is read.

    Parameters
    ----------
    path : str
        Path of the file.
    n : int
        Number of lines to return. If n <= 0, return all lines.

    Returns
    -------
    list[str]
        The first n lines of the file, without line terminators.
        Empty if an archive contains no readable file.
    """
    # --- Tar or Tar.gz archive ---
    if path.endswith((".tar", ".tar.gz")):
        mode = "r:gz" if path.endswith(".gz") else "r:"
        with tarfile.open(path, mode) as tar:
            # Pick the first regular file inside
            for member in tar:
                if not member.isfile():
                    continue
                extracted = tar.extractfile(member)
                if extracted is None:
                    continue
                with extracted:
                    return _collect_lines(extracted, n)
        return []

    # --- Gzip compressed single file ---
    if path.endswith(".gz"):
        with gzip.open(path, "rt", encoding="utf-8", errors="ignore") as handle:
            return _collect_lines(handle, n)

    # --- Zip archive ---
    if path.endswith(".zip"):
        with zipfile.ZipFile(path, "r") as archive:
            for info in archive.infolist():
                # Skip directory entries (they hold no content) and
                # entries whose name starts with '_'.
                if info.is_dir() or info.filename.startswith("_"):
                    continue
                with archive.open(info) as handle:
                    return _collect_lines(handle, n)
        return []

    # --- Plain text ---
    with open(path, "rt", encoding="utf-8", errors="ignore") as handle:
        return _collect_lines(handle, n)


def sniff_score_mode(path: str, default_mode: str = "maximize", logger: Logger = getLogger(__name__)) -> str:  # noqa: E501
    """
    Sniff the scoring mode used in reaction rules.

    The decision is based on the values found in the 'score' column
    (case-insensitive column name): if any value is greater than 1 the
    mode is 'minimize', otherwise it is 'maximize'. Whenever the file
    cannot be interpreted (empty file, unknown format, no header, no
    'score' column, no numeric score), `default_mode` is returned.

    Parameters
    ----------
    path : str
        Path of the file.
    default_mode : str
        Scoring mode returned when nothing can be inferred from the file.
    logger : Logger
        The logger object.

    Returns
    -------
    str
        The scoring mode: 'minimize', 'maximize' or `default_mode`.

    """
    lines = get_first_lines(path, n=-1)
    if not lines:
        logger.info(
            "Reaction rules file is empty."
            " Returning scoring mode: %s.", default_mode
        )
        return default_mode

    # Format detection only looks at the beginning of the file
    sample = "\n".join(lines[:10])
    sniffer = csv.Sniffer()
    try:
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
    except csv.Error as err:
        # Raised by the sniffer when no delimiter can be determined
        logger.info(
            "Could not determine the format of reaction rules (%s)."
            " Returning scoring mode: %s.", err, default_mode
        )
        return default_mode

    # Check if header is present
    if not has_header:
        logger.info(
            "No header detected in reaction rules."
            " Returning scoring mode: %s.", default_mode
        )
        return default_mode

    # Look for a column named exactly 'score' (a substring test would
    # also match columns such as 'score_normalized')
    header_row = next(csv.reader(lines[:1], dialect=dialect), [])
    fieldnames = [name.strip().lower() for name in header_row]
    if "score" not in fieldnames:
        logger.info(
            "No 'score' column detected in reaction rules."
            " Returning scoring mode: %s.", default_mode
        )
        return default_mode

    # Inspect all score values
    scores = []
    for row in csv.DictReader(lines[1:], fieldnames=fieldnames, dialect=dialect):
        try:
            scores.append(float(row["score"]))
        except (TypeError, ValueError):
            # TypeError: row too short, the score field is missing (None)
            # ValueError: the score field is not a number
            continue

    if not scores:
        logger.info(
            "No valid score values detected in reaction rules."
            " Returning scoring mode: %s.", default_mode
        )
        return default_mode
    if any(value > 1 for value in scores):
        logger.info(
            "Score > 1 detected, which is historically associated"
            " with 'minimize' mode. Returning 'minimize'."
        )
        return "minimize"
    logger.info(
        "All scores <= 1 detected, which is historically associated"
        " with 'maximize' mode. Returning 'maximize'."
    )
    return "maximize"
(params["score_mode"],)] if preference and preference.is_init(): preference.to_file() args += ["-preferences=" + self.standardize_path(preference.path)] diff --git a/retropath2_wrapper/workflows/RetroPath2.0_r20260119.knwf b/retropath2_wrapper/workflows/RetroPath2.0_r20260119.knwf new file mode 100644 index 0000000..c98e8ca Binary files /dev/null and b/retropath2_wrapper/workflows/RetroPath2.0_r20260119.knwf differ