diff --git a/transforms/tabular-thresholding-tool/.bumpversion.cfg b/transforms/tabular-thresholding-tool/.bumpversion.cfg index be2d0e1..5dfe954 100644 --- a/transforms/tabular-thresholding-tool/.bumpversion.cfg +++ b/transforms/tabular-thresholding-tool/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.7-dev0 +current_version = 0.1.8-dev2 commit = True tag = False parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))? diff --git a/transforms/tabular-thresholding-tool/Dockerfile b/transforms/tabular-thresholding-tool/Dockerfile index bbd6fec..5ccf57a 100644 --- a/transforms/tabular-thresholding-tool/Dockerfile +++ b/transforms/tabular-thresholding-tool/Dockerfile @@ -1,9 +1,9 @@ -FROM polusai/bfio:2.3.6 +FROM polusai/bfio:2.4.5 # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".csv" +ENV POLUS_TAB_EXT=".arrow" ENV POLUS_LOG="INFO" # Work directory defined in the base container @@ -19,7 +19,6 @@ COPY . ${EXEC_DIR}/tabular-tools # Install the tool RUN pip3 install "${EXEC_DIR}/tabular-tools/${TOOL_DIR}" --no-cache-dir -# Set the entrypoint # TODO: Change the entrypoint to the tool entrypoint ENTRYPOINT ["python3", "-m", "polus.tabular.transforms.tabular_thresholding"] CMD ["--help"] diff --git a/transforms/tabular-thresholding-tool/README.md b/transforms/tabular-thresholding-tool/README.md index aaf0455..c32d04e 100644 --- a/transforms/tabular-thresholding-tool/README.md +++ b/transforms/tabular-thresholding-tool/README.md @@ -1,4 +1,4 @@ -# Tabular Thresholding Plugin (v0.1.7-dev0) +# Tabular Thresholding Plugin (v0.1.8-dev2) This plugin uses three [threshold methods](https://github.com/nishaq503/thresholding.git) to compute threshold values on a user-defined variable and then determines if each label (ROI) is above or below the calculated threshold value. 
A new feature column will be computed for selected threshold method with the values in binary format (0, 1) \ *0* `negative or below threshold`\ *1* `positive or above threshold` @@ -30,7 +30,7 @@ contents of `plugin.json` into the pop-up window and submit. ## Options -This plugin takes 10 input arguments and one output argument: +This plugin takes 9 input arguments and one output argument: | Name | Description | I/O | Type | |-------------------------|-----------------------------------------------------|--------|---------------| @@ -43,5 +43,4 @@ This plugin takes 10 input arguments and one output argument: | `--falsePositiverate` | Area to the right of the threshold | Input | float | | `--numBins` | Number of bins for histogram | Input | number | | `--n` | Number of standard deviation | Input | number | -| `--outFormat` | Output file format | Input | enum | | `--outDir` | Output collection | Output | genericData | diff --git a/transforms/tabular-thresholding-tool/VERSION b/transforms/tabular-thresholding-tool/VERSION index be8ec7f..89894cb 100644 --- a/transforms/tabular-thresholding-tool/VERSION +++ b/transforms/tabular-thresholding-tool/VERSION @@ -1 +1 @@ -0.1.7-dev0 +0.1.8-dev2 diff --git a/transforms/tabular-thresholding-tool/ict.yaml b/transforms/tabular-thresholding-tool/ict.yaml index 4ad366f..7681a87 100644 --- a/transforms/tabular-thresholding-tool/ict.yaml +++ b/transforms/tabular-thresholding-tool/ict.yaml @@ -1,146 +1,135 @@ author: -- Hamdah Shafqat -- Najib Ishaq + - Hamdah Shafqat + - Najib Ishaq contact: hamdahshafqat.abbasi@nih.gov -container: polusai/tabular-thresholding-tool:0.1.7-dev0 -description: This plugin computes thresholds using three methods and apply thresholds +container: polusai/tabular-thresholding-tool:0.1.8-dev2 +description: + This plugin computes thresholds using three methods and apply thresholds on each labelled data to produce binary outputs entrypoint: python3 -m polus.tabular.transforms.tabular_thresholding inputs: -- 
description: Directory containing tabular data - format: - - inpDir - name: inpDir - required: true - type: path -- description: Pattern to parse input files - format: - - filePattern - name: filePattern - required: false - type: string -- description: FeatureName containing information about the position of non treated - wells - format: - - negControl - name: negControl - required: true - type: string -- description: FeatureName containing information about the position of wells with - known treatment outcome - format: - - posControl - name: posControl - required: false - type: string -- description: Name of the Variable for computing thresholds - format: - - varName - name: varName - required: true - type: string -- description: Name of the threshold method - format: - - thresholdType - name: thresholdType - required: true - type: string -- description: False positive rate threshold value - format: - - falsePositiverate - name: falsePositiverate - required: false - type: number -- description: Number of Bins for otsu threshold - format: - - numBins - name: numBins - required: false - type: number -- description: Number of Standard deviation - format: - - n - name: n - required: false - type: number -- description: Output format - format: - - outFormat - name: outFormat - required: true - type: string -name: polusai/tabular-thresholding-plugin + - description: Directory containing tabular data + format: + - inpDir + name: inpDir + required: true + type: path + - description: Pattern to parse input files + format: + - filePattern + name: filePattern + required: false + type: string + - description: + FeatureName containing information about the position of non treated + wells + format: + - negControl + name: negControl + required: true + type: string + - description: + FeatureName containing information about the position of wells with + known treatment outcome + format: + - posControl + name: posControl + required: false + type: string + - description: Name of the 
Variable for computing thresholds + format: + - varName + name: varName + required: true + type: string + - description: Name of the threshold method + format: + - thresholdType + name: thresholdType + required: true + type: string + - description: False positive rate threshold value + format: + - falsePositiverate + name: falsePositiverate + required: false + type: number + - description: Number of Bins for otsu threshold + format: + - numBins + name: numBins + required: false + type: number + - description: Number of Standard deviation + format: + - n + name: n + required: false + type: number +name: polusai/tabular-thresholding-tool outputs: -- description: Output collection - format: - - outDir - name: outDir - required: true - type: path + - description: Output collection + format: + - outDir + name: outDir + required: true + type: path repository: https://github.com/PolusAI/tabular-tools specVersion: 1.0.0 -title: tabular-thresholding-plugin +title: tabular-thresholding-tool ui: -- description: Input directory containing tabular data - key: inputs.inpDir - title: inpDir - type: path -- description: Pattern to parse input files - key: inputs.filePattern - title: filePattern - type: text -- description: FeatureName containing information about the position of non treated - wells - key: inputs.negControl - title: negControl - type: text -- description: FeatureName containing information about the position of wells with - known treatment outcome - key: inputs.posControl - title: posControl - type: text -- description: FeatureName containing information about the position of wells with - known treatment outcome - key: inputs.posControl - title: posControl - type: text -- description: Name of the Variable for computing thresholds - key: inputs.varName - title: varName - type: text -- description: Name of the threshold method - fields: - - fpr - - otsu - - nsigma - - all - key: inputs.thresholdType - title: thresholdType - type: select -- default: 1.0 - description: 
False positive rate threshold value - key: inputs.falsePositiverate - title: falsePositiverate - type: number -- default: 512 - description: Number of Bins for otsu threshold - key: inputs.numBins - title: numBins - type: number -- default: 4 - description: Number of Standard deviation - key: inputs.n - title: n - type: number -- description: Output format - fields: - - .csv - - .feather - - .parquet - - .hdf5 - - .arrow - - default - key: inputs.outFormat - title: outFormat - type: select -version: 0.1.7-dev0 + - description: Input directory containing tabular data + key: inputs.inpDir + title: inpDir + type: path + - description: Pattern to parse input files + key: inputs.filePattern + title: filePattern + type: text + - description: + FeatureName containing information about the position of non treated + wells + key: inputs.negControl + title: negControl + type: text + - description: + FeatureName containing information about the position of wells with + known treatment outcome + key: inputs.posControl + title: posControl + type: text + - description: + FeatureName containing information about the position of wells with + known treatment outcome + key: inputs.posControl + title: posControl + type: text + - description: Name of the Variable for computing thresholds + key: inputs.varName + title: varName + type: text + - description: Name of the threshold method + fields: + - fpr + - otsu + - nsigma + - all + key: inputs.thresholdType + title: thresholdType + type: select + - default: 1.0 + description: False positive rate threshold value + key: inputs.falsePositiverate + title: falsePositiverate + type: number + - default: 512 + description: Number of Bins for otsu threshold + key: inputs.numBins + title: numBins + type: number + - default: 4 + description: Number of Standard deviation + key: inputs.n + title: n + type: number +version: 0.1.8-dev2 diff --git a/transforms/tabular-thresholding-tool/package-release.sh 
b/transforms/tabular-thresholding-tool/package-release.sh index 82b4870..7e78b43 100644 --- a/transforms/tabular-thresholding-tool/package-release.sh +++ b/transforms/tabular-thresholding-tool/package-release.sh @@ -10,4 +10,4 @@ bump2version --config-file bumpversion.cfg --new-version ${version} --allow-dirt ./build-docker.sh # Push to dockerhub -docker push polusai/tabular-thresholding-plugin:${version} +docker push polusai/tabular-thresholding-tool:${version} diff --git a/transforms/tabular-thresholding-tool/plugin.json b/transforms/tabular-thresholding-tool/plugin.json index 9388c5a..f471e78 100644 --- a/transforms/tabular-thresholding-tool/plugin.json +++ b/transforms/tabular-thresholding-tool/plugin.json @@ -1,6 +1,6 @@ { "name": "tabular-thresholding-plugin", - "version": "0.1.7-dev0", + "version": "0.1.8-dev2", "title": "tabular-thresholding-plugin", "description": "This plugin computes thresholds using three methods and apply thresholds on each labelled data to produce binary outputs", "author": "Hamdah Shafqat Abbasi (hamdahshafqat.abbasi@nih.gov), Najib Ishaq (najib.ishaq@nih.gov)", @@ -8,7 +8,7 @@ "repository": "https://github.com/PolusAI/tabular-tools", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/tabular-thresholding-tool:0.1.7-dev0", + "containerId": "polusai/tabular-thresholding-tool:0.1.8-dev2", "baseCommand": [ "python3", "-m", @@ -84,22 +84,6 @@ "type": "number", "options": null, "required": false - }, - { - "name": "outFormat", - "description": "Output format", - "type": "enum", - "options": { - "values": [ - ".csv", - ".feather", - ".parquet", - ".hdf5", - ".arrow", - "default" - ] - }, - "required": true } ], "outputs": [ @@ -166,12 +150,6 @@ "title": "n", "description": "Number of Standard deviation", "default": 4 - }, - { - "key": "inputs.outFormat", - "title": "outFormat", - "description": "Output format", - "default": ".arrow" } ] } diff --git 
a/transforms/tabular-thresholding-tool/pyproject.toml b/transforms/tabular-thresholding-tool/pyproject.toml index bf24b80..cce80be 100644 --- a/transforms/tabular-thresholding-tool/pyproject.toml +++ b/transforms/tabular-thresholding-tool/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "polus-tabular-transforms-tabular-thresholding" -version = "0.1.7-dev0" +version = "0.1.8-dev2" description = "This plugin computes thresholds using three methods and apply thresholds on each labelled data to produce binary outputs." authors = [ "Hamdah Shafqat Abbasi ", @@ -14,8 +14,9 @@ python = ">=3.9" filepattern = "^2.0.4" typer = "^0.7.0" tqdm = "^4.64.1" -vaex = "^4.17.0" pyarrow = ">=16.0,<17.0" +pandas = "^2.2.3" +numpy = "1.26.4" [tool.poetry.group.dev.dependencies] diff --git a/transforms/tabular-thresholding-tool/run-plugin.sh b/transforms/tabular-thresholding-tool/run-plugin.sh index 55e02a5..9f84e1a 100755 --- a/transforms/tabular-thresholding-tool/run-plugin.sh +++ b/transforms/tabular-thresholding-tool/run-plugin.sh @@ -19,7 +19,6 @@ thresholdType='all' numBins=512 falsePositiverate=0.1 n=4 -outFormat=".arrow" # Log level, must be one of ERROR, CRITICAL, WARNING, INFO, DEBUG @@ -27,7 +26,7 @@ LOGLEVEL=INFO docker run --mount type=bind,source=${datapath},target=/data/ \ --user $(id -u):$(id -g) \ --env POLUS_LOG=${LOGLEVEL} \ - polusai/tabular-thresholding-plugin:${version} \ + polusai/tabular-thresholding-tool:${version} \ --inpDir ${inpDir} \ --filePattern ${filePattern} \ --negControl ${negControl} \ @@ -37,5 +36,4 @@ docker run --mount type=bind,source=${datapath},target=/data/ \ --falsePositiverate ${falsePositiverate} \ --numBins ${numBins} \ --n ${n} \ - --outFormat ${outFormat} \ --outDir ${outDir} diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__init__.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__init__.py index 3aae081..1ed48e0 100644 --- 
a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__init__.py +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__init__.py @@ -1,4 +1,4 @@ """Tabular Thresholding.""" -__version__ = "0.1.7-dev0" +__version__ = "0.1.8-dev2" from . import tabular_thresholding diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__main__.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__main__.py index f6a00bb..0d1c843 100644 --- a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__main__.py +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/__main__.py @@ -7,14 +7,13 @@ import time from functools import partial from multiprocessing import cpu_count -from typing import Any, List, Optional, Union +from typing import Any +from typing import Optional +from typing import Union import filepattern as fp import typer - -from polus.tabular.transforms.tabular_thresholding import ( - tabular_thresholding as tt, -) +from polus.tabular.transforms.tabular_thresholding import tabular_thresholding as tt # Initialize the logger logging.basicConfig( @@ -31,14 +30,14 @@ @app.command() -def main( +def main( # noqa:PLR0913 inp_dir: pathlib.Path = typer.Option( ..., "--inpDir", help="Path to the input data", ), file_pattern: str = typer.Option( - ".+", + ".*", "--filePattern", help="Patttern to parse file names", ), @@ -47,27 +46,32 @@ def main( "--negControl", help="Column name containing information of the position of non treated wells", ), - pos_control: str = typer.Option( - ..., + pos_control: Optional[str] = typer.Option( + None, "--posControl", - help="Column name containing information of the position of wells with known treatment outcome", + help="Column indicating the position of wells with known treatment outcomes", ), var_name: str = 
typer.Option( - tt.Methods.Default, "--varName", help="Column name for computing thresholds" + tt.Methods.Default, + "--varName", + help="Column name for computing thresholds", ), threshold_type: tt.Methods = typer.Option( - ..., "--thresholdType", help="Name of the threshold method" + ..., + "--thresholdType", + help="Name of the threshold method", ), false_positive_rate: float = typer.Option( - 0.1, "--falsePositiverate", help="False positive rate threshold value" + 0.1, + "--falsePositiverate", + help="False positive rate threshold value", ), num_bins: int = typer.Option( - 512, "--numBins", help="Number of Bins for otsu threshold" + 512, + "--numBins", + help="Number of Bins for otsu threshold", ), n: int = typer.Option(4, "--n", help="Number of Standard deviation"), - out_format: tt.Extensions = typer.Option( - tt.Extensions.Default, "--outFormat", help="Output format" - ), out_dir: pathlib.Path = typer.Option(..., "--outDir", help="Output collection"), preview: Optional[bool] = typer.Option( False, @@ -87,7 +91,6 @@ def main( logger.info(f"falsePositiverate = {false_positive_rate}") logger.info(f"numBins = {num_bins}") logger.info(f"n = {n}") - logger.info(f"outFormat = {out_format}") inp_dir = inp_dir.resolve() out_dir = out_dir.resolve() @@ -97,61 +100,64 @@ def main( out_dir.exists() ), f"{out_dir} doesnot exists!! 
Please check output path again" # By default it ingests all input files if not file_pattern is defined - file_pattern = ".*" + file_pattern fps = fp.FilePattern(inp_dir, file_pattern) if preview: - with open(pathlib.Path(out_dir, "preview.json"), "w") as jfile: - out_json: dict[Union[str, List], Any] = { + with pathlib.Path.open(pathlib.Path(out_dir, "preview.json"), "w") as jfile: + out_json: dict[Union[str, list], Any] = { "filepattern": file_pattern, "outDir": [], } for file in fps: - out_name = str(file[1][0].name.split(".")[0]) + "_binary" + out_format + out_name = str(file[1][0].name.split(".")[0]) + "_binary" + tt.POLUS_TAB_EXT thr_json = str(file[1][0].name.split(".")[0]) + "_thresholds.json" out_json["outDir"].append(out_name) out_json["outDir"].append(thr_json) json.dump(out_json, jfile, indent=2) - num_workers = max(multiprocessing.cpu_count() // 2, 2) + num_workers = max(multiprocessing.cpu_count() // 2, 1) flist = [f[1][0] for f in fps] + logger.info(f"Number of tabular files detected: {len(flist)}, filenames: {flist}") assert len(flist) != 0, f"No tabular file is detected: {flist}" - with multiprocessing.Pool(processes=num_workers) as executor: - executor.map( - partial( - tt.thresholding_func, - neg_control, - pos_control, - var_name, - threshold_type, - false_positive_rate, - num_bins, - n, - out_format, - out_dir, - ), - flist, + if len(flist) == 1: + tt.thresholding_func( + neg_control, + pos_control, + var_name, + threshold_type, + false_positive_rate, + num_bins, + n, + out_dir, + flist[0], ) - executor.close() - executor.join() - - # Deleting intermediate files from input directory - for f in inp_dir.iterdir(): - if f.is_file() and file_pattern != ".*.hdf5": - if f.suffix in [".hdf5", ".yaml"]: - os.remove(f) - else: - if ".hdf5.hdf5" in f.name or f.suffix == ".yaml": - os.remove(f) + else: + # Otherwise, use multiprocessing for parallel processing + with multiprocessing.Pool(processes=num_workers) as executor: + executor.map( + partial( + 
tt.thresholding_func, + neg_control, + pos_control, + var_name, + threshold_type, + false_positive_rate, + num_bins, + n, + out_dir, + ), + flist, + ) + executor.close() + executor.join() endtime = round((time.time() - starttime) / 60, 3) logger.info(f"Time taken to process binary threhold CSVs: {endtime} minutes!!!") - return if __name__ == "__main__": diff --git a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/tabular_thresholding.py b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/tabular_thresholding.py index cf6e50a..74fb38f 100644 --- a/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/tabular_thresholding.py +++ b/transforms/tabular-thresholding-tool/src/polus/tabular/transforms/tabular_thresholding/tabular_thresholding.py @@ -8,7 +8,10 @@ from typing import Union import numpy as np -import vaex +import pyarrow as pa +import pyarrow.csv as pacsv +import pyarrow.parquet as pq +import pyarrow.feather as pf from .thresholding import custom_fpr from .thresholding import n_sigma @@ -19,17 +22,6 @@ POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") -class Extensions(str, enum.Enum): - """File format of an output file.""" - - CSV = ".csv" - ARROW = ".arrow" - PARQUET = ".parquet" - HDF = ".hdf5" - FEATHER = ".feather" - Default = POLUS_TAB_EXT - - class Methods(str, enum.Enum): """Threshold methods.""" @@ -48,7 +40,6 @@ def thresholding_func( # noqa: PLR0915, PLR0912, PLR0913, C901 false_positive_rate: float, num_bins: int, n: int, - out_format: Extensions, out_dir: pathlib.Path, file: pathlib.Path, ) -> None: @@ -60,127 +51,175 @@ def thresholding_func( # noqa: PLR0915, PLR0912, PLR0913, C901 Args: neg_control: Column name containing information of non treated wells. - pos_control: Column name containing information of wells with the known - treatment. 
+ pos_control: Column name for the well with known treatment var_name: Column name for computing thresholds. threshold_type: Name of threshold method. false_positive_rate: Tuning parameter. num_bins: Number of bins. n: Number of standard deviation away from mean value. - out_format: Output file extension. out_dir: Output directory. file: Filename. - """ - chunk_size = 100_000 if file.suffix == ".csv": - df = vaex.from_csv(file, convert=True, chunk_size=chunk_size) + # Read CSV using pyarrow.csv + table = pacsv.read_csv(file) else: - df = vaex.open(file, convert=True, progress=True) + # For Arrow or Parquet files, load directly as PyArrow table + if file.suffix == ".arrow": + table = pa.ipc.open_file(file).read_all() + elif file.suffix == ".parquet": + table = pq.read_table(file) + elif file.suffix == ".feather": + table = pf.read_feather(file) + else: + raise ValueError(f"Unsupported file format: {file.suffix}") + + plate = table["plate"].unique()[0] + + # Check for missing columns based on whether pos_control is provided + missing_columns = ( + not all(item in table.column_names for item in [var_name, neg_control]) + if pos_control is None + else not all( + item in table.column_names for item in [var_name, neg_control, pos_control] + ) + ) - if not any( - item in [var_name, neg_control, pos_control] for item in list(df.columns) - ): - msg = ( - f"{file} table is missing {var_name}, {neg_control}, {pos_control} " - "column names tabular data file. Please check variables again!" + if missing_columns: + missing_msg = ( + f"{file} is missing {var_name} and {neg_control} columns." + if pos_control is None + else f"{file} is missing {var_name}, {neg_control}, {pos_control} column." ) + logger.error(missing_msg) + raise ValueError(missing_msg) + + if table.num_rows == 0: + msg = f"File {file} is not loaded properly! Please check input files again!" logger.error(msg) raise ValueError(msg) - if df.shape == (0, 0): - msg = f"File {file} is not loaded properly! 
Please check input files again!" + # Convert to Pandas for specific operations if needed + df = table.to_pandas() + + unique_neg = df[neg_control].unique() + + if not np.array_equal(np.sort(unique_neg), [0, 1]): + msg = ( + f"The {neg_control} column has unique values {unique_neg}, " + "which are not exactly [0, 1]. Ensure proper negative controls are set." + ) logger.error(msg) raise ValueError(msg) + if pos_control: + unique_positive = df[pos_control].unique() + if not np.array_equal(np.sort(unique_positive), [0, 1]): + msg = ( + f"The {pos_control} column has unique values {unique_positive}, " + "which are not exactly [0, 1]. Verify positive controls" + ) + logger.error(msg) + raise ValueError(msg) + if pos_control is None: - msg = "`pos_control` is missing. Otsu threshold will not be computed!" + msg = "pos_control is missing. Otsu threshold will not be computed!" logger.info(msg) threshold_dict: dict[str, Union[float, str]] = {} - plate = file.stem - threshold_dict["plate"] = plate - - if df[neg_control].unique() != [0.0, 1.0]: - warnings.warn( - "controls are missing. 
NaN value are computed for thresholds", - stacklevel=1, - ) - nan_value = np.nan * np.arange(0, len(df[neg_control].values), 1) - threshold_dict["fpr"] = np.nan - threshold_dict["otsu"] = np.nan - threshold_dict["nsigma"] = np.nan - df["fpr"] = nan_value - df["otsu"] = nan_value - df["nsigma"] = nan_value - - else: + nan_value = np.nan * np.arange(0, len(df[neg_control].values), 1) + threshold_dict["FPR"] = np.nan + threshold_dict["OTSU"] = np.nan + threshold_dict["NSIGMA"] = np.nan + df["FPR"] = nan_value + df["OTSU"] = nan_value + df["NSIGMA"] = nan_value + + if pos_control: pos_controls = df[df[pos_control] == 1][var_name].values - neg_controls = df[df[neg_control] == 1][var_name].values - if threshold_type == "fpr": - logger.info(threshold_type) - threshold = custom_fpr.find_threshold( - neg_controls, - false_positive_rate=false_positive_rate, - ) - threshold_dict[threshold_type] = threshold - df[threshold_type] = df.func.where(df[var_name] <= threshold, 0, 1) - elif threshold_type == "otsu": + neg_controls = df[df[neg_control] == 1][var_name].values + + if threshold_type == "fpr": + threshold = custom_fpr.find_threshold( + neg_controls, + false_positive_rate=false_positive_rate, + ) + threshold_dict["FPR"] = threshold + df["FPR"] = np.where(df[var_name] <= threshold, 0, 1) + + elif threshold_type == "otsu": + if len(pos_controls) == 0: + msg = f"{pos_control} controls missing. 
NaN values for Otsu thresholds" + logger.error(msg) + threshold_dict["OTSU"] = np.nan + df["OTSU"] = np.nan * np.arange(0, len(df[var_name].values), 1) + else: combine_array = np.append(neg_controls, pos_controls, axis=0) threshold = otsu.find_threshold( combine_array, num_bins=num_bins, normalize_histogram=False, ) - threshold_dict[threshold_type] = threshold - df[threshold_type] = df.func.where(df[var_name] <= threshold, 0, 1) - elif threshold_type == "nsigma": - threshold = n_sigma.find_threshold(neg_controls, n=n) - threshold_dict[threshold_type] = threshold - df[threshold_type] = df.func.where(df[var_name] <= threshold, 0, 1) - elif threshold_type == "all": - fpr_thr = custom_fpr.find_threshold( - neg_controls, - false_positive_rate=false_positive_rate, - ) - combine_array = np.append(neg_controls, pos_controls, axis=0) + threshold_dict["OTSU"] = threshold + df["OTSU"] = np.where(df[var_name] <= threshold, 0, 1) + elif threshold_type == "nsigma": + threshold = n_sigma.find_threshold(neg_controls, n=n) + threshold_dict["NSIGMA"] = threshold + df["NSIGMA"] = np.where(df[var_name] <= threshold, 0, 1) + elif threshold_type == "all": + fpr_thr = custom_fpr.find_threshold( + neg_controls, + false_positive_rate=false_positive_rate, + ) + combine_array = np.append(neg_controls, pos_controls, axis=0) - if len(pos_controls) == 0: - warnings.warn( - "controls are missing. 
NaN value are computed for otsu thresholds", - stacklevel=1, - ) - threshold_dict["otsu"] = np.nan - df["otsu"] = np.nan * np.arange(0, len(df[var_name].values), 1) - else: - otsu_thr = otsu.find_threshold( - combine_array, - num_bins=num_bins, - normalize_histogram=False, - ) - threshold_dict["otsu"] = otsu_thr - df["otsu"] = df.func.where(df[var_name] <= otsu_thr, 0, 1) + if len(pos_controls) == 0: + warnings.warn( # noqa: B028 + f"{pos_control} missing; NaN values computed for Otsu thresholds", + ) + threshold_dict["OTSU"] = np.nan + df["OTSU"] = np.nan * np.arange(0, len(df[var_name].values), 1) + else: + otsu_thr = otsu.find_threshold( + combine_array, + num_bins=num_bins, + normalize_histogram=False, + ) + threshold_dict["OTSU"] = otsu_thr + df["OTSU"] = np.where(df[var_name] <= otsu_thr, 0, 1) nsigma_thr = n_sigma.find_threshold(neg_controls, n=n) - threshold_dict["fpr"] = fpr_thr - threshold_dict["nsigma"] = nsigma_thr - df["fpr"] = df.func.where(df[var_name] <= fpr_thr, 0, 1) - df["nsigma"] = df.func.where(df[var_name] <= nsigma_thr, 0, 1) + threshold_dict["FPR"] = fpr_thr + threshold_dict["NSIGMA"] = nsigma_thr + df["FPR"] = np.where(df[var_name] <= fpr_thr, 0, 1) + df["NSIGMA"] = np.where(df[var_name] <= nsigma_thr, 0, 1) - outjson = pathlib.Path(out_dir).joinpath(f"{plate}_thresholds.json") - with outjson.open("w") as outfile: + outjson = out_dir.joinpath(f"{plate}_thresholds.json") + with pathlib.Path.open(outjson, "w") as outfile: json.dump(threshold_dict, outfile) logger.info(f"Saving Thresholds in JSON fileformat {outjson}") + + out_format = POLUS_TAB_EXT + outname = out_dir.joinpath(f"{plate}_binary{POLUS_TAB_EXT}") + + if out_format in [".feather", ".arrow", ".parquet"]: + # Convert back to PyArrow Table if output is .arrow or .parquet + output_table = pa.Table.from_pandas(df) + + if out_format == ".arrow": + with pa.OSFile(str(outname), "wb") as sink: + writer = pa.ipc.new_file(sink, output_table.schema) + writer.write_table(output_table) + 
writer.close() + + elif out_format == ".parquet": + pq.write_table(output_table, outname) + + elif out_format == ".feather": + pf.write_feather(output_table, outname) - if f"{out_format}" in [".feather", ".arrow"]: - outname = pathlib.Path(out_dir, f"{plate}_binary{out_format}") - df.export_feather(outname) - logger.info(f"Saving f'{plate}_binary{out_format}") - elif f"{out_format}" == ".csv": - outname = pathlib.Path(out_dir).joinpath(f"{plate}_binary{out_format}") - df.export_csv(path=outname, chunk_size=chunk_size) else: - outname = pathlib.Path(out_dir).joinpath(f"{plate}_binary{out_format}") - df.export(outname, progress=True) - logger.info(f"Saving f'{plate}_binary{out_format}") + pacsv.write_csv(output_table, outname) + + logger.info(f"Saving {plate}_binary{out_format}") \ No newline at end of file diff --git a/transforms/tabular-thresholding-tool/tabular-thresholding-plugin.cwl b/transforms/tabular-thresholding-tool/tabular-thresholding-plugin.cwl index 1267cab..b01bf42 100644 --- a/transforms/tabular-thresholding-tool/tabular-thresholding-plugin.cwl +++ b/transforms/tabular-thresholding-tool/tabular-thresholding-plugin.cwl @@ -29,10 +29,6 @@ inputs: inputBinding: prefix: --outDir type: Directory - outFormat: - inputBinding: - prefix: --outFormat - type: string posControl: inputBinding: prefix: --posControl @@ -52,7 +48,7 @@ outputs: type: Directory requirements: DockerRequirement: - dockerPull: polusai/tabular-thresholding-tool:0.1.7-dev0 + dockerPull: polusai/tabular-thresholding-tool:0.1.8-dev2 InitialWorkDirRequirement: listing: - entry: $(inputs.outDir) diff --git a/transforms/tabular-thresholding-tool/tests/test_main.py b/transforms/tabular-thresholding-tool/tests/test_main.py index b2a543f..a2d8f9e 100644 --- a/transforms/tabular-thresholding-tool/tests/test_main.py +++ b/transforms/tabular-thresholding-tool/tests/test_main.py @@ -1,5 +1,4 @@ -"""Tabular Thresholding.""" - +"""Testing of Tabular Thresholding.""" import pathlib import random import 
shutil @@ -10,14 +9,15 @@ import numpy as np import pandas as pd import pytest -import vaex +import pyarrow as pa +import pyarrow.csv as pv from polus.tabular.transforms.tabular_thresholding import ( tabular_thresholding as tt, ) class Generatedata: - """Generate tabular data with several different file format.""" + """Generate tabular data with several different file formats.""" def __init__(self, file_pattern: str, size: int, outname: str) -> None: """Define instance attributes.""" @@ -41,12 +41,13 @@ def create_dataframe(self) -> pd.core.frame.DataFrame: """Create Pandas dataframe.""" diction_1 = { "A": list(range(self.size)), - "B": [random.choice(string.ascii_letters) for i in range(self.size)], + "B": [random.choice(string.ascii_letters) for _ in range(self.size)], "C": np.random.randint(low=1, high=100, size=self.size), "D": np.random.normal(0.0, 1.0, size=self.size), "MEAN": np.linspace(1.0, 4000.0, self.size), - "neg_control": [random.choice("01") for i in range(self.size)], - "pos_neutral": [random.choice("01") for i in range(self.size)], + "neg_control": [random.choice("01") for _ in range(self.size)], + "pos_neutral": [random.choice("01") for _ in range(self.size)], + "plate": ["CD_SOD1_2_E1023886__1" for _ in range(self.size)], } df = pd.DataFrame(diction_1) @@ -72,25 +73,30 @@ def feather_func(self) -> None: self.x.to_feather(pathlib.Path(self.inp_dir, self.outname)) def arrow_func(self) -> None: - """Convert pandas dataframe to Arrow file format.""" - self.x.to_feather(pathlib.Path(self.inp_dir, self.outname)) - - def hdf_func(self) -> None: - """Convert pandas dataframe to hdf5 file format.""" - v_df = vaex.from_pandas(self.x, copy_index=False) - v_df.export(pathlib.Path(self.inp_dir, self.outname)) + """Convert pandas dataframe to Arrow IPC file format.""" + table = pa.Table.from_pandas(self.x) + arrow_path = pathlib.Path(self.inp_dir, self.outname) + with pa.OSFile(str(arrow_path), "wb") as sink: + with pa.RecordBatchFileWriter(sink, table.schema) 
as writer: + writer.write_table(table) + + # Verify that the file is written correctly + with pa.memory_map(arrow_path, "r") as source: + try: + pa.ipc.RecordBatchFileReader(source).read_all() + except pa.ArrowInvalid: + raise ValueError(f"The file {arrow_path} is not a valid Arrow file.") def __call__(self) -> None: """To make a class callable.""" data_ext = { - ".hdf5": self.hdf_func, ".csv": self.csv_func, ".parquet": self.parquet_func, ".feather": self.feather_func, ".arrow": self.arrow_func, } - return data_ext[self.file_pattern]() + return data_ext[self.file_pattern]() # No changes here, this is correct def clean_directories(self): """Remove files.""" @@ -99,46 +105,55 @@ def clean_directories(self): shutil.rmtree(d) -EXT = [[".csv", ".feather", ".arrow", ".parquet", ".hdf5"]] - +# List of extensions to test +EXT = [[".csv", ".feather", ".arrow", ".parquet"]] @pytest.fixture(params=EXT) def poly(request): - """To get the parameter of the fixture.""" - return request.param - + """Fixture to get the file extension parameter for testing.""" + return request.param[0] # Return the extension, e.g., ".csv", not the list def test_tabular_thresholding(poly): - """Testing of merging of tabular data by rows with equal number of rows.""" - for i in poly: - d = Generatedata(i, outname=f"data_1{i}", size=1000000) - d() - pattern = f".*{i}" - fps = fp.FilePattern(d.get_inp_dir(), pattern) - for file in fps(): - tt.thresholding_func( - neg_control="neg_control", - pos_control="pos_neutral", - var_name="MEAN", - threshold_type="all", - false_positive_rate=0.01, - num_bins=512, - n=4, - out_format=i, - out_dir=d.get_out_dir(), - file=file[1][0], - ) - - assert i in [f.suffix for f in d.get_out_dir().iterdir()] - - df = vaex.open( - pathlib.Path(d.get_out_dir(), file[1][0].stem + "_binary" + i), - ) - threshold_methods = ["fpr", "otsu", "nsigma"] - assert (all(item in list(df.columns) for item in threshold_methods)) is True - assert np.allclose(np.unique(df[threshold_methods]), 
[0, 1]) is True - assert file[1][0].stem + "_thresholds.json" in [ - f.name for f in d.get_out_dir().iterdir() - ] - - d.clean_directories() + """Test the merging of tabular data by rows with equal number of rows.""" + + # Generate data with the specified file extension + d = Generatedata(poly, outname=f"data_1{poly}", size=1000000) + d() + pattern = f".*{poly}" + fps = fp.FilePattern(d.get_inp_dir(), pattern) + for file in fps(): + tt.thresholding_func( + neg_control="neg_control", + pos_control="pos_neutral", + var_name="MEAN", + threshold_type="all", + false_positive_rate=0.01, + num_bins=512, + n=4, + out_dir=d.get_out_dir(), + file=file[1][0], + ) + + # Find the processed file (excluding JSON files) + file = [f for f in d.get_out_dir().iterdir() if ".json" not in f.name][0] + + if file.suffix == ".arrow": + with pa.memory_map(str(file), "r") as source: + table = pa.ipc.RecordBatchFileReader(source).read_all() + else: + table = pv.read_csv(file) + + df = table.to_pandas() + + # List of expected threshold methods + threshold_methods = ["FPR", "OTSU", "NSIGMA"] + + # Check if the expected columns are present in the DataFrame + assert all(item in list(df.columns) for item in threshold_methods) + + # Check if the values in the threshold columns are either 0 or 1 + assert np.allclose(np.unique(df[threshold_methods]), [0, 1]) + + # Clean up directories after the test + d.clean_directories() +