From b71458cbac47d83f18cb5a13308bff609b9c516f Mon Sep 17 00:00:00 2001 From: Hamdah Shafqat Abbasi Date: Wed, 27 Nov 2024 17:17:22 -0500 Subject: [PATCH 01/12] adding new tabular statistic plugin --- .../polus-csv-statistics-plugin/Dockerfile | 17 - features/polus-csv-statistics-plugin/VERSION | 1 - .../build-docker.sh | 4 - features/polus-csv-statistics-plugin/ict.yaml | 60 --- .../polus-csv-statistics-plugin/src/main.py | 309 -------------- .../src/requirements.txt | 1 - .../tabular-statistics-tool/.bumpversion.cfg | 31 ++ features/tabular-statistics-tool/Dockerfile | 24 ++ .../README.md | 17 +- .../TabularStatistics.cwl | 36 ++ features/tabular-statistics-tool/VERSION | 1 + .../tabular-statistics-tool/build-docker.sh | 23 + features/tabular-statistics-tool/gitignore | 23 + features/tabular-statistics-tool/ict.yaml | 62 +++ .../plugin.json | 43 +- .../tabular-statistics-tool/pyproject.toml | 31 ++ .../features/tabular_statistics/__init__.py | 4 + .../features/tabular_statistics/__main__.py | 117 +++++ .../tabular_statistics/tabular_statistics.py | 399 ++++++++++++++++++ 19 files changed, 790 insertions(+), 413 deletions(-) delete mode 100644 features/polus-csv-statistics-plugin/Dockerfile delete mode 100644 features/polus-csv-statistics-plugin/VERSION delete mode 100755 features/polus-csv-statistics-plugin/build-docker.sh delete mode 100644 features/polus-csv-statistics-plugin/ict.yaml delete mode 100644 features/polus-csv-statistics-plugin/src/main.py delete mode 100644 features/polus-csv-statistics-plugin/src/requirements.txt create mode 100644 features/tabular-statistics-tool/.bumpversion.cfg create mode 100644 features/tabular-statistics-tool/Dockerfile rename features/{polus-csv-statistics-plugin => tabular-statistics-tool}/README.md (67%) create mode 100644 features/tabular-statistics-tool/TabularStatistics.cwl create mode 100644 features/tabular-statistics-tool/VERSION create mode 100755 features/tabular-statistics-tool/build-docker.sh create mode 100644 
features/tabular-statistics-tool/gitignore create mode 100644 features/tabular-statistics-tool/ict.yaml rename features/{polus-csv-statistics-plugin => tabular-statistics-tool}/plugin.json (71%) create mode 100644 features/tabular-statistics-tool/pyproject.toml create mode 100644 features/tabular-statistics-tool/src/polus/tabular/features/tabular_statistics/__init__.py create mode 100644 features/tabular-statistics-tool/src/polus/tabular/features/tabular_statistics/__main__.py create mode 100644 features/tabular-statistics-tool/src/polus/tabular/features/tabular_statistics/tabular_statistics.py diff --git a/features/polus-csv-statistics-plugin/Dockerfile b/features/polus-csv-statistics-plugin/Dockerfile deleted file mode 100644 index d6b8f9f..0000000 --- a/features/polus-csv-statistics-plugin/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM polusai/bfio:2.1.9 - -COPY VERSION / - -ARG EXEC_DIR="/opt/executables" -ARG DATA_DIR="/data" - -RUN mkdir -p ${EXEC_DIR} \ - && mkdir -p ${DATA_DIR}/inputs \ - && mkdir ${DATA_DIR}/outputs - -COPY src ${EXEC_DIR}/ -WORKDIR ${EXEC_DIR} - -RUN pip3 install -r ${EXEC_DIR}/requirements.txt - -ENTRYPOINT ["python3", "/opt/executables/main.py"] \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/VERSION b/features/polus-csv-statistics-plugin/VERSION deleted file mode 100644 index 7dff5b8..0000000 --- a/features/polus-csv-statistics-plugin/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.2.1 \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/build-docker.sh b/features/polus-csv-statistics-plugin/build-docker.sh deleted file mode 100755 index ff8f13c..0000000 --- a/features/polus-csv-statistics-plugin/build-docker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -version=$( fcheck: - fcheck += 1 - logger.info('Unique Files parsed: {}'.format(fnum)) \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/src/requirements.txt 
b/features/polus-csv-statistics-plugin/src/requirements.txt deleted file mode 100644 index 6dd96c6..0000000 --- a/features/polus-csv-statistics-plugin/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -filepattern==1.4.7 \ No newline at end of file diff --git a/features/tabular-statistics-tool/.bumpversion.cfg b/features/tabular-statistics-tool/.bumpversion.cfg new file mode 100644 index 0000000..026c094 --- /dev/null +++ b/features/tabular-statistics-tool/.bumpversion.cfg @@ -0,0 +1,31 @@ +[bumpversion] +current_version = 0.1.0-dev0 +commit = True +tag = False +parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] +[bumpversion:file:ict.yaml] +[bumpversion:file:TabularStatistics.cwl] + +[bumpversion:file:src/polus/tabular/features/tabular_statistics/__init__.py] \ No newline at end of file diff --git a/features/tabular-statistics-tool/Dockerfile b/features/tabular-statistics-tool/Dockerfile new file mode 100644 index 0000000..5d23a5b --- /dev/null +++ b/features/tabular-statistics-tool/Dockerfile @@ -0,0 +1,24 @@ +FROM polusai/bfio:2.4.5 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +# TODO: Change the tool_dir to the tool directory +ENV TOOL_DIR="features/tabular-statistics-tool" + +# Copy the repository into the container +RUN mkdir tabular-tools +COPY . 
${EXEC_DIR}/tabular-tools + +# Install the tool +RUN pip3 install "${EXEC_DIR}/tabular-tools/${TOOL_DIR}" --no-cache-dir + +# Set the entrypoint +# TODO: Change the entrypoint to the tool entrypoint +ENTRYPOINT ["python3", "-m", "polus.tabular.features.tabular_statistics"] +CMD ["--help"] \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/README.md b/features/tabular-statistics-tool/README.md similarity index 67% rename from features/polus-csv-statistics-plugin/README.md rename to features/tabular-statistics-tool/README.md index 51ac1c4..60ac1f7 100644 --- a/features/polus-csv-statistics-plugin/README.md +++ b/features/tabular-statistics-tool/README.md @@ -1,8 +1,8 @@ -# CSV Statistics +# Tabular Statistics -This WIPP plugin performs statistics on values in each column of a csv file if the data is numeric. Rows of data are grouped together by rows that have a matching value in a column with header named `file`. If no columns have the `file` header, then this plugin throws and error. +This plugin computes statistical measures on numeric and floating point data columns in tabular files. The supported input file formats are `CSV`, `Feather`, `Arrow`, and `Parquet`, leveraging the [PyArrow](https://arrow.apache.org/) library for efficient processing. If no columns have the `file` header, then this plugin throws an error. -Available statistics are: +## Available Statistics: 1. [mean (arithmetic mean)](https://en.wikipedia.org/wiki/Mean#Arithmetic_mean_(AM)) 2. [median](https://en.wikipedia.org/wiki/Median#The_sample_median) @@ -12,6 +12,13 @@ Available statistics are: 6. [kurt (excess kurtosis)](https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm) 7. count (number of rows sampled) 8. [iqr (Interquartile_range)](https://en.wikipedia.org/wiki/Interquartile_range) +9. 
Counts the numbers greater than 0 and divides this count by the total number of elements in the sequence + + +## Usage: +- A directory containing one or more tabular files in the supported formats +- Each file must include numeric and floating point data columns +- If a `groupBy` column is specified in the input arguments, it must be present in the data For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). @@ -31,7 +38,7 @@ This plugin takes two input argument and one output argument: | Name | Description | I/O | Type | | --------------- | --------------------------------------------------- | ------ | ------------- | | `--statistics` | Types of statistics to calculate | Input | array | -| `--inpDir` | Input csv collection to be processed by this plugin | Input | csvCollection | +| `--inpDir` | Input csv collection to be processed by this plugin | Input | genericData | | `--filePattern` | The filePattern of the images in represented in csv | Input | string | | `--groupBy` | The variable(s) of how the images should be grouped | Input | string | -| `--outDir` | Output collection | Output | csvCollection | +| `--outDir` | Output collection | Output | genericData | diff --git a/features/tabular-statistics-tool/TabularStatistics.cwl b/features/tabular-statistics-tool/TabularStatistics.cwl new file mode 100644 index 0000000..aa061c2 --- /dev/null +++ b/features/tabular-statistics-tool/TabularStatistics.cwl @@ -0,0 +1,36 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + filePattern: + inputBinding: + prefix: --filePattern + type: string? + groupBy: + inputBinding: + prefix: --groupBy + type: string? 
+ inpDir: + inputBinding: + prefix: --inpDir + type: Directory + outDir: + inputBinding: + prefix: --outDir + type: Directory + statistics: + inputBinding: + prefix: --statistics + type: string +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/tabular-statistics-tool:0.1.0-dev0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/features/tabular-statistics-tool/VERSION b/features/tabular-statistics-tool/VERSION new file mode 100644 index 0000000..15a06be --- /dev/null +++ b/features/tabular-statistics-tool/VERSION @@ -0,0 +1 @@ +0.1.0-dev0 \ No newline at end of file diff --git a/features/tabular-statistics-tool/build-docker.sh b/features/tabular-statistics-tool/build-docker.sh new file mode 100755 index 0000000..d734881 --- /dev/null +++ b/features/tabular-statistics-tool/build-docker.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# TODO: Change the name of the tool here +tool_dir="features" +tool_name="tabular-statistics-tool" + +# The version is read from the VERSION file +version=$(