diff --git a/.github/workflows/quay-on-tag.yml b/.github/workflows/quay-on-tag.yml
new file mode 100644
index 0000000..67603c0
--- /dev/null
+++ b/.github/workflows/quay-on-tag.yml
@@ -0,0 +1,50 @@
+name: Build & Push Docker image on tag creation
+on:
+    push:
+        tags:
+            - '*'
+
+concurrency:
+    group: docker-${{ github.ref }}
+    cancel-in-progress: true
+
+jobs:
+    docker:
+        runs-on: ubuntu-latest
+        permissions:
+            contents: read
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+            
+            - name: Login to Quay
+              uses: docker/login-action@v3
+              with:
+                registry: quay.io
+                username: ${{ secrets.QUAY_USERNAME }}
+                password: ${{ secrets.QUAY_PASSWORD }}
+
+            - name: Docker metadata
+              id: meta
+              uses: docker/metadata-action@v5
+              with:
+                images: quay.io/cellgeni/starsolo
+                tags: |
+                    type=ref,event=tag
+                    # push "latest" only for tags WITHOUT hyphens (excludes pre-releases like v1.0-beta):
+                    type=raw,value=latest,enable=${{ !contains(github.ref_name, '-') }}
+            
+            - name: Build and push Docker image
+              uses: docker/build-push-action@v6
+              with:
+                context: .
+                file: Dockerfile
+                push: true
+                tags: ${{ steps.meta.outputs.tags }}
+                labels: ${{ steps.meta.outputs.labels }}
+                cache-from: |
+                    type=registry,ref=quay.io/cellgeni/starsolo:cache
+                    type=gha
+                cache-to: type=registry,ref=quay.io/cellgeni/starsolo:cache,mode=max
+            
\ No newline at end of file
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..d3795c1
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,57 @@
+name: "Test Workflow"
+on:
+    push:
+        branches: [main, dev]
+        paths-ignore:
+            - 'data/whitelists.tar.gz'
+            - 'README.md'
+            - 'LICENSE'
+            - 'scripts/**'
+            - 'Dockerfile'
+    pull_request:
+        branches: [main, dev]
+
+jobs:
+    test:
+        runs-on: ubuntu-latest
+        container: quay.io/cellgeni/starsolo:base
+        strategy:
+            fail-fast: false
+            matrix:
+                include:
+                    - technology: 10x
+                    - technology: dropseq
+                    - technology: indrops
+                    - technology: smartseq
+                    - technology: rhapsody
+                    - technology: strt
+        name: "Run tests for ${{ matrix.technology }}"
+
+        steps:
+            - name: Checkout repository
+              uses: actions/checkout@v4
+            
+            - name: Unpack test data
+              run: tar -xzf data/test.tar.gz
+            
+            - name: Run tests
+              run: |
+                set -e
+                technology=${{ matrix.technology }}
+                if [ "$technology" = "smartseq" ]; then
+                    bin/starsolo smartseq \
+                        data/test/fastqs/smartseq/smartseq_example.manifest.tsv \
+                        --ref data/test/reference/index \
+                        --no-bam \
+                        --cpus 1
+                else
+                    bin/starsolo $technology \
+                        data/test/fastqs/${technology} \
+                        "$technology" \
+                        --ref data/test/reference/index \
+                        --whitelist-dir data/test/whitelists \
+                        --no-bam \
+                        --cpus 1
+                fi
+    
+
diff --git a/Dockerfile b/Dockerfile
index 6612e08..2603f4d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,14 +3,14 @@ FROM ubuntu:20.04
 ENV DEBIAN_FRONTEND=noninteractive
 
 ARG star_version=2.7.10a_alpha_220818
-ARG samtools_version=1.15.1
+ARG samtools_version=1.21
 ARG bbmap_version=38.97
-ARG rsem_version=1.3.3
+ARG seqtk_version=1.4
 
 #Install OS packages
 RUN apt-get update && apt-get -y --no-install-recommends -qq install \
     wget gcc build-essential software-properties-common libz-dev \
-    git libncurses5-dev libbz2-dev liblzma-dev default-jre bsdmainutils
+    git libncurses5-dev libbz2-dev liblzma-dev default-jre bsdmainutils pbzip2
 
 #Install STAR
 RUN wget --no-check-certificate https://github.com/alexdobin/STAR/archive/${star_version}.tar.gz && \
@@ -20,10 +20,11 @@ RUN wget --no-check-certificate https://github.com/alexdobin/STAR/archive/${star
     cd / && rm ${star_version}.tar.gz 
 
 #Install seqtk
-RUN git clone https://github.com/lh3/seqtk.git && \
-    mv seqtk /opt && \
-    cd /opt/seqtk && \
-    make
+RUN wget --no-check-certificate https://github.com/lh3/seqtk/archive/refs/tags/v${seqtk_version}.tar.gz && \
+    tar -xzf v${seqtk_version}.tar.gz -C /opt && \
+    cd /opt/seqtk-${seqtk_version} && \
+    make && \
+    cd / && rm v${seqtk_version}.tar.gz
 
 #Install samtools
 RUN wget https://github.com/samtools/samtools/releases/download/${samtools_version}/samtools-${samtools_version}.tar.bz2 && \
@@ -41,19 +42,29 @@ RUN wget https://sourceforge.net/projects/bbmap/files/BBMap_${bbmap_version}.tar
     ./stats.sh in=resources/phix174_ill.ref.fa.gz && \
     cd / && rm BBMap_${bbmap_version}.tar.gz
 
-#Install RSEM
-RUN wget https://github.com/deweylab/RSEM/archive/refs/tags/v${rsem_version}.tar.gz && \
-    tar -xzf v${rsem_version}.tar.gz -C /opt && \
-    cd /opt/RSEM-${rsem_version} && \
-    make && \
-    cd / && rm v${rsem_version}.tar.gz
+# Install this repository
+COPY . /opt/STARsolo
+RUN cd /opt/STARsolo &&\
+    rm data/test.tar.gz &&\
+    tar -xzf data/whitelists.tar.gz &&\
+    ./install.sh
 
-ENV PATH="${PATH}:/opt/STAR-${star_version}/source:/opt/seqtk:/opt/bbmap:/opt/RSEM-${rsem_version}"     
+# Set PATH to include all binaries
+ENV STARSOLO_WL_DIR=/opt/STARsolo/data/whitelists
+ENV PATH="/opt/STARsolo/bin:${PATH}:/opt/STAR-${star_version}/source:/opt/seqtk-${seqtk_version}:/opt/bbmap"     
 
 #Saving Software Versions to a file
 RUN echo "STAR version: ${star_version}" >> versions.txt && \
     echo "samtools version: ${samtools_version}" >> versions.txt && \
     echo "BBMap version: ${bbmap_version}" >> versions.txt && \
-    echo "RSEM version: ${rsem_version}" >> versions.txt && \
-    seqtk_version=`strings $(which seqtk) | grep 'Version:' | cut -f 2 -d " "` && \
     echo "seqtk version: ${seqtk_version}" >> versions.txt 
+
+COPY Dockerfile /docker/
+RUN chmod -R 755 /docker
+
+
+
+# Default entrypoint: run the CLI
+WORKDIR /workdir
+ENTRYPOINT ["starsolo"]
+CMD ["--help"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 4407f52..fceac68 100644
--- a/README.md
+++ b/README.md
@@ -1,148 +1,256 @@
-# Wrapper scripts for `STARsolo` scRNA-seq pipeline
+# `starsolo` — unified CLI for STARsolo scRNA-seq processing
 
-These are the scripts used for CellGenIT for uniform processing of scRNA-seq - both 10X and quite a few other types (see below for supported platforms). All listed methods use [STAR](https://github.com/alexdobin/STAR) aligner to align reads to the reference genome. 
+A single command-line tool that wraps [STAR](https://github.com/alexdobin/STAR) in `STARsolo` mode for uniform processing of scRNA-seq data across multiple platforms.
 
-## Software installation
+## Supported platforms
 
-### STAR version
+| Subcommand | Platform | Barcode type | Chemistry auto-detection |
+|:--|:--|:--|:-:|
+| `10x` | 10x Genomics (v1 – v4, multiome) | CB_UMI_Simple | ✅ |
+| `smartseq` | Smart-seq / Smart-seq2 | SmartSeq (plate-based, no UMIs) | — |
+| `dropseq` | Drop-seq | CB_UMI_Simple (no whitelist) | — |
+| `rhapsody` | BD Rhapsody | CB_UMI_Complex (3 segments) | — |
+| `indrops` | inDrops | CB_UMI_Complex (2 segments + adapter) | — |
+| `strt` | STRT-seq | CB_UMI_Simple (96-barcode list) | — |
+| `qc` | Aggregate QC stats across runs | — | — |
 
-`STAR` of version 2.7.9a or above is recommended (2.7.10a is the latest and greatest, as of August'22). The newest update includes the ability to correctly process multi-mapping reads, and adds many important options and bug fixes. 
+## Installation
 
-In order to use settings that closely mimic those of `Cell Ranger` v4 or above (see explanations below, particularly `--clipAdapterType CellRanger4` option), `STAR` needs to be re-compiled from source with `make STAR CXXFLAGS_SIMD="-msse4.2"` (see [this issue](https://github.com/alexdobin/STAR/issues/1218) for more info). If you get the _Illegal instruction_ error, that's what you need to do. 
+### Prerequisites
 
-### Docker
+The following tools **must be available in `$PATH`** before running `starsolo`:
 
-We also have a Dockerfile which builds with specific versions of all the tools used. Once the container is built you can do `cat /versions.txt` and it will show the versions of the tools in the container. 
+| Tool | Tested version | Purpose |
+|:--|:--|:--|
+| [STAR](https://github.com/alexdobin/STAR) | 2.7.10a | Alignment & quantification |
+| [samtools](https://github.com/samtools/samtools) | 1.15.1 | BAM indexing |
+| [seqtk](https://github.com/lh3/seqtk) | 1.3+ | Read subsampling (10x chemistry detection) |
+| [pbzip2](https://launchpad.net/pbzip2) | 1.1+ | Compressing unmapped reads |
+| [BBMap](https://sourceforge.net/projects/bbmap/) | 38.97 | Adapter trimming (optional, via `bbduk.sh`) |
 
-## Reference genome and annotation
+> **Tip:** The included [Dockerfile](Dockerfile) builds all of the above into a single container image.
 
-The issues of reference genome and annotation used for **mouse** and **human** scRNA-seq experiments are described [here](https://www.singlecellcourse.org/processing-raw-scrna-seq-sequencing-data-from-reads-to-a-count-matrix.html) in some detail. 
+STAR must be compiled with `make STAR CXXFLAGS_SIMD="-msse4.2"` to support `--clipAdapterType CellRanger4` (see [STAR#1218](https://github.com/alexdobin/STAR/issues/1218)).
 
-Briefly,
+### Install `starsolo` CLI
 
-  - there are several standard annotation versions used by `Cell Ranger`; They are referred to as versions `1.2.0`, `2.1.0`, `3.0.0`, and `2020-A`; 
-  - the references are filtered to remove pseudogenes and small RNAs; exact filtering scripts are available [here](https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build#header); 
-  - number of genes in `Cell Ranger` filtered human reference is about 35k; in full human annotation there are about 60k.
+```bash
+git clone <this-repo> && cd STARsolo
+
+# Default: symlinks into ~/.local/bin
+./install.sh
+
+# Or specify a directory:
+./install.sh /usr/local/bin
+
+# Verify
+starsolo --version
+```
+
+The installer creates a single `starsolo` symlink. All library code is resolved relative to the repo, so you can `git pull` to update in-place.
+
+## Reference genome
+
+Use a standard STAR genome index. [Cell Ranger-filtered](https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build#header) references are recommended for comparability with Cell Ranger output.
+
+```bash
+STAR --runThreadN 16 --runMode genomeGenerate --genomeDir STAR --genomeFastaFiles $FA --sjdbGTFfile $GTF
+```
+
+By default, `starsolo` resolves references via `--species`:
+
+```
+$STARSOLO_REF_BASE/<species>/2020A/index
+```
+
+Override with `--ref /your/path/to/index` on any subcommand, or change `STARSOLO_REF_BASE` in [etc/defaults.conf](etc/defaults.conf).
+
+### Barcode whitelists
+
+Download 10x barcode whitelists [from Cell Ranger](https://github.com/10XGenomics/cellranger/tree/master/lib/python/cellranger/barcodes) and place them in the whitelist directory (default: `/nfs/cellgeni/STAR/whitelists`, configurable via `--whitelist-dir` or `STARSOLO_WL_DIR`).
+
+## Usage
+
+```
+starsolo <platform> <args…> [options]
+```
+
+### Global options
+
+| Flag | Description | Default |
+|:--|:--|:--|
+| `-s, --species <name>` | Species name (`human`, `mouse`, …) — resolves reference automatically | — |
+| `-r, --ref <path>` | Explicit STAR index path (overrides `--species`) | — |
+| `-w, --whitelist-dir <dir>` | Barcode whitelist directory | from config |
+| `-c, --cpus <N>` | Number of threads | `16` |
+| `--bam` | Output coordinate-sorted BAM | off |
+| `--no-bam` | Suppress BAM output (default) | ✅ |
+| `-h, --help` | Show help (global or per-platform) | — |
+| `--version` | Print version | — |
+
+### 10x Genomics
+
+Automatically detects chemistry (v1–v4, multiome), strand specificity, and paired-end mode.
+
+```bash
+starsolo 10x /data/fastqs SAMPLE1 --species human
+starsolo 10x /data/fastqs SAMPLE1 --ref /path/to/index --bam
+```
+
+#### 10x chemistry detection
+
+The script subsamples 200,000 reads and matches barcodes against all known whitelists:
 
-Once you have downloaded the needed fasta and GTF files, and applied the necessary filtering (see the 10x genomics link above), run the following command using 16 CPUs/64Gb of RAM: 
+| 10x version | Whitelist | CB len | UMI len | Strand |
+|:-:|:-:|:-:|:-:|:-:|
+| 3' v1 | 737K-april-2014_rc.txt | 14 | 10 | Forward |
+| 3' v2 | 737K-august-2016.txt | 16 | 10 | Forward |
+| 3' v3/v3.1 | 3M-february-2018.txt | 16 | 12 | Forward |
+| 3' v4 | 3M-3pgex-may-2023.txt | 16 | 12 | Forward |
+| 5' v1.1/v2 | 737K-august-2016.txt | 16 | 10 | Reverse |
+| 5' v3 | 737K-august-2016.txt | 16 | 12 | Reverse |
+| 5' v4 | 3M-5pgex-jan-2023.txt | 16 | 12 | Reverse |
+| multiome | 737K-arc-v1.txt | 16 | 12 | Forward |
 
-`STAR --runThreadN 16 --runMode genomeGenerate --genomeDir STAR --genomeFastaFiles $FA --sjdbGTFfile $GTF`
+#### Paired-end processing
 
-`STARsolo` reference is not different from a normal `STAR` reference - `STARsolo` workflow is invoked using the options listed below. Thus, it can be run on both `Cell Ranger` filtered and un-filtered index. We use the filtered reference to match the `Cell Ranger` output as closely as possible. 
+5' libraries with long R1 reads (>50 bp) are automatically processed in paired-end mode with `--soloBarcodeMate 1 --clip5pNbases 39 0`. 3' libraries are always single-end.
 
-### Pre-made reference location
+### Smart-seq / Smart-seq2
 
-All **CellGenIT** pre-made `STAR` references are located in `/nfs/cellgeni/STAR/`. Barcode whitelist files are located in `/nfs/cellgeni/STAR/whitelists`. 
-## Resource requirements 
+Requires a **manifest file** — a tab-separated file with three columns: `R1_path`, `R2_path`, `cell_name`.
+
+```bash
+starsolo smartseq manifest.tsv --species mouse
+```
 
-By default, all processing is done using 16 CPUs and 128Gb of RAM. Sanger Farm typically has 8 Gb of RAM per core, so 8 CPUs/64Gb RAM, or 16 CPUs/128Gb RAM is probably optimal. Unfortunately, many datasets fail with OOM error when 64Gb is requested, so we reverted our defaults to 128Gb. 
+Aliases: `smart-seq`, `ss2`
 
-## Processing scRNA-seq with STARsolo
+### Drop-seq
 
-### 10X: reproducing `Cell Ranger` v4 and above (but much faster)
+12 bp cell barcode, 8 bp UMI, no whitelist.
 
-The current relevant wrapper script for all 10X scRNA-seq processing is called `starsolo_10x_auto.sh`. The scripts contain *many* options that sometimes change; some of which will be explained below. In general, commands are tuned in such way that the results will be very close to those of `Cell Ranger` v4 and above. 
+```bash
+starsolo dropseq /data/fastqs SAMPLE1 --species human
+```
 
-Before running, barcode whitelists need to be downloaded [from here](https://github.com/10XGenomics/cellranger/tree/master/lib/python/cellranger/barcodes). 
+Alias: `drop-seq`
 
-Below are the explanations for some of the options (note that 5' experiments of versions v1.1/v2/v3 all use `737K-august-2016.txt` barcode file): 
-  - `--runDirPerm All_RWX` allows a directory readable by all users, which becomes an issue when sharing results on Farm; 
-  - `--soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloUMIlen $UMILEN --soloStrand $STR` are settings that change with the used 10x chemistry:
+### BD Rhapsody
 
-<div align="center">
+3 barcode segments + 8 bp UMI. Requires `Rhapsody_bc1/2/3.txt` in the whitelist directory.
 
-| 10X VERSION | BC | UMILEN | STR |
-|:-:|:-:|:-:|:-:|
-| 3' v1 | 737K-april-2014_rc.txt | 10 | Forward |
-| 3' v2 | 737K-august-2016.txt | 10 | Forward |
-| 3' v3, v3.1 | 3M-february-2018.txt | 12 | Forward |
-| 3' v4 | 3M-3pgex-may-2023.txt | 12 | Forward |
-| 5' v1.1, v2 | 737K-august-2016.txt | 10 | Reverse |
-| 5' v3 | 737K-august-2016.txt | 12 | Reverse |
-| 5' v4 | 3M-5pgex-jan-2023.txt | 12 | Reverse |
-| multiome | 737K-arc-v1.txt | 12 | Forward |
+```bash
+starsolo rhapsody /data/fastqs SAMPLE1 --species human
+```
 
-</div>
+Aliases: `bd-rhapsody`, `bd`
 
-  - `--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30` are options that define UMI collapsing, barcode collapsing, and read clipping algorithms that are closest to ones used by `Cell Ranger` v4+; 
-  - `--soloCellFilter EmptyDrops_CR` specifies the cell filtering algorithm used in [EmptyDrops](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html), which is the default algorithm in later versions of `Cell Ranger`; 
-  - `--soloFeatures Gene GeneFull Velocyto` output conventional (exon-only) UMI counts, as well as exon+intron UMI counts (analog of `Cell Ranger` premrna option), as well as matrices preprocessed for `Velocyto`; 
-  - `--soloMultiMappers EM` is to count multimappers (on by default in v3.0+ of our wrapper scripts; does not influence the main output, but creates an additional matrix in `/raw` subdir of `Gene` and `GeneFull`); 
-  - `--readFilesCommand zcat` is used if your input fastq files are gzipped (auto-deteted);
-  - `--outReadsUnmapped Fastx` to output the unmapped reads that can be used for metagenomic analysis; 
-  - options grouped as `$SORTEDBAM` should be used if you need a genomic bam file; otherwise, use `$NOBAM`.
+### inDrops
 
-Actual `STAR` command being run (note the order of read files passed to `--readFilesIn`): 
+2 barcode segments + adapter + 6 bp UMI. Requires `inDrops_Ambrose2_bc1/2.txt` in the whitelist directory.
 
 ```bash
-STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
-   --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 \
-   --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
-   --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-   --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
-   --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \
-   --soloMultiMappers EM --outReadsUnmapped Fastx
+starsolo indrops /data/fastqs SAMPLE1 --species human
+starsolo indrops /data/fastqs SAMPLE1 --species human --adapter GAGTGATTGCTTGTGACGCCAA
 ```
-### Processing 10X datasets in paired-end mode
 
-Although some 3' 10X datasets are sequenced with a "long" R1, usually they are aligned as single-end, since it would take a very long R1 to accommodate for barcode, UMI, TSO + adapter, and a poly-A tail. On the other hand, 5' 10X datasets, especially ones sequenced with 2x150 bp paired-end reads, could be trimmed and aligned as paired-end (this is what `Cell Ranger` does, too). Effective `STARsolo` command for paired-end processing is as follows (note the 5' clipping flag, read input order, and flipped `soloStrand` because of the changed read order):
+Default adapter: `GAGTGATTGCTTGTGACGCCTT`
+
+### STRT-seq
+
+96-barcode whitelist, 8 bp cell barcode, 8 bp UMI. Requires `96_barcodes.list` in the whitelist directory.
 
 ```bash
-STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM \
-   --soloBarcodeMate 1 --clip5pNbases 39 0 \
-   --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN \
-   --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
-   --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-   --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \
-   --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \
-   --soloMultiMappers EM --outReadsUnmapped Fastx
+starsolo strt /data/fastqs SAMPLE1 --species human --strand Forward
 ```
 
-### Using STARsolo for SMART-seq/SMART-seq2
+Extra option: `--strand <Forward|Reverse>` (default: `Forward`)
+
+Alias: `strt-seq`
 
-For plate-based methods that don't use UMIs (such as [SMART-Seq and SMART-Seq2](https://teichlab.github.io/scg_lib_structs/methods_html/SMART-seq_family.html)), `STARsolo` can be used as well. Fastq files for these methods usually come as separate, paired-end files; all of these should be listed in a *manifest* file - plain text, tab-separated file containing three columns per line: 1) full path to R1; 2) full path to R2; 3) cell name or ID. 
+### QC aggregation
 
-Example of a script used to process Smart-seq2 data can be found in `/scripts/starsolo_ss2.sh`. Key parameters that could be adjusted are `--outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3`; the higher they are, the less permissive is the alignment (we use default settings usually). Lower values can help you "rescue" a larger proportion of reads with high adapter content (see below for adapter trimming). Actual `STAR` command being run:
+Run from the parent directory containing per-sample output folders:
 
 ```bash
-STAR --runThreadN $CPUS --genomeDir $REF --runDirPerm All_RWX $GZIP $BAM \
-     --soloType SmartSeq --readFilesManifest ../$TAG.manifest.tsv \
-     --soloUMIdedup Exact --soloStrand Unstranded \
-     --limitOutSJcollapsed 10000000 --soloCellFilter None \
-     --soloFeatures Gene GeneFull --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
+starsolo qc | column -t
 ```
 
-Often, SMART-seq2 reads can benefit from trimming adapters, which can be turned on using `--clip3pAdapterSeq <3' adapter sequence>` option. Alternatively, `/scripts/bbduk.sh` can be used to trim adapters from reads prior to the alignment and quantification (this is the preferred option).  
+Checks for:
+- Incomplete runs (`_STARtmp` still present)
+- Barcode cross-contamination between samples
+- Low mapping percentages
 
-### Counting the multimapping reads
+Outputs a tab-separated table with read counts, mapping rates, cell counts, and configuration for each sample.
 
-Default approach used by `Cell Ranger` (and `STARsolo` scripts above) is to discard all reads that map to multiple genomic locations with equal mapping quality. This approach creates a bias in gene expression estimation. Pseudocount-based methods correctly quantify multimapping reads, but generate false counts due to pseudo-alignment errors. These issues are described in good detail [here](https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1). 
+## Configuration
 
-If you would like to process multimappers, add the following options: `--soloMultiMappers Uniform EM` (or simply `--soloMultiMappers EM`; this is on by default in v3.0+ of these scripts). This will generate an extra matrix in the `/raw` output folders. There will be non-integer numbers in the matrix because of split reads. If the downstream processing requires integers, you can round with a tool of your liking (e.g. `awk`). 
+Edit [etc/defaults.conf](etc/defaults.conf) to change default paths and parameters. All values can also be set as environment variables:
 
-As of `STAR` v2.7.10a, **multimapper counting still does not work for SMART-seq2 or bulk RNA-seq processing**. 
+```bash
+export STARSOLO_REF_BASE=/my/references
+export STARSOLO_WL_DIR=/my/whitelists
+export STARSOLO_CPUS=8
+export STARSOLO_BAM_MODE=bam
+```
+
+CLI flags always take precedence over environment variables, which take precedence over config file defaults.
+
+## Helper scripts
 
-### Running STARsolo on other scRNA-seq platforms 
+| Script | Purpose |
+|:--|:--|
+| [scripts/bbduk_trim.sh](scripts/bbduk_trim.sh) | Adapter/polyA trimming via BBMap |
+| [scripts/bsub_submit.sh](scripts/bsub_submit.sh) | Submit any starsolo command as an LSF job |
 
-STARsolo is very flexible and can be used with almost any scRNA-seq method, provided you know the library structure - i.e. where cell barcodes, UMIs, and biological parts of the read are located in the sequencing fragment or reads. A great source of information about scRNA-seq library structures is [this page](https://teichlab.github.io/scg_lib_structs/).
+### LSF submission
+
+```bash
+./scripts/bsub_submit.sh starsolo 10x /data/fastqs SAMPLE1 --species human
+```
 
-Currently, our scripts directory provides dedicated scripts for:
-  - SMART-seq/SMART-seq2;
-  - Drop-seq;
-  - inDrops;
-  - Microwell-seq;
-  - BD Rhapsody;
-  - CEL-seq;
-  - STRT-seq. 
+This submits with 16 CPUs / 64 GB RAM to the `normal` queue.
 
-Please contact `CellGenIT` if you need to process an unusual dataset. 
+## Resource requirements
 
-## Quick evaluation of multiple STARsolo runs
+Default: **16 CPUs, 64–128 GB RAM**. Some datasets require 128 GB; if jobs fail with OOM errors, increase RAM in your job submission.
 
-If you've used these scripts to process multiple 10X samples, you can get a quick look at the results by copying `solo_QC.sh` script from this repo to the directory with `STARsolo` output folders, and running
+## Docker
+
+The included [Dockerfile](Dockerfile) builds all required tools:
 
 ```bash
-./solo_QC.sh | column -t 
+docker build -t starsolo .
+docker run -v /data:/data starsolo starsolo 10x /data/fastqs SAMPLE1 --species human
+```
+
+## Project structure
+
+```
+STARsolo/
+├── bin/
+│   └── starsolo              # CLI entrypoint (add to PATH)
+├── lib/
+│   ├── common.sh             # Shared functions (FASTQ discovery, compression, post-processing)
+│   ├── platform_10x.sh       # 10x Genomics logic
+│   ├── platform_smartseq.sh  # Smart-seq2 logic
+│   ├── platform_dropseq.sh   # Drop-seq logic
+│   ├── platform_rhapsody.sh  # BD Rhapsody logic
+│   ├── platform_indrops.sh   # inDrops logic
+│   └── platform_strt.sh      # STRT-seq logic
+├── etc/
+│   └── defaults.conf          # Default configuration
+├── scripts/
+│   ├── solo_qc.sh             # QC aggregation
+│   ├── bbduk_trim.sh          # Adapter trimming helper
+│   └── bsub_submit.sh         # LSF job submission helper
+├── Dockerfile
+├── install.sh
+├── LICENSE
+└── README.md
 ```
 
-The script is designed for 10X or other droplet-based methods. For SMART-seq2, if processing was done per plate, it will also produce an informative output. 
+## License
+
+See [LICENSE](LICENSE).
diff --git a/bin/starsolo b/bin/starsolo
new file mode 100755
index 0000000..1dc4aa8
--- /dev/null
+++ b/bin/starsolo
@@ -0,0 +1,279 @@
+#!/bin/bash -e
+# =============================================================================
+# starsolo — unified CLI for STARsolo scRNA-seq processing
+# =============================================================================
+# Version 4.0
+#
+# Usage:  starsolo <platform> [args…]
+#         starsolo --help
+#
+# Platforms: 10x, smartseq, dropseq, rhapsody, indrops, strt, qc
+# =============================================================================
+
+set -euo pipefail
+
+STARSOLO_VERSION="4.0"
+
+# ---------- Resolve install paths -------------------------------------------
+
+# STARSOLO_ROOT is the repo root (parent of bin/).
+# Works whether called via symlink, PATH, or direct path.
+_resolve_root() {
+    local SOURCE="${BASH_SOURCE[0]}"
+    while [[ -L "$SOURCE" ]]; do
+        local DIR
+        DIR=$(cd -P "$(dirname "$SOURCE")" && pwd)
+        SOURCE=$(readlink "$SOURCE")
+        [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
+    done
+    cd -P "$(dirname "$SOURCE")/.." && pwd
+}
+
+STARSOLO_ROOT=$(_resolve_root)
+
+# ---------- Load shared code ------------------------------------------------
+
+source "$STARSOLO_ROOT/etc/defaults.conf"
+source "$STARSOLO_ROOT/lib/common.sh"
+source "$STARSOLO_ROOT/lib/platform_10x.sh"
+source "$STARSOLO_ROOT/lib/platform_smartseq.sh"
+source "$STARSOLO_ROOT/lib/platform_dropseq.sh"
+source "$STARSOLO_ROOT/lib/platform_rhapsody.sh"
+source "$STARSOLO_ROOT/lib/platform_indrops.sh"
+source "$STARSOLO_ROOT/lib/platform_strt.sh"
+
+# ---------- Top-level help ---------------------------------------------------
+
+show_main_help() {
+    cat <<EOF
+starsolo v${STARSOLO_VERSION} — unified CLI for STARsolo scRNA-seq processing
+
+Usage:
+  starsolo <platform> <args…> [options]
+  starsolo qc
+  starsolo --help | --version
+
+Platforms:
+  10x         10x Genomics (auto-detects chemistry, strand, PE/SE)
+  smartseq    Smart-seq / Smart-seq2 (plate-based, manifest file)
+  dropseq     Drop-seq (12 bp CB, 8 bp UMI, no whitelist)
+  rhapsody    BD Rhapsody (3 barcode segments, 8 bp UMI)
+  indrops     inDrops (2 barcode segments + adapter, 6 bp UMI)
+  strt        STRT-seq (96-barcode whitelist, 8 bp CB, 8 bp UMI)
+  qc          Aggregate QC stats across completed runs
+
+Global options (available for all platforms):
+  -s, --species <name>      Species (e.g. human, mouse); resolves reference
+  -r, --ref <path>          Explicit STAR genome index (overrides --species)
+  -w, --whitelist-dir <dir> Barcode whitelist directory
+  -c, --cpus <N>            Number of threads  [default: ${STARSOLO_CPUS}]
+  --bam                     Output sorted BAM
+  --no-bam                  No BAM output (default)
+  -h, --help                Show help (per-platform if after platform name)
+  --version                 Print version
+
+Configuration:
+  Defaults are read from $STARSOLO_ROOT/etc/defaults.conf
+  Override with environment variables (STARSOLO_CPUS, STARSOLO_REF_BASE, …)
+  or with CLI flags.
+
+Examples:
+  starsolo 10x /data/fastqs SAMPLE1 --species human
+  starsolo 10x /data/fastqs SAMPLE1 --ref /path/to/index --bam
+  starsolo smartseq manifest.tsv --species mouse
+  starsolo dropseq /data/fastqs SAMPLE1 --species human
+  starsolo rhapsody /data/fastqs SAMPLE1 --species human
+  starsolo indrops /data/fastqs SAMPLE1 --species human --adapter GAGTGATTGCTTGTGACGCCAA
+  starsolo strt /data/fastqs SAMPLE1 --species human --strand Reverse
+  starsolo qc
+EOF
+}
+
+# ---------- Option parsing helpers ------------------------------------------
+
+# Parses the common option set shared by all droplet-based platforms.
+# After parsing, the following variables are set:
+#   FQDIR, TAG, REF, WL, CPUS, BAM, plus platform-specific extras.
+# The function shifts positional args (FQDIR, TAG) off $@ and processes flags.
+
+# Generic parser for platforms that take: <fastq_dir> <sample_id> [options]
+# Sets: FQDIR TAG REF WL CPUS BAM and any extra vars needed.
+parse_droplet_args() {
+    local PLATFORM=$1; shift
+
+    # Defaults from config
+    CPUS="$STARSOLO_CPUS"
+    WL="$STARSOLO_WL_DIR"
+    BAM_MODE="$STARSOLO_BAM_MODE"
+    REF=""
+    SPECIES=""
+    FQDIR=""
+    TAG=""
+
+    # Platform-specific defaults
+    ADAPTER="GAGTGATTGCTTGTGACGCCTT"   # indrops
+    STRAND="Forward"                    # strt
+
+    # Consume positional args first (up to 2)
+    local POSITIONALS=()
+    local ALL_ARGS=("$@")
+    local i=0
+
+    while (( i < ${#ALL_ARGS[@]} )); do
+        case "${ALL_ARGS[$i]}" in
+            -*) break ;;
+            *)  POSITIONALS+=("${ALL_ARGS[$i]}"); i=$((i + 1)) ;;
+        esac
+    done
+
+    # Remaining are flags
+    local FLAGS=()
+    if (( i < ${#ALL_ARGS[@]} )); then
+        FLAGS=("${ALL_ARGS[@]:$i}")
+    fi
+
+    # For smartseq we only need 1 positional; others need 2
+    if [[ $PLATFORM == "smartseq" ]]; then
+        if (( ${#POSITIONALS[@]} < 1 )); then
+            "show_${PLATFORM}_help"
+            exit 1
+        fi
+        FQDIR="${POSITIONALS[0]}"  # actually TSV for smartseq
+    else
+        if (( ${#POSITIONALS[@]} < 2 )); then
+            "show_${PLATFORM}_help"
+            exit 1
+        fi
+        FQDIR="${POSITIONALS[0]}"
+        TAG="${POSITIONALS[1]}"
+    fi
+
+    # Parse flags
+    local j=0
+    while (( j < ${#FLAGS[@]} )); do
+        case "${FLAGS[$j]}" in
+            -s|--species)
+                SPECIES="${FLAGS[$((j+1))]}"
+                j=$((j + 2)) ;;
+            -r|--ref)
+                REF="${FLAGS[$((j+1))]}"
+                j=$((j + 2)) ;;
+            -w|--whitelist-dir)
+                WL="${FLAGS[$((j+1))]}"
+                j=$((j + 2)) ;;
+            -c|--cpus)
+                CPUS="${FLAGS[$((j+1))]}"
+                j=$((j + 2)) ;;
+            --bam)
+                BAM_MODE="bam"
+                j=$((j + 1)) ;;
+            --no-bam)
+                BAM_MODE="nobam"
+                j=$((j + 1)) ;;
+            --adapter)
+                ADAPTER="${FLAGS[$((j+1))]}"
+                j=$((j + 2)) ;;
+            --strand)
+                STRAND="${FLAGS[$((j+1))]}"
+                j=$((j + 2)) ;;
+            -h|--help)
+                "show_${PLATFORM}_help"
+                exit 0 ;;
+            *)
+                die "Unknown option: ${FLAGS[$j]}. Run 'starsolo $PLATFORM --help'." ;;
+        esac
+    done
+
+    # Resolve REF from species if not explicitly given
+    if [[ -z "$REF" ]]; then
+        [[ -n "$SPECIES" ]] || die "Either --species or --ref must be provided."
+        REF=$(resolve_reference "$SPECIES" "$STARSOLO_REF_BASE") || exit 1
+    fi
+
+    BAM=$(resolve_bam_flags "$BAM_MODE")
+}
+
+# ---------- QC subcommand ---------------------------------------------------
+
+run_qc() {
+    local QC_SCRIPT="$STARSOLO_ROOT/scripts/solo_qc.sh"
+    if [[ ! -x "$QC_SCRIPT" ]]; then
+        die "QC script not found: $QC_SCRIPT"
+    fi
+    exec bash "$QC_SCRIPT" "$@"
+}
+
+# ---------- Dispatch --------------------------------------------------------
+
+main() {
+    if (( $# == 0 )); then
+        show_main_help
+        exit 0
+    fi
+
+    case "$1" in
+        --help|-h)
+            show_main_help
+            exit 0 ;;
+        --version)
+            echo "starsolo v${STARSOLO_VERSION}"
+            exit 0 ;;
+    esac
+
+    local PLATFORM="$1"; shift
+
+    case "$PLATFORM" in
+        10x)
+            if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+                show_10x_help; exit 0
+            fi
+            parse_droplet_args "10x" "$@"
+            run_10x "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM"
+            ;;
+        smartseq|smart-seq|ss2)
+            if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+                show_smartseq_help; exit 0
+            fi
+            parse_droplet_args "smartseq" "$@"
+            run_smartseq "$FQDIR" "$CPUS" "$REF" "$BAM"
+            ;;
+        dropseq|drop-seq)
+            if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+                show_dropseq_help; exit 0
+            fi
+            parse_droplet_args "dropseq" "$@"
+            run_dropseq "$FQDIR" "$TAG" "$CPUS" "$REF" "$BAM"
+            ;;
+        rhapsody|bd-rhapsody|bd)
+            if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+                show_rhapsody_help; exit 0
+            fi
+            parse_droplet_args "rhapsody" "$@"
+            run_rhapsody "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM"
+            ;;
+        indrops)
+            if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+                show_indrops_help; exit 0
+            fi
+            parse_droplet_args "indrops" "$@"
+            run_indrops "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" "$ADAPTER"
+            ;;
+        strt|strt-seq)
+            if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+                show_strt_help; exit 0
+            fi
+            parse_droplet_args "strt" "$@"
+            run_strt "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" "$STRAND"
+            ;;
+        qc)
+            run_qc "$@"
+            ;;
+        *)
+            die "Unknown platform: '$PLATFORM'. Run 'starsolo --help' for usage."
+            ;;
+    esac
+}
+
+if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
+    main "$@"
+fi
diff --git a/data/test.tar.gz b/data/test.tar.gz
new file mode 100644
index 0000000..de599e0
Binary files /dev/null and b/data/test.tar.gz differ
diff --git a/data/whitelists.tar.gz b/data/whitelists.tar.gz
new file mode 100644
index 0000000..707d753
Binary files /dev/null and b/data/whitelists.tar.gz differ
diff --git a/etc/defaults.conf b/etc/defaults.conf
new file mode 100644
index 0000000..7f3c633
--- /dev/null
+++ b/etc/defaults.conf
@@ -0,0 +1,26 @@
+# =============================================================================
+# STARsolo CLI - Default Configuration
+# =============================================================================
+# This file is sourced by all starsolo subcommands.
+# Override any value via CLI flags or by editing this file.
+#
+# Environment variables with the STARSOLO_ prefix will override these defaults
+# (e.g. export STARSOLO_CPUS=8).
+# =============================================================================
+
+# --- Compute resources ---
+STARSOLO_CPUS="${STARSOLO_CPUS:-16}"
+
+# --- Reference genome base directory ---
+# Platform scripts resolve the full path as: $STARSOLO_REF_BASE/<species>/2020A/index
+STARSOLO_REF_BASE="${STARSOLO_REF_BASE:-/nfs/cellgeni/STAR}"
+
+# --- Barcode whitelist directory ---
+STARSOLO_WL_DIR="${STARSOLO_WL_DIR:-/nfs/cellgeni/STAR/whitelists}"
+
+# --- BAM output (default: no BAM) ---
+# Set to "bam" to produce a sorted BAM by default.
+STARSOLO_BAM_MODE="${STARSOLO_BAM_MODE:-nobam}"
+
+# --- Adapter reference for bbduk ---
+STARSOLO_BBDUK_ADAPTERS="${STARSOLO_BBDUK_ADAPTERS:-/software/cellgen/cellgeni/bbmap/resources/adapters.fa}"
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..6cf473d
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI — installer
+# =============================================================================
+# Adds the starsolo command to your PATH by symlinking into a bin directory.
+#
+# Usage:
+#   ./install.sh                    # installs to ~/.local/bin  (default)
+#   ./install.sh /usr/local/bin     # installs to /usr/local/bin (needs sudo)
+#   PREFIX=~/tools ./install.sh     # installs to ~/tools/bin
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+TARGET_DIR="${1:-${PREFIX:+${PREFIX}/bin}}"
+TARGET_DIR="${TARGET_DIR:-$HOME/.local/bin}"
+
+mkdir -p "$TARGET_DIR"
+
+STARSOLO_BIN="$SCRIPT_DIR/bin/starsolo"
+
+# Make everything executable
+chmod +x "$STARSOLO_BIN"
+chmod +x "$SCRIPT_DIR"/lib/*.sh 2>/dev/null || true
+chmod +x "$SCRIPT_DIR"/scripts/*.sh 2>/dev/null || true
+
+# Create symlink
+ln -sf "$STARSOLO_BIN" "$TARGET_DIR/starsolo"
+
+echo "Installed starsolo → $TARGET_DIR/starsolo"
+
+# Check if target is in PATH
+if ! echo "$PATH" | tr ':' '\n' | grep -qx "$TARGET_DIR"; then
+    echo ""
+    echo "NOTE: $TARGET_DIR is not in your PATH."
+    echo "Add it by appending this line to your ~/.bashrc or ~/.bash_profile:"
+    echo ""
+    echo "  export PATH=\"$TARGET_DIR:\$PATH\""
+    echo ""
+fi
+
+echo ""
+echo "Verify with:  starsolo --version"
diff --git a/lib/common.sh b/lib/common.sh
new file mode 100755
index 0000000..660e24f
--- /dev/null
+++ b/lib/common.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI - Shared library functions
+# =============================================================================
+# Sourced by bin/starsolo and all platform scripts. Never executed directly.
+# =============================================================================
+
+# ---------- Logging --------------------------------------------------------
+
+log_info()  { echo "[INFO]  $*" >&2; }
+log_warn()  { echo "[WARN]  $*" >&2; }
+log_error() { echo "[ERROR] $*" >&2; }
+die()       { log_error "$@"; exit 1; }
+
+# ---------- FASTQ file discovery -------------------------------------------
+
+# Finds paired-end FASTQ files based on common naming conventions.
+#   $1 : FQDIR  – directory containing FASTQ files
+#   $2 : TAG    – sample identifier
+# Prints: "R1_FILES|R2_FILES" (comma-separated within each group)
+find_fastq_files() {
+    local FQDIR=$1 TAG=$2
+    local R1="" R2=""
+
+    if [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q") != "" ]]; then
+        R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+        R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q") != "" ]]; then
+        R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+        R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q") != "" ]]; then
+        R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+        R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R2_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q") != "" ]]; then
+        R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+        R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_r2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    else
+        die "No appropriate FASTQ files found for sample '$TAG' in '$FQDIR'. Check file names and directory."
+    fi
+
+    echo "$R1|$R2"
+}
+
+# ---------- Compression detection ------------------------------------------
+
+# Detects GZIP/BZIP2 compression for a sample's FASTQ files.
+#   $1 : FQDIR
+#   $2 : TAG
+# Prints: "STAR_FLAG|DECOMPRESS_CMD"
+check_compression() {
+    local FQDIR=$1 TAG=$2
+    local STAR_FLAG="" ZCMD="cat"
+
+    if [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.gz$") != "" ]]; then
+        STAR_FLAG="--readFilesCommand zcat"
+        ZCMD="zcat"
+    elif [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.bz2$") != "" ]]; then
+        STAR_FLAG="--readFilesCommand bzcat"
+        ZCMD="bzcat"
+    fi
+
+    echo "$STAR_FLAG|$ZCMD"
+}
+
+# ---------- Post-alignment processing --------------------------------------
+
+# Compresses outputs and optionally indexes BAM.
+#   $1 : OUTPUT_DIR   – STARsolo output directory (e.g. "output/")
+#   No singularity; tools expected in PATH.
+process_output_files() {
+    local OUTPUT_DIR=${1:-output}
+
+    # Index BAM if it exists
+    if [[ -s "Aligned.sortedByCoord.out.bam" ]]; then
+        log_info "Indexing BAM file …"
+        samtools index -@16 Aligned.sortedByCoord.out.bam
+    fi
+
+    # Compress unmapped reads
+    if [[ -s "Unmapped.out.mate1" ]]; then
+        pbzip2 -9 Unmapped.out.mate1 &
+    fi
+    if [[ -s "Unmapped.out.mate2" ]]; then
+        pbzip2 -9 Unmapped.out.mate2 &
+    fi
+    wait
+
+    # Gzip all matrix / barcode / feature files
+    find "$OUTPUT_DIR" -name "*.tsv" -exec gzip {} + 2>/dev/null
+    find "$OUTPUT_DIR" -name "*.mtx" -exec gzip {} + 2>/dev/null
+
+    wait
+    log_info "ALL DONE!"
+}
+
+# ---------- BAM flag builder -----------------------------------------------
+
+# Resolves the BAM flags for STAR depending on --bam / --no-bam.
+#   $1 : "bam" or "nobam"
+# Prints the STAR flags string.
+resolve_bam_flags() {
+    local MODE=$1
+    if [[ $MODE == "bam" ]]; then
+        echo "--outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 500 --limitBAMsortRAM 60000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB CR CY UR UY GX GN"
+    else
+        echo "--outSAMtype None"
+    fi
+}
+
+# ---------- Reference path resolver -----------------------------------------
+
+# Resolves a STAR reference index path from species name.
+#   $1 : species (e.g. "human", "mouse")
+#   $2 : REF_BASE directory
+# Prints absolute path and validates it exists.
+resolve_reference() {
+    local SPECIES=$1 REF_BASE=$2
+    local REF="${REF_BASE}/${SPECIES}/2020A/index"
+
+    if [[ ! -d "$REF" ]]; then
+        log_error "Reference directory not found: $REF"
+        return 1
+    fi
+    echo "$REF"
+}
+
+# ---------- Input validation ------------------------------------------------
+
+validate_fqdir() {
+    local FQDIR=$1
+    if [[ ! -d "$FQDIR" ]]; then
+        log_error "FASTQ directory does not exist: $FQDIR"
+        return 1
+    fi
+    readlink -f "$FQDIR"
+}
+
+validate_tag() {
+    local TAG=$1
+    if [[ -z "$TAG" ]]; then
+        log_error "Sample TAG must not be empty."
+        return 1
+    fi
+    echo "$TAG"
+}
diff --git a/lib/platform_10x.sh b/lib/platform_10x.sh
new file mode 100755
index 0000000..c9abc2e
--- /dev/null
+++ b/lib/platform_10x.sh
@@ -0,0 +1,290 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI - 10x Genomics platform
+# =============================================================================
+# Automatic chemistry detection (v1 / v2 / v3 / v4 / multiome),
+# strand-specificity detection, and paired-end support.
+# =============================================================================
+
+# ---------- 10x-specific helpers -------------------------------------------
+
+# Extracts a random subsample of reads for chemistry testing.
+#   $1: R1 (comma-sep)   $2: R2 (comma-sep)   $3: ZCMD
+_10x_extract_test_reads() {
+    local R1=$1 R2=$2 ZCMD=$3
+    local COUNT=0
+
+    for i in $(echo "$R1" | tr ',' ' '); do
+        $ZCMD "$i" | head -4000000 > "$COUNT.R1_head" &
+        COUNT=$((COUNT + 1))
+    done
+    wait
+
+    COUNT=0
+    for i in $(echo "$R2" | tr ',' ' '); do
+        $ZCMD "$i" | head -4000000 > "$COUNT.R2_head" &
+        COUNT=$((COUNT + 1))
+    done
+    wait
+
+    cat *.R1_head | seqtk sample -s100 - 200000 > test.R1.fastq &
+    cat *.R2_head | seqtk sample -s100 - 200000 > test.R2.fastq &
+    wait
+    rm -f *.R1_head *.R2_head
+}
+
+# Determines the correct barcode whitelist.
+#   $1: WL directory
+# Prints: "BC|NBC1|NBC2|NBC3|NBC4|NBC5|NBCA|R1LEN|R2LEN|R1DIS"
+_10x_determine_whitelist() {
+    local WL=$1
+
+    local NBC1 NBC2 NBC3 NBC4 NBC5 NBCA
+    NBC1=$(awk 'NR%4==2' test.R1.fastq | cut -c-14 | grep -F -f "$WL/737K-april-2014_rc.txt" | wc -l)
+    NBC2=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/737K-august-2016.txt"   | wc -l)
+    NBC3=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/3M-february-2018.txt"   | wc -l)
+    NBC4=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/3M-3pgex-may-2023.txt"  | wc -l)
+    NBC5=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/3M-5pgex-jan-2023.txt"  | wc -l)
+    NBCA=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/737K-arc-v1.txt"        | wc -l)
+
+    local R1LEN R2LEN R1DIS
+    R1LEN=$(awk 'NR%4==2' test.R1.fastq | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}')
+    R2LEN=$(awk 'NR%4==2' test.R2.fastq | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}')
+    R1DIS=$(awk 'NR%4==2' test.R1.fastq | awk '{print length($0)}' | sort | uniq -c | wc -l)
+
+    local BC=""
+    if   (( NBC3 > 50000 )); then BC="$WL/3M-february-2018.txt"
+    elif (( NBC2 > 50000 )); then BC="$WL/737K-august-2016.txt"
+    elif (( NBCA > 50000 )); then BC="$WL/737K-arc-v1.txt"
+    elif (( NBC1 > 50000 )); then BC="$WL/737K-april-2014_rc.txt"
+    elif (( NBC4 > 50000 )); then BC="$WL/3M-3pgex-may-2023.txt"
+    elif (( NBC5 > 50000 )); then BC="$WL/3M-5pgex-jan-2023.txt"
+    else
+        die "No whitelist matched 200,000 random barcodes! Counts: v1=$NBC1 v2=$NBC2 v3=$NBC3 v4-3p=$NBC4 v4-5p=$NBC5 multiome=$NBCA"
+    fi
+
+    echo "$BC|$NBC1|$NBC2|$NBC3|$NBC4|$NBC5|$NBCA|$R1LEN|$R2LEN|$R1DIS"
+}
+
+# Checks read lengths and sets barcode/UMI parameters.
+#   $1: R1DIS  $2: R1LEN  $3: R2LEN  $4: BC  $5: WL
+# Prints: "PAIRED|CBLEN|UMILEN"
+_10x_check_read_lengths() {
+    local R1DIS=$1 R1LEN=$2 R2LEN=$3 BC=$4 WL=$5
+    local PAIRED=False CBLEN UMILEN
+
+    if (( R1DIS > 1 && R1LEN <= 30 )); then
+        die "Read 1 (barcode) has varying length; possibly quality-trimmed."
+    elif (( R1LEN < 24 )); then
+        die "Read 1 (barcode) is less than 24 bp. Check FASTQ files."
+    elif (( R2LEN < 40 )); then
+        die "Read 2 (biological read) is less than 40 bp. Check FASTQ files."
+    fi
+
+    (( R1LEN > 50 )) && PAIRED=True
+
+    case "$BC" in
+        *3M-february-2018.txt|*737K-arc-v1.txt|*3M-3pgex-may-2023.txt|*3M-5pgex-jan-2023.txt)
+            CBLEN=16; UMILEN=12 ;;
+        *737K-august-2016.txt)
+            CBLEN=16; UMILEN=10 ;;
+        *737K-april-2014_rc.txt)
+            CBLEN=14; UMILEN=10 ;;
+    esac
+
+    local BCUMI=$((CBLEN + UMILEN))
+    if (( BCUMI > R1LEN )); then
+        local NEWUMI=$((R1LEN - CBLEN))
+        log_warn "R1 length ($R1LEN) < barcode+UMI ($BCUMI). Setting UMI length to $NEWUMI."
+        UMILEN=$NEWUMI
+    elif (( BCUMI < R1LEN )); then
+        log_warn "R1 length ($R1LEN) > barcode+UMI ($BCUMI)."
+    fi
+
+    echo "$PAIRED|$CBLEN|$UMILEN"
+}
+
+# Determines strand specificity via test alignments.
+#   $1: REF  $2: CBLEN  $3: UMILEN  $4: CPUS  $5: BC  $6: PAIRED
+# Prints: "STRAND|PCTFWD|PCTREV|PAIRED"
+_10x_determine_strand() {
+    local REF=$1 CBLEN=$2 UMILEN=$3 CPUS=$4 BC=$5 PAIRED=$6
+    local STRAND=Forward
+
+    log_info "Running test alignment (Forward) …"
+    STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq \
+        --runDirPerm All_RWX --outSAMtype None \
+        --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 \
+        --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \
+        --soloStrand Forward --genomeLoad LoadAndKeep \
+        --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \
+        --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \
+        --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
+        --soloFeatures Gene GeneFull \
+        --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &>/dev/null
+
+    log_info "Running test alignment (Reverse) …"
+    STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq \
+        --runDirPerm All_RWX --outSAMtype None \
+        --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 \
+        --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \
+        --soloStrand Reverse --genomeLoad LoadAndKeep \
+        --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \
+        --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \
+        --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
+        --soloFeatures Gene GeneFull \
+        --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &>/dev/null
+
+    local PCTFWD PCTREV
+    PCTFWD=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_forward/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}')
+    PCTREV=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_reverse/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}')
+
+    (( PCTREV > PCTFWD )) && STRAND=Reverse
+
+    if (( PCTREV < 50 && PCTFWD < 50 )); then
+        log_warn "Low GeneFull mapping: forward=${PCTFWD}%, reverse=${PCTREV}%"
+    fi
+
+    # 3' paired-end experiments should still be processed as single-end
+    [[ $STRAND == "Forward" && $PAIRED == "True" ]] && PAIRED=False
+
+    echo "$STRAND|$PCTFWD|$PCTREV|$PAIRED"
+}
+
+# Writes a human-readable configuration summary.
+_10x_write_config() {
+    local TAG=$1 PAIRED=$2 STRAND=$3 PCTFWD=$4 PCTREV=$5
+    local BC=$6 NBC1=$7 NBC2=$8 NBC3=$9 NBCA=${10} NBC4=${11} NBC5=${12}
+    local CBLEN=${13} UMILEN=${14} GZIP=${15} R1=${16} R2=${17}
+
+    log_info "STARsolo 10x run configuration:"
+    {
+        echo "============================================================================="
+        echo "Sample: $TAG"
+        echo "Paired-end mode: $PAIRED"
+        echo "Strand (Forward=3', Reverse=5'): $STRAND  (%fwd=$PCTFWD, %rev=$PCTREV)"
+        echo "CB whitelist: $BC"
+        echo "  Matches /200k: v3=$NBC3 v2=$NBC2 v1=$NBC1 v4-3p=$NBC4 v4-5p=$NBC5 multiome=$NBCA"
+        echo "CB length: $CBLEN    UMI length: $UMILEN"
+        echo "Compression: ${GZIP:-none}"
+        echo "-----------------------------------------------------------------------------"
+        echo "R1: $R1"
+        echo "R2: $R2"
+        echo "============================================================================="
+    } | tee strand.txt
+}
+
+# ---------- Main 10x workflow -----------------------------------------------
+
+# Entry point called by bin/starsolo.
+#   Arguments come from the CLI parser.
+run_10x() {
+    local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6
+
+    local FQDIR_ABS
+    FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1
+    TAG=$(validate_tag "$TAG") || exit 1
+
+    # Convert paths to absolute before cd
+    REF=$(readlink -f "$REF")
+    WL=$(readlink -f "$WL")
+
+    # Create output directory and enter it
+    mkdir -p "$TAG" && cd "$TAG" || exit
+
+    # 1. Discover FASTQs
+    log_info "Discovering FASTQ files for sample '$TAG' …"
+    local fq_info R1 R2
+    fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r R1 R2 <<< "$fq_info"
+
+    # 2. Compression
+    local comp_info GZIP ZCMD
+    comp_info=$(check_compression "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r GZIP ZCMD <<< "$comp_info"
+
+    # 3. Chemistry detection
+    log_info "Subsampling reads for chemistry detection …"
+    _10x_extract_test_reads "$R1" "$R2" "$ZCMD"
+
+    log_info "Determining barcode whitelist …"
+    local wl_info BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS
+    wl_info=$(_10x_determine_whitelist "$WL")
+    IFS='|' read -r BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS <<< "$wl_info"
+
+    # 4. Read length checks
+    local rl_info PAIRED CBLEN UMILEN
+    rl_info=$(_10x_check_read_lengths "$R1DIS" "$R1LEN" "$R2LEN" "$BC" "$WL")
+    IFS='|' read -r PAIRED CBLEN UMILEN <<< "$rl_info"
+
+    # 5. Strand specificity
+    log_info "Determining strand specificity …"
+    local strand_info STRAND PCTFWD PCTREV
+    strand_info=$(_10x_determine_strand "$REF" "$CBLEN" "$UMILEN" "$CPUS" "$BC" "$PAIRED")
+    IFS='|' read -r STRAND PCTFWD PCTREV PAIRED <<< "$strand_info"
+
+    # 6. Log configuration
+    _10x_write_config "$TAG" "$PAIRED" "$STRAND" "$PCTFWD" "$PCTREV" \
+        "$BC" "$NBC1" "$NBC2" "$NBC3" "$NBCA" "$NBC4" "$NBC5" \
+        "$CBLEN" "$UMILEN" "$GZIP" "$R1" "$R2"
+
+    # 7. Run STAR
+    local SOLOFILENAMES="output/"
+    log_info "Running STARsolo alignment …"
+
+    if [[ $PAIRED == "True" ]]; then
+        STAR --runThreadN "$CPUS" --genomeDir "$REF" \
+            --readFilesIn "$R1" "$R2" --runDirPerm All_RWX $GZIP $BAM \
+            --soloBarcodeMate 1 --clip5pNbases 39 0 \
+            --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloCBstart 1 \
+            --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \
+            --soloStrand Forward \
+            --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \
+            --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \
+            --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
+            --soloFeatures Gene GeneFull Velocyto \
+            --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx \
+            --soloMultiMappers EM --outReadsUnmapped Fastx
+    else
+        STAR --runThreadN "$CPUS" --genomeDir "$REF" \
+            --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \
+            --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 \
+            --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \
+            --soloStrand "$STRAND" \
+            --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \
+            --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \
+            --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
+            --soloFeatures Gene GeneFull Velocyto \
+            --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx \
+            --soloMultiMappers EM --outReadsUnmapped Fastx
+    fi
+
+    # 8. Cleanup & compress
+    rm -rf test.R?.fastq test_forward test_reverse
+    process_output_files "$SOLOFILENAMES"
+}
+
+# ---------- Subcommand help -------------------------------------------------
+
+show_10x_help() {
+    cat <<'EOF'
+Usage: starsolo 10x <fastq_dir> <sample_id> [options]
+
+Automatically detects 10x chemistry (v1–v4, multiome), strand specificity,
+and paired-end vs single-end mode.
+
+Required:
+  <fastq_dir>               Directory containing FASTQ files
+  <sample_id>               Sample identifier (used to match FASTQ filenames)
+
+Options:
+  -s, --species <name>      Species name (e.g. human, mouse).
+                             Resolves reference via $REF_BASE/<species>/2020A/index
+  -r, --ref <path>          Explicit STAR reference index (overrides --species)
+  -w, --whitelist-dir <dir> Barcode whitelist directory  [default: from config]
+  -c, --cpus <N>            Number of threads             [default: 16]
+  --bam                     Output sorted BAM file
+  --no-bam                  Do not output BAM (default)
+  -h, --help                Show this help
+EOF
+}
diff --git a/lib/platform_dropseq.sh b/lib/platform_dropseq.sh
new file mode 100755
index 0000000..4aa7e87
--- /dev/null
+++ b/lib/platform_dropseq.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI - Drop-seq platform
+# =============================================================================
+# CB_UMI_Simple with no whitelist (12 bp CB, 8 bp UMI).
+# =============================================================================
+
+run_dropseq() {
+    local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 BAM=$5
+
+    local FQDIR_ABS
+    FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1
+    TAG=$(validate_tag "$TAG") || exit 1
+
+    # Convert to absolute path before cd
+    REF=$(readlink -f "$REF")
+
+    mkdir -p "$TAG" && cd "$TAG" || exit
+
+    # Discover FASTQs
+    local fq_info R1 R2
+    fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r R1 R2 <<< "$fq_info"
+
+    # Compression
+    local comp_info GZIP ZCMD
+    comp_info=$(check_compression "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r GZIP ZCMD <<< "$comp_info"
+
+    log_info "Running STARsolo (Drop-seq) for '$TAG' …"
+
+    STAR --runThreadN "$CPUS" --genomeDir "$REF" \
+        --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \
+        --soloType CB_UMI_Simple --soloCBwhitelist None \
+        --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13 --soloUMIlen 8 \
+        --soloBarcodeReadLength 0 \
+        --soloFeatures Gene GeneFull \
+        --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \
+        --outReadsUnmapped Fastx
+
+    process_output_files "output"
+}
+
+show_dropseq_help() {
+    cat <<'EOF'
+Usage: starsolo dropseq <fastq_dir> <sample_id> [options]
+
+Process Drop-seq data (12 bp cell barcode, 8 bp UMI, no whitelist).
+
+Required:
+  <fastq_dir>               Directory containing FASTQ files
+  <sample_id>               Sample identifier
+
+Options:
+  -s, --species <name>      Species name (resolves reference automatically)
+  -r, --ref <path>          Explicit STAR reference index (overrides --species)
+  -c, --cpus <N>            Number of threads             [default: 16]
+  --bam                     Output sorted BAM file
+  --no-bam                  Do not output BAM (default)
+  -h, --help                Show this help
+EOF
+}
diff --git a/lib/platform_indrops.sh b/lib/platform_indrops.sh
new file mode 100755
index 0000000..c33c07b
--- /dev/null
+++ b/lib/platform_indrops.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI - inDrops platform
+# =============================================================================
+# CB_UMI_Complex with 2 barcode segments, adapter sequence, and 6 bp UMI.
+# =============================================================================
+
+run_indrops() {
+    local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6 ADAPTER=$7
+
+    local FQDIR_ABS
+    FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1
+    TAG=$(validate_tag "$TAG") || exit 1
+
+    local BC1="$WL/inDrops_Ambrose2_bc1.txt"
+    local BC2="$WL/inDrops_Ambrose2_bc2.txt"
+
+    for f in "$BC1" "$BC2"; do
+        [[ -f "$f" ]] || die "inDrops barcode file not found: $f"
+    done
+
+    # Convert to absolute paths before cd
+    REF=$(readlink -f "$REF")
+    BC1=$(readlink -f "$BC1")
+    BC2=$(readlink -f "$BC2")
+
+    mkdir -p "$TAG" && cd "$TAG" || exit
+
+    # Discover FASTQs
+    local fq_info R1 R2
+    fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r R1 R2 <<< "$fq_info"
+
+    # Compression
+    local comp_info GZIP ZCMD
+    comp_info=$(check_compression "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r GZIP ZCMD <<< "$comp_info"
+
+    log_info "Running STARsolo (inDrops) for '$TAG' (adapter=$ADAPTER) …"
+
+    STAR --runThreadN "$CPUS" --genomeDir "$REF" \
+        --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \
+        --soloType CB_UMI_Complex --soloCBwhitelist "$BC1" "$BC2" \
+        --soloAdapterSequence "$ADAPTER" --soloAdapterMismatchesNmax 3 \
+        --soloCBmatchWLtype 1MM \
+        --soloCBposition 0_0_2_-1 3_1_3_8 \
+        --soloUMIposition 3_9_3_14 \
+        --soloFeatures Gene GeneFull \
+        --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \
+        --outReadsUnmapped Fastx
+
+    process_output_files "output"
+}
+
+show_indrops_help() {
+    cat <<'EOF'
+Usage: starsolo indrops <fastq_dir> <sample_id> [options]
+
+Process inDrops data (2 barcode segments + adapter + 6 bp UMI).
+
+Requires inDrops barcode whitelist files (inDrops_Ambrose2_bc1/2.txt)
+in the whitelist directory.
+
+Required:
+  <fastq_dir>               Directory containing FASTQ files
+  <sample_id>               Sample identifier
+
+Options:
+  -s, --species <name>      Species name (resolves reference automatically)
+  -r, --ref <path>          Explicit STAR reference index (overrides --species)
+  -w, --whitelist-dir <dir> Barcode whitelist directory  [default: from config]
+  -c, --cpus <N>            Number of threads             [default: 16]
+  --adapter <seq>           Adapter sequence
+                             [default: GAGTGATTGCTTGTGACGCCTT]
+  --bam                     Output sorted BAM file
+  --no-bam                  Do not output BAM (default)
+  -h, --help                Show this help
+EOF
+}
diff --git a/lib/platform_rhapsody.sh b/lib/platform_rhapsody.sh
new file mode 100755
index 0000000..9357c7e
--- /dev/null
+++ b/lib/platform_rhapsody.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI - BD Rhapsody platform
+# =============================================================================
+# CB_UMI_Complex with 3 barcode segments and 8 bp UMI.
+# =============================================================================
+
+run_rhapsody() {
+    local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6
+
+    local FQDIR_ABS
+    FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1
+    TAG=$(validate_tag "$TAG") || exit 1
+
+    local BC1="$WL/Rhapsody_bc1.txt"
+    local BC2="$WL/Rhapsody_bc2.txt"
+    local BC3="$WL/Rhapsody_bc3.txt"
+
+    for f in "$BC1" "$BC2" "$BC3"; do
+        [[ -f "$f" ]] || die "Rhapsody barcode file not found: $f"
+    done
+
+    # Convert to absolute paths before cd
+    REF=$(readlink -f "$REF")
+    BC1=$(readlink -f "$BC1")
+    BC2=$(readlink -f "$BC2")
+    BC3=$(readlink -f "$BC3")
+
+    mkdir -p "$TAG" && cd "$TAG" || exit
+
+    # Discover FASTQs
+    local fq_info R1 R2
+    fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r R1 R2 <<< "$fq_info"
+
+    # Compression
+    local comp_info GZIP ZCMD
+    comp_info=$(check_compression "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r GZIP ZCMD <<< "$comp_info"
+
+    log_info "Running STARsolo (BD Rhapsody) for '$TAG' …"
+
+    STAR --runThreadN "$CPUS" --genomeDir "$REF" \
+        --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \
+        --soloType CB_UMI_Complex --soloCBwhitelist "$BC1" "$BC2" "$BC3" \
+        --soloUMIlen 8 \
+        --soloCBmatchWLtype 1MM \
+        --soloCBposition 0_0_0_8 0_21_0_29 0_43_0_51 \
+        --soloUMIposition 0_52_0_59 \
+        --soloFeatures Gene GeneFull \
+        --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \
+        --outReadsUnmapped Fastx
+
+    process_output_files "output"
+}
+
+show_rhapsody_help() {
+    cat <<'EOF'
+Usage: starsolo rhapsody <fastq_dir> <sample_id> [options]
+
+Process BD Rhapsody data (3 barcode segments, 8 bp UMI).
+
+Requires Rhapsody barcode whitelist files (Rhapsody_bc1/2/3.txt)
+in the whitelist directory.
+
+Required:
+  <fastq_dir>               Directory containing FASTQ files
+  <sample_id>               Sample identifier
+
+Options:
+  -s, --species <name>      Species name (resolves reference automatically)
+  -r, --ref <path>          Explicit STAR reference index (overrides --species)
+  -w, --whitelist-dir <dir> Barcode whitelist directory  [default: from config]
+  -c, --cpus <N>            Number of threads             [default: 16]
+  --bam                     Output sorted BAM file
+  --no-bam                  Do not output BAM (default)
+  -h, --help                Show this help
+EOF
+}
diff --git a/lib/platform_smartseq.sh b/lib/platform_smartseq.sh
new file mode 100755
index 0000000..595a368
--- /dev/null
+++ b/lib/platform_smartseq.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI - Smart-seq / Smart-seq2 platform
+# =============================================================================
+# Plate-based, no UMIs. Requires a manifest TSV file.
+# =============================================================================
+
+run_smartseq() {
+    local TSV=$1 CPUS=$2 REF=$3 BAM=$4
+
+    [[ -s "$TSV" ]] || die "Manifest file not found or empty: $TSV"
+
+    local TAG
+    TAG=$(basename "${TSV%%.manifest.tsv}")
+    TAG=$(basename "${TAG%%.tsv}")        # fallback if named differently
+    [[ -n "$TAG" ]] || die "Could not derive sample name from manifest filename."
+
+    # Convert to absolute paths before cd
+    TSV=$(readlink -f "$TSV")
+    REF=$(readlink -f "$REF")
+
+    # Create output directory
+    mkdir -p "$TAG" || exit
+
+    # Convert manifest paths to absolute and write to output directory
+    local NEW_MANIFEST="$TAG/manifest.tsv"
+    while IFS=$'\t' read -r r1 r2 cell; do
+        r1=$(readlink -f "$r1")
+        r2=$(readlink -f "$r2")
+        echo -e "$r1\t$r2\t$cell"
+    done < "$TSV" > "$NEW_MANIFEST"
+
+    cd "$TAG" || exit
+    TSV="manifest.tsv"
+
+    # Compression detection from manifest content
+    local GZIP=""
+    if grep -qP "\.gz\t" "$TSV"; then
+        GZIP="--readFilesCommand zcat"
+    fi
+
+    log_info "Running STARsolo (Smart-seq) for '$TAG' …"
+
+    STAR --runThreadN "$CPUS" --genomeDir "$REF" --runDirPerm All_RWX $GZIP $BAM \
+        --soloType SmartSeq --readFilesManifest "$TSV" \
+        --soloUMIdedup Exact --soloStrand Unstranded \
+        --limitOutSJcollapsed 10000000 --soloCellFilter None \
+        --soloFeatures Gene GeneFull \
+        --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \
+        --outReadsUnmapped Fastx
+
+    rm -f "$NEW_MANIFEST"
+    process_output_files "output"
+}
+
+show_smartseq_help() {
+    cat <<'EOF'
+Usage: starsolo smartseq <manifest.tsv> [options]
+
+Process Smart-seq / Smart-seq2 data using a manifest file.
+The manifest is a tab-separated file with 3 columns:
+  1) Full path to R1   2) Full path to R2   3) Cell name/ID
+
+The sample name is derived from the manifest filename (strips .manifest.tsv).
+
+Required:
+  <manifest.tsv>            Path to the manifest TSV file
+
+Options:
+  -s, --species <name>      Species name (resolves reference automatically)
+  -r, --ref <path>          Explicit STAR reference index (overrides --species)
+  -c, --cpus <N>            Number of threads             [default: 16]
+  --bam                     Output sorted BAM file
+  --no-bam                  Do not output BAM (default)
+  -h, --help                Show this help
+EOF
+}
diff --git a/lib/platform_strt.sh b/lib/platform_strt.sh
new file mode 100755
index 0000000..ea3e8b7
--- /dev/null
+++ b/lib/platform_strt.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# =============================================================================
+# STARsolo CLI - STRT-seq platform
+# =============================================================================
+# CB_UMI_Simple with a 96-barcode whitelist (8 bp CB, 8 bp UMI).
+# NOTE: R1 is the biological read; R2 carries the barcode in STRT-seq.
+# =============================================================================
+
+run_strt() {
+    local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6 STRAND=$7
+
+    local FQDIR_ABS
+    FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1
+    TAG=$(validate_tag "$TAG") || exit 1
+
+    local BC="$WL/96_barcodes.list"
+    [[ -f "$BC" ]] || die "STRT-seq barcode file not found: $BC"
+
+    # Convert to absolute paths before cd
+    REF=$(readlink -f "$REF")
+    BC=$(readlink -f "$BC")
+
+    local CBLEN=8
+    local UMILEN=8
+
+    mkdir -p "$TAG" && cd "$TAG" || exit
+
+    # Discover FASTQs
+    local fq_info R1 R2
+    fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r R1 R2 <<< "$fq_info"
+
+    # Compression
+    local comp_info GZIP ZCMD
+    comp_info=$(check_compression "$FQDIR_ABS" "$TAG")
+    IFS='|' read -r GZIP ZCMD <<< "$comp_info"
+
+    log_info "Running STARsolo (STRT-seq) for '$TAG' (strand=$STRAND) …"
+
+    # NOTE: R2 R1 order — R1 is biological read, R2 carries barcode
+    STAR --runThreadN "$CPUS" --genomeDir "$REF" \
+        --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \
+        --soloType CB_UMI_Simple --soloCBwhitelist "$BC" \
+        --soloBarcodeReadLength 0 \
+        --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \
+        --soloStrand "$STRAND" \
+        --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \
+        --soloUMIfiltering MultiGeneUMI_CR \
+        --limitOutSJcollapsed 10000000 --soloCellFilter None \
+        --soloFeatures Gene GeneFull \
+        --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \
+        --outReadsUnmapped Fastx
+
+    process_output_files "output"
+}
+
+show_strt_help() {
+    cat <<'EOF'
+Usage: starsolo strt <fastq_dir> <sample_id> [options]
+
+Process STRT-seq data (96-barcode whitelist, 8 bp CB, 8 bp UMI).
+
+Requires 96_barcodes.list in the whitelist directory.
+
+Required:
+  <fastq_dir>               Directory containing FASTQ files
+  <sample_id>               Sample identifier
+
+Options:
+  -s, --species <name>      Species name (resolves reference automatically)
+  -r, --ref <path>          Explicit STAR reference index (overrides --species)
+  -w, --whitelist-dir <dir> Barcode whitelist directory  [default: from config]
+  -c, --cpus <N>            Number of threads             [default: 16]
+  --strand <Forward|Reverse> Strand specificity           [default: Forward]
+  --bam                     Output sorted BAM file
+  --no-bam                  Do not output BAM (default)
+  -h, --help                Show this help
+EOF
+}
diff --git a/scripts/bbduk_trim.sh b/scripts/bbduk_trim.sh
new file mode 100755
index 0000000..f8cc126
--- /dev/null
+++ b/scripts/bbduk_trim.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# =============================================================================
+# bbduk.sh — adapter/polyA trimming helper
+# =============================================================================
+# Usage: bbduk.sh <sample_id> [adapters_fasta]
+#
+# Requires BBMap (bbduk.sh) in PATH.
+# =============================================================================
+
+set -euo pipefail
+
+TAG=${1:?"Usage: bbduk.sh <sample_id> [adapters_fasta]"}
+ADAPTERS=${2:-/software/cellgen/cellgeni/bbmap/resources/adapters.fa}
+
+bbduk.sh \
+    in1="${TAG}_1.fastq.gz" \
+    in2="${TAG}_2.fastq.gz" \
+    out1="${TAG}.bbduk.R1.fastq" \
+    out2="${TAG}.bbduk.R2.fastq" \
+    ref="$ADAPTERS" \
+    trimpolya=10 ktrim=r k=23 mink=11 hdist=1 tpe tbo \
+    &> "${TAG}.bbduk.log"
+
+echo "[INFO] Trimming complete. Output: ${TAG}.bbduk.R1.fastq / ${TAG}.bbduk.R2.fastq"
diff --git a/scripts/bsub_submit.sh b/scripts/bsub_submit.sh
new file mode 100755
index 0000000..e59cf9f
--- /dev/null
+++ b/scripts/bsub_submit.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# =============================================================================
+# bsub.sh — LSF job submission helper
+# =============================================================================
+# Wraps any starsolo command in an LSF bsub job.
+#
+# Usage: bsub.sh <starsolo_command_and_args…>
+#
+# Example:
+#   ./scripts/bsub.sh starsolo 10x /data/fastqs SAMPLE1 --species human
+# =============================================================================
+
+set -euo pipefail
+
+if (( $# < 1 )); then
+    echo "Usage: bsub.sh <command> [args…]" >&2
+    echo "Example: bsub.sh starsolo 10x /data/fastqs SAMPLE1 --species human" >&2
+    exit 1
+fi
+
+SCRIPT="$*"
+AA=("$@")
+# Use the last argument as the job tag (basename only)
+TAG=$(basename "${AA[-1]}")
+
+GROUP=$(bugroup -w | grep "\b${USER}\b" | cut -d" " -f1)
+CPUS=16
+RAM=64000
+QUE="normal"
+WDIR=$(pwd)
+
+bsub -G "$GROUP" \
+    -n"$CPUS" \
+    -R"span[hosts=1] select[mem>$RAM] rusage[mem=$RAM]" \
+    -M"$RAM" \
+    -o "$WDIR/$TAG.%J.bsub.log" \
+    -e "$WDIR/$TAG.%J.bsub.err" \
+    -q "$QUE" \
+    $SCRIPT
diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
new file mode 100755
index 0000000..4a37ee2
--- /dev/null
+++ b/scripts/generate_test_data.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+"""Generate a tiny synthetic dataset for the STARsolo CLI wrapper.
+
+- Creates a minimal synthetic reference (FASTA + GTF).
+- Creates minimal whitelist files with the exact filenames expected by the wrapper.
+- Creates gzipped FASTQs for each supported technology:
+    10x, dropseq, rhapsody, indrops, strt, smartseq.
+
+This script uses ONLY the Python standard library.
+"""
+
+from __future__ import annotations
+
+import argparse
+import gzip
+import tarfile
+from pathlib import Path
+from typing import Iterable, Tuple
+
+DNA_ALPHABET = "ACGT"
+
+
+def base4_dna(n: int, length: int) -> str:
+    """Deterministic pseudo-random-ish DNA string from an integer."""
+    if length <= 0:
+        return ""
+    chars = []
+    x = n
+    for _ in range(length):
+        chars.append(DNA_ALPHABET[x & 3])
+        x >>= 2
+        if x == 0:
+            # mix a bit so we don't get long runs of 'A' for small n
+            x = (n * 1103515245 + 12345) & 0xFFFFFFFF
+    return "".join(chars)
+
+
+def repeat_to_length(motif: str, length: int) -> str:
+    if not motif:
+        raise ValueError("motif must be non-empty")
+    return (motif * ((length // len(motif)) + 1))[:length]
+
+
+def write_text(path: Path, text: str) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(text, encoding="utf-8")
+
+
+def write_fastq_gz(path: Path, records: Iterable[Tuple[str, str]]) -> None:
+    """Write gzipped FASTQ from (name, sequence) pairs."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with gzip.open(path, "wt", encoding="utf-8", compresslevel=6) as fh:
+        for name, seq in records:
+            qual = "I" * len(seq)
+            fh.write(f"@{name}\n{seq}\n+\n{qual}\n")
+
+
+def gen_records(prefix: str, mate: int, n: int, seq_fn) -> Iterable[Tuple[str, str]]:
+    for i in range(n):
+        yield (f"{prefix}_{i}/{mate}", seq_fn(i))
+
+
+def make_reference(outdir: Path) -> None:
+    """Create a synthetic reference with two genes on chrTest."""
+    ref_dir = outdir / "reference"
+    ref_dir.mkdir(parents=True, exist_ok=True)
+
+    # Build a 1500bp contig. Embed two easy-to-hit regions for mapping.
+    # geneA: 101-500 (1-based, inclusive) contains lots of ACGT repeats
+    # geneB: 1001-1400 contains lots of TGCATGCA repeats
+    contig_len = 1500
+    seq = list(
+        repeat_to_length(
+            "CAGATTTTCATATTATGCAGAAAATCTACTTCGCCTGATACGAGTCGGTTATCTTCGGATACTGTATAGTCCCACCTGGTGATCCTATGCTTGTGAGTAC",
+            contig_len,
+        )
+    )
+
+    geneA = repeat_to_length("ACGT", 400)
+    geneB = repeat_to_length("TGCATGCA", 400)
+
+    # Insert (convert to 0-based slicing)
+    seq[100:500] = list(geneA)  # 101..500
+    seq[1000:1400] = list(geneB)  # 1001..1400
+
+    fasta = ">chrTest\n" + "".join(seq) + "\n"
+    write_text(ref_dir / "test.fa", fasta)
+
+    gtf = "\n".join(
+        [
+            'chrTest\tsynth\tgene\t101\t500\t.\t+\t.\tgene_id "geneA"; gene_name "geneA";',
+            'chrTest\tsynth\ttranscript\t101\t500\t.\t+\t.\tgene_id "geneA"; transcript_id "txA"; gene_name "geneA";',
+            'chrTest\tsynth\texon\t101\t500\t.\t+\t.\tgene_id "geneA"; transcript_id "txA"; gene_name "geneA"; exon_number "1";',
+            'chrTest\tsynth\tgene\t1001\t1400\t.\t+\t.\tgene_id "geneB"; gene_name "geneB";',
+            'chrTest\tsynth\ttranscript\t1001\t1400\t.\t+\t.\tgene_id "geneB"; transcript_id "txB"; gene_name "geneB";',
+            'chrTest\tsynth\texon\t1001\t1400\t.\t+\t.\tgene_id "geneB"; transcript_id "txB"; gene_name "geneB"; exon_number "1";',
+            "",
+        ]
+    )
+    write_text(ref_dir / "test.gtf", gtf)
+
+
+def make_whitelists(outdir: Path) -> None:
+    wl = outdir / "whitelists"
+    wl.mkdir(parents=True, exist_ok=True)
+
+    # 10x whitelists: minimal, but filenames MUST match the wrapper.
+    write_text(wl / "3M-february-2018.txt", "ACGTTGCAACGTTGCA\n")
+    write_text(wl / "737K-august-2016.txt", "TTTTTTTTTTTTTTTT\n")
+    write_text(wl / "737K-arc-v1.txt", "TGCATGCATGCATGCA\n")
+    write_text(wl / "737K-april-2014_rc.txt", "CCCCCCCCCCCCCC\n")  # 14bp
+    write_text(wl / "3M-3pgex-may-2023.txt", "GGGGGGGGGGGGGGGG\n")
+    write_text(wl / "3M-5pgex-jan-2023.txt", "AAAAAAAAAAAAAAAA\n")
+
+    # BD Rhapsody
+    write_text(wl / "Rhapsody_bc1.txt", "ACGTACGTA\n")
+    write_text(wl / "Rhapsody_bc2.txt", "TGCATGCAT\n")
+    write_text(wl / "Rhapsody_bc3.txt", "GATCGATCG\n")
+
+    # inDrops
+    write_text(wl / "inDrops_Ambrose2_bc1.txt", "ACGTACGTACG\n")
+    write_text(wl / "inDrops_Ambrose2_bc2.txt", "TGCATGCA\n")
+
+    # STRT-seq
+    write_text(wl / "96_barcodes.list", "AACCGGTT\nTTGGAACC\n")
+
+
+def make_fastqs(outdir: Path, tenx_reads: int) -> None:
+    fq_root = outdir / "fastqs"
+
+    # Shared cDNA read used across droplet tests (maps into geneA region)
+    cdna_50 = repeat_to_length("ACGT", 50)
+    cdna_75 = repeat_to_length("ACGT", 75)
+
+    # 10x
+    bc_10x = "ACGTTGCAACGTTGCA"  # must match 3M-february-2018.txt
+
+    def tenx_r1(i: int) -> str:
+        return bc_10x + base4_dna(i, 12)
+
+    write_fastq_gz(
+        fq_root / "10x" / "TENX1_1.fastq.gz",
+        gen_records("10x", 1, tenx_reads, tenx_r1),
+    )
+    write_fastq_gz(
+        fq_root / "10x" / "TENX1_2.fastq.gz",
+        gen_records("10x", 2, tenx_reads, lambda i: cdna_50),
+    )
+
+    # Drop-seq (12bp CB + 8bp UMI)
+    bc_drop = "AACCGGTTAACC"
+
+    def drop_r1(i: int) -> str:
+        return bc_drop + base4_dna(i, 8)
+
+    write_fastq_gz(
+        fq_root / "dropseq" / "DROP1_1.fastq.gz",
+        gen_records("drop", 1, 2000, drop_r1),
+    )
+    write_fastq_gz(
+        fq_root / "dropseq" / "DROP1_2.fastq.gz",
+        gen_records("drop", 2, 2000, lambda i: cdna_50),
+    )
+
+    # BD Rhapsody (3 barcode segments + 8bp UMI)
+    bc1, bc2, bc3 = "ACGTACGTA", "TGCATGCAT", "GATCGATCG"
+    filler1, filler2 = "A" * 12, "A" * 13
+
+    def rhap_r1(i: int) -> str:
+        return bc1 + filler1 + bc2 + filler2 + bc3 + base4_dna(i, 8)
+
+    write_fastq_gz(
+        fq_root / "rhapsody" / "RHAP1_1.fastq.gz",
+        gen_records("rhap", 1, 1000, rhap_r1),
+    )
+    write_fastq_gz(
+        fq_root / "rhapsody" / "RHAP1_2.fastq.gz",
+        gen_records("rhap", 2, 1000, lambda i: cdna_50),
+    )
+
+    # inDrops (BC1 + adapter + BC2 + 6bp UMI)
+    bc1_i, bc2_i = "ACGTACGTACG", "TGCATGCA"
+    adapter = "GAGTGATTGCTTGTGACGCCTT"  # wrapper default
+
+    def ind_r1(i: int) -> str:
+        return bc1_i + adapter + bc2_i + base4_dna(i, 6)
+
+    write_fastq_gz(
+        fq_root / "indrops" / "IND1_1.fastq.gz",
+        gen_records("ind", 1, 1000, ind_r1),
+    )
+    write_fastq_gz(
+        fq_root / "indrops" / "IND1_2.fastq.gz",
+        gen_records("ind", 2, 1000, lambda i: cdna_50),
+    )
+
+    # STRT-seq (8bp CB + 8bp UMI) — in this wrapper, barcode is in *_1.fastq
+    bc_strt = "AACCGGTT"  # one of 96_barcodes.list
+
+    def strt_r1(i: int) -> str:
+        return bc_strt + base4_dna(i, 8)
+
+    write_fastq_gz(
+        fq_root / "strt" / "STRT1_1.fastq.gz",
+        gen_records("strt", 1, 500, strt_r1),
+    )
+    write_fastq_gz(
+        fq_root / "strt" / "STRT1_2.fastq.gz",
+        gen_records("strt", 2, 500, lambda i: cdna_50),
+    )
+
+    # Smart-seq (manifest with absolute paths)
+    smart_dir = fq_root / "smartseq"
+    smart_dir.mkdir(parents=True, exist_ok=True)
+
+    for cell in ("CELL1", "CELL2"):
+        write_fastq_gz(
+            smart_dir / f"{cell}_R1.fastq.gz",
+            gen_records(cell, 1, 200, lambda i: cdna_75),
+        )
+        write_fastq_gz(
+            smart_dir / f"{cell}_R2.fastq.gz",
+            gen_records(cell, 2, 200, lambda i: cdna_75),
+        )
+
+    manifest_path = smart_dir / "smartseq_example.manifest.tsv"
+    cell1_r1 = (smart_dir / "CELL1_R1.fastq.gz").resolve()
+    cell1_r2 = (smart_dir / "CELL1_R2.fastq.gz").resolve()
+    cell2_r1 = (smart_dir / "CELL2_R1.fastq.gz").resolve()
+    cell2_r2 = (smart_dir / "CELL2_R2.fastq.gz").resolve()
+
+    manifest = "\n".join(
+        [
+            f"{cell1_r1}\t{cell1_r2}\tCELL1",
+            f"{cell2_r1}\t{cell2_r2}\tCELL2",
+            "",
+        ]
+    )
+    write_text(manifest_path, manifest)
+
+
+def make_readme(outdir: Path, tenx_reads: int) -> None:
+    text = f"""# STARsolo synthetic smoke-test dataset
+
+**Note on 10x:** the wrapper’s chemistry auto-detection requires >50,000 barcode matches in the
+subsampled reads. This dataset generates {tenx_reads:,} reads for the 10x sample.
+
+## Build a STAR index
+
+```bash
+mkdir -p reference/index
+STAR --runThreadN 4 --runMode genomeGenerate \\
+  --genomeDir reference/index \\
+  --genomeFastaFiles reference/test.fa \\
+  --sjdbGTFfile reference/test.gtf
+
+  ## Example runs
+  WL=$(pwd)/whitelists
+REF=$(pwd)/reference/index
+```
+## Example runs
+```bash
+starsolo 10x fastqs/10x TENX1 --ref "$REF" --whitelist-dir "$WL"
+starsolo dropseq fastqs/dropseq DROP1 --ref "$REF"
+starsolo rhapsody fastqs/rhapsody RHAP1 --ref "$REF" --whitelist-dir "$WL"
+starsolo indrops fastqs/indrops IND1 --ref "$REF" --whitelist-dir "$WL"
+starsolo strt fastqs/strt STRT1 --ref "$REF" --whitelist-dir "$WL"
+starsolo smartseq fastqs/smartseq/smartseq_example.manifest.tsv --ref "$REF"
+```
+"""
+    write_text(outdir / "README.md", text)
+
+
+def maybe_tar(outdir: Path, tar_path: Path) -> None:
+    tar_path.parent.mkdir(parents=True, exist_ok=True)
+    with tarfile.open(tar_path, "w:gz") as tf:
+        tf.add(outdir, arcname=outdir.name)
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(
+        description="Generate synthetic STARsolo wrapper test data"
+    )
+    ap.add_argument("--outdir", default="starsolo_synth", help="Output directory")
+    ap.add_argument(
+        "--tenx-reads", type=int, default=60000, help="Number of 10x reads (R1/R2)"
+    )
+    ap.add_argument(
+        "--tar", action="store_true", help="Also create <outdir>.tar.gz next to outdir"
+    )
+    args = ap.parse_args()
+
+    outdir = Path(args.outdir).resolve()
+    if outdir.exists() and any(outdir.iterdir()):
+        raise SystemExit(f"Refusing to write into non-empty directory: {outdir}")
+
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    make_reference(outdir)
+    make_whitelists(outdir)
+    make_fastqs(outdir, tenx_reads=args.tenx_reads)
+    make_readme(outdir, tenx_reads=args.tenx_reads)
+
+    if args.tar:
+        tar_path = outdir.with_suffix(".tar.gz")
+        maybe_tar(outdir, tar_path)
+        print(f"Wrote {outdir} and {tar_path}")
+    else:
+        print(f"Wrote {outdir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/bbduk.sh b/scripts/legacy/bbduk.sh
similarity index 100%
rename from scripts/bbduk.sh
rename to scripts/legacy/bbduk.sh
diff --git a/scripts/bsub16.sh b/scripts/legacy/bsub16.sh
similarity index 100%
rename from scripts/bsub16.sh
rename to scripts/legacy/bsub16.sh
diff --git a/scripts/solo_QC.sh b/scripts/legacy/solo_QC.sh
similarity index 100%
rename from scripts/solo_QC.sh
rename to scripts/legacy/solo_QC.sh
diff --git a/scripts/legacy/starsolo_10x_auto.sh b/scripts/legacy/starsolo_10x_auto.sh
new file mode 100755
index 0000000..23f69a5
--- /dev/null
+++ b/scripts/legacy/starsolo_10x_auto.sh
@@ -0,0 +1,437 @@
+#!/bin/bash -e
+
+## v3.2 of STARsolo wrappers is set up to guess the chemistry automatically
+## newest version of the script uses STAR v2.7.10a with EM multimapper processing
+## in STARsolo which on by default; the extra matrix can be found in /raw subdir
+
+# --- Function Definitions ---
+
+# Finds paired-end FASTQ files based on common naming conventions.
+# Args:
+#   $1: FQDIR - The directory containing FASTQ files.
+#   $2: TAG - The sample identifier.
+# Returns:
+#   A pipe-separated string "R1_FILES|R2_FILES".
+function find_fastq_files() {
+  local FQDIR=$1
+  local TAG=$2
+  local R1
+  local R2
+
+  ## Four popular cases: ENA, regular, Cell Ranger, and HRA.
+  ## The command below will generate a comma-separated list for each read
+  ## if there are >1 file for each mate.
+  if [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q") != "" ]]; then
+    R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+  elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q") != "" ]]; then
+    R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+  elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q") != "" ]]; then
+    R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R2_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+  elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q") != "" ]]; then
+    R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+    R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_r2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g")
+  else
+    >&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR."
+    exit 1
+  fi
+
+  echo "$R1|$R2"
+}
+
+# Checks for GZIP or BZIP2 compression and returns the appropriate command.
+# Args:
+#   $1: FQDIR - The directory containing FASTQ files.
+#   $2: TAG - The sample identifier.
+# Returns:
+#   A pipe-separated string "GZIP_COMMAND|Z_COMMAND".
+function check_compression() {
+  local FQDIR=$1
+  local TAG=$2
+  local GZIP=""
+  local ZCMD="cat" # Default to no compression
+
+  if [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.gz$") != "" ]]; then
+    GZIP="--readFilesCommand zcat"
+    ZCMD="zcat"
+  elif [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.bz2$") != "" ]]; then
+    GZIP="--readFilesCommand bzcat"
+    ZCMD="bzcat"
+  fi
+
+  echo "$GZIP|$ZCMD"
+}
+
+# Extracts a random subsample of reads for testing chemistry.
+# Args:
+#   $1: R1 - Comma-separated list of Read 1 FASTQ files.
+#   $2: R2 - Comma-separated list of Read 2 FASTQ files.
+#   $3: ZCMD - The command to decompress files (e.g., zcat, bzcat, cat).
+#   $4: CMD - Optional prefix command (e.g., for singularity).
+function extract_test_reads() {
+  local R1=$1
+  local R2=$2
+  local ZCMD=$3
+  local CMD=${4:-""}
+  local COUNT=0
+
+  for i in $(echo "$R1" | tr ',' ' '); do
+    $ZCMD "$i" | head -4000000 > "$COUNT.R1_head" &
+    COUNT=$((COUNT+1))
+  done
+  wait
+
+  COUNT=0
+  for i in $(echo "$R2" | tr ',' ' '); do
+    $ZCMD "$i" | head -4000000 > "$COUNT.R2_head" &
+    COUNT=$((COUNT+1))
+  done
+  wait
+
+  # Use the same random seed to select corresponding reads from R1 and R2.
+  cat *.R1_head | $CMD seqtk sample -s100 - 200000 > test.R1.fastq &
+  cat *.R2_head | $CMD seqtk sample -s100 - 200000 > test.R2.fastq &
+  wait
+  rm *.R1_head *.R2_head
+}
+
+# Determines the correct 10x barcode whitelist and read lengths.
+# Args:
+#   $1: WL - Path to the directory containing whitelist files.
+# Returns:
+#   A pipe-separated string: "BC|NBC1|NBC2|NBC3|NBC4|NBC5|NBCA|R1LEN|R2LEN|R1DIS"
+function determine_barcode_whitelist() {
+    local WL=$1
+    local BC=""
+
+    # Elucidate the right barcode whitelist to use.
+    local NBC1=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-14 | grep -F -f "$WL/737K-april-2014_rc.txt" | wc -l)
+    local NBC2=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/737K-august-2016.txt" | wc -l)
+    local NBC3=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/3M-february-2018.txt" | wc -l)
+    local NBC4=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/3M-3pgex-may-2023.txt" | wc -l)
+    local NBC5=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/3M-5pgex-jan-2023.txt" | wc -l)
+    local NBCA=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/737K-arc-v1.txt" | wc -l)
+
+    # Get average read lengths and distribution.
+    local R1LEN=$(cat test.R1.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}')
+    local R2LEN=$(cat test.R2.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}')
+    local R1DIS=$(cat test.R1.fastq | awk 'NR%4==2' | awk '{print length($0)}' | sort | uniq -c | wc -l)
+
+    if (( NBC3 > 50000 )); then
+        BC="$WL/3M-february-2018.txt"
+    elif (( NBC2 > 50000 )); then
+        BC="$WL/737K-august-2016.txt"
+    elif (( NBCA > 50000 )); then
+        BC="$WL/737K-arc-v1.txt"
+    elif (( NBC1 > 50000 )); then
+        BC="$WL/737K-april-2014_rc.txt"
+    elif (( NBC4 > 50000 )); then
+        BC="$WL/3M-3pgex-may-2023.txt"
+    elif (( NBC5 > 50000 )); then
+        BC="$WL/3M-5pgex-jan-2023.txt"
+    else
+        >&2 echo "ERROR: No whitelist has matched a random selection of 200,000 barcodes! Match counts: $NBC1 (v1), $NBC2 (v2), $NBC3 (v3), $NBC4 (v4-3p), $NBC5 (v4-5p), $NBCA (multiome)."
+        exit 1
+    fi
+
+    echo "$BC|$NBC1|$NBC2|$NBC3|$NBC4|$NBC5|$NBCA|$R1LEN|$R2LEN|$R1DIS"
+}
+
+# Checks read lengths and determines barcode/UMI parameters.
+# Args:
+#   $1: R1DIS, $2: R1LEN, $3: R2LEN, $4: BC, $5: WL
+# Returns:
+#   A pipe-separated string: "PAIRED|CBLEN|UMILEN"
+function check_read_lengths() {
+    local R1DIS=$1
+    local R1LEN=$2
+    local R2LEN=$3
+    local BC=$4
+    local WL=$5
+    local PAIRED=False
+    local CBLEN
+    local UMILEN
+
+    # Check for potential issues with read lengths.
+    if (( R1DIS > 1 && R1LEN <= 30 )); then
+        >&2 echo "ERROR: Read 1 (barcode) has varying length; possibly quality-trimmed. Please check the fastq files."
+        exit 1
+    elif (( R1LEN < 24 )); then
+        >&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files."
+        exit 1
+    elif (( R2LEN < 40 )); then
+        >&2 echo "ERROR: Read 2 (biological read) is less than 40 bp in length. Please check the fastq files."
+        exit 1
+    fi
+
+    if (( R1LEN > 50 )); then
+        PAIRED=True
+    fi
+
+    if [[ $BC == "$WL/3M-february-2018.txt" || $BC == "$WL/737K-arc-v1.txt" || $BC == "$WL/3M-3pgex-may-2023.txt" || $BC == "$WL/3M-5pgex-jan-2023.txt" ]]; then
+        CBLEN=16
+        UMILEN=12
+    elif [[ $BC == "$WL/737K-august-2016.txt" ]]; then
+        CBLEN=16
+        UMILEN=10
+    elif [[ $BC == "$WL/737K-april-2014_rc.txt" ]]; then
+        CBLEN=14
+        UMILEN=10
+    fi
+
+    # Failsafe for short R1 reads with v3 chemistry.
+    if (( CBLEN + UMILEN > R1LEN )); then
+        local NEWUMI=$((R1LEN - CBLEN))
+        local BCUMI=$((UMILEN + CBLEN))
+        >&2 echo "WARNING: Read 1 length ($R1LEN) is less than the sum of barcode and UMI ($BCUMI). Changing UMI from $UMILEN to $NEWUMI!"
+        UMILEN=$NEWUMI
+    elif (( CBLEN + UMILEN < R1LEN )); then
+        local BCUMI=$((UMILEN + CBLEN))
+        >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of barcode and UMI ($BCUMI)."
+    fi
+
+    echo "$PAIRED|$CBLEN|$UMILEN"
+}
+
+
+# Determines strand specificity by running test alignments.
+# Args:
+#   $1: REF, $2: CBLEN, $3: UMILEN, $4: CPUS, $5: BC, $6: PAIRED
+#   $7: CMD - Optional prefix command.
+# Returns:
+#   A pipe-separated string: "STRAND|PCTFWD|PCTREV|PAIRED"
+function determine_strand_specificity() {
+    local REF=$1
+    local CBLEN=$2
+    local UMILEN=$3
+    local CPUS=$4
+    local BC=$5
+    local PAIRED=$6
+    local CMD=${7:-""}
+    local STRAND=Forward
+
+    $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
+        --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) \
+        --soloUMIlen "$UMILEN" --soloStrand Forward --genomeLoad LoadAndKeep \
+        --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
+        --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
+        --soloFeatures Gene GeneFull --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &> /dev/null
+
+    $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
+        --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) \
+        --soloUMIlen "$UMILEN" --soloStrand Reverse --genomeLoad LoadAndKeep \
+        --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
+        --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
+        --soloFeatures Gene GeneFull --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &> /dev/null
+
+    local PCTFWD=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_forward/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}')
+    local PCTREV=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_reverse/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}')
+
+    if (( PCTREV > PCTFWD )); then
+        STRAND=Reverse
+    fi
+
+    if (( PCTREV < 50 && PCTFWD < 50 )); then
+        >&2 echo "WARNING: Low percentage of reads mapping to GeneFull: forward = $PCTFWD, reverse = $PCTREV"
+    fi
+
+    # If a paired-end experiment turns out to be 3', process it as single-end.
+    if [[ $STRAND == "Forward" && $PAIRED == "True" ]]; then
+        PAIRED=False
+    fi
+
+    echo "$STRAND|$PCTFWD|$PCTREV|$PAIRED"
+}
+
+# Writes the final run configuration to a log file.
+# Args: Takes 15 arguments representing all configuration parameters.
+function write_config_file() {
+    local TAG=$1
+    local PAIRED=$2
+    local STRAND=$3
+    local PCTFWD=$4
+    local PCTREV=$5
+    local BC=$6
+    local NBC1=$7
+    local NBC2=$8
+    local NBC3=$9
+    local NBCA=${10}
+    local NBC4=${11}
+    local NBC5=${12}
+    local CBLEN=${13}
+    local UMILEN=${14}
+    local GZIP=${15}
+    local R1=${16}
+    local R2=${17}
+
+    echo "Done setting up the STARsolo run; here are final processing options:"
+    echo "============================================================================="
+    (
+        echo "Sample: $TAG"
+        echo "Paired-end mode: $PAIRED"
+        echo "Strand (Forward = 3', Reverse = 5'): $STRAND, %reads mapped to GeneFull: forward = $PCTFWD , reverse = $PCTREV"
+        echo "CB whitelist: $BC, matches out of 200,000: $NBC3 (v3), $NBC2 (v2), $NBC1 (v1), $NBC4 (v4-3p), $NBC5 (v4-5p), $NBCA (multiome)"
+        echo "CB length: $CBLEN"
+        echo "UMI length: $UMILEN"
+        echo "GZIP: $GZIP"
+        echo "-----------------------------------------------------------------------------"
+        echo "Read 1 files: $R1"
+        echo "-----------------------------------------------------------------------------"
+        echo "Read 2 files: $R2"
+        echo "-----------------------------------------------------------------------------"
+    ) | tee strand.txt
+}
+
+# Executes the main STARsolo alignment.
+# Args: Takes 13 arguments for all STARsolo parameters.
+function run_starsolo() {
+    local PAIRED=$1
+    local R1=$2
+    local R2=$3
+    local GZIP=$4
+    local BAM=$5
+    local BC=$6
+    local CBLEN=$7
+    local UMILEN=$8
+    local STRAND=$9
+    local REF=${10}
+    local CPUS=${11}
+    local SOLOFILENAMES=${12}
+    local CMD=${13:-""}
+
+    if [[ $PAIRED == "True" ]]; then
+        # Note the R1/R2 order and --soloStrand Forward for 5' paired-end.
+        $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn "$R1" "$R2" --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \
+            --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloCBstart 1 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" --soloStrand Forward \
+            --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
+            --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
+            --soloFeatures Gene GeneFull Velocyto --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM
+    else
+        $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \
+            --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" --soloStrand "$STRAND" \
+            --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
+            --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
+            --soloFeatures Gene GeneFull Velocyto --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM
+    fi
+}
+
+# Finalizes the output files.
+# Args:
+#   $1: TAG - The sample identifier.
+#   $2: SOLOFILENAMES - The output directory for solo features.
+#   $3: CMD - Optional prefix command (e.g., for singularity).
+function process_output_files() {
+    local CMD=${1:-""}
+
+    if [[ -s "Aligned.sortedByCoord.out.bam" ]]; then
+        $CMD samtools index -@16 Aligned.sortedByCoord.out.bam
+    fi
+
+    $CMD pbzip2 -9 "Unmapped.out.mate1" &
+    $CMD pbzip2 -9 "Unmapped.out.mate2" &
+    wait
+
+    rm -rf test.R?.fastq test_forward test_reverse
+
+    find . -name "*.tsv" -exec gzip {} +
+    find . -name "*.mtx" -exec gzip {} +
+
+    wait
+    echo "ALL DONE!"
+}
+
+
+# --- Main Workflow Orchestrator ---
+
+function starsolo_10x() {
+    # --- 1. Setup and Argument Parsing ---
+    local FQDIR=$1
+    local TAG=$2
+    local CPUS=$3
+    local REF=$4
+    local WL=$5
+    local BAM=$6
+    local CMD=${7:-""}
+
+    local FQDIR_ABS
+    FQDIR_ABS=$(readlink -f "$FQDIR")
+    mkdir -p "$TAG" && cd "$TAG" || exit
+
+    # --- 2. Find FASTQ Files ---
+    local fastq_files
+    fastq_files=$(find_fastq_files "$FQDIR_ABS" "$TAG")
+    local R1
+    local R2
+    IFS='|' read -r R1 R2 <<< "$fastq_files"
+
+    # --- 3. Determine Compression and Chemistry ---
+    local compression_info
+    compression_info=$(check_compression "$FQDIR_ABS" "$TAG")
+    local GZIP
+    local ZCMD
+    IFS='|' read -r GZIP ZCMD <<< "$compression_info"
+
+    extract_test_reads "$R1" "$R2" "$ZCMD" "$CMD"
+
+    local whitelist_info
+    whitelist_info=$(determine_barcode_whitelist "$WL")
+    local BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS
+    IFS='|' read -r BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS <<< "$whitelist_info"
+
+    # --- 4. Check Reads and Determine Parameters ---
+    local read_len_info
+    read_len_info=$(check_read_lengths "$R1DIS" "$R1LEN" "$R2LEN" "$BC" "$WL")
+    local PAIRED CBLEN UMILEN
+    IFS='|' read -r PAIRED CBLEN UMILEN <<< "$read_len_info"
+
+    local strand_info
+    strand_info=$(determine_strand_specificity "$REF" "$CBLEN" "$UMILEN" "$CPUS" "$BC" "$PAIRED" "$CMD")
+    local STRAND PCTFWD PCTREV
+    # PAIRED is returned again as the function can modify it
+    IFS='|' read -r STRAND PCTFWD PCTREV PAIRED <<< "$strand_info"
+
+    # --- 5. Log, Run, and Cleanup ---
+    write_config_file "$TAG" "$PAIRED" "$STRAND" "$PCTFWD" "$PCTREV" "$BC" "$NBC1" "$NBC2" "$NBC3" "$NBCA" "$NBC4" "$NBC5" "$CBLEN" "$UMILEN" "$GZIP" "$R1" "$R2"
+
+    local SOLOFILENAMES="output/"
+    run_starsolo "$PAIRED" "$R1" "$R2" "$GZIP" "$BAM" "$BC" "$CBLEN" "$UMILEN" "$STRAND" "$REF" "$CPUS" "$SOLOFILENAMES" "$CMD"
+
+    process_output_files "$CMD"
+}
+
+# --- Script Entrypoint ---
+
+function main () {
+    if (( $# != 3 )); then
+        >&2 echo "Usage: ./starsolo_10x_auto.sh <fastq_dir> <sample_id> <specie>"
+        >&2 echo "(make sure you set the correct REF, WL, and BAM variables below)"
+        >&2 echo "Specie can be 'human' or 'mouse' (or any other directory in /nfs/cellgeni/STAR/)"
+        >&2 echo -e "You need to have the following software installed to run this script: \n- star_version=2.7.10a_alpha_220818\n- samtools_version=1.15.1\n- bbmap_version=38.97\n- rsem_version=1.3.3"
+        >&2 echo "Use /nfs/cellgeni/singularity/images/reprocess_10x.sif if you work on FARM cluster."
+        exit 1
+    fi
+
+    local FQDIR=$1
+    local TAG=$2
+    local SPECIE=$3
+
+    # --- Configuration ---
+    # Using local variables to keep all state contained.
+    local SIF="/nfs/cellgeni/singularity/images/reprocess_10x.sif"
+    local CMD="singularity run --bind /nfs,/lustre,/software $SIF"
+    local CPUS=16
+    local REF="/nfs/cellgeni/STAR/${SPECIE}/2020A/index"
+    local WL="/nfs/cellgeni/STAR/whitelists"
+    # Choose one of the two options, depending on whether you need a BAM file
+    # local BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 500 --limitBAMsortRAM 60000000000 --outSAMunmapped Within --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB CR CY UR UY GX GN"
+    local BAM="--outSAMtype None --outReadsUnmapped Fastx"
+
+    starsolo_10x "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" "$CMD"
+}
+
+# This construct ensures that main() is called only when the script is executed directly.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+    main "$@"
+fi
\ No newline at end of file
diff --git a/scripts/starsolo_bd_rhapsody.sh b/scripts/legacy/starsolo_bd_rhapsody.sh
similarity index 100%
rename from scripts/starsolo_bd_rhapsody.sh
rename to scripts/legacy/starsolo_bd_rhapsody.sh
diff --git a/scripts/starsolo_dropseq.sh b/scripts/legacy/starsolo_dropseq.sh
similarity index 100%
rename from scripts/starsolo_dropseq.sh
rename to scripts/legacy/starsolo_dropseq.sh
diff --git a/scripts/starsolo_indrops.sh b/scripts/legacy/starsolo_indrops.sh
similarity index 100%
rename from scripts/starsolo_indrops.sh
rename to scripts/legacy/starsolo_indrops.sh
diff --git a/scripts/starsolo_ss2.sh b/scripts/legacy/starsolo_ss2.sh
similarity index 100%
rename from scripts/starsolo_ss2.sh
rename to scripts/legacy/starsolo_ss2.sh
diff --git a/scripts/starsolo_strt.sh b/scripts/legacy/starsolo_strt.sh
similarity index 100%
rename from scripts/starsolo_strt.sh
rename to scripts/legacy/starsolo_strt.sh
diff --git a/scripts/solo_qc.sh b/scripts/solo_qc.sh
new file mode 100755
index 0000000..f055d05
--- /dev/null
+++ b/scripts/solo_qc.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# =============================================================================
+# solo_qc.sh — Aggregate QC statistics across STARsolo runs
+# =============================================================================
+# Run from the parent directory that contains one subdirectory per sample
+# (each produced by a starsolo run).
+#
+# Usage:
+#   starsolo qc                   # via the CLI
+#   ./scripts/solo_qc.sh          # standalone
+#
+# Pipe through `column -t` for pretty-printed output:
+#   starsolo qc | column -t
+# =============================================================================
+
+# --- Check completion status ------------------------------------------------
+
+>&2 echo "Checking that all STARsolo jobs went to completion …"
+for i in */; do
+    i="${i%/}"
+    if [[ -d "$i/output" && -s "$i/Log.final.out" ]]; then
+        if [[ -d "$i/_STARtmp" ]]; then
+            >&2 echo "WARNING: Sample $i did not run to completion: _STARtmp is still present!"
+        fi
+    fi
+done
+
+# --- Check for cross-contamination -----------------------------------------
+
+>&2 echo "Checking potential sample cross-contamination …"
+for i in */; do
+    i="${i%/}"
+    if [[ -s "$i/output/Gene/filtered/barcodes.tsv.gz" ]]; then
+        for j in */; do
+            j="${j%/}"
+            if [[ -s "$j/output/Gene/filtered/barcodes.tsv.gz" && "$i" != "$j" ]]; then
+                if [[ ! -s "sorted.$i.barcodes.tsv" ]]; then
+                    zcat "$i/output/Gene/filtered/barcodes.tsv.gz" | sort > "sorted.$i.barcodes.tsv" &
+                fi
+                if [[ ! -s "sorted.$j.barcodes.tsv" ]]; then
+                    zcat "$j/output/Gene/filtered/barcodes.tsv.gz" | sort > "sorted.$j.barcodes.tsv" &
+                fi
+                wait
+                N1=$(wc -l < "sorted.$i.barcodes.tsv")
+                N2=$(wc -l < "sorted.$j.barcodes.tsv")
+                MIN=$(( N1 <= N2 ? N1 : N2 ))
+                COMM=$(comm -12 "sorted.$i.barcodes.tsv" "sorted.$j.barcodes.tsv" | wc -l)
+                PCT=$(echo "$COMM" | awk -v v="$MIN" '{printf "%d\n",100*$1/v+0.5}')
+                if (( PCT >= 20 )); then
+                    >&2 echo "WARNING: Samples $i ($N1 barcodes) and $j ($N2 barcodes) share $COMM ($PCT%) barcodes — higher than expected!"
+                fi
+            fi
+        done
+    elif [[ -d "$i/output" && -s "$i/Log.out" ]]; then
+        >&2 echo "Sample $i: no filtered output detected (probably plate-based)."
+    fi
+done
+
+rm -f sorted.*.barcodes.tsv
+
+# --- Extract and output statistics ------------------------------------------
+
+>&2 echo "Extracting STARsolo stats …"
+>&2 echo
+
+echo -e "Sample\tRd_all\tRd_in_cells\tFrc_in_cells\tUMI_in_cells\tCells\tMed_nFeature\tGood_BC\tWL\tSpecies\tPaired\tStrand\tall_u+m\tall_u\texon_u+m\texon_u\tfull_u+m\tfull_u"
+
+for i in */; do
+    i="${i%/}"
+
+    if [[ -d "$i/output/Gene/filtered/" && -s "$i/Log.out" ]]; then
+        # --- Droplet-based ---
+        PAIRED="Single"
+        if grep -q "clip5pNbases 39 0" "$i/Log.out"; then
+            PAIRED="Paired"
+        fi
+
+        REF="Other"
+        if grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/human/"; then
+            REF="Human"
+        elif grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/mouse/"; then
+            REF="Mouse"
+        fi
+
+        WL="Undef"
+        if grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "3M-february-2018.txt"; then
+            WL="v3"
+        elif grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "737K-august-2016.txt"; then
+            WL="v2"
+        elif grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "737K-april-2014_rc.txt"; then
+            WL="v1"
+        elif grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "737K-arc-v1.txt"; then
+            WL="arc"
+        fi
+
+        R1=$(grep "Number of Reads,"             "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        B=$( grep "Reads With Valid Barcodes,"    "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        G1=$(grep "Reads Mapped to Genome: Unique+Multiple," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}')
+        G2=$(grep "Reads Mapped to Genome: Unique,"          "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}')
+        E1=$(grep "Reads Mapped to Gene: Unique+Multip.*e Gene,"     "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        E2=$(grep "Reads Mapped to Gene: Unique Gene,"               "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        F1=$(grep "Reads Mapped to GeneFull: Unique+Multip.*e GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        F2=$(grep "Reads Mapped to GeneFull: Unique GeneFull,"           "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        C=$( grep "Estimated Number of Cells,"    "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        R2=$(grep "Unique Reads in Cells Mapped to GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        CF=$(echo "$R1" | awk -v v="$R2" '{printf "%.3f\n",v/$1}')
+        R3=$(grep "UMIs in Cells,"                "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        GC=$(grep "Median GeneFull per Cell,"      "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        ST=$(grep "^soloStrand" "$i/Log.out" | grep RE-DEFINED | awk '{print $2}')
+
+        if [[ -z "$ST" ]]; then
+            ST="Undef"
+        elif [[ "$PAIRED" == "Paired" ]]; then
+            ST="Reverse"
+        fi
+
+        F2PCT=$(echo "$F2" | awk '{printf "%d\n",$1*100+0.5}')
+        if (( F2PCT <= 20 )); then
+            >&2 echo "WARNING: Sample $i : GeneFull percentage ($F2PCT) is too low!"
+        fi
+
+        echo -e "$i\t$R1\t$R2\t$CF\t$R3\t$C\t$GC\t$B\t$WL\t$REF\t$PAIRED\t$ST\t$G1\t$G2\t$E1\t$E2\t$F1\t$F2"
+
+    elif [[ -d "$i/output" && -s "$i/Log.out" ]]; then
+        # --- Plate-based (Smart-seq2 etc.) ---
+        PAIRED="Single"
+        if grep -q "mate 2" "$i/Log.out"; then
+            PAIRED="Paired"
+        fi
+
+        REF="Other"
+        if grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/human/"; then
+            REF="Human"
+        elif grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/mouse/"; then
+            REF="Mouse"
+        fi
+
+        WL="Undef"
+        if grep -q "manifest" "$i/Log.out"; then
+            WL="None(Smart-seq)"
+        fi
+
+        R1=$(grep "Number of Reads,"             "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        B=$( grep "Reads With Valid Barcodes,"    "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        G1=$(grep "Reads Mapped to Genome: Unique+Multiple," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}')
+        G2=$(grep "Reads Mapped to Genome: Unique,"          "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}')
+        E1=$(grep "Reads Mapped to Gene: Unique+Multip.*e Gene,"     "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        E2=$(grep "Reads Mapped to Gene: Unique Gene,"               "$i/output/Gene/Summary.csv"     | awk -F "," '{print $2}')
+        F1=$(grep "Reads Mapped to GeneFull: Unique+Multip.*e GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        F2=$(grep "Reads Mapped to GeneFull: Unique GeneFull,"           "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}')
+        C=$( zcat "$i/output/Gene/raw/barcodes.tsv.gz" | wc -l)
+        R2=$(grep "yessubWLmatch_UniqueFeature" "$i/output/GeneFull/Features.stats" | awk '{print $2}')
+        CF=$(echo "$R1" | awk -v v="$R2" '{printf "%.3f\n",v/$1}')
+        R3=$(grep "yesUMIs" "$i/output/GeneFull/Features.stats" | awk '{print $2}')
+        GC=$(zcat "$i/output/GeneFull/raw/matrix.mtx.gz" | awk 'NR>3 {print $2}' | uniq -c | awk '{print $1}' | sort -n | awk '{a[i++]=$1} END {print a[int(i/2)]}')
+        ST=$(grep "^soloStrand" "$i/Log.out" | grep RE-DEFINED | awk '{print $2}')
+
+        [[ -z "$ST" ]] && ST="Undef"
+
+        F2PCT=$(echo "$F2" | awk '{printf "%d\n",$1*100+0.5}')
+        if (( F2PCT <= 20 )); then
+            >&2 echo "WARNING: Sample $i : GeneFull percentage ($F2PCT) is too low!"
+        fi
+
+        echo -e "$i\t$R1\t$R2\t$CF\t$R3\t$C\t$GC\t$B\t$WL\t$REF\t$PAIRED\t$ST\t$G1\t$G2\t$E1\t$E2\t$F1\t$F2"
+    fi
+done
diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh
deleted file mode 100755
index 7523abc..0000000
--- a/scripts/starsolo_10x_auto.sh
+++ /dev/null
@@ -1,291 +0,0 @@
-#!/bin/bash -e 
-
-## v3.2 of STARsolo wrappers is set up to guess the chemistry automatically
-## newest version of the script uses STAR v2.7.10a with EM multimapper processing 
-## in STARsolo which on by default; the extra matrix can be found in /raw subdir 
-
-SIF="/nfs/cellgeni/singularity/images/reprocess_10x.sif"
-CMD="singularity run --bind /nfs,/lustre,/software $SIF"
-
-FQDIR=$1
-TAG=$2
-
-if [[ $FQDIR == "" || $TAG == "" ]]
-then
-  >&2 echo "Usage: ./starsolo_10x_auto.sh <fastq_dir> <sample_id>"
-  >&2 echo "(make sure you set the correct REF, WL, and BAM variables below)"
-  exit 1
-fi
-
-FQDIR=`readlink -f $FQDIR`
-CPUS=16                                                                ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
-REF=/nfs/cellgeni/STAR/human/2020A/index                               ## choose the appropriate reference 
-WL=/nfs/cellgeni/STAR/whitelists                                       ## directory with all barcode whitelists
-
-## choose one of the two otions, depending on whether you need a BAM file 
-#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 500 --limitBAMsortRAM 60000000000 --outSAMunmapped Within --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB CR CY UR UY GX GN"
-BAM="--outSAMtype None --outReadsUnmapped Fastx"
-
-###################################################################### DONT CHANGE OPTIONS BELOW THIS LINE ##############################################################################################
-
-mkdir $TAG && cd $TAG
-
-## four popular cases: ENA - <sample>_1.fastq/<sample>_2.fastq, regular - <sample>.R1.fastq/<sample>.R2.fastq
-## Cell Ranger - <sample>_L001_R1_S001.fastq/<sample>_L001_R2_S001.fastq, and HRA - <sample>_f1.fastq/<sample>_r2.fastq
-## the command below will generate a comma-separated list for each read if there are >1 file for each mate
-## archives (gzip/bzip2) are considered below; both .fastq and .fq should work, too. 
-R1=""
-R2=""
-if [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_1\.f.*q"` != "" ]]
-then 
-  R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_1\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-  R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_2\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-elif [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "R1\.f.*q"` != "" ]]
-then
-  R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "R1\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-  R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "R2\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-elif [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_R1_.*\.f.*q"` != "" ]]
-then
-  R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_R1_.*\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-  R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_R2_.*\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-elif [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_f1\.f.*q"` != "" ]]
-then
-  R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_f1\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-  R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_r2\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"`
-else 
-  >&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR."
-  exit 1
-fi 
-
-## define some key variables, in order to evaluate reads for being 1) gzipped/bzipped/un-archived; 
-## 2) having barcodes from the whitelist, and which; 3) having consistent length; 4) being single- or paired-end. 
-GZIP=""
-BC=""
-NBC1=""
-NBC2=""
-NBC3=""
-NBCA=""
-R1LEN=""
-R2LEN=""
-R1DIS=""
-ZCMD="cat"
-
-## see if the original fastq files are archived: 
-if [[ `find $FQDIR/* | grep -P "$TAG[\/\._]" | grep "\.gz$"` != "" ]]
-then  
-  GZIP="--readFilesCommand zcat"
-  ZCMD="zcat"
-elif [[ `find $FQDIR/* | grep -P "$TAG[\/\._]" | grep "\.bz2$"` != "" ]]
-then
-  GZIP="--readFilesCommand bzcat"
-  ZCMD="bzcat"
-fi
-
-## we need a small and random selection of reads. the solution below is a result of much trial and error.
-## in the end, we select 200k reads that represent all of the files present in the FASTQ dir for this sample.
-## have to use numbers because bamtofastq likes to make files with identical names in different folders..
-COUNT=0
-for i in `echo $R1 | tr ',' ' '`
-do
-  $ZCMD $i | head -4000000 > $COUNT.R1_head &
-  COUNT=$((COUNT+1))
-done
-wait 
-
-COUNT=0
-for i in `echo $R2 | tr ',' ' ' `                                                
-do 
-  $ZCMD $i | head -4000000 > $COUNT.R2_head &
-  COUNT=$((COUNT+1))
-done
-wait
-
-## same random seed makes sure you select same reads from R1 and R2
-cat *.R1_head | $CMD seqtk sample -s100 - 200000 > test.R1.fastq &
-cat *.R2_head | $CMD seqtk sample -s100 - 200000 > test.R2.fastq &
-wait 
-rm *.R1_head *.R2_head
-
-## elucidate the right barcode whitelist to use. Grepping out N saves us some trouble. Note the special list for multiome experiments (737K-arc-v1.txt):
-## 50k (out of 200,000) is a modified empirical number - matching only first 14-16 nt makes this more specific
-NBC1=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-14 | grep -F -f $WL/737K-april-2014_rc.txt | wc -l`
-NBC2=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/737K-august-2016.txt | wc -l`
-NBC3=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/3M-february-2018.txt | wc -l`
-NBC4=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/3M-3pgex-may-2023.txt | wc -l`
-NBC5=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/3M-5pgex-jan-2023.txt | wc -l`
-NBCA=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/737K-arc-v1.txt | wc -l`
-R1LEN=`cat test.R1.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
-R2LEN=`cat test.R2.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
-R1DIS=`cat test.R1.fastq | awk 'NR%4==2' | awk '{print length($0)}' | sort | uniq -c | wc -l`
-
-if (( $NBC3 > 50000 )) 
-then 
-  BC=$WL/3M-february-2018.txt
-elif (( $NBC2 > 50000 ))
-then
-  BC=$WL/737K-august-2016.txt
-elif (( $NBCA > 50000 ))
-then
-  BC=$WL/737K-arc-v1.txt
-elif (( $NBC1 > 50000 )) 
-then
-  BC=$WL/737K-april-2014_rc.txt
-elif (( $NBC4 > 50000 )) 
-then
-  BC=$WL/3M-3pgex-may-2023.txt
-elif (( $NBC5 > 50000 )) 
-then
-  BC=$WL/3M-5pgex-jan-2023.txt
-else 
-  >&2 echo "ERROR: No whitelist has matched a random selection of 200,000 barcodes! Match counts: $NBC1 (v1), $NBC2 (v2), $NBC3 (v3), $NBC4 (v4-3p), $NBC5 (v4-5p), $NBCA (multiome)."
-  exit 1
-fi 
-
-## check read lengths, fail if something funky is going on: 
-PAIRED=False
-UMILEN=""
-CBLEN=""
-if (( $R1DIS > 1 && $R1LEN <= 30 ))
-then 
-  >&2 echo "ERROR: Read 1 (barcode) has varying length; possibly someone thought it's a good idea to quality-trim it. Please check the fastq files."
-  exit 1
-elif (( $R1LEN < 24 )) 
-then
-  >&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files."
-  exit 1
-elif (( $R2LEN < 40 )) 
-then
-  >&2 echo "ERROR: Read 2 (biological read) is less than 40 bp in length. Please check the fastq files."
-  exit 1
-fi
-
-## assign the necessary variables for barcode/UMI length/paired-end processing. 
-## script was changed to not rely on read length for the UMIs because of the epic Hassan case
-# (v2 16bp barcodes + 10bp UMIs were sequenced to 28bp, effectively removing the effects of the UMIs)
-if (( $R1LEN > 50 )) 
-then
-  PAIRED=True
-fi
-
-if [[ $BC == "$WL/3M-february-2018.txt" || $BC == "$WL/737K-arc-v1.txt" || $BC == "$WL/3M-3pgex-may-2023.txt" || $BC == "$WL/3M-5pgex-jan-2023.txt" ]] 
-then 
-  CBLEN=16
-  UMILEN=12
-elif [[ $BC == "$WL/737K-august-2016.txt" ]] 
-then
-  CBLEN=16
-  UMILEN=10
-elif [[ $BC == "$WL/737K-april-2014_rc.txt" ]] 
-then
-  CBLEN=14
-  UMILEN=10
-fi
-
-## yet another failsafe! Some geniuses managed to sequence v3 10x with a 26bp R1, which also causes STARsolo grief. This fixes it.
-if (( $CBLEN + $UMILEN > $R1LEN ))
-then
-  NEWUMI=$((R1LEN-CBLEN))
-  BCUMI=$((UMILEN+CBLEN))
-  >&2 echo "WARNING: Read 1 length ($R1LEN) is less than the sum of appropriate barcode and UMI ($BCUMI). Changing UMI setting from $UMILEN to $NEWUMI!"
-  UMILEN=$NEWUMI
-elif (( $CBLEN + $UMILEN < $R1LEN ))
-then
-  BCUMI=$((UMILEN+CBLEN))
-  >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)."
-fi
-
-## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment. 
-## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand 
-## with higher than 50% probability; 2) if not, run the same quantification with "--soloStand Reverse" and calculate the same stat; 
-## 3) output a warning, and choose the strand with higher %; 4) if both percentages are below 10, 
-
-STRAND=Forward
-
-$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
-     --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
-     --soloUMIlen $UMILEN --soloStrand Forward --genomeLoad LoadAndKeep \
-     --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
-     --soloFeatures Gene GeneFull --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &> /dev/null 
-
-$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
-     --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
-     --soloUMIlen $UMILEN --soloStrand Reverse --genomeLoad LoadAndKeep \
-     --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
-     --soloFeatures Gene GeneFull --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &> /dev/null
-
-PCTFWD=`grep "Reads Mapped to GeneFull: Unique GeneFull" test_forward/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}'`
-PCTREV=`grep "Reads Mapped to GeneFull: Unique GeneFull" test_reverse/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}'`
-
-if (( $PCTREV > $PCTFWD )) 
-then
-  STRAND=Reverse
-fi
-
-if (( $PCTREV < 50 && $PCTFWD < 50)) 
-then
-  >&2 echo "WARNING: Low percentage of reads mapping to GeneFull: forward = $PCTFWD , reverse = $PCTREV"
-fi 
-
-## finally, if paired-end experiment turned out to be 3' (yes, they do exist!), process it as single-end: 
-if [[ $STRAND == "Forward" && $PAIRED == "True" ]]
-then
-  PAIRED=False
-fi
-
-## write a file in the sample dir too, these metrics are not crucial but useful 
-echo "Done setting up the STARsolo run; here are final processing options:"
-echo "============================================================================="
-echo "Sample: $TAG" | tee strand.txt
-echo "Paired-end mode: $PAIRED" | tee -a strand.txt
-echo "Strand (Forward = 3', Reverse = 5'): $STRAND, %reads mapped to GeneFull: forward = $PCTFWD , reverse = $PCTREV" | tee -a strand.txt
-echo "CB whitelist: $BC, matches out of 200,000: $NBC3 (v3), $NBC2 (v2), $NBC1 (v1), $NBCA (multiome) " | tee -a strand.txt
-echo "CB length: $CBLEN" | tee -a strand.txt
-echo "UMI length: $UMILEN" | tee -a strand.txt
-echo "GZIP: $GZIP" | tee -a strand.txt
-echo "-----------------------------------------------------------------------------" | tee -a strand.txt
-echo "Read 1 files: $R1" | tee -a strand.txt
-echo "-----------------------------------------------------------------------------" | tee -a strand.txt
-echo "Read 2 files: $R2" | tee -a strand.txt
-echo "-----------------------------------------------------------------------------" | tee -a strand.txt
-
-if [[ $PAIRED == "True" ]]
-then
-  ## note the R1/R2 order of input fastq reads and --soloStrand Forward for 5' paired-end experiment
-  $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \
-     --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
-     --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
-     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM
-else 
-  $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
-     --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
-     --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
-     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM
-fi
-
-## index the BAM file
-if [[ -s Aligned.sortedByCoord.out.bam ]]
-then
-  $CMD samtools index -@16 Aligned.sortedByCoord.out.bam
-fi
-
-## max-CR bzip all unmapped reads with multicore pbzip2 
-$CMD pbzip2 -9 Unmapped.out.mate1 &
-$CMD pbzip2 -9 Unmapped.out.mate2 &
-wait
-
-## remove test files 
-rm -rf test.R?.fastq test_forward test_reverse
-
-cd output
-for i in Gene/raw Gene/filtered GeneFull/raw GeneFull/filtered Velocyto/raw Velocyto/filtered
-do 
-  cd $i; for j in *; do gzip $j & done
-  cd ../../
-done
-
-wait
-echo "ALL DONE!"