diff --git a/.github/workflows/quay-on-tag.yml b/.github/workflows/quay-on-tag.yml new file mode 100644 index 0000000..67603c0 --- /dev/null +++ b/.github/workflows/quay-on-tag.yml @@ -0,0 +1,50 @@ +name: Build & Push Docker image on tag creation +on: + push: + tags: + - '*' + +concurrency: + group: docker-${{ github.ref }} + cancel-in-progress: true + +jobs: + docker: + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Login to Quay + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + + - name: Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: quay.io/cellgeni/starsolo + tags: | + type=ref,event=tag + # push "latest" only for tags WITHOUT hyphens (excludes pre-releases like v1.0-beta): + type=raw,value=latest,enable=${{ !contains(github.ref_name, '-') }} + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . + file: Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: | + type=registry,ref=quay.io/cellgeni/starsolo:cache + type=gha + cache-to: type=registry,ref=quay.io/cellgeni/starsolo:cache,mode=max + \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..d3795c1 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,57 @@ +name: "Test Workflow" +on: + push: + branches: [main, dev] + paths-ignore: + - 'data/whitelists.tar.gz' + - 'README.md' + - 'LICENSE' + - 'scripts/**' + - 'Dockerfile' + pull_request: + branches: [main, dev] + +jobs: + test: + runs-on: ubuntu-latest + container: quay.io/cellgeni/starsolo:base + strategy: + fail-fast: false + matrix: + include: + - technology: 10x + - technology: dropseq + - technology: indrops + - technology: smartseq + - technology: rhapsody + - technology: strt + name: "Run tests for ${{ matrix.technology }}" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Unpack test data + run: tar -xzf data/test.tar.gz + + - name: Run tests + run: | + set -e + technology=${{ matrix.technology }} + if [ "$technology" = "smartseq" ]; then + bin/starsolo smartseq \ + data/test/fastqs/smartseq/smartseq_example.manifest.tsv \ + --ref data/test/reference/index \ + --no-bam \ + --cpus 1 + else + bin/starsolo $technology \ + data/test/fastqs/${technology} \ + "$technology" \ + --ref data/test/reference/index \ + --whitelist-dir data/test/whitelists \ + --no-bam \ + --cpus 1 + fi + + diff --git a/Dockerfile b/Dockerfile index 6612e08..2603f4d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,14 +3,14 @@ FROM ubuntu:20.04 ENV DEBIAN_FRONTEND=noninteractive ARG star_version=2.7.10a_alpha_220818 -ARG samtools_version=1.15.1 +ARG samtools_version=1.21 ARG bbmap_version=38.97 -ARG rsem_version=1.3.3 +ARG seqtk_version=1.4 #Install OS packages RUN apt-get update && apt-get -y --no-install-recommends -qq install \ wget gcc build-essential software-properties-common libz-dev \ - git libncurses5-dev libbz2-dev liblzma-dev default-jre bsdmainutils + git libncurses5-dev libbz2-dev liblzma-dev default-jre bsdmainutils pbzip2 #Install STAR RUN wget --no-check-certificate https://github.com/alexdobin/STAR/archive/${star_version}.tar.gz && \ @@ -20,10 +20,11 @@ RUN wget --no-check-certificate https://github.com/alexdobin/STAR/archive/${star cd / && rm ${star_version}.tar.gz #Install seqtk -RUN git clone https://github.com/lh3/seqtk.git && \ - mv seqtk /opt && \ - cd /opt/seqtk && \ - make +RUN wget --no-check-certificate https://github.com/lh3/seqtk/archive/refs/tags/v${seqtk_version}.tar.gz && \ + tar -xzf v${seqtk_version}.tar.gz -C /opt && \ + cd /opt/seqtk-${seqtk_version} && \ + make && \ + cd / && rm v${seqtk_version}.tar.gz #Install samtools RUN wget https://github.com/samtools/samtools/releases/download/${samtools_version}/samtools-${samtools_version}.tar.bz2 && \ @@ -41,19 +42,29 @@ RUN wget https://sourceforge.net/projects/bbmap/files/BBMap_${bbmap_version}.tar ./stats.sh in=resources/phix174_ill.ref.fa.gz && \ cd / && rm BBMap_${bbmap_version}.tar.gz -#Install RSEM -RUN wget https://github.com/deweylab/RSEM/archive/refs/tags/v${rsem_version}.tar.gz && \ - tar -xzf v${rsem_version}.tar.gz -C /opt && \ - cd /opt/RSEM-${rsem_version} && \ - make && \ - cd / && rm v${rsem_version}.tar.gz +# Install this repository +COPY . /opt/STARsolo +RUN cd /opt/STARsolo &&\ + rm data/test.tar.gz &&\ + tar -xzf data/whitelists.tar.gz &&\ + ./install.sh -ENV PATH="${PATH}:/opt/STAR-${star_version}/source:/opt/seqtk:/opt/bbmap:/opt/RSEM-${rsem_version}" +# Set PATH to include all binaries +ENV STARSOLO_WL_DIR=/opt/STARsolo/data/whitelists +ENV PATH="/opt/STARsolo/bin:${PATH}:/opt/STAR-${star_version}/source:/opt/seqtk-${seqtk_version}:/opt/bbmap" #Saving Software Versions to a file RUN echo "STAR version: ${star_version}" >> versions.txt && \ echo "samtools version: ${samtools_version}" >> versions.txt && \ echo "BBMap version: ${bbmap_version}" >> versions.txt && \ - echo "RSEM version: ${rsem_version}" >> versions.txt && \ - seqtk_version=`strings $(which seqtk) | grep 'Version:' | cut -f 2 -d " "` && \ echo "seqtk version: ${seqtk_version}" >> versions.txt + +COPY Dockerfile /docker/ +RUN chmod -R 755 /docker + + + +# Default entrypoint: run the CLI +WORKDIR /workdir +ENTRYPOINT ["starsolo"] +CMD ["--help"] \ No newline at end of file diff --git a/README.md b/README.md index 4407f52..fceac68 100644 --- a/README.md +++ b/README.md @@ -1,148 +1,256 @@ -# Wrapper scripts for `STARsolo` scRNA-seq pipeline +# `starsolo` — unified CLI for STARsolo scRNA-seq processing -These are the scripts used for CellGenIT for uniform processing of scRNA-seq - both 10X and quite a few other types (see below for supported platforms). All listed methods use [STAR](https://github.com/alexdobin/STAR) aligner to align reads to the reference genome. +A single command-line tool that wraps [STAR](https://github.com/alexdobin/STAR) in `STARsolo` mode for uniform processing of scRNA-seq data across multiple platforms. -## Software installation +## Supported platforms -### STAR version +| Subcommand | Platform | Barcode type | Chemistry auto-detection | +|:--|:--|:--|:-:| +| `10x` | 10x Genomics (v1 – v4, multiome) | CB_UMI_Simple | ✅ | +| `smartseq` | Smart-seq / Smart-seq2 | SmartSeq (plate-based, no UMIs) | — | +| `dropseq` | Drop-seq | CB_UMI_Simple (no whitelist) | — | +| `rhapsody` | BD Rhapsody | CB_UMI_Complex (3 segments) | — | +| `indrops` | inDrops | CB_UMI_Complex (2 segments + adapter) | — | +| `strt` | STRT-seq | CB_UMI_Simple (96-barcode list) | — | +| `qc` | Aggregate QC stats across runs | — | — | -`STAR` of version 2.7.9a or above is recommended (2.7.10a is the latest and greatest, as of August'22). The newest update includes the ability to correctly process multi-mapping reads, and adds many important options and bug fixes. +## Installation -In order to use settings that closely mimic those of `Cell Ranger` v4 or above (see explanations below, particularly `--clipAdapterType CellRanger4` option), `STAR` needs to be re-compiled from source with `make STAR CXXFLAGS_SIMD="-msse4.2"` (see [this issue](https://github.com/alexdobin/STAR/issues/1218) for more info). If you get the _Illegal instruction_ error, that's what you need to do. +### Prerequisites -### Docker +The following tools **must be available in `$PATH`** before running `starsolo`: -We also have a Dockerfile which builds with specific versions of all the tools used. Once the container is built you can do `cat /versions.txt` and it will show the versions of the tools in the container. +| Tool | Tested version | Purpose | +|:--|:--|:--| +| [STAR](https://github.com/alexdobin/STAR) | 2.7.10a | Alignment & quantification | +| [samtools](https://github.com/samtools/samtools) | 1.15.1 | BAM indexing | +| [seqtk](https://github.com/lh3/seqtk) | 1.3+ | Read subsampling (10x chemistry detection) | +| [pbzip2](https://launchpad.net/pbzip2) | 1.1+ | Compressing unmapped reads | +| [BBMap](https://sourceforge.net/projects/bbmap/) | 38.97 | Adapter trimming (optional, via `bbduk.sh`) | -## Reference genome and annotation +> **Tip:** The included [Dockerfile](Dockerfile) builds all of the above into a single container image. -The issues of reference genome and annotation used for **mouse** and **human** scRNA-seq experiments are described [here](https://www.singlecellcourse.org/processing-raw-scrna-seq-sequencing-data-from-reads-to-a-count-matrix.html) in some detail. +STAR must be compiled with `make STAR CXXFLAGS_SIMD="-msse4.2"` to support `--clipAdapterType CellRanger4` (see [STAR#1218](https://github.com/alexdobin/STAR/issues/1218)). -Briefly, +### Install `starsolo` CLI - - there are several standard annotation versions used by `Cell Ranger`; They are referred to as versions `1.2.0`, `2.1.0`, `3.0.0`, and `2020-A`; - - the references are filtered to remove pseudogenes and small RNAs; exact filtering scripts are available [here](https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build#header); - - number of genes in `Cell Ranger` filtered human reference is about 35k; in full human annotation there are about 60k. +```bash +git clone && cd STARsolo + +# Default: symlinks into ~/.local/bin +./install.sh + +# Or specify a directory: +./install.sh /usr/local/bin + +# Verify +starsolo --version +``` + +The installer creates a single `starsolo` symlink. All library code is resolved relative to the repo, so you can `git pull` to update in-place. + +## Reference genome + +Use a standard STAR genome index. [Cell Ranger-filtered](https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build#header) references are recommended for comparability with Cell Ranger output. + +```bash +STAR --runThreadN 16 --runMode genomeGenerate --genomeDir STAR --genomeFastaFiles $FA --sjdbGTFfile $GTF +``` + +By default, `starsolo` resolves references via `--species`: + +``` +$STARSOLO_REF_BASE//2020A/index +``` + +Override with `--ref /your/path/to/index` on any subcommand, or change `STARSOLO_REF_BASE` in [etc/defaults.conf](etc/defaults.conf). + +### Barcode whitelists + +Download 10x barcode whitelists [from Cell Ranger](https://github.com/10XGenomics/cellranger/tree/master/lib/python/cellranger/barcodes) and place them in the whitelist directory (default: `/nfs/cellgeni/STAR/whitelists`, configurable via `--whitelist-dir` or `STARSOLO_WL_DIR`). + +## Usage + +``` +starsolo [options] +``` + +### Global options + +| Flag | Description | Default | +|:--|:--|:--| +| `-s, --species ` | Species name (`human`, `mouse`, …) — resolves reference automatically | — | +| `-r, --ref ` | Explicit STAR index path (overrides `--species`) | — | +| `-w, --whitelist-dir ` | Barcode whitelist directory | from config | +| `-c, --cpus ` | Number of threads | `16` | +| `--bam` | Output coordinate-sorted BAM | off | +| `--no-bam` | Suppress BAM output (default) | ✅ | +| `-h, --help` | Show help (global or per-platform) | — | +| `--version` | Print version | — | + +### 10x Genomics + +Automatically detects chemistry (v1–v4, multiome), strand specificity, and paired-end mode. + +```bash +starsolo 10x /data/fastqs SAMPLE1 --species human +starsolo 10x /data/fastqs SAMPLE1 --ref /path/to/index --bam +``` + +#### 10x chemistry detection + +The script subsamples 200,000 reads and matches barcodes against all known whitelists: -Once you have downloaded the needed fasta and GTF files, and applied the necessary filtering (see the 10x genomics link above), run the following command using 16 CPUs/64Gb of RAM: +| 10x version | Whitelist | CB len | UMI len | Strand | +|:-:|:-:|:-:|:-:|:-:| +| 3' v1 | 737K-april-2014_rc.txt | 14 | 10 | Forward | +| 3' v2 | 737K-august-2016.txt | 16 | 10 | Forward | +| 3' v3/v3.1 | 3M-february-2018.txt | 16 | 12 | Forward | +| 3' v4 | 3M-3pgex-may-2023.txt | 16 | 12 | Forward | +| 5' v1.1/v2 | 737K-august-2016.txt | 16 | 10 | Reverse | +| 5' v3 | 737K-august-2016.txt | 16 | 12 | Reverse | +| 5' v4 | 3M-5pgex-jan-2023.txt | 16 | 12 | Reverse | +| multiome | 737K-arc-v1.txt | 16 | 12 | Forward | -`STAR --runThreadN 16 --runMode genomeGenerate --genomeDir STAR --genomeFastaFiles $FA --sjdbGTFfile $GTF` +#### Paired-end processing -`STARsolo` reference is not different from a normal `STAR` reference - `STARsolo` workflow is invoked using the options listed below. Thus, it can be run on both `Cell Ranger` filtered and un-filtered index. We use the filtered reference to match the `Cell Ranger` output as closely as possible. +5' libraries with long R1 reads (>50 bp) are automatically processed in paired-end mode with `--soloBarcodeMate 1 --clip5pNbases 39 0`. 3' libraries are always single-end. -### Pre-made reference location +### Smart-seq / Smart-seq2 -All **CellGenIT** pre-made `STAR` references are located in `/nfs/cellgeni/STAR/`. Barcode whitelist files are located in `/nfs/cellgeni/STAR/whitelists`. -## Resource requirements +Requires a **manifest file** — a tab-separated file with three columns: `R1_path`, `R2_path`, `cell_name`. + +```bash +starsolo smartseq manifest.tsv --species mouse +``` -By default, all processing is done using 16 CPUs and 128Gb of RAM. Sanger Farm typically has 8 Gb of RAM per core, so 8 CPUs/64Gb RAM, or 16 CPUs/128Gb RAM is probably optimal. Unfortunately, many datasets fail with OOM error when 64Gb is requested, so we reverted our defaults to 128Gb. +Aliases: `smart-seq`, `ss2` -## Processing scRNA-seq with STARsolo +### Drop-seq -### 10X: reproducing `Cell Ranger` v4 and above (but much faster) +12 bp cell barcode, 8 bp UMI, no whitelist. -The current relevant wrapper script for all 10X scRNA-seq processing is called `starsolo_10x_auto.sh`. The scripts contain *many* options that sometimes change; some of which will be explained below. In general, commands are tuned in such way that the results will be very close to those of `Cell Ranger` v4 and above. +```bash +starsolo dropseq /data/fastqs SAMPLE1 --species human +``` -Before running, barcode whitelists need to be downloaded [from here](https://github.com/10XGenomics/cellranger/tree/master/lib/python/cellranger/barcodes). +Alias: `drop-seq` -Below are the explanations for some of the options (note that 5' experiments of versions v1.1/v2/v3 all use `737K-august-2016.txt` barcode file): - - `--runDirPerm All_RWX` allows a directory readable by all users, which becomes an issue when sharing results on Farm; - - `--soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloUMIlen $UMILEN --soloStrand $STR` are settings that change with the used 10x chemistry: +### BD Rhapsody -
+3 barcode segments + 8 bp UMI. Requires `Rhapsody_bc1/2/3.txt` in the whitelist directory. -| 10X VERSION | BC | UMILEN | STR | -|:-:|:-:|:-:|:-:| -| 3' v1 | 737K-april-2014_rc.txt | 10 | Forward | -| 3' v2 | 737K-august-2016.txt | 10 | Forward | -| 3' v3, v3.1 | 3M-february-2018.txt | 12 | Forward | -| 3' v4 | 3M-3pgex-may-2023.txt | 12 | Forward | -| 5' v1.1, v2 | 737K-august-2016.txt | 10 | Reverse | -| 5' v3 | 737K-august-2016.txt | 12 | Reverse | -| 5' v4 | 3M-5pgex-jan-2023.txt | 12 | Reverse | -| multiome | 737K-arc-v1.txt | 12 | Forward | +```bash +starsolo rhapsody /data/fastqs SAMPLE1 --species human +``` -
+Aliases: `bd-rhapsody`, `bd` - - `--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30` are options that define UMI collapsing, barcode collapsing, and read clipping algorithms that are closest to ones used by `Cell Ranger` v4+; - - `--soloCellFilter EmptyDrops_CR` specifies the cell filtering algorithm used in [EmptyDrops](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html), which is the default algorithm in later versions of `Cell Ranger`; - - `--soloFeatures Gene GeneFull Velocyto` output conventional (exon-only) UMI counts, as well as exon+intron UMI counts (analog of `Cell Ranger` premrna option), as well as matrices preprocessed for `Velocyto`; - - `--soloMultiMappers EM` is to count multimappers (on by default in v3.0+ of our wrapper scripts; does not influence the main output, but creates an additional matrix in `/raw` subdir of `Gene` and `GeneFull`); - - `--readFilesCommand zcat` is used if your input fastq files are gzipped (auto-deteted); - - `--outReadsUnmapped Fastx` to output the unmapped reads that can be used for metagenomic analysis; - - options grouped as `$SORTEDBAM` should be used if you need a genomic bam file; otherwise, use `$NOBAM`. +### inDrops -Actual `STAR` command being run (note the order of read files passed to `--readFilesIn`): +2 barcode segments + adapter + 6 bp UMI. Requires `inDrops_Ambrose2_bc1/2.txt` in the whitelist directory. ```bash -STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \ - --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 \ - --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \ - --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ - --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \ - --soloMultiMappers EM --outReadsUnmapped Fastx +starsolo indrops /data/fastqs SAMPLE1 --species human +starsolo indrops /data/fastqs SAMPLE1 --species human --adapter GAGTGATTGCTTGTGACGCCAA ``` -### Processing 10X datasets in paired-end mode -Although some 3' 10X datasets are sequenced with a "long" R1, usually they are aligned as single-end, since it would take a very long R1 to accommodate for barcode, UMI, TSO + adapter, and a poly-A tail. On the other hand, 5' 10X datasets, especially ones sequenced with 2x150 bp paired-end reads, could be trimmed and aligned as paired-end (this is what `Cell Ranger` does, too). Effective `STARsolo` command for paired-end processing is as follows (note the 5' clipping flag, read input order, and flipped `soloStrand` because of the changed read order): +Default adapter: `GAGTGATTGCTTGTGACGCCTT` + +### STRT-seq + +96-barcode whitelist, 8 bp cell barcode, 8 bp UMI. Requires `96_barcodes.list` in the whitelist directory. ```bash -STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM \ - --soloBarcodeMate 1 --clip5pNbases 39 0 \ - --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN \ - --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \ - --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \ - --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \ - --soloMultiMappers EM --outReadsUnmapped Fastx +starsolo strt /data/fastqs SAMPLE1 --species human --strand Forward ``` -### Using STARsolo for SMART-seq/SMART-seq2 +Extra option: `--strand ` (default: `Forward`) + +Alias: `strt-seq` -For plate-based methods that don't use UMIs (such as [SMART-Seq and SMART-Seq2](https://teichlab.github.io/scg_lib_structs/methods_html/SMART-seq_family.html)), `STARsolo` can be used as well. Fastq files for these methods usually come as separate, paired-end files; all of these should be listed in a *manifest* file - plain text, tab-separated file containing three columns per line: 1) full path to R1; 2) full path to R2; 3) cell name or ID. +### QC aggregation -Example of a script used to process Smart-seq2 data can be found in `/scripts/starsolo_ss2.sh`. Key parameters that could be adjusted are `--outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3`; the higher they are, the less permissive is the alignment (we use default settings usually). Lower values can help you "rescue" a larger proportion of reads with high adapter content (see below for adapter trimming). Actual `STAR` command being run: +Run from the parent directory containing per-sample output folders: ```bash -STAR --runThreadN $CPUS --genomeDir $REF --runDirPerm All_RWX $GZIP $BAM \ - --soloType SmartSeq --readFilesManifest ../$TAG.manifest.tsv \ - --soloUMIdedup Exact --soloStrand Unstranded \ - --limitOutSJcollapsed 10000000 --soloCellFilter None \ - --soloFeatures Gene GeneFull --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx +starsolo qc | column -t ``` -Often, SMART-seq2 reads can benefit from trimming adapters, which can be turned on using `--clip3pAdapterSeq <3' adapter sequence>` option. Alternatively, `/scripts/bbduk.sh` can be used to trim adapters from reads prior to the alignment and quantification (this is the preferred option). +Checks for: +- Incomplete runs (`_STARtmp` still present) +- Barcode cross-contamination between samples +- Low mapping percentages -### Counting the multimapping reads +Outputs a tab-separated table with read counts, mapping rates, cell counts, and configuration for each sample. -Default approach used by `Cell Ranger` (and `STARsolo` scripts above) is to discard all reads that map to multiple genomic locations with equal mapping quality. This approach creates a bias in gene expression estimation. Pseudocount-based methods correctly quantify multimapping reads, but generate false counts due to pseudo-alignment errors. These issues are described in good detail [here](https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1). +## Configuration -If you would like to process multimappers, add the following options: `--soloMultiMappers Uniform EM` (or simply `--soloMultiMappers EM`; this is on by default in v3.0+ of these scripts). This will generate an extra matrix in the `/raw` output folders. There will be non-integer numbers in the matrix because of split reads. If the downstream processing requires integers, you can round with a tool of your liking (e.g. `awk`). +Edit [etc/defaults.conf](etc/defaults.conf) to change default paths and parameters. All values can also be set as environment variables: -As of `STAR` v2.7.10a, **multimapper counting still does not work for SMART-seq2 or bulk RNA-seq processing**. +```bash +export STARSOLO_REF_BASE=/my/references +export STARSOLO_WL_DIR=/my/whitelists +export STARSOLO_CPUS=8 +export STARSOLO_BAM_MODE=bam +``` + +CLI flags always take precedence over environment variables, which take precedence over config file defaults. + +## Helper scripts -### Running STARsolo on other scRNA-seq platforms +| Script | Purpose | +|:--|:--| +| [scripts/bbduk_trim.sh](scripts/bbduk_trim.sh) | Adapter/polyA trimming via BBMap | +| [scripts/bsub_submit.sh](scripts/bsub_submit.sh) | Submit any starsolo command as an LSF job | -STARsolo is very flexible and can be used with almost any scRNA-seq method, provided you know the library structure - i.e. where cell barcodes, UMIs, and biological parts of the read are located in the sequencing fragment or reads. A great source of information about scRNA-seq library structures is [this page](https://teichlab.github.io/scg_lib_structs/). +### LSF submission + +```bash +./scripts/bsub_submit.sh starsolo 10x /data/fastqs SAMPLE1 --species human +``` -Currently, our scripts directory provides dedicated scripts for: - - SMART-seq/SMART-seq2; - - Drop-seq; - - inDrops; - - Microwell-seq; - - BD Rhapsody; - - CEL-seq; - - STRT-seq. +This submits with 16 CPUs / 64 GB RAM to the `normal` queue. -Please contact `CellGenIT` if you need to process an unusual dataset. +## Resource requirements -## Quick evaluation of multiple STARsolo runs +Default: **16 CPUs, 64–128 GB RAM**. Some datasets require 128 GB; if jobs fail with OOM errors, increase RAM in your job submission. -If you've used these scripts to process multiple 10X samples, you can get a quick look at the results by copying `solo_QC.sh` script from this repo to the directory with `STARsolo` output folders, and running +## Docker + +The included [Dockerfile](Dockerfile) builds all required tools: ```bash -./solo_QC.sh | column -t +docker build -t starsolo . +docker run -v /data:/data starsolo starsolo 10x /data/fastqs SAMPLE1 --species human +``` + +## Project structure + +``` +STARsolo/ +├── bin/ +│ └── starsolo # CLI entrypoint (add to PATH) +├── lib/ +│ ├── common.sh # Shared functions (FASTQ discovery, compression, post-processing) +│ ├── platform_10x.sh # 10x Genomics logic +│ ├── platform_smartseq.sh # Smart-seq2 logic +│ ├── platform_dropseq.sh # Drop-seq logic +│ ├── platform_rhapsody.sh # BD Rhapsody logic +│ ├── platform_indrops.sh # inDrops logic +│ └── platform_strt.sh # STRT-seq logic +├── etc/ +│ └── defaults.conf # Default configuration +├── scripts/ +│ ├── solo_qc.sh # QC aggregation +│ ├── bbduk_trim.sh # Adapter trimming helper +│ └── bsub_submit.sh # LSF job submission helper +├── Dockerfile +├── install.sh +├── LICENSE +└── README.md ``` -The script is designed for 10X or other droplet-based methods. For SMART-seq2, if processing was done per plate, it will also produce an informative output. +## License + +See [LICENSE](LICENSE). diff --git a/bin/starsolo b/bin/starsolo new file mode 100755 index 0000000..1dc4aa8 --- /dev/null +++ b/bin/starsolo @@ -0,0 +1,279 @@ +#!/bin/bash -e +# ============================================================================= +# starsolo — unified CLI for STARsolo scRNA-seq processing +# ============================================================================= +# Version 4.0 +# +# Usage: starsolo [args…] +# starsolo --help +# +# Platforms: 10x, smartseq, dropseq, rhapsody, indrops, strt, qc +# ============================================================================= + +set -euo pipefail + +STARSOLO_VERSION="4.0" + +# ---------- Resolve install paths ------------------------------------------- + +# STARSOLO_ROOT is the repo root (parent of bin/). +# Works whether called via symlink, PATH, or direct path. +_resolve_root() { + local SOURCE="${BASH_SOURCE[0]}" + while [[ -L "$SOURCE" ]]; do + local DIR + DIR=$(cd -P "$(dirname "$SOURCE")" && pwd) + SOURCE=$(readlink "$SOURCE") + [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" + done + cd -P "$(dirname "$SOURCE")/.." && pwd +} + +STARSOLO_ROOT=$(_resolve_root) + +# ---------- Load shared code ------------------------------------------------ + +source "$STARSOLO_ROOT/etc/defaults.conf" +source "$STARSOLO_ROOT/lib/common.sh" +source "$STARSOLO_ROOT/lib/platform_10x.sh" +source "$STARSOLO_ROOT/lib/platform_smartseq.sh" +source "$STARSOLO_ROOT/lib/platform_dropseq.sh" +source "$STARSOLO_ROOT/lib/platform_rhapsody.sh" +source "$STARSOLO_ROOT/lib/platform_indrops.sh" +source "$STARSOLO_ROOT/lib/platform_strt.sh" + +# ---------- Top-level help --------------------------------------------------- + +show_main_help() { + cat < [options] + starsolo qc + starsolo --help | --version + +Platforms: + 10x 10x Genomics (auto-detects chemistry, strand, PE/SE) + smartseq Smart-seq / Smart-seq2 (plate-based, manifest file) + dropseq Drop-seq (12 bp CB, 8 bp UMI, no whitelist) + rhapsody BD Rhapsody (3 barcode segments, 8 bp UMI) + indrops inDrops (2 barcode segments + adapter, 6 bp UMI) + strt STRT-seq (96-barcode whitelist, 8 bp CB, 8 bp UMI) + qc Aggregate QC stats across completed runs + +Global options (available for all platforms): + -s, --species Species (e.g. human, mouse); resolves reference + -r, --ref Explicit STAR genome index (overrides --species) + -w, --whitelist-dir Barcode whitelist directory + -c, --cpus Number of threads [default: ${STARSOLO_CPUS}] + --bam Output sorted BAM + --no-bam No BAM output (default) + -h, --help Show help (per-platform if after platform name) + --version Print version + +Configuration: + Defaults are read from $STARSOLO_ROOT/etc/defaults.conf + Override with environment variables (STARSOLO_CPUS, STARSOLO_REF_BASE, …) + or with CLI flags. + +Examples: + starsolo 10x /data/fastqs SAMPLE1 --species human + starsolo 10x /data/fastqs SAMPLE1 --ref /path/to/index --bam + starsolo smartseq manifest.tsv --species mouse + starsolo dropseq /data/fastqs SAMPLE1 --species human + starsolo rhapsody /data/fastqs SAMPLE1 --species human + starsolo indrops /data/fastqs SAMPLE1 --species human --adapter GAGTGATTGCTTGTGACGCCAA + starsolo strt /data/fastqs SAMPLE1 --species human --strand Reverse + starsolo qc +EOF +} + +# ---------- Option parsing helpers ------------------------------------------ + +# Parses the common option set shared by all droplet-based platforms. +# After parsing, the following variables are set: +# FQDIR, TAG, REF, WL, CPUS, BAM, plus platform-specific extras. +# The function shifts positional args (FQDIR, TAG) off $@ and processes flags. + +# Generic parser for platforms that take: [options] +# Sets: FQDIR TAG REF WL CPUS BAM and any extra vars needed. +parse_droplet_args() { + local PLATFORM=$1; shift + + # Defaults from config + CPUS="$STARSOLO_CPUS" + WL="$STARSOLO_WL_DIR" + BAM_MODE="$STARSOLO_BAM_MODE" + REF="" + SPECIES="" + FQDIR="" + TAG="" + + # Platform-specific defaults + ADAPTER="GAGTGATTGCTTGTGACGCCTT" # indrops + STRAND="Forward" # strt + + # Consume positional args first (up to 2) + local POSITIONALS=() + local ALL_ARGS=("$@") + local i=0 + + while (( i < ${#ALL_ARGS[@]} )); do + case "${ALL_ARGS[$i]}" in + -*) break ;; + *) POSITIONALS+=("${ALL_ARGS[$i]}"); i=$((i + 1)) ;; + esac + done + + # Remaining are flags + local FLAGS=() + if (( i < ${#ALL_ARGS[@]} )); then + FLAGS=("${ALL_ARGS[@]:$i}") + fi + + # For smartseq we only need 1 positional; others need 2 + if [[ $PLATFORM == "smartseq" ]]; then + if (( ${#POSITIONALS[@]} < 1 )); then + "show_${PLATFORM}_help" + exit 1 + fi + FQDIR="${POSITIONALS[0]}" # actually TSV for smartseq + else + if (( ${#POSITIONALS[@]} < 2 )); then + "show_${PLATFORM}_help" + exit 1 + fi + FQDIR="${POSITIONALS[0]}" + TAG="${POSITIONALS[1]}" + fi + + # Parse flags + local j=0 + while (( j < ${#FLAGS[@]} )); do + case "${FLAGS[$j]}" in + -s|--species) + SPECIES="${FLAGS[$((j+1))]}" + j=$((j + 2)) ;; + -r|--ref) + REF="${FLAGS[$((j+1))]}" + j=$((j + 2)) ;; + -w|--whitelist-dir) + WL="${FLAGS[$((j+1))]}" + j=$((j + 2)) ;; + -c|--cpus) + CPUS="${FLAGS[$((j+1))]}" + j=$((j + 2)) ;; + --bam) + BAM_MODE="bam" + j=$((j + 1)) ;; + --no-bam) + BAM_MODE="nobam" + j=$((j + 1)) ;; + --adapter) + ADAPTER="${FLAGS[$((j+1))]}" + j=$((j + 2)) ;; + --strand) + STRAND="${FLAGS[$((j+1))]}" + j=$((j + 2)) ;; + -h|--help) + "show_${PLATFORM}_help" + exit 0 ;; + *) + die "Unknown option: ${FLAGS[$j]}. Run 'starsolo $PLATFORM --help'." ;; + esac + done + + # Resolve REF from species if not explicitly given + if [[ -z "$REF" ]]; then + [[ -n "$SPECIES" ]] || die "Either --species or --ref must be provided." + REF=$(resolve_reference "$SPECIES" "$STARSOLO_REF_BASE") || exit 1 + fi + + BAM=$(resolve_bam_flags "$BAM_MODE") +} + +# ---------- QC subcommand --------------------------------------------------- + +run_qc() { + local QC_SCRIPT="$STARSOLO_ROOT/scripts/solo_qc.sh" + if [[ ! -x "$QC_SCRIPT" ]]; then + die "QC script not found: $QC_SCRIPT" + fi + exec bash "$QC_SCRIPT" "$@" +} + +# ---------- Dispatch -------------------------------------------------------- + +main() { + if (( $# == 0 )); then + show_main_help + exit 0 + fi + + case "$1" in + --help|-h) + show_main_help + exit 0 ;; + --version) + echo "starsolo v${STARSOLO_VERSION}" + exit 0 ;; + esac + + local PLATFORM="$1"; shift + + case "$PLATFORM" in + 10x) + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + show_10x_help; exit 0 + fi + parse_droplet_args "10x" "$@" + run_10x "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" + ;; + smartseq|smart-seq|ss2) + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + show_smartseq_help; exit 0 + fi + parse_droplet_args "smartseq" "$@" + run_smartseq "$FQDIR" "$CPUS" "$REF" "$BAM" + ;; + dropseq|drop-seq) + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + show_dropseq_help; exit 0 + fi + parse_droplet_args "dropseq" "$@" + run_dropseq "$FQDIR" "$TAG" "$CPUS" "$REF" "$BAM" + ;; + rhapsody|bd-rhapsody|bd) + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + show_rhapsody_help; exit 0 + fi + parse_droplet_args "rhapsody" "$@" + run_rhapsody "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" + ;; + indrops) + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + show_indrops_help; exit 0 + fi + parse_droplet_args "indrops" "$@" + run_indrops "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" "$ADAPTER" + ;; + strt|strt-seq) + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + show_strt_help; exit 0 + fi + parse_droplet_args "strt" "$@" + run_strt "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" "$STRAND" + ;; + qc) + run_qc "$@" + ;; + *) + die "Unknown platform: '$PLATFORM'. Run 'starsolo --help' for usage." + ;; + esac +} + +if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then + main "$@" +fi diff --git a/data/test.tar.gz b/data/test.tar.gz new file mode 100644 index 0000000..de599e0 Binary files /dev/null and b/data/test.tar.gz differ diff --git a/data/whitelists.tar.gz b/data/whitelists.tar.gz new file mode 100644 index 0000000..707d753 Binary files /dev/null and b/data/whitelists.tar.gz differ diff --git a/etc/defaults.conf b/etc/defaults.conf new file mode 100644 index 0000000..7f3c633 --- /dev/null +++ b/etc/defaults.conf @@ -0,0 +1,26 @@ +# ============================================================================= +# STARsolo CLI - Default Configuration +# ============================================================================= +# This file is sourced by all starsolo subcommands. +# Override any value via CLI flags or by editing this file. +# +# Environment variables with the STARSOLO_ prefix will override these defaults +# (e.g. export STARSOLO_CPUS=8). +# ============================================================================= + +# --- Compute resources --- +STARSOLO_CPUS="${STARSOLO_CPUS:-16}" + +# --- Reference genome base directory --- +# Platform scripts resolve the full path as: $STARSOLO_REF_BASE//2020A/index +STARSOLO_REF_BASE="${STARSOLO_REF_BASE:-/nfs/cellgeni/STAR}" + +# --- Barcode whitelist directory --- +STARSOLO_WL_DIR="${STARSOLO_WL_DIR:-/nfs/cellgeni/STAR/whitelists}" + +# --- BAM output (default: no BAM) --- +# Set to "bam" to produce a sorted BAM by default. +STARSOLO_BAM_MODE="${STARSOLO_BAM_MODE:-nobam}" + +# --- Adapter reference for bbduk --- +STARSOLO_BBDUK_ADAPTERS="${STARSOLO_BBDUK_ADAPTERS:-/software/cellgen/cellgeni/bbmap/resources/adapters.fa}" diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..6cf473d --- /dev/null +++ b/install.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI — installer +# ============================================================================= +# Adds the starsolo command to your PATH by symlinking into a bin directory. +# +# Usage: +# ./install.sh # installs to ~/.local/bin (default) +# ./install.sh /usr/local/bin # installs to /usr/local/bin (needs sudo) +# PREFIX=~/tools ./install.sh # installs to ~/tools/bin +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +TARGET_DIR="${1:-${PREFIX:+${PREFIX}/bin}}" +TARGET_DIR="${TARGET_DIR:-$HOME/.local/bin}" + +mkdir -p "$TARGET_DIR" + +STARSOLO_BIN="$SCRIPT_DIR/bin/starsolo" + +# Make everything executable +chmod +x "$STARSOLO_BIN" +chmod +x "$SCRIPT_DIR"/lib/*.sh 2>/dev/null || true +chmod +x "$SCRIPT_DIR"/scripts/*.sh 2>/dev/null || true + +# Create symlink +ln -sf "$STARSOLO_BIN" "$TARGET_DIR/starsolo" + +echo "Installed starsolo → $TARGET_DIR/starsolo" + +# Check if target is in PATH +if ! echo "$PATH" | tr ':' '\n' | grep -qx "$TARGET_DIR"; then + echo "" + echo "NOTE: $TARGET_DIR is not in your PATH." + echo "Add it by appending this line to your ~/.bashrc or ~/.bash_profile:" + echo "" + echo " export PATH=\"$TARGET_DIR:\$PATH\"" + echo "" +fi + +echo "" +echo "Verify with: starsolo --version" diff --git a/lib/common.sh b/lib/common.sh new file mode 100755 index 0000000..660e24f --- /dev/null +++ b/lib/common.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI - Shared library functions +# ============================================================================= +# Sourced by bin/starsolo and all platform scripts. Never executed directly. +# ============================================================================= + +# ---------- Logging -------------------------------------------------------- + +log_info() { echo "[INFO] $*" >&2; } +log_warn() { echo "[WARN] $*" >&2; } +log_error() { echo "[ERROR] $*" >&2; } +die() { log_error "$@"; exit 1; } + +# ---------- FASTQ file discovery ------------------------------------------- + +# Finds paired-end FASTQ files based on common naming conventions. +# $1 : FQDIR – directory containing FASTQ files +# $2 : TAG – sample identifier +# Prints: "R1_FILES|R2_FILES" (comma-separated within each group) +find_fastq_files() { + local FQDIR=$1 TAG=$2 + local R1="" R2="" + + if [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R2_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_r2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + else + die "No appropriate FASTQ files found for sample '$TAG' in '$FQDIR'. Check file names and directory." + fi + + echo "$R1|$R2" +} + +# ---------- Compression detection ------------------------------------------ + +# Detects GZIP/BZIP2 compression for a sample's FASTQ files. +# $1 : FQDIR +# $2 : TAG +# Prints: "STAR_FLAG|DECOMPRESS_CMD" +check_compression() { + local FQDIR=$1 TAG=$2 + local STAR_FLAG="" ZCMD="cat" + + if [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.gz$") != "" ]]; then + STAR_FLAG="--readFilesCommand zcat" + ZCMD="zcat" + elif [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.bz2$") != "" ]]; then + STAR_FLAG="--readFilesCommand bzcat" + ZCMD="bzcat" + fi + + echo "$STAR_FLAG|$ZCMD" +} + +# ---------- Post-alignment processing -------------------------------------- + +# Compresses outputs and optionally indexes BAM. +# $1 : OUTPUT_DIR – STARsolo output directory (e.g. "output/") +# No singularity; tools expected in PATH. +process_output_files() { + local OUTPUT_DIR=${1:-output} + + # Index BAM if it exists + if [[ -s "Aligned.sortedByCoord.out.bam" ]]; then + log_info "Indexing BAM file …" + samtools index -@16 Aligned.sortedByCoord.out.bam + fi + + # Compress unmapped reads + if [[ -s "Unmapped.out.mate1" ]]; then + pbzip2 -9 Unmapped.out.mate1 & + fi + if [[ -s "Unmapped.out.mate2" ]]; then + pbzip2 -9 Unmapped.out.mate2 & + fi + wait + + # Gzip all matrix / barcode / feature files + find "$OUTPUT_DIR" -name "*.tsv" -exec gzip {} + 2>/dev/null + find "$OUTPUT_DIR" -name "*.mtx" -exec gzip {} + 2>/dev/null + + wait + log_info "ALL DONE!" +} + +# ---------- BAM flag builder ----------------------------------------------- + +# Resolves the BAM flags for STAR depending on --bam / --no-bam. +# $1 : "bam" or "nobam" +# Prints the STAR flags string. +resolve_bam_flags() { + local MODE=$1 + if [[ $MODE == "bam" ]]; then + echo "--outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 500 --limitBAMsortRAM 60000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB CR CY UR UY GX GN" + else + echo "--outSAMtype None" + fi +} + +# ---------- Reference path resolver ----------------------------------------- + +# Resolves a STAR reference index path from species name. +# $1 : species (e.g. "human", "mouse") +# $2 : REF_BASE directory +# Prints absolute path and validates it exists. +resolve_reference() { + local SPECIES=$1 REF_BASE=$2 + local REF="${REF_BASE}/${SPECIES}/2020A/index" + + if [[ ! -d "$REF" ]]; then + log_error "Reference directory not found: $REF" + return 1 + fi + echo "$REF" +} + +# ---------- Input validation ------------------------------------------------ + +validate_fqdir() { + local FQDIR=$1 + if [[ ! -d "$FQDIR" ]]; then + log_error "FASTQ directory does not exist: $FQDIR" + return 1 + fi + readlink -f "$FQDIR" +} + +validate_tag() { + local TAG=$1 + if [[ -z "$TAG" ]]; then + log_error "Sample TAG must not be empty." + return 1 + fi + echo "$TAG" +} diff --git a/lib/platform_10x.sh b/lib/platform_10x.sh new file mode 100755 index 0000000..c9abc2e --- /dev/null +++ b/lib/platform_10x.sh @@ -0,0 +1,290 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI - 10x Genomics platform +# ============================================================================= +# Automatic chemistry detection (v1 / v2 / v3 / v4 / multiome), +# strand-specificity detection, and paired-end support. +# ============================================================================= + +# ---------- 10x-specific helpers ------------------------------------------- + +# Extracts a random subsample of reads for chemistry testing. +# $1: R1 (comma-sep) $2: R2 (comma-sep) $3: ZCMD +_10x_extract_test_reads() { + local R1=$1 R2=$2 ZCMD=$3 + local COUNT=0 + + for i in $(echo "$R1" | tr ',' ' '); do + $ZCMD "$i" | head -4000000 > "$COUNT.R1_head" & + COUNT=$((COUNT + 1)) + done + wait + + COUNT=0 + for i in $(echo "$R2" | tr ',' ' '); do + $ZCMD "$i" | head -4000000 > "$COUNT.R2_head" & + COUNT=$((COUNT + 1)) + done + wait + + cat *.R1_head | seqtk sample -s100 - 200000 > test.R1.fastq & + cat *.R2_head | seqtk sample -s100 - 200000 > test.R2.fastq & + wait + rm -f *.R1_head *.R2_head +} + +# Determines the correct barcode whitelist. +# $1: WL directory +# Prints: "BC|NBC1|NBC2|NBC3|NBC4|NBC5|NBCA|R1LEN|R2LEN|R1DIS" +_10x_determine_whitelist() { + local WL=$1 + + local NBC1 NBC2 NBC3 NBC4 NBC5 NBCA + NBC1=$(awk 'NR%4==2' test.R1.fastq | cut -c-14 | grep -F -f "$WL/737K-april-2014_rc.txt" | wc -l) + NBC2=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/737K-august-2016.txt" | wc -l) + NBC3=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/3M-february-2018.txt" | wc -l) + NBC4=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/3M-3pgex-may-2023.txt" | wc -l) + NBC5=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/3M-5pgex-jan-2023.txt" | wc -l) + NBCA=$(awk 'NR%4==2' test.R1.fastq | cut -c-16 | grep -F -f "$WL/737K-arc-v1.txt" | wc -l) + + local R1LEN R2LEN R1DIS + R1LEN=$(awk 'NR%4==2' test.R1.fastq | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}') + R2LEN=$(awk 'NR%4==2' test.R2.fastq | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}') + R1DIS=$(awk 'NR%4==2' test.R1.fastq | awk '{print length($0)}' | sort | uniq -c | wc -l) + + local BC="" + if (( NBC3 > 50000 )); then BC="$WL/3M-february-2018.txt" + elif (( NBC2 > 50000 )); then BC="$WL/737K-august-2016.txt" + elif (( NBCA > 50000 )); then BC="$WL/737K-arc-v1.txt" + elif (( NBC1 > 50000 )); then BC="$WL/737K-april-2014_rc.txt" + elif (( NBC4 > 50000 )); then BC="$WL/3M-3pgex-may-2023.txt" + elif (( NBC5 > 50000 )); then BC="$WL/3M-5pgex-jan-2023.txt" + else + die "No whitelist matched 200,000 random barcodes! Counts: v1=$NBC1 v2=$NBC2 v3=$NBC3 v4-3p=$NBC4 v4-5p=$NBC5 multiome=$NBCA" + fi + + echo "$BC|$NBC1|$NBC2|$NBC3|$NBC4|$NBC5|$NBCA|$R1LEN|$R2LEN|$R1DIS" +} + +# Checks read lengths and sets barcode/UMI parameters. +# $1: R1DIS $2: R1LEN $3: R2LEN $4: BC $5: WL +# Prints: "PAIRED|CBLEN|UMILEN" +_10x_check_read_lengths() { + local R1DIS=$1 R1LEN=$2 R2LEN=$3 BC=$4 WL=$5 + local PAIRED=False CBLEN UMILEN + + if (( R1DIS > 1 && R1LEN <= 30 )); then + die "Read 1 (barcode) has varying length; possibly quality-trimmed." + elif (( R1LEN < 24 )); then + die "Read 1 (barcode) is less than 24 bp. Check FASTQ files." + elif (( R2LEN < 40 )); then + die "Read 2 (biological read) is less than 40 bp. Check FASTQ files." + fi + + (( R1LEN > 50 )) && PAIRED=True + + case "$BC" in + *3M-february-2018.txt|*737K-arc-v1.txt|*3M-3pgex-may-2023.txt|*3M-5pgex-jan-2023.txt) + CBLEN=16; UMILEN=12 ;; + *737K-august-2016.txt) + CBLEN=16; UMILEN=10 ;; + *737K-april-2014_rc.txt) + CBLEN=14; UMILEN=10 ;; + esac + + local BCUMI=$((CBLEN + UMILEN)) + if (( BCUMI > R1LEN )); then + local NEWUMI=$((R1LEN - CBLEN)) + log_warn "R1 length ($R1LEN) < barcode+UMI ($BCUMI). Setting UMI length to $NEWUMI." + UMILEN=$NEWUMI + elif (( BCUMI < R1LEN )); then + log_warn "R1 length ($R1LEN) > barcode+UMI ($BCUMI)." + fi + + echo "$PAIRED|$CBLEN|$UMILEN" +} + +# Determines strand specificity via test alignments. +# $1: REF $2: CBLEN $3: UMILEN $4: CPUS $5: BC $6: PAIRED +# Prints: "STRAND|PCTFWD|PCTREV|PAIRED" +_10x_determine_strand() { + local REF=$1 CBLEN=$2 UMILEN=$3 CPUS=$4 BC=$5 PAIRED=$6 + local STRAND=Forward + + log_info "Running test alignment (Forward) …" + STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq \ + --runDirPerm All_RWX --outSAMtype None \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 \ + --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \ + --soloStrand Forward --genomeLoad LoadAndKeep \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ + --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \ + --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ + --soloFeatures Gene GeneFull \ + --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &>/dev/null + + log_info "Running test alignment (Reverse) …" + STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq \ + --runDirPerm All_RWX --outSAMtype None \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 \ + --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \ + --soloStrand Reverse --genomeLoad LoadAndKeep \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ + --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \ + --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ + --soloFeatures Gene GeneFull \ + --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &>/dev/null + + local PCTFWD PCTREV + PCTFWD=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_forward/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}') + PCTREV=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_reverse/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}') + + (( PCTREV > PCTFWD )) && STRAND=Reverse + + if (( PCTREV < 50 && PCTFWD < 50 )); then + log_warn "Low GeneFull mapping: forward=${PCTFWD}%, reverse=${PCTREV}%" + fi + + # 3' paired-end experiments should still be processed as single-end + [[ $STRAND == "Forward" && $PAIRED == "True" ]] && PAIRED=False + + echo "$STRAND|$PCTFWD|$PCTREV|$PAIRED" +} + +# Writes a human-readable configuration summary. +_10x_write_config() { + local TAG=$1 PAIRED=$2 STRAND=$3 PCTFWD=$4 PCTREV=$5 + local BC=$6 NBC1=$7 NBC2=$8 NBC3=$9 NBCA=${10} NBC4=${11} NBC5=${12} + local CBLEN=${13} UMILEN=${14} GZIP=${15} R1=${16} R2=${17} + + log_info "STARsolo 10x run configuration:" + { + echo "=============================================================================" + echo "Sample: $TAG" + echo "Paired-end mode: $PAIRED" + echo "Strand (Forward=3', Reverse=5'): $STRAND (%fwd=$PCTFWD, %rev=$PCTREV)" + echo "CB whitelist: $BC" + echo " Matches /200k: v3=$NBC3 v2=$NBC2 v1=$NBC1 v4-3p=$NBC4 v4-5p=$NBC5 multiome=$NBCA" + echo "CB length: $CBLEN UMI length: $UMILEN" + echo "Compression: ${GZIP:-none}" + echo "-----------------------------------------------------------------------------" + echo "R1: $R1" + echo "R2: $R2" + echo "=============================================================================" + } | tee strand.txt +} + +# ---------- Main 10x workflow ----------------------------------------------- + +# Entry point called by bin/starsolo. +# Arguments come from the CLI parser. +run_10x() { + local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6 + + local FQDIR_ABS + FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1 + TAG=$(validate_tag "$TAG") || exit 1 + + # Convert paths to absolute before cd + REF=$(readlink -f "$REF") + WL=$(readlink -f "$WL") + + # Create output directory and enter it + mkdir -p "$TAG" && cd "$TAG" || exit + + # 1. Discover FASTQs + log_info "Discovering FASTQ files for sample '$TAG' …" + local fq_info R1 R2 + fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG") + IFS='|' read -r R1 R2 <<< "$fq_info" + + # 2. Compression + local comp_info GZIP ZCMD + comp_info=$(check_compression "$FQDIR_ABS" "$TAG") + IFS='|' read -r GZIP ZCMD <<< "$comp_info" + + # 3. Chemistry detection + log_info "Subsampling reads for chemistry detection …" + _10x_extract_test_reads "$R1" "$R2" "$ZCMD" + + log_info "Determining barcode whitelist …" + local wl_info BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS + wl_info=$(_10x_determine_whitelist "$WL") + IFS='|' read -r BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS <<< "$wl_info" + + # 4. Read length checks + local rl_info PAIRED CBLEN UMILEN + rl_info=$(_10x_check_read_lengths "$R1DIS" "$R1LEN" "$R2LEN" "$BC" "$WL") + IFS='|' read -r PAIRED CBLEN UMILEN <<< "$rl_info" + + # 5. Strand specificity + log_info "Determining strand specificity …" + local strand_info STRAND PCTFWD PCTREV + strand_info=$(_10x_determine_strand "$REF" "$CBLEN" "$UMILEN" "$CPUS" "$BC" "$PAIRED") + IFS='|' read -r STRAND PCTFWD PCTREV PAIRED <<< "$strand_info" + + # 6. Log configuration + _10x_write_config "$TAG" "$PAIRED" "$STRAND" "$PCTFWD" "$PCTREV" \ + "$BC" "$NBC1" "$NBC2" "$NBC3" "$NBCA" "$NBC4" "$NBC5" \ + "$CBLEN" "$UMILEN" "$GZIP" "$R1" "$R2" + + # 7. Run STAR + local SOLOFILENAMES="output/" + log_info "Running STARsolo alignment …" + + if [[ $PAIRED == "True" ]]; then + STAR --runThreadN "$CPUS" --genomeDir "$REF" \ + --readFilesIn "$R1" "$R2" --runDirPerm All_RWX $GZIP $BAM \ + --soloBarcodeMate 1 --clip5pNbases 39 0 \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloCBstart 1 \ + --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \ + --soloStrand Forward \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ + --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \ + --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ + --soloFeatures Gene GeneFull Velocyto \ + --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx \ + --soloMultiMappers EM --outReadsUnmapped Fastx + else + STAR --runThreadN "$CPUS" --genomeDir "$REF" \ + --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 \ + --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \ + --soloStrand "$STRAND" \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ + --soloUMIfiltering MultiGeneUMI_CR --soloCellFilter EmptyDrops_CR \ + --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ + --soloFeatures Gene GeneFull Velocyto \ + --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx \ + --soloMultiMappers EM --outReadsUnmapped Fastx + fi + + # 8. Cleanup & compress + rm -rf test.R?.fastq test_forward test_reverse + process_output_files "$SOLOFILENAMES" +} + +# ---------- Subcommand help ------------------------------------------------- + +show_10x_help() { + cat <<'EOF' +Usage: starsolo 10x [options] + +Automatically detects 10x chemistry (v1–v4, multiome), strand specificity, +and paired-end vs single-end mode. + +Required: + Directory containing FASTQ files + Sample identifier (used to match FASTQ filenames) + +Options: + -s, --species Species name (e.g. human, mouse). + Resolves reference via $REF_BASE//2020A/index + -r, --ref Explicit STAR reference index (overrides --species) + -w, --whitelist-dir Barcode whitelist directory [default: from config] + -c, --cpus Number of threads [default: 16] + --bam Output sorted BAM file + --no-bam Do not output BAM (default) + -h, --help Show this help +EOF +} diff --git a/lib/platform_dropseq.sh b/lib/platform_dropseq.sh new file mode 100755 index 0000000..4aa7e87 --- /dev/null +++ b/lib/platform_dropseq.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI - Drop-seq platform +# ============================================================================= +# CB_UMI_Simple with no whitelist (12 bp CB, 8 bp UMI). +# ============================================================================= + +run_dropseq() { + local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 BAM=$5 + + local FQDIR_ABS + FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1 + TAG=$(validate_tag "$TAG") || exit 1 + + # Convert to absolute path before cd + REF=$(readlink -f "$REF") + + mkdir -p "$TAG" && cd "$TAG" || exit + + # Discover FASTQs + local fq_info R1 R2 + fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG") + IFS='|' read -r R1 R2 <<< "$fq_info" + + # Compression + local comp_info GZIP ZCMD + comp_info=$(check_compression "$FQDIR_ABS" "$TAG") + IFS='|' read -r GZIP ZCMD <<< "$comp_info" + + log_info "Running STARsolo (Drop-seq) for '$TAG' …" + + STAR --runThreadN "$CPUS" --genomeDir "$REF" \ + --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \ + --soloType CB_UMI_Simple --soloCBwhitelist None \ + --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13 --soloUMIlen 8 \ + --soloBarcodeReadLength 0 \ + --soloFeatures Gene GeneFull \ + --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \ + --outReadsUnmapped Fastx + + process_output_files "output" +} + +show_dropseq_help() { + cat <<'EOF' +Usage: starsolo dropseq [options] + +Process Drop-seq data (12 bp cell barcode, 8 bp UMI, no whitelist). + +Required: + Directory containing FASTQ files + Sample identifier + +Options: + -s, --species Species name (resolves reference automatically) + -r, --ref Explicit STAR reference index (overrides --species) + -c, --cpus Number of threads [default: 16] + --bam Output sorted BAM file + --no-bam Do not output BAM (default) + -h, --help Show this help +EOF +} diff --git a/lib/platform_indrops.sh b/lib/platform_indrops.sh new file mode 100755 index 0000000..c33c07b --- /dev/null +++ b/lib/platform_indrops.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI - inDrops platform +# ============================================================================= +# CB_UMI_Complex with 2 barcode segments, adapter sequence, and 6 bp UMI. +# ============================================================================= + +run_indrops() { + local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6 ADAPTER=$7 + + local FQDIR_ABS + FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1 + TAG=$(validate_tag "$TAG") || exit 1 + + local BC1="$WL/inDrops_Ambrose2_bc1.txt" + local BC2="$WL/inDrops_Ambrose2_bc2.txt" + + for f in "$BC1" "$BC2"; do + [[ -f "$f" ]] || die "inDrops barcode file not found: $f" + done + + # Convert to absolute paths before cd + REF=$(readlink -f "$REF") + BC1=$(readlink -f "$BC1") + BC2=$(readlink -f "$BC2") + + mkdir -p "$TAG" && cd "$TAG" || exit + + # Discover FASTQs + local fq_info R1 R2 + fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG") + IFS='|' read -r R1 R2 <<< "$fq_info" + + # Compression + local comp_info GZIP ZCMD + comp_info=$(check_compression "$FQDIR_ABS" "$TAG") + IFS='|' read -r GZIP ZCMD <<< "$comp_info" + + log_info "Running STARsolo (inDrops) for '$TAG' (adapter=$ADAPTER) …" + + STAR --runThreadN "$CPUS" --genomeDir "$REF" \ + --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \ + --soloType CB_UMI_Complex --soloCBwhitelist "$BC1" "$BC2" \ + --soloAdapterSequence "$ADAPTER" --soloAdapterMismatchesNmax 3 \ + --soloCBmatchWLtype 1MM \ + --soloCBposition 0_0_2_-1 3_1_3_8 \ + --soloUMIposition 3_9_3_14 \ + --soloFeatures Gene GeneFull \ + --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \ + --outReadsUnmapped Fastx + + process_output_files "output" +} + +show_indrops_help() { + cat <<'EOF' +Usage: starsolo indrops [options] + +Process inDrops data (2 barcode segments + adapter + 6 bp UMI). + +Requires inDrops barcode whitelist files (inDrops_Ambrose2_bc1/2.txt) +in the whitelist directory. + +Required: + Directory containing FASTQ files + Sample identifier + +Options: + -s, --species Species name (resolves reference automatically) + -r, --ref Explicit STAR reference index (overrides --species) + -w, --whitelist-dir Barcode whitelist directory [default: from config] + -c, --cpus Number of threads [default: 16] + --adapter Adapter sequence + [default: GAGTGATTGCTTGTGACGCCTT] + --bam Output sorted BAM file + --no-bam Do not output BAM (default) + -h, --help Show this help +EOF +} diff --git a/lib/platform_rhapsody.sh b/lib/platform_rhapsody.sh new file mode 100755 index 0000000..9357c7e --- /dev/null +++ b/lib/platform_rhapsody.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI - BD Rhapsody platform +# ============================================================================= +# CB_UMI_Complex with 3 barcode segments and 8 bp UMI. +# ============================================================================= + +run_rhapsody() { + local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6 + + local FQDIR_ABS + FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1 + TAG=$(validate_tag "$TAG") || exit 1 + + local BC1="$WL/Rhapsody_bc1.txt" + local BC2="$WL/Rhapsody_bc2.txt" + local BC3="$WL/Rhapsody_bc3.txt" + + for f in "$BC1" "$BC2" "$BC3"; do + [[ -f "$f" ]] || die "Rhapsody barcode file not found: $f" + done + + # Convert to absolute paths before cd + REF=$(readlink -f "$REF") + BC1=$(readlink -f "$BC1") + BC2=$(readlink -f "$BC2") + BC3=$(readlink -f "$BC3") + + mkdir -p "$TAG" && cd "$TAG" || exit + + # Discover FASTQs + local fq_info R1 R2 + fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG") + IFS='|' read -r R1 R2 <<< "$fq_info" + + # Compression + local comp_info GZIP ZCMD + comp_info=$(check_compression "$FQDIR_ABS" "$TAG") + IFS='|' read -r GZIP ZCMD <<< "$comp_info" + + log_info "Running STARsolo (BD Rhapsody) for '$TAG' …" + + STAR --runThreadN "$CPUS" --genomeDir "$REF" \ + --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \ + --soloType CB_UMI_Complex --soloCBwhitelist "$BC1" "$BC2" "$BC3" \ + --soloUMIlen 8 \ + --soloCBmatchWLtype 1MM \ + --soloCBposition 0_0_0_8 0_21_0_29 0_43_0_51 \ + --soloUMIposition 0_52_0_59 \ + --soloFeatures Gene GeneFull \ + --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \ + --outReadsUnmapped Fastx + + process_output_files "output" +} + +show_rhapsody_help() { + cat <<'EOF' +Usage: starsolo rhapsody [options] + +Process BD Rhapsody data (3 barcode segments, 8 bp UMI). + +Requires Rhapsody barcode whitelist files (Rhapsody_bc1/2/3.txt) +in the whitelist directory. + +Required: + Directory containing FASTQ files + Sample identifier + +Options: + -s, --species Species name (resolves reference automatically) + -r, --ref Explicit STAR reference index (overrides --species) + -w, --whitelist-dir Barcode whitelist directory [default: from config] + -c, --cpus Number of threads [default: 16] + --bam Output sorted BAM file + --no-bam Do not output BAM (default) + -h, --help Show this help +EOF +} diff --git a/lib/platform_smartseq.sh b/lib/platform_smartseq.sh new file mode 100755 index 0000000..595a368 --- /dev/null +++ b/lib/platform_smartseq.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI - Smart-seq / Smart-seq2 platform +# ============================================================================= +# Plate-based, no UMIs. Requires a manifest TSV file. +# ============================================================================= + +run_smartseq() { + local TSV=$1 CPUS=$2 REF=$3 BAM=$4 + + [[ -s "$TSV" ]] || die "Manifest file not found or empty: $TSV" + + local TAG + TAG=$(basename "${TSV%%.manifest.tsv}") + TAG=$(basename "${TAG%%.tsv}") # fallback if named differently + [[ -n "$TAG" ]] || die "Could not derive sample name from manifest filename." + + # Convert to absolute paths before cd + TSV=$(readlink -f "$TSV") + REF=$(readlink -f "$REF") + + # Create output directory + mkdir -p "$TAG" || exit + + # Convert manifest paths to absolute and write to output directory + local NEW_MANIFEST="$TAG/manifest.tsv" + while IFS=$'\t' read -r r1 r2 cell; do + r1=$(readlink -f "$r1") + r2=$(readlink -f "$r2") + echo -e "$r1\t$r2\t$cell" + done < "$TSV" > "$NEW_MANIFEST" + + cd "$TAG" || exit + TSV="manifest.tsv" + + # Compression detection from manifest content + local GZIP="" + if grep -qP "\.gz\t" "$TSV"; then + GZIP="--readFilesCommand zcat" + fi + + log_info "Running STARsolo (Smart-seq) for '$TAG' …" + + STAR --runThreadN "$CPUS" --genomeDir "$REF" --runDirPerm All_RWX $GZIP $BAM \ + --soloType SmartSeq --readFilesManifest "$TSV" \ + --soloUMIdedup Exact --soloStrand Unstranded \ + --limitOutSJcollapsed 10000000 --soloCellFilter None \ + --soloFeatures Gene GeneFull \ + --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \ + --outReadsUnmapped Fastx + + rm -f "$NEW_MANIFEST" + process_output_files "output" +} + +show_smartseq_help() { + cat <<'EOF' +Usage: starsolo smartseq [options] + +Process Smart-seq / Smart-seq2 data using a manifest file. +The manifest is a tab-separated file with 3 columns: + 1) Full path to R1 2) Full path to R2 3) Cell name/ID + +The sample name is derived from the manifest filename (strips .manifest.tsv). + +Required: + Path to the manifest TSV file + +Options: + -s, --species Species name (resolves reference automatically) + -r, --ref Explicit STAR reference index (overrides --species) + -c, --cpus Number of threads [default: 16] + --bam Output sorted BAM file + --no-bam Do not output BAM (default) + -h, --help Show this help +EOF +} diff --git a/lib/platform_strt.sh b/lib/platform_strt.sh new file mode 100755 index 0000000..ea3e8b7 --- /dev/null +++ b/lib/platform_strt.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# ============================================================================= +# STARsolo CLI - STRT-seq platform +# ============================================================================= +# CB_UMI_Simple with a 96-barcode whitelist (8 bp CB, 8 bp UMI). +# NOTE: R1 is the biological read; R2 carries the barcode in STRT-seq. +# ============================================================================= + +run_strt() { + local FQDIR=$1 TAG=$2 CPUS=$3 REF=$4 WL=$5 BAM=$6 STRAND=$7 + + local FQDIR_ABS + FQDIR_ABS=$(validate_fqdir "$FQDIR") || exit 1 + TAG=$(validate_tag "$TAG") || exit 1 + + local BC="$WL/96_barcodes.list" + [[ -f "$BC" ]] || die "STRT-seq barcode file not found: $BC" + + # Convert to absolute paths before cd + REF=$(readlink -f "$REF") + BC=$(readlink -f "$BC") + + local CBLEN=8 + local UMILEN=8 + + mkdir -p "$TAG" && cd "$TAG" || exit + + # Discover FASTQs + local fq_info R1 R2 + fq_info=$(find_fastq_files "$FQDIR_ABS" "$TAG") + IFS='|' read -r R1 R2 <<< "$fq_info" + + # Compression + local comp_info GZIP ZCMD + comp_info=$(check_compression "$FQDIR_ABS" "$TAG") + IFS='|' read -r GZIP ZCMD <<< "$comp_info" + + log_info "Running STARsolo (STRT-seq) for '$TAG' (strand=$STRAND) …" + + # NOTE: R2 R1 order — R1 is biological read, R2 carries barcode + STAR --runThreadN "$CPUS" --genomeDir "$REF" \ + --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" \ + --soloBarcodeReadLength 0 \ + --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" \ + --soloStrand "$STRAND" \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ + --soloUMIfiltering MultiGeneUMI_CR \ + --limitOutSJcollapsed 10000000 --soloCellFilter None \ + --soloFeatures Gene GeneFull \ + --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx \ + --outReadsUnmapped Fastx + + process_output_files "output" +} + +show_strt_help() { + cat <<'EOF' +Usage: starsolo strt [options] + +Process STRT-seq data (96-barcode whitelist, 8 bp CB, 8 bp UMI). + +Requires 96_barcodes.list in the whitelist directory. + +Required: + Directory containing FASTQ files + Sample identifier + +Options: + -s, --species Species name (resolves reference automatically) + -r, --ref Explicit STAR reference index (overrides --species) + -w, --whitelist-dir Barcode whitelist directory [default: from config] + -c, --cpus Number of threads [default: 16] + --strand Strand specificity [default: Forward] + --bam Output sorted BAM file + --no-bam Do not output BAM (default) + -h, --help Show this help +EOF +} diff --git a/scripts/bbduk_trim.sh b/scripts/bbduk_trim.sh new file mode 100755 index 0000000..f8cc126 --- /dev/null +++ b/scripts/bbduk_trim.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# ============================================================================= +# bbduk.sh — adapter/polyA trimming helper +# ============================================================================= +# Usage: bbduk.sh [adapters_fasta] +# +# Requires BBMap (bbduk.sh) in PATH. +# ============================================================================= + +set -euo pipefail + +TAG=${1:?"Usage: bbduk.sh [adapters_fasta]"} +ADAPTERS=${2:-/software/cellgen/cellgeni/bbmap/resources/adapters.fa} + +bbduk.sh \ + in1="${TAG}_1.fastq.gz" \ + in2="${TAG}_2.fastq.gz" \ + out1="${TAG}.bbduk.R1.fastq" \ + out2="${TAG}.bbduk.R2.fastq" \ + ref="$ADAPTERS" \ + trimpolya=10 ktrim=r k=23 mink=11 hdist=1 tpe tbo \ + &> "${TAG}.bbduk.log" + +echo "[INFO] Trimming complete. Output: ${TAG}.bbduk.R1.fastq / ${TAG}.bbduk.R2.fastq" diff --git a/scripts/bsub_submit.sh b/scripts/bsub_submit.sh new file mode 100755 index 0000000..e59cf9f --- /dev/null +++ b/scripts/bsub_submit.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# ============================================================================= +# bsub.sh — LSF job submission helper +# ============================================================================= +# Wraps any starsolo command in an LSF bsub job. +# +# Usage: bsub.sh +# +# Example: +# ./scripts/bsub.sh starsolo 10x /data/fastqs SAMPLE1 --species human +# ============================================================================= + +set -euo pipefail + +if (( $# < 1 )); then + echo "Usage: bsub.sh [args…]" >&2 + echo "Example: bsub.sh starsolo 10x /data/fastqs SAMPLE1 --species human" >&2 + exit 1 +fi + +SCRIPT="$*" +AA=("$@") +# Use the last argument as the job tag (basename only) +TAG=$(basename "${AA[-1]}") + +GROUP=$(bugroup -w | grep "\b${USER}\b" | cut -d" " -f1) +CPUS=16 +RAM=64000 +QUE="normal" +WDIR=$(pwd) + +bsub -G "$GROUP" \ + -n"$CPUS" \ + -R"span[hosts=1] select[mem>$RAM] rusage[mem=$RAM]" \ + -M"$RAM" \ + -o "$WDIR/$TAG.%J.bsub.log" \ + -e "$WDIR/$TAG.%J.bsub.err" \ + -q "$QUE" \ + $SCRIPT diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py new file mode 100755 index 0000000..4a37ee2 --- /dev/null +++ b/scripts/generate_test_data.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +"""Generate a tiny synthetic dataset for the STARsolo CLI wrapper. + +- Creates a minimal synthetic reference (FASTA + GTF). +- Creates minimal whitelist files with the exact filenames expected by the wrapper. +- Creates gzipped FASTQs for each supported technology: + 10x, dropseq, rhapsody, indrops, strt, smartseq. + +This script uses ONLY the Python standard library. +""" + +from __future__ import annotations + +import argparse +import gzip +import tarfile +from pathlib import Path +from typing import Iterable, Tuple + +DNA_ALPHABET = "ACGT" + + +def base4_dna(n: int, length: int) -> str: + """Deterministic pseudo-random-ish DNA string from an integer.""" + if length <= 0: + return "" + chars = [] + x = n + for _ in range(length): + chars.append(DNA_ALPHABET[x & 3]) + x >>= 2 + if x == 0: + # mix a bit so we don't get long runs of 'A' for small n + x = (n * 1103515245 + 12345) & 0xFFFFFFFF + return "".join(chars) + + +def repeat_to_length(motif: str, length: int) -> str: + if not motif: + raise ValueError("motif must be non-empty") + return (motif * ((length // len(motif)) + 1))[:length] + + +def write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +def write_fastq_gz(path: Path, records: Iterable[Tuple[str, str]]) -> None: + """Write gzipped FASTQ from (name, sequence) pairs.""" + path.parent.mkdir(parents=True, exist_ok=True) + with gzip.open(path, "wt", encoding="utf-8", compresslevel=6) as fh: + for name, seq in records: + qual = "I" * len(seq) + fh.write(f"@{name}\n{seq}\n+\n{qual}\n") + + +def gen_records(prefix: str, mate: int, n: int, seq_fn) -> Iterable[Tuple[str, str]]: + for i in range(n): + yield (f"{prefix}_{i}/{mate}", seq_fn(i)) + + +def make_reference(outdir: Path) -> None: + """Create a synthetic reference with two genes on chrTest.""" + ref_dir = outdir / "reference" + ref_dir.mkdir(parents=True, exist_ok=True) + + # Build a 1500bp contig. Embed two easy-to-hit regions for mapping. + # geneA: 101-500 (1-based, inclusive) contains lots of ACGT repeats + # geneB: 1001-1400 contains lots of TGCATGCA repeats + contig_len = 1500 + seq = list( + repeat_to_length( + "CAGATTTTCATATTATGCAGAAAATCTACTTCGCCTGATACGAGTCGGTTATCTTCGGATACTGTATAGTCCCACCTGGTGATCCTATGCTTGTGAGTAC", + contig_len, + ) + ) + + geneA = repeat_to_length("ACGT", 400) + geneB = repeat_to_length("TGCATGCA", 400) + + # Insert (convert to 0-based slicing) + seq[100:500] = list(geneA) # 101..500 + seq[1000:1400] = list(geneB) # 1001..1400 + + fasta = ">chrTest\n" + "".join(seq) + "\n" + write_text(ref_dir / "test.fa", fasta) + + gtf = "\n".join( + [ + 'chrTest\tsynth\tgene\t101\t500\t.\t+\t.\tgene_id "geneA"; gene_name "geneA";', + 'chrTest\tsynth\ttranscript\t101\t500\t.\t+\t.\tgene_id "geneA"; transcript_id "txA"; gene_name "geneA";', + 'chrTest\tsynth\texon\t101\t500\t.\t+\t.\tgene_id "geneA"; transcript_id "txA"; gene_name "geneA"; exon_number "1";', + 'chrTest\tsynth\tgene\t1001\t1400\t.\t+\t.\tgene_id "geneB"; gene_name "geneB";', + 'chrTest\tsynth\ttranscript\t1001\t1400\t.\t+\t.\tgene_id "geneB"; transcript_id "txB"; gene_name "geneB";', + 'chrTest\tsynth\texon\t1001\t1400\t.\t+\t.\tgene_id "geneB"; transcript_id "txB"; gene_name "geneB"; exon_number "1";', + "", + ] + ) + write_text(ref_dir / "test.gtf", gtf) + + +def make_whitelists(outdir: Path) -> None: + wl = outdir / "whitelists" + wl.mkdir(parents=True, exist_ok=True) + + # 10x whitelists: minimal, but filenames MUST match the wrapper. + write_text(wl / "3M-february-2018.txt", "ACGTTGCAACGTTGCA\n") + write_text(wl / "737K-august-2016.txt", "TTTTTTTTTTTTTTTT\n") + write_text(wl / "737K-arc-v1.txt", "TGCATGCATGCATGCA\n") + write_text(wl / "737K-april-2014_rc.txt", "CCCCCCCCCCCCCC\n") # 14bp + write_text(wl / "3M-3pgex-may-2023.txt", "GGGGGGGGGGGGGGGG\n") + write_text(wl / "3M-5pgex-jan-2023.txt", "AAAAAAAAAAAAAAAA\n") + + # BD Rhapsody + write_text(wl / "Rhapsody_bc1.txt", "ACGTACGTA\n") + write_text(wl / "Rhapsody_bc2.txt", "TGCATGCAT\n") + write_text(wl / "Rhapsody_bc3.txt", "GATCGATCG\n") + + # inDrops + write_text(wl / "inDrops_Ambrose2_bc1.txt", "ACGTACGTACG\n") + write_text(wl / "inDrops_Ambrose2_bc2.txt", "TGCATGCA\n") + + # STRT-seq + write_text(wl / "96_barcodes.list", "AACCGGTT\nTTGGAACC\n") + + +def make_fastqs(outdir: Path, tenx_reads: int) -> None: + fq_root = outdir / "fastqs" + + # Shared cDNA read used across droplet tests (maps into geneA region) + cdna_50 = repeat_to_length("ACGT", 50) + cdna_75 = repeat_to_length("ACGT", 75) + + # 10x + bc_10x = "ACGTTGCAACGTTGCA" # must match 3M-february-2018.txt + + def tenx_r1(i: int) -> str: + return bc_10x + base4_dna(i, 12) + + write_fastq_gz( + fq_root / "10x" / "TENX1_1.fastq.gz", + gen_records("10x", 1, tenx_reads, tenx_r1), + ) + write_fastq_gz( + fq_root / "10x" / "TENX1_2.fastq.gz", + gen_records("10x", 2, tenx_reads, lambda i: cdna_50), + ) + + # Drop-seq (12bp CB + 8bp UMI) + bc_drop = "AACCGGTTAACC" + + def drop_r1(i: int) -> str: + return bc_drop + base4_dna(i, 8) + + write_fastq_gz( + fq_root / "dropseq" / "DROP1_1.fastq.gz", + gen_records("drop", 1, 2000, drop_r1), + ) + write_fastq_gz( + fq_root / "dropseq" / "DROP1_2.fastq.gz", + gen_records("drop", 2, 2000, lambda i: cdna_50), + ) + + # BD Rhapsody (3 barcode segments + 8bp UMI) + bc1, bc2, bc3 = "ACGTACGTA", "TGCATGCAT", "GATCGATCG" + filler1, filler2 = "A" * 12, "A" * 13 + + def rhap_r1(i: int) -> str: + return bc1 + filler1 + bc2 + filler2 + bc3 + base4_dna(i, 8) + + write_fastq_gz( + fq_root / "rhapsody" / "RHAP1_1.fastq.gz", + gen_records("rhap", 1, 1000, rhap_r1), + ) + write_fastq_gz( + fq_root / "rhapsody" / "RHAP1_2.fastq.gz", + gen_records("rhap", 2, 1000, lambda i: cdna_50), + ) + + # inDrops (BC1 + adapter + BC2 + 6bp UMI) + bc1_i, bc2_i = "ACGTACGTACG", "TGCATGCA" + adapter = "GAGTGATTGCTTGTGACGCCTT" # wrapper default + + def ind_r1(i: int) -> str: + return bc1_i + adapter + bc2_i + base4_dna(i, 6) + + write_fastq_gz( + fq_root / "indrops" / "IND1_1.fastq.gz", + gen_records("ind", 1, 1000, ind_r1), + ) + write_fastq_gz( + fq_root / "indrops" / "IND1_2.fastq.gz", + gen_records("ind", 2, 1000, lambda i: cdna_50), + ) + + # STRT-seq (8bp CB + 8bp UMI) — in this wrapper, barcode is in *_1.fastq + bc_strt = "AACCGGTT" # one of 96_barcodes.list + + def strt_r1(i: int) -> str: + return bc_strt + base4_dna(i, 8) + + write_fastq_gz( + fq_root / "strt" / "STRT1_1.fastq.gz", + gen_records("strt", 1, 500, strt_r1), + ) + write_fastq_gz( + fq_root / "strt" / "STRT1_2.fastq.gz", + gen_records("strt", 2, 500, lambda i: cdna_50), + ) + + # Smart-seq (manifest with absolute paths) + smart_dir = fq_root / "smartseq" + smart_dir.mkdir(parents=True, exist_ok=True) + + for cell in ("CELL1", "CELL2"): + write_fastq_gz( + smart_dir / f"{cell}_R1.fastq.gz", + gen_records(cell, 1, 200, lambda i: cdna_75), + ) + write_fastq_gz( + smart_dir / f"{cell}_R2.fastq.gz", + gen_records(cell, 2, 200, lambda i: cdna_75), + ) + + manifest_path = smart_dir / "smartseq_example.manifest.tsv" + cell1_r1 = (smart_dir / "CELL1_R1.fastq.gz").resolve() + cell1_r2 = (smart_dir / "CELL1_R2.fastq.gz").resolve() + cell2_r1 = (smart_dir / "CELL2_R1.fastq.gz").resolve() + cell2_r2 = (smart_dir / "CELL2_R2.fastq.gz").resolve() + + manifest = "\n".join( + [ + f"{cell1_r1}\t{cell1_r2}\tCELL1", + f"{cell2_r1}\t{cell2_r2}\tCELL2", + "", + ] + ) + write_text(manifest_path, manifest) + + +def make_readme(outdir: Path, tenx_reads: int) -> None: + text = f"""# STARsolo synthetic smoke-test dataset + +**Note on 10x:** the wrapper’s chemistry auto-detection requires >50,000 barcode matches in the +subsampled reads. This dataset generates {tenx_reads:,} reads for the 10x sample. + +## Build a STAR index + +```bash +mkdir -p reference/index +STAR --runThreadN 4 --runMode genomeGenerate \\ + --genomeDir reference/index \\ + --genomeFastaFiles reference/test.fa \\ + --sjdbGTFfile reference/test.gtf + + ## Example runs + WL=$(pwd)/whitelists +REF=$(pwd)/reference/index +``` +## Example runs +```bash +starsolo 10x fastqs/10x TENX1 --ref "$REF" --whitelist-dir "$WL" +starsolo dropseq fastqs/dropseq DROP1 --ref "$REF" +starsolo rhapsody fastqs/rhapsody RHAP1 --ref "$REF" --whitelist-dir "$WL" +starsolo indrops fastqs/indrops IND1 --ref "$REF" --whitelist-dir "$WL" +starsolo strt fastqs/strt STRT1 --ref "$REF" --whitelist-dir "$WL" +starsolo smartseq fastqs/smartseq/smartseq_example.manifest.tsv --ref "$REF" +``` +""" + write_text(outdir / "README.md", text) + + +def maybe_tar(outdir: Path, tar_path: Path) -> None: + tar_path.parent.mkdir(parents=True, exist_ok=True) + with tarfile.open(tar_path, "w:gz") as tf: + tf.add(outdir, arcname=outdir.name) + + +def main() -> None: + ap = argparse.ArgumentParser( + description="Generate synthetic STARsolo wrapper test data" + ) + ap.add_argument("--outdir", default="starsolo_synth", help="Output directory") + ap.add_argument( + "--tenx-reads", type=int, default=60000, help="Number of 10x reads (R1/R2)" + ) + ap.add_argument( + "--tar", action="store_true", help="Also create .tar.gz next to outdir" + ) + args = ap.parse_args() + + outdir = Path(args.outdir).resolve() + if outdir.exists() and any(outdir.iterdir()): + raise SystemExit(f"Refusing to write into non-empty directory: {outdir}") + + outdir.mkdir(parents=True, exist_ok=True) + + make_reference(outdir) + make_whitelists(outdir) + make_fastqs(outdir, tenx_reads=args.tenx_reads) + make_readme(outdir, tenx_reads=args.tenx_reads) + + if args.tar: + tar_path = outdir.with_suffix(".tar.gz") + maybe_tar(outdir, tar_path) + print(f"Wrote {outdir} and {tar_path}") + else: + print(f"Wrote {outdir}") + + +if __name__ == "__main__": + main() diff --git a/scripts/bbduk.sh b/scripts/legacy/bbduk.sh similarity index 100% rename from scripts/bbduk.sh rename to scripts/legacy/bbduk.sh diff --git a/scripts/bsub16.sh b/scripts/legacy/bsub16.sh similarity index 100% rename from scripts/bsub16.sh rename to scripts/legacy/bsub16.sh diff --git a/scripts/solo_QC.sh b/scripts/legacy/solo_QC.sh similarity index 100% rename from scripts/solo_QC.sh rename to scripts/legacy/solo_QC.sh diff --git a/scripts/legacy/starsolo_10x_auto.sh b/scripts/legacy/starsolo_10x_auto.sh new file mode 100755 index 0000000..23f69a5 --- /dev/null +++ b/scripts/legacy/starsolo_10x_auto.sh @@ -0,0 +1,437 @@ +#!/bin/bash -e + +## v3.2 of STARsolo wrappers is set up to guess the chemistry automatically +## newest version of the script uses STAR v2.7.10a with EM multimapper processing +## in STARsolo which on by default; the extra matrix can be found in /raw subdir + +# --- Function Definitions --- + +# Finds paired-end FASTQ files based on common naming conventions. +# Args: +# $1: FQDIR - The directory containing FASTQ files. +# $2: TAG - The sample identifier. +# Returns: +# A pipe-separated string "R1_FILES|R2_FILES". +function find_fastq_files() { + local FQDIR=$1 + local TAG=$2 + local R1 + local R2 + + ## Four popular cases: ENA, regular, Cell Ranger, and HRA. + ## The command below will generate a comma-separated list for each read + ## if there are >1 file for each mate. + if [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "R2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R1_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_R2_.*\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + elif [[ $(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q") != "" ]]; then + R1=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_f1\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + R2=$(find "$FQDIR"/* | grep -P "/$TAG[\\/\\._]" | grep "_r2\\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g") + else + >&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR." + exit 1 + fi + + echo "$R1|$R2" +} + +# Checks for GZIP or BZIP2 compression and returns the appropriate command. +# Args: +# $1: FQDIR - The directory containing FASTQ files. +# $2: TAG - The sample identifier. +# Returns: +# A pipe-separated string "GZIP_COMMAND|Z_COMMAND". +function check_compression() { + local FQDIR=$1 + local TAG=$2 + local GZIP="" + local ZCMD="cat" # Default to no compression + + if [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.gz$") != "" ]]; then + GZIP="--readFilesCommand zcat" + ZCMD="zcat" + elif [[ $(find "$FQDIR"/* | grep -P "$TAG[\\/\\._]" | grep "\\.bz2$") != "" ]]; then + GZIP="--readFilesCommand bzcat" + ZCMD="bzcat" + fi + + echo "$GZIP|$ZCMD" +} + +# Extracts a random subsample of reads for testing chemistry. +# Args: +# $1: R1 - Comma-separated list of Read 1 FASTQ files. +# $2: R2 - Comma-separated list of Read 2 FASTQ files. +# $3: ZCMD - The command to decompress files (e.g., zcat, bzcat, cat). +# $4: CMD - Optional prefix command (e.g., for singularity). +function extract_test_reads() { + local R1=$1 + local R2=$2 + local ZCMD=$3 + local CMD=${4:-""} + local COUNT=0 + + for i in $(echo "$R1" | tr ',' ' '); do + $ZCMD "$i" | head -4000000 > "$COUNT.R1_head" & + COUNT=$((COUNT+1)) + done + wait + + COUNT=0 + for i in $(echo "$R2" | tr ',' ' '); do + $ZCMD "$i" | head -4000000 > "$COUNT.R2_head" & + COUNT=$((COUNT+1)) + done + wait + + # Use the same random seed to select corresponding reads from R1 and R2. + cat *.R1_head | $CMD seqtk sample -s100 - 200000 > test.R1.fastq & + cat *.R2_head | $CMD seqtk sample -s100 - 200000 > test.R2.fastq & + wait + rm *.R1_head *.R2_head +} + +# Determines the correct 10x barcode whitelist and read lengths. +# Args: +# $1: WL - Path to the directory containing whitelist files. +# Returns: +# A pipe-separated string: "BC|NBC1|NBC2|NBC3|NBC4|NBC5|NBCA|R1LEN|R2LEN|R1DIS" +function determine_barcode_whitelist() { + local WL=$1 + local BC="" + + # Elucidate the right barcode whitelist to use. + local NBC1=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-14 | grep -F -f "$WL/737K-april-2014_rc.txt" | wc -l) + local NBC2=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/737K-august-2016.txt" | wc -l) + local NBC3=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/3M-february-2018.txt" | wc -l) + local NBC4=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/3M-3pgex-may-2023.txt" | wc -l) + local NBC5=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/3M-5pgex-jan-2023.txt" | wc -l) + local NBCA=$(cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f "$WL/737K-arc-v1.txt" | wc -l) + + # Get average read lengths and distribution. + local R1LEN=$(cat test.R1.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}') + local R2LEN=$(cat test.R2.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}') + local R1DIS=$(cat test.R1.fastq | awk 'NR%4==2' | awk '{print length($0)}' | sort | uniq -c | wc -l) + + if (( NBC3 > 50000 )); then + BC="$WL/3M-february-2018.txt" + elif (( NBC2 > 50000 )); then + BC="$WL/737K-august-2016.txt" + elif (( NBCA > 50000 )); then + BC="$WL/737K-arc-v1.txt" + elif (( NBC1 > 50000 )); then + BC="$WL/737K-april-2014_rc.txt" + elif (( NBC4 > 50000 )); then + BC="$WL/3M-3pgex-may-2023.txt" + elif (( NBC5 > 50000 )); then + BC="$WL/3M-5pgex-jan-2023.txt" + else + >&2 echo "ERROR: No whitelist has matched a random selection of 200,000 barcodes! Match counts: $NBC1 (v1), $NBC2 (v2), $NBC3 (v3), $NBC4 (v4-3p), $NBC5 (v4-5p), $NBCA (multiome)." + exit 1 + fi + + echo "$BC|$NBC1|$NBC2|$NBC3|$NBC4|$NBC5|$NBCA|$R1LEN|$R2LEN|$R1DIS" +} + +# Checks read lengths and determines barcode/UMI parameters. +# Args: +# $1: R1DIS, $2: R1LEN, $3: R2LEN, $4: BC, $5: WL +# Returns: +# A pipe-separated string: "PAIRED|CBLEN|UMILEN" +function check_read_lengths() { + local R1DIS=$1 + local R1LEN=$2 + local R2LEN=$3 + local BC=$4 + local WL=$5 + local PAIRED=False + local CBLEN + local UMILEN + + # Check for potential issues with read lengths. + if (( R1DIS > 1 && R1LEN <= 30 )); then + >&2 echo "ERROR: Read 1 (barcode) has varying length; possibly quality-trimmed. Please check the fastq files." + exit 1 + elif (( R1LEN < 24 )); then + >&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files." + exit 1 + elif (( R2LEN < 40 )); then + >&2 echo "ERROR: Read 2 (biological read) is less than 40 bp in length. Please check the fastq files." + exit 1 + fi + + if (( R1LEN > 50 )); then + PAIRED=True + fi + + if [[ $BC == "$WL/3M-february-2018.txt" || $BC == "$WL/737K-arc-v1.txt" || $BC == "$WL/3M-3pgex-may-2023.txt" || $BC == "$WL/3M-5pgex-jan-2023.txt" ]]; then + CBLEN=16 + UMILEN=12 + elif [[ $BC == "$WL/737K-august-2016.txt" ]]; then + CBLEN=16 + UMILEN=10 + elif [[ $BC == "$WL/737K-april-2014_rc.txt" ]]; then + CBLEN=14 + UMILEN=10 + fi + + # Failsafe for short R1 reads with v3 chemistry. + if (( CBLEN + UMILEN > R1LEN )); then + local NEWUMI=$((R1LEN - CBLEN)) + local BCUMI=$((UMILEN + CBLEN)) + >&2 echo "WARNING: Read 1 length ($R1LEN) is less than the sum of barcode and UMI ($BCUMI). Changing UMI from $UMILEN to $NEWUMI!" + UMILEN=$NEWUMI + elif (( CBLEN + UMILEN < R1LEN )); then + local BCUMI=$((UMILEN + CBLEN)) + >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of barcode and UMI ($BCUMI)." + fi + + echo "$PAIRED|$CBLEN|$UMILEN" +} + + +# Determines strand specificity by running test alignments. +# Args: +# $1: REF, $2: CBLEN, $3: UMILEN, $4: CPUS, $5: BC, $6: PAIRED +# $7: CMD - Optional prefix command. +# Returns: +# A pipe-separated string: "STRAND|PCTFWD|PCTREV|PAIRED" +function determine_strand_specificity() { + local REF=$1 + local CBLEN=$2 + local UMILEN=$3 + local CPUS=$4 + local BC=$5 + local PAIRED=$6 + local CMD=${7:-""} + local STRAND=Forward + + $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) \ + --soloUMIlen "$UMILEN" --soloStrand Forward --genomeLoad LoadAndKeep \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ + --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ + --soloFeatures Gene GeneFull --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &> /dev/null + + $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) \ + --soloUMIlen "$UMILEN" --soloStrand Reverse --genomeLoad LoadAndKeep \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ + --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ + --soloFeatures Gene GeneFull --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &> /dev/null + + local PCTFWD=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_forward/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}') + local PCTREV=$(grep "Reads Mapped to GeneFull: Unique GeneFull" test_reverse/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}') + + if (( PCTREV > PCTFWD )); then + STRAND=Reverse + fi + + if (( PCTREV < 50 && PCTFWD < 50 )); then + >&2 echo "WARNING: Low percentage of reads mapping to GeneFull: forward = $PCTFWD, reverse = $PCTREV" + fi + + # If a paired-end experiment turns out to be 3', process it as single-end. + if [[ $STRAND == "Forward" && $PAIRED == "True" ]]; then + PAIRED=False + fi + + echo "$STRAND|$PCTFWD|$PCTREV|$PAIRED" +} + +# Writes the final run configuration to a log file. +# Args: Takes 15 arguments representing all configuration parameters. +function write_config_file() { + local TAG=$1 + local PAIRED=$2 + local STRAND=$3 + local PCTFWD=$4 + local PCTREV=$5 + local BC=$6 + local NBC1=$7 + local NBC2=$8 + local NBC3=$9 + local NBCA=${10} + local NBC4=${11} + local NBC5=${12} + local CBLEN=${13} + local UMILEN=${14} + local GZIP=${15} + local R1=${16} + local R2=${17} + + echo "Done setting up the STARsolo run; here are final processing options:" + echo "=============================================================================" + ( + echo "Sample: $TAG" + echo "Paired-end mode: $PAIRED" + echo "Strand (Forward = 3', Reverse = 5'): $STRAND, %reads mapped to GeneFull: forward = $PCTFWD , reverse = $PCTREV" + echo "CB whitelist: $BC, matches out of 200,000: $NBC3 (v3), $NBC2 (v2), $NBC1 (v1), $NBC4 (v4-3p), $NBC5 (v4-5p), $NBCA (multiome)" + echo "CB length: $CBLEN" + echo "UMI length: $UMILEN" + echo "GZIP: $GZIP" + echo "-----------------------------------------------------------------------------" + echo "Read 1 files: $R1" + echo "-----------------------------------------------------------------------------" + echo "Read 2 files: $R2" + echo "-----------------------------------------------------------------------------" + ) | tee strand.txt +} + +# Executes the main STARsolo alignment. +# Args: Takes 13 arguments for all STARsolo parameters. +function run_starsolo() { + local PAIRED=$1 + local R1=$2 + local R2=$3 + local GZIP=$4 + local BAM=$5 + local BC=$6 + local CBLEN=$7 + local UMILEN=$8 + local STRAND=$9 + local REF=${10} + local CPUS=${11} + local SOLOFILENAMES=${12} + local CMD=${13:-""} + + if [[ $PAIRED == "True" ]]; then + # Note the R1/R2 order and --soloStrand Forward for 5' paired-end. + $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn "$R1" "$R2" --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloCBstart 1 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" --soloStrand Forward \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ + --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ + --soloFeatures Gene GeneFull Velocyto --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM + else + $CMD STAR --runThreadN "$CPUS" --genomeDir "$REF" --readFilesIn "$R2" "$R1" --runDirPerm All_RWX $GZIP $BAM \ + --soloType CB_UMI_Simple --soloCBwhitelist "$BC" --soloBarcodeReadLength 0 --soloCBlen "$CBLEN" --soloUMIstart $((CBLEN + 1)) --soloUMIlen "$UMILEN" --soloStrand "$STRAND" \ + --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ + --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ + --soloFeatures Gene GeneFull Velocyto --soloOutFileNames "$SOLOFILENAMES" features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM + fi +} + +# Finalizes the output files. +# Args: +# $1: TAG - The sample identifier. +# $2: SOLOFILENAMES - The output directory for solo features. +# $3: CMD - Optional prefix command (e.g., for singularity). +function process_output_files() { + local CMD=${1:-""} + + if [[ -s "Aligned.sortedByCoord.out.bam" ]]; then + $CMD samtools index -@16 Aligned.sortedByCoord.out.bam + fi + + $CMD pbzip2 -9 "Unmapped.out.mate1" & + $CMD pbzip2 -9 "Unmapped.out.mate2" & + wait + + rm -rf test.R?.fastq test_forward test_reverse + + find . -name "*.tsv" -exec gzip {} + + find . -name "*.mtx" -exec gzip {} + + + wait + echo "ALL DONE!" +} + + +# --- Main Workflow Orchestrator --- + +function starsolo_10x() { + # --- 1. Setup and Argument Parsing --- + local FQDIR=$1 + local TAG=$2 + local CPUS=$3 + local REF=$4 + local WL=$5 + local BAM=$6 + local CMD=${7:-""} + + local FQDIR_ABS + FQDIR_ABS=$(readlink -f "$FQDIR") + mkdir -p "$TAG" && cd "$TAG" || exit + + # --- 2. Find FASTQ Files --- + local fastq_files + fastq_files=$(find_fastq_files "$FQDIR_ABS" "$TAG") + local R1 + local R2 + IFS='|' read -r R1 R2 <<< "$fastq_files" + + # --- 3. Determine Compression and Chemistry --- + local compression_info + compression_info=$(check_compression "$FQDIR_ABS" "$TAG") + local GZIP + local ZCMD + IFS='|' read -r GZIP ZCMD <<< "$compression_info" + + extract_test_reads "$R1" "$R2" "$ZCMD" "$CMD" + + local whitelist_info + whitelist_info=$(determine_barcode_whitelist "$WL") + local BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS + IFS='|' read -r BC NBC1 NBC2 NBC3 NBC4 NBC5 NBCA R1LEN R2LEN R1DIS <<< "$whitelist_info" + + # --- 4. Check Reads and Determine Parameters --- + local read_len_info + read_len_info=$(check_read_lengths "$R1DIS" "$R1LEN" "$R2LEN" "$BC" "$WL") + local PAIRED CBLEN UMILEN + IFS='|' read -r PAIRED CBLEN UMILEN <<< "$read_len_info" + + local strand_info + strand_info=$(determine_strand_specificity "$REF" "$CBLEN" "$UMILEN" "$CPUS" "$BC" "$PAIRED" "$CMD") + local STRAND PCTFWD PCTREV + # PAIRED is returned again as the function can modify it + IFS='|' read -r STRAND PCTFWD PCTREV PAIRED <<< "$strand_info" + + # --- 5. Log, Run, and Cleanup --- + write_config_file "$TAG" "$PAIRED" "$STRAND" "$PCTFWD" "$PCTREV" "$BC" "$NBC1" "$NBC2" "$NBC3" "$NBCA" "$NBC4" "$NBC5" "$CBLEN" "$UMILEN" "$GZIP" "$R1" "$R2" + + local SOLOFILENAMES="output/" + run_starsolo "$PAIRED" "$R1" "$R2" "$GZIP" "$BAM" "$BC" "$CBLEN" "$UMILEN" "$STRAND" "$REF" "$CPUS" "$SOLOFILENAMES" "$CMD" + + process_output_files "$CMD" +} + +# --- Script Entrypoint --- + +function main () { + if (( $# != 3 )); then + >&2 echo "Usage: ./starsolo_10x_auto.sh " + >&2 echo "(make sure you set the correct REF, WL, and BAM variables below)" + >&2 echo "Specie can be 'human' or 'mouse' (or any other directory in /nfs/cellgeni/STAR/)" + >&2 echo -e "You need to have the following software installed to run this script: \n- star_version=2.7.10a_alpha_220818\n- samtools_version=1.15.1\n- bbmap_version=38.97\n- rsem_version=1.3.3" + >&2 echo "Use /nfs/cellgeni/singularity/images/reprocess_10x.sif if you work on FARM cluster." + exit 1 + fi + + local FQDIR=$1 + local TAG=$2 + local SPECIE=$3 + + # --- Configuration --- + # Using local variables to keep all state contained. + local SIF="/nfs/cellgeni/singularity/images/reprocess_10x.sif" + local CMD="singularity run --bind /nfs,/lustre,/software $SIF" + local CPUS=16 + local REF="/nfs/cellgeni/STAR/${SPECIE}/2020A/index" + local WL="/nfs/cellgeni/STAR/whitelists" + # Choose one of the two options, depending on whether you need a BAM file + # local BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 500 --limitBAMsortRAM 60000000000 --outSAMunmapped Within --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB CR CY UR UY GX GN" + local BAM="--outSAMtype None --outReadsUnmapped Fastx" + + starsolo_10x "$FQDIR" "$TAG" "$CPUS" "$REF" "$WL" "$BAM" "$CMD" +} + +# This construct ensures that main() is called only when the script is executed directly. +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/scripts/starsolo_bd_rhapsody.sh b/scripts/legacy/starsolo_bd_rhapsody.sh similarity index 100% rename from scripts/starsolo_bd_rhapsody.sh rename to scripts/legacy/starsolo_bd_rhapsody.sh diff --git a/scripts/starsolo_dropseq.sh b/scripts/legacy/starsolo_dropseq.sh similarity index 100% rename from scripts/starsolo_dropseq.sh rename to scripts/legacy/starsolo_dropseq.sh diff --git a/scripts/starsolo_indrops.sh b/scripts/legacy/starsolo_indrops.sh similarity index 100% rename from scripts/starsolo_indrops.sh rename to scripts/legacy/starsolo_indrops.sh diff --git a/scripts/starsolo_ss2.sh b/scripts/legacy/starsolo_ss2.sh similarity index 100% rename from scripts/starsolo_ss2.sh rename to scripts/legacy/starsolo_ss2.sh diff --git a/scripts/starsolo_strt.sh b/scripts/legacy/starsolo_strt.sh similarity index 100% rename from scripts/starsolo_strt.sh rename to scripts/legacy/starsolo_strt.sh diff --git a/scripts/solo_qc.sh b/scripts/solo_qc.sh new file mode 100755 index 0000000..f055d05 --- /dev/null +++ b/scripts/solo_qc.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# ============================================================================= +# solo_qc.sh — Aggregate QC statistics across STARsolo runs +# ============================================================================= +# Run from the parent directory that contains one subdirectory per sample +# (each produced by a starsolo run). +# +# Usage: +# starsolo qc # via the CLI +# ./scripts/solo_qc.sh # standalone +# +# Pipe through `column -t` for pretty-printed output: +# starsolo qc | column -t +# ============================================================================= + +# --- Check completion status ------------------------------------------------ + +>&2 echo "Checking that all STARsolo jobs went to completion …" +for i in */; do + i="${i%/}" + if [[ -d "$i/output" && -s "$i/Log.final.out" ]]; then + if [[ -d "$i/_STARtmp" ]]; then + >&2 echo "WARNING: Sample $i did not run to completion: _STARtmp is still present!" + fi + fi +done + +# --- Check for cross-contamination ----------------------------------------- + +>&2 echo "Checking potential sample cross-contamination …" +for i in */; do + i="${i%/}" + if [[ -s "$i/output/Gene/filtered/barcodes.tsv.gz" ]]; then + for j in */; do + j="${j%/}" + if [[ -s "$j/output/Gene/filtered/barcodes.tsv.gz" && "$i" != "$j" ]]; then + if [[ ! -s "sorted.$i.barcodes.tsv" ]]; then + zcat "$i/output/Gene/filtered/barcodes.tsv.gz" | sort > "sorted.$i.barcodes.tsv" & + fi + if [[ ! -s "sorted.$j.barcodes.tsv" ]]; then + zcat "$j/output/Gene/filtered/barcodes.tsv.gz" | sort > "sorted.$j.barcodes.tsv" & + fi + wait + N1=$(wc -l < "sorted.$i.barcodes.tsv") + N2=$(wc -l < "sorted.$j.barcodes.tsv") + MIN=$(( N1 <= N2 ? N1 : N2 )) + COMM=$(comm -12 "sorted.$i.barcodes.tsv" "sorted.$j.barcodes.tsv" | wc -l) + PCT=$(echo "$COMM" | awk -v v="$MIN" '{printf "%d\n",100*$1/v+0.5}') + if (( PCT >= 20 )); then + >&2 echo "WARNING: Samples $i ($N1 barcodes) and $j ($N2 barcodes) share $COMM ($PCT%) barcodes — higher than expected!" + fi + fi + done + elif [[ -d "$i/output" && -s "$i/Log.out" ]]; then + >&2 echo "Sample $i: no filtered output detected (probably plate-based)." + fi +done + +rm -f sorted.*.barcodes.tsv + +# --- Extract and output statistics ------------------------------------------ + +>&2 echo "Extracting STARsolo stats …" +>&2 echo + +echo -e "Sample\tRd_all\tRd_in_cells\tFrc_in_cells\tUMI_in_cells\tCells\tMed_nFeature\tGood_BC\tWL\tSpecies\tPaired\tStrand\tall_u+m\tall_u\texon_u+m\texon_u\tfull_u+m\tfull_u" + +for i in */; do + i="${i%/}" + + if [[ -d "$i/output/Gene/filtered/" && -s "$i/Log.out" ]]; then + # --- Droplet-based --- + PAIRED="Single" + if grep -q "clip5pNbases 39 0" "$i/Log.out"; then + PAIRED="Paired" + fi + + REF="Other" + if grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/human/"; then + REF="Human" + elif grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/mouse/"; then + REF="Mouse" + fi + + WL="Undef" + if grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "3M-february-2018.txt"; then + WL="v3" + elif grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "737K-august-2016.txt"; then + WL="v2" + elif grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "737K-april-2014_rc.txt"; then + WL="v1" + elif grep "^soloCBwhitelist" "$i/Log.out" | tail -n1 | grep -q "737K-arc-v1.txt"; then + WL="arc" + fi + + R1=$(grep "Number of Reads," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + B=$( grep "Reads With Valid Barcodes," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + G1=$(grep "Reads Mapped to Genome: Unique+Multiple," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + G2=$(grep "Reads Mapped to Genome: Unique," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + E1=$(grep "Reads Mapped to Gene: Unique+Multip.*e Gene," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + E2=$(grep "Reads Mapped to Gene: Unique Gene," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + F1=$(grep "Reads Mapped to GeneFull: Unique+Multip.*e GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + F2=$(grep "Reads Mapped to GeneFull: Unique GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + C=$( grep "Estimated Number of Cells," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + R2=$(grep "Unique Reads in Cells Mapped to GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + CF=$(echo "$R1" | awk -v v="$R2" '{printf "%.3f\n",v/$1}') + R3=$(grep "UMIs in Cells," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + GC=$(grep "Median GeneFull per Cell," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + ST=$(grep "^soloStrand" "$i/Log.out" | grep RE-DEFINED | awk '{print $2}') + + if [[ -z "$ST" ]]; then + ST="Undef" + elif [[ "$PAIRED" == "Paired" ]]; then + ST="Reverse" + fi + + F2PCT=$(echo "$F2" | awk '{printf "%d\n",$1*100+0.5}') + if (( F2PCT <= 20 )); then + >&2 echo "WARNING: Sample $i : GeneFull percentage ($F2PCT) is too low!" + fi + + echo -e "$i\t$R1\t$R2\t$CF\t$R3\t$C\t$GC\t$B\t$WL\t$REF\t$PAIRED\t$ST\t$G1\t$G2\t$E1\t$E2\t$F1\t$F2" + + elif [[ -d "$i/output" && -s "$i/Log.out" ]]; then + # --- Plate-based (Smart-seq2 etc.) --- + PAIRED="Single" + if grep -q "mate 2" "$i/Log.out"; then + PAIRED="Paired" + fi + + REF="Other" + if grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/human/"; then + REF="Human" + elif grep "^genomeDir" "$i/Log.out" | tail -n1 | grep -q "/mouse/"; then + REF="Mouse" + fi + + WL="Undef" + if grep -q "manifest" "$i/Log.out"; then + WL="None(Smart-seq)" + fi + + R1=$(grep "Number of Reads," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + B=$( grep "Reads With Valid Barcodes," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + G1=$(grep "Reads Mapped to Genome: Unique+Multiple," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + G2=$(grep "Reads Mapped to Genome: Unique," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + E1=$(grep "Reads Mapped to Gene: Unique+Multip.*e Gene," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + E2=$(grep "Reads Mapped to Gene: Unique Gene," "$i/output/Gene/Summary.csv" | awk -F "," '{print $2}') + F1=$(grep "Reads Mapped to GeneFull: Unique+Multip.*e GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + F2=$(grep "Reads Mapped to GeneFull: Unique GeneFull," "$i/output/GeneFull/Summary.csv" | awk -F "," '{print $2}') + C=$( zcat "$i/output/Gene/raw/barcodes.tsv.gz" | wc -l) + R2=$(grep "yessubWLmatch_UniqueFeature" "$i/output/GeneFull/Features.stats" | awk '{print $2}') + CF=$(echo "$R1" | awk -v v="$R2" '{printf "%.3f\n",v/$1}') + R3=$(grep "yesUMIs" "$i/output/GeneFull/Features.stats" | awk '{print $2}') + GC=$(zcat "$i/output/GeneFull/raw/matrix.mtx.gz" | awk 'NR>3 {print $2}' | uniq -c | awk '{print $1}' | sort -n | awk '{a[i++]=$1} END {print a[int(i/2)]}') + ST=$(grep "^soloStrand" "$i/Log.out" | grep RE-DEFINED | awk '{print $2}') + + [[ -z "$ST" ]] && ST="Undef" + + F2PCT=$(echo "$F2" | awk '{printf "%d\n",$1*100+0.5}') + if (( F2PCT <= 20 )); then + >&2 echo "WARNING: Sample $i : GeneFull percentage ($F2PCT) is too low!" + fi + + echo -e "$i\t$R1\t$R2\t$CF\t$R3\t$C\t$GC\t$B\t$WL\t$REF\t$PAIRED\t$ST\t$G1\t$G2\t$E1\t$E2\t$F1\t$F2" + fi +done diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh deleted file mode 100755 index 7523abc..0000000 --- a/scripts/starsolo_10x_auto.sh +++ /dev/null @@ -1,291 +0,0 @@ -#!/bin/bash -e - -## v3.2 of STARsolo wrappers is set up to guess the chemistry automatically -## newest version of the script uses STAR v2.7.10a with EM multimapper processing -## in STARsolo which on by default; the extra matrix can be found in /raw subdir - -SIF="/nfs/cellgeni/singularity/images/reprocess_10x.sif" -CMD="singularity run --bind /nfs,/lustre,/software $SIF" - -FQDIR=$1 -TAG=$2 - -if [[ $FQDIR == "" || $TAG == "" ]] -then - >&2 echo "Usage: ./starsolo_10x_auto.sh " - >&2 echo "(make sure you set the correct REF, WL, and BAM variables below)" - exit 1 -fi - -FQDIR=`readlink -f $FQDIR` -CPUS=16 ## typically bsub this into normal queue with 16 cores and 64 Gb RAM. -REF=/nfs/cellgeni/STAR/human/2020A/index ## choose the appropriate reference -WL=/nfs/cellgeni/STAR/whitelists ## directory with all barcode whitelists - -## choose one of the two otions, depending on whether you need a BAM file -#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 500 --limitBAMsortRAM 60000000000 --outSAMunmapped Within --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB CR CY UR UY GX GN" -BAM="--outSAMtype None --outReadsUnmapped Fastx" - -###################################################################### DONT CHANGE OPTIONS BELOW THIS LINE ############################################################################################## - -mkdir $TAG && cd $TAG - -## four popular cases: ENA - _1.fastq/_2.fastq, regular - .R1.fastq/.R2.fastq -## Cell Ranger - _L001_R1_S001.fastq/_L001_R2_S001.fastq, and HRA - _f1.fastq/_r2.fastq -## the command below will generate a comma-separated list for each read if there are >1 file for each mate -## archives (gzip/bzip2) are considered below; both .fastq and .fq should work, too. -R1="" -R2="" -if [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_1\.f.*q"` != "" ]] -then - R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_1\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` - R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_2\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` -elif [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "R1\.f.*q"` != "" ]] -then - R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "R1\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` - R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "R2\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` -elif [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_R1_.*\.f.*q"` != "" ]] -then - R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_R1_.*\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` - R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_R2_.*\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` -elif [[ `find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_f1\.f.*q"` != "" ]] -then - R1=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_f1\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` - R2=`find $FQDIR/* | grep -P "\/$TAG[\/\._]" | grep "_r2\.f.*q" | sort | tr '\n' ',' | sed "s/,$//g"` -else - >&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR." - exit 1 -fi - -## define some key variables, in order to evaluate reads for being 1) gzipped/bzipped/un-archived; -## 2) having barcodes from the whitelist, and which; 3) having consistent length; 4) being single- or paired-end. -GZIP="" -BC="" -NBC1="" -NBC2="" -NBC3="" -NBCA="" -R1LEN="" -R2LEN="" -R1DIS="" -ZCMD="cat" - -## see if the original fastq files are archived: -if [[ `find $FQDIR/* | grep -P "$TAG[\/\._]" | grep "\.gz$"` != "" ]] -then - GZIP="--readFilesCommand zcat" - ZCMD="zcat" -elif [[ `find $FQDIR/* | grep -P "$TAG[\/\._]" | grep "\.bz2$"` != "" ]] -then - GZIP="--readFilesCommand bzcat" - ZCMD="bzcat" -fi - -## we need a small and random selection of reads. the solution below is a result of much trial and error. -## in the end, we select 200k reads that represent all of the files present in the FASTQ dir for this sample. -## have to use numbers because bamtofastq likes to make files with identical names in different folders.. -COUNT=0 -for i in `echo $R1 | tr ',' ' '` -do - $ZCMD $i | head -4000000 > $COUNT.R1_head & - COUNT=$((COUNT+1)) -done -wait - -COUNT=0 -for i in `echo $R2 | tr ',' ' ' ` -do - $ZCMD $i | head -4000000 > $COUNT.R2_head & - COUNT=$((COUNT+1)) -done -wait - -## same random seed makes sure you select same reads from R1 and R2 -cat *.R1_head | $CMD seqtk sample -s100 - 200000 > test.R1.fastq & -cat *.R2_head | $CMD seqtk sample -s100 - 200000 > test.R2.fastq & -wait -rm *.R1_head *.R2_head - -## elucidate the right barcode whitelist to use. Grepping out N saves us some trouble. Note the special list for multiome experiments (737K-arc-v1.txt): -## 50k (out of 200,000) is a modified empirical number - matching only first 14-16 nt makes this more specific -NBC1=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-14 | grep -F -f $WL/737K-april-2014_rc.txt | wc -l` -NBC2=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/737K-august-2016.txt | wc -l` -NBC3=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/3M-february-2018.txt | wc -l` -NBC4=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/3M-3pgex-may-2023.txt | wc -l` -NBC5=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/3M-5pgex-jan-2023.txt | wc -l` -NBCA=`cat test.R1.fastq | awk 'NR%4==2' | cut -c-16 | grep -F -f $WL/737K-arc-v1.txt | wc -l` -R1LEN=`cat test.R1.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'` -R2LEN=`cat test.R2.fastq | awk 'NR%4==2' | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'` -R1DIS=`cat test.R1.fastq | awk 'NR%4==2' | awk '{print length($0)}' | sort | uniq -c | wc -l` - -if (( $NBC3 > 50000 )) -then - BC=$WL/3M-february-2018.txt -elif (( $NBC2 > 50000 )) -then - BC=$WL/737K-august-2016.txt -elif (( $NBCA > 50000 )) -then - BC=$WL/737K-arc-v1.txt -elif (( $NBC1 > 50000 )) -then - BC=$WL/737K-april-2014_rc.txt -elif (( $NBC4 > 50000 )) -then - BC=$WL/3M-3pgex-may-2023.txt -elif (( $NBC5 > 50000 )) -then - BC=$WL/3M-5pgex-jan-2023.txt -else - >&2 echo "ERROR: No whitelist has matched a random selection of 200,000 barcodes! Match counts: $NBC1 (v1), $NBC2 (v2), $NBC3 (v3), $NBC4 (v4-3p), $NBC5 (v4-5p), $NBCA (multiome)." - exit 1 -fi - -## check read lengths, fail if something funky is going on: -PAIRED=False -UMILEN="" -CBLEN="" -if (( $R1DIS > 1 && $R1LEN <= 30 )) -then - >&2 echo "ERROR: Read 1 (barcode) has varying length; possibly someone thought it's a good idea to quality-trim it. Please check the fastq files." - exit 1 -elif (( $R1LEN < 24 )) -then - >&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files." - exit 1 -elif (( $R2LEN < 40 )) -then - >&2 echo "ERROR: Read 2 (biological read) is less than 40 bp in length. Please check the fastq files." - exit 1 -fi - -## assign the necessary variables for barcode/UMI length/paired-end processing. -## script was changed to not rely on read length for the UMIs because of the epic Hassan case -# (v2 16bp barcodes + 10bp UMIs were sequenced to 28bp, effectively removing the effects of the UMIs) -if (( $R1LEN > 50 )) -then - PAIRED=True -fi - -if [[ $BC == "$WL/3M-february-2018.txt" || $BC == "$WL/737K-arc-v1.txt" || $BC == "$WL/3M-3pgex-may-2023.txt" || $BC == "$WL/3M-5pgex-jan-2023.txt" ]] -then - CBLEN=16 - UMILEN=12 -elif [[ $BC == "$WL/737K-august-2016.txt" ]] -then - CBLEN=16 - UMILEN=10 -elif [[ $BC == "$WL/737K-april-2014_rc.txt" ]] -then - CBLEN=14 - UMILEN=10 -fi - -## yet another failsafe! Some geniuses managed to sequence v3 10x with a 26bp R1, which also causes STARsolo grief. This fixes it. -if (( $CBLEN + $UMILEN > $R1LEN )) -then - NEWUMI=$((R1LEN-CBLEN)) - BCUMI=$((UMILEN+CBLEN)) - >&2 echo "WARNING: Read 1 length ($R1LEN) is less than the sum of appropriate barcode and UMI ($BCUMI). Changing UMI setting from $UMILEN to $NEWUMI!" - UMILEN=$NEWUMI -elif (( $CBLEN + $UMILEN < $R1LEN )) -then - BCUMI=$((UMILEN+CBLEN)) - >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)." -fi - -## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment. -## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand -## with higher than 50% probability; 2) if not, run the same quantification with "--soloStand Reverse" and calculate the same stat; -## 3) output a warning, and choose the strand with higher %; 4) if both percentages are below 10, - -STRAND=Forward - -$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \ - --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \ - --soloUMIlen $UMILEN --soloStrand Forward --genomeLoad LoadAndKeep \ - --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ - --soloFeatures Gene GeneFull --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &> /dev/null - -$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \ - --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \ - --soloUMIlen $UMILEN --soloStrand Reverse --genomeLoad LoadAndKeep \ - --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ - --soloFeatures Gene GeneFull --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &> /dev/null - -PCTFWD=`grep "Reads Mapped to GeneFull: Unique GeneFull" test_forward/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}'` -PCTREV=`grep "Reads Mapped to GeneFull: Unique GeneFull" test_reverse/GeneFull/Summary.csv | awk -F "," '{printf "%d\n",$2*100+0.5}'` - -if (( $PCTREV > $PCTFWD )) -then - STRAND=Reverse -fi - -if (( $PCTREV < 50 && $PCTFWD < 50)) -then - >&2 echo "WARNING: Low percentage of reads mapping to GeneFull: forward = $PCTFWD , reverse = $PCTREV" -fi - -## finally, if paired-end experiment turned out to be 3' (yes, they do exist!), process it as single-end: -if [[ $STRAND == "Forward" && $PAIRED == "True" ]] -then - PAIRED=False -fi - -## write a file in the sample dir too, these metrics are not crucial but useful -echo "Done setting up the STARsolo run; here are final processing options:" -echo "=============================================================================" -echo "Sample: $TAG" | tee strand.txt -echo "Paired-end mode: $PAIRED" | tee -a strand.txt -echo "Strand (Forward = 3', Reverse = 5'): $STRAND, %reads mapped to GeneFull: forward = $PCTFWD , reverse = $PCTREV" | tee -a strand.txt -echo "CB whitelist: $BC, matches out of 200,000: $NBC3 (v3), $NBC2 (v2), $NBC1 (v1), $NBCA (multiome) " | tee -a strand.txt -echo "CB length: $CBLEN" | tee -a strand.txt -echo "UMI length: $UMILEN" | tee -a strand.txt -echo "GZIP: $GZIP" | tee -a strand.txt -echo "-----------------------------------------------------------------------------" | tee -a strand.txt -echo "Read 1 files: $R1" | tee -a strand.txt -echo "-----------------------------------------------------------------------------" | tee -a strand.txt -echo "Read 2 files: $R2" | tee -a strand.txt -echo "-----------------------------------------------------------------------------" | tee -a strand.txt - -if [[ $PAIRED == "True" ]] -then - ## note the R1/R2 order of input fastq reads and --soloStrand Forward for 5' paired-end experiment - $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \ - --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \ - --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ - --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM -else - $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \ - --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \ - --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ - --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM -fi - -## index the BAM file -if [[ -s Aligned.sortedByCoord.out.bam ]] -then - $CMD samtools index -@16 Aligned.sortedByCoord.out.bam -fi - -## max-CR bzip all unmapped reads with multicore pbzip2 -$CMD pbzip2 -9 Unmapped.out.mate1 & -$CMD pbzip2 -9 Unmapped.out.mate2 & -wait - -## remove test files -rm -rf test.R?.fastq test_forward test_reverse - -cd output -for i in Gene/raw Gene/filtered GeneFull/raw GeneFull/filtered Velocyto/raw Velocyto/filtered -do - cd $i; for j in *; do gzip $j & done - cd ../../ -done - -wait -echo "ALL DONE!"