From 8df721ec2880b7de2192ddf0ae18ce6e1c163461 Mon Sep 17 00:00:00 2001 From: Guillaume Poirier-Morency Date: Tue, 24 Feb 2026 10:39:03 -0800 Subject: [PATCH 1/6] Allow DEA to be performed on a subset of the samples --- .../main/config/bash_completion.d/gemma-cli | 4 +- .../bash_completion.d/gemma-cli-staging | 4 +- .../fish/completions/gemma-cli-staging.fish | 6 +- .../config/fish/completions/gemma-cli.fish | 6 +- .../DifferentialExpressionAnalysisCli.java | 91 +- .../gemma/cli/completion/CompletionType.java | 1 + .../ubic/gemma/cli/util/EntityLocator.java | 30 +- .../gemma/cli/util/EntityLocatorImpl.java | 84 +- .../DifferentialExpressionAnalysisConfig.java | 15 + .../expression/diff/LinearModelAnalyzer.java | 808 +++++++++--------- .../experiment/ExpressionExperimentDao.java | 4 + .../ExpressionExperimentDaoImpl.java | 34 + .../ExpressionExperimentService.java | 17 + .../ExpressionExperimentServiceImpl.java | 12 + 14 files changed, 662 insertions(+), 454 deletions(-) diff --git a/gemma-cli/src/main/config/bash_completion.d/gemma-cli b/gemma-cli/src/main/config/bash_completion.d/gemma-cli index 8f477c9d26..99aa9ccd32 100644 --- a/gemma-cli/src/main/config/bash_completion.d/gemma-cli +++ b/gemma-cli/src/main/config/bash_completion.d/gemma-cli @@ -251,7 +251,7 @@ function __gemma_cli_complete() { fi if [[ " $words " =~ ' ubic.gemma.apps.DifferentialExpressionAnalysisCli ' ]]; then if ! [[ "$current_option" =~ (--batch-format|--batch-output-file|--batch-report-frequency|--delete-analysis|--delete-subset|--eeListfile|--excludeEEFile|--experiment|--experiment-set|--expressionQuery|--factors|--filter-minimum-number-of-cells-per-gene|--filter-minimum-number-of-cells-per-sample|--filter-minimum-variance|--filter-repetitive-values-minimum-number-of-samples|--filter-repetitive-values-minimum-unique-values|--filter-repetitive-values-mode|--output-dir|--redo-analysis|--redo-subset|--subset|--taxon|--type|-batchFormat|-batchOutputFile|-batchReportFrequency|-d|-deleteAnalysis|-deleteSubset|-e|-eeset|-f|-factors|-filterMinNumberOfCellsPerGene|-filterMinNumberOfCellsPerSample|-filterMinVariance|-filterRepetitiveValuesMinSamples|-filterRepetitiveValuesMinUniqueValues|-filterRepetitiveValuesMode|-mdate|-q|-redoAnalysis|-redoSubset|-subset|-t|-type|-x) ]]; then - mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -subset -t -type -usebatch -x" -- "$2") + mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --samples --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -samples -subset -t -type -usebatch -x" -- "$2") fi if [[ "$current_option" =~ (--batch-output-file|--eeListfile|--excludeEEFile|--output-dir|-batchOutputFile|-d|-f|-x) ]]; then mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -f -- "$2") @@ -259,7 +259,7 @@ function __gemma_cli_complete() { fi if [[ " $words " =~ ' diffExAnalyze ' ]]; then if ! [[ "$current_option" =~ (--batch-format|--batch-output-file|--batch-report-frequency|--delete-analysis|--delete-subset|--eeListfile|--excludeEEFile|--experiment|--experiment-set|--expressionQuery|--factors|--filter-minimum-number-of-cells-per-gene|--filter-minimum-number-of-cells-per-sample|--filter-minimum-variance|--filter-repetitive-values-minimum-number-of-samples|--filter-repetitive-values-minimum-unique-values|--filter-repetitive-values-mode|--output-dir|--redo-analysis|--redo-subset|--subset|--taxon|--type|-batchFormat|-batchOutputFile|-batchReportFrequency|-d|-deleteAnalysis|-deleteSubset|-e|-eeset|-f|-factors|-filterMinNumberOfCellsPerGene|-filterMinNumberOfCellsPerSample|-filterMinVariance|-filterRepetitiveValuesMinSamples|-filterRepetitiveValuesMinUniqueValues|-filterRepetitiveValuesMode|-mdate|-q|-redoAnalysis|-redoSubset|-subset|-t|-type|-x) ]]; then - mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -subset -t -type -usebatch -x" -- "$2") + mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --samples --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -samples -subset -t -type -usebatch -x" -- "$2") fi if [[ "$current_option" =~ (--batch-output-file|--eeListfile|--excludeEEFile|--output-dir|-batchOutputFile|-d|-f|-x) ]]; then mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -f -- "$2") diff --git a/gemma-cli/src/main/config/bash_completion.d/gemma-cli-staging b/gemma-cli/src/main/config/bash_completion.d/gemma-cli-staging index 31c56d042e..200f0ff436 100644 --- a/gemma-cli/src/main/config/bash_completion.d/gemma-cli-staging +++ b/gemma-cli/src/main/config/bash_completion.d/gemma-cli-staging @@ -251,7 +251,7 @@ function __gemma_cli_staging_complete() { fi if [[ " $words " =~ ' ubic.gemma.apps.DifferentialExpressionAnalysisCli ' ]]; then if ! [[ "$current_option" =~ (--batch-format|--batch-output-file|--batch-report-frequency|--delete-analysis|--delete-subset|--eeListfile|--excludeEEFile|--experiment|--experiment-set|--expressionQuery|--factors|--filter-minimum-number-of-cells-per-gene|--filter-minimum-number-of-cells-per-sample|--filter-minimum-variance|--filter-repetitive-values-minimum-number-of-samples|--filter-repetitive-values-minimum-unique-values|--filter-repetitive-values-mode|--output-dir|--redo-analysis|--redo-subset|--subset|--taxon|--type|-batchFormat|-batchOutputFile|-batchReportFrequency|-d|-deleteAnalysis|-deleteSubset|-e|-eeset|-f|-factors|-filterMinNumberOfCellsPerGene|-filterMinNumberOfCellsPerSample|-filterMinVariance|-filterRepetitiveValuesMinSamples|-filterRepetitiveValuesMinUniqueValues|-filterRepetitiveValuesMode|-mdate|-q|-redoAnalysis|-redoSubset|-subset|-t|-type|-x) ]]; then - mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -subset -t -type -usebatch -x" -- "$2") + mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --samples --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -samples -subset -t -type -usebatch -x" -- "$2") fi if [[ "$current_option" =~ (--batch-output-file|--eeListfile|--excludeEEFile|--output-dir|-batchOutputFile|-d|-f|-x) ]]; then mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -f -- "$2") @@ -259,7 +259,7 @@ function __gemma_cli_staging_complete() { fi if [[ " $words " =~ ' diffExAnalyze ' ]]; then if ! [[ "$current_option" =~ (--batch-format|--batch-output-file|--batch-report-frequency|--delete-analysis|--delete-subset|--eeListfile|--excludeEEFile|--experiment|--experiment-set|--expressionQuery|--factors|--filter-minimum-number-of-cells-per-gene|--filter-minimum-number-of-cells-per-sample|--filter-minimum-variance|--filter-repetitive-values-minimum-number-of-samples|--filter-repetitive-values-minimum-unique-values|--filter-repetitive-values-mode|--output-dir|--redo-analysis|--redo-subset|--subset|--taxon|--type|-batchFormat|-batchOutputFile|-batchReportFrequency|-d|-deleteAnalysis|-deleteSubset|-e|-eeset|-f|-factors|-filterMinNumberOfCellsPerGene|-filterMinNumberOfCellsPerSample|-filterMinVariance|-filterRepetitiveValuesMinSamples|-filterRepetitiveValuesMinUniqueValues|-filterRepetitiveValuesMode|-mdate|-q|-redoAnalysis|-redoSubset|-subset|-t|-type|-x) ]]; then - mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -subset -t -type -usebatch -x" -- "$2") + mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -W "--batch-format --batch-output-file --batch-report-frequency --complete-analyses --complete-factors --complete-subset-factors --complete-subsets --delete --delete-analysis --delete-subset --eeListfile --excludeEEFile --experiment --experiment-set --expressionQuery --factors --filter-minimum-number-of-cells-per-gene --filter-minimum-number-of-cells-per-sample --filter-minimum-variance --filter-repetitive-values-minimum-number-of-samples --filter-repetitive-values-minimum-unique-values --filter-repetitive-values-mode --force --help --ignore-failing-subsets --no-bayes --no-db --no-files --output-dir --redo --redo-analysis --redo-subset --samples --subset --taxon --type --use-batch-factor -all -auto -batchFormat -batchOutputFile -batchReportFrequency -completeAnalyses -completeFactors -completeSubsetFactors -completeSubsets -d -delete -deleteAnalysis -deleteSubset -e -eeset -f -factors -filterMinNumberOfCellsPerGene -filterMinNumberOfCellsPerSample -filterMinVariance -filterRepetitiveValuesMinSamples -filterRepetitiveValuesMinUniqueValues -filterRepetitiveValuesMode -force -h -ignoreFailingSubsets -mdate -nobayes -nodb -nofiles -q -redo -redoAnalysis -redoSubset -samples -subset -t -type -usebatch -x" -- "$2") fi if [[ "$current_option" =~ (--batch-output-file|--eeListfile|--excludeEEFile|--output-dir|-batchOutputFile|-d|-f|-x) ]]; then mapfile -t -O "${#COMPREPLY[@]}" COMPREPLY < <(compgen -f -- "$2") diff --git a/gemma-cli/src/main/config/fish/completions/gemma-cli-staging.fish b/gemma-cli/src/main/config/fish/completions/gemma-cli-staging.fish index 61297964dd..cae2d14408 100644 --- a/gemma-cli/src/main/config/fish/completions/gemma-cli-staging.fish +++ b/gemma-cli/src/main/config/fish/completions/gemma-cli-staging.fish @@ -536,12 +536,13 @@ complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.Di complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o type -l type -r -a '(echo -e "GENERICLM\tgeneric linear model without interaction\nOSTTEST\tone-sample t-test\nOWA\tone-way ANOVA\nTTEST\ttwo-sample t-test\nTWO_WAY_ANOVA_WITH_INTERACTION\ttwo-way ANOVA with interaction\nTWO_WAY_ANOVA_NO_INTERACTION\ttwo-way ANOVA without interaction" 2>/dev/null)' -f --description 'Type of analysis to perform. If omitted, the system will try to guess based on the experimental design.\nThis option is not available when processing more than one experiment. Possible values are: GENERICLM, OSTTEST, OWA, TTEST, TWO_WAY_ANOVA_WITH_INTERACTION, TWO_WAY_ANOVA_NO_INTERACTION.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o factors -l factors -r -f --description 'ID numbers, categories or names of the factor(s) to use, comma-delimited, with spaces '"'"' '"'"' and colons '"'"':'"'"' replaced by underscores '"'"'_'"'"'. Interaction can be specified by using '"'"':'"'"' as a delimiter (e.g. '"'"'factor1:factor2'"'"'). Only categorical factors can be used in interactions. Multiple factors can be provided using comma-delimited IDs or by passing the option multiple times. If omitted, factors will be selected automatically. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o subset -l subset -r -f --description 'ID number, category or name of the factor to use for subsetting the analysis. The factor must be categorical. If used without specifying -factors,--factors, factors will be selected automatically among the remaining one in the design. If the experiment already has subsets for the factor, those will be reused. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' +complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o samples -l samples -f --description 'ID, name or accession of samples to be included in the analysis. Defaults to all samples in the dataset being analyzed. Requires the -nodb,--no-db option to be set.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o usebatch -l use-batch-factor -f --description 'If a batch factor is available, use it. Otherwise, batch information can/will be ignored in the analysis. This is incompatible with -factors,--factors, -redo,--redo, -redoAnalysis,--redo-analysis and -redoSubset,--redo-subset.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nobayes -l no-bayes -f --description 'Do not apply empirical-Bayes moderated statistics. Default is to use eBayes.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o ignoreFailingSubsets -l ignore-failing-subsets -f --description 'Ignore failing subsets and continue processing other subsets. Requires the -subset,--subset option to be set or -redo,--redo option with existing subset analyses.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -s d -l output-dir -r -F --description 'Write diff. ex. archive files inside the given directory. Defaults to the current directory of no other destination is selected.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nodb -l no-db -f --description 'Do not persist diff. ex. results to the database and instead save them to the current directory (or the location defined by -d,--output-dir).' -complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db.' +complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db option to be set.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o redo -l redo -f --description 'Re-run all analyses for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o redoAnalysis -l redo-analysis -r -f --description 'Re-run a specific analysis for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple analysis can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o redoSubset -l redo-subset -r -f --description 'Re-run all analyses for a subset of the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple subsets can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' @@ -577,12 +578,13 @@ complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o type -l type -r -a '(echo -e "GENERICLM\tgeneric linear model without interaction\nOSTTEST\tone-sample t-test\nOWA\tone-way ANOVA\nTTEST\ttwo-sample t-test\nTWO_WAY_ANOVA_WITH_INTERACTION\ttwo-way ANOVA with interaction\nTWO_WAY_ANOVA_NO_INTERACTION\ttwo-way ANOVA without interaction" 2>/dev/null)' -f --description 'Type of analysis to perform. If omitted, the system will try to guess based on the experimental design.\nThis option is not available when processing more than one experiment. Possible values are: GENERICLM, OSTTEST, OWA, TTEST, TWO_WAY_ANOVA_WITH_INTERACTION, TWO_WAY_ANOVA_NO_INTERACTION.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o factors -l factors -r -f --description 'ID numbers, categories or names of the factor(s) to use, comma-delimited, with spaces '"'"' '"'"' and colons '"'"':'"'"' replaced by underscores '"'"'_'"'"'. Interaction can be specified by using '"'"':'"'"' as a delimiter (e.g. '"'"'factor1:factor2'"'"'). Only categorical factors can be used in interactions. Multiple factors can be provided using comma-delimited IDs or by passing the option multiple times. If omitted, factors will be selected automatically. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o subset -l subset -r -f --description 'ID number, category or name of the factor to use for subsetting the analysis. The factor must be categorical. If used without specifying -factors,--factors, factors will be selected automatically among the remaining one in the design. If the experiment already has subsets for the factor, those will be reused. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' +complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o samples -l samples -f --description 'ID, name or accession of samples to be included in the analysis. Defaults to all samples in the dataset being analyzed. Requires the -nodb,--no-db option to be set.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o usebatch -l use-batch-factor -f --description 'If a batch factor is available, use it. Otherwise, batch information can/will be ignored in the analysis. This is incompatible with -factors,--factors, -redo,--redo, -redoAnalysis,--redo-analysis and -redoSubset,--redo-subset.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o nobayes -l no-bayes -f --description 'Do not apply empirical-Bayes moderated statistics. Default is to use eBayes.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o ignoreFailingSubsets -l ignore-failing-subsets -f --description 'Ignore failing subsets and continue processing other subsets. Requires the -subset,--subset option to be set or -redo,--redo option with existing subset analyses.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -s d -l output-dir -r -F --description 'Write diff. ex. archive files inside the given directory. Defaults to the current directory of no other destination is selected.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o nodb -l no-db -f --description 'Do not persist diff. ex. results to the database and instead save them to the current directory (or the location defined by -d,--output-dir).' -complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db.' +complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db option to be set.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o redo -l redo -f --description 'Re-run all analyses for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o redoAnalysis -l redo-analysis -r -f --description 'Re-run a specific analysis for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple analysis can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli-staging -n '__fish_seen_subcommand_from diffExAnalyze' -o redoSubset -l redo-subset -r -f --description 'Re-run all analyses for a subset of the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple subsets can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' diff --git a/gemma-cli/src/main/config/fish/completions/gemma-cli.fish b/gemma-cli/src/main/config/fish/completions/gemma-cli.fish index 446ec8365d..ad06251c76 100644 --- a/gemma-cli/src/main/config/fish/completions/gemma-cli.fish +++ b/gemma-cli/src/main/config/fish/completions/gemma-cli.fish @@ -536,12 +536,13 @@ complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.Differenti complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o type -l type -r -a '(echo -e "GENERICLM\tgeneric linear model without interaction\nOSTTEST\tone-sample t-test\nOWA\tone-way ANOVA\nTTEST\ttwo-sample t-test\nTWO_WAY_ANOVA_WITH_INTERACTION\ttwo-way ANOVA with interaction\nTWO_WAY_ANOVA_NO_INTERACTION\ttwo-way ANOVA without interaction" 2>/dev/null)' -f --description 'Type of analysis to perform. If omitted, the system will try to guess based on the experimental design.\nThis option is not available when processing more than one experiment. Possible values are: GENERICLM, OSTTEST, OWA, TTEST, TWO_WAY_ANOVA_WITH_INTERACTION, TWO_WAY_ANOVA_NO_INTERACTION.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o factors -l factors -r -f --description 'ID numbers, categories or names of the factor(s) to use, comma-delimited, with spaces '"'"' '"'"' and colons '"'"':'"'"' replaced by underscores '"'"'_'"'"'. Interaction can be specified by using '"'"':'"'"' as a delimiter (e.g. '"'"'factor1:factor2'"'"'). Only categorical factors can be used in interactions. Multiple factors can be provided using comma-delimited IDs or by passing the option multiple times. If omitted, factors will be selected automatically. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o subset -l subset -r -f --description 'ID number, category or name of the factor to use for subsetting the analysis. The factor must be categorical. If used without specifying -factors,--factors, factors will be selected automatically among the remaining one in the design. If the experiment already has subsets for the factor, those will be reused. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' +complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o samples -l samples -f --description 'ID, name or accession of samples to be included in the analysis. Defaults to all samples in the dataset being analyzed. Requires the -nodb,--no-db option to be set.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o usebatch -l use-batch-factor -f --description 'If a batch factor is available, use it. Otherwise, batch information can/will be ignored in the analysis. This is incompatible with -factors,--factors, -redo,--redo, -redoAnalysis,--redo-analysis and -redoSubset,--redo-subset.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nobayes -l no-bayes -f --description 'Do not apply empirical-Bayes moderated statistics. Default is to use eBayes.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o ignoreFailingSubsets -l ignore-failing-subsets -f --description 'Ignore failing subsets and continue processing other subsets. Requires the -subset,--subset option to be set or -redo,--redo option with existing subset analyses.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -s d -l output-dir -r -F --description 'Write diff. ex. archive files inside the given directory. Defaults to the current directory of no other destination is selected.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nodb -l no-db -f --description 'Do not persist diff. ex. results to the database and instead save them to the current directory (or the location defined by -d,--output-dir).' -complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db.' +complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db option to be set.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o redo -l redo -f --description 'Re-run all analyses for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o redoAnalysis -l redo-analysis -r -f --description 'Re-run a specific analysis for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple analysis can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli -n '__fish_seen_subcommand_from ubic.gemma.apps.DifferentialExpressionAnalysisCli' -o redoSubset -l redo-subset -r -f --description 'Re-run all analyses for a subset of the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple subsets can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' @@ -577,12 +578,13 @@ complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o force -l complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o type -l type -r -a '(echo -e "GENERICLM\tgeneric linear model without interaction\nOSTTEST\tone-sample t-test\nOWA\tone-way ANOVA\nTTEST\ttwo-sample t-test\nTWO_WAY_ANOVA_WITH_INTERACTION\ttwo-way ANOVA with interaction\nTWO_WAY_ANOVA_NO_INTERACTION\ttwo-way ANOVA without interaction" 2>/dev/null)' -f --description 'Type of analysis to perform. If omitted, the system will try to guess based on the experimental design.\nThis option is not available when processing more than one experiment. Possible values are: GENERICLM, OSTTEST, OWA, TTEST, TWO_WAY_ANOVA_WITH_INTERACTION, TWO_WAY_ANOVA_NO_INTERACTION.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o factors -l factors -r -f --description 'ID numbers, categories or names of the factor(s) to use, comma-delimited, with spaces '"'"' '"'"' and colons '"'"':'"'"' replaced by underscores '"'"'_'"'"'. Interaction can be specified by using '"'"':'"'"' as a delimiter (e.g. '"'"'factor1:factor2'"'"'). Only categorical factors can be used in interactions. Multiple factors can be provided using comma-delimited IDs or by passing the option multiple times. If omitted, factors will be selected automatically. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o subset -l subset -r -f --description 'ID number, category or name of the factor to use for subsetting the analysis. The factor must be categorical. If used without specifying -factors,--factors, factors will be selected automatically among the remaining one in the design. If the experiment already has subsets for the factor, those will be reused. This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset.\nThis option is not available when processing more than one experiment.' +complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o samples -l samples -f --description 'ID, name or accession of samples to be included in the analysis. Defaults to all samples in the dataset being analyzed. Requires the -nodb,--no-db option to be set.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o usebatch -l use-batch-factor -f --description 'If a batch factor is available, use it. Otherwise, batch information can/will be ignored in the analysis. This is incompatible with -factors,--factors, -redo,--redo, -redoAnalysis,--redo-analysis and -redoSubset,--redo-subset.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o nobayes -l no-bayes -f --description 'Do not apply empirical-Bayes moderated statistics. Default is to use eBayes.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o ignoreFailingSubsets -l ignore-failing-subsets -f --description 'Ignore failing subsets and continue processing other subsets. Requires the -subset,--subset option to be set or -redo,--redo option with existing subset analyses.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -s d -l output-dir -r -F --description 'Write diff. ex. archive files inside the given directory. Defaults to the current directory of no other destination is selected.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o nodb -l no-db -f --description 'Do not persist diff. ex. results to the database and instead save them to the current directory (or the location defined by -d,--output-dir).' -complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db.' +complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o nofiles -l no-files -f --description 'Don'"'"'t create archive files after analysis. Default is to make them. This is incompatible with -nodb,--no-db option to be set.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o redo -l redo -f --description 'Re-run all analyses for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o redoAnalysis -l redo-analysis -r -f --description 'Re-run a specific analysis for the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple analysis can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' complete -c gemma-cli -n '__fish_seen_subcommand_from diffExAnalyze' -o redoSubset -l redo-subset -r -f --description 'Re-run all analyses for a subset of the experiment. Try to base analysis on previous analysis'"'"'s choice of statistical model. Multiple subsets can be provided using comma-delimited IDs or by passing the option multiple times.\nThis option is not available when processing more than one experiment.' diff --git a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java index 7395f47839..9572094d72 100644 --- a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java @@ -39,6 +39,7 @@ import ubic.gemma.core.analysis.service.ExpressionDataFileService; import ubic.gemma.model.analysis.expression.diff.DifferentialExpressionAnalysis; import ubic.gemma.model.common.auditAndSecurity.eventType.DifferentialExpressionAnalysisEvent; +import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.experiment.*; import ubic.gemma.persistence.service.analysis.expression.diff.DifferentialExpressionAnalysisService; @@ -80,21 +81,40 @@ private enum Mode { COMPLETE_FACTORS } + /** + * Mode for selecting which factors to include in the analysis. + */ + private enum FactorSelectionMode { + /** + * Select factors based on a previous analysis. + */ + REDO, + /** + * Select factors automatically. + */ + AUTOMATIC, + /** + * Select factors manually. The values that are in {@link #factorIdentifiers} will be considered. + */ + MANUAL + } + /** * Specific analyses to redo. */ @Nullable - private Collection analysisIds = null; + private Collection analysisIds; @Nullable - private Collection subsetIds = null; + private Collection subsetIds; /** * Indicate the type of analysis to perform. *

* The default is to detect it based on the dataset and the requested factors. */ - private AnalysisType type = null; + @Nullable + private AnalysisType type; /** * Mode for selecting factors. @@ -114,24 +134,33 @@ private enum Mode { private String subsetFactorIdentifier; /** - * Whether batch factors should be included (if they exist) + * Whether batch factors should be included (if they exist). + *

+ * Defaults to true. */ - private boolean ignoreBatch = true; + private boolean ignoreBatch; /** * Use moderated statistics. */ - private boolean moderateStatistics = DifferentialExpressionAnalysisConfig.DEFAULT_MODERATE_STATISTICS; + private boolean moderateStatistics; /** - * Persist results to the database. + * Ignore failure of subset analyses. + *

+ * The analysis will still fail if all subset analyses fail. */ - private boolean persist = true; + private boolean ignoreFailingSubsets; - private boolean makeArchiveFiles = true; - - private boolean ignoreFailingSubsets = false; + /** + * List of sample identifiers to include in the analysis. + *

+ * Defaults to all samples. + */ + @Nullable + private String[] sampleIdentifiers; + // data filtering options @Nullable private Integer filterMinNumberOfCellsPerSample; @Nullable @@ -144,13 +173,22 @@ private enum Mode { @Nullable private Double filterMinVariance; - private DataFileOptionValue destination; + /** + * Persist results to the database. + */ + private boolean persist ; - enum FactorSelectionMode { - REDO, - AUTOMATIC, - MANUAL - } + /** + * Create archive files. + */ + private boolean makeArchiveFiles ; + + /** + * Destination to use for archive files. + *

+ * This is only applicable if {@link #makeArchiveFiles} is true. + */ + private DataFileOptionValue destination; @Override public String getCommandName() { @@ -197,6 +235,9 @@ protected void buildExperimentOptions( Options options ) { + "If the experiment already has subsets for the factor, those will be reused. " + "This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset." ).get() ); + addSingleExperimentOption( options, Option.builder( "samples" ).longOpt( "samples" ).argName( "ID, name, accession" ) + .desc( "ID, name or accession of samples to be included in the analysis. Defaults to all samples in the dataset being analyzed. Requires the " + formatOption( "nodb", "no-db" ) + " option to be set." ).get() ); + options.addOption( "usebatch", "use-batch-factor", false, "If a batch factor is available, use it. Otherwise, batch information can/will be ignored in the analysis. This is incompatible with " + formatOption( options, "factors" ) + ", -redo,--redo, -redoAnalysis,--redo-analysis and -redoSubset,--redo-subset." ); options.addOption( "nobayes", "no-bayes", false, "Do not apply empirical-Bayes moderated statistics. Default is to use eBayes." ); options.addOption( "ignoreFailingSubsets", "ignore-failing-subsets", false, "Ignore failing subsets and continue processing other subsets. Requires the " + formatOption( options, "subset" ) + " option to be set or -redo,--redo option with existing subset analyses." ); @@ -205,7 +246,7 @@ protected void buildExperimentOptions( Options options ) { // destination (db, standard location or custom directory) options.addOption( "nodb", "no-db", false, "Do not persist diff. ex. results to the database and instead save them to the current directory (or the location defined by " + formatOption( options, DataFileOptionsUtils.OUTPUT_DIR_OPTION ) + ")." ); - options.addOption( "nofiles", "no-files", false, "Don't create archive files after analysis. Default is to make them. This is incompatible with " + formatOption( options, "nodb" ) + "." ); + options.addOption( "nofiles", "no-files", false, "Don't create archive files after analysis. Default is to make them. This is incompatible with " + formatOption( options, "nodb" ) + " option to be set." ); // redo mode options.addOption( "redo", "redo", false, @@ -346,6 +387,7 @@ protected void processExperimentOptions( CommandLine commandLine ) throws ParseE // subset analysis can only be done in manual mode // note we add the given factor to the list of factors overall to make sure it is considered this.subsetFactorIdentifier = getOptionValue( commandLine, "subset", requires( allOf( toBeUnset( "redo" ), toBeUnset( "redoAnalysis" ), toBeUnset( "redoSubset" ) ) ) ); + this.sampleIdentifiers = getOptionValues( commandLine, "samples", requires( toBeSet( "nodb" ) ) ); // we can only force the use of a batch factor during automatic selection this.ignoreBatch = !hasOption( commandLine, "usebatch", requires( allOf( toBeUnset( "factors" ), toBeUnset( "redo" ), toBeUnset( "redoAnalysis" ), toBeUnset( "redoSubset" ) ) ) ); this.moderateStatistics = !commandLine.hasOption( "nobayes" ); @@ -433,7 +475,18 @@ protected void processExpressionExperiment( ExpressionExperiment ee ) { config.setPersist( this.persist ); config.setMakeArchiveFile( this.persist && this.makeArchiveFiles ); config.setIgnoreFailingSubsets( this.ignoreFailingSubsets ); - config.setUseWeights( super.eeService.isRNASeq( ee ) ); + config.setUseWeights( eeService.isRNASeq( ee ) ); + + // sample selection + if ( this.sampleIdentifiers != null ) { + Set samplesToUse = new HashSet<>(); + for ( String sampleId : sampleIdentifiers ) { + // allow searching in subsets, those are used in single-cell diff ex. analysis that use sub-samples for + // representing pseudo-bulks + samplesToUse.add( entityLocator.locateSample( ee, sampleId, true ) ); + } + config.setSamplesToInclude( samplesToUse ); + } // filtering config.setRepetitiveValuesFilterMode( this.filterMode ); diff --git a/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java b/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java index 196fddcc5a..b60179429f 100644 --- a/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java +++ b/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java @@ -7,6 +7,7 @@ public enum CompletionType { DATASET_GROUP, EXTERNAL_DATABASE, DATASET, + SAMPLE, EXPERIMENTAL_FACTOR, /** * Complete experimental factors and interactions suitable for DEA. diff --git a/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocator.java b/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocator.java index 8e15eb3b80..16d6b4babc 100644 --- a/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocator.java +++ b/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocator.java @@ -8,8 +8,10 @@ import ubic.gemma.model.expression.bioAssayData.CellLevelCharacteristics; import ubic.gemma.model.expression.bioAssayData.CellTypeAssignment; import ubic.gemma.model.expression.bioAssayData.DataVector; +import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.experiment.ExperimentalFactor; import ubic.gemma.model.expression.experiment.ExpressionExperiment; +import ubic.gemma.model.expression.experiment.ExpressionExperimentSubSet; import ubic.gemma.model.genome.Taxon; import java.util.Collection; @@ -39,9 +41,35 @@ public interface EntityLocator { ExperimentalFactor locateExperimentalFactor( ExpressionExperiment expressionExperiment, String ctfName ); - BioAssay locateBioAssay( ExpressionExperiment ee, String sampleId ); + /** + * Locate an assay by its identifier. + * + * @param ee dataset to lookup + * @param assayId assay identifier to lookup + * @param includeSubSets whether to include assays that belong to subsets of the experiment. This is only relevant + * for {@link ExpressionExperimentSubSet} that "own" their assays instead of sharing them with + * the source experiment. + */ + BioAssay locateBioAssay( ExpressionExperiment ee, String assayId, boolean includeSubSets ); + /** + * Locate an assay by its identifier in a particular set of vectors. + * + * @param ee dataset to lookup + * @param quantitationType quantitation type for the vectors + * @param sampleId sample identifier to lookup + */ BioAssay locateBioAssay( ExpressionExperiment ee, QuantitationType quantitationType, String sampleId ); + /** + * + * @param ee dataset to lookup + * @param sampleId sample identifier to lookup + * @param includeSubSets whether to include samples associated to assays that belong to subsets of the experiment. + * This is only relevant for {@link ExpressionExperimentSubSet} + * that "own" their assays instead of sharing them with the source experiment. + */ + BioMaterial locateSample( ExpressionExperiment ee, String sampleId, boolean includeSubSets ); + DifferentialExpressionAnalysis locateDiffExAnalysis( ExpressionExperiment ee, String analysisIdentifier ); } diff --git a/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocatorImpl.java b/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocatorImpl.java index 8a1a3fd793..66f80a02ec 100644 --- a/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocatorImpl.java +++ b/gemma-cli/src/main/java/ubic/gemma/cli/util/EntityLocatorImpl.java @@ -2,6 +2,7 @@ import lombok.extern.apachecommons.CommonsLog; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.Strings; import org.apache.commons.lang3.tuple.Pair; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; @@ -15,6 +16,7 @@ import ubic.gemma.model.expression.arrayDesign.ArrayDesign; import ubic.gemma.model.expression.bioAssay.BioAssay; import ubic.gemma.model.expression.bioAssayData.*; +import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.experiment.ExperimentalFactor; import ubic.gemma.model.expression.experiment.ExpressionExperiment; import ubic.gemma.model.genome.Taxon; @@ -268,7 +270,7 @@ public ExperimentalFactor locateExperimentalFactor( ExpressionExperiment express ExperimentalFactor factor; try { Long efId = Long.parseLong( identifier ); - if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getId().equals( efId ) ) ) != null ) { + if ( ( factor = matchOneFactor( expressionExperiment, ef -> Objects.equals( ef.getId(), efId ) ) ) != null ) { return factor; } } catch ( NumberFormatException e ) { @@ -287,16 +289,16 @@ public ExperimentalFactor locateExperimentalFactor( ExpressionExperiment express } // match by category - if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getCategory(), finalIdentifier ) ) ) != null ) { + if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getCategory(), finalIdentifier ) ) ) != null ) { return factor; } - if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getCategoryUri(), finalIdentifier ) ) ) != null ) { + if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getCategoryUri(), finalIdentifier ) ) ) != null ) { return factor; } - if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getValue(), finalIdentifier ) ) ) != null ) { + if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getValue(), finalIdentifier ) ) ) != null ) { return factor; } - if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && StringUtils.equalsIgnoreCase( ef.getCategory().getValueUri(), finalIdentifier ) ) ) != null ) { + if ( ( factor = matchOneFactor( expressionExperiment, ef -> ef.getCategory() != null && Strings.CI.equals( ef.getCategory().getValueUri(), finalIdentifier ) ) ) != null ) { return factor; } @@ -321,10 +323,9 @@ private ExperimentalFactor matchOneFactor( ExpressionExperiment ee, Predicate String.format( "Could not locate an analysis matching '%s' in %s.%s", - analysisIdentifier, - ee.getShortName(), - formatPossibleValues( differentialExpressionAnalysisService.findByExperiment( ee, true ), false ) ) ); - } - @Nullable - private BioAssay locateBioAssay( Collection ee, String sampleId ) { + private BioAssay locateBioAssay( Collection candidates, String sampleId ) { BioAssay ba; try { Long id = Long.parseLong( sampleId ); - if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getId().equals( id ) ) ) != null ) { + if ( ( ba = matchOneAssay( candidates, ba2 -> Objects.equals( ba2.getId(), id ) ) ) != null ) { return ba; } } catch ( NumberFormatException e ) { // ignore } - if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getShortName() != null && ba2.getShortName().equalsIgnoreCase( sampleId ) ) ) != null ) { + if ( ( ba = matchOneAssay( candidates, ba2 -> ba2.getShortName() != null && ba2.getShortName().equalsIgnoreCase( sampleId ) ) ) != null ) { + return ba; + } + if ( ( ba = matchOneAssay( candidates, ba2 -> ba2.getName().equalsIgnoreCase( sampleId ) ) ) != null ) { return ba; } - if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getName().equalsIgnoreCase( sampleId ) ) ) != null ) { + if ( ( ba = matchOneAssay( candidates, ba2 -> ba2.getAccession() != null && ba2.getAccession().getAccession().equalsIgnoreCase( sampleId ) ) ) != null ) { + return ba; + } + return null; + } + + private BioAssay matchOneAssay( Collection candidates, Predicate ba ) { + Set bas = candidates.stream().filter( ba ).collect( Collectors.toSet() ); + if ( bas.size() == 1 ) { + return bas.iterator().next(); + } else { + return null; + } + } + + @Override + public BioMaterial locateSample( ExpressionExperiment ee, String sampleId, boolean includeSubSets ) { + return locateSample( eeService.getSamplesUsed( ee, includeSubSets ), sampleId ); + } + + private BioMaterial locateSample( Collection candidates, String sampleId ) { + BioMaterial ba; + try { + Long id = Long.parseLong( sampleId ); + if ( ( ba = matchOneSample( candidates, ba2 -> Objects.equals( ba2.getId(), id ) ) ) != null ) { + return ba; + } + } catch ( NumberFormatException e ) { + // ignore + } + if ( ( ba = matchOneSample( candidates, ba2 -> ba2.getName().equalsIgnoreCase( sampleId ) ) ) != null ) { return ba; } - if ( ( ba = matchOneAssay( ee, ba2 -> ba2.getAccession() != null && ba2.getAccession().getAccession().equalsIgnoreCase( sampleId ) ) ) != null ) { + if ( ( ba = matchOneSample( candidates, ba2 -> ba2.getExternalAccession() != null && ba2.getExternalAccession().getAccession().equalsIgnoreCase( sampleId ) ) ) != null ) { return ba; } return null; } - private BioAssay matchOneAssay( Collection bioAssays, Predicate ba ) { - Set bas = bioAssays.stream().filter( ba ).collect( Collectors.toSet() ); + private BioMaterial matchOneSample( Collection candidates, Predicate ba ) { + Set bas = candidates.stream().filter( ba ).collect( Collectors.toSet() ); if ( bas.size() == 1 ) { return bas.iterator().next(); } else { @@ -383,6 +408,15 @@ private BioAssay matchOneAssay( Collection bioAssays, Predicate String.format( "Could not locate an analysis matching '%s' in %s.%s", + analysisIdentifier, + ee.getShortName(), + formatPossibleValues( differentialExpressionAnalysisService.findByExperiment( ee, true ), false ) ) ); + } + private String formatPossibleValues( Collection possibleValues, boolean allowAmbiguousIds ) { if ( possibleValues.isEmpty() ) { return ""; diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/DifferentialExpressionAnalysisConfig.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/DifferentialExpressionAnalysisConfig.java index fb1dd25244..6eb001afec 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/DifferentialExpressionAnalysisConfig.java +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/DifferentialExpressionAnalysisConfig.java @@ -23,6 +23,7 @@ import ubic.gemma.core.analysis.preprocess.filter.RepetitiveValuesFilter; import ubic.gemma.core.datastructure.matrix.ExpressionDataDoubleMatrix; import ubic.gemma.model.expression.bioAssay.BioAssay; +import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.experiment.ExperimentalFactor; import ubic.gemma.model.expression.experiment.FactorValue; @@ -46,7 +47,10 @@ public class DifferentialExpressionAnalysisConfig { /** * Type of analysis to perform. + *

+ * The type is automatically detected if left null. */ + @Nullable private AnalysisType analysisType; /** @@ -79,6 +83,14 @@ public class DifferentialExpressionAnalysisConfig { @Nullable private FactorValue subsetFactorValue; + /** + * Set of samples to include in the analysis. + *

+ * All samples are included if this is null. + */ + @Nullable + private Set samplesToInclude; + /** * Keep processing other subsets when encountering an {@link AnalysisException} on a subset. *

@@ -165,6 +177,9 @@ public DifferentialExpressionAnalysisConfig() { public DifferentialExpressionAnalysisConfig( DifferentialExpressionAnalysisConfig baseConfig ) { this.analysisType = baseConfig.getAnalysisType(); this.moderateStatistics = baseConfig.isModerateStatistics(); + if ( baseConfig.samplesToInclude != null ) { + this.samplesToInclude = new HashSet<>( baseConfig.samplesToInclude ); + } this.factorsToInclude.addAll( baseConfig.getFactorsToInclude() ); this.interactionsToInclude.addAll( baseConfig.getInteractionsToInclude() ); this.persist = baseConfig.isPersist(); diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java index 1a5f6a7c73..cb13235520 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java @@ -111,9 +111,6 @@ public class LinearModelAnalyzer implements DiffExAnalyzer { private static final Comparator FACTOR_COMPARATOR = Comparator.comparing( DiffExAnalyzerUtils::nameForR, Comparator.naturalOrder() ); - private static final Comparator SAMPLE_COMPARATOR = - Comparator.comparing( DiffExAnalyzerUtils::nameForR, Comparator.naturalOrder() ); - @Autowired private CompositeSequenceService compositeSequenceService; @Autowired @@ -123,160 +120,6 @@ public class LinearModelAnalyzer implements DiffExAnalyzer { @Autowired private BuildInfo buildInfo; - /** - * Generate HitListSize entities that will be stored to count the number of diff. ex probes at various preset - * thresholds, to avoid wasting time generating these counts on the fly later. This is done automatically during - * analysis, so is just here to allow 'backfilling'. - * - * @param probeToGeneMap map - * @param results results - * @return hit list sizes - */ - private Set computeHitListSizes( Collection results, - Map> probeToGeneMap ) { - Set hitListSizes = new HashSet<>(); - StopWatch timer = new StopWatch(); - timer.start(); - double maxThreshold = MathUtil.max( LinearModelAnalyzer.qValueThresholdsForHitLists ); - - Collection allGenes = new HashSet<>(); - for ( Collection genes : probeToGeneMap.values() ) { - allGenes.addAll( genes ); - } - - // maps from Doubles are a bit dodgy... - Map upCounts = new HashMap<>(); - Map downCounts = new HashMap<>(); - Map eitherCounts = new HashMap<>(); - - Map upCountGenes = new HashMap<>(); - Map downCountGenes = new HashMap<>(); - Map eitherCountGenes = new HashMap<>(); - - Collection seenGenes = new HashSet<>(); - for ( DifferentialExpressionAnalysisResult r : results ) { - - Double corrP = r.getCorrectedPvalue(); - if ( corrP == null || corrP > maxThreshold ) { - continue; - } - - CompositeSequence probe = r.getProbe(); - Collection genesForProbe = probeToGeneMap.get( probe ); - int numGenes = 0; - if ( genesForProbe != null ) { - numGenes = this.countNumberOfGenesNotSeenAlready( genesForProbe, seenGenes ); - } - - // if ( numGenes == 0 ) // This is okay; it might mean we already counted it as differentially expressed. - - Collection crs = r.getContrasts(); - boolean up = false; - boolean down = false; - - /* - * We set up and down to be true (either or both) if at least on contrast is shown. - */ - for ( ContrastResult cr : crs ) { - Double lf = cr.getLogFoldChange(); - //noinspection StatementWithEmptyBody // Better readability - if ( lf == null ) { - /* - * A contrast which is actually not valid, so it won't be counted in the hit list. - */ - } else if ( lf < 0 ) { - down = true; - } else if ( lf > 0 ) { - up = true; - } - } - - for ( double thresh : LinearModelAnalyzer.qValueThresholdsForHitLists ) { - - if ( !upCounts.containsKey( thresh ) ) { - upCounts.put( thresh, 0 ); - upCountGenes.put( thresh, 0 ); - } - if ( !downCounts.containsKey( thresh ) ) { - downCounts.put( thresh, 0 ); - downCountGenes.put( thresh, 0 ); - } - if ( !eitherCounts.containsKey( thresh ) ) { - eitherCounts.put( thresh, 0 ); - eitherCountGenes.put( thresh, 0 ); - } - - if ( corrP < thresh ) { - if ( up ) { - upCounts.put( thresh, upCounts.get( thresh ) + 1 ); - upCountGenes.put( thresh, upCountGenes.get( thresh ) + numGenes ); - } - if ( down ) { - downCounts.put( thresh, downCounts.get( thresh ) + 1 ); - downCountGenes.put( thresh, downCountGenes.get( thresh ) + numGenes ); - } - - eitherCounts.put( thresh, eitherCounts.get( thresh ) + 1 ); - eitherCountGenes.put( thresh, eitherCountGenes.get( thresh ) + numGenes ); - - } - } - - } - - for ( double thresh : LinearModelAnalyzer.qValueThresholdsForHitLists ) { - // Ensure we don't set values to null. - int up = upCounts.getOrDefault( thresh, 0 ); - int down = downCounts.getOrDefault( thresh, 0 ); - int either = eitherCounts.getOrDefault( thresh, 0 ); - - int upGenes = upCountGenes.getOrDefault( thresh, 0 ); - int downGenes = downCountGenes.getOrDefault( thresh, 0 ); - int eitherGenes = eitherCountGenes.getOrDefault( thresh, 0 ); - - assert !( allGenes.size() < upGenes || allGenes.size() < downGenes - || allGenes.size() < eitherGenes ) : "There were more genes differentially expressed than exist in the experiment"; - - HitListSize upS = HitListSize.Factory.newInstance( thresh, up, Direction.UP, upGenes ); - HitListSize downS = HitListSize.Factory.newInstance( thresh, down, Direction.DOWN, downGenes ); - HitListSize eitherS = HitListSize.Factory.newInstance( thresh, either, Direction.EITHER, eitherGenes ); - - hitListSizes.add( upS ); - hitListSizes.add( downS ); - hitListSizes.add( eitherS ); - - assert upGenes <= eitherGenes : "More genes upregulated than upregulated+downregulated"; - assert downGenes <= eitherGenes : "More genes downregulated than upregulated+downregulated"; - - } - - if ( timer.getTime() > 1000 ) { - LinearModelAnalyzer.log.info( "Hitlist computation: " + timer.getTime() + " ms" ); - } - return hitListSizes; - } - - /** - * Utility method - * - * @param probeToGeneMap map - * @param resultList result list - * @return number of genes tested - */ - private int getNumberOfGenesTested( Collection resultList, - Map> probeToGeneMap ) { - - Collection gs = new HashSet<>(); - for ( DifferentialExpressionAnalysisResult d : resultList ) { - CompositeSequence probe = d.getProbe(); - if ( probeToGeneMap.containsKey( probe ) ) { - gs.addAll( probeToGeneMap.get( probe ) ); - } - } - return gs.size(); - } - - /** * I apologize for this being so complicated. Basically there are four phases: *

    @@ -290,23 +133,19 @@ private int getNumberOfGenesTested( Collection run( ExpressionExperiment expressionExperiment, ExpressionDataDoubleMatrix dmatrix, DifferentialExpressionAnalysisConfig config ) throws AnalysisException { - /* * Initialize our matrix and factor lists... */ - List factors = config.getFactorsToInclude().stream() - .sorted( FACTOR_COMPARATOR ) - .collect( Collectors.toList() ); - - /* - * FIXME this is the place to strip put the outliers. - */ - List samplesUsed = orderByExperimentalDesign( dmatrix, factors, null ); + List factors = selectFactors( config ); + List samplesUsed = selectSamples( dmatrix, factors, config ); + dropIncompleteFactors( samplesUsed, factors ); + if ( factors.isEmpty() ) { + throw new NoFactorLeftForAnalysisException( "All factors were removed due to incomplete values.", config ); + } dmatrix = dmatrix.sliceColumns( samplesUsed, createBADMap( samplesUsed ) ); // enforce ordering Map baselineConditions = BaselineSelection.getBaselineConditions( samplesUsed, factors ); - dropIncompleteFactors( samplesUsed, factors ); /* * Do the analysis, by subsets if requested @@ -327,23 +166,58 @@ public Collection run( ExpressionExperiment ee, Assert.isTrue( !subsets.isEmpty(), "No subsets provided" ); Assert.isTrue( config.getSubsetFactor().getFactorValues().containsAll( subsets.keySet() ), "Subsets must use factor values from " + config.getSubsetFactor() + "." ); Assert.isTrue( subsets.values().stream().allMatch( ss -> ss.getSourceExperiment().equals( ee ) ), "Subsets must use " + ee + " as source experiment." ); - List factors = config.getFactorsToInclude().stream() - .sorted( FACTOR_COMPARATOR ) - .collect( Collectors.toList() ); - List samplesUsed = orderByExperimentalDesign( dmatrix, factors, null ); + List factors = selectFactors( config ); + List samplesUsed = selectSamples( dmatrix, factors, config ); + dropIncompleteFactors( samplesUsed, factors ); + if ( factors.isEmpty() ) { + throw new NoFactorLeftForAnalysisException( "All factors were removed due to incomplete values.", config ); + } Map dmatrixBySubSet = makeSubSetMatrices( dmatrix, samplesUsed, factors, config.getSubsetFactor() ); Map baselineConditions = BaselineSelection.getBaselineConditions( samplesUsed, factors ); - dropIncompleteFactors( samplesUsed, factors ); - return doSubSetAnalysis( subsets, dmatrixBySubSet, factors, baselineConditions, config ); + return doSubSetAnalysis( subsets, dmatrixBySubSet, samplesUsed, factors, baselineConditions, config ); + } + + @Override + public DifferentialExpressionAnalysis run( ExpressionExperimentSubSet subset, ExpressionDataDoubleMatrix dmatrix, DifferentialExpressionAnalysisConfig config ) throws AnalysisException { + Assert.notNull( config.getSubsetFactor(), "A subset factor must be set to analyze a subset." ); + /* + * Start by setting it up like the full experiment. + */ + + ExperimentalFactor ef = config.getSubsetFactor(); + + FactorValue subsetFactorValue = config.getSubsetFactorValue(); + if ( subsetFactorValue == null ) { + log.info( "No factor value set in the configuration, will determine it from the samples..." ); + subsetFactorValue = determineSubsetFactorValue( ef, subset ); + log.info( "The subset factor value appears to be " + subsetFactorValue + "." ); + } + + List factors = selectFactors( config ); + List subsetFactors = selectFactorsForSubSetAnalysis( subset, dmatrix, factors ); + if ( subsetFactors.isEmpty() ) { + throw new NoFactorLeftForAnalysisException( "Experimental design is not valid for subset: " + subsetFactorValue + ".", config ); + } + List samplesInSubset = selectSamples( dmatrix, factors, config ); + dropIncompleteFactors( samplesInSubset, factors ); + if ( factors.isEmpty() ) { + throw new NoFactorLeftForAnalysisException( "All factors were removed due to incomplete values.", config ); + } + + // slice. + ExpressionDataDoubleMatrix subsetMatrix = dmatrix.sliceColumns( samplesInSubset, createBADMap( samplesInSubset ) ); + + + Map baselineConditions = BaselineSelection.getBaselineConditions( samplesInSubset, factors ); + + return doAnalysis( subset, subsetMatrix, samplesInSubset, subsetFactors, + baselineConditions, subsetFactorValue, createConfigForSubsetAnalysis( samplesInSubset, subsetFactors, subsetFactorValue, config ) ); } /** * Perform an analysis by subset. */ private Collection doSubSetAnalysis( ExpressionExperiment expressionExperiment, List samplesUsed, List factors, Map baselineConditions, ExperimentalFactor subsetFactor, ExpressionDataDoubleMatrix dmatrix, DifferentialExpressionAnalysisConfig config ) throws AnalysisException { - Assert.isTrue( !factors.contains( subsetFactor ), - "Subset factor cannot also be included in the analysis [ Factor was: " + subsetFactor + "]" ); - Map dmatrixBySubset = makeSubSetMatrices( dmatrix, samplesUsed, factors, subsetFactor ); /* @@ -362,15 +236,13 @@ private Collection doSubSetAnalysis( ExpressionE LinearModelAnalyzer.log.info( "Analyzing subset: " + subsetFactorValue ); - List bioMaterials = orderByExperimentalDesign( dmatrixBySubset.get( subsetFactorValue ), factors, null ); - /* * make a EESubSet */ String subsetName = FactorValueUtils.getSummaryString( subsetFactorValue ); ExpressionExperimentSubSet eeSubSet = ExpressionExperimentSubSet.Factory.newInstance( subsetName, expressionExperiment ); Collection bioAssays = new HashSet<>(); - for ( BioMaterial bm : bioMaterials ) { + for ( BioMaterial bm : samplesUsed ) { bioAssays.addAll( bm.getBioAssaysUsedIn() ); } eeSubSet.getBioAssays().addAll( bioAssays ); @@ -380,10 +252,10 @@ private Collection doSubSetAnalysis( ExpressionE LinearModelAnalyzer.log.info( "Total number of subsets: " + subsets.size() ); - return doSubSetAnalysis( subsets, dmatrixBySubset, factors, baselineConditions, config ); + return doSubSetAnalysis( subsets, dmatrixBySubset, samplesUsed, factors, baselineConditions, config ); } - private Collection doSubSetAnalysis( Map subsets, Map dmatrix, List factors, Map baselineConditions, DifferentialExpressionAnalysisConfig config ) throws AnalysisException { + private Collection doSubSetAnalysis( Map subsets, Map dmatrix, List samplesInSubSet, List factors, Map baselineConditions, DifferentialExpressionAnalysisConfig config ) throws AnalysisException { /* * Now analyze each subset */ @@ -400,10 +272,10 @@ private Collection doSubSetAnalysis( Map bioMaterials = orderByExperimentalDesign( dmatrix.get( subsetFactorValue ), factors, null ); List subsetFactors = this - .fixFactorsForSubset( subsets.get( subsetFactorValue ), dmatrix.get( subsetFactorValue ), factors ); + .selectFactorsForSubSetAnalysis( subsets.get( subsetFactorValue ), dmatrix.get( subsetFactorValue ), factors ); DifferentialExpressionAnalysisConfig subsetConfig = this - .fixConfigForSubset( factors, subsetFactorValue, config ); + .createConfigForSubsetAnalysis( samplesInSubSet, factors, subsetFactorValue, config ); if ( subsetFactors.isEmpty() ) { LinearModelAnalyzer.log @@ -436,7 +308,7 @@ private Collection doSubSetAnalysis( Map samplesInSubset = subset.getBioAssays().stream() - .map( BioAssay::getSampleUsed ) - .sorted( SAMPLE_COMPARATOR ) - .collect( Collectors.toList() ); - - FactorValue subsetFactorValue = config.getSubsetFactorValue(); - if ( subsetFactorValue == null ) { - log.info( "No factor value set in the configuration, will determine it from the samples..." ); - subsetFactorValue = determineSubsetFactorValue( ef, subset ); - log.info( "The subset factor value appears to be " + subsetFactorValue + "." ); - } - - samplesInSubset = orderByExperimentalDesign( samplesInSubset, config.getFactorsToInclude(), null ); - - // slice. - ExpressionDataDoubleMatrix subsetMatrix = dmatrix.sliceColumns( samplesInSubset, createBADMap( samplesInSubset ) ); - - List factors = config.getFactorsToInclude().stream() + /** + * Select and order the factors to + */ + private List selectFactors( DifferentialExpressionAnalysisConfig config ) { + return config.getFactorsToInclude().stream() .sorted( FACTOR_COMPARATOR ) .collect( Collectors.toList() ); - List subsetFactors = fixFactorsForSubset( subset, dmatrix, factors ); - - Map baselineConditions = BaselineSelection.getBaselineConditions( samplesInSubset, factors ); - dropIncompleteFactors( samplesInSubset, factors ); - - if ( factors.isEmpty() ) { - throw new NoFactorLeftForAnalysisException( "All factors were removed due to incomplete values", config ); - } + } - if ( subsetFactors.isEmpty() ) { - LinearModelAnalyzer.log - .warn( "Experimental design is not valid for subset: " + subsetFactorValue + "; skipping" ); - return null; + /** + * Select and order the samples to use. + *

    + * Samples are ordered by the experimental design. + */ + private List selectSamples( ExpressionDataDoubleMatrix dmatrix, Collection factors, DifferentialExpressionAnalysisConfig config ) { + List samplesUsed = orderByExperimentalDesign( dmatrix, factors, null ); + if ( config.getSamplesToInclude() != null ) { + log.info( "Filtering samples to include based on the configuration..." ); + Assert.isTrue( new HashSet<>( dmatrix.getBioMaterials() ).containsAll( config.getSamplesToInclude() ), + "All the samples to include must be present in the data matrix." ); + samplesUsed = samplesUsed.stream() + .filter( config.getSamplesToInclude()::contains ) + .collect( Collectors.toList() ); } - - return doAnalysis( subset, subsetMatrix, samplesInSubset, subsetFactors, - baselineConditions, subsetFactorValue, fixConfigForSubset( subsetFactors, subsetFactorValue, config ) ); + return samplesUsed; } /** @@ -528,117 +377,72 @@ private FactorValue determineSubsetFactorValue( ExperimentalFactor subsetFactor, return subsetFactorValue; } - private DoubleMatrix1D getLibrarySizes( DifferentialExpressionAnalysisConfig config, - ExpressionDataDoubleMatrix dmatrix ) { - - DoubleMatrix1D librarySize = new DenseDoubleMatrix1D( dmatrix.columns() ); - if ( config.isUseWeights() ) { - for ( int i = 0; i < dmatrix.columns(); i++ ) { - BioAssay ba = dmatrix.getBioAssayForColumn( i ); - - Long sequenceReadCount = ba.getSequenceReadCount(); - if ( !ba.getIsOutlier() && ( sequenceReadCount == null || sequenceReadCount == 0 ) ) { - // double check. - double[] col = dmatrix.getColumnAsDoubles( i ); - double maxExpression = DescriptiveWithMissing.max( new DoubleArrayList( col ) ); - if ( maxExpression <= 0 ) { - throw new IllegalStateException( - "Sample has a null or zero read-count, isn't marked as an outlier, and max expression level is " + maxExpression - + ": " + ba ); - } - } - - if ( sequenceReadCount == null ) sequenceReadCount = 0L; - - librarySize.set( i, sequenceReadCount ); - } - } - return librarySize; - } - /** - * @return boolean true if we need the intercept. + * Create a configuration tailored to a particular subset. + * + * @param factors the factors that will be included + * @param subsetFactorValue factor value to use for the subset analysis + * @param config original configuration to base the subset config on + * @return an updated config; the baselines are cleared; subset is cleared; interactions are only kept if + * they only involve the given factors. */ - private boolean checkIfNeedToTreatAsIntercept( ExperimentalFactor experimentalFactor, - QuantitationType quantitationType ) { - if ( experimentalFactor.getFactorValues().size() == 1 ) { - if ( quantitationType.getIsRatio() ) { - return true; - } - throw new IllegalArgumentException( - "Cannot deal with constant factors unless the data are ratiometric non-reference design" ); + private DifferentialExpressionAnalysisConfig createConfigForSubsetAnalysis( List samplesUsed, List factors, + FactorValue subsetFactorValue, DifferentialExpressionAnalysisConfig config ) { + DifferentialExpressionAnalysisConfig newConfig = new DifferentialExpressionAnalysisConfig( config ); + newConfig.setSubsetFactor( null ); + newConfig.setSubsetFactorValue( subsetFactorValue ); + // remove any samples not in the given subset + if ( newConfig.getSamplesToInclude() != null ) { + Set samplesUsedSet = new HashSet<>( samplesUsed ); + newConfig.getSamplesToInclude().removeIf( s -> !samplesUsedSet.contains( s ) ); } - return false; + // remove any factors not in the given subset + Set factorsSet = new HashSet<>( factors ); + newConfig.getFactorsToInclude().removeIf( f -> !factorsSet.contains( f ) ); + newConfig.getInteractionsToInclude().removeIf( fv -> !factorsSet.containsAll( fv ) ); + return newConfig; } /** - * Create files that can be used in R to check the results. + * Remove factors which are no longer usable, based on the subset. */ - private void outputForDebugging( ExpressionDataDoubleMatrix dmatrix, - ObjectMatrix designMatrix ) { - MatrixWriter mw = new MatrixWriter( entityUrlBuilder, buildInfo ); - try ( FileWriter writer = new FileWriter( File.createTempFile( "data.", ".txt" ) ); - FileWriter out = new FileWriter( File.createTempFile( "design.", ".txt" ) ) ) { - - mw.write( dmatrix, ProcessedExpressionDataVector.class, writer ); + private List selectFactorsForSubSetAnalysis( ExpressionExperimentSubSet subset, ExpressionDataDoubleMatrix dmatrix, List factors ) { - ubic.basecode.io.writer.MatrixWriter dem = new ubic.basecode.io.writer.MatrixWriter<>( - out ); - dem.writeMatrix( designMatrix, true ); - - } catch ( IOException e ) { - log.error( "An I/O error occurred when producing debugging output.", e ); - } - } + List result = new ArrayList<>(); - /** - * Populate the interactionFactorLists and label2Factors - * - * @param interactionFactorLists gets populated - */ - private void buildModelFormula( final DifferentialExpressionAnalysisConfig config, - final Map> label2Factors, - final List interactionFactorLists ) { + QuantitationType quantitationType = dmatrix.getQuantitationType(); + ExperimentalFactor interceptFactor = this.determineInterceptFactor( factors, quantitationType ); /* - * Add interaction terms + * Remove any constant factors, unless they are that intercept. */ - boolean hasInteractionTerms = config.getInteractionsToInclude() != null && !config.getInteractionsToInclude().isEmpty(); + for ( ExperimentalFactor f : factors ) { - if ( hasInteractionTerms ) { - for ( Collection interactionTerms : config.getInteractionsToInclude() ) { + if ( f.getType().equals( FactorType.CONTINUOUS ) ) { + result.add( f ); + } else if ( interceptFactor != null && interceptFactor.equals( f ) ) { + result.add( f ); + } else { - List interactionFactorNames = new ArrayList<>(); - for ( ExperimentalFactor factor : interactionTerms ) { - interactionFactorNames.add( DiffExAnalyzerUtils.nameForR( factor ) ); - } + Collection levels = new HashSet<>(); + DiffExAnalyzerUtils.populateFactorValuesFromBASet( subset, f, levels ); - interactionFactorLists.add( interactionFactorNames.toArray( new String[] {} ) ); + if ( levels.size() > 1 ) { + result.add( f ); + } else { + LinearModelAnalyzer.log.info( "Dropping " + f + " from subset" ); + } - // In the ANOVA table. - String factTableLabel = StringUtils.join( interactionFactorNames, ":" ); - label2Factors.put( factTableLabel, new HashSet<>() ); - label2Factors.get( factTableLabel ).addAll( interactionTerms ); } - } - } - private int countNumberOfGenesNotSeenAlready( @Nullable Collection genesForProbe, Collection seenGenes ) { - int numGenes = 0; - if ( genesForProbe != null ) { - for ( Gene g : genesForProbe ) { - if ( seenGenes.contains( g ) ) { - continue; - } - numGenes++; - } - seenGenes.addAll( genesForProbe ); } - return numGenes; + return result; } /** + * Perform a differential expression analysis. + * * @param bioAssaySet source data, could be a SubSet * @param expressionData data (for the subset, if it's a subset) * @param samplesUsed samples analyzed @@ -653,7 +457,6 @@ private DifferentialExpressionAnalysis doAnalysis( BioAssaySet bioAssaySet, Map baselineConditions, @Nullable FactorValue subsetFactorValue, DifferentialExpressionAnalysisConfig config ) throws AnalysisException { - if ( factors.isEmpty() ) { throw new IllegalArgumentException( "Must provide at least one factor" ); } @@ -671,7 +474,6 @@ private DifferentialExpressionAnalysis doAnalysis( BioAssaySet bioAssaySet, } } - ExperimentalFactor interceptFactor = this.determineInterceptFactor( factors, quantitationType ); /* @@ -732,7 +534,7 @@ private DifferentialExpressionAnalysis doAnalysis( BioAssaySet bioAssaySet, Map> resultLists = new HashMap<>( properDesignMatrix.getTerms().size() ); Map> pvaluesForQvalue = new HashMap<>( properDesignMatrix.getTerms().size() ); - // We use the design matrix to ensure that we only consider terms that actually ended up in the model. + // We use the design matrix to ensure that we only consider terms that actually ended up in the model. for ( String factorName : properDesignMatrix.getTerms() ) { if ( !label2Factors.containsKey( factorName ) ) continue; // so we skip the intercept log.info( "Setting up results for " + factorName ); @@ -840,37 +642,147 @@ private DifferentialExpressionAnalysis doAnalysis( BioAssaySet bioAssaySet, Map interactionContrastCoeffs = lm.getContrastCoefficients( factorName ); Map interactionContrastPValues = lm.getContrastPValues( factorName ); - for ( String term : interactionContrastPValues.keySet() ) { - Double contrastPvalue = interactionContrastPValues.get( term ); + for ( String term : interactionContrastPValues.keySet() ) { + Double contrastPvalue = interactionContrastPValues.get( term ); + + this.makeContrast( probeAnalysisResult, factorsForName, term, factorName, contrastPvalue, + interactionContrastTStats, interactionContrastCoeffs ); + + } + } else { + throw new UnsupportedOperationException( "Handling more than two-way interactions is not implemented." ); + } + + probeAnalysisResult.setPvalue( this.nan2Null( overallPValue ) ); + pvaluesForQvalue.get( factorName ).add( overallPValue ); + resultLists.get( factorName ).add( probeAnalysisResult ); + } // over terms + + if ( ++processed % 1000 == 0 ) { + LinearModelAnalyzer.log.info( "Processed results for " + processed + " elements..." ); + } + } // over probes + + this.fillRanksAndQvalues( resultLists, pvaluesForQvalue ); + + DifferentialExpressionAnalysis expressionAnalysis = this + .makeAnalysisEntity( bioAssaySet, expressionData.getQuantitationType(), config, label2Factors, + baselineConditions, interceptFactor, interactionFactorLists, oneSampleTTest, resultLists, + subsetFactorValue, filterResultDescription ); + + LinearModelAnalyzer.log.info( "Analysis processing phase done ..." ); + + return expressionAnalysis; + } + + private DoubleMatrix1D getLibrarySizes( DifferentialExpressionAnalysisConfig config, + ExpressionDataDoubleMatrix dmatrix ) { + + DoubleMatrix1D librarySize = new DenseDoubleMatrix1D( dmatrix.columns() ); + if ( config.isUseWeights() ) { + for ( int i = 0; i < dmatrix.columns(); i++ ) { + BioAssay ba = dmatrix.getBioAssayForColumn( i ); + + Long sequenceReadCount = ba.getSequenceReadCount(); + if ( !ba.getIsOutlier() && ( sequenceReadCount == null || sequenceReadCount == 0 ) ) { + // double check. + double[] col = dmatrix.getColumnAsDoubles( i ); + double maxExpression = DescriptiveWithMissing.max( new DoubleArrayList( col ) ); + if ( maxExpression <= 0 ) { + throw new IllegalStateException( + "Sample has a null or zero read-count, isn't marked as an outlier, and max expression level is " + maxExpression + + ": " + ba ); + } + } + + if ( sequenceReadCount == null ) sequenceReadCount = 0L; + + librarySize.set( i, sequenceReadCount ); + } + } + return librarySize; + } + + /** + * @return boolean true if we need the intercept. + */ + private boolean checkIfNeedToTreatAsIntercept( ExperimentalFactor experimentalFactor, + QuantitationType quantitationType ) { + if ( experimentalFactor.getFactorValues().size() == 1 ) { + if ( quantitationType.getIsRatio() ) { + return true; + } + throw new IllegalArgumentException( + "Cannot deal with constant factors unless the data are ratiometric non-reference design" ); + } + return false; + } + + /** + * Create files that can be used in R to check the results. + */ + private void outputForDebugging( ExpressionDataDoubleMatrix dmatrix, + ObjectMatrix designMatrix ) { + MatrixWriter mw = new MatrixWriter( entityUrlBuilder, buildInfo ); + try ( FileWriter writer = new FileWriter( File.createTempFile( "data.", ".txt" ) ); + FileWriter out = new FileWriter( File.createTempFile( "design.", ".txt" ) ) ) { + + mw.write( dmatrix, ProcessedExpressionDataVector.class, writer ); + + ubic.basecode.io.writer.MatrixWriter dem = new ubic.basecode.io.writer.MatrixWriter<>( + out ); + dem.writeMatrix( designMatrix, true ); + + } catch ( IOException e ) { + log.error( "An I/O error occurred when producing debugging output.", e ); + } + } + + /** + * Populate the interactionFactorLists and label2Factors + * + * @param interactionFactorLists gets populated + */ + private void buildModelFormula( final DifferentialExpressionAnalysisConfig config, + final Map> label2Factors, + final List interactionFactorLists ) { + + /* + * Add interaction terms + */ + boolean hasInteractionTerms = config.getInteractionsToInclude() != null && !config.getInteractionsToInclude().isEmpty(); - this.makeContrast( probeAnalysisResult, factorsForName, term, factorName, contrastPvalue, - interactionContrastTStats, interactionContrastCoeffs ); + if ( hasInteractionTerms ) { + for ( Collection interactionTerms : config.getInteractionsToInclude() ) { - } - } else { - throw new UnsupportedOperationException( "Handling more than two-way interactions is not implemented." ); + List interactionFactorNames = new ArrayList<>(); + for ( ExperimentalFactor factor : interactionTerms ) { + interactionFactorNames.add( DiffExAnalyzerUtils.nameForR( factor ) ); } - probeAnalysisResult.setPvalue( this.nan2Null( overallPValue ) ); - pvaluesForQvalue.get( factorName ).add( overallPValue ); - resultLists.get( factorName ).add( probeAnalysisResult ); - } // over terms + interactionFactorLists.add( interactionFactorNames.toArray( new String[] {} ) ); - if ( ++processed % 1000 == 0 ) { - LinearModelAnalyzer.log.info( "Processed results for " + processed + " elements..." ); + // In the ANOVA table. + String factTableLabel = StringUtils.join( interactionFactorNames, ":" ); + label2Factors.put( factTableLabel, new HashSet<>() ); + label2Factors.get( factTableLabel ).addAll( interactionTerms ); } - } // over probes - - this.fillRanksAndQvalues( resultLists, pvaluesForQvalue ); - - DifferentialExpressionAnalysis expressionAnalysis = this - .makeAnalysisEntity( bioAssaySet, expressionData.getQuantitationType(), config, label2Factors, - baselineConditions, interceptFactor, interactionFactorLists, oneSampleTTest, resultLists, - subsetFactorValue, filterResultDescription ); + } + } - LinearModelAnalyzer.log.info( "Analysis processing phase done ..." ); + private int countNumberOfGenesNotSeenAlready( @Nullable Collection genesForProbe, Collection seenGenes ) { + int numGenes = 0; + if ( genesForProbe != null ) { + for ( Gene g : genesForProbe ) { + if ( seenGenes.contains( g ) ) { + continue; + } + numGenes++; + } + seenGenes.addAll( genesForProbe ); + } - return expressionAnalysis; + return numGenes; } private void warnForElement( CompositeSequence el, String s, int warned ) { @@ -929,63 +841,6 @@ private Map> getSampleToFactorValuesMap( Experimen .collect( Collectors.toSet() ) ) ); } - /** - * Remove all configurations that have to do with factors that aren't in the selected factors. - * - * @param factors the factors that will be included - * @return an updated config; the baselines are cleared; subset is cleared; interactions are only kept if - * they only - * involve the given factors. - */ - private DifferentialExpressionAnalysisConfig fixConfigForSubset( List factors, - FactorValue subsetFactorValue, DifferentialExpressionAnalysisConfig config ) { - DifferentialExpressionAnalysisConfig newConfig = new DifferentialExpressionAnalysisConfig( config ); - newConfig.setSubsetFactor( null ); - newConfig.setSubsetFactorValue( subsetFactorValue ); - // remove any factors not in the given subset - Set factorsSet = new HashSet<>( factors ); - newConfig.getFactorsToInclude().removeIf( f -> !factorsSet.contains( f ) ); - newConfig.getInteractionsToInclude().removeIf( fv -> !factorsSet.containsAll( fv ) ); - return newConfig; - } - - /** - * Remove factors which are no longer usable, based on the subset. - */ - private List fixFactorsForSubset( ExpressionExperimentSubSet eesubSet, ExpressionDataDoubleMatrix dmatrix, - List factors ) { - - List result = new ArrayList<>(); - - QuantitationType quantitationType = dmatrix.getQuantitationType(); - ExperimentalFactor interceptFactor = this.determineInterceptFactor( factors, quantitationType ); - - /* - * Remove any constant factors, unless they are that intercept. - */ - for ( ExperimentalFactor f : factors ) { - - if ( f.getType().equals( FactorType.CONTINUOUS ) ) { - result.add( f ); - } else if ( interceptFactor != null && interceptFactor.equals( f ) ) { - result.add( f ); - } else { - - Collection levels = new HashSet<>(); - DiffExAnalyzerUtils.populateFactorValuesFromBASet( eesubSet, f, levels ); - - if ( levels.size() > 1 ) { - result.add( f ); - } else { - LinearModelAnalyzer.log.info( "Dropping " + f + " from subset" ); - } - - } - - } - - return result; - } /** * Needed to compute the number of genes tested/detected. @@ -1153,6 +1008,157 @@ private Set makeResultSets( return resultSets; } + /** + * Generate HitListSize entities that will be stored to count the number of diff. ex probes at various preset + * thresholds, to avoid wasting time generating these counts on the fly later. This is done automatically during + * analysis, so is just here to allow 'backfilling'. + * + * @param probeToGeneMap map + * @param results results + * @return hit list sizes + */ + private Set computeHitListSizes( Collection results, + Map> probeToGeneMap ) { + Set hitListSizes = new HashSet<>(); + StopWatch timer = new StopWatch(); + timer.start(); + double maxThreshold = MathUtil.max( LinearModelAnalyzer.qValueThresholdsForHitLists ); + + Collection allGenes = new HashSet<>(); + for ( Collection genes : probeToGeneMap.values() ) { + allGenes.addAll( genes ); + } + + // maps from Doubles are a bit dodgy... + Map upCounts = new HashMap<>(); + Map downCounts = new HashMap<>(); + Map eitherCounts = new HashMap<>(); + + Map upCountGenes = new HashMap<>(); + Map downCountGenes = new HashMap<>(); + Map eitherCountGenes = new HashMap<>(); + + Collection seenGenes = new HashSet<>(); + for ( DifferentialExpressionAnalysisResult r : results ) { + + Double corrP = r.getCorrectedPvalue(); + if ( corrP == null || corrP > maxThreshold ) { + continue; + } + + CompositeSequence probe = r.getProbe(); + Collection genesForProbe = probeToGeneMap.get( probe ); + int numGenes = 0; + if ( genesForProbe != null ) { + numGenes = this.countNumberOfGenesNotSeenAlready( genesForProbe, seenGenes ); + } + + // if ( numGenes == 0 ) // This is okay; it might mean we already counted it as differentially expressed. + + Collection crs = r.getContrasts(); + boolean up = false; + boolean down = false; + + /* + * We set up and down to be true (either or both) if at least on contrast is shown. + */ + for ( ContrastResult cr : crs ) { + Double lf = cr.getLogFoldChange(); + //noinspection StatementWithEmptyBody // Better readability + if ( lf == null ) { + /* + * A contrast which is actually not valid, so it won't be counted in the hit list. + */ + } else if ( lf < 0 ) { + down = true; + } else if ( lf > 0 ) { + up = true; + } + } + + for ( double thresh : LinearModelAnalyzer.qValueThresholdsForHitLists ) { + + if ( !upCounts.containsKey( thresh ) ) { + upCounts.put( thresh, 0 ); + upCountGenes.put( thresh, 0 ); + } + if ( !downCounts.containsKey( thresh ) ) { + downCounts.put( thresh, 0 ); + downCountGenes.put( thresh, 0 ); + } + if ( !eitherCounts.containsKey( thresh ) ) { + eitherCounts.put( thresh, 0 ); + eitherCountGenes.put( thresh, 0 ); + } + + if ( corrP < thresh ) { + if ( up ) { + upCounts.put( thresh, upCounts.get( thresh ) + 1 ); + upCountGenes.put( thresh, upCountGenes.get( thresh ) + numGenes ); + } + if ( down ) { + downCounts.put( thresh, downCounts.get( thresh ) + 1 ); + downCountGenes.put( thresh, downCountGenes.get( thresh ) + numGenes ); + } + + eitherCounts.put( thresh, eitherCounts.get( thresh ) + 1 ); + eitherCountGenes.put( thresh, eitherCountGenes.get( thresh ) + numGenes ); + + } + } + } + + for ( double thresh : LinearModelAnalyzer.qValueThresholdsForHitLists ) { + // Ensure we don't set values to null. + int up = upCounts.getOrDefault( thresh, 0 ); + int down = downCounts.getOrDefault( thresh, 0 ); + int either = eitherCounts.getOrDefault( thresh, 0 ); + + int upGenes = upCountGenes.getOrDefault( thresh, 0 ); + int downGenes = downCountGenes.getOrDefault( thresh, 0 ); + int eitherGenes = eitherCountGenes.getOrDefault( thresh, 0 ); + + assert !( allGenes.size() < upGenes || allGenes.size() < downGenes + || allGenes.size() < eitherGenes ) : "There were more genes differentially expressed than exist in the experiment"; + + HitListSize upS = HitListSize.Factory.newInstance( thresh, up, Direction.UP, upGenes ); + HitListSize downS = HitListSize.Factory.newInstance( thresh, down, Direction.DOWN, downGenes ); + HitListSize eitherS = HitListSize.Factory.newInstance( thresh, either, Direction.EITHER, eitherGenes ); + + hitListSizes.add( upS ); + hitListSizes.add( downS ); + hitListSizes.add( eitherS ); + + assert upGenes <= eitherGenes : "More genes upregulated than upregulated+downregulated"; + assert downGenes <= eitherGenes : "More genes downregulated than upregulated+downregulated"; + + } + + if ( timer.getTime() > 1000 ) { + LinearModelAnalyzer.log.info( "Hitlist computation: " + timer.getTime() + " ms" ); + } + return hitListSizes; + } + + /** + * Utility method + * + * @param probeToGeneMap map + * @param resultList result list + * @return number of genes tested + */ + private int getNumberOfGenesTested( Collection resultList, + Map> probeToGeneMap ) { + Collection gs = new HashSet<>(); + for ( DifferentialExpressionAnalysisResult d : resultList ) { + CompositeSequence probe = d.getProbe(); + if ( probeToGeneMap.containsKey( probe ) ) { + gs.addAll( probeToGeneMap.get( probe ) ); + } + } + return gs.size(); + } + /** * Add a contrast to the given result. */ @@ -1215,7 +1221,7 @@ private void makeContrast( DifferentialExpressionAnalysisResult probeAnalysisRes for ( ExperimentalFactor f : factorList ) { for ( FactorValue fv : f.getFactorValues() ) { - if ( fv.getId().equals( factorValueId ) ) { + if ( Objects.equals( fv.getId(), factorValueId ) ) { contrast.setFactorValue( fv ); break; } @@ -1237,7 +1243,7 @@ private void makeContrast( DifferentialExpressionAnalysisResult probeAnalysisRes for ( ExperimentalFactor f : factorList ) { for ( FactorValue fv : f.getFactorValues() ) { - if ( fv.getId().equals( factorValueId ) ) { + if ( Objects.equals( fv.getId(), factorValueId ) ) { contrast.setSecondFactorValue( fv ); break; } diff --git a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDao.java b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDao.java index 3e20bb400f..153612df01 100644 --- a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDao.java +++ b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDao.java @@ -240,6 +240,10 @@ class Identifiers { Map> getAuditEvents( Collection ids ); + Collection getBioAssays( ExpressionExperiment ee, boolean includeSubSets ); + + Collection getSamplesUsed( ExpressionExperiment ee, boolean includeSubSets ); + Collection getBioAssayDimensions( ExpressionExperiment expressionExperiment ); /** diff --git a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDaoImpl.java b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDaoImpl.java index 5a72437049..5793fe7825 100644 --- a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDaoImpl.java +++ b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentDaoImpl.java @@ -1411,6 +1411,40 @@ public Map> getAuditEvents( Collection ids ) } + @Override + public Collection getBioAssays( ExpressionExperiment ee, boolean includeSubSets ) { + //noinspection unchecked + Set results = new HashSet<>( ( List ) getSessionFactory().getCurrentSession() + .createQuery( "select ba from ExpressionExperiment ee join ee.bioAssays ba where ee = :ee" ) + .setParameter( "ee", ee ) + .list() ); + if ( includeSubSets ) { + //noinspection unchecked + results.addAll( ( List ) getSessionFactory().getCurrentSession() + .createQuery( "select ba from ExpressionExperimentSubSet eess join eess.bioAssays ba where eess.sourceExperiment = :ee" ) + .setParameter( "ee", ee ) + .list() ); + } + return results; + } + + @Override + public Collection getSamplesUsed( ExpressionExperiment ee, boolean includeSubSets ) { + //noinspection unchecked + Set results = new HashSet<>( ( List ) getSessionFactory().getCurrentSession() + .createQuery( "select bm from ExpressionExperiment ee join ee.bioAssays ba join ba.sampleUsed bm where ee = :ee" ) + .setParameter( "ee", ee ) + .list() ); + if ( includeSubSets ) { + //noinspection unchecked + results.addAll( ( List ) getSessionFactory().getCurrentSession() + .createQuery( "select bm from ExpressionExperimentSubSet eess join eess.bioAssays ba join ba.sampleUsed bm where eess.sourceExperiment = :ee" ) + .setParameter( "ee", ee ) + .list() ); + } + return results; + } + @Override public Collection getBioAssayDimensions( ExpressionExperiment expressionExperiment ) { //noinspection unchecked diff --git a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentService.java b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentService.java index af94f4116e..f5b21b089a 100644 --- a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentService.java +++ b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentService.java @@ -664,6 +664,23 @@ class CharacteristicWithUsageStatisticsAndOntologyTerm { */ Map getTaxaUsageFrequency( @Nullable Filters filters, @Nullable Set extraIds ); + /** + * Obtain the assays used by the experiment. + * + * @param includeSubSets include assays that belong to subsets of the experiment. + * @see ExpressionExperiment#getBioAssays() + * @see ExpressionExperimentSubSet#getBioAssays() + */ + Collection getBioAssays( ExpressionExperiment ee, boolean includeSubSets ); + + /** + * Obtain the samples used by the assay of the experiment. + * + * @param includeSubSets include samples that are associated to assays that belong to subsets of the experiment. + * @see BioAssay#getSampleUsed() + */ + Collection getSamplesUsed( ExpressionExperiment ee, boolean includeSubSets ); + /** * Obtain all the dimensions associated to the given experiment. *

    diff --git a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentServiceImpl.java b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentServiceImpl.java index 70a421f5da..eb6e8a6fb2 100755 --- a/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentServiceImpl.java +++ b/gemma-core/src/main/java/ubic/gemma/persistence/service/expression/experiment/ExpressionExperimentServiceImpl.java @@ -1336,6 +1336,18 @@ public Map getTaxaUsageFrequency( @Nullable Filters filters, @Nulla } } + @Override + @Transactional(readOnly = true) + public Collection getBioAssays( ExpressionExperiment ee, boolean includeSubSets ) { + return expressionExperimentDao.getBioAssays( ee, includeSubSets ); + } + + @Override + @Transactional(readOnly = true) + public Collection getSamplesUsed( ExpressionExperiment ee, boolean includeSubSets ) { + return expressionExperimentDao.getSamplesUsed( ee, includeSubSets ); + } + @Override @Transactional(readOnly = true) public Collection getBioAssayDimensionsWithAssays( ExpressionExperiment expressionExperiment ) { From 36efca6c7ae75b7fc8b9f9115ca063db4d343591 Mon Sep 17 00:00:00 2001 From: OganM Date: Fri, 6 Mar 2026 11:51:35 -0800 Subject: [PATCH 2/6] fix setting samplesUsed --- .../core/analysis/expression/diff/LinearModelAnalyzer.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java index cb13235520..5b6048d805 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/expression/diff/LinearModelAnalyzer.java @@ -239,10 +239,12 @@ private Collection doSubSetAnalysis( ExpressionE /* * make a EESubSet */ + List bioMaterials = orderByExperimentalDesign( dmatrixBySubset.get( subsetFactorValue ), factors, null ); + String subsetName = FactorValueUtils.getSummaryString( subsetFactorValue ); ExpressionExperimentSubSet eeSubSet = ExpressionExperimentSubSet.Factory.newInstance( subsetName, expressionExperiment ); Collection bioAssays = new HashSet<>(); - for ( BioMaterial bm : samplesUsed ) { + for ( BioMaterial bm : bioMaterials ) { bioAssays.addAll( bm.getBioAssaysUsedIn() ); } eeSubSet.getBioAssays().addAll( bioAssays ); From 61a66e8b86b37a677dccedc00bfe457fa6a69333 Mon Sep 17 00:00:00 2001 From: OganM Date: Mon, 9 Mar 2026 15:00:22 -0700 Subject: [PATCH 3/6] suggested changes --- .../java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java | 2 +- .../src/main/java/ubic/gemma/cli/completion/CompletionType.java | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java index 9572094d72..57cbb1e372 100644 --- a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java @@ -246,7 +246,7 @@ protected void buildExperimentOptions( Options options ) { // destination (db, standard location or custom directory) options.addOption( "nodb", "no-db", false, "Do not persist diff. ex. results to the database and instead save them to the current directory (or the location defined by " + formatOption( options, DataFileOptionsUtils.OUTPUT_DIR_OPTION ) + ")." ); - options.addOption( "nofiles", "no-files", false, "Don't create archive files after analysis. Default is to make them. This is incompatible with " + formatOption( options, "nodb" ) + " option to be set." ); + options.addOption( "nofiles", "no-files", false, "Don't create archive files after analysis. Default is to make them. This is incompatible with " + formatOption( options, "nodb" ) + "." ); // redo mode options.addOption( "redo", "redo", false, diff --git a/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java b/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java index b60179429f..196fddcc5a 100644 --- a/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java +++ b/gemma-cli/src/main/java/ubic/gemma/cli/completion/CompletionType.java @@ -7,7 +7,6 @@ public enum CompletionType { DATASET_GROUP, EXTERNAL_DATABASE, DATASET, - SAMPLE, EXPERIMENTAL_FACTOR, /** * Complete experimental factors and interactions suitable for DEA. From 1f2a8fe3911d83dcd9f7bc7b1df89c76b45f0de3 Mon Sep 17 00:00:00 2001 From: OganM Date: Mon, 9 Mar 2026 16:14:58 -0700 Subject: [PATCH 4/6] useWeights as a flag --- .../gemma/apps/DifferentialExpressionAnalysisCli.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java index 57cbb1e372..6e4c683d99 100644 --- a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java @@ -172,6 +172,7 @@ private enum FactorSelectionMode { private Double filterMinUniqueValues; @Nullable private Double filterMinVariance; + private boolean useWeights; /** * Persist results to the database. @@ -300,6 +301,10 @@ protected void buildExperimentOptions( Options options ) { .type( Double.class ) .desc( "Minimum variance required for a design element to be included in the analysis. Defaults to " + DifferentialExpressionAnalysisFilter.DEFAULT_MINIMUM_VARIANCE + "." ) .get() ); + options.addOption( Option.builder( "useWeights" ) + .hasArg( false ) + .desc( "Use weights" ) + .get() ); // delete mode options.addOption( "delete", "delete", false, "Remove all the existing analyses for the specified experiment(s). Use with care!" ); @@ -403,6 +408,7 @@ protected void processExperimentOptions( CommandLine commandLine ) throws ParseE // TODO: check if the analysis being redone is a subset analysis this.ignoreFailingSubsets = hasOption( commandLine, "ignoreFailingSubsets", requires( anyOf( toBeSet( "subset" ), toBeSet( "redo" ) ) ) ); + this.useWeights = commandLine.hasOption("useWeights"); } @Override @@ -475,7 +481,8 @@ protected void processExpressionExperiment( ExpressionExperiment ee ) { config.setPersist( this.persist ); config.setMakeArchiveFile( this.persist && this.makeArchiveFiles ); config.setIgnoreFailingSubsets( this.ignoreFailingSubsets ); - config.setUseWeights( eeService.isRNASeq( ee ) ); + // if useWeights flag is provided, force to true, otherwise check if RNAseq + config.setUseWeights( this.useWeights ? this.useWeights : eeService.isRNASeq( ee ) ); // sample selection if ( this.sampleIdentifiers != null ) { From a9373de73f6e45ff6b368b8f4de324ee9be88fd0 Mon Sep 17 00:00:00 2001 From: OganM Date: Mon, 9 Mar 2026 18:39:02 -0700 Subject: [PATCH 5/6] actually accept samples --- .../ubic/gemma/apps/DifferentialExpressionAnalysisCli.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java index 6e4c683d99..37495d48bc 100644 --- a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java @@ -236,7 +236,10 @@ protected void buildExperimentOptions( Options options ) { + "If the experiment already has subsets for the factor, those will be reused. " + "This is incompatible with -redo,--redo, -redoAnalysis,--redo-analysis or -redoSubset,--redo-subset." ).get() ); - addSingleExperimentOption( options, Option.builder( "samples" ).longOpt( "samples" ).argName( "ID, name, accession" ) + addSingleExperimentOption( options, Option.builder( "samples" ) + .longOpt( "samples" ) + .hasArgs() + .argName( "ID, name, accession" ) .desc( "ID, name or accession of samples to be included in the analysis. Defaults to all samples in the dataset being analyzed. Requires the " + formatOption( "nodb", "no-db" ) + " option to be set." ).get() ); options.addOption( "usebatch", "use-batch-factor", false, "If a batch factor is available, use it. Otherwise, batch information can/will be ignored in the analysis. This is incompatible with " + formatOption( options, "factors" ) + ", -redo,--redo, -redoAnalysis,--redo-analysis and -redoSubset,--redo-subset." ); From f11133fb943af6474eeb9e3e7d2f12ca453dc817 Mon Sep 17 00:00:00 2001 From: OganM Date: Mon, 9 Mar 2026 20:00:35 -0700 Subject: [PATCH 6/6] an attempt to leverage bioAssays for the search for samples --- .../ubic/gemma/apps/DifferentialExpressionAnalysisCli.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java index 37495d48bc..3fb6ddd7ed 100644 --- a/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/apps/DifferentialExpressionAnalysisCli.java @@ -493,7 +493,11 @@ protected void processExpressionExperiment( ExpressionExperiment ee ) { for ( String sampleId : sampleIdentifiers ) { // allow searching in subsets, those are used in single-cell diff ex. analysis that use sub-samples for // representing pseudo-bulks - samplesToUse.add( entityLocator.locateSample( ee, sampleId, true ) ); + BioMaterial sample = entityLocator.locateSample( ee, sampleId, true ); + if (sample == null){ + sample = entityLocator.locateBioAssay( ee, sampleId, true ).getSampleUsed(); + } + samplesToUse.add( sample ); } config.setSamplesToInclude( samplesToUse ); }