diff --git a/gemma-cli/src/main/java/ubic/gemma/apps/SplitExperimentCli.java b/gemma-cli/src/main/java/ubic/gemma/apps/SplitExperimentCli.java index 5173af0503..8d715ec605 100644 --- a/gemma-cli/src/main/java/ubic/gemma/apps/SplitExperimentCli.java +++ b/gemma-cli/src/main/java/ubic/gemma/apps/SplitExperimentCli.java @@ -20,19 +20,14 @@ package ubic.gemma.apps; import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.springframework.beans.factory.annotation.Autowired; +import ubic.gemma.cli.util.EntityLocator; import ubic.gemma.core.analysis.preprocess.SplitExperimentService; import ubic.gemma.model.analysis.expression.ExpressionExperimentSet; -import ubic.gemma.model.expression.experiment.ExperimentFactorUtils; import ubic.gemma.model.expression.experiment.ExperimentalFactor; -import ubic.gemma.model.expression.experiment.ExperimentalFactorValueObject; import ubic.gemma.model.expression.experiment.ExpressionExperiment; -import ubic.gemma.persistence.service.expression.experiment.ExperimentalFactorService; - -import java.util.Collection; /** * Split an experiment into parts based on an experimental factor @@ -41,18 +36,18 @@ */ public class SplitExperimentCli extends ExpressionExperimentManipulatingCLI { - /** - * - */ - private static final String FACTOR_OPTION = "factor"; + private static final String FACTOR_OPTION = "factor", + SKIP_POST_PROCESSING_OPTION = "nopost", + DELETE_ORIGINAL_EXPERIMENT_OPTION = "deleteOriginalExperiment"; @Autowired private SplitExperimentService serv; @Autowired - private ExperimentalFactorService efs; + private EntityLocator entityLocator; - private Long factorId; - private String factorName; + private String factorIdentifier; + private boolean skipPostProcessing; + private boolean deleteOriginalExperiment; public SplitExperimentCli() { super(); @@ -71,80 +66,23 @@ public String getShortDesc() { @Override protected 
void buildExperimentOptions( Options options ) { - options.addOption( Option.builder( FACTOR_OPTION ).hasArg() - .desc( "ID numbers, categories or names of the factor to use, with spaces replaced by underscores (must not be 'batch')" ) - .build() ); + options.addRequiredOption( FACTOR_OPTION, "factor", true, "ID numbers, categories or names of the factor to use, with spaces replaced by underscores (must not be 'batch')" ); + options.addOption( SKIP_POST_PROCESSING_OPTION, "no-post-processing", false, "Skip post-processing of resulting splits if applicable." ); + options.addOption( DELETE_ORIGINAL_EXPERIMENT_OPTION, "delete-original-experiment", false, "Delete the original experiment once the split succeeds." ); } @Override protected void processExperimentOptions( CommandLine commandLine ) throws ParseException { - if ( !commandLine.hasOption( FACTOR_OPTION ) ) { - throw new IllegalArgumentException( "Please specify the factor" ); - } - String rawFactor = commandLine.getOptionValue( FACTOR_OPTION ); - try { - this.factorId = Long.parseLong( rawFactor ); - } catch ( NumberFormatException e ) { - this.factorName = rawFactor; - } + factorIdentifier = commandLine.getOptionValue( FACTOR_OPTION ); + skipPostProcessing = commandLine.hasOption( SKIP_POST_PROCESSING_OPTION ); + deleteOriginalExperiment = commandLine.hasOption( DELETE_ORIGINAL_EXPERIMENT_OPTION ); } @Override protected void processExpressionExperiment( ExpressionExperiment ee ) { ee = this.eeService.thawLite( ee ); - ExperimentalFactor splitOn = this.guessFactor( ee ); - ExpressionExperimentSet eeSet = serv.split( ee, splitOn, true ); + ExperimentalFactor splitOn = entityLocator.locateExperimentalFactor( ee, factorIdentifier ); + ExpressionExperimentSet eeSet = serv.split( ee, splitOn, !skipPostProcessing, deleteOriginalExperiment ); addSuccessObject( ee, "Experiment was split on " + splitOn + " into " + eeSet.getExperiments().size() + " parts." 
); } - - /** - * Adapted from code in DifferentialExpressionAnalysisCli - */ - private ExperimentalFactor guessFactor( ExpressionExperiment ee ) { - if ( ee.getExperimentalDesign() == null ) { - throw new IllegalStateException( ee + " does not have an experimental design, it cannot be split on a factor." ); - } - - if ( this.factorName != null ) { - - Collection experimentalFactors = ee.getExperimentalDesign().getExperimentalFactors(); - for ( ExperimentalFactor experimentalFactor : experimentalFactors ) { - - // has already implemented way of figuring out human-friendly name of factor value. - ExperimentalFactorValueObject fvo = new ExperimentalFactorValueObject( experimentalFactor ); - - // do not attempt to switch on 'batch' - if ( ExperimentFactorUtils.isBatchFactor( experimentalFactor ) ) { - continue; - } - - if ( factorName.contains( experimentalFactor.getName().replaceAll( " ", "_" ) ) ) { - return experimentalFactor; - } else if ( fvo.getCategory() != null && factorName - .contains( fvo.getCategory().replaceAll( " ", "_" ) ) ) { - return experimentalFactor; - } - } - - throw new IllegalArgumentException( "Didn't find factor the provided factor name " ); - - } - - ExperimentalFactor factor = efs.loadOrFail( factorId ); - factor = efs.thaw( factor ); - if ( factor == null ) { - throw new IllegalArgumentException( "No factor for id=" + factorId ); - } - if ( !factor.getExperimentalDesign().equals( ee.getExperimentalDesign() ) ) { - throw new IllegalArgumentException( "Factor with id=" + factorId + " does not belong to " + ee ); - } - - if ( ExperimentFactorUtils.isBatchFactor( factor ) ) { - throw new IllegalArgumentException( "Selected factor looks like batch, split not allowed, choose another factor instead" ); - } - - return factor; - - } - } diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentHelperService.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentHelperService.java new file 
mode 100644 index 0000000000..3d5d8263c2 --- /dev/null +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentHelperService.java @@ -0,0 +1,517 @@ +package ubic.gemma.core.analysis.preprocess; + + +import lombok.Value; +import lombok.extern.apachecommons.CommonsLog; +import org.apache.commons.lang3.StringUtils; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; +import org.springframework.util.Assert; +import ubic.gemma.core.analysis.preprocess.slice.BulkDataSlicerUtils; +import ubic.gemma.core.analysis.singleCell.SingleCellSlicerUtils; +import ubic.gemma.model.analysis.expression.ExpressionExperimentSet; +import ubic.gemma.model.common.description.Characteristic; +import ubic.gemma.model.common.description.DatabaseEntry; +import ubic.gemma.model.common.measurement.Measurement; +import ubic.gemma.model.common.quantitationtype.QuantitationType; +import ubic.gemma.model.expression.bioAssay.BioAssay; +import ubic.gemma.model.expression.bioAssayData.*; +import ubic.gemma.model.expression.biomaterial.BioMaterial; +import ubic.gemma.model.expression.biomaterial.Treatment; +import ubic.gemma.model.expression.experiment.*; +import ubic.gemma.persistence.persister.Persister; +import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; +import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentSetService; +import ubic.gemma.persistence.service.expression.experiment.SingleCellExpressionExperimentService; + +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static ubic.gemma.core.util.StringUtils.abbreviateWithSuffix; + +/** + * Helper service for {@link SplitExperimentService} to perform the split part in a transaction. 
+ */ +@CommonsLog +@Service +class SplitExperimentHelperService { + + @Autowired + private ExpressionExperimentService eeService; + @Autowired + private SingleCellExpressionExperimentService singleCellExpressionExperimentService; + @Autowired + private ExpressionExperimentSetService expressionExperimentSetService; + @Autowired + private Persister persister; + + @Value + static class ExperimentSplitResult { + /** + * Experiment set containing the resulting split experiments which are completely detached from the original + * experiment. + */ + ExpressionExperimentSet experimentSet; + /** + * Indicate if a preferred set of raw data vectors were found in the original experiment. This is used to decide + * if post-processing is possible. + */ + boolean foundPreferred; + } + + @Transactional + public ExperimentSplitResult split( ExpressionExperiment toSplit, ExperimentalFactor splitOn ) { + if ( !toSplit.getOtherParts().isEmpty() ) { + throw new IllegalArgumentException( "Cannot split an experiment that has other parts. Delete the other parts first before splitting again." ); + } + + if ( eeService.getArrayDesignsUsed( toSplit ).size() > 1 ) { + throw new IllegalArgumentException( "Cannot split experiments that are on more than one platform." ); + } + + if ( ExperimentFactorUtils.isBatchFactor( splitOn ) ) { + throw new IllegalArgumentException( "Cannot split an experiment on a batch factor." ); + } + + Set result = new HashSet<>(); + + String sourceShortName = toSplit.getShortName(); + + // we cannot rely on ExpressionExperiment.getQuantitationTypes() because it is a denormalization, and we might + // miss some vectors + Map, Set> qtsByVectorType = eeService.getQuantitationTypesByVectorType( toSplit ); + + if ( qtsByVectorType.isEmpty() ) { + log.warn( "Experiment has no QTs, probably doesn't have data, post-processing of splits will be skipped" ); + } + + // Get the expression data matrices for the experiment. 
We'll split them and generate new vectors + boolean foundPreferred = false; + Map> qt2RawVec = new HashMap<>(); + Map> qt2SingleCellVec = new HashMap<>(); + + for ( Map.Entry, Set> e : qtsByVectorType.entrySet() ) { + Class vectorType = e.getKey(); + Set qts = e.getValue(); + if ( RawExpressionDataVector.class.isAssignableFrom( vectorType ) ) { + log.info( "Fetching raw expression data vectors... " ); + for ( QuantitationType qt : qts ) { + Collection vectors = eeService.getRawDataVectors( toSplit, qt ); + if ( vectors.isEmpty() ) { + // this is okay if the data is processed, or if we have stray orphaned QTs + log.debug( "No raw vectors for " + qt + ", skipping..." ); + continue; + } + if ( qt.getIsPreferred() ) { + foundPreferred = true; + } + log.info( vectors.size() + " vectors for " + qt + "; preferred=" + qt.getIsPreferred() ); + + qt2RawVec.put( qt, ( vectors ) ); + } + if ( !foundPreferred ) { + log.warn( "No preferred quantitation type found; post-processing of splits will be skipped" ); + } + } else if ( SingleCellExpressionDataVector.class.isAssignableFrom( vectorType ) ) { + log.info( "Fetching single-cell data vectors..." ); + for ( QuantitationType qt : qts ) { + List vectors = new ArrayList<>( singleCellExpressionExperimentService.getSingleCellDataVectors( toSplit, qt ) ); + if ( vectors.isEmpty() ) { + log.warn( "No single-cell vectors for " + qt + ", skipping..." ); + continue; + } + qt2SingleCellVec.put( qt, vectors ); + } + } else if ( ProcessedExpressionDataVector.class.isAssignableFrom( vectorType ) ) { + log.debug( "Found processed data vectors; these will not be carried over to the splits." ); + } else { + throw new UnsupportedOperationException( "Unsupported data vector type for splitting: " + vectorType.getName() + "." ); + } + } + + // stub the new experiments and create new names; all other information should be retained. Permissions should be the same. 
+ int splitNumber = 0; + + for ( FactorValue splitValue : splitOn.getFactorValues() ) { + splitNumber++; + ExpressionExperiment split = ExpressionExperiment.Factory.newInstance(); + split.setShortName( sourceShortName + "." + splitNumber ); + + // copy everything but samples over + split.setName( generateNameForSplit( toSplit, splitNumber, splitValue ) ); + split.setDescription( "This experiment was created by Gemma splitting another: \n" + toSplit + toSplit.getDescription() ); + + split.setCharacteristics( this.cloneCharacteristics( toSplit.getCharacteristics() ) ); + split.setMetadata( toSplit.getMetadata() ); // + split.setPrimaryPublication( toSplit.getPrimaryPublication() ); + split.getOtherRelevantPublications().addAll( toSplit.getOtherRelevantPublications() ); + if ( toSplit.getAccession() != null ) { + split.setAccession( this.cloneAccession( toSplit.getAccession() ) ); // accession is currently unique, so have to clone + } + split.setOwner( toSplit.getOwner() ); + split.setSource( toSplit.getSource() ); + split.setTaxon( toSplit.getTaxon() ); + // starting with a fresh audit trail. 
+ + Map old2cloneFV = new HashMap<>(); + if ( toSplit.getExperimentalDesign() != null ) { + split.setExperimentalDesign( this.cloneExperimentalDesign( toSplit.getExperimentalDesign(), old2cloneFV ) ); + } + + // add the biomaterials + List clonedBAs = new ArrayList<>(); + Collection usedFactorValues = new HashSet<>(); + for ( BioAssay ba : toSplit.getBioAssays() ) { + boolean kept = false; + BioMaterial bm = ba.getSampleUsed(); + + // identify samples we want to include + // TODO: support sub-biomaterials and use getAllFactorValues() instead, we also need to implement + // cloneBioMaterial() accordingly + for ( FactorValue fv : bm.getFactorValues() ) { + if ( fv.equals( splitValue ) ) { + kept = true; + break; + } + } + + if ( kept ) { + BioAssay newBa = this.cloneBioAssay( ba ); + clonedBAs.add( newBa ); + for ( FactorValue fv : bm.getFactorValues() ) { + if ( fv.equals( splitValue ) ) { + // make a BioMaterial characteristic, so we don't lose the information (might be redundant) + for ( Characteristic c : fv.getCharacteristics() ) { + newBa.getSampleUsed().getCharacteristics().add( this.cloneCharacteristic( c ) ); + } + // note that the split FV is not included as a FV in the new BM because all the samples + // share the same value + continue; + } + newBa.getSampleUsed().getFactorValues().add( old2cloneFV.get( fv ) ); + usedFactorValues.add( old2cloneFV.get( fv ) ); + } + } + } + + split.getBioAssays().clear(); + split.getBioAssays().addAll( clonedBAs ); + split.setNumberOfSamples( clonedBAs.size() ); + + // remove unused factors and factor values from the design and biomaterials + if ( split.getExperimentalDesign() != null ) { + Collection toRemoveFactors = new HashSet<>(); + for ( ExperimentalFactor ef : split.getExperimentalDesign().getExperimentalFactors() ) { + Collection toRemove = new HashSet<>(); + for ( FactorValue fv : ef.getFactorValues() ) { // these are clones + if ( !usedFactorValues.contains( fv ) ) { + toRemove.add( fv ); + } + } + + if ( 
ef.getFactorValues().removeAll( toRemove ) ) { + log.info( toRemove.size() + " unused factor values removed for " + ef + " in split " + splitNumber + ", leaving " + + ef.getFactorValues().size() + " fvs still used" ); + } + + assert !split.getBioAssays().isEmpty(); + + // EFs that have only one level, or which aren't used at all, are removed from the biomaterials (and gathered for removal from the ED) + if ( ef.getFactorValues().size() <= 1 ) { + toRemoveFactors.add( ef ); + for ( BioAssay ba : split.getBioAssays() ) { + BioMaterial bm = ba.getSampleUsed(); + Collection fvsToClear = new HashSet<>(); + for ( FactorValue fv : bm.getFactorValues() ) { + if ( fv.getExperimentalFactor().equals( ef ) ) { + fvsToClear.add( fv ); + } + } + if ( bm.getFactorValues().removeAll( fvsToClear ) ) { + log.debug( "Cleared " + fvsToClear.size() + " unused factor values from " + bm ); + } + } + } + } + + // remove the unused/unneeded factors from the ED + if ( split.getExperimentalDesign().getExperimentalFactors().removeAll( toRemoveFactors ) ) { + log.info( toRemoveFactors.size() + " unused experimental factors dropped from split " + splitNumber ); + } + } + + log.info( "Building vectors for " + qt2RawVec.size() + " quantitation types ..." 
); + Map dimensionCache = new HashMap<>(); + for ( QuantitationType qt : qt2RawVec.keySet() ) { + QuantitationType clonedQt = this.cloneQt( qt, split ); + split.getQuantitationTypes().add( clonedQt ); + // these bms are same as the ones associated with the vectors, not the clones + Collection vectors = qt2RawVec.get( qt ); + Collection rawDataVectors = BulkDataSlicerUtils.slice( vectors, clonedBAs, RawExpressionDataVector.class, true, dimensionCache ); + // slice() retains the original QT, so replace it with the clone on the sliced vectors only; the original vectors must keep their own QT + rawDataVectors.forEach( v -> v.setQuantitationType( clonedQt ) ); + log.info( split.getShortName() + ": Adding " + rawDataVectors.size() + " raw data vectors for " + clonedQt + " preferred=" + + clonedQt.getIsPreferred() ); + split.getRawExpressionDataVectors().addAll( rawDataVectors ); + } + + Map singleCellDimensionCache = new HashMap<>(); + + for ( QuantitationType qt : qt2SingleCellVec.keySet() ) { + QuantitationType clonedQt = this.cloneQt( qt, split ); + split.getQuantitationTypes().add( clonedQt ); + Collection scVectors = SingleCellSlicerUtils.slice( qt2SingleCellVec.get( qt ), clonedBAs, singleCellDimensionCache ); + // slice() retains the original QT, so replace it with the clone on the sliced vectors + scVectors.forEach( v -> v.setQuantitationType( clonedQt ) ); + log.info( split.getShortName() + ": Adding " + scVectors.size() + " single-cell data vectors for " + clonedQt + " preferred=" + + clonedQt.getIsSingleCellPreferred() ); + split.getSingleCellExpressionDataVectors().addAll( scVectors ); + } + + split = persister.persist( split ); + + // securityService.makePublic( split ); // temporary + result.add( split ); + } + + enforceOtherParts( result ); + eeService.update( result ); + + /* + * Create a new "experiment set" that groups them together (not sure if we'll keep this) + */ + ExpressionExperimentSet g = ExpressionExperimentSet.Factory.newInstance(); + g.setDescription( "Parts of " + toSplit.getShortName() + " that were split on " + 
splitOn.getName() ); + g.setName( toSplit.getShortName() + " splits" ); + g.setTaxon( toSplit.getTaxon() ); + g.getExperiments().addAll( result ); + g = this.expressionExperimentSetService.create( g ); + + return new ExperimentSplitResult( g, foundPreferred ); + } + + static String generateNameForSplit( ExpressionExperiment toSplit, int splitNumber, FactorValue splitValue ) { + String categoryString = StringUtils.strip( splitValue.getExperimentalFactor().getCategory() != null ? + splitValue.getExperimentalFactor().getCategory().getValue() : + splitValue.getExperimentalFactor().getName() ); + String factorValueString = FactorValueUtils.getSummaryString( splitValue ); + String suffix = String.format( " [%s = %s]", categoryString, factorValueString ); + return abbreviateWithSuffix( + String.format( "Split part %d of: %s", splitNumber, StringUtils.strip( toSplit.getName() ) ), suffix, + "…", ExpressionExperiment.MAX_NAME_LENGTH, true, StandardCharsets.UTF_8 ); + } + + private void enforceOtherParts( Collection result ) { + // Enforce relation to other parts of the split. 
+ for ( ExpressionExperiment split : result ) { + for ( ExpressionExperiment split2 : result ) { + if ( split.equals( split2 ) ) continue; + split.getOtherParts().add( split2 ); + } + } + } + + private ExperimentalDesign cloneExperimentalDesign( ExperimentalDesign experimentalDesign, Map old2cloneFV ) { + ExperimentalDesign clone = ExperimentalDesign.Factory.newInstance(); + clone.setDescription( experimentalDesign.getDescription() ); + clone.setName( experimentalDesign.getName() ); + clone.setNormalizationDescription( experimentalDesign.getNormalizationDescription() ); + clone.setQualityControlDescription( experimentalDesign.getQualityControlDescription() ); + clone.setReplicateDescription( experimentalDesign.getReplicateDescription() ); + clone.setTypes( this.cloneCharacteristics( experimentalDesign.getTypes() ) ); + + clone.getExperimentalFactors() + .addAll( this.cloneExperimentalFactors( experimentalDesign.getExperimentalFactors(), clone, old2cloneFV ) ); + + return clone; + } + + private Collection cloneExperimentalFactors( Collection experimentalFactors, ExperimentalDesign ed, + Map old2cloneFV ) { + assert ed.getId() == null; + Collection result = new HashSet<>(); + for ( ExperimentalFactor ef : experimentalFactors ) { + ExperimentalFactor clone = ExperimentalFactor.Factory.newInstance(); + //noinspection deprecation + clone.setAnnotations( this.cloneCharacteristics( ef.getAnnotations() ) ); + if ( ef.getCategory() != null ) { + clone.setCategory( this.cloneCharacteristic( ef.getCategory() ) ); + } + clone.setName( ef.getName() ); + clone.setDescription( ef.getDescription() ); + clone.setType( ef.getType() ); + clone.getFactorValues().addAll( this.cloneFactorValues( ef.getFactorValues(), clone, old2cloneFV ) ); + clone.setExperimentalDesign( ed ); + result.add( clone ); + // assert clone.getId() == null; + } + return result; + } + + private Collection cloneFactorValues( Collection factorValues, ExperimentalFactor ef, + Map old2cloneFV ) { + assert 
ef.getId() == null; + Collection result = new HashSet<>(); + for ( FactorValue fv : factorValues ) { + FactorValue clone = FactorValue.Factory.newInstance( ef ); + clone.setCharacteristics( cloneStatements( fv ) ); + clone.setIsBaseline( fv.getIsBaseline() ); + //noinspection deprecation + clone.setValue( fv.getValue() ); + if ( fv.getMeasurement() != null ) { + clone.setMeasurement( this.cloneMeasurement( fv.getMeasurement() ) ); + } + result.add( clone ); + assert !old2cloneFV.containsKey( fv ); + old2cloneFV.put( fv, clone ); + } + + return result; + } + + private Set cloneStatements( FactorValue fv ) { + Collection ch = fv.getCharacteristics(); + // pair of original -> clone + List result = new ArrayList<>( ch.size() ); + for ( Statement s : ch ) { + result.add( cloneStatement( s ) ); + } + return new HashSet<>( result ); + } + + private Statement cloneStatement( Statement s ) { + Statement clone = Statement.Factory.newInstance(); + clone.setName( s.getName() ); + clone.setDescription( s.getDescription() ); + clone.setOriginalValue( s.getOriginalValue() ); + clone.setSubject( s.getSubject() ); + clone.setSubjectUri( s.getSubjectUri() ); + clone.setCategory( s.getCategory() ); + clone.setCategoryUri( s.getCategoryUri() ); + clone.setEvidenceCode( s.getEvidenceCode() ); + clone.setPredicate( s.getPredicate() ); + clone.setPredicateUri( s.getPredicateUri() ); + clone.setObject( s.getObject() ); + clone.setObjectUri( s.getObjectUri() ); + clone.setSecondPredicate( s.getSecondPredicate() ); + clone.setSecondPredicateUri( s.getSecondPredicateUri() ); + clone.setSecondObject( s.getSecondObject() ); + clone.setSecondObjectUri( s.getSecondObjectUri() ); + return clone; + } + + private Measurement cloneMeasurement( Measurement measurement ) { + Measurement clone = Measurement.Factory.newInstance(); + clone.setKindCV( measurement.getKindCV() ); + clone.setRepresentation( measurement.getRepresentation() ); + clone.setOtherKind( measurement.getOtherKind() ); + 
clone.setValue( measurement.getValue() ); + clone.setType( measurement.getType() ); + return clone; + } + + private QuantitationType cloneQt( QuantitationType qt, ExpressionExperiment split ) { + QuantitationType clone = QuantitationType.Factory.newInstance(); + clone.setDescription( qt.getDescription() + " (created for split: " + split.getShortName() + ")" ); + clone.setName( qt.getName() ); + clone.setGeneralType( qt.getGeneralType() ); + clone.setIsBackground( qt.getIsBackground() ); + clone.setIsBackgroundSubtracted( qt.getIsBackgroundSubtracted() ); + clone.setIsBatchCorrected( qt.getIsBatchCorrected() ); + //noinspection deprecation + clone.setIsMaskedPreferred( qt.getIsMaskedPreferred() ); + clone.setIsNormalized( qt.getIsNormalized() ); + clone.setIsPreferred( qt.getIsPreferred() ); + clone.setIsRatio( qt.getIsRatio() ); + clone.setIsRecomputedFromRawData( qt.getIsRecomputedFromRawData() ); + clone.setRepresentation( qt.getRepresentation() ); + clone.setType( qt.getType() ); + clone.setScale( qt.getScale() ); + + return clone; + } + + private Set cloneCharacteristics( Collection ch ) { + Set result = new HashSet<>(); + for ( Characteristic c : ch ) { + Characteristic clone = cloneCharacteristic( c ); + result.add( clone ); + } + return result; + } + + private Characteristic cloneCharacteristic( Characteristic c ) { + Characteristic clone = Characteristic.Factory.newInstance(); + clone.setName( c.getName() ); + clone.setDescription( c.getDescription() ); + clone.setCategory( c.getCategory() ); + clone.setCategoryUri( c.getCategoryUri() ); + clone.setValue( c.getValue() ); + clone.setValueUri( c.getValueUri() ); + clone.setOriginalValue( c.getOriginalValue() ); + clone.setEvidenceCode( c.getEvidenceCode() ); + return clone; + } + + private BioAssay cloneBioAssay( BioAssay ba ) { + BioAssay clone = BioAssay.Factory.newInstance(); + + clone.setName( ba.getName() ); + clone.setArrayDesignUsed( ba.getArrayDesignUsed() ); + clone.setDescription( 
ba.getDescription() ); + clone.setMetadata( ba.getMetadata() ); + clone.setIsOutlier( ba.getIsOutlier() ); + clone.setOriginalPlatform( ba.getOriginalPlatform() ); + clone.setProcessingDate( ba.getProcessingDate() ); + clone.setSequencePairedReads( ba.getSequencePairedReads() ); + + clone.setSequenceReadCount( ba.getSequenceReadCount() ); + clone.setSequenceReadLength( ba.getSequenceReadLength() ); + + BioMaterial sampleClone = this.cloneBioMaterial( ba.getSampleUsed() ); + clone.setSampleUsed( sampleClone ); + sampleClone.getBioAssaysUsedIn().add( clone ); + if ( ba.getAccession() != null ) { + clone.setAccession( this.cloneAccession( ba.getAccession() ) ); + } + + return clone; + } + + private DatabaseEntry cloneAccession( DatabaseEntry de ) { + DatabaseEntry clone = DatabaseEntry.Factory.newInstance(); + clone.setAccession( de.getAccession() ); + clone.setAccessionVersion( de.getAccessionVersion() ); + clone.setUri( de.getUri() ); + clone.setExternalDatabase( de.getExternalDatabase() ); + return clone; + } + + private BioMaterial cloneBioMaterial( BioMaterial bm ) { + Assert.isNull( bm.getSourceBioMaterial(), "Cannot split an experiment with biomaterials that have a source biomaterial." 
); + BioMaterial clone = BioMaterial.Factory.newInstance(); + clone.setName( abbreviateWithSuffix( bm.getName(), " (Split)", "…", BioMaterial.MAX_NAME_LENGTH, true, StandardCharsets.UTF_8 ) ); // it is important we make a new name, so we don't confuse this with the previous one in findOrCreate(); + clone.setDescription( bm.getDescription() ); + clone.setCharacteristics( this.cloneCharacteristics( bm.getCharacteristics() ) ); + if ( bm.getExternalAccession() != null ) { + clone.setExternalAccession( this.cloneAccession( bm.getExternalAccession() ) ); + } + clone.setSourceTaxon( bm.getSourceTaxon() ); + clone.setTreatments( this.cloneTreatments( bm.getTreatments() ) ); + // Factor values are done separately + return clone; + } + + private Set cloneTreatments( Collection ts ) { + Set result = new HashSet<>(); + for ( Treatment t : ts ) { + Treatment clone = Treatment.Factory.newInstance(); + clone.setDescription( t.getDescription() ); + clone.setName( t.getName() ); + clone.setOrderApplied( t.getOrderApplied() ); + result.add( clone ); + } + return result; + } +} diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentService.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentService.java index de8f391a47..184745dc20 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentService.java +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentService.java @@ -24,8 +24,6 @@ import ubic.gemma.model.expression.experiment.ExperimentalFactor; import ubic.gemma.model.expression.experiment.ExpressionExperiment; -import java.util.Collection; - /** * TODO Document Me * @@ -37,11 +35,13 @@ public interface SplitExperimentService { * Split an experiment into multiple experiments based on a factor. The new experiments will automatically be given * short names to suit and the names will be appended with an indicator of the split. 
* - * @param expressionExperiment the experiment to split - * @param splitOn the factor to split the experiment on - * @param postProcess post-process the experiments resulting from the split + * @param expressionExperiment the experiment to split + * @param splitOn the factor to split the experiment on + * @param postProcess post-process the experiments resulting from the split + * @param deleteOriginalExperiment whether to delete the original experiment after splitting, otherwise it will only + * be marked as private * @return results of the split */ @Secured({ "GROUP_ADMIN", "ACL_SECURABLE_EDIT" }) - ExpressionExperimentSet split( ExpressionExperiment expressionExperiment, ExperimentalFactor splitOn, boolean postProcess ); + ExpressionExperimentSet split( ExpressionExperiment expressionExperiment, ExperimentalFactor splitOn, boolean postProcess, boolean deleteOriginalExperiment ); } diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentServiceImpl.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentServiceImpl.java index e0b95706ea..36152e12c5 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentServiceImpl.java +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/SplitExperimentServiceImpl.java @@ -20,41 +20,16 @@ package ubic.gemma.core.analysis.preprocess; import gemma.gsec.SecurityService; -import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; -import org.springframework.util.Assert; -import ubic.gemma.core.analysis.service.ExpressionDataFileService; -import ubic.gemma.core.datastructure.matrix.BulkExpressionDataMatrix; -import 
ubic.gemma.core.datastructure.matrix.BulkExpressionDataMatrixUtils; -import ubic.gemma.core.datastructure.matrix.MultiAssayBulkExpressionDataMatrix; +import ubic.gemma.core.analysis.service.ExpressionExperimentDeleterService; import ubic.gemma.model.analysis.expression.ExpressionExperimentSet; -import ubic.gemma.model.common.description.Characteristic; -import ubic.gemma.model.common.description.DatabaseEntry; -import ubic.gemma.model.common.measurement.Measurement; -import ubic.gemma.model.common.quantitationtype.PrimitiveType; -import ubic.gemma.model.common.quantitationtype.QuantitationType; -import ubic.gemma.model.expression.bioAssay.BioAssay; -import ubic.gemma.model.expression.bioAssayData.BioAssayDimension; -import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector; -import ubic.gemma.model.expression.biomaterial.BioMaterial; -import ubic.gemma.model.expression.biomaterial.Treatment; -import ubic.gemma.model.expression.experiment.*; -import ubic.gemma.persistence.persister.Persister; -import ubic.gemma.persistence.service.expression.bioAssayData.RawExpressionDataVectorService; -import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; -import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentSetService; -import ubic.gemma.persistence.service.expression.experiment.FactorValueService; - -import javax.annotation.Nullable; -import java.nio.charset.StandardCharsets; -import java.util.*; - -import static ubic.gemma.core.util.StringUtils.abbreviateWithSuffix; +import ubic.gemma.model.expression.experiment.ExperimentalFactor; +import ubic.gemma.model.expression.experiment.ExpressionExperiment; /** * @@ -73,510 +48,42 @@ public class SplitExperimentServiceImpl implements SplitExperimentService { @Autowired private PreprocessorService preprocessor; - @Autowired - private ExpressionExperimentService eeService; - - @Autowired - private RawExpressionDataVectorService rawExpressionDataVectorService; - - 
@Autowired - private Persister persister; - @Autowired private SecurityService securityService; @Autowired - private ExpressionDataFileService dataFileService; + private ExpressionExperimentDeleterService eeService; @Autowired - private ExpressionExperimentSetService expressionExperimentSetService; + private SplitExperimentHelperService splitExperimentHelperService; - @Autowired - private FactorValueService factorValueService; - - /* - * (non-Javadoc) - * - * @see ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService#split(ubic.gemma.model. - * expression.experiment.ExpressionExperiment, ubic.gemma.model.expression.experiment.ExperimentalFactor) - */ @Override - public ExpressionExperimentSet split( ExpressionExperiment toSplit, ExperimentalFactor splitOn, boolean postProcess ) { - if ( !toSplit.getOtherParts().isEmpty() ) { - throw new IllegalArgumentException( "You cannot split an experiment that was already created by a split" ); - } - - if ( eeService.getArrayDesignsUsed( toSplit ).size() > 1 ) { - throw new IllegalArgumentException( "Cannot split experiments that are on more than one platform" ); - } - - if ( ExperimentFactorUtils.isBatchFactor( splitOn ) ) { - throw new IllegalArgumentException( "Do not split experiments on 'batch'" ); - } - - Collection result = new HashSet<>(); - - String sourceShortName = toSplit.getShortName(); - - Collection qts = eeService.getQuantitationTypes( toSplit ); - - // Get the expression data matrices for the experiment. We'll split them and generate new vectors - boolean foundPreferred = false; - Map> qt2mat = new HashMap<>(); - - if ( !qts.isEmpty() ) { - log.info( "Fetching raw expression data vectors ... 
" ); - for ( QuantitationType qt : qts ) { - if ( !qt.getRepresentation().equals( PrimitiveType.DOUBLE ) ) { - throw new UnsupportedOperationException( "Non-double values currently not supported for experiment split" ); - } - - Collection vectors = rawExpressionDataVectorService.findAndThaw( qt ); - if ( vectors.isEmpty() ) { - // this is okay if the data is processed, or if we have stray orphaned QTs - log.debug( "No raw vectors for " + qt + "; preferred=" + qt.getIsPreferred() ); - continue; - } - if ( qt.getIsPreferred() ) { - foundPreferred = true; - } - log.info( vectors.size() + " vectors for " + qt + "; preferred=" + qt.getIsPreferred() ); - - qt2mat.put( qt, MultiAssayBulkExpressionDataMatrix.getMatrix( vectors ) ); - } - - if ( !foundPreferred ) { - log.warn( "No preferred quantitation type found; post-processing of splits will be skipped" ); - } - } else { - log.warn( "Experiment has no QTs, probably doesn't have data, post-processing of splits will be skipped" ); - } - - // stub the new experiments and create new names; all other information should be retained. Permissions should be the same. - int splitNumber = 0; - - for ( FactorValue splitValue : splitOn.getFactorValues() ) { - splitNumber++; - ExpressionExperiment split = ExpressionExperiment.Factory.newInstance(); - split.setShortName( sourceShortName + "." 
+ splitNumber ); - - // copy everything but samples over - split.setName( generateNameForSplit( toSplit, splitNumber, splitValue ) ); - split.setDescription( "This experiment was created by Gemma splitting another: \n" + toSplit + toSplit.getDescription() ); - - split.setCharacteristics( this.cloneCharacteristics( toSplit.getCharacteristics() ) ); - split.setMetadata( toSplit.getMetadata() ); // - split.setPrimaryPublication( toSplit.getPrimaryPublication() ); - split.getOtherRelevantPublications().addAll( toSplit.getOtherRelevantPublications() ); - split.setAccession( this.cloneAccession( toSplit.getAccession() ) ); // accession is currently unique, so have to clone - split.setOwner( toSplit.getOwner() ); - split.setSource( toSplit.getSource() ); - split.setTaxon( toSplit.getTaxon() ); - // starting with a fresh audit trail. - - Map old2cloneFV = new HashMap<>(); - split.setExperimentalDesign( this.cloneExperimentalDesign( toSplit.getExperimentalDesign(), old2cloneFV ) ); - - // add the biomaterials - Map old2cloneBA = new HashMap<>(); - List bms = new ArrayList<>(); - Collection usedFactorValues = new HashSet<>(); - for ( BioAssay ba : toSplit.getBioAssays() ) { - boolean kept = false; - BioMaterial bm = ba.getSampleUsed(); - - // identify samples we want to include - // TODO: support sub-biomaterials and use getAllFactorValues() instead, we also need to implement - // cloneBioMaterial() accordingly - for ( FactorValue fv : bm.getFactorValues() ) { - if ( fv.equals( splitValue ) ) { - assert !bms.contains( bm ); - bms.add( bm ); - BioAssay newBa = this.cloneBioAssay( ba ); - old2cloneBA.put( ba, newBa ); - kept = true; - } - } - - if ( kept ) { - // copy other factor values over - BioAssay newBa = old2cloneBA.get( ba ); - for ( FactorValue fv : bm.getFactorValues() ) { - if ( fv.equals( splitValue ) ) { - // make a BioMaterial characteristic, so we don't lose the information (might be redundant) - for ( Characteristic c : fv.getCharacteristics() ) { - 
newBa.getSampleUsed().getCharacteristics().add( this.cloneCharacteristic( c ) ); - } - continue; - } - newBa.getSampleUsed().getFactorValues().add( old2cloneFV.get( fv ) ); - usedFactorValues.add( old2cloneFV.get( fv ) ); - } - } - } - - // here we're using the original bms; we'll replace them - BioAssayDimension newBAD = makeBioAssayDimension( bms ); - // now replace the bms in the newBAD with the clones - List badBAs = newBAD.getBioAssays(); - List replaceBAs = new ArrayList<>(); - for ( BioAssay ba : badBAs ) { - BioAssay clonedBA = old2cloneBA.get( ba ); - assert clonedBA != null; - assert clonedBA.getSampleUsed().getId() == null; - replaceBAs.add( clonedBA ); - } - newBAD.getBioAssays().clear(); - newBAD.getBioAssays().addAll( replaceBAs ); - assert replaceBAs.size() == badBAs.size(); - - split.getBioAssays().clear(); - split.getBioAssays().addAll( replaceBAs ); - split.setNumberOfSamples( replaceBAs.size() ); - - // remove unused factors and factor values from the design and biomaterials - Collection toRemoveFactors = new HashSet<>(); - for ( ExperimentalFactor ef : split.getExperimentalDesign().getExperimentalFactors() ) { - Collection toRemove = new HashSet<>(); - for ( FactorValue fv : ef.getFactorValues() ) { // these are clones - if ( !usedFactorValues.contains( fv ) ) { - toRemove.add( fv ); - } - } - - if ( ef.getFactorValues().removeAll( toRemove ) ) { - log.info( toRemove.size() + " unused factor values removed for " + ef + " in split " + splitNumber + ", leaving " - + ef.getFactorValues().size() + " fvs still used" ); - } - - assert !split.getBioAssays().isEmpty(); - - // EFs that have only one level, or which aren't used at all, are removed from the biomaterials (and gathered for removal from the ED) - if ( ef.getFactorValues().size() <= 1 ) { - toRemoveFactors.add( ef ); - for ( BioAssay ba : split.getBioAssays() ) { - BioMaterial bm = ba.getSampleUsed(); - Collection fvsToClear = new HashSet<>(); - for ( FactorValue fv : bm.getFactorValues() ) { 
- if ( fv.getExperimentalFactor().equals( ef ) ) { - fvsToClear.add( fv ); - } - } - if ( bm.getFactorValues().removeAll( fvsToClear ) ) { - log.debug( "Cleared " + fvsToClear.size() + " unused factor values from " + bm ); - } - } - } - } - - // remove the unused/unneeded factors from the ED - if ( split.getExperimentalDesign().getExperimentalFactors().removeAll( toRemoveFactors ) ) { - log.info( toRemoveFactors.size() + " unused experimental factors dropped from split " + splitNumber ); - } - - log.info( "Building vectors for " + qt2mat.size() + " quantitation types ..." ); - for ( QuantitationType qt : qt2mat.keySet() ) { - - QuantitationType clonedQt = this.cloneQt( qt, split ); - - split.getQuantitationTypes().add( clonedQt ); - - // these bms are same as the ones associated with the vectors, not the clones - BulkExpressionDataMatrix expressionDataMatrix = qt2mat.get( qt ).sliceColumns( bms, newBAD ); - - Collection rawDataVectors = BulkExpressionDataMatrixUtils.toVectors( expressionDataMatrix, RawExpressionDataVector.class ); - for ( RawExpressionDataVector v : rawDataVectors ) { - v.setQuantitationType( clonedQt ); - v.setExpressionExperiment( split ); - assert v.getBioAssayDimension().equals( newBAD ); - assert v.getDesignElement() != null; - assert v.getDesignElement().getArrayDesign() != null; - assert v.getDesignElement().getArrayDesign().getId() != null; - } - log.info( split.getShortName() + ": Adding " + rawDataVectors.size() + " raw data vectors for " + clonedQt + " preferred=" - + clonedQt.getIsPreferred() ); - split.getRawExpressionDataVectors().addAll( rawDataVectors ); - } - - split = ( ExpressionExperiment ) persister.persist( split ); + public ExpressionExperimentSet split( ExpressionExperiment toSplit, ExperimentalFactor splitOn, boolean postProcess, boolean deleteOriginalExperiment ) { + SplitExperimentHelperService.ExperimentSplitResult result = splitExperimentHelperService.split( toSplit, splitOn ); - // securityService.makePublic( split ); 
// temporary - result.add( split ); - } - - enforceOtherParts( result ); - eeService.update( result ); - - for ( ExpressionExperiment split : result ) { - // postprocess - if ( foundPreferred && postProcess ) { + if ( result.isFoundPreferred() && postProcess ) { + for ( ExpressionExperiment split : result.getExperimentSet().getExperiments() ) { + // postprocess try { preprocessor.process( split ); } catch ( Exception e ) { log.error( "Failure while postprocessing (will continue): " + split + ": " + e.getMessage() ); } - } else { - log.info( "Postprocessing skipped for " + split ); } + } else { + log.info( "Postprocessing skipped for experiments in " + result.getExperimentSet() + "." ); } - /* - * Create a new "experiment set" that groups them together (not sure if we'll keep this) - */ - ExpressionExperimentSet g = ExpressionExperimentSet.Factory.newInstance(); - g.setDescription( "Parts of " + toSplit.getShortName() + " that were split on " + splitOn.getName() ); - g.setName( toSplit.getShortName() + " splits" ); - g.setTaxon( toSplit.getTaxon() ); - g.getExperiments().addAll( result ); - g = this.expressionExperimentSetService.create( g ); - - // remove useless data files - - dataFileService.deleteAllFiles( toSplit ); // Clean the source experiment? remove diff and coexpression analyses, PCA, correlation matrices, processed data vectors // delete it? - // eeService.remove(toSplit); - // OR perhaps only - securityService.makePrivate( toSplit ); - // Or mark it as troubled? - - return g; - } - - static String generateNameForSplit( ExpressionExperiment toSplit, int splitNumber, FactorValue splitValue ) { - String categoryString = StringUtils.strip( splitValue.getExperimentalFactor().getCategory() != null ? 
- splitValue.getExperimentalFactor().getCategory().getValue() : - splitValue.getExperimentalFactor().getName() ); - String factorValueString = FactorValueUtils.getSummaryString( splitValue ); - String suffix = String.format( " [%s = %s]", categoryString, factorValueString ); - return abbreviateWithSuffix( - String.format( "Split part %d of: %s", splitNumber, StringUtils.strip( toSplit.getName() ) ), suffix, - "…", ExpressionExperiment.MAX_NAME_LENGTH, true, StandardCharsets.UTF_8 ); - } - - private void enforceOtherParts( Collection result ) { - // Enforce relation to other parts of the split. - for ( ExpressionExperiment split : result ) { - for ( ExpressionExperiment split2 : result ) { - if ( split.equals( split2 ) ) continue; - split.getOtherParts().add( split2 ); - } - } - } - - private ExperimentalDesign cloneExperimentalDesign( ExperimentalDesign experimentalDesign, Map old2cloneFV ) { - ExperimentalDesign clone = ExperimentalDesign.Factory.newInstance(); - clone.setDescription( experimentalDesign.getDescription() ); - clone.setName( experimentalDesign.getName() ); - clone.setNormalizationDescription( experimentalDesign.getNormalizationDescription() ); - clone.setQualityControlDescription( experimentalDesign.getQualityControlDescription() ); - clone.setReplicateDescription( experimentalDesign.getReplicateDescription() ); - clone.setTypes( this.cloneCharacteristics( experimentalDesign.getTypes() ) ); - - clone.getExperimentalFactors() - .addAll( this.cloneExperimentalFactors( experimentalDesign.getExperimentalFactors(), clone, old2cloneFV ) ); - - return clone; - } - - private Collection cloneExperimentalFactors( Collection experimentalFactors, ExperimentalDesign ed, - Map old2cloneFV ) { - assert ed.getId() == null; - Collection result = new HashSet<>(); - for ( ExperimentalFactor ef : experimentalFactors ) { - ExperimentalFactor clone = ExperimentalFactor.Factory.newInstance(); - //noinspection deprecation - clone.setAnnotations( this.cloneCharacteristics( 
ef.getAnnotations() ) ); - if ( ef.getCategory() != null ) { - clone.setCategory( this.cloneCharacteristic( ef.getCategory() ) ); - } - clone.setName( ef.getName() ); - clone.setDescription( ef.getDescription() ); - clone.setType( ef.getType() ); - clone.getFactorValues().addAll( this.cloneFactorValues( ef.getFactorValues(), clone, old2cloneFV ) ); - clone.setExperimentalDesign( ed ); - result.add( clone ); - // assert clone.getId() == null; - } - return result; - } - - private Collection cloneFactorValues( Collection factorValues, ExperimentalFactor ef, - Map old2cloneFV ) { - assert ef.getId() == null; - Collection result = new HashSet<>(); - for ( FactorValue fv : factorValues ) { - FactorValue clone = FactorValue.Factory.newInstance( ef ); - clone.setCharacteristics( cloneStatements( fv ) ); - clone.setIsBaseline( fv.getIsBaseline() ); - //noinspection deprecation - clone.setValue( fv.getValue() ); - clone.setMeasurement( this.cloneMeasurement( fv.getMeasurement() ) ); - result.add( clone ); - assert !old2cloneFV.containsKey( fv ); - old2cloneFV.put( fv, clone ); - } - - return result; - } - - private Set cloneStatements( FactorValue fv ) { - Collection ch = fv.getCharacteristics(); - // pair of original -> clone - List result = new ArrayList<>( ch.size() ); - for ( Statement s : ch ) { - result.add( cloneStatement( s ) ); - } - return new HashSet<>( result ); - } - - private Statement cloneStatement( Statement s ) { - Statement clone = Statement.Factory.newInstance(); - clone.setName( s.getName() ); - clone.setDescription( s.getDescription() ); - clone.setOriginalValue( s.getOriginalValue() ); - clone.setSubject( s.getSubject() ); - clone.setSubjectUri( s.getSubjectUri() ); - clone.setCategory( s.getCategory() ); - clone.setCategoryUri( s.getCategoryUri() ); - clone.setEvidenceCode( s.getEvidenceCode() ); - clone.setPredicate( s.getPredicate() ); - clone.setPredicateUri( s.getPredicateUri() ); - clone.setObject( s.getObject() ); - clone.setObjectUri( 
s.getObjectUri() ); - clone.setSecondPredicate( s.getSecondPredicate() ); - clone.setSecondPredicateUri( s.getSecondPredicateUri() ); - clone.setSecondObject( s.getSecondObject() ); - clone.setSecondObjectUri( s.getSecondObjectUri() ); - return clone; - } - - private Measurement cloneMeasurement( Measurement measurement ) { - - if ( measurement == null ) return null; - Measurement clone = Measurement.Factory.newInstance(); - clone.setKindCV( measurement.getKindCV() ); - clone.setRepresentation( measurement.getRepresentation() ); - clone.setOtherKind( measurement.getOtherKind() ); - clone.setValue( measurement.getValue() ); - clone.setType( measurement.getType() ); - - return clone; - } - - private QuantitationType cloneQt( QuantitationType qt, ExpressionExperiment split ) { - QuantitationType clone = QuantitationType.Factory.newInstance(); - clone.setDescription( qt.getDescription() + " (created for split: " + split.getShortName() + ")" ); - clone.setName( qt.getName() ); - clone.setGeneralType( qt.getGeneralType() ); - clone.setIsBackground( qt.getIsBackground() ); - clone.setIsBackgroundSubtracted( qt.getIsBackgroundSubtracted() ); - clone.setIsBatchCorrected( qt.getIsBatchCorrected() ); - //noinspection deprecation - clone.setIsMaskedPreferred( qt.getIsMaskedPreferred() ); - clone.setIsNormalized( qt.getIsNormalized() ); - clone.setIsPreferred( qt.getIsPreferred() ); - clone.setIsRatio( qt.getIsRatio() ); - clone.setIsRecomputedFromRawData( qt.getIsRecomputedFromRawData() ); - clone.setRepresentation( qt.getRepresentation() ); - clone.setType( qt.getType() ); - clone.setScale( qt.getScale() ); - - return clone; - } - - private Set cloneCharacteristics( Collection ch ) { - Set result = new HashSet<>(); - for ( Characteristic c : ch ) { - Characteristic clone = cloneCharacteristic( c ); - result.add( clone ); - } - return result; - } - - private Characteristic cloneCharacteristic( Characteristic c ) { - Characteristic clone = Characteristic.Factory.newInstance(); 
- clone.setName( c.getName() ); - clone.setDescription( c.getDescription() ); - clone.setCategory( c.getCategory() ); - clone.setCategoryUri( c.getCategoryUri() ); - clone.setValue( c.getValue() ); - clone.setValueUri( c.getValueUri() ); - clone.setOriginalValue( c.getOriginalValue() ); - clone.setEvidenceCode( c.getEvidenceCode() ); - return clone; - } - - private BioAssay cloneBioAssay( BioAssay ba ) { - BioAssay clone = BioAssay.Factory.newInstance(); - - clone.setName( ba.getName() ); - clone.setArrayDesignUsed( ba.getArrayDesignUsed() ); - clone.setDescription( ba.getDescription() ); - clone.setMetadata( ba.getMetadata() ); - clone.setIsOutlier( ba.getIsOutlier() ); - clone.setOriginalPlatform( ba.getOriginalPlatform() ); - clone.setProcessingDate( ba.getProcessingDate() ); - clone.setSequencePairedReads( ba.getSequencePairedReads() ); - - clone.setSequenceReadCount( ba.getSequenceReadCount() ); - clone.setSequenceReadLength( ba.getSequenceReadLength() ); - - BioMaterial sampleClone = this.cloneBioMaterial( ba.getSampleUsed() ); - clone.setSampleUsed( sampleClone ); - sampleClone.getBioAssaysUsedIn().add( clone ); - clone.setAccession( this.cloneAccession( ba.getAccession() ) ); - - return clone; - } - - private DatabaseEntry cloneAccession( @Nullable DatabaseEntry de ) { - if ( de == null ) return null; - DatabaseEntry clone = DatabaseEntry.Factory.newInstance(); - clone.setAccession( de.getAccession() ); - clone.setAccessionVersion( de.getAccessionVersion() ); - clone.setUri( de.getUri() ); - clone.setExternalDatabase( de.getExternalDatabase() ); - return clone; - } - - private BioMaterial cloneBioMaterial( BioMaterial bm ) { - Assert.isNull( bm.getSourceBioMaterial(), "Cannot split an experiment with biomaterials that have a source biomaterial." 
); - BioMaterial clone = BioMaterial.Factory.newInstance(); - clone.setName( abbreviateWithSuffix( bm.getName(), " (Split)", "…", BioMaterial.MAX_NAME_LENGTH, true, StandardCharsets.UTF_8 ) ); // it is important we make a new name, so we don't confuse this with the previous one in findOrCreate(); - clone.setDescription( bm.getDescription() ); - clone.setCharacteristics( this.cloneCharacteristics( bm.getCharacteristics() ) ); - clone.setExternalAccession( this.cloneAccession( bm.getExternalAccession() ) ); - clone.setSourceTaxon( bm.getSourceTaxon() ); - clone.setTreatments( this.cloneTreatments( bm.getTreatments() ) ); - // Factor values are done separately - return clone; - } - - private Set cloneTreatments( Collection ts ) { - Set result = new HashSet<>(); - for ( Treatment t : ts ) { - Treatment clone = Treatment.Factory.newInstance(); - clone.setDescription( t.getDescription() ); - clone.setName( t.getName() ); - clone.setOrderApplied( t.getOrderApplied() ); - result.add( clone ); - } - - return result; - } - - private BioAssayDimension makeBioAssayDimension( List samplesToUse ) { - - List bioAssays = new ArrayList<>(); - for ( BioMaterial bm : samplesToUse ) { - BioAssay ba = bm.getBioAssaysUsedIn().iterator().next(); - bioAssays.add( ba ); + if ( deleteOriginalExperiment ) { + eeService.delete( toSplit ); + } else { + // OR perhaps only + // Or mark it as troubled? 
+ securityService.makePrivate( toSplit ); } - BioAssayDimension result = BioAssayDimension.Factory.newInstance( bioAssays ); - - assert result.getBioAssays().size() == samplesToUse.size(); - return result; + return result.getExperimentSet(); } } diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/slice/BulkDataSlicerUtils.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/slice/BulkDataSlicerUtils.java index ce8ee9a6ce..bf57c74838 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/slice/BulkDataSlicerUtils.java +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/preprocess/slice/BulkDataSlicerUtils.java @@ -33,7 +33,18 @@ public class BulkDataSlicerUtils { * @param vectorType the type of vector produced */ public static Collection slice( Collection vectors, List assays, Class vectorType, boolean allowMissing ) { - return vectors.stream().map( createSlicer( assays, vectorType, allowMissing ) ).collect( Collectors.toList() ); + return slice( vectors, assays, vectorType, allowMissing, new HashMap<>() ); + } + + /** + * Slice a collection of bulk data vectors, reusing the provided dimension cache. 
+ * + * @param vectorType the type of vector produced + */ + public static Collection slice( Collection vectors, List assays, Class vectorType, boolean allowMissing, Map dimensionCache ) { + Map bioAssayMappingCache = new HashMap<>(); + Map missingValueCache = new HashMap<>(); + return vectors.stream().map( bulkDataVector -> slice( bulkDataVector, assays, dimensionCache, bioAssayMappingCache, vectorType, getDataVectorIgnoredProperties( vectorType ), allowMissing, missingValueCache ) ).collect( Collectors.toList() ); } /** diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/service/ExpressionExperimentDeleterService.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/service/ExpressionExperimentDeleterService.java new file mode 100644 index 0000000000..7f65a6a90b --- /dev/null +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/service/ExpressionExperimentDeleterService.java @@ -0,0 +1,18 @@ +package ubic.gemma.core.analysis.service; + +import org.springframework.security.access.annotation.Secured; +import ubic.gemma.model.expression.experiment.ExpressionExperiment; + +/** + * High-level service for deleting an {@link ExpressionExperiment} and all associated data files. + * + * @author poirigui + */ +public interface ExpressionExperimentDeleterService { + + /** + * Delete an experiment and all associated data files. 
+ */ + @Secured({ "GROUP_USER", "ACL_SECURABLE_EDIT" }) + void delete( ExpressionExperiment ee ); +} diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/service/ExpressionExperimentDeleterServiceImpl.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/service/ExpressionExperimentDeleterServiceImpl.java new file mode 100644 index 0000000000..841a20bfdc --- /dev/null +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/service/ExpressionExperimentDeleterServiceImpl.java @@ -0,0 +1,25 @@ +package ubic.gemma.core.analysis.service; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; +import ubic.gemma.model.expression.experiment.ExpressionExperiment; +import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService; + +@Service +@Transactional(propagation = Propagation.NEVER) +public class ExpressionExperimentDeleterServiceImpl implements ExpressionExperimentDeleterService { + + @Autowired + private ExpressionExperimentService expressionExperimentService; + + @Autowired + private ExpressionDataFileService expressionDataFileService; + + @Override + public void delete( ExpressionExperiment ee ) { + expressionExperimentService.remove( ee ); + expressionDataFileService.deleteAllFiles( ee ); + } +} diff --git a/gemma-core/src/main/java/ubic/gemma/core/analysis/singleCell/SingleCellSlicerUtils.java b/gemma-core/src/main/java/ubic/gemma/core/analysis/singleCell/SingleCellSlicerUtils.java index f4286ac20e..a91fa5714f 100644 --- a/gemma-core/src/main/java/ubic/gemma/core/analysis/singleCell/SingleCellSlicerUtils.java +++ b/gemma-core/src/main/java/ubic/gemma/core/analysis/singleCell/SingleCellSlicerUtils.java @@ -19,6 +19,7 @@ /** * Utilities for slicing single-cell data. 
+ * * @author poirigui * @see ubic.gemma.core.analysis.preprocess.slice.BulkDataSlicerUtils */ @@ -41,8 +42,12 @@ public class SingleCellSlicerUtils { } public static Collection slice( Collection vectors, List bioAssays ) { + return slice( vectors, bioAssays, new HashMap<>() ); + } + + public static Collection slice( Collection vectors, List bioAssays, Map singleCellDimensionCache ) { return vectors.stream() - .map( createSlicer( bioAssays ) ) + .map( createSlicer( bioAssays, singleCellDimensionCache ) ) .collect( Collectors.toList() ); } @@ -50,20 +55,30 @@ public static Collection slice( Collection createSlicer( List assays ) { - return createSlicer( assays, null, null, null ); + return createSlicer( assays, null, null, null, new HashMap<>() ); + } + + public static Function createSlicer( List assays, Map singleCellDimensionCache ) { + return createSlicer( assays, null, null, null, singleCellDimensionCache ); } /** * Create a slicer for single-cell data vectors whose cell IDs, CTAs, and CLCs are already pre-sliced. *

* Unlike sparse vectors, these structures can be sliced in the database. + * * @param cellIds pre-sliced cell IDs * @param ctas pre-sliced CTAs * @param clcs pre-sliced CLCs */ public static Function createSlicer( List assays, @Nullable List cellIds, @Nullable Set ctas, @Nullable Set clcs ) { - Map scdCache = new HashMap<>(); + return createSlicer( assays, cellIds, ctas, clcs, new HashMap<>() ); + } + + public static Function createSlicer( List assays, + @Nullable List cellIds, @Nullable Set ctas, @Nullable Set clcs, + Map scdCache ) { Map sampleIndicesCache = new HashMap<>(); return vec -> sliceVector( vec, assays, cellIds, ctas, clcs, scdCache, sampleIndicesCache ); } diff --git a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentNameGeneratorTest.java b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentNameGeneratorTest.java index 18dd29cf7f..c8ae465632 100644 --- a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentNameGeneratorTest.java +++ b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentNameGeneratorTest.java @@ -10,7 +10,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertEquals; -import static ubic.gemma.core.analysis.preprocess.SplitExperimentServiceImpl.generateNameForSplit; +import static ubic.gemma.core.analysis.preprocess.SplitExperimentHelperService.generateNameForSplit; public class SplitExperimentNameGeneratorTest { diff --git a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentServiceTest.java b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentServiceTest.java new file mode 100644 index 0000000000..7dda02270e --- /dev/null +++ b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentServiceTest.java @@ -0,0 +1,187 @@ +package ubic.gemma.core.analysis.preprocess; + +import gemma.gsec.SecurityService; +import 
org.junit.Before; +import org.junit.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.test.context.ContextConfiguration; +import ubic.gemma.core.analysis.service.ExpressionDataFileService; +import ubic.gemma.core.context.TestComponent; +import ubic.gemma.core.util.test.BaseTest; +import ubic.gemma.model.analysis.expression.ExpressionExperimentSet; +import ubic.gemma.model.common.Identifiable; +import ubic.gemma.model.common.quantitationtype.*; +import ubic.gemma.model.expression.arrayDesign.ArrayDesign; +import ubic.gemma.model.expression.bioAssayData.DataVector; +import ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector; +import ubic.gemma.model.expression.bioAssayData.SingleCellExpressionDataVector; +import ubic.gemma.model.expression.designElement.CompositeSequence; +import ubic.gemma.model.expression.experiment.ExperimentalDesign; +import ubic.gemma.model.expression.experiment.ExperimentalFactor; +import ubic.gemma.model.expression.experiment.ExpressionExperiment; +import ubic.gemma.model.genome.Taxon; +import ubic.gemma.persistence.persister.Persister; +import ubic.gemma.persistence.service.expression.bioAssayData.RandomBulkDataUtils; +import ubic.gemma.persistence.service.expression.bioAssayData.RandomSingleCellDataUtils; +import ubic.gemma.persistence.service.expression.bioAssayData.RawExpressionDataVectorService; +import ubic.gemma.persistence.service.expression.experiment.*; + +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +@ContextConfiguration +public class SplitExperimentServiceTest extends BaseTest { + + @Autowired + private ExpressionExperimentSetService expressionExperimentSetService; + @Autowired + private 
ExpressionExperimentService expressionExperimentService; + @Autowired + private SingleCellExpressionExperimentService singleCellExpressionExperimentService; + + @Configuration + @TestComponent + static class CC { + + @Bean + public SplitExperimentService splitExperimentService() { + return new SplitExperimentServiceImpl(); + } + + @Bean + public SplitExperimentHelperService splitExperimentHelperService() { + return new SplitExperimentHelperService(); + } + + @Bean + public PreprocessorService preprocessor() { + return mock(); + } + + @Bean + public ExpressionExperimentService eeService() { + return mock(); + } + + @Bean + public RawExpressionDataVectorService rawExpressionDataVectorService() { + return mock(); + } + + @Bean + public Persister persister() { + return mock(); + } + + @Bean + public SecurityService securityService() { + return mock(); + } + + @Bean + public ExpressionDataFileService dataFileService() { + return mock(); + } + + @Bean + public ExpressionExperimentSetService expressionExperimentSetService() { + return mock(); + } + + @Bean + public SingleCellExpressionExperimentService singleCellExpressionExperimentService() { + return mock(); + } + } + + @Autowired + private SplitExperimentService splitExperimentService; + + @Autowired + private Persister persister; + + @Before + public void setUp() { + when( persister.persist( any( Identifiable.class ) ) ).thenAnswer( invocation -> invocation.getArgument( 0 ) ); + when( expressionExperimentSetService.create( any( ExpressionExperimentSet.class ) ) ).thenAnswer( invocation -> invocation.getArgument( 0 ) ); + } + + @Test + public void test() { + Taxon taxon = new Taxon(); + ArrayDesign ad = new ArrayDesign(); + for ( int i = 0; i < 100; i++ ) { + CompositeSequence cs = CompositeSequence.Factory.newInstance( "cs" + i, ad ); + ad.getCompositeSequences().add( cs ); + } + ExpressionExperiment ee = RandomExpressionExperimentUtils.randomExpressionExperiment( taxon, 16, ad ); + ee.setExperimentalDesign( new 
ExperimentalDesign() ); + ExperimentalFactor factor = RandomExperimentalDesignUtils.randomCategoricalFactor( ee, "test", 4 ); + + // setup some vectors + QuantitationType scQt = QuantitationType.Factory.newInstance(); + scQt.setName( "counts" ); + scQt.setGeneralType( GeneralType.QUANTITATIVE ); + scQt.setType( StandardQuantitationType.COUNT ); + scQt.setScale( ScaleType.COUNT ); + scQt.setRepresentation( PrimitiveType.DOUBLE ); + List scVectors = RandomSingleCellDataUtils.randomSingleCellVectors( ee, ad, scQt ); + ee.getQuantitationTypes().add( scQt ); + ee.getSingleCellExpressionDataVectors().addAll( scVectors ); + when( singleCellExpressionExperimentService.getSingleCellDataVectors( ee, scQt ) ) + .thenReturn( scVectors ); + + QuantitationType rawQt = QuantitationType.Factory.newInstance(); + rawQt.setName( "log2cpm" ); + rawQt.setGeneralType( GeneralType.QUANTITATIVE ); + rawQt.setType( StandardQuantitationType.AMOUNT ); + rawQt.setScale( ScaleType.LOG2 ); + rawQt.setRepresentation( PrimitiveType.DOUBLE ); + Collection rawVectors = RandomBulkDataUtils.randomBulkVectors( ee, ad, rawQt, RawExpressionDataVector.class ); + ee.getQuantitationTypes().add( rawQt ); + ee.getRawExpressionDataVectors().addAll( rawVectors ); + + Map, Set> qtsByVt = new HashMap<>(); + qtsByVt.put( SingleCellExpressionDataVector.class, Collections.singleton( scQt ) ); + qtsByVt.put( RawExpressionDataVector.class, Collections.singleton( rawQt ) ); + when( expressionExperimentService.getQuantitationTypesByVectorType( ee ) ) + .thenReturn( qtsByVt ); + + ExpressionExperimentSet parts = splitExperimentService.split( ee, factor, false, false ); + + assertThat( parts.getExperiments() ) + .hasSize( 4 ) + .allSatisfy( split -> { + assertThat( split.getShortName() ).startsWith( ee.getShortName() + "." 
); + assertThat( split.getBioAssays() ) + .isSubsetOf( ee.getBioAssays() ) + // make sure that fresh copies are being used + .usingElementComparator( Comparator.comparingInt( System::identityHashCode ) ) + .doesNotContainAnyElementsOf( ee.getBioAssays() ); + assertThat( split.getSingleCellExpressionDataVectors() ) + .hasSize( 100 ) + .first() + .satisfies( vec -> { + assertThat( vec.getDesignElement() ) + .isIn( ad.getCompositeSequences() ); + assertThat( vec.getQuantitationType() ) + .isEqualTo( scQt ) + .isNotSameAs( scQt ); + assertThat( vec.getSingleCellDimension().getBioAssays() ) + .hasSizeLessThan( 16 ) + .isSubsetOf( ee.getBioAssays() ) + // make sure that fresh copies are being used + .usingElementComparator( Comparator.comparingInt( System::identityHashCode ) ) + .doesNotContainAnyElementsOf( ee.getBioAssays() ) + // make sure it contains the same entities that the split experiment has + .containsExactlyInAnyOrderElementsOf( split.getBioAssays() ); + } ); + } ); + } +} \ No newline at end of file diff --git a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentTest.java b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentTest.java index b114f9b5e8..31e785103d 100644 --- a/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentTest.java +++ b/gemma-core/src/test/java/ubic/gemma/core/analysis/preprocess/SplitExperimentTest.java @@ -127,7 +127,7 @@ public void testSplitGSE17183ByOrganismPart() throws Exception { assertNotNull( splitOn ); - results = splitService.split( ee, splitOn, true ); + results = splitService.split( ee, splitOn, true, false ); assertEquals( splitOn.getFactorValues().size(), results.getExperiments().size() ); @@ -208,7 +208,7 @@ public void testSplitGSE123753ByCollectionOfMaterial() throws Exception { assertNotNull( splitOn ); - results = splitService.split( ee, splitOn, false ); + results = splitService.split( ee, splitOn, false, false ); assertEquals( 
splitOn.getFactorValues().size(), results.getExperiments().size() ); } diff --git a/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomBulkDataUtils.java b/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomBulkDataUtils.java index 96ddd1164c..ff83d68486 100644 --- a/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomBulkDataUtils.java +++ b/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomBulkDataUtils.java @@ -17,6 +17,7 @@ /** * Utilities for generating random bulk vectors. + * * @see RandomExpressionDataMatrixUtils * @see RandomSingleCellDataUtils */ diff --git a/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomSingleCellDataUtils.java b/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomSingleCellDataUtils.java index ab7247ff67..4916e9cb62 100644 --- a/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomSingleCellDataUtils.java +++ b/gemma-core/src/test/java/ubic/gemma/persistence/service/expression/bioAssayData/RandomSingleCellDataUtils.java @@ -10,7 +10,6 @@ import ubic.gemma.model.expression.bioAssayData.CellTypeAssignment; import ubic.gemma.model.expression.bioAssayData.SingleCellDimension; import ubic.gemma.model.expression.bioAssayData.SingleCellExpressionDataVector; -import ubic.gemma.model.expression.biomaterial.BioMaterial; import ubic.gemma.model.expression.designElement.CompositeSequence; import ubic.gemma.model.expression.experiment.ExpressionExperiment; import ubic.gemma.model.genome.Taxon; @@ -80,6 +79,8 @@ public static List randomSingleCellVectors( int /** * Generate random single-cell vectors with 1000 cells/sample and 90% sparsity. + *

+ * The quantitation type and vectors will be added to the expression experiment. * * @see #randomSingleCellVectors(ExpressionExperiment, ArrayDesign, QuantitationType, int, double) */ @@ -117,6 +118,8 @@ public static SingleCellExpressionDataVector randomSingleCellVector( ExpressionE /** * Generate a single random single-cell vector. + *

+ * The quantitation type and vector will be added to the expression experiment. */ public static SingleCellExpressionDataVector randomSingleCellVector( ExpressionExperiment ee, CompositeSequence compositeSequence, QuantitationType qt, SingleCellDimension dimension, double sparsity ) { Assert.isTrue( qt.getGeneralType() == GeneralType.QUANTITATIVE, @@ -150,6 +153,8 @@ public static SingleCellExpressionDataVector randomSingleCellVector( ExpressionE default: throw new UnsupportedOperationException( "Sampling " + qt.getRepresentation() + " is not supported." ); } + ee.getQuantitationTypes().add( qt ); + ee.getSingleCellExpressionDataVectors().add( vector ); return vector; }