From 47da0458bed79ad389a2efbbd19668ebb0c3f8a8 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Mon, 23 Feb 2026 07:32:13 +0100 Subject: [PATCH 1/2] removed space-time interpolation generator code --- poseidon-analysis-hs.cabal | 2 +- src/Poseidon/Generator/CLI/SpaceTime.hs | 157 ------------------------ src/Poseidon/Generator/Parsers.hs | 54 -------- src/Poseidon/Generator/Types.hs | 13 -- 4 files changed, 1 insertion(+), 225 deletions(-) delete mode 100644 src/Poseidon/Generator/CLI/SpaceTime.hs diff --git a/poseidon-analysis-hs.cabal b/poseidon-analysis-hs.cabal index ecf6e63..6bf06ca 100644 --- a/poseidon-analysis-hs.cabal +++ b/poseidon-analysis-hs.cabal @@ -15,7 +15,7 @@ extra-source-files: README.md, library exposed-modules: Poseidon.Analysis.FStatsConfig, Poseidon.Analysis.RASconfig, Poseidon.Analysis.Utils, Poseidon.Analysis.CLI.FStats, Poseidon.Analysis.CLI.RAS, - Poseidon.Generator.CLI.AdmixPops, Poseidon.Generator.CLI.SpaceTime, + Poseidon.Generator.CLI.AdmixPops, Poseidon.Generator.Parsers, Poseidon.Generator.Types, Poseidon.Generator.SampleGeno, Poseidon.Generator.Utils hs-source-dirs: src diff --git a/src/Poseidon/Generator/CLI/SpaceTime.hs b/src/Poseidon/Generator/CLI/SpaceTime.hs deleted file mode 100644 index 383ec51..0000000 --- a/src/Poseidon/Generator/CLI/SpaceTime.hs +++ /dev/null @@ -1,157 +0,0 @@ -module Poseidon.Generator.CLI.SpaceTime where - --- import Poseidon.Generator.Parsers --- import Poseidon.Generator.SampleGeno --- import Poseidon.Generator.Types --- import Poseidon.Generator.Utils - --- import Control.Monad (forM) --- import Data.List (nub, sortBy, intersect) --- import Data.Maybe (isJust, fromMaybe, catMaybes) --- import Pipes --- import qualified Pipes.Prelude as P --- import Pipes.Safe (runSafeT) --- import Poseidon.GenotypeData --- import Poseidon.Janno --- import Poseidon.Package --- import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), writeEigenstrat) --- import SequenceFormats.Plink (writePlink) --- import System.Directory (createDirectoryIfMissing) --- import System.FilePath (()) --- import System.IO (hPutStrLn, stderr) - --- data SpaceTimeOptions = SpaceTimeOptions { --- _spatBaseDirs :: [FilePath] --- , _spatIndWithPosition :: [IndWithPosition] --- , _spatIndWithPositionFile :: Maybe FilePath --- , _spatNumberOfNearestNeighbors :: Int --- , _spatTemporalDistanceScaling :: Double --- , _spatOutFormat :: GenotypeFormatSpec --- , _spatOutPath :: FilePath --- } - --- pacReadOpts :: PackageReadOptions --- pacReadOpts = defaultPackageReadOptions { --- _readOptVerbose = True --- , _readOptStopOnDuplicates = True --- , _readOptIgnoreChecksums = False --- , _readOptIgnoreGeno = False --- , _readOptGenoCheck = True --- } - --- runSpaceTime :: SpaceTimeOptions -> IO () --- runSpaceTime (SpaceTimeOptions baseDirs poisDirect poisFile numNeighbors temporalDistanceScaling outFormat outDir) = do --- -- compile pois --- poisFromFile <- case poisFile of --- Nothing -> return [] --- Just f -> readIndWithPositionFromFile f --- let pois = poisDirect ++ poisFromFile --- -- load Poseidon packages --- allPackages <- readPoseidonPackageCollection pacReadOpts baseDirs --- -- load janno tables --- let jannos = concatMap posPacJanno allPackages --- -- transform to spatiotemporal positions --- stInds = jannoToSpaceTimePos jannos --- -- calculate distances --- distancesToPois = map (\x -> distanceOneToAll temporalDistanceScaling x stInds) pois --- -- get X closest inds --- closest = map (getClosestInds numNeighbors) distancesToPois --- closestIndividuals = map (map fst) closest --- closestDistances = map (map snd) closest --- closestWeights = map distToWeight closestDistances --- -- putStrLn $ "Closest individuals: " ++ intercalate ", " closestIndividuals --- -- determine relevant packages --- relevantPackages <- filterPackagesByInds (nub $ concat closestIndividuals) allPackages --- closestIndices <- mapM (`extractIndIndices` relevantPackages) closestIndividuals --- -- merge poi indices and weights --- let infoForIndividualPOIs = zip closestIndices closestWeights --- -- compile genotype data structure --- let [outInd, outSnp, outGeno] = case outFormat of --- GenotypeFormatEigenstrat -> ["spacetime_package.ind", "spacetime_package.snp", "spacetime_package.geno"] --- GenotypeFormatPlink -> ["spacetime_package.fam", "spacetime_package.bim", "spacetime_package.bed"] --- -- create output poseidon package --- hPutStrLn stderr "Creating output Poseidon package" --- createDirectoryIfMissing True outDir --- let genotypeData = GenotypeDataSpec outFormat outGeno Nothing outSnp Nothing outInd Nothing Nothing --- pac = newMinimalPackageTemplate outDir "spacetime_package" genotypeData --- writePoseidonPackage pac --- -- compile genotype data --- hPutStrLn stderr "Compiling chimeras" --- runSafeT $ do --- (_, eigenstratProd) <- getJointGenotypeData False False relevantPackages --- let [outG, outS, outI] = map (outDir ) [outGeno, outSnp, outInd] --- newIndEntries = map (\x -> EigenstratIndEntry (spatInd x) Unknown (spatUnit x)) pois --- let outConsumer = case outFormat of --- GenotypeFormatEigenstrat -> writeEigenstrat outG outS outI newIndEntries --- GenotypeFormatPlink -> writePlink outG outS outI newIndEntries --- runEffect $ eigenstratProd >-> printSNPCopyProgress >-> P.mapM (sampleGenoForMultiplePOIs infoForIndividualPOIs) >-> outConsumer --- liftIO $ hPutStrLn stderr "Done" - --- jannoToSpaceTimePos :: [JannoRow] -> [IndWithPosition] --- jannoToSpaceTimePos jannos = --- let filterPos x = (isJust (jDateBCADMedian x) || jDateType x == Just Modern) && isJust (jLatitude x) && isJust (jLongitude x) --- transformTo x = IndWithPosition (jPoseidonID x) (head $ getJannoList $ jGroupName x) $ --- SpatialTemporalPosition --- (fromMaybe 2000 (jDateBCADMedian x)) -- If empty then modern (after filter), which is approx. 2000AD --- (fromMaybe (Latitude 0) (jLatitude x)) --- (fromMaybe (Longitude 0) (jLongitude x)) --- in map transformTo $ filter filterPos jannos - --- distanceOneToAll :: Double -> IndWithPosition -> [IndWithPosition] -> [(String, String, Double)] --- distanceOneToAll temporalDistanceScaling poi = --- map (distanceOneToOne poi) --- where distanceOneToOne i1 i2 = (spatInd i1, spatInd i2, spatioTemporalDistance temporalDistanceScaling (spatPos i1) (spatPos i2)) - --- spatioTemporalDistance :: Double -> SpatialTemporalPosition -> SpatialTemporalPosition -> Double --- spatioTemporalDistance --- scaling --- (SpatialTemporalPosition t1 (Latitude lat1) (Longitude lon1)) --- (SpatialTemporalPosition t2 (Latitude lat2) (Longitude lon2)) = --- let tDist = fromIntegral (abs (t1 - t2)) * scaling --- sDist = haversineDist (lat1, lon1) (lat2, lon2) --- in sqrt $ tDist^(2 :: Integer) + sDist^(2 :: Integer) - --- haversineDist :: (Double, Double) -> (Double, Double) -> Double --- haversineDist (lat1, lon1) (lat2, lon2) = --- let r = 6371000 -- radius of Earth in meters --- toRadians n = n * pi / 180 --- square x = x * x --- cosr = cos . toRadians --- dlat = toRadians (lat1 - lat2) / 2 --- dlon = toRadians (lon1 - lon2) / 2 --- a = square (sin dlat) + cosr lat1 * cosr lat2 * square (sin dlon) --- c = 2 * atan2 (sqrt a) (sqrt (1 - a)) --- in (r * c) / 1000 - --- getClosestInds :: Int -> [(String, String, Double)] -> [(String, Double)] --- getClosestInds n dists = map (\(_,x,y) -> (x,y)) $ take n $ sortBy (\(_,_,x) (_,_,y) -> compare x y) dists - --- distToWeight :: [Double] -> [Rational] --- distToWeight distances = --- let closeness = map (1/) distances --- in rescale 0 100 closeness - --- rescale :: Double -> Double -> [Double] -> [Rational] --- rescale minNew maxNew xs = --- let minOld = minimum xs --- maxOld = maximum xs --- a = (maxNew - minNew) / (maxOld - minOld) --- bs = map (\x -> x - maxOld) xs --- res = map (\x -> a * x + maxNew) bs --- in map toRational res - --- filterPackagesByInds :: [String] -> [PoseidonPackage] -> IO [PoseidonPackage] --- filterPackagesByInds indNamesStats packages = do --- fmap catMaybes . forM packages $ \pac -> do --- inds <- getIndividuals pac --- let indNamesPac = [ind | EigenstratIndEntry ind _ _ <- inds] --- if length (intersect indNamesPac indNamesStats) > 0 --- then return (Just pac) --- else return Nothing - --- extractIndIndices :: [String] -> [PoseidonPackage] -> IO [Int] --- extractIndIndices indNames relevantPackages = do --- let allPackageNames = map posPacTitle relevantPackages --- allIndEntries <- mapM getIndividuals relevantPackages --- let filterFunc (_,_,EigenstratIndEntry ind _ _) = ind `elem` indNames --- return $ map extractFirst $ filter filterFunc (zipGroup allPackageNames allIndEntries) diff --git a/src/Poseidon/Generator/Parsers.hs b/src/Poseidon/Generator/Parsers.hs index 570c07c..63235de 100644 --- a/src/Poseidon/Generator/Parsers.hs +++ b/src/Poseidon/Generator/Parsers.hs @@ -3,14 +3,10 @@ module Poseidon.Generator.Parsers where import Poseidon.Generator.Types import Poseidon.Generator.Utils -import Poseidon.ColumnTypesJanno - import Control.Exception (throwIO) -import Control.Monad (guard) import Data.List (intercalate) import Data.Ratio ((%)) import qualified Text.Parsec as P -import qualified Text.Parsec.Number as P import qualified Text.Parsec.String as P renderRequestedInds :: [RequestedInd] -> String @@ -55,53 +51,3 @@ parsePopulationWithFraction = do _ <- P.oneOf "=" percP <- read <$> P.many1 P.digit return (PopFrac popP (percP % 100)) - -readIndWithPositionString :: String -> Either String [IndWithPosition] -readIndWithPositionString s = case P.runParser indWithPositionParser () "" s of - Left p -> Left (show p) - Right x -> Right x - -readIndWithPositionFromFile :: FilePath -> IO [IndWithPosition] -readIndWithPositionFromFile positionFile = do - let multiPositionParser = indWithPositionParser `P.sepBy1` (P.newline *> P.spaces) - eitherParseResult <- P.parseFromFile (P.spaces *> multiPositionParser <* P.spaces) positionFile - case eitherParseResult of - Left err -> throwIO $ PoseidonGeneratorCLIParsingException (show err) - Right r -> return (concat r) - -indWithPositionParser :: P.Parser [IndWithPosition] -indWithPositionParser = P.try (P.sepBy parseIndWithPosition (P.char ';' <* P.spaces)) - -parseIndWithPosition :: P.Parser IndWithPosition -parseIndWithPosition = do - _ <- P.oneOf "[" - ind <- P.manyTill P.anyChar (P.string ":") - unit <- P.manyTill P.anyChar (P.string "]") - _ <- P.oneOf "(" - spatpos <- parseSpatialTemporalPosition - _ <- P.oneOf ")" - return (IndWithPosition ind unit spatpos) - -parseSpatialTemporalPosition :: P.Parser SpatialTemporalPosition -parseSpatialTemporalPosition = do - timeP <- pInt - _ <- P.oneOf "," - latP <- pLat - _ <- P.oneOf "," - lonP <- pLon - return (SpatialTemporalPosition timeP latP lonP) - -pInt :: P.Parser Int -pInt = read <$> P.many1 P.digit - -pLat :: P.Parser JannoLatitude -pLat = do - latP <- P.sign <*> P.floating2 True - guard (latP >= -90 && latP <= 90) P. "valid latitude (-90 to 90)" - return (JannoLatitude latP) - -pLon :: P.Parser JannoLongitude -pLon = do - lonP <- P.sign <*> P.floating2 True - guard (lonP >= -180 && lonP <= 180) P. "valid longitude (-180 to 180)" - return (JannoLongitude lonP) diff --git a/src/Poseidon/Generator/Types.hs b/src/Poseidon/Generator/Types.hs index a004130..9450571 100644 --- a/src/Poseidon/Generator/Types.hs +++ b/src/Poseidon/Generator/Types.hs @@ -1,7 +1,6 @@ module Poseidon.Generator.Types where import Data.List (intercalate) -import Poseidon.ColumnTypesJanno data IndConcrete = IndConcrete { _indName :: String @@ -34,15 +33,3 @@ data PopFrac = PopFrac { instance Show PopFrac where show (PopFrac _pop _frac) = _pop ++ "=" ++ show _frac - -data IndWithPosition = IndWithPosition { - spatInd :: String - , spatUnit :: String - , spatPos :: SpatialTemporalPosition -} deriving (Show) - -data SpatialTemporalPosition = SpatialTemporalPosition { - time :: Int - , lat :: JannoLatitude - , lon :: JannoLongitude -} deriving (Show) From a4631898ecb6a19a2be363b58a70d8a17e3080f8 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Mon, 23 Feb 2026 09:11:31 +0100 Subject: [PATCH 2/2] enabled vcf output for admixpops, writing diploid output --- src/Poseidon/Generator/CLI/AdmixPops.hs | 52 +++++++++++++++++-------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/src/Poseidon/Generator/CLI/AdmixPops.hs b/src/Poseidon/Generator/CLI/AdmixPops.hs index 3f547cc..7a86c4f 100644 --- a/src/Poseidon/Generator/CLI/AdmixPops.hs +++ b/src/Poseidon/Generator/CLI/AdmixPops.hs @@ -1,9 +1,17 @@ module Poseidon.Generator.CLI.AdmixPops where +import Poseidon.ColumnTypesJanno (JannoGenotypePloidy (..)) +import Poseidon.EntityTypes (HasNameAndVersion (..)) import Poseidon.Generator.Parsers import Poseidon.Generator.SampleGeno import Poseidon.Generator.Types import Poseidon.Generator.Utils +import Poseidon.GenotypeData +import Poseidon.Janno (JannoRows (..), + createMinimalJanno, + jGenotypePloidy) +import Poseidon.Package +import Poseidon.Utils import Control.Exception (catch, throwIO) import Control.Monad (forM, unless, when) @@ -18,10 +26,6 @@ import Pipes import qualified Pipes.Group as PG import qualified Pipes.Prelude as P import Pipes.Safe (runSafeT) -import Poseidon.EntityTypes (HasNameAndVersion (..)) -import Poseidon.GenotypeData -import Poseidon.Package -import Poseidon.Utils import SequenceFormats.Eigenstrat import SequenceFormats.Plink (eigenstratInd2PlinkFam, writePlink) @@ -100,14 +104,18 @@ runAdmixPops ( let gz = if outZip then "gz" else "" genotypeFileData <- case outFormat of GenotypeOutFormatEigenstrat -> - return $ GenotypeEigenstrat (outName <.> "geno" <.> gz) Nothing - (outName <.> "snp" <.> gz) Nothing - (outName <.> "ind") Nothing - GenotypeOutFormatPlink -> - return $ GenotypePlink (outName <.> "bed" <.> gz) Nothing - (outName <.> "bim" <.> gz) Nothing - (outName <.> "fam") Nothing - GenotypeOutFormatVCF -> throwM $ PoseidonGeneratorCLIParsingException "VCF output format not supported yet for admixing populations" + return $ GenotypeEigenstrat + (outName <.> "geno" <.> gz) Nothing + (outName <.> "snp" <.> gz) Nothing + (outName <.> "ind") Nothing + GenotypeOutFormatPlink -> + return $ GenotypePlink + (outName <.> "bed" <.> gz) Nothing + (outName <.> "bim" <.> gz) Nothing + (outName <.> "fam") Nothing + GenotypeOutFormatVCF -> + return $ GenotypeVCF + (outName <.> "vcf" <.> gz) Nothing let genotypeData = GenotypeDataSpec genotypeFileData Nothing -- we set no snpSet pac <- newMinimalPackageTemplate outPath outName genotypeData liftIO $ writePoseidonPackage pac @@ -122,10 +130,22 @@ runAdmixPops ( let newIndEntries = map (\x -> EigenstratIndEntry (B.pack $ _indName x) Unknown (B.pack $ _groupName x)) preparedInds outConsumer <- case genotypeFileData of GenotypeEigenstrat outG _ outS _ outI _ -> - return $ writeEigenstrat (outPath outG) (outPath outS) (outPath outI) newIndEntries - GenotypePlink outG _ outS _ outI _ -> - return $ writePlink (outPath outG) (outPath outS) (outPath outI) (map (eigenstratInd2PlinkFam outPlinkPopMode) newIndEntries) - GenotypeVCF _ _ -> throwM $ PoseidonGeneratorCLIParsingException "VCF output format not supported yet for admixing populations" + return $ writeEigenstrat + (outPath outG) + (outPath outS) + (outPath outI) + newIndEntries + GenotypePlink outG _ outS _ outI _ -> + return $ writePlink + (outPath outG) + (outPath outS) + (outPath outI) + (map (eigenstratInd2PlinkFam outPlinkPopMode) newIndEntries) + GenotypeVCF outG _ -> do + let (JannoRows xs) = createMinimalJanno newIndEntries + -- writeVCF needs to know if diploid or pseudo-haploid genotypes + madeUpPloidyJannoRows = map (\x -> x {jGenotypePloidy = Just Diploid}) xs + return $ writeVCF logA madeUpPloidyJannoRows (outPath outG) case methodSetting of PerSNP marginalizeMissing -> do runEffect $ eigenstratProd >->