From 901a3ea4ba578b106152628f743ad443ae8b3dd1 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 16 Dec 2022 18:14:03 +0100 Subject: [PATCH 01/26] made sure that _readOptStopOnDuplicates = True is only set where it really should be --- src/Poseidon/CLI/Genoconvert.hs | 2 +- src/Poseidon/CLI/Update.hs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Poseidon/CLI/Genoconvert.hs b/src/Poseidon/CLI/Genoconvert.hs index 101805574..84bb7fb27 100644 --- a/src/Poseidon/CLI/Genoconvert.hs +++ b/src/Poseidon/CLI/Genoconvert.hs @@ -41,7 +41,7 @@ data GenoconvertOptions = GenoconvertOptions pacReadOpts :: PackageReadOptions pacReadOpts = defaultPackageReadOptions { - _readOptStopOnDuplicates = True + _readOptStopOnDuplicates = False , _readOptIgnoreChecksums = True , _readOptIgnoreGeno = False , _readOptGenoCheck = True diff --git a/src/Poseidon/CLI/Update.hs b/src/Poseidon/CLI/Update.hs index cef389b82..5e692cb46 100644 --- a/src/Poseidon/CLI/Update.hs +++ b/src/Poseidon/CLI/Update.hs @@ -38,7 +38,7 @@ data UpdateOptions = UpdateOptions pacReadOpts :: PackageReadOptions pacReadOpts = defaultPackageReadOptions { - _readOptStopOnDuplicates = True + _readOptStopOnDuplicates = False , _readOptIgnoreChecksums = True , _readOptIgnoreGeno = True , _readOptGenoCheck = False From c0263f7bfbf3d0412b06e5ac31d49a987824abe9 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 16 Dec 2022 19:44:58 +0100 Subject: [PATCH 02/26] first rough draft of a logic to recover from duplicates in forge according to the discussion in #179 --- src/Poseidon/CLI/Forge.hs | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 938f736a4..25194101f 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -33,11 +33,12 @@ import Poseidon.Utils (PoseidonException (..), PoseidonLogIO, determinePackageOutName, logInfo, logWarning) +import Poseidon.SecondaryTypes (IndividualInfo (..)) import Control.Exception (catch, throwIO) import Control.Monad (forM, forM_, unless, when) import Control.Monad.Reader (ask) -import Data.List (intercalate, nub, (\\)) +import Data.List (intercalate, nub, (\\), groupBy, sortBy) import Data.Maybe (mapMaybe) import Data.Time (getCurrentTime) import qualified Data.Vector as V @@ -120,8 +121,33 @@ runForge ( logInfo $ (show . length $ relevantPackages) ++ " packages contain data for this forging operation" when (null relevantPackages) $ liftIO $ throwIO PoseidonEmptyForgeException + -- all individuals from relevant packages + let allInds = getJointIndividualInfo $ relevantPackages + -- determine relevant individual indices - let relevantIndices = conformingEntityIndices entities . getJointIndividualInfo $ relevantPackages + let relevantIndicesWithDuplicates = conformingEntityIndices entities allInds + relevantIndividuals = map (allInds !!) relevantIndicesWithDuplicates + relevantIndividualsSimpleName = map indInfoName relevantIndividuals + relevantIndividualsFullName = map (\(IndividualInfo indN groupNs pacN) -> pacN ++ "." ++ head groupNs ++ "." ++ indN) relevantIndividuals + + -- find duplicates + let equalNameIndividuals = groupBy (\(_,x,_) (_,y,_) -> x == y) $ + sortBy (\(_,x,_) (_,y,_) -> compare x y) $ + zip3 relevantIndicesWithDuplicates relevantIndividualsSimpleName relevantIndividualsFullName + nonDuplicatedInds = concat $ filter (\x -> length x == 1) equalNameIndividuals + duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividuals + + dIs <- if null duplicatedInds + then return [] + else do + logWarning $ "There are duplicated individuals" + mapM_ (\(_,simpleName,fullName) -> logWarning $ simpleName ++ " -> " ++ fullName) duplicatedInds + logWarning $ "Trying to recover" + let huhu = filter (\(_,_,x) -> x `elem` ([a | Ind a <- nonExistentEntities])) duplicatedInds + mapM_ (\(_,simpleName,fullName) -> logWarning $ simpleName ++ " -> " ++ fullName) huhu + return huhu + + let relevantIndices = map (\(x,_,_) -> x) nonDuplicatedInds ++ map (\(x,_,_) -> x) dIs -- collect data -- -- janno From 6d94abb8079a21c2535af6e785101a55a983b6dc Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 16 Dec 2022 20:51:18 +0100 Subject: [PATCH 03/26] tried to make the user feedback more helpful and clear --- src/Poseidon/CLI/Forge.hs | 54 ++++++++++++++++++++++++--------------- src/Poseidon/Package.hs | 4 +-- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 25194101f..082812df2 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -54,6 +54,7 @@ import SequenceFormats.Plink (writePlink) import System.Directory (createDirectoryIfMissing) import System.FilePath (dropTrailingPathSeparator, (<.>), ()) +import Data.Function ((&)) -- | A datatype representing command line options for the survey command data ForgeOptions = ForgeOptions @@ -113,7 +114,8 @@ runForge ( -- check for entities that do not exist this this dataset let nonExistentEntities = findNonExistentEntities entities . getJointIndividualInfo $ allPackages unless (null nonExistentEntities) $ - logWarning $ "The following entities do not exist in this dataset and will be ignored: " ++ + logWarning $ "Detected entities that do not exist in the dataset. " ++ + "They will be considered to recover from duplicated individuals or ignored: " ++ intercalate ", " (map show nonExistentEntities) -- determine relevant packages @@ -124,30 +126,42 @@ runForge ( -- all individuals from relevant packages let allInds = getJointIndividualInfo $ relevantPackages - -- determine relevant individual indices + -- determine relevant individuals let relevantIndicesWithDuplicates = conformingEntityIndices entities allInds - relevantIndividuals = map (allInds !!) relevantIndicesWithDuplicates - relevantIndividualsSimpleName = map indInfoName relevantIndividuals - relevantIndividualsFullName = map (\(IndividualInfo indN groupNs pacN) -> pacN ++ "." ++ head groupNs ++ "." ++ indN) relevantIndividuals + relevantInds = map (allInds !!) relevantIndicesWithDuplicates + relevantIndsSimpleName = map indInfoName relevantInds + relevantIndsFullName = map (\(IndividualInfo indN groupNs pacN) -> pacN ++ "." ++ head groupNs ++ "." ++ indN) relevantInds -- find duplicates - let equalNameIndividuals = groupBy (\(_,x,_) (_,y,_) -> x == y) $ - sortBy (\(_,x,_) (_,y,_) -> compare x y) $ - zip3 relevantIndicesWithDuplicates relevantIndividualsSimpleName relevantIndividualsFullName - nonDuplicatedInds = concat $ filter (\x -> length x == 1) equalNameIndividuals - duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividuals + let equalNameIndividuals = + zip3 relevantIndicesWithDuplicates relevantIndsSimpleName relevantIndsFullName & + sortBy (\(_,x,_) (_,y,_) -> compare x y) & + groupBy (\(_,x,_) (_,y,_) -> x == y) + singleInds = concat $ filter (\x -> length x == 1) equalNameIndividuals + duplicatedInds = concat $ filter (\x -> length x > 1 ) equalNameIndividuals - dIs <- if null duplicatedInds - then return [] - else do - logWarning $ "There are duplicated individuals" - mapM_ (\(_,simpleName,fullName) -> logWarning $ simpleName ++ " -> " ++ fullName) duplicatedInds - logWarning $ "Trying to recover" - let huhu = filter (\(_,_,x) -> x `elem` ([a | Ind a <- nonExistentEntities])) duplicatedInds - mapM_ (\(_,simpleName,fullName) -> logWarning $ simpleName ++ " -> " ++ fullName) huhu - return huhu + dupIndsToKeep <- + if null duplicatedInds + then return [] + else do + logWarning $ "There are duplicated individuals, but forge does not allow that" + logWarning $ "Please use the following names in your --forgeString or --forgeFile to select them explictly" + mapM_ (\(_,simpleName,fullName) -> logWarning $ simpleName ++ " -> <" ++ fullName ++ ">") duplicatedInds + unless (null nonExistentEntities) $ + logWarning $ "Trying to apply nonexistent entities to recover..." + let selectedDuplicatedInds = filter (\(_,_,x) -> x `elem` ([a | Ind a <- nonExistentEntities])) duplicatedInds + unless (null selectedDuplicatedInds) $ do + logWarning $ "You made a decision for the following Individuals: " ++ intercalate "," (map (\(_,x,_) -> x) selectedDuplicatedInds) + let totalNames = nub $ map (\(_,x,_) -> x) duplicatedInds + recoveredNames = nub $ map (\(_,x,_) -> x) selectedDuplicatedInds + notRecoveredNames = totalNames \\ recoveredNames + unless (null notRecoveredNames) $ + liftIO $ throwIO $ PoseidonForgeEntitiesException $ + "Please make a decision for the following duplicated individuals: " ++ + intercalate ", " notRecoveredNames + return selectedDuplicatedInds - let relevantIndices = map (\(x,_,_) -> x) nonDuplicatedInds ++ map (\(x,_,_) -> x) dIs + let relevantIndices = map (\(x,_,_) -> x) singleInds ++ map (\(x,_,_) -> x) dupIndsToKeep -- collect data -- -- janno diff --git a/src/Poseidon/Package.hs b/src/Poseidon/Package.hs index f877c815b..ce55046c0 100644 --- a/src/Poseidon/Package.hs +++ b/src/Poseidon/Package.hs @@ -434,12 +434,12 @@ checkIndividualsUnique stopOnDuplicates indEntries = do if stopOnDuplicates then do liftIO $ throwIO $ PoseidonCollectionException $ - "Duplicate individuals (" ++ + "Duplicate individuals in package collection (" ++ intercalate ", " (genoIDs \\ nub genoIDs) ++ ")" else do logWarning $ - "Duplicate individuals (" ++ + "Duplicate individuals in package collection (" ++ intercalate ", " (take 3 $ genoIDs \\ nub genoIDs) ++ if length (genoIDs \\ nub genoIDs) > 3 then ", ...)" From c6875e0a2408c8c2c7058cd760b35a712efb05bc Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sat, 17 Dec 2022 18:00:52 +0100 Subject: [PATCH 04/26] laid the foundation for better duplicate filtering --- src/Poseidon/CLI/Forge.hs | 12 +++++---- src/Poseidon/EntitiesList.hs | 52 +++++++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 082812df2..d19b292c0 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -9,7 +9,7 @@ import Poseidon.EntitiesList (EntityInput, PoseidonEntity (..), conformingEntityIndices, filterRelevantPackages, findNonExistentEntities, - readEntityInputs) + readEntityInputs, getIndName, PoseidonIndividual (..)) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), GenotypeFormatSpec (..), @@ -114,10 +114,12 @@ runForge ( -- check for entities that do not exist this this dataset let nonExistentEntities = findNonExistentEntities entities . getJointIndividualInfo $ allPackages unless (null nonExistentEntities) $ - logWarning $ "Detected entities that do not exist in the dataset. " ++ - "They will be considered to recover from duplicated individuals or ignored: " ++ + logWarning $ "Detected entities that do not exist in the dataset. They will be ignored: " ++ intercalate ", " (map show nonExistentEntities) + -- extract SpecificInd values + let specificInds = [SpecificInd p g i | (Include (Ind (SpecificInd p g i))) <- entities] + -- determine relevant packages let relevantPackages = filterRelevantPackages entities allPackages logInfo $ (show . length $ relevantPackages) ++ " packages contain data for this forging operation" @@ -130,7 +132,7 @@ runForge ( let relevantIndicesWithDuplicates = conformingEntityIndices entities allInds relevantInds = map (allInds !!) relevantIndicesWithDuplicates relevantIndsSimpleName = map indInfoName relevantInds - relevantIndsFullName = map (\(IndividualInfo indN groupNs pacN) -> pacN ++ "." ++ head groupNs ++ "." ++ indN) relevantInds + relevantIndsFullName = map (\(IndividualInfo indN groupNs pacN) -> pacN ++ ":" ++ head groupNs ++ ":" ++ indN) relevantInds -- find duplicates let equalNameIndividuals = @@ -149,7 +151,7 @@ runForge ( mapM_ (\(_,simpleName,fullName) -> logWarning $ simpleName ++ " -> <" ++ fullName ++ ">") duplicatedInds unless (null nonExistentEntities) $ logWarning $ "Trying to apply nonexistent entities to recover..." - let selectedDuplicatedInds = filter (\(_,_,x) -> x `elem` ([a | Ind a <- nonExistentEntities])) duplicatedInds + let selectedDuplicatedInds = filter (\(_,_,x) -> x `elem` ([a | Ind (SpecificInd _ _ a) <- nonExistentEntities])) duplicatedInds unless (null selectedDuplicatedInds) $ do logWarning $ "You made a decision for the following Individuals: " ++ intercalate "," (map (\(_,x,_) -> x) selectedDuplicatedInds) let totalNames = nub $ map (\(_,x,_) -> x) duplicatedInds diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index 6f05a0405..1cf21ac9d 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -4,7 +4,7 @@ module Poseidon.EntitiesList ( indInfoConformsToEntitySpec, underlyingEntity, entitySpecParser, readEntitiesFromFile, readEntitiesFromString, findNonExistentEntities, indInfoFindRelevantPackageNames, filterRelevantPackages, - conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs) where + conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..)) where import Poseidon.Package (PoseidonPackage (..), getJointIndividualInfo) @@ -27,19 +27,34 @@ import qualified Text.Parsec as P import qualified Text.Parsec.String as P -- | A datatype to represent a package, a group or an individual -data PoseidonEntity = Pac String +data PoseidonEntity = + Pac String | Group String - | Ind String + | Ind PoseidonIndividual deriving (Eq, Ord) +data PoseidonIndividual = + SimpleInd String + | SpecificInd String String String + deriving (Eq, Ord) + +getIndName :: PoseidonIndividual -> String +getIndName (SimpleInd n) = n +getIndName (SpecificInd _ _ n) = n + +instance Show PoseidonIndividual where + show (SimpleInd i) = "<" ++ i ++ ">" + show (SpecificInd p g i) = "<" ++ p ++ ":" ++ g ++ ":" ++ i ++ ">" + instance Show PoseidonEntity where - show (Pac n) = "*" ++ n ++ "*" - show (Group n) = n - show (Ind n) = "<" ++ n ++ ">" + show (Pac p) = "*" ++ p ++ "*" + show (Group g) = g + show (Ind i) = show i type EntitiesList = [PoseidonEntity] -data SignedEntity = Include PoseidonEntity +data SignedEntity = + Include PoseidonEntity | Exclude PoseidonEntity deriving (Eq, Ord) @@ -65,7 +80,8 @@ instance EntitySpec SignedEntity where shouldIncExc (Include entity) = if entity & isIndInfo then Just True else Nothing shouldIncExc (Exclude entity) = if entity & isIndInfo then Just False else Nothing isIndInfo :: PoseidonEntity -> Bool - isIndInfo (Ind n) = n == indName + isIndInfo (Ind (SimpleInd n)) = n == indName + isIndInfo (Ind (SpecificInd _ _ n)) = n == indName isIndInfo (Group n) = n `elem` groupNames isIndInfo (Pac n) = n == pacName underlyingEntity = removeEntitySign @@ -79,10 +95,20 @@ instance EntitySpec PoseidonEntity where underlyingEntity = id entitySpecParser = parsePac <|> parseGroup <|> parseInd where - parsePac = Pac <$> P.between (P.char '*') (P.char '*') parseName - parseGroup = Group <$> parseName - parseInd = Ind <$> P.between (P.char '<') (P.char '>') parseName - parseName = P.many1 (P.satisfy (\c -> not (isSpace c || c `elem` ",<>*"))) + parsePac = Pac <$> P.between (P.char '*') (P.char '*') parseName + parseGroup = Group <$> parseName + parseInd = Ind <$> (parseSpecificInd <|> parseSimpleInd) + parseName = P.many1 (P.satisfy (\c -> not (isSpace c || c `elem` ":,<>*"))) + parseSimpleInd = SimpleInd <$> P.between (P.char '<') (P.char '>') parseName + parseSpecificInd = do + _ <- P.char '<' + pacName <- parseName + _ <- P.char ':' + groupName <- parseName + _ <- P.char ':' + individualName <- parseName + _ <- P.char '>' + return $ SpecificInd pacName groupName individualName -- turns out that we cannot easily write instances for classes, so need to be explicit for both types instance FromJSON PoseidonEntity where parseJSON = withText "PoseidonEntity" aesonParseEntitySpec @@ -155,7 +181,7 @@ findNonExistentEntities entities individuals = groupNamesStats = nub [ group | Group group <- map underlyingEntity entities] indNamesStats = nub [ ind | Ind ind <- map underlyingEntity entities] missingPacs = map Pac $ titlesRequestedPacs \\ titlesPac - missingInds = map Ind $ indNamesStats \\ indNamesPac + missingInds = map Ind $ filter (\x -> getIndName x `notElem` indNamesPac) indNamesStats missingGroups = map Group $ groupNamesStats \\ groupNamesPac in missingPacs ++ missingInds ++ missingGroups From 14ecfe5ce7965e0060513c9f700f04c0fb33fb1a Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sat, 17 Dec 2022 23:29:55 +0100 Subject: [PATCH 05/26] work on a more elegant solution for the duplicate handling - doesn't work yet --- src/Poseidon/CLI/Forge.hs | 50 +++++++++++----------------------- src/Poseidon/EntitiesList.hs | 22 +++++++-------- src/Poseidon/SecondaryTypes.hs | 5 +++- 3 files changed, 31 insertions(+), 46 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index d19b292c0..c0386b32e 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -32,7 +32,7 @@ import Poseidon.Package (PackageReadOptions (..), import Poseidon.Utils (PoseidonException (..), PoseidonLogIO, determinePackageOutName, logInfo, - logWarning) + logWarning, logError) import Poseidon.SecondaryTypes (IndividualInfo (..)) import Control.Exception (catch, throwIO) @@ -118,7 +118,7 @@ runForge ( intercalate ", " (map show nonExistentEntities) -- extract SpecificInd values - let specificInds = [SpecificInd p g i | (Include (Ind (SpecificInd p g i))) <- entities] + let specificInds = [i | (Include (Ind (SpecificInd i))) <- entities] -- determine relevant packages let relevantPackages = filterRelevantPackages entities allPackages @@ -129,41 +129,23 @@ runForge ( let allInds = getJointIndividualInfo $ relevantPackages -- determine relevant individuals - let relevantIndicesWithDuplicates = conformingEntityIndices entities allInds - relevantInds = map (allInds !!) relevantIndicesWithDuplicates - relevantIndsSimpleName = map indInfoName relevantInds - relevantIndsFullName = map (\(IndividualInfo indN groupNs pacN) -> pacN ++ ":" ++ head groupNs ++ ":" ++ indN) relevantInds + let relevantIndizesAndInds = conformingEntityIndices entities allInds + relevantIndices = map fst relevantIndizesAndInds + relevantInds = map snd relevantIndizesAndInds - -- find duplicates + -- check for duplicates let equalNameIndividuals = - zip3 relevantIndicesWithDuplicates relevantIndsSimpleName relevantIndsFullName & - sortBy (\(_,x,_) (_,y,_) -> compare x y) & - groupBy (\(_,x,_) (_,y,_) -> x == y) - singleInds = concat $ filter (\x -> length x == 1) equalNameIndividuals - duplicatedInds = concat $ filter (\x -> length x > 1 ) equalNameIndividuals + relevantInds & + sortBy (\(IndividualInfo a _ _) (IndividualInfo b _ _) -> compare a b) & + groupBy (\(IndividualInfo a _ _) (IndividualInfo b _ _) -> a == b) + duplicatedInds = filter (\x -> length x > 1 ) equalNameIndividuals + duplicatedIndsNotHandledBySpecifics = concat $ filter (\xs -> length (filter (\x -> x `notElem` specificInds) xs) > 1) duplicatedInds - dupIndsToKeep <- - if null duplicatedInds - then return [] - else do - logWarning $ "There are duplicated individuals, but forge does not allow that" - logWarning $ "Please use the following names in your --forgeString or --forgeFile to select them explictly" - mapM_ (\(_,simpleName,fullName) -> logWarning $ simpleName ++ " -> <" ++ fullName ++ ">") duplicatedInds - unless (null nonExistentEntities) $ - logWarning $ "Trying to apply nonexistent entities to recover..." - let selectedDuplicatedInds = filter (\(_,_,x) -> x `elem` ([a | Ind (SpecificInd _ _ a) <- nonExistentEntities])) duplicatedInds - unless (null selectedDuplicatedInds) $ do - logWarning $ "You made a decision for the following Individuals: " ++ intercalate "," (map (\(_,x,_) -> x) selectedDuplicatedInds) - let totalNames = nub $ map (\(_,x,_) -> x) duplicatedInds - recoveredNames = nub $ map (\(_,x,_) -> x) selectedDuplicatedInds - notRecoveredNames = totalNames \\ recoveredNames - unless (null notRecoveredNames) $ - liftIO $ throwIO $ PoseidonForgeEntitiesException $ - "Please make a decision for the following duplicated individuals: " ++ - intercalate ", " notRecoveredNames - return selectedDuplicatedInds - - let relevantIndices = map (\(x,_,_) -> x) singleInds ++ map (\(x,_,_) -> x) dupIndsToKeep + unless (null duplicatedInds) $ do + logError "There are duplicated individuals, but forge does not allow that" + logError "Please specify in your --forgeString or --forgeFile with the following names" + mapM_ (\i@(IndividualInfo n _ _) -> logError $ show (SimpleInd n) ++ " -> " ++ show (SpecificInd i)) duplicatedIndsNotHandledBySpecifics + liftIO $ throwIO $ PoseidonForgeEntitiesException "Duplicated individuals" -- collect data -- -- janno diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index 1cf21ac9d..fb7fe7533 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -35,16 +35,16 @@ data PoseidonEntity = data PoseidonIndividual = SimpleInd String - | SpecificInd String String String + | SpecificInd IndividualInfo deriving (Eq, Ord) getIndName :: PoseidonIndividual -> String getIndName (SimpleInd n) = n -getIndName (SpecificInd _ _ n) = n +getIndName (SpecificInd (IndividualInfo n _ _)) = n instance Show PoseidonIndividual where - show (SimpleInd i) = "<" ++ i ++ ">" - show (SpecificInd p g i) = "<" ++ p ++ ":" ++ g ++ ":" ++ i ++ ">" + show (SimpleInd i ) = "<" ++ i ++ ">" + show (SpecificInd (IndividualInfo i g p)) = "<" ++ p ++ ":" ++ (head g) ++ ":" ++ i ++ ">" instance Show PoseidonEntity where show (Pac p) = "*" ++ p ++ "*" @@ -71,7 +71,7 @@ class Eq a => EntitySpec a where entitySpecParser :: P.Parser a instance EntitySpec SignedEntity where - indInfoConformsToEntitySpec signedEntities (IndividualInfo indName groupNames pacName) = + indInfoConformsToEntitySpec signedEntities indInfo@(IndividualInfo indName groupNames pacName) = case mapMaybe shouldIncExc signedEntities of [] -> False xs -> last xs @@ -81,7 +81,7 @@ instance EntitySpec SignedEntity where shouldIncExc (Exclude entity) = if entity & isIndInfo then Just False else Nothing isIndInfo :: PoseidonEntity -> Bool isIndInfo (Ind (SimpleInd n)) = n == indName - isIndInfo (Ind (SpecificInd _ _ n)) = n == indName + isIndInfo (Ind (SpecificInd i)) = i == indInfo isIndInfo (Group n) = n `elem` groupNames isIndInfo (Pac n) = n == pacName underlyingEntity = removeEntitySign @@ -97,7 +97,7 @@ instance EntitySpec PoseidonEntity where where parsePac = Pac <$> P.between (P.char '*') (P.char '*') parseName parseGroup = Group <$> parseName - parseInd = Ind <$> (parseSpecificInd <|> parseSimpleInd) + parseInd = Ind <$> (P.try parseSimpleInd <|> parseSpecificInd) parseName = P.many1 (P.satisfy (\c -> not (isSpace c || c `elem` ":,<>*"))) parseSimpleInd = SimpleInd <$> P.between (P.char '<') (P.char '>') parseName parseSpecificInd = do @@ -106,9 +106,9 @@ instance EntitySpec PoseidonEntity where _ <- P.char ':' groupName <- parseName _ <- P.char ':' - individualName <- parseName + indName <- parseName _ <- P.char '>' - return $ SpecificInd pacName groupName individualName + return $ SpecificInd (IndividualInfo indName [groupName] pacName) -- turns out that we cannot easily write instances for classes, so need to be explicit for both types instance FromJSON PoseidonEntity where parseJSON = withText "PoseidonEntity" aesonParseEntitySpec @@ -185,8 +185,8 @@ findNonExistentEntities entities individuals = missingGroups = map Group $ groupNamesStats \\ groupNamesPac in missingPacs ++ missingInds ++ missingGroups -conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [Int] -conformingEntityIndices entities = map fst . filter (indInfoConformsToEntitySpec entities . snd) . zip [0..] +conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, IndividualInfo)] +conformingEntityIndices entities = filter (indInfoConformsToEntitySpec entities . snd) . zip [0..] readEntityInputs :: (MonadIO m, EntitySpec a) => [EntityInput a] -> m [a] -- An empty list means that entities are wanted. readEntityInputs entityInputs = diff --git a/src/Poseidon/SecondaryTypes.hs b/src/Poseidon/SecondaryTypes.hs index 9a728b99b..eeeabf45b 100644 --- a/src/Poseidon/SecondaryTypes.hs +++ b/src/Poseidon/SecondaryTypes.hs @@ -33,7 +33,10 @@ data IndividualInfo = IndividualInfo { indInfoName :: String , indInfoGroups :: [String] , indInfoPacName :: String - } deriving Show + } deriving (Show, Ord) + +instance Eq IndividualInfo where + (==) (IndividualInfo a1 b1 c1) (IndividualInfo a2 b2 c2) = a1 == a2 && head b1 == head b2 && c1 == c2 instance ToJSON IndividualInfo where toJSON x = object [ From c964d7fbc7258a06fa1a20b9dd4b51b4973978cb Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sun, 18 Dec 2022 12:10:05 +0100 Subject: [PATCH 06/26] even more significant changes necessary to make this work --- src/Poseidon/CLI/Forge.hs | 32 ++++++++++++------ src/Poseidon/EntitiesList.hs | 65 +++++++++++++++++++++++++----------- 2 files changed, 68 insertions(+), 29 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index c0386b32e..7070b278d 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -9,7 +9,7 @@ import Poseidon.EntitiesList (EntityInput, PoseidonEntity (..), conformingEntityIndices, filterRelevantPackages, findNonExistentEntities, - readEntityInputs, getIndName, PoseidonIndividual (..)) + readEntityInputs, getIndName, PoseidonIndividual (..), SelectionLevel2 (..)) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), GenotypeFormatSpec (..), @@ -117,7 +117,7 @@ runForge ( logWarning $ "Detected entities that do not exist in the dataset. They will be ignored: " ++ intercalate ", " (map show nonExistentEntities) - -- extract SpecificInd values + -- extract fully specified individuals from the entity list let specificInds = [i | (Include (Ind (SpecificInd i))) <- entities] -- determine relevant packages @@ -129,24 +129,24 @@ runForge ( let allInds = getJointIndividualInfo $ relevantPackages -- determine relevant individuals - let relevantIndizesAndInds = conformingEntityIndices entities allInds - relevantIndices = map fst relevantIndizesAndInds - relevantInds = map snd relevantIndizesAndInds + let relevantInds = conformingEntityIndices entities allInds -- check for duplicates let equalNameIndividuals = relevantInds & - sortBy (\(IndividualInfo a _ _) (IndividualInfo b _ _) -> compare a b) & - groupBy (\(IndividualInfo a _ _) (IndividualInfo b _ _) -> a == b) - duplicatedInds = filter (\x -> length x > 1 ) equalNameIndividuals - duplicatedIndsNotHandledBySpecifics = concat $ filter (\xs -> length (filter (\x -> x `notElem` specificInds) xs) > 1) duplicatedInds + sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & + groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) + equalNameIndividualsFiltered = map onlyKeepSpecifics equalNameIndividuals + duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividualsFiltered unless (null duplicatedInds) $ do logError "There are duplicated individuals, but forge does not allow that" logError "Please specify in your --forgeString or --forgeFile with the following names" - mapM_ (\i@(IndividualInfo n _ _) -> logError $ show (SimpleInd n) ++ " -> " ++ show (SpecificInd i)) duplicatedIndsNotHandledBySpecifics + mapM_ (\(_,i@(IndividualInfo n _ _),_) -> logError $ show (SimpleInd n) ++ " -> " ++ show (SpecificInd i)) duplicatedInds liftIO $ throwIO $ PoseidonForgeEntitiesException "Duplicated individuals" + let relevantIndices = map (\(i,_,_) -> i) $ concat equalNameIndividualsFiltered + -- collect data -- -- janno let jannoRows = getJointJanno relevantPackages @@ -261,3 +261,15 @@ fillMissingSnpSets packages = forM packages $ \pac -> do logWarning $ "Warning for package " ++ title_ ++ ": field \"snpSet\" \ \is not set. I will interpret this as \"snpSet: Other\"" return SNPSetOther + +--exactlyOneSpecified :: [IndividualInfo] -> [IndividualInfo] -> Bool +--exactlyOneSpecified specified xs = +-- let indsThatAreSpecified = filter (`elem` specified) xs +-- in length indsThatAreSpecified == 1 + +onlyKeepSpecifics :: [(Int, IndividualInfo, SelectionLevel2)] -> [(Int, IndividualInfo, SelectionLevel2)] +onlyKeepSpecifics xs = + let highPrio = [ x | x@(_,_,ShouldBeIncludedWithHigherPriority) <- xs] + in if length xs > 1 && length highPrio == 1 + then highPrio + else xs \ No newline at end of file diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index fb7fe7533..2239e2e7b 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -4,7 +4,7 @@ module Poseidon.EntitiesList ( indInfoConformsToEntitySpec, underlyingEntity, entitySpecParser, readEntitiesFromFile, readEntitiesFromString, findNonExistentEntities, indInfoFindRelevantPackageNames, filterRelevantPackages, - conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..)) where + conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..), SelectionLevel2 (..)) where import Poseidon.Package (PoseidonPackage (..), getJointIndividualInfo) @@ -33,6 +33,13 @@ data PoseidonEntity = | Ind PoseidonIndividual deriving (Eq, Ord) +instance Show PoseidonEntity where + show (Pac p) = "*" ++ p ++ "*" + show (Group g) = g + show (Ind i) = show i + +type EntitiesList = [PoseidonEntity] + data PoseidonIndividual = SimpleInd String | SpecificInd IndividualInfo @@ -46,12 +53,21 @@ instance Show PoseidonIndividual where show (SimpleInd i ) = "<" ++ i ++ ">" show (SpecificInd (IndividualInfo i g p)) = "<" ++ p ++ ":" ++ (head g) ++ ":" ++ i ++ ">" -instance Show PoseidonEntity where - show (Pac p) = "*" ++ p ++ "*" - show (Group g) = g - show (Ind i) = show i +data SelectionLevel1 = + IsInIndInfo + | IsInIndInfoSpecified + | IsNotInIndInfo -type EntitiesList = [PoseidonEntity] +data SelectionLevel2 = + ShouldBeIncluded + | ShouldBeIncludedWithHigherPriority + | ShouldNotBeIncluded + deriving Show + +meansIn :: SelectionLevel2 -> Bool +meansIn ShouldBeIncluded = True +meansIn ShouldBeIncludedWithHigherPriority = True +meansIn ShouldNotBeIncluded = False data SignedEntity = Include PoseidonEntity @@ -66,24 +82,34 @@ type SignedEntitiesList = [SignedEntity] -- A class to generalise signed and unsigned Entity Lists. Both have the feature that they can be used to filter individuals. class Eq a => EntitySpec a where - indInfoConformsToEntitySpec :: [a] -> IndividualInfo -> Bool + indInfoConformsToEntitySpec :: [a] -> IndividualInfo -> SelectionLevel2 underlyingEntity :: a -> PoseidonEntity entitySpecParser :: P.Parser a instance EntitySpec SignedEntity where indInfoConformsToEntitySpec signedEntities indInfo@(IndividualInfo indName groupNames pacName) = case mapMaybe shouldIncExc signedEntities of - [] -> False + [] -> ShouldNotBeIncluded xs -> last xs where - shouldIncExc :: SignedEntity -> Maybe Bool - shouldIncExc (Include entity) = if entity & isIndInfo then Just True else Nothing - shouldIncExc (Exclude entity) = if entity & isIndInfo then Just False else Nothing - isIndInfo :: PoseidonEntity -> Bool - isIndInfo (Ind (SimpleInd n)) = n == indName - isIndInfo (Ind (SpecificInd i)) = i == indInfo - isIndInfo (Group n) = n `elem` groupNames - isIndInfo (Pac n) = n == pacName + shouldIncExc :: SignedEntity -> Maybe SelectionLevel2 + shouldIncExc (Include entity) = + case isIndInfo entity of + IsInIndInfo -> Just ShouldBeIncluded + IsInIndInfoSpecified -> Just ShouldBeIncludedWithHigherPriority + IsNotInIndInfo -> Nothing + --if isIndInfo entity then Just True else Nothing + shouldIncExc (Exclude entity) = + case isIndInfo entity of + IsInIndInfo -> Just ShouldNotBeIncluded + IsInIndInfoSpecified -> Just ShouldNotBeIncluded + IsNotInIndInfo -> Nothing + --if isIndInfo entity then Just False else Nothing + isIndInfo :: PoseidonEntity -> SelectionLevel1 + isIndInfo (Ind (SimpleInd n)) = if n == indName then IsInIndInfo else IsNotInIndInfo + isIndInfo (Ind (SpecificInd i)) = if i == indInfo then IsInIndInfoSpecified else IsNotInIndInfo + isIndInfo (Group n) = if n `elem` groupNames then IsInIndInfo else IsNotInIndInfo + isIndInfo (Pac n) = if n == pacName then IsInIndInfo else IsNotInIndInfo underlyingEntity = removeEntitySign entitySpecParser = parseSign <*> entitySpecParser where @@ -165,7 +191,7 @@ readEntitiesFromString s = case P.runParser (entitiesListP <* P.eof) () "" s of indInfoFindRelevantPackageNames :: (EntitySpec a) => [a] -> [IndividualInfo] -> [String] indInfoFindRelevantPackageNames e = - nub . map indInfoPacName . filter (indInfoConformsToEntitySpec e) + nub . map indInfoPacName . filter (meansIn . indInfoConformsToEntitySpec e) filterRelevantPackages :: (EntitySpec a) => [a] -> [PoseidonPackage] -> [PoseidonPackage] filterRelevantPackages e packages = @@ -185,8 +211,9 @@ findNonExistentEntities entities individuals = missingGroups = map Group $ groupNamesStats \\ groupNamesPac in missingPacs ++ missingInds ++ missingGroups -conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, IndividualInfo)] -conformingEntityIndices entities = filter (indInfoConformsToEntitySpec entities . snd) . zip [0..] +conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, IndividualInfo, SelectionLevel2)] +conformingEntityIndices entities xs = --filter (indInfoConformsToEntitySpec entities . snd) . zip [0..] xs + filter (\(_,_,level) -> meansIn level) $ map (\(index, x) -> (index, x, indInfoConformsToEntitySpec entities x)) (zip [0..] xs) readEntityInputs :: (MonadIO m, EntitySpec a) => [EntityInput a] -> m [a] -- An empty list means that entities are wanted. readEntityInputs entityInputs = From 200db25b702b06705f20debae7a3448013ca0c3b Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sun, 18 Dec 2022 14:27:58 +0100 Subject: [PATCH 07/26] code cleaning --- src/Poseidon/CLI/Forge.hs | 47 ++++++++++-------------------------- src/Poseidon/EntitiesList.hs | 31 +++++++++++++++--------- 2 files changed, 32 insertions(+), 46 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 7070b278d..ddeb9337d 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -9,7 +9,7 @@ import Poseidon.EntitiesList (EntityInput, PoseidonEntity (..), conformingEntityIndices, filterRelevantPackages, findNonExistentEntities, - readEntityInputs, getIndName, PoseidonIndividual (..), SelectionLevel2 (..)) + readEntityInputs, getIndName, PoseidonIndividual (..), onlyKeepSpecifics) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), GenotypeFormatSpec (..), @@ -125,35 +125,35 @@ runForge ( logInfo $ (show . length $ relevantPackages) ++ " packages contain data for this forging operation" when (null relevantPackages) $ liftIO $ throwIO PoseidonEmptyForgeException - -- all individuals from relevant packages + -- get all individuals from the relevant packages let allInds = getJointIndividualInfo $ relevantPackages - -- determine relevant individuals + -- determine which individuals are potentially relevant let relevantInds = conformingEntityIndices entities allInds - -- check for duplicates + -- resolve duplicates that are already specified let equalNameIndividuals = relevantInds & sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & - groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) - equalNameIndividualsFiltered = map onlyKeepSpecifics equalNameIndividuals - duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividualsFiltered - + groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) & + map onlyKeepSpecifics + + -- check if there still are duplicates and if yes, then stop + let duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividuals unless (null duplicatedInds) $ do logError "There are duplicated individuals, but forge does not allow that" - logError "Please specify in your --forgeString or --forgeFile with the following names" + logError "Please specify in your --forgeString or --forgeFile:" mapM_ (\(_,i@(IndividualInfo n _ _),_) -> logError $ show (SimpleInd n) ++ " -> " ++ show (SpecificInd i)) duplicatedInds - liftIO $ throwIO $ PoseidonForgeEntitiesException "Duplicated individuals" + liftIO $ throwIO $ PoseidonForgeEntitiesException "Unresolved duplicated individuals" - let relevantIndices = map (\(i,_,_) -> i) $ concat equalNameIndividualsFiltered + -- reduce individual list to a list of relevant indices + let relevantIndices = map (\(i,_,_) -> i) $ concat equalNameIndividuals -- collect data -- -- janno let jannoRows = getJointJanno relevantPackages relevantJannoRows = map (jannoRows !!) relevantIndices - -- check for duplicates among the individuals selected for merging - checkIndividualsUniqueJanno relevantJannoRows -- bib let bibEntries = concatMap posPacBib relevantPackages relevantBibEntries = filterBibEntries relevantJannoRows bibEntries @@ -237,15 +237,6 @@ sumNonMissingSNPs accumulator (_, geno) = do | x == Missing = 0 | otherwise = 1 -checkIndividualsUniqueJanno :: [JannoRow] -> PoseidonLogIO () -checkIndividualsUniqueJanno rows = do - let indIDs = map jPoseidonID rows - when (length indIDs /= length (nub indIDs)) $ do - liftIO $ throwIO $ PoseidonForgeEntitiesException $ - "Duplicate individuals in selection (" ++ - intercalate ", " (indIDs \\ nub indIDs) ++ - ")" - filterBibEntries :: [JannoRow] -> BibTeX -> BibTeX filterBibEntries samples references_ = let relevantPublications = nub . concatMap getJannoList . mapMaybe jPublication $ samples @@ -261,15 +252,3 @@ fillMissingSnpSets packages = forM packages $ \pac -> do logWarning $ "Warning for package " ++ title_ ++ ": field \"snpSet\" \ \is not set. I will interpret this as \"snpSet: Other\"" return SNPSetOther - ---exactlyOneSpecified :: [IndividualInfo] -> [IndividualInfo] -> Bool ---exactlyOneSpecified specified xs = --- let indsThatAreSpecified = filter (`elem` specified) xs --- in length indsThatAreSpecified == 1 - -onlyKeepSpecifics :: [(Int, IndividualInfo, SelectionLevel2)] -> [(Int, IndividualInfo, SelectionLevel2)] -onlyKeepSpecifics xs = - let highPrio = [ x | x@(_,_,ShouldBeIncludedWithHigherPriority) <- xs] - in if length xs > 1 && length highPrio == 1 - then highPrio - else xs \ No newline at end of file diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index 2239e2e7b..80961283d 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -4,7 +4,7 @@ module Poseidon.EntitiesList ( indInfoConformsToEntitySpec, underlyingEntity, entitySpecParser, readEntitiesFromFile, readEntitiesFromString, findNonExistentEntities, indInfoFindRelevantPackageNames, filterRelevantPackages, - conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..), SelectionLevel2 (..)) where + conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..), onlyKeepSpecifics) where import Poseidon.Package (PoseidonPackage (..), getJointIndividualInfo) @@ -53,6 +53,17 @@ instance Show PoseidonIndividual where show (SimpleInd i ) = "<" ++ i ++ ">" show (SpecificInd (IndividualInfo i g p)) = "<" ++ p ++ ":" ++ (head g) ++ ":" ++ i ++ ">" +data SignedEntity = + Include PoseidonEntity + | Exclude PoseidonEntity + deriving (Eq, Ord) + +instance Show SignedEntity where + show (Include a) = show a + show (Exclude a) = "-" ++ show a + +type SignedEntitiesList = [SignedEntity] + data SelectionLevel1 = IsInIndInfo | IsInIndInfoSpecified @@ -69,17 +80,6 @@ meansIn ShouldBeIncluded = True meansIn ShouldBeIncludedWithHigherPriority = True meansIn ShouldNotBeIncluded = False -data SignedEntity = - Include PoseidonEntity - | Exclude PoseidonEntity - deriving (Eq, Ord) - -instance Show SignedEntity where - show (Include a) = show a - show (Exclude a) = "-" ++ show a - -type SignedEntitiesList = [SignedEntity] - -- A class to generalise signed and unsigned Entity Lists. Both have the feature that they can be used to filter individuals. class Eq a => EntitySpec a where indInfoConformsToEntitySpec :: [a] -> IndividualInfo -> SelectionLevel2 @@ -215,6 +215,13 @@ conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, I conformingEntityIndices entities xs = --filter (indInfoConformsToEntitySpec entities . snd) . zip [0..] xs filter (\(_,_,level) -> meansIn level) $ map (\(index, x) -> (index, x, indInfoConformsToEntitySpec entities x)) (zip [0..] xs) +onlyKeepSpecifics :: [(Int, IndividualInfo, SelectionLevel2)] -> [(Int, IndividualInfo, SelectionLevel2)] +onlyKeepSpecifics xs = + let highPrio = [ x | x@(_,_,ShouldBeIncludedWithHigherPriority) <- xs] + in if length xs > 1 && length highPrio == 1 + then highPrio + else xs + readEntityInputs :: (MonadIO m, EntitySpec a) => [EntityInput a] -> m [a] -- An empty list means that entities are wanted. readEntityInputs entityInputs = fmap nub . fmap concat . forM entityInputs $ \entityInput -> case entityInput of From fdd08c4181adc73ffe11a9abd4c7e6202373a36b Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sun, 18 Dec 2022 14:35:08 +0100 Subject: [PATCH 08/26] stylish haskell --- src/Poseidon/CLI/Forge.hs | 17 ++++++++++------- src/Poseidon/EntitiesList.hs | 6 +++--- src/Poseidon/Package.hs | 7 ++++--- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index ddeb9337d..f28c7a717 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -5,11 +5,13 @@ module Poseidon.CLI.Forge where import Poseidon.BibFile (BibEntry (..), BibTeX, writeBibTeXFile) import Poseidon.EntitiesList (EntityInput, PoseidonEntity (..), + PoseidonIndividual (..), SignedEntity (..), conformingEntityIndices, filterRelevantPackages, findNonExistentEntities, - readEntityInputs, getIndName, PoseidonIndividual (..), onlyKeepSpecifics) + getIndName, onlyKeepSpecifics, + readEntityInputs) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), GenotypeFormatSpec (..), @@ -29,16 +31,18 @@ import Poseidon.Package (PackageReadOptions (..), newPackageTemplate, readPoseidonPackageCollection, writePoseidonPackage) +import Poseidon.SecondaryTypes (IndividualInfo (..)) import Poseidon.Utils (PoseidonException (..), PoseidonLogIO, - determinePackageOutName, logInfo, - logWarning, logError) -import Poseidon.SecondaryTypes (IndividualInfo (..)) + determinePackageOutName, logError, + logInfo, logWarning) import Control.Exception (catch, throwIO) import Control.Monad (forM, forM_, unless, when) import Control.Monad.Reader (ask) -import Data.List (intercalate, nub, (\\), groupBy, sortBy) +import Data.Function ((&)) +import Data.List (groupBy, intercalate, nub, sortBy, + (\\)) import Data.Maybe (mapMaybe) import Data.Time (getCurrentTime) import qualified Data.Vector as V @@ -54,7 +58,6 @@ import SequenceFormats.Plink (writePlink) import System.Directory (createDirectoryIfMissing) import System.FilePath (dropTrailingPathSeparator, (<.>), ()) -import Data.Function ((&)) -- | A datatype representing command line options for the survey command data ForgeOptions = ForgeOptions @@ -137,7 +140,7 @@ runForge ( sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) & map onlyKeepSpecifics - + -- check if there still are duplicates and if yes, then stop let duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividuals unless (null duplicatedInds) $ do diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index 80961283d..a60fa55ec 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -46,7 +46,7 @@ data PoseidonIndividual = deriving (Eq, Ord) getIndName :: PoseidonIndividual -> String -getIndName (SimpleInd n) = n +getIndName (SimpleInd n) = n getIndName (SpecificInd (IndividualInfo n _ _)) = n instance Show PoseidonIndividual where @@ -76,9 +76,9 @@ data SelectionLevel2 = deriving Show meansIn :: SelectionLevel2 -> Bool -meansIn ShouldBeIncluded = True +meansIn ShouldBeIncluded = True meansIn ShouldBeIncludedWithHigherPriority = True -meansIn ShouldNotBeIncluded = False +meansIn ShouldNotBeIncluded = False -- A class to generalise signed and unsigned Entity Lists. Both have the feature that they can be used to filter individuals. class Eq a => EntitySpec a where diff --git a/src/Poseidon/Package.hs b/src/Poseidon/Package.hs index ce55046c0..49dee42de 100644 --- a/src/Poseidon/Package.hs +++ b/src/Poseidon/Package.hs @@ -431,17 +431,18 @@ checkIndividualsUnique :: Bool -> [EigenstratIndEntry] -> PoseidonLogIO () checkIndividualsUnique stopOnDuplicates indEntries = do let genoIDs = [ x | EigenstratIndEntry x _ _ <- indEntries] when (length genoIDs /= length (nub genoIDs)) $ do + let dups = nub (genoIDs \\ nub genoIDs) if stopOnDuplicates then do liftIO $ throwIO $ PoseidonCollectionException $ "Duplicate individuals in package collection (" ++ - intercalate ", " (genoIDs \\ nub genoIDs) ++ + intercalate ", " dups ++ ")" else do logWarning $ "Duplicate individuals in package collection (" ++ - intercalate ", " (take 3 $ genoIDs \\ nub genoIDs) ++ - if length (genoIDs \\ nub genoIDs) > 3 + intercalate ", " (take 3 dups) ++ + if length (nub $ dups) > 3 then ", ...)" else ")" From 30bb2d6a694c44a5b91c3a7d28bc48116649ef12 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sun, 18 Dec 2022 14:49:15 +0100 Subject: [PATCH 09/26] made pedantic compiler happy --- src/Poseidon/CLI/Forge.hs | 8 ++------ src/Poseidon/EntitiesList.hs | 1 - 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index f28c7a717..0295f2ec9 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -10,7 +10,7 @@ import Poseidon.EntitiesList (EntityInput, PoseidonEntity (..), conformingEntityIndices, filterRelevantPackages, findNonExistentEntities, - getIndName, onlyKeepSpecifics, + onlyKeepSpecifics, readEntityInputs) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), @@ -41,8 +41,7 @@ import Control.Exception (catch, throwIO) import Control.Monad (forM, forM_, unless, when) import Control.Monad.Reader (ask) import Data.Function ((&)) -import Data.List (groupBy, intercalate, nub, sortBy, - (\\)) +import Data.List (groupBy, intercalate, nub, sortBy) import Data.Maybe (mapMaybe) import Data.Time (getCurrentTime) import qualified Data.Vector as V @@ -120,9 +119,6 @@ runForge ( logWarning $ "Detected entities that do not exist in the dataset. They will be ignored: " ++ intercalate ", " (map show nonExistentEntities) - -- extract fully specified individuals from the entity list - let specificInds = [i | (Include (Ind (SpecificInd i))) <- entities] - -- determine relevant packages let relevantPackages = filterRelevantPackages entities allPackages logInfo $ (show . length $ relevantPackages) ++ " packages contain data for this forging operation" diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index a60fa55ec..1fc996132 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -19,7 +19,6 @@ import Data.Aeson (FromJSON (..), ToJSON (..), Value (..), withText) import Data.Aeson.Types (Parser) import Data.Char (isSpace) -import Data.Function ((&)) import Data.List (nub, (\\)) import Data.Maybe (mapMaybe) import Data.Text (Text, pack, unpack) From 6deefb8c4f0bd8f5e0140ddceea0c962f9a5ed85 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sun, 18 Dec 2022 15:02:20 +0100 Subject: [PATCH 10/26] updated tests (needs more love) --- test/Poseidon/EntitiesListSpec.hs | 45 ++++++++++++++++--------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/test/Poseidon/EntitiesListSpec.hs b/test/Poseidon/EntitiesListSpec.hs index d24a73e33..e09082947 100644 --- a/test/Poseidon/EntitiesListSpec.hs +++ b/test/Poseidon/EntitiesListSpec.hs @@ -29,28 +29,29 @@ testReadPoseidonEntitiesString :: Spec testReadPoseidonEntitiesString = describe "Poseidon.EntitiesList.readPoseidonEntitiesString" $ do it "should parse single entity lists correctly" $ do - fromRight [] (readEntitiesFromString "") `shouldBe` [Include $ Ind "a"] + fromRight [] (readEntitiesFromString "") `shouldBe` [Include $ Ind (SimpleInd "a")] + fromRight [] (readEntitiesFromString "") `shouldBe` [Include $ Ind (SpecificInd $ IndividualInfo "a" ["b"] "c")] fromRight [] (readEntitiesFromString "b") `shouldBe` [Include $ Group "b"] fromRight [] (readEntitiesFromString "*c*") `shouldBe` [Include $ Pac "c"] it "should parse longer entity lists correctly" $ do fromRight [] (readEntitiesFromString ",b,*c*") `shouldBe` - map Include [Ind "a", Group "b", Pac "c"] + map Include [Ind (SimpleInd "a"), Group "b", Pac "c"] fromRight [] (readEntitiesFromString ",b1,,*c*,b2") `shouldBe` - map Include [Ind "a1", Group "b1", Ind "a2", Pac "c", Group "b2"] + map Include [Ind (SimpleInd "a1"), Group "b1", Ind (SimpleInd "a2"), Pac "c", Group "b2"] it "should parse unsigned entity lists correctly" $ do fromRight [] (readEntitiesFromString ",b,*c*") `shouldBe` - [Ind "a", Group "b", Pac "c"] + [Ind (SimpleInd "a"), Group "b", Pac "c"] fromRight [] (readEntitiesFromString ",b1,,*c*,b2") `shouldBe` - [Ind "a1", Group "b1", Ind "a2", Pac "c", Group "b2"] + [Ind (SimpleInd "a1"), Group "b1", Ind (SimpleInd "a2"), Pac "c", Group "b2"] it "should ignore spaces after commas" $ do fromRight [] (readEntitiesFromString ", b, *c*") `shouldBe` - map Include [Ind "a", Group "b", Pac "c"] + map Include [Ind (SimpleInd "a"), Group "b", Pac "c"] fromRight [] (readEntitiesFromString "*c*, b") `shouldBe` map Include [Pac "c", Group "b"] it "should parse exclusion entities correctly" $ do - fromRight [] (readEntitiesFromString "-") `shouldBe` [Exclude $ Ind "a"] + fromRight [] (readEntitiesFromString "-") `shouldBe` [Exclude $ Ind (SimpleInd "a")] fromRight [] (readEntitiesFromString "-, , -b1,b2,-*c1*, *c2*") `shouldBe` - [Exclude $ Ind "a1", Include $ Ind "a2", + [Exclude $ Ind (SimpleInd "a1"), Include $ Ind (SimpleInd "a2"), Exclude $ Group "b1", Include $ Group "b2", Exclude $ Pac "c1", Include $ Pac "c2"] it "should fail with any other spaces" $ do @@ -79,19 +80,19 @@ testReadEntitiesFromFile = it "should parse good, single-value-per-line files correctly" $ do g1res <- readEntitiesFromFile g1 g1res `shouldBe` - map Include [Ind "a", Group "b", Pac "c"] + map Include [Ind (SimpleInd "a"), Group "b", Pac "c"] it "should parse good, multi-value-per-line files correctly" $ do g2res <- readEntitiesFromFile g2 g2res `shouldBe` - map Include [Ind "a1", Ind "a2", Group "b1", Pac "c1", Pac "c2", Group "b2", Group "b3"] + map Include [Ind (SimpleInd "a1"), Ind (SimpleInd "a2"), Group "b1", Pac "c1", Pac "c2", Group "b2", Group "b3"] it "should handle empty lines and #-comments correctly" $ do g3res <- readEntitiesFromFile g3 g3res `shouldBe` - map Include [Ind "a1", Ind "a2", Group "b1", Group "b2", Group "b3"] + map Include [Ind (SimpleInd "a1"), Ind (SimpleInd "a2"), Group "b1", Group "b2", Group "b3"] it "should handle exclusion correctly" $ do g4res <- readEntitiesFromFile g4 g4res `shouldBe` - [Include $ Ind "a1", Exclude $ Ind "a2", + [Include $ Ind (SimpleInd "a1"), Exclude $ Ind (SimpleInd "a2"), Exclude $ Group "b1", Include $ Group "b1", Exclude $ Pac "c2"] it "should fail to parse bad files and throw an exception" $ do @@ -112,14 +113,14 @@ goodEntities :: EntitiesList goodEntities = [ Pac "Schiffels_2016", Group "POP1", - Ind "SAMPLE3" + Ind (SimpleInd "SAMPLE3") ] badEntities :: EntitiesList badEntities = [ Pac "Schiffels_2015", Group "foo", - Ind "bar" + Ind (SimpleInd "bar") ] testFindNonExistentEntities :: Spec @@ -151,11 +152,11 @@ testExtractEntityIndices = describe "Poseidon.EntitiesList.extractEntityIndices" $ do it "should select all relevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir - let indInts = conformingEntityIndices goodEntities (getJointIndividualInfo ps) + let indInts = map (\(i,_,_) -> i) $ conformingEntityIndices goodEntities (getJointIndividualInfo ps) indInts `shouldMatchList` [0, 2, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23] it "should drop all irrelevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir - let indInts = conformingEntityIndices badEntities (getJointIndividualInfo ps) + let indInts = map (\(i,_,_) -> i) $ conformingEntityIndices badEntities (getJointIndividualInfo ps) indInts `shouldBe` [] it "should correctly extract indices with ordered signed entities" $ do let indInfo = [ @@ -167,23 +168,23 @@ testExtractEntityIndices = IndividualInfo "Ind6" ["Pop3", "PopC"] "Pac2", IndividualInfo "Ind7" ["Pop4", "PopC"] "Pac2", IndividualInfo "Ind8" ["Pop4", "PopC"] "Pac2"] - conformingEntityIndices [Include (Pac "Pac1"), Exclude (Group "Pop2"), Include (Ind "Ind3")] indInfo `shouldBe` [0, 1, 2] - conformingEntityIndices [Include (Pac "Pac1")] indInfo `shouldBe` [0, 1, 2, 3] + map (\(i,_,_) -> i) (conformingEntityIndices [Include (Pac "Pac1"), Exclude (Group "Pop2"), Include (Ind (SimpleInd "Ind3"))] indInfo) `shouldBe` [0, 1, 2] + map (\(i,_,_) -> i) (conformingEntityIndices [Include (Pac "Pac1")] indInfo) `shouldBe` [0, 1, 2, 3] testJSON :: Spec testJSON = describe "Poseidon.EntitiesList.ToJSON" $ do it "should encode entities correctly to JSON" $ do - encode (Ind "Ind1") `shouldBe` "\"\"" + encode (Ind (SimpleInd "Ind1")) `shouldBe` "\"\"" encode (Group "Group1") `shouldBe` "\"Group1\"" encode (Pac "Pac1") `shouldBe` "\"*Pac1*\"" - encode (Exclude (Ind "Ind1")) `shouldBe` "\"-\"" + encode (Exclude (Ind (SimpleInd "Ind1"))) `shouldBe` "\"-\"" encode (Exclude (Group "Group1")) `shouldBe` "\"-Group1\"" encode (Exclude (Pac "Pac1")) `shouldBe` "\"-*Pac1*\"" it "should decode entities correctly from JSON" $ do - decode "\"\"" `shouldBe` Just (Ind "Ind1") + decode "\"\"" `shouldBe` Just (Ind (SimpleInd "Ind1")) decode "\"Group1\"" `shouldBe` Just (Group "Group1") decode "\"*Pac1*\"" `shouldBe` Just (Pac "Pac1") - decode "\"-\"" `shouldBe` Just (Exclude (Ind "Ind1")) + decode "\"-\"" `shouldBe` Just (Exclude (Ind (SimpleInd "Ind1"))) decode "\"-Group1\"" `shouldBe` Just (Exclude (Group "Group1")) decode "\"-*Pac1*\"" `shouldBe` Just (Exclude (Pac "Pac1")) From 10a058cca5139d73102d79e7e0394b90cdb738e9 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sun, 18 Dec 2022 15:20:47 +0100 Subject: [PATCH 11/26] fixed wrong sorting of output individuals --- src/Poseidon/CLI/Forge.hs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 0295f2ec9..69fd76a99 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -41,7 +41,8 @@ import Control.Exception (catch, throwIO) import Control.Monad (forM, forM_, unless, when) import Control.Monad.Reader (ask) import Data.Function ((&)) -import Data.List (groupBy, intercalate, nub, sortBy) +import Data.List (groupBy, intercalate, nub, sort, + sortBy) import Data.Maybe (mapMaybe) import Data.Time (getCurrentTime) import qualified Data.Vector as V @@ -127,10 +128,10 @@ runForge ( -- get all individuals from the relevant packages let allInds = getJointIndividualInfo $ relevantPackages - -- determine which individuals are potentially relevant + -- determine which individuals are potentially relevant and attribute each of them an index let relevantInds = conformingEntityIndices entities allInds - -- resolve duplicates that are already specified + -- resolve duplicates that are already specified in --foŕgeString with let equalNameIndividuals = relevantInds & sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & @@ -146,7 +147,7 @@ runForge ( liftIO $ throwIO $ PoseidonForgeEntitiesException "Unresolved duplicated individuals" -- reduce individual list to a list of relevant indices - let relevantIndices = map (\(i,_,_) -> i) $ concat equalNameIndividuals + let relevantIndices = sort $ map (\(i,_,_) -> i) $ concat equalNameIndividuals -- collect data -- -- janno From f91e6f8a3856dbce85765567f134493b22af1743 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Sun, 18 Dec 2022 15:41:58 +0100 Subject: [PATCH 12/26] removed some old comments --- src/Poseidon/EntitiesList.hs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index 1fc996132..ab39a1dc2 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -97,13 +97,11 @@ instance EntitySpec SignedEntity where IsInIndInfo -> Just ShouldBeIncluded IsInIndInfoSpecified -> Just ShouldBeIncludedWithHigherPriority IsNotInIndInfo -> Nothing - --if isIndInfo entity then Just True else Nothing shouldIncExc (Exclude entity) = case isIndInfo entity of IsInIndInfo -> Just ShouldNotBeIncluded IsInIndInfoSpecified -> Just ShouldNotBeIncluded IsNotInIndInfo -> Nothing - --if isIndInfo entity then Just False else Nothing isIndInfo :: PoseidonEntity -> SelectionLevel1 isIndInfo (Ind (SimpleInd n)) = if n == indName then IsInIndInfo else IsNotInIndInfo isIndInfo (Ind (SpecificInd i)) = if i == indInfo then IsInIndInfoSpecified else IsNotInIndInfo @@ -211,7 +209,7 @@ findNonExistentEntities entities individuals = in missingPacs ++ missingInds ++ missingGroups conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, IndividualInfo, SelectionLevel2)] -conformingEntityIndices entities xs = --filter (indInfoConformsToEntitySpec entities . snd) . zip [0..] xs +conformingEntityIndices entities xs = filter (\(_,_,level) -> meansIn level) $ map (\(index, x) -> (index, x, indInfoConformsToEntitySpec entities x)) (zip [0..] xs) onlyKeepSpecifics :: [(Int, IndividualInfo, SelectionLevel2)] -> [(Int, IndividualInfo, SelectionLevel2)] From 2fdca4af1a363206ab53a908c876b348125de690 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Wed, 11 Jan 2023 17:15:31 +0100 Subject: [PATCH 13/26] minor typo and naming fixes --- src/Poseidon/CLI/Forge.hs | 4 ++-- src/Poseidon/EntitiesList.hs | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 69fd76a99..6db5d30b9 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -114,9 +114,9 @@ runForge ( logInfo $ "Forging with the following entity-list: " ++ (intercalate ", " . map show . take 10) entities ++ if length entities > 10 then " and " ++ show (length entities - 10) ++ " more" else "" - -- check for entities that do not exist this this dataset + -- check for entities that do not exist in this dataset let nonExistentEntities = findNonExistentEntities entities . getJointIndividualInfo $ allPackages - unless (null nonExistentEntities) $ + unless (null nonExistentEntities) $ logWarning $ "Detected entities that do not exist in the dataset. They will be ignored: " ++ intercalate ", " (map show nonExistentEntities) diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index ab39a1dc2..807c5d674 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -93,20 +93,20 @@ instance EntitySpec SignedEntity where where shouldIncExc :: SignedEntity -> Maybe SelectionLevel2 shouldIncExc (Include entity) = - case isIndInfo entity of + case isInIndInfo entity of IsInIndInfo -> Just ShouldBeIncluded IsInIndInfoSpecified -> Just ShouldBeIncludedWithHigherPriority IsNotInIndInfo -> Nothing shouldIncExc (Exclude entity) = - case isIndInfo entity of + case isInIndInfo entity of IsInIndInfo -> Just ShouldNotBeIncluded IsInIndInfoSpecified -> Just ShouldNotBeIncluded IsNotInIndInfo -> Nothing - isIndInfo :: PoseidonEntity -> SelectionLevel1 - isIndInfo (Ind (SimpleInd n)) = if n == indName then IsInIndInfo else IsNotInIndInfo - isIndInfo (Ind (SpecificInd i)) = if i == indInfo then IsInIndInfoSpecified else IsNotInIndInfo - isIndInfo (Group n) = if n `elem` groupNames then IsInIndInfo else IsNotInIndInfo - isIndInfo (Pac n) = if n == pacName then IsInIndInfo else IsNotInIndInfo + isInIndInfo :: PoseidonEntity -> SelectionLevel1 + isInIndInfo (Ind (SimpleInd n)) = if n == indName then IsInIndInfo else IsNotInIndInfo + isInIndInfo (Ind (SpecificInd i)) = if i == indInfo then IsInIndInfoSpecified else IsNotInIndInfo + isInIndInfo (Group n) = if n `elem` groupNames then IsInIndInfo else IsNotInIndInfo + isInIndInfo (Pac n) = if n == pacName then IsInIndInfo else IsNotInIndInfo underlyingEntity = removeEntitySign entitySpecParser = parseSign <*> entitySpecParser where From ab81f5d786275f02144846ec329b60361cc0fba7 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 13:10:02 +0100 Subject: [PATCH 14/26] stylish haskell --- src/Poseidon/CLI/Forge.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 6db5d30b9..67ac3e10c 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -116,7 +116,7 @@ runForge ( -- check for entities that do not exist in this dataset let nonExistentEntities = findNonExistentEntities entities . getJointIndividualInfo $ allPackages - unless (null nonExistentEntities) $ + unless (null nonExistentEntities) $ logWarning $ "Detected entities that do not exist in the dataset. They will be ignored: " ++ intercalate ", " (map show nonExistentEntities) From 94e0320c76bc8c5400f9daa2d5d4ff02f581c39c Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 16:00:27 +0100 Subject: [PATCH 15/26] added tests for the change in the entity definition - this uncovered some issues in the code I also decided to address in this commit --- src/Poseidon/CLI/Forge.hs | 14 +- src/Poseidon/EntitiesList.hs | 40 +++-- test/Poseidon/EntitiesListSpec.hs | 159 ++++++++++++------ .../testDat/testEntityFiles/goodEntities1.txt | 1 + .../testDat/testEntityFiles/goodEntities2.txt | 2 +- .../testDat/testEntityFiles/goodEntities4.txt | 2 +- 6 files changed, 142 insertions(+), 76 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 67ac3e10c..1f25c1289 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -10,8 +10,8 @@ import Poseidon.EntitiesList (EntityInput, PoseidonEntity (..), conformingEntityIndices, filterRelevantPackages, findNonExistentEntities, - onlyKeepSpecifics, - readEntityInputs) + readEntityInputs, + resolveIndividualNameDuplicates) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), GenotypeFormatSpec (..), @@ -40,9 +40,7 @@ import Poseidon.Utils (PoseidonException (..), import Control.Exception (catch, throwIO) import Control.Monad (forM, forM_, unless, when) import Control.Monad.Reader (ask) -import Data.Function ((&)) -import Data.List (groupBy, intercalate, nub, sort, - sortBy) +import Data.List (intercalate, nub, sort) import Data.Maybe (mapMaybe) import Data.Time (getCurrentTime) import qualified Data.Vector as V @@ -132,11 +130,7 @@ runForge ( let relevantInds = conformingEntityIndices entities allInds -- resolve duplicates that are already specified in --foŕgeString with - let equalNameIndividuals = - relevantInds & - sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & - groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) & - map onlyKeepSpecifics + let equalNameIndividuals = resolveIndividualNameDuplicates relevantInds -- check if there still are duplicates and if yes, then stop let duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividuals diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index 807c5d674..5bb01dfe6 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -4,7 +4,7 @@ module Poseidon.EntitiesList ( indInfoConformsToEntitySpec, underlyingEntity, entitySpecParser, readEntitiesFromFile, readEntitiesFromString, findNonExistentEntities, indInfoFindRelevantPackageNames, filterRelevantPackages, - conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..), onlyKeepSpecifics) where + conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..), resolveIndividualNameDuplicates) where import Poseidon.Package (PoseidonPackage (..), getJointIndividualInfo) @@ -19,7 +19,8 @@ import Data.Aeson (FromJSON (..), ToJSON (..), Value (..), withText) import Data.Aeson.Types (Parser) import Data.Char (isSpace) -import Data.List (nub, (\\)) +import Data.Function ((&)) +import Data.List (groupBy, nub, sortBy, (\\)) import Data.Maybe (mapMaybe) import Data.Text (Text, pack, unpack) import qualified Text.Parsec as P @@ -200,24 +201,33 @@ findNonExistentEntities entities individuals = let titlesPac = nub . map indInfoPacName $ individuals indNamesPac = map indInfoName individuals groupNamesPac = nub . concatMap indInfoGroups $ individuals - titlesRequestedPacs = nub [ pac | Pac pac <- map underlyingEntity entities] - groupNamesStats = nub [ group | Group group <- map underlyingEntity entities] - indNamesStats = nub [ ind | Ind ind <- map underlyingEntity entities] - missingPacs = map Pac $ titlesRequestedPacs \\ titlesPac - missingInds = map Ind $ filter (\x -> getIndName x `notElem` indNamesPac) indNamesStats - missingGroups = map Group $ groupNamesStats \\ groupNamesPac - in missingPacs ++ missingInds ++ missingGroups + titlesRequestedPacs = nub [ pac | Pac pac <- map underlyingEntity entities] + groupNamesStats = nub [ group | Group group <- map underlyingEntity entities] + simpleIndNamesStats = nub [ ind | Ind (SimpleInd ind) <- map underlyingEntity entities] + specificIndsStats = nub [ ind | Ind (SpecificInd ind) <- map underlyingEntity entities] + missingPacs = map Pac $ titlesRequestedPacs \\ titlesPac + missingGroups = map Group $ groupNamesStats \\ groupNamesPac + missingSimpleInds = map (Ind . SimpleInd) $ simpleIndNamesStats \\ indNamesPac + missingSpecificInds = map (Ind . SpecificInd) $ specificIndsStats \\ individuals + in missingPacs ++ missingGroups ++ missingSimpleInds ++ missingSpecificInds conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, IndividualInfo, SelectionLevel2)] conformingEntityIndices entities xs = filter (\(_,_,level) -> meansIn level) $ map (\(index, x) -> (index, x, indInfoConformsToEntitySpec entities x)) (zip [0..] xs) -onlyKeepSpecifics :: [(Int, IndividualInfo, SelectionLevel2)] -> [(Int, IndividualInfo, SelectionLevel2)] -onlyKeepSpecifics xs = - let highPrio = [ x | x@(_,_,ShouldBeIncludedWithHigherPriority) <- xs] - in if length xs > 1 && length highPrio == 1 - then highPrio - else xs +resolveIndividualNameDuplicates :: [(Int, IndividualInfo, SelectionLevel2)] -> [[(Int, IndividualInfo, SelectionLevel2)]] +resolveIndividualNameDuplicates entityIndices = + entityIndices & + sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & + groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) & + map onlyKeepSpecifics + where + onlyKeepSpecifics :: [(Int, IndividualInfo, SelectionLevel2)] -> [(Int, IndividualInfo, SelectionLevel2)] + onlyKeepSpecifics xs = + let highPrio = [ x | x@(_,_,ShouldBeIncludedWithHigherPriority) <- xs] + in if length xs > 1 && length highPrio == 1 + then highPrio + else xs readEntityInputs :: (MonadIO m, EntitySpec a) => [EntityInput a] -> m [a] -- An empty list means that entities are wanted. readEntityInputs entityInputs = diff --git a/test/Poseidon/EntitiesListSpec.hs b/test/Poseidon/EntitiesListSpec.hs index e09082947..d6049b8b2 100644 --- a/test/Poseidon/EntitiesListSpec.hs +++ b/test/Poseidon/EntitiesListSpec.hs @@ -22,6 +22,7 @@ spec = do testFindNonExistentEntities testFilterPackages testExtractEntityIndices + testResolveIndividualNameDuplicates testJSON @@ -34,27 +35,37 @@ testReadPoseidonEntitiesString = fromRight [] (readEntitiesFromString "b") `shouldBe` [Include $ Group "b"] fromRight [] (readEntitiesFromString "*c*") `shouldBe` [Include $ Pac "c"] it "should parse longer entity lists correctly" $ do - fromRight [] (readEntitiesFromString ",b,*c*") `shouldBe` - map Include [Ind (SimpleInd "a"), Group "b", Pac "c"] - fromRight [] (readEntitiesFromString ",b1,,*c*,b2") `shouldBe` - map Include [Ind (SimpleInd "a1"), Group "b1", Ind (SimpleInd "a2"), Pac "c", Group "b2"] + fromRight [] (readEntitiesFromString ",,b,*c*") `shouldBe` + map Include [Ind (SimpleInd "a"), Ind (SpecificInd $ IndividualInfo "a" ["b"] "c"), Group "b", Pac "c"] + fromRight [] (readEntitiesFromString ",b1,,*c*,b2,") `shouldBe` + map Include [ + Ind (SimpleInd "a1") + , Group "b1" + , Ind (SimpleInd "a2") + , Pac "c" + , Group "b2" + , Ind (SpecificInd $ IndividualInfo "a3" ["b2"] "c") + ] it "should parse unsigned entity lists correctly" $ do - fromRight [] (readEntitiesFromString ",b,*c*") `shouldBe` - [Ind (SimpleInd "a"), Group "b", Pac "c"] - fromRight [] (readEntitiesFromString ",b1,,*c*,b2") `shouldBe` - [Ind (SimpleInd "a1"), Group "b1", Ind (SimpleInd "a2"), Pac "c", Group "b2"] + fromRight [] (readEntitiesFromString ",,b,*c*") `shouldBe` + [Ind (SimpleInd "a"), Ind (SpecificInd $ IndividualInfo "a" ["b"] "c"), Group "b", Pac "c"] + fromRight [] (readEntitiesFromString ",b1,,*c*,b2,") `shouldBe` + [Ind (SimpleInd "a1"), Group "b1", Ind (SimpleInd "a2"), Pac "c", Group "b2", Ind (SpecificInd $ IndividualInfo "a3" ["b2"] "c")] it "should ignore spaces after commas" $ do - fromRight [] (readEntitiesFromString ", b, *c*") `shouldBe` - map Include [Ind (SimpleInd "a"), Group "b", Pac "c"] + fromRight [] (readEntitiesFromString ", , b, *c*") `shouldBe` + map Include [Ind (SimpleInd "a"), Ind (SpecificInd $ IndividualInfo "a" ["b"] "c"), Group "b", Pac "c"] fromRight [] (readEntitiesFromString "*c*, b") `shouldBe` map Include [Pac "c", Group "b"] it "should parse exclusion entities correctly" $ do fromRight [] (readEntitiesFromString "-") `shouldBe` [Exclude $ Ind (SimpleInd "a")] - fromRight [] (readEntitiesFromString "-, , -b1,b2,-*c1*, *c2*") `shouldBe` - [Exclude $ Ind (SimpleInd "a1"), Include $ Ind (SimpleInd "a2"), - Exclude $ Group "b1", Include $ Group "b2", - Exclude $ Pac "c1", Include $ Pac "c2"] - it "should fail with any other spaces" $ do + fromRight [] (readEntitiesFromString "-, -, , -b1,b2,-*c1*, *c2*") `shouldBe` + [ Exclude $ Ind (SimpleInd "a1") + , Exclude $ Ind (SpecificInd $ IndividualInfo "a" ["b"] "c") + , Include $ Ind (SimpleInd "a2") + , Exclude $ Group "b1", Include $ Group "b2" + , Exclude $ Pac "c1", Include $ Pac "c2" + ] + it "should fail with any other setting" $ do -- the following type annotations - annoyingly - are required because readEntitiesFromString is polymorphic, -- and even though it all returns Left, the compiler complains about ambiguous types. (readEntitiesFromString " ,b,*c*" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft @@ -68,6 +79,10 @@ testReadPoseidonEntitiesString = (readEntitiesFromString "-,b,*c*c*" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft (readEntitiesFromString ",b,*c*-" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft (readEntitiesFromString "-a>,b,*c*" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft + (readEntitiesFromString "" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft + (readEntitiesFromString "" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft + (readEntitiesFromString "" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft + (readEntitiesFromString "" :: Either PoseidonException EntitiesList) `shouldSatisfy` isLeft testReadEntitiesFromFile :: Spec testReadEntitiesFromFile = @@ -80,11 +95,19 @@ testReadEntitiesFromFile = it "should parse good, single-value-per-line files correctly" $ do g1res <- readEntitiesFromFile g1 g1res `shouldBe` - map Include [Ind (SimpleInd "a"), Group "b", Pac "c"] + map Include [Ind (SimpleInd "a"), Ind (SpecificInd $ IndividualInfo "a" ["b"] "c"), Group "b", Pac "c"] it "should parse good, multi-value-per-line files correctly" $ do g2res <- readEntitiesFromFile g2 g2res `shouldBe` - map Include [Ind (SimpleInd "a1"), Ind (SimpleInd "a2"), Group "b1", Pac "c1", Pac "c2", Group "b2", Group "b3"] + map Include [ + Ind (SimpleInd "a1") + , Ind (SpecificInd $ IndividualInfo "a3" ["b2"] "c") + , Ind (SimpleInd "a2") + , Group "b1", Pac "c1" + , Pac "c2" + , Group "b2" + , Group "b3" + ] it "should handle empty lines and #-comments correctly" $ do g3res <- readEntitiesFromFile g3 g3res `shouldBe` @@ -92,9 +115,12 @@ testReadEntitiesFromFile = it "should handle exclusion correctly" $ do g4res <- readEntitiesFromFile g4 g4res `shouldBe` - [Include $ Ind (SimpleInd "a1"), Exclude $ Ind (SimpleInd "a2"), - Exclude $ Group "b1", Include $ Group "b1", - Exclude $ Pac "c2"] + [ Include $ Ind (SimpleInd "a1"), + Exclude $ Ind (SpecificInd $ IndividualInfo "a3" ["b2"] "c") + , Exclude $ Ind (SimpleInd "a2") + , Exclude $ Group "b1", Include $ Group "b1" + , Exclude $ Pac "c2" + ] it "should fail to parse bad files and throw an exception" $ do (readEntitiesFromFile b1 :: IO EntitiesList) `shouldThrow` anyException -- wrong space @@ -111,16 +137,20 @@ testBaseDir = ["test/testDat/testPackages/ancient"] goodEntities :: EntitiesList goodEntities = [ - Pac "Schiffels_2016", - Group "POP1", - Ind (SimpleInd "SAMPLE3") + Pac "Schiffels_2016" + , Group "POP1" + , Ind (SimpleInd "SAMPLE3") + , Ind (SpecificInd $ IndividualInfo "XXX001" ["POP1"] "Schiffels_2016") + , Ind (SpecificInd $ IndividualInfo "XXX012" ["POP2"] "Lamnidis_2018") ] badEntities :: EntitiesList badEntities = [ - Pac "Schiffels_2015", - Group "foo", - Ind (SimpleInd "bar") + Pac "Schiffels_2015" + , Group "foo" + , Ind (SimpleInd "bar") + , Ind (SpecificInd $ IndividualInfo "XXX002" ["POP1"] "Schiffels_2016") + , Ind (SpecificInd $ IndividualInfo "XXX001" ["POP2"] "Schiffels_2016") ] testFindNonExistentEntities :: Spec @@ -153,38 +183,69 @@ testExtractEntityIndices = it "should select all relevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir let indInts = map (\(i,_,_) -> i) $ conformingEntityIndices goodEntities (getJointIndividualInfo ps) - indInts `shouldMatchList` [0, 2, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23] + indInts `shouldMatchList` [0, 1, 2, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23] it "should drop all irrelevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir let indInts = map (\(i,_,_) -> i) $ conformingEntityIndices badEntities (getJointIndividualInfo ps) indInts `shouldBe` [] it "should correctly extract indices with ordered signed entities" $ do let indInfo = [ - IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac1", - IndividualInfo "Ind2" ["Pop1", "PopB"] "Pac1", - IndividualInfo "Ind3" ["Pop2", "PopB"] "Pac1", - IndividualInfo "Ind4" ["Pop2", "PopB"] "Pac1", - IndividualInfo "Ind5" ["Pop3", "PopC"] "Pac2", - IndividualInfo "Ind6" ["Pop3", "PopC"] "Pac2", - IndividualInfo "Ind7" ["Pop4", "PopC"] "Pac2", - IndividualInfo "Ind8" ["Pop4", "PopC"] "Pac2"] - map (\(i,_,_) -> i) (conformingEntityIndices [Include (Pac "Pac1"), Exclude (Group "Pop2"), Include (Ind (SimpleInd "Ind3"))] indInfo) `shouldBe` [0, 1, 2] - map (\(i,_,_) -> i) (conformingEntityIndices [Include (Pac "Pac1")] indInfo) `shouldBe` [0, 1, 2, 3] + IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac1" + , IndividualInfo "Ind2" ["Pop1", "PopB"] "Pac1" + , IndividualInfo "Ind3" ["Pop2", "PopB"] "Pac1" + , IndividualInfo "Ind4" ["Pop2", "PopB"] "Pac1" + , IndividualInfo "Ind5" ["Pop3", "PopC"] "Pac2" + , IndividualInfo "Ind6" ["Pop3", "PopC"] "Pac2" + , IndividualInfo "Ind7" ["Pop4", "PopC"] "Pac2" + , IndividualInfo "Ind8" ["Pop4", "PopC"] "Pac2" + ] + map (\(i,_,_) -> i) (conformingEntityIndices [ + Include (Pac "Pac1") + ] indInfo) `shouldBe` [0, 1, 2, 3] + map (\(i,_,_) -> i) (conformingEntityIndices [ + Include (Pac "Pac1") + , Exclude (Group "Pop2") + , Include (Ind (SimpleInd "Ind3")) + , Include (Ind (SpecificInd $ IndividualInfo "Ind8" ["Pop4"] "Pac2")) + ] indInfo) `shouldBe` [0, 1, 2, 7] + +testResolveIndividualNameDuplicates :: Spec +testResolveIndividualNameDuplicates = + describe "Poseidon.EntitiesList.resolveIndividualNameDuplicates" $ do + it "should correctly extract indices in case of duplicates" $ do + let indInfoDuplicates = [ + IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac1" + , IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac2" + , IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac3" + , IndividualInfo "Ind2" ["Pop2", "PopB"] "Pac1" + , IndividualInfo "Ind2" ["Pop2", "PopB"] "Pac2" + , IndividualInfo "Ind2" ["Pop2", "PopB"] "Pac3" + ] + map (\(i,_,_) -> i) (concat (resolveIndividualNameDuplicates (conformingEntityIndices [ + Include (Ind (SimpleInd "Ind1")) + , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) + , Include (Ind (SimpleInd "Pop2")) + , Include (Ind (SpecificInd $ IndividualInfo "Ind2" ["Pop2"] "Pac3")) + ] indInfoDuplicates))) `shouldBe` [1,5] testJSON :: Spec testJSON = describe "Poseidon.EntitiesList.ToJSON" $ do it "should encode entities correctly to JSON" $ do - encode (Ind (SimpleInd "Ind1")) `shouldBe` "\"\"" - encode (Group "Group1") `shouldBe` "\"Group1\"" - encode (Pac "Pac1") `shouldBe` "\"*Pac1*\"" - encode (Exclude (Ind (SimpleInd "Ind1"))) `shouldBe` "\"-\"" - encode (Exclude (Group "Group1")) `shouldBe` "\"-Group1\"" - encode (Exclude (Pac "Pac1")) `shouldBe` "\"-*Pac1*\"" + encode (Ind (SimpleInd "Ind1")) `shouldBe` "\"\"" + encode (Ind (SpecificInd $ IndividualInfo "a" ["b"] "c")) `shouldBe` "\"\"" + encode (Group "Group1") `shouldBe` "\"Group1\"" + encode (Pac "Pac1") `shouldBe` "\"*Pac1*\"" + encode (Exclude (Ind (SimpleInd "Ind1"))) `shouldBe` "\"-\"" + encode (Exclude (Ind (SpecificInd $ IndividualInfo "a" ["b"] "c"))) `shouldBe` "\"-\"" + encode (Exclude (Group "Group1")) `shouldBe` "\"-Group1\"" + encode (Exclude (Pac "Pac1")) `shouldBe` "\"-*Pac1*\"" it "should decode entities correctly from JSON" $ do - decode "\"\"" `shouldBe` Just (Ind (SimpleInd "Ind1")) - decode "\"Group1\"" `shouldBe` Just (Group "Group1") - decode "\"*Pac1*\"" `shouldBe` Just (Pac "Pac1") - decode "\"-\"" `shouldBe` Just (Exclude (Ind (SimpleInd "Ind1"))) - decode "\"-Group1\"" `shouldBe` Just (Exclude (Group "Group1")) - decode "\"-*Pac1*\"" `shouldBe` Just (Exclude (Pac "Pac1")) + decode "\"\"" `shouldBe` Just (Ind (SimpleInd "Ind1")) + decode "\"\"" `shouldBe` Just (Ind (SpecificInd $ IndividualInfo "a" ["b"] "c")) + decode "\"Group1\"" `shouldBe` Just (Group "Group1") + decode "\"*Pac1*\"" `shouldBe` Just (Pac "Pac1") + decode "\"-\"" `shouldBe` Just (Exclude (Ind (SimpleInd "Ind1"))) + decode "\"-\"" `shouldBe` Just (Exclude (Ind (SpecificInd $ IndividualInfo "a" ["b"] "c"))) + decode "\"-Group1\"" `shouldBe` Just (Exclude (Group "Group1")) + decode "\"-*Pac1*\"" `shouldBe` Just (Exclude (Pac "Pac1")) diff --git a/test/testDat/testEntityFiles/goodEntities1.txt b/test/testDat/testEntityFiles/goodEntities1.txt index 990d67eee..68bc136ab 100644 --- a/test/testDat/testEntityFiles/goodEntities1.txt +++ b/test/testDat/testEntityFiles/goodEntities1.txt @@ -1,3 +1,4 @@ + b *c* \ No newline at end of file diff --git a/test/testDat/testEntityFiles/goodEntities2.txt b/test/testDat/testEntityFiles/goodEntities2.txt index 2bf0ec524..bd0a37df8 100644 --- a/test/testDat/testEntityFiles/goodEntities2.txt +++ b/test/testDat/testEntityFiles/goodEntities2.txt @@ -1,3 +1,3 @@ -, +,, b1, *c1* *c2*, b2,b3 \ No newline at end of file diff --git a/test/testDat/testEntityFiles/goodEntities4.txt b/test/testDat/testEntityFiles/goodEntities4.txt index 697cf6d42..7d1fd5ffe 100644 --- a/test/testDat/testEntityFiles/goodEntities4.txt +++ b/test/testDat/testEntityFiles/goodEntities4.txt @@ -1,4 +1,4 @@ -,- +,-,- -b1 b1 #-*c1* From ad366da98ea7a3d0c57748c1185c43a02813cd77 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 17:17:16 +0100 Subject: [PATCH 16/26] refactoring to provide a more intuitive interface when resolving entities to lists of individuals --- src/Poseidon/CLI/Forge.hs | 20 ++++------- src/Poseidon/EntitiesList.hs | 41 +++++++++++++--------- test/Poseidon/EntitiesListSpec.hs | 57 ++++++++++++++++++++----------- 3 files changed, 68 insertions(+), 50 deletions(-) diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 1f25c1289..c30f0a8f9 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -7,11 +7,10 @@ import Poseidon.BibFile (BibEntry (..), BibTeX, import Poseidon.EntitiesList (EntityInput, PoseidonEntity (..), PoseidonIndividual (..), SignedEntity (..), - conformingEntityIndices, filterRelevantPackages, findNonExistentEntities, readEntityInputs, - resolveIndividualNameDuplicates) + resolveEntityIndices) import Poseidon.GenotypeData (GenoDataSource (..), GenotypeDataSpec (..), GenotypeFormatSpec (..), @@ -40,7 +39,7 @@ import Poseidon.Utils (PoseidonException (..), import Control.Exception (catch, throwIO) import Control.Monad (forM, forM_, unless, when) import Control.Monad.Reader (ask) -import Data.List (intercalate, nub, sort) +import Data.List (intercalate, nub) import Data.Maybe (mapMaybe) import Data.Time (getCurrentTime) import qualified Data.Vector as V @@ -126,23 +125,16 @@ runForge ( -- get all individuals from the relevant packages let allInds = getJointIndividualInfo $ relevantPackages - -- determine which individuals are potentially relevant and attribute each of them an index - let relevantInds = conformingEntityIndices entities allInds - - -- resolve duplicates that are already specified in --foŕgeString with - let equalNameIndividuals = resolveIndividualNameDuplicates relevantInds + -- determine indizes of relevant individuals and resolve duplicates + let (unresolvedDuplicatedInds, relevantIndices) = resolveEntityIndices entities allInds -- check if there still are duplicates and if yes, then stop - let duplicatedInds = concat $ filter (\x -> length x > 1) equalNameIndividuals - unless (null duplicatedInds) $ do + unless (null unresolvedDuplicatedInds) $ do logError "There are duplicated individuals, but forge does not allow that" logError "Please specify in your --forgeString or --forgeFile:" - mapM_ (\(_,i@(IndividualInfo n _ _),_) -> logError $ show (SimpleInd n) ++ " -> " ++ show (SpecificInd i)) duplicatedInds + mapM_ (\(_,i@(IndividualInfo n _ _),_) -> logError $ show (SimpleInd n) ++ " -> " ++ show (SpecificInd i)) $ concat unresolvedDuplicatedInds liftIO $ throwIO $ PoseidonForgeEntitiesException "Unresolved duplicated individuals" - -- reduce individual list to a list of relevant indices - let relevantIndices = sort $ map (\(i,_,_) -> i) $ concat equalNameIndividuals - -- collect data -- -- janno let jannoRows = getJointJanno relevantPackages diff --git a/src/Poseidon/EntitiesList.hs b/src/Poseidon/EntitiesList.hs index 5bb01dfe6..ed650b67f 100644 --- a/src/Poseidon/EntitiesList.hs +++ b/src/Poseidon/EntitiesList.hs @@ -4,7 +4,8 @@ module Poseidon.EntitiesList ( indInfoConformsToEntitySpec, underlyingEntity, entitySpecParser, readEntitiesFromFile, readEntitiesFromString, findNonExistentEntities, indInfoFindRelevantPackageNames, filterRelevantPackages, - conformingEntityIndices, entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..), resolveIndividualNameDuplicates) where + entitiesListP, EntityInput(..), readEntityInputs, getIndName, PoseidonIndividual (..), + resolveEntityIndices, SelectionLevel2 (..)) where import Poseidon.Package (PoseidonPackage (..), getJointIndividualInfo) @@ -20,7 +21,7 @@ import Data.Aeson (FromJSON (..), ToJSON (..), import Data.Aeson.Types (Parser) import Data.Char (isSpace) import Data.Function ((&)) -import Data.List (groupBy, nub, sortBy, (\\)) +import Data.List (groupBy, nub, sort, sortBy, (\\)) import Data.Maybe (mapMaybe) import Data.Text (Text, pack, unpack) import qualified Text.Parsec as P @@ -73,7 +74,7 @@ data SelectionLevel2 = ShouldBeIncluded | ShouldBeIncludedWithHigherPriority | ShouldNotBeIncluded - deriving Show + deriving (Show, Eq) meansIn :: SelectionLevel2 -> Bool meansIn ShouldBeIncluded = True @@ -211,23 +212,31 @@ findNonExistentEntities entities individuals = missingSpecificInds = map (Ind . SpecificInd) $ specificIndsStats \\ individuals in missingPacs ++ missingGroups ++ missingSimpleInds ++ missingSpecificInds -conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, IndividualInfo, SelectionLevel2)] -conformingEntityIndices entities xs = - filter (\(_,_,level) -> meansIn level) $ map (\(index, x) -> (index, x, indInfoConformsToEntitySpec entities x)) (zip [0..] xs) - -resolveIndividualNameDuplicates :: [(Int, IndividualInfo, SelectionLevel2)] -> [[(Int, IndividualInfo, SelectionLevel2)]] -resolveIndividualNameDuplicates entityIndices = - entityIndices & - sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & - groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) & - map onlyKeepSpecifics +-- | Result: fst is a list of unresolved duplicates, snd a simple list of integers for the simple single individuals +resolveEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> ([[(Int, IndividualInfo, SelectionLevel2)]], [Int]) +resolveEntityIndices entities xs = + let allFittingIndizes = conformingEntityIndices entities xs + groupsOfEqualNameIndividuals = resolveDuplicatesIfPossible $ groupByIndividualName allFittingIndizes + unresolvedDuplicates = filter (\x -> length x > 1) groupsOfEqualNameIndividuals + simpleSingles = sort $ map (\(i,_,_) -> i) $ concat $ filter (\x -> length x == 1) groupsOfEqualNameIndividuals + in (unresolvedDuplicates, simpleSingles) where + conformingEntityIndices :: (EntitySpec a) => [a] -> [IndividualInfo] -> [(Int, IndividualInfo, SelectionLevel2)] + conformingEntityIndices ents inds = + filter (\(_,_,level) -> meansIn level) $ map (\(index, x) -> (index, x, indInfoConformsToEntitySpec ents x)) (zip [0..] inds) + groupByIndividualName :: [(Int, IndividualInfo, SelectionLevel2)] -> [[(Int, IndividualInfo, SelectionLevel2)]] + groupByIndividualName entityIndices = + entityIndices & + sortBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> compare a b) & + groupBy (\(_,IndividualInfo a _ _,_) (_,IndividualInfo b _ _,_) -> a == b) + resolveDuplicatesIfPossible :: [[(Int, IndividualInfo, SelectionLevel2)]] -> [[(Int, IndividualInfo, SelectionLevel2)]] + resolveDuplicatesIfPossible = map onlyKeepSpecifics onlyKeepSpecifics :: [(Int, IndividualInfo, SelectionLevel2)] -> [(Int, IndividualInfo, SelectionLevel2)] - onlyKeepSpecifics xs = - let highPrio = [ x | x@(_,_,ShouldBeIncludedWithHigherPriority) <- xs] + onlyKeepSpecifics groupOfInds = + let highPrio = [ x | x@(_,_,ShouldBeIncludedWithHigherPriority) <- groupOfInds] in if length xs > 1 && length highPrio == 1 then highPrio - else xs + else groupOfInds readEntityInputs :: (MonadIO m, EntitySpec a) => [EntityInput a] -> m [a] -- An empty list means that entities are wanted. readEntityInputs entityInputs = diff --git a/test/Poseidon/EntitiesListSpec.hs b/test/Poseidon/EntitiesListSpec.hs index d6049b8b2..c3da27ed6 100644 --- a/test/Poseidon/EntitiesListSpec.hs +++ b/test/Poseidon/EntitiesListSpec.hs @@ -21,8 +21,7 @@ spec = do testReadEntitiesFromFile testFindNonExistentEntities testFilterPackages - testExtractEntityIndices - testResolveIndividualNameDuplicates + testResolveEntityIndices testJSON @@ -177,17 +176,17 @@ testFilterPackages = let pacs = filterRelevantPackages badEntities ps pacs `shouldBe` [] -testExtractEntityIndices :: Spec -testExtractEntityIndices = - describe "Poseidon.EntitiesList.extractEntityIndices" $ do +testResolveEntityIndices :: Spec +testResolveEntityIndices = + describe "Poseidon.EntitiesList.resolveEntityIndices" $ do it "should select all relevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir - let indInts = map (\(i,_,_) -> i) $ conformingEntityIndices goodEntities (getJointIndividualInfo ps) - indInts `shouldMatchList` [0, 1, 2, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23] + let indInts = resolveEntityIndices goodEntities (getJointIndividualInfo ps) + indInts `shouldBe` ([], [0, 1, 2, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23]) it "should drop all irrelevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir - let indInts = map (\(i,_,_) -> i) $ conformingEntityIndices badEntities (getJointIndividualInfo ps) - indInts `shouldBe` [] + let indInts = resolveEntityIndices badEntities (getJointIndividualInfo ps) + indInts `shouldBe` ([], []) it "should correctly extract indices with ordered signed entities" $ do let indInfo = [ IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac1" @@ -199,19 +198,15 @@ testExtractEntityIndices = , IndividualInfo "Ind7" ["Pop4", "PopC"] "Pac2" , IndividualInfo "Ind8" ["Pop4", "PopC"] "Pac2" ] - map (\(i,_,_) -> i) (conformingEntityIndices [ + resolveEntityIndices [ Include (Pac "Pac1") - ] indInfo) `shouldBe` [0, 1, 2, 3] - map (\(i,_,_) -> i) (conformingEntityIndices [ + ] indInfo `shouldBe` ([], [0, 1, 2, 3]) + resolveEntityIndices [ Include (Pac "Pac1") , Exclude (Group "Pop2") , Include (Ind (SimpleInd "Ind3")) , Include (Ind (SpecificInd $ IndividualInfo "Ind8" ["Pop4"] "Pac2")) - ] indInfo) `shouldBe` [0, 1, 2, 7] - -testResolveIndividualNameDuplicates :: Spec -testResolveIndividualNameDuplicates = - describe "Poseidon.EntitiesList.resolveIndividualNameDuplicates" $ do + ] indInfo `shouldBe` ([], [0, 1, 2, 7]) it "should correctly extract indices in case of duplicates" $ do let indInfoDuplicates = [ IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac1" @@ -221,12 +216,34 @@ testResolveIndividualNameDuplicates = , IndividualInfo "Ind2" ["Pop2", "PopB"] "Pac2" , IndividualInfo "Ind2" ["Pop2", "PopB"] "Pac3" ] - map (\(i,_,_) -> i) (concat (resolveIndividualNameDuplicates (conformingEntityIndices [ + resolveEntityIndices [ + Include (Ind (SimpleInd "Ind1")) + , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) + ] indInfoDuplicates `shouldBe` ( + [], + [1] + ) + resolveEntityIndices [ Include (Ind (SimpleInd "Ind1")) , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) - , Include (Ind (SimpleInd "Pop2")) + , Include (Ind (SimpleInd "Ind2")) , Include (Ind (SpecificInd $ IndividualInfo "Ind2" ["Pop2"] "Pac3")) - ] indInfoDuplicates))) `shouldBe` [1,5] + ] indInfoDuplicates `shouldBe` ( + [], + [1,5] + ) + resolveEntityIndices [ + Include (Ind (SimpleInd "Ind1")) + , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) + , Include (Ind (SimpleInd "Ind2")) + ] indInfoDuplicates `shouldBe` ( + [[ + (3, IndividualInfo {indInfoName = "Ind2", indInfoGroups = ["Pop2","PopB"], indInfoPacName = "Pac1"}, ShouldBeIncluded) + , (4, IndividualInfo {indInfoName = "Ind2", indInfoGroups = ["Pop2","PopB"], indInfoPacName = "Pac2"}, ShouldBeIncluded) + , (5, IndividualInfo {indInfoName = "Ind2", indInfoGroups = ["Pop2","PopB"], indInfoPacName = "Pac3"}, ShouldBeIncluded) + ]], + [1] + ) testJSON :: Spec testJSON = From 4a727236a9a89382dd46dc7e0cdefb2d29a596e9 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 18:47:40 +0100 Subject: [PATCH 17/26] added a golden test to check the new duplicate handling in forge -- this required minor changes in the way duplicates are handled all around the library: validate needed a new command line argument to ignore duplicates --- src-executables/Main-trident.hs | 1 + .../CLI/OptparseApplicativeParsers.hs | 7 ++++ src/Poseidon/CLI/Validate.hs | 14 ++++---- test/Poseidon/EntitiesListSpec.hs | 6 ++-- test/Poseidon/GoldenTestsRunCommands.hs | 36 +++++++++++++++++++ test/Poseidon/PackageSpec.hs | 7 ++-- .../poseidonHSGoldenTestCheckSumFile.txt | 10 +++--- .../ForgePac7/POSEIDON.yml | 2 +- .../ForgePac9/ForgePac9.geno | 9 +++++ .../ForgePac9/ForgePac9.ind | 4 +++ .../ForgePac9/ForgePac9.janno | 5 +++ .../ForgePac9/ForgePac9.snp | 9 +++++ .../ForgePac9/POSEIDON.yml | 16 +++++++++ .../Schmid/POSEIDON.yml | 8 +++++ .../poseidonHSGoldenTestData/Schmid/geno.txt | 9 +++++ .../poseidonHSGoldenTestData/Schmid/ind.txt | 2 ++ .../poseidonHSGoldenTestData/Schmid/snp.txt | 9 +++++ .../poseidonHSGoldenTestData/summarise1 | 20 +++++------ .../poseidonHSGoldenTestData/summarise2 | 20 +++++------ test/testDat/poseidonHSGoldenTestData/survey1 | 1 + test/testDat/poseidonHSGoldenTestData/survey2 | 1 + .../ancient/Schmid_2028/POSEIDON.yml | 16 +++++++++ .../ancient/Schmid_2028/Schmid_2028.janno | 3 ++ .../testPackages/ancient/Schmid_2028/geno.txt | 9 +++++ .../testPackages/ancient/Schmid_2028/ind.txt | 2 ++ .../testPackages/ancient/Schmid_2028/snp.txt | 9 +++++ .../ancient/Schmid_2028/sources.bib | 3 ++ 27 files changed, 200 insertions(+), 38 deletions(-) create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.geno create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.ind create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.janno create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.snp create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac9/POSEIDON.yml create mode 100644 test/testDat/poseidonHSGoldenTestData/Schmid/POSEIDON.yml create mode 100644 test/testDat/poseidonHSGoldenTestData/Schmid/geno.txt create mode 100644 test/testDat/poseidonHSGoldenTestData/Schmid/ind.txt create mode 100644 test/testDat/poseidonHSGoldenTestData/Schmid/snp.txt create mode 100644 test/testDat/testPackages/ancient/Schmid_2028/POSEIDON.yml create mode 100755 test/testDat/testPackages/ancient/Schmid_2028/Schmid_2028.janno create mode 100644 test/testDat/testPackages/ancient/Schmid_2028/geno.txt create mode 100644 test/testDat/testPackages/ancient/Schmid_2028/ind.txt create mode 100644 test/testDat/testPackages/ancient/Schmid_2028/snp.txt create mode 100644 test/testDat/testPackages/ancient/Schmid_2028/sources.bib diff --git a/src-executables/Main-trident.hs b/src-executables/Main-trident.hs index 08da93b52..0d078a4df 100644 --- a/src-executables/Main-trident.hs +++ b/src-executables/Main-trident.hs @@ -207,3 +207,4 @@ validateOptParser :: OP.Parser ValidateOptions validateOptParser = ValidateOptions <$> parseBasePaths <*> parseIgnoreGeno <*> parseNoExitCode + <*> parseIgnoreDuplicates diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index a82c20665..43315ad1e 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -397,6 +397,13 @@ parseNoExitCode = OP.switch ( OP.hidden ) +parseIgnoreDuplicates :: OP.Parser Bool +parseIgnoreDuplicates = OP.switch ( + OP.long "ignoreDuplicates" <> + OP.help "do not stop on duplicated individual names in the package collection" <> + OP.hidden + ) + parseRemoteURL :: OP.Parser String parseRemoteURL = OP.strOption ( OP.long "remoteURL" <> diff --git a/src/Poseidon/CLI/Validate.hs b/src/Poseidon/CLI/Validate.hs index 10188fcdb..1ccb0e88e 100644 --- a/src/Poseidon/CLI/Validate.hs +++ b/src/Poseidon/CLI/Validate.hs @@ -18,23 +18,23 @@ import System.Exit (exitFailure, exitSuccess) -- | A datatype representing command line options for the validate command data ValidateOptions = ValidateOptions - { _validateBaseDirs :: [FilePath] - , _validateIgnoreGeno :: Bool - , _validateNoExitCode :: Bool + { _validateBaseDirs :: [FilePath] + , _validateIgnoreGeno :: Bool + , _validateNoExitCode :: Bool + , _validateIgnoreDuplicates :: Bool } pacReadOpts :: PackageReadOptions pacReadOpts = defaultPackageReadOptions { - _readOptStopOnDuplicates = True - , _readOptIgnoreChecksums = False + _readOptIgnoreChecksums = False , _readOptGenoCheck = True } runValidate :: ValidateOptions -> PoseidonLogIO () -runValidate (ValidateOptions baseDirs ignoreGeno noExitCode) = do +runValidate (ValidateOptions baseDirs ignoreGeno noExitCode ignoreDup) = do posFiles <- liftIO $ concat <$> mapM findAllPoseidonYmlFiles baseDirs allPackages <- readPoseidonPackageCollection - pacReadOpts {_readOptIgnoreGeno = ignoreGeno} + pacReadOpts {_readOptIgnoreGeno = ignoreGeno, _readOptStopOnDuplicates = not ignoreDup} baseDirs let numberOfPOSEIDONymlFiles = length posFiles numberOfLoadedPackagesWithDuplicates = foldl' (+) 0 $ map posPacDuplicate allPackages diff --git a/test/Poseidon/EntitiesListSpec.hs b/test/Poseidon/EntitiesListSpec.hs index c3da27ed6..d7b8acf27 100644 --- a/test/Poseidon/EntitiesListSpec.hs +++ b/test/Poseidon/EntitiesListSpec.hs @@ -125,7 +125,7 @@ testReadEntitiesFromFile = testPacReadOpts :: PackageReadOptions testPacReadOpts = defaultPackageReadOptions { - _readOptStopOnDuplicates = True + _readOptStopOnDuplicates = False , _readOptIgnoreChecksums = False , _readOptIgnoreGeno = False , _readOptGenoCheck = False @@ -170,7 +170,7 @@ testFilterPackages = it "should select all relevant packages" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir let pacs = filterRelevantPackages goodEntities ps - map posPacTitle pacs `shouldMatchList` ["Schiffels_2016", "Wang_Plink_test_2020", "Lamnidis_2018"] + map posPacTitle pacs `shouldMatchList` ["Schiffels_2016", "Wang_Plink_test_2020", "Schmid_2028", "Lamnidis_2018"] it "should drop all irrelevant packages" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir let pacs = filterRelevantPackages badEntities ps @@ -182,7 +182,7 @@ testResolveEntityIndices = it "should select all relevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir let indInts = resolveEntityIndices goodEntities (getJointIndividualInfo ps) - indInts `shouldBe` ([], [0, 1, 2, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23]) + indInts `shouldBe` ([], [0, 1, 2, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 25]) it "should drop all irrelevant individuals" $ do ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir let indInts = resolveEntityIndices badEntities (getJointIndividualInfo ps) diff --git a/test/Poseidon/GoldenTestsRunCommands.hs b/test/Poseidon/GoldenTestsRunCommands.hs index d9a2ebd9d..b5f914d31 100644 --- a/test/Poseidon/GoldenTestsRunCommands.hs +++ b/test/Poseidon/GoldenTestsRunCommands.hs @@ -175,6 +175,22 @@ testPipelineInit testDir checkFilePath testPacsDir = do runAndChecksumFiles checkFilePath testDir action3 "init" [ "Lamnidis" "Lamnidis.janno" ] + -- this is just here to copy the test package over + testLog (runInit InitOptions { + _initGenoData = GenotypeDataSpec { + format = GenotypeFormatEigenstrat + , genoFile = testPacsDir "Schmid_2028" "geno.txt" + , genoFileChkSum = Nothing + , snpFile = testPacsDir "Schmid_2028" "snp.txt" + , snpFileChkSum = Nothing + , indFile = testPacsDir "Schmid_2028" "ind.txt" + , indFileChkSum = Nothing + , snpSet = Just SNPSetOther + } + , _initPacPath = testDir "Schmid" + , _initPacName = Nothing + , _initMinimal = True + }) >> patchLastModified testDir ("Schmid" "POSEIDON.yml") patchLastModified :: FilePath -> FilePath -> IO () patchLastModified testDir yamlFile = do @@ -202,6 +218,7 @@ testPipelineValidate testDir checkFilePath = do _validateBaseDirs = [testDir] , _validateIgnoreGeno = False , _validateNoExitCode = True + , _validateIgnoreDuplicates = True } runAndChecksumStdOut checkFilePath testDir (testLog $ runValidate validateOpts1) "validate" 1 let validateOpts2 = validateOpts1 { @@ -554,6 +571,25 @@ testPipelineForge testDir checkFilePath testEntityFiles = do runAndChecksumFiles checkFilePath testDir action8 "forge" [ "ForgePac8" "ForgePac8.janno" ] + -- forge test 9 (duplicates are handled correctly if an individual is properly specified) + let forgeOpts9 = ForgeOptions { + _forgeGenoSources = [PacBaseDir $ testDir "Schiffels", PacBaseDir $ testDir "Schmid"] + , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "POP1,")] + , _forgeSnpFile = Nothing + , _forgeIntersect = False + , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutMinimal = False + , _forgeOutOnlyGeno = False + , _forgeOutPacPath = testDir "ForgePac9" + , _forgeOutPacName = Just "ForgePac9" + , _forgeNoExtract = False + } + let action9 = testLog (runForge forgeOpts9) >> patchLastModified testDir ("ForgePac9" "POSEIDON.yml") + runAndChecksumFiles checkFilePath testDir action9 "forge" [ + "ForgePac9" "ForgePac9.geno" + , "ForgePac9" "ForgePac9.janno" + ] + -- Note: We here use our test server (no SSL and different port). The reason is that -- sometimes we would like to implement new features that affect the communication diff --git a/test/Poseidon/PackageSpec.hs b/test/Poseidon/PackageSpec.hs index 062844602..945996327 100644 --- a/test/Poseidon/PackageSpec.hs +++ b/test/Poseidon/PackageSpec.hs @@ -44,7 +44,7 @@ spec = do testPacReadOpts :: PackageReadOptions testPacReadOpts = defaultPackageReadOptions { - _readOptStopOnDuplicates = True + _readOptStopOnDuplicates = False , _readOptIgnoreChecksums = False , _readOptIgnoreGeno = False , _readOptGenoCheck = False @@ -162,10 +162,11 @@ testreadPoseidonPackageCollection = describe "PoseidonPackage.findPoseidonPackag let dir = "test/testDat/testPackages/ancient" it "should discover packages correctly" $ do pac <- testLog $ readPoseidonPackageCollection testPacReadOpts [dir] - sort (map posPacTitle pac) `shouldBe` ["Lamnidis_2018", "Schiffels_2016", "Wang_Plink_test_2020"] + sort (map posPacTitle pac) `shouldBe` ["Lamnidis_2018", "Schiffels_2016", "Schmid_2028", "Wang_Plink_test_2020"] sort (map posPacLastModified pac) `shouldBe` [Just (fromGregorian 2020 2 20), Just (fromGregorian 2020 5 20), - Just (fromGregorian 2021 11 9)] + Just (fromGregorian 2021 11 9), + Just (fromGregorian 2023 01 12)] files :: [String] files = ["test/testDat/testPackages/ancient/Schiffels_2016/geno.txt", diff --git a/test/testDat/poseidonHSGoldenTestCheckSumFile.txt b/test/testDat/poseidonHSGoldenTestCheckSumFile.txt index dbeb27ea8..f35bffc03 100644 --- a/test/testDat/poseidonHSGoldenTestCheckSumFile.txt +++ b/test/testDat/poseidonHSGoldenTestCheckSumFile.txt @@ -14,10 +14,10 @@ d41d8cd98f00b204e9800998ecf8427e validate validate2 c27895a4e15eb46f0d4473cd37488cae list list2 c448a4bd45d6cdc7e127316549beccf3 list list3 a9f5b04f5b40eeff3aabf1ad7bd22771 list list4 -1dd62ce95bbc8c8e35ef1659230bf188 summarise summarise1 -8fe12b69216f9b4fcd89f2c6fe0e3e25 summarise summarise2 -007c5851ceaec0bff861241953777cfa survey survey1 -ca8a8cde86a2d5b680ad953c01e9af0d survey survey2 +a27f3820e0d0893aedfd37666e6f0c15 summarise summarise1 +88cef92d61b209f7ee57dbc9791d79bd summarise summarise2 +5577e451b538ba6d09d3001a4b88c0be survey survey1 +63d21b4613ec54ec32f0879dae88f8d5 survey survey2 3feaa876c5d05fd000470c5ba840cc2d genoconvert Wang/Wang.geno b46831b007c2d53a12b472484b7b00b4 genoconvert Wang/Wang.snp 2faf8a7b87037451d4fd5ae9cc9af460 genoconvert Wang/Wang.ind @@ -59,6 +59,8 @@ c613af8349f6c05927ee0771021f9b7d forge ForgePac7/ForgePac7.janno 6e3ecc6695234ba1e71045278634d06b forge ForgePac7/ForgePac7.snp 2bd17be137df5cb11830b09b833b937d forge ForgePac7/ForgePac7.ind fedf5dde1efab7d6521a6398c21c096e forge ForgePac8/ForgePac8.janno +84bac408ab864c357bb2a6e1db5cda20 forge ForgePac9/ForgePac9.geno +66402712dd6d6bc65e73e13c2fc28780 forge ForgePac9/ForgePac9.janno e95272d9a7dfd81afb125081479c96e1 fetch 2019_Nikitin_LBK/POSEIDON.yml 663d9358c94640fd56da3ff9bdf2e207 fetch 2019_Nikitin_LBK/Nikitin_LBK.janno 3abe2144e4f5aea3aa0b83395ba9b355 fetch 2019_Nikitin_LBK/Nikitin_LBK.fam \ No newline at end of file diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac7/POSEIDON.yml b/test/testDat/poseidonHSGoldenTestData/ForgePac7/POSEIDON.yml index de55a464e..7c929a8f5 100644 --- a/test/testDat/poseidonHSGoldenTestData/ForgePac7/POSEIDON.yml +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac7/POSEIDON.yml @@ -6,7 +6,7 @@ contributor: email: carberry@brown.edu orcid: 0000-0002-1825-0097 packageVersion: 0.1.0 -lastModified: 2022-12-16 +lastModified: 2023-01-12 genotypeData: format: EIGENSTRAT genoFile: ForgePac7.geno diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.geno b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.geno new file mode 100644 index 000000000..f0c711d09 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.geno @@ -0,0 +1,9 @@ +0000 +2120 +0000 +1000 +2100 +2220 +9120 +0120 +9210 diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.ind b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.ind new file mode 100644 index 000000000..c4dbea676 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.ind @@ -0,0 +1,4 @@ +XXX003 M POP1 +XXX007 M POP1 +XXX009 F POP1 +XXX001 F POP1 diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.janno b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.janno new file mode 100644 index 000000000..de32cec6e --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.janno @@ -0,0 +1,5 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AddCol1 AddCol2 +XXX003 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX007 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX009 F POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX001 F POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.snp b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.snp new file mode 100644 index 000000000..7c9063a46 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac9/ForgePac9.snp @@ -0,0 +1,9 @@ +1_752566 1 2.013e-2 752566 G A +1_842013 1 2.2518e-2 842013 T G +1_891021 1 2.4116e-2 891021 G A +1_949654 1 2.5727e-2 949654 A G +2_1018704 2 2.6288e-2 1018704 A G +2_1045331 2 2.6665e-2 1045331 G A +2_1048955 2 2.6674e-2 1048955 A G +2_1061166 2 2.6711e-2 1061166 T C +2_1108637 2 2.8311e-2 1108637 G A diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac9/POSEIDON.yml b/test/testDat/poseidonHSGoldenTestData/ForgePac9/POSEIDON.yml new file mode 100644 index 000000000..406d52f3f --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac9/POSEIDON.yml @@ -0,0 +1,16 @@ +poseidonVersion: 2.6.0 +title: ForgePac9 +description: Empty package template. Please add a description +contributor: +- name: Josiah Carberry + email: carberry@brown.edu + orcid: 0000-0002-1825-0097 +packageVersion: 0.1.0 +lastModified: 1970-01-01 +genotypeData: + format: EIGENSTRAT + genoFile: ForgePac9.geno + snpFile: ForgePac9.snp + indFile: ForgePac9.ind + snpSet: Other +jannoFile: ForgePac9.janno diff --git a/test/testDat/poseidonHSGoldenTestData/Schmid/POSEIDON.yml b/test/testDat/poseidonHSGoldenTestData/Schmid/POSEIDON.yml new file mode 100644 index 000000000..9aaf5a2fb --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/Schmid/POSEIDON.yml @@ -0,0 +1,8 @@ +poseidonVersion: 2.6.0 +title: Schmid +genotypeData: + format: EIGENSTRAT + genoFile: geno.txt + snpFile: snp.txt + indFile: ind.txt + snpSet: Other diff --git a/test/testDat/poseidonHSGoldenTestData/Schmid/geno.txt b/test/testDat/poseidonHSGoldenTestData/Schmid/geno.txt new file mode 100644 index 000000000..e5e735c61 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/Schmid/geno.txt @@ -0,0 +1,9 @@ +00 +00 +00 +00 +00 +00 +00 +00 +00 diff --git a/test/testDat/poseidonHSGoldenTestData/Schmid/ind.txt b/test/testDat/poseidonHSGoldenTestData/Schmid/ind.txt new file mode 100644 index 000000000..5f99068fe --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/Schmid/ind.txt @@ -0,0 +1,2 @@ +XXX001 F POP1 +XXX002 F POP2 diff --git a/test/testDat/poseidonHSGoldenTestData/Schmid/snp.txt b/test/testDat/poseidonHSGoldenTestData/Schmid/snp.txt new file mode 100644 index 000000000..6a56b2435 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/Schmid/snp.txt @@ -0,0 +1,9 @@ +1_752566 1 0.020130 752566 G A +1_842013 1 0.022518 842013 T G +1_891021 1 0.024116 891021 G A +1_949654 1 0.025727 949654 A G +2_1018704 2 0.026288 1018704 A G +2_1045331 2 0.026665 1045331 G A +2_1048955 2 0.026674 1048955 A G +2_1061166 2 0.026711 1061166 T C +2_1108637 2 0.028311 1108637 G A diff --git a/test/testDat/poseidonHSGoldenTestData/summarise1 b/test/testDat/poseidonHSGoldenTestData/summarise1 index e6bcc8491..b0a2afdc8 100644 --- a/test/testDat/poseidonHSGoldenTestData/summarise1 +++ b/test/testDat/poseidonHSGoldenTestData/summarise1 @@ -1,22 +1,22 @@ .-------------------------.--------------------------------------------------------------. | Summary | Value | :=========================:==============================================================: -| Nr Individuals | 25 | -| Individuals | SAMPLE0, SAMPLE1, SAMPLE2, SAMPLE3, SAMPLE4, XXX001, XXX002… | +| Nr Individuals | 27 | +| Individuals | SAMPLE0, SAMPLE1, SAMPLE2, SAMPLE3, SAMPLE4, XXX001, XXX001… | | Nr Groups | 8 | -| Groups | POP1: 8, POP2: 8, POP3: 4, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1 | +| Groups | POP1: 9, POP2: 9, POP3: 4, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1 | | Nr Publications | 1 | | Publications | no values | | Nr Countries | 1 | -| Countries | n/a: 25 | +| Countries | n/a: 27 | | Mean age BC/AD | no values | -| Dating type | n/a: 25 | -| Sex distribution | F: 13, M: 12 | -| MT haplogroups | n/a: 25 | -| Y haplogroups | n/a: 25 | +| Dating type | n/a: 27 | +| Sex distribution | F: 15, M: 12 | +| MT haplogroups | n/a: 27 | +| Y haplogroups | n/a: 27 | | % endogenous human DNA | no values | | Nr of SNPs | no values | | Coverage on target SNPs | no values | -| Library type | n/a: 25 | -| UDG treatment | n/a: 25 | +| Library type | n/a: 27 | +| UDG treatment | n/a: 27 | '-------------------------'--------------------------------------------------------------' diff --git a/test/testDat/poseidonHSGoldenTestData/summarise2 b/test/testDat/poseidonHSGoldenTestData/summarise2 index 2dea22ea1..b97f81e07 100644 --- a/test/testDat/poseidonHSGoldenTestData/summarise2 +++ b/test/testDat/poseidonHSGoldenTestData/summarise2 @@ -1,18 +1,18 @@ -Nr Individuals 25 -Individuals SAMPLE0, SAMPLE1, SAMPLE2, SAMPLE3, SAMPLE4, XXX001, XXX002, XXX003, XXX004, XXX005, XXX006, XXX007, XXX008, XXX009, XXX010, XXX011, XXX012, XXX013, XXX014, XXX015, XXX016, XXX017, XXX018, XXX019, XXX020 +Nr Individuals 27 +Individuals SAMPLE0, SAMPLE1, SAMPLE2, SAMPLE3, SAMPLE4, XXX001, XXX001, XXX002, XXX002, XXX003, XXX004, XXX005, XXX006, XXX007, XXX008, XXX009, XXX010, XXX011, XXX012, XXX013, XXX014, XXX015, XXX016, XXX017, XXX018, XXX019, XXX020 Nr Groups 8 -Groups POP1: 8, POP2: 8, POP3: 4, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1 +Groups POP1: 9, POP2: 9, POP3: 4, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1 Nr Publications 1 Publications no values Nr Countries 1 -Countries n/a: 25 +Countries n/a: 27 Mean age BC/AD no values -Dating type n/a: 25 -Sex distribution F: 13, M: 12 -MT haplogroups n/a: 25 -Y haplogroups n/a: 25 +Dating type n/a: 27 +Sex distribution F: 15, M: 12 +MT haplogroups n/a: 27 +Y haplogroups n/a: 27 % endogenous human DNA no values Nr of SNPs no values Coverage on target SNPs no values -Library type n/a: 25 -UDG treatment n/a: 25 +Library type n/a: 27 +UDG treatment n/a: 27 diff --git a/test/testDat/poseidonHSGoldenTestData/survey1 b/test/testDat/poseidonHSGoldenTestData/survey1 index 8d4be564d..c7adb5ba7 100644 --- a/test/testDat/poseidonHSGoldenTestData/survey1 +++ b/test/testDat/poseidonHSGoldenTestData/survey1 @@ -3,5 +3,6 @@ :===========:=========================================================: | Lamnidis | GB|███..|.....|.....|.....|.....|.....|.....|.....|.... | | Schiffels | GB|███..|.....|.....|.....|.....|.....|.....|.....|.... | +| Schmid | G.|███..|.....|.....|.....|.....|.....|.....|.....|.... | | Wang | G.|███..|.....|.....|.....|.....|.....|.....|.....|.... | '-----------'---------------------------------------------------------' diff --git a/test/testDat/poseidonHSGoldenTestData/survey2 b/test/testDat/poseidonHSGoldenTestData/survey2 index 6fb08918b..26eec8e7c 100644 --- a/test/testDat/poseidonHSGoldenTestData/survey2 +++ b/test/testDat/poseidonHSGoldenTestData/survey2 @@ -1,3 +1,4 @@ Lamnidis GB|███..|.....|.....|.....|.....|.....|.....|.....|.... Schiffels GB|███..|.....|.....|.....|.....|.....|.....|.....|.... +Schmid G.|███..|.....|.....|.....|.....|.....|.....|.....|.... Wang G.|███..|.....|.....|.....|.....|.....|.....|.....|.... diff --git a/test/testDat/testPackages/ancient/Schmid_2028/POSEIDON.yml b/test/testDat/testPackages/ancient/Schmid_2028/POSEIDON.yml new file mode 100644 index 000000000..d50da3cf4 --- /dev/null +++ b/test/testDat/testPackages/ancient/Schmid_2028/POSEIDON.yml @@ -0,0 +1,16 @@ +poseidonVersion: 2.6.0 +title: Schmid_2028 +description: Genetic data that will never be published in Schmid et al. 2028 +contributor: +- name: Clemens Schmid + email: schmid@institute.org +packageVersion: 1.0.0 +lastModified: 2023-01-12 +genotypeData: + format: EIGENSTRAT + genoFile: geno.txt + snpFile: snp.txt + indFile: ind.txt + snpSet: Other +jannoFile: Schmid_2028.janno + diff --git a/test/testDat/testPackages/ancient/Schmid_2028/Schmid_2028.janno b/test/testDat/testPackages/ancient/Schmid_2028/Schmid_2028.janno new file mode 100755 index 000000000..108642f94 --- /dev/null +++ b/test/testDat/testPackages/ancient/Schmid_2028/Schmid_2028.janno @@ -0,0 +1,3 @@ +Poseidon_ID Group_Name Genetic_Sex +XXX001 POP1 F +XXX002 POP2 F diff --git a/test/testDat/testPackages/ancient/Schmid_2028/geno.txt b/test/testDat/testPackages/ancient/Schmid_2028/geno.txt new file mode 100644 index 000000000..e5e735c61 --- /dev/null +++ b/test/testDat/testPackages/ancient/Schmid_2028/geno.txt @@ -0,0 +1,9 @@ +00 +00 +00 +00 +00 +00 +00 +00 +00 diff --git a/test/testDat/testPackages/ancient/Schmid_2028/ind.txt b/test/testDat/testPackages/ancient/Schmid_2028/ind.txt new file mode 100644 index 000000000..5f99068fe --- /dev/null +++ b/test/testDat/testPackages/ancient/Schmid_2028/ind.txt @@ -0,0 +1,2 @@ +XXX001 F POP1 +XXX002 F POP2 diff --git a/test/testDat/testPackages/ancient/Schmid_2028/snp.txt b/test/testDat/testPackages/ancient/Schmid_2028/snp.txt new file mode 100644 index 000000000..6a56b2435 --- /dev/null +++ b/test/testDat/testPackages/ancient/Schmid_2028/snp.txt @@ -0,0 +1,9 @@ +1_752566 1 0.020130 752566 G A +1_842013 1 0.022518 842013 T G +1_891021 1 0.024116 891021 G A +1_949654 1 0.025727 949654 A G +2_1018704 2 0.026288 1018704 A G +2_1045331 2 0.026665 1045331 G A +2_1048955 2 0.026674 1048955 A G +2_1061166 2 0.026711 1061166 T C +2_1108637 2 0.028311 1108637 G A diff --git a/test/testDat/testPackages/ancient/Schmid_2028/sources.bib b/test/testDat/testPackages/ancient/Schmid_2028/sources.bib new file mode 100644 index 000000000..17e0967ef --- /dev/null +++ b/test/testDat/testPackages/ancient/Schmid_2028/sources.bib @@ -0,0 +1,3 @@ +@article{Schiffels2016, + title = Test +} From f678bd1c1185d49fa5ac165b3f4e1ff1c926c6c8 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 19:59:09 +0100 Subject: [PATCH 18/26] added another golden test for the duplicate solution with precise individual syntax --- test/Poseidon/GoldenTestsRunCommands.hs | 18 ++++++++++++++++++ .../poseidonHSGoldenTestCheckSumFile.txt | 2 ++ .../ForgePac10/ForgePac10.geno | 9 +++++++++ .../ForgePac10/ForgePac10.ind | 10 ++++++++++ .../ForgePac10/ForgePac10.janno | 11 +++++++++++ .../ForgePac10/ForgePac10.snp | 9 +++++++++ .../ForgePac10/POSEIDON.yml | 16 ++++++++++++++++ 7 files changed, 75 insertions(+) create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.geno create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.ind create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.janno create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.snp create mode 100644 test/testDat/poseidonHSGoldenTestData/ForgePac10/POSEIDON.yml diff --git a/test/Poseidon/GoldenTestsRunCommands.hs b/test/Poseidon/GoldenTestsRunCommands.hs index b5f914d31..54ebbe406 100644 --- a/test/Poseidon/GoldenTestsRunCommands.hs +++ b/test/Poseidon/GoldenTestsRunCommands.hs @@ -589,6 +589,24 @@ testPipelineForge testDir checkFilePath testEntityFiles = do "ForgePac9" "ForgePac9.geno" , "ForgePac9" "ForgePac9.janno" ] + -- forge test 10 (duplicates can also be resolved with negative selection) + let forgeOpts10 = ForgeOptions { + _forgeGenoSources = [PacBaseDir $ testDir "Schiffels", PacBaseDir $ testDir "Schmid"] + , _forgeEntityInput = [EntitiesDirect (fromRight [] $ readEntitiesFromString "-,-")] + , _forgeSnpFile = Nothing + , _forgeIntersect = False + , _forgeOutFormat = GenotypeFormatEigenstrat + , _forgeOutMinimal = False + , _forgeOutOnlyGeno = False + , _forgeOutPacPath = testDir "ForgePac10" + , _forgeOutPacName = Just "ForgePac10" + , _forgeNoExtract = False + } + let action10 = testLog (runForge forgeOpts10) >> patchLastModified testDir ("ForgePac10" "POSEIDON.yml") + runAndChecksumFiles checkFilePath testDir action10 "forge" [ + "ForgePac10" "ForgePac10.geno" + , "ForgePac10" "ForgePac10.janno" + ] -- Note: We here use our test server (no SSL and different port). The reason is that diff --git a/test/testDat/poseidonHSGoldenTestCheckSumFile.txt b/test/testDat/poseidonHSGoldenTestCheckSumFile.txt index f35bffc03..44192e320 100644 --- a/test/testDat/poseidonHSGoldenTestCheckSumFile.txt +++ b/test/testDat/poseidonHSGoldenTestCheckSumFile.txt @@ -61,6 +61,8 @@ c613af8349f6c05927ee0771021f9b7d forge ForgePac7/ForgePac7.janno fedf5dde1efab7d6521a6398c21c096e forge ForgePac8/ForgePac8.janno 84bac408ab864c357bb2a6e1db5cda20 forge ForgePac9/ForgePac9.geno 66402712dd6d6bc65e73e13c2fc28780 forge ForgePac9/ForgePac9.janno +bcccc5136aade390f711fee3695ef62b forge ForgePac10/ForgePac10.geno +75dde2f4c025aca30e51141df11e11f7 forge ForgePac10/ForgePac10.janno e95272d9a7dfd81afb125081479c96e1 fetch 2019_Nikitin_LBK/POSEIDON.yml 663d9358c94640fd56da3ff9bdf2e207 fetch 2019_Nikitin_LBK/Nikitin_LBK.janno 3abe2144e4f5aea3aa0b83395ba9b355 fetch 2019_Nikitin_LBK/Nikitin_LBK.fam \ No newline at end of file diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.geno b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.geno new file mode 100644 index 000000000..352c4c14d --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.geno @@ -0,0 +1,9 @@ +2000001000 +2222212220 +0009100000 +0101000000 +1219012010 +2222222220 +2922212210 +2012212200 +2921222120 diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.ind b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.ind new file mode 100644 index 000000000..552ab81aa --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.ind @@ -0,0 +1,10 @@ +XXX001 M POP1 +XXX003 M POP1 +XXX004 F POP2 +XXX005 M POP2 +XXX006 F POP2 +XXX007 M POP1 +XXX008 F POP3 +XXX009 F POP1 +XXX010 M POP3 +XXX002 F POP2 diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.janno b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.janno new file mode 100644 index 000000000..99fd17ac5 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.janno @@ -0,0 +1,11 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AddCol1 AddCol2 +XXX001 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX003 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX004 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX005 M POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX006 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX007 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX008 F POP3 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX009 F POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX010 M POP3 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a v1 v2 +XXX002 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.snp b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.snp new file mode 100644 index 000000000..7c9063a46 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac10/ForgePac10.snp @@ -0,0 +1,9 @@ +1_752566 1 2.013e-2 752566 G A +1_842013 1 2.2518e-2 842013 T G +1_891021 1 2.4116e-2 891021 G A +1_949654 1 2.5727e-2 949654 A G +2_1018704 2 2.6288e-2 1018704 A G +2_1045331 2 2.6665e-2 1045331 G A +2_1048955 2 2.6674e-2 1048955 A G +2_1061166 2 2.6711e-2 1061166 T C +2_1108637 2 2.8311e-2 1108637 G A diff --git a/test/testDat/poseidonHSGoldenTestData/ForgePac10/POSEIDON.yml b/test/testDat/poseidonHSGoldenTestData/ForgePac10/POSEIDON.yml new file mode 100644 index 000000000..b7e48ed84 --- /dev/null +++ b/test/testDat/poseidonHSGoldenTestData/ForgePac10/POSEIDON.yml @@ -0,0 +1,16 @@ +poseidonVersion: 2.6.0 +title: ForgePac10 +description: Empty package template. Please add a description +contributor: +- name: Josiah Carberry + email: carberry@brown.edu + orcid: 0000-0002-1825-0097 +packageVersion: 0.1.0 +lastModified: 1970-01-01 +genotypeData: + format: EIGENSTRAT + genoFile: ForgePac10.geno + snpFile: ForgePac10.snp + indFile: ForgePac10.ind + snpSet: Other +jannoFile: ForgePac10.janno From f2ae70ef8c031b115b4166d54f274c1c53a02412 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 20:12:58 +0100 Subject: [PATCH 19/26] started to work on changelog --- CHANGELOG.md | 3 ++- CHANGELOGRELEASE.md | 2 ++ poseidon-hs.cabal | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c7697571..01c3bc443 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ +- V 1.1.7.0: Reorganized handling of duplicate individuals: Duplicates are now generally ignored, except in validate (can also be turned of with a new flag) and forge. The forgeString language features a new syntactic entity to select individuals specifically and thus resolve duplication conflicts - V 1.1.6.0: Removed outdated --verbose from validate and ignore trailing slashes from --outPath - V 1.1.5.0: Enabled reading and forging additional, unspecified variables in .janno files -- V 1.1.4.2: Added parsing for Accession IDs (.janno file). Wrong entries are ignored, so this is non-breaking. +- V 1.1.4.2: Added parsing for Accession IDs (.janno file). Wrong entries are ignored, so this is non-breaking - V 1.1.4.1: Added a small validation check for calibrated ages in the .janno file - V 1.1.4.0: Changes to make poseidon-hs compatible with Poseidon v2.6.0 (backwards compatible with v2.5.0): contributor field optional, added orcid field for contributors, added more capture type options in janno files - V 1.1.3.1: Package reading will now fail if bib-entries are not found due to missing bibtex files diff --git a/CHANGELOGRELEASE.md b/CHANGELOGRELEASE.md index da00582a6..7e51d0c1b 100644 --- a/CHANGELOGRELEASE.md +++ b/CHANGELOGRELEASE.md @@ -1,3 +1,5 @@ +### V 1.1.7.0 + ### V 1.1.6.0 #### Additional columns in .janno files (V 1.1.5.0) diff --git a/poseidon-hs.cabal b/poseidon-hs.cabal index c9fa48b61..faa3191c3 100644 --- a/poseidon-hs.cabal +++ b/poseidon-hs.cabal @@ -1,5 +1,5 @@ name: poseidon-hs -version: 1.1.6.0 +version: 1.1.7.0 synopsis: A package with tools for working with Poseidon Genotype Data description: The tools in this package read and analyse Poseidon-formatted genotype databases, a modular system for storing genotype data from thousands of individuals. license: MIT From a2bbb709cba0c31304130bfbeb393cac38d70d19 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 20:28:03 +0100 Subject: [PATCH 20/26] added another test for the entity resolver, which makes use of secondary group names and exclusion to solve duplication --- test/Poseidon/EntitiesListSpec.hs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/Poseidon/EntitiesListSpec.hs b/test/Poseidon/EntitiesListSpec.hs index d7b8acf27..1701747cf 100644 --- a/test/Poseidon/EntitiesListSpec.hs +++ b/test/Poseidon/EntitiesListSpec.hs @@ -244,6 +244,15 @@ testResolveEntityIndices = ]], [1] ) + resolveEntityIndices [ + Include (Group "PopB") + , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) + , Exclude (Ind (SpecificInd $ IndividualInfo "Ind2" ["Pop2"] "Pac1")) + , Exclude (Ind (SpecificInd $ IndividualInfo "Ind2" ["Pop2"] "Pac3")) + ] indInfoDuplicates `shouldBe` ( + [], + [1,4] + ) testJSON :: Spec testJSON = From 094eb00bee10903c6a74eb18e26603e4834fc6d6 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 20:44:29 +0100 Subject: [PATCH 21/26] two more tests for the individual selection algorithm --- test/Poseidon/EntitiesListSpec.hs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/test/Poseidon/EntitiesListSpec.hs b/test/Poseidon/EntitiesListSpec.hs index 1701747cf..fdb17381e 100644 --- a/test/Poseidon/EntitiesListSpec.hs +++ b/test/Poseidon/EntitiesListSpec.hs @@ -207,7 +207,7 @@ testResolveEntityIndices = , Include (Ind (SimpleInd "Ind3")) , Include (Ind (SpecificInd $ IndividualInfo "Ind8" ["Pop4"] "Pac2")) ] indInfo `shouldBe` ([], [0, 1, 2, 7]) - it "should correctly extract indices in case of duplicates" $ do + it "should correctly extract indices in case of duplicates across packages" $ do let indInfoDuplicates = [ IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac1" , IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac2" @@ -216,6 +216,14 @@ testResolveEntityIndices = , IndividualInfo "Ind2" ["Pop2", "PopB"] "Pac2" , IndividualInfo "Ind2" ["Pop2", "PopB"] "Pac3" ] + -- test simple extraction with specific syntax + resolveEntityIndices [ + Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) + ] indInfoDuplicates `shouldBe` ( + [], + [1] + ) + -- test solving simple duplication for one individual resolveEntityIndices [ Include (Ind (SimpleInd "Ind1")) , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) @@ -223,6 +231,7 @@ testResolveEntityIndices = [], [1] ) + -- test solving duplication for two individuals at once resolveEntityIndices [ Include (Ind (SimpleInd "Ind1")) , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) @@ -232,6 +241,7 @@ testResolveEntityIndices = [], [1,5] ) + -- test output in case of unresolved duplicates resolveEntityIndices [ Include (Ind (SimpleInd "Ind1")) , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) @@ -244,6 +254,7 @@ testResolveEntityIndices = ]], [1] ) + -- test interaction with secondary group name selection and negative selection to solve duplication resolveEntityIndices [ Include (Group "PopB") , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop1"] "Pac2")) @@ -253,6 +264,19 @@ testResolveEntityIndices = [], [1,4] ) + it "should correctly extract indices in case of duplicates within one package" $ do + let indInfoDuplicates = [ + IndividualInfo "Ind1" ["Pop1", "PopB"] "Pac1" + , IndividualInfo "Ind1" ["Pop2", "PopB"] "Pac1" + , IndividualInfo "Ind1" ["Pop3", "PopB"] "Pac1" + ] + resolveEntityIndices [ + Include (Ind (SimpleInd "Ind1")) + , Include (Ind (SpecificInd $ IndividualInfo "Ind1" ["Pop2"] "Pac1")) + ] indInfoDuplicates `shouldBe` ( + [], + [1] + ) testJSON :: Spec testJSON = From e236bb2012a55196a174d616a721a266387858ee Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 20:52:05 +0100 Subject: [PATCH 22/26] mentioned the new syntax in the cli documentation of forge --- src/Poseidon/CLI/OptparseApplicativeParsers.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index 43315ad1e..4fe36d5b7 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -167,7 +167,9 @@ parseForgeEntitiesDirect = OP.option (OP.eitherReader readSignedEntities) (OP.lo \forge will apply excludes and includes in order. If the first entity is negative, then forge \ \will assume you want to merge all individuals in the packages found in the baseDirs (except the \ \ones explicitly excluded) before the exclude entities are applied. \ - \An empty forgeString (and no --forgeFile) will therefore merge all available individuals.") + \An empty forgeString (and no --forgeFile) will therefore merge all available individuals. \ + \If there are individuals in your input packages with equal individual id, but different main group or \ + \source package, they can be specified with the special syntax \"\".") where readSignedEntities s = case readEntitiesFromString s of Left e -> Left (show e) From df015ee3b75c938d2be504da7d8ee348927e778d Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 12 Jan 2023 20:57:04 +0100 Subject: [PATCH 23/26] removed unconnected left over file --- test/testDat/testPackages/ancient/Schmid_2028/sources.bib | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 test/testDat/testPackages/ancient/Schmid_2028/sources.bib diff --git a/test/testDat/testPackages/ancient/Schmid_2028/sources.bib b/test/testDat/testPackages/ancient/Schmid_2028/sources.bib deleted file mode 100644 index 17e0967ef..000000000 --- a/test/testDat/testPackages/ancient/Schmid_2028/sources.bib +++ /dev/null @@ -1,3 +0,0 @@ -@article{Schiffels2016, - title = Test -} From 5f7739c2ce3db8238ab456811743c5ce4eb4051d Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 13 Jan 2023 11:43:01 +0100 Subject: [PATCH 24/26] wrote the release note --- CHANGELOGRELEASE.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CHANGELOGRELEASE.md b/CHANGELOGRELEASE.md index 7e51d0c1b..bf5383b91 100644 --- a/CHANGELOGRELEASE.md +++ b/CHANGELOGRELEASE.md @@ -1,5 +1,23 @@ ### V 1.1.7.0 +This release clarifies a long standing uncertainty how trident treats individual ID duplicates. It adds a new feature to the forge language to specify individuals more precisly and thus resolve duplication conflicts. + +Generally we discourage individuals with identical identifiers, so `Poseidon_ID`s, across package collections. But there is no reason to enforce this unnecessarily for subcommands where it does not matter. Here are the rules we defined now: + +- Generally, so in the subcommands `ìnit`, `fetch`, `genoconvert`, `update`, `list`, `summarise`, and `survey`, `trident` logs a warning if it observes duplicates in the package collection found in the base dirs, but then proceeds normally. +- Deviating from this the special subcommand `validate` stops with an error if it observes duplicates. This behaviour can be changed with the new flag `--ignoreDuplicates`. +- The `forge` subcommand, finally, also ignores duplicates in the base dirs, except (!) this conflict exists within the entities to be forged. In this case it stops with an informative error: + +``` +[Error] There are duplicated individuals, but forge does not allow that +[Error] Please specify in your --forgeString or --forgeFile: +[Error] -> <2010_RasmussenNature:Greenland_Saqqaq.SG:Inuk.SG> +[Error] -> <2011_RasmussenNature:Greenland_Saqqaq.SG:Inuk.SG> +[Error] Error in the forge selection: Unresolved duplicated individuals +``` + +This already shows that the `-f`/`--forgeString` selection language of `forge` (and ìncidentally also `fetch`) includes a new syntactic element since this release: Individuals can now be described not just with ``, but also more specifically ``. Such defined individuals take precedence over differently defined ones (so: directly with `` or as a subset of `*package*` or `group`). This allows to resolve duplication issues precisely -- at least in cases where the duplicated individuals differ in source package or primary group. + ### V 1.1.6.0 #### Additional columns in .janno files (V 1.1.5.0) From cbdbd2ef08b628b608d52f6b13641bc0d970af28 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 13 Jan 2023 12:24:26 +0100 Subject: [PATCH 25/26] made release changelog more clear regarding duplicates within one package --- CHANGELOGRELEASE.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOGRELEASE.md b/CHANGELOGRELEASE.md index bf5383b91..f1392a705 100644 --- a/CHANGELOGRELEASE.md +++ b/CHANGELOGRELEASE.md @@ -2,10 +2,10 @@ This release clarifies a long standing uncertainty how trident treats individual ID duplicates. It adds a new feature to the forge language to specify individuals more precisly and thus resolve duplication conflicts. -Generally we discourage individuals with identical identifiers, so `Poseidon_ID`s, across package collections. But there is no reason to enforce this unnecessarily for subcommands where it does not matter. Here are the rules we defined now: +trident does **not** allow individuals with identical identifiers, so `Poseidon_ID`s, **within one package**. And we generally also discourage such duplicates across packages in package collections. But there is no reason to enforce this unnecessarily for subcommands where it does not matter. Here are the rules we defined now: -- Generally, so in the subcommands `ìnit`, `fetch`, `genoconvert`, `update`, `list`, `summarise`, and `survey`, `trident` logs a warning if it observes duplicates in the package collection found in the base dirs, but then proceeds normally. -- Deviating from this the special subcommand `validate` stops with an error if it observes duplicates. This behaviour can be changed with the new flag `--ignoreDuplicates`. +- Generally, so in the subcommands `ìnit`, `fetch`, `genoconvert`, `update`, `list`, `summarise`, and `survey`, `trident` logs a warning if it observes duplicates in a package collection found in the base dirs. But it proceeds normally then. +- Deviating from this, the special subcommand `validate` stops with an error if it observes duplicates. This behaviour can be changed with the new flag `--ignoreDuplicates`. - The `forge` subcommand, finally, also ignores duplicates in the base dirs, except (!) this conflict exists within the entities to be forged. In this case it stops with an informative error: ``` From f3440ad940387be53f4dceb21f551de65bed4932 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 13 Jan 2023 17:04:38 +0100 Subject: [PATCH 26/26] minor changes in the changelog --- CHANGELOGRELEASE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOGRELEASE.md b/CHANGELOGRELEASE.md index f1392a705..69ed31700 100644 --- a/CHANGELOGRELEASE.md +++ b/CHANGELOGRELEASE.md @@ -4,9 +4,9 @@ This release clarifies a long standing uncertainty how trident treats individual trident does **not** allow individuals with identical identifiers, so `Poseidon_ID`s, **within one package**. And we generally also discourage such duplicates across packages in package collections. But there is no reason to enforce this unnecessarily for subcommands where it does not matter. Here are the rules we defined now: -- Generally, so in the subcommands `ìnit`, `fetch`, `genoconvert`, `update`, `list`, `summarise`, and `survey`, `trident` logs a warning if it observes duplicates in a package collection found in the base dirs. But it proceeds normally then. +- Generally, so in the subcommands `ìnit`, `fetch`, `genoconvert`, `update`, `list`, `summarise`, and `survey`, trident logs a warning if it observes duplicates in a package collection found in the base dirs. But it proceeds normally then. - Deviating from this, the special subcommand `validate` stops with an error if it observes duplicates. This behaviour can be changed with the new flag `--ignoreDuplicates`. -- The `forge` subcommand, finally, also ignores duplicates in the base dirs, except (!) this conflict exists within the entities to be forged. In this case it stops with an informative error: +- The `forge` subcommand, finally, also ignores duplicates in the base dirs, except (!) this conflict exists within the entities in the `--forgeString`. In this case it stops with an informative error: ``` [Error] There are duplicated individuals, but forge does not allow that