Skip to content

Commit

Permalink
Merge pull request #132 from mmore500/alifestd-count-nodes
Browse files Browse the repository at this point in the history
Impl, test alifestd count nodes tools
  • Loading branch information
mmore500 authored Jan 22, 2024
2 parents 344d887 + 36d11ec commit 8089ca5
Show file tree
Hide file tree
Showing 17 changed files with 732 additions and 15 deletions.
3 changes: 1 addition & 2 deletions docs/citing.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ bibtex:
volume = {7},
number = {80},
pages = {4866},
author = {Matthew Andres Moreno and Emily Dolson and Charles Ofria},
author = {Matthew Andres Moreno and Emily Dolson and Charles Ofria},
title = {hstrat: a Python Package for phylogenetic inference on distributed digital evolution populations},
journal = {Journal of Open Source Software}
}
Expand Down Expand Up @@ -68,4 +68,3 @@ Chicago:
MLA:

> Moreno, Matthew Andres et al. "Hereditary Stratigraphy: Genome Annotations to Enable Phylogenetic Inference over Distributed Populations." Artificial Life Conference Proceedings. 2022. https://doi.org/10.1162/isal_a_00550
12 changes: 12 additions & 0 deletions hstrat/_auxiliary_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@
from ._alifestd_assign_root_ancestor_token import (
alifestd_assign_root_ancestor_token,
)
from ._alifestd_calc_polytomic_index import alifestd_calc_polytomic_index
from ._alifestd_chronological_sort import alifestd_chronological_sort
from ._alifestd_coarsen_mask import alifestd_coarsen_mask
from ._alifestd_collapse_unifurcations import alifestd_collapse_unifurcations
from ._alifestd_convert_root_ancestor_token import (
alifestd_convert_root_ancestor_token,
)
from ._alifestd_count_inner_nodes import alifestd_count_inner_nodes
from ._alifestd_count_leaf_nodes import alifestd_count_leaf_nodes
from ._alifestd_count_polytomies import alifestd_count_polytomies
from ._alifestd_count_root_nodes import alifestd_count_root_nodes
from ._alifestd_count_unifurcations import alifestd_count_unifurcations
from ._alifestd_find_chronological_inconsistency import (
alifestd_find_chronological_inconsistency,
)
Expand Down Expand Up @@ -179,6 +185,12 @@
"alifestd_chronological_sort",
"alifestd_coarsen_mask",
"alifestd_collapse_unifurcations",
"alifestd_calc_polytomic_index",
"alifestd_count_inner_nodes",
"alifestd_count_leaf_nodes",
"alifestd_count_root_nodes",
"alifestd_count_polytomies",
"alifestd_count_unifurcations",
"alifestd_find_chronological_inconsistency",
"alifestd_find_leaf_ids",
"alifestd_find_mrca_id_asexual",
Expand Down
21 changes: 21 additions & 0 deletions hstrat/_auxiliary_lib/_alifestd_calc_polytomic_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pandas as pd

from ._alifestd_count_leaf_nodes import alifestd_count_leaf_nodes
from ._alifestd_count_root_nodes import alifestd_count_root_nodes
from ._alifestd_count_unifurcations import alifestd_count_unifurcations


def alifestd_calc_polytomic_index(phylogeny_df: pd.DataFrame) -> int:
    """Calculate the shortfall of inner nodes versus strict bifurcation.

    The result is the row count a strictly bifurcating phylogeny with the
    same number of leaves and roots would have, minus the number of rows in
    `phylogeny_df` that are not unifurcations.

    Unifurcations are excluded from the calculation. Exceptions raised by
    `alifestd_count_unifurcations` propagate to the caller.
    """
    leaf_count = alifestd_count_leaf_nodes(phylogeny_df)
    root_count = alifestd_count_root_nodes(phylogeny_df)
    # A strictly bifurcating tree with k leaves has 2k - 1 nodes, so a forest
    # has 2 * leaves - roots nodes in total; clamp so the value is never
    # negative.
    bifurcating_row_count = max(2 * leaf_count - root_count, 0)

    unifurcation_count = alifestd_count_unifurcations(phylogeny_df)
    observed_row_count = len(phylogeny_df) - unifurcation_count

    shortfall = bifurcating_row_count - observed_row_count
    # Sanity check: the shortfall is non-negative and strictly below the
    # bifurcating row count (or below 1 when that count is zero).
    upper_bound = max(bifurcating_row_count, 1)
    assert 0 <= shortfall < upper_bound
    return shortfall
15 changes: 15 additions & 0 deletions hstrat/_auxiliary_lib/_alifestd_count_inner_nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pandas as pd

from ._alifestd_count_leaf_nodes import alifestd_count_leaf_nodes


def alifestd_count_inner_nodes(
    phylogeny_df: pd.DataFrame,
    mutate: bool = False,
) -> int:
    """Return the number of rows in phylogeny that are not leaf nodes.

    Computed as the total row count minus the leaf count reported by
    `alifestd_count_leaf_nodes`. The `mutate` argument is not consulted by
    this implementation.
    """
    inner_count = len(phylogeny_df) - alifestd_count_leaf_nodes(phylogeny_df)
    # There can never be more leaves than rows.
    assert inner_count >= 0
    return inner_count
8 changes: 8 additions & 0 deletions hstrat/_auxiliary_lib/_alifestd_count_leaf_nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pandas as pd

from ._alifestd_find_leaf_ids import alifestd_find_leaf_ids


def alifestd_count_leaf_nodes(phylogeny_df: pd.DataFrame) -> int:
    """Return the number of leaf nodes in phylogeny.

    The count is the length of the collection returned by
    `alifestd_find_leaf_ids`.
    """
    leaf_ids = alifestd_find_leaf_ids(phylogeny_df)
    return len(leaf_ids)
19 changes: 19 additions & 0 deletions hstrat/_auxiliary_lib/_alifestd_count_polytomies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from collections import Counter

import pandas as pd

from ._alifestd_try_add_ancestor_id_col import alifestd_try_add_ancestor_id_col


def alifestd_count_polytomies(phylogeny_df: pd.DataFrame) -> int:
    """Count how many inner nodes have more than two descendant nodes.

    Only supports asexual phylogenies.

    Raises
    ------
    ValueError
        If no "ancestor_id" column is present after attempting to add one.
    """
    phylogeny_df = alifestd_try_add_ancestor_id_col(phylogeny_df)
    if "ancestor_id" not in phylogeny_df.columns:
        raise ValueError(
            "alifestd_count_polytomies only supports asexual phylogenies.",
        )
    # Root rows are marked by `ancestor_id == id`, the same convention
    # alifestd_count_unifurcations relies on. Mask them out so a root's
    # self-reference is not tallied as one of its children; otherwise a root
    # with exactly two children would be misreported as a polytomy.
    except_roots = phylogeny_df["ancestor_id"] != phylogeny_df["id"]
    ancestor_counts = Counter(phylogeny_df.loc[except_roots, "ancestor_id"])
    return sum(v > 2 for v in ancestor_counts.values())
13 changes: 13 additions & 0 deletions hstrat/_auxiliary_lib/_alifestd_count_root_nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import numpy as np
import pandas as pd


def alifestd_count_root_nodes(phylogeny_df: pd.DataFrame) -> int:
    """How many root nodes are contained in phylogeny?

    A row is counted as a root when the string form of its "ancestor_list"
    entry, compared case-insensitively, is "[none]" or "[]". Converting to
    string first lets both string-encoded and list-valued entries be
    handled.

    Returns
    -------
    int
        Number of root rows.
    """
    is_root = (
        phylogeny_df["ancestor_list"]
        .astype(str)
        .str.lower()
        .isin(("[none]", "[]"))
    )
    # Summing the boolean mask yields a numpy scalar; cast so callers get a
    # builtin int as the annotation promises.
    return int(is_root.sum())
20 changes: 20 additions & 0 deletions hstrat/_auxiliary_lib/_alifestd_count_unifurcations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from collections import Counter

import pandas as pd

from ._alifestd_try_add_ancestor_id_col import alifestd_try_add_ancestor_id_col


def alifestd_count_unifurcations(phylogeny_df: pd.DataFrame) -> int:
    """Return the number of inner nodes with exactly one descendant node.

    Only asexual phylogenies are supported.

    Raises
    ------
    ValueError
        If no "ancestor_id" column is present after attempting to add one.
    """
    phylogeny_df = alifestd_try_add_ancestor_id_col(phylogeny_df)
    if "ancestor_id" not in phylogeny_df.columns:
        raise ValueError(
            "alifestd_count_unifurcations only supports asexual phylogenies.",
        )

    # Rows whose ancestor_id equals their own id are treated as roots and
    # left out, so a root is not tallied as its own child.
    is_non_root = phylogeny_df["ancestor_id"] != phylogeny_df["id"]
    parent_ids = phylogeny_df.loc[is_non_root, "ancestor_id"]
    child_tally = Counter(parent_ids)
    return sum(1 for num_children in child_tally.values() if num_children == 1)
8 changes: 6 additions & 2 deletions hstrat/_auxiliary_lib/_alifestd_find_leaf_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,12 @@ def alifestd_find_leaf_ids(phylogeny_df: pd.DataFrame) -> typing.List[int]:
internal_ids = (
set(
ancestor_id
for ancestor_list_str in phylogeny_df["ancestor_list"]
for ancestor_id in alifestd_parse_ancestor_ids(ancestor_list_str)
for ancestor_list in phylogeny_df["ancestor_list"]
for ancestor_id in (
alifestd_parse_ancestor_ids(ancestor_list)
if isinstance(ancestor_list, str)
else ancestor_list
)
)
if "ancestor_id" not in phylogeny_df
else set(
Expand Down
5 changes: 4 additions & 1 deletion hstrat/_auxiliary_lib/_alifestd_find_root_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ def alifestd_find_root_ids(phylogeny_df: pd.DataFrame) -> np.array: # int
"""

root_df = phylogeny_df[
phylogeny_df["ancestor_list"].str.lower().isin(("[none]", "[]"))
phylogeny_df["ancestor_list"]
.astype(str)
.str.lower()
.isin(("[none]", "[]"))
]
return root_df["id"].to_numpy().copy()
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import pandas as pd
import pytest

from hstrat._auxiliary_lib import (
alifestd_calc_polytomic_index,
alifestd_make_empty,
)


def test_empty_df():
    """An empty phylogeny has a polytomic index of zero."""
    empty_df = alifestd_make_empty()
    assert alifestd_calc_polytomic_index(empty_df) == 0


def test_singleton_df():
    """A lone root node has a polytomic index of zero."""
    ids = [0]
    ancestor_lists = [[None]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_calc_polytomic_index(phylogeny_df) == 0


def test_polytomy_df1():
    """Root 0 has three children (1, 2, 3); node 1 has a single child."""
    ids = [0, 1, 2, 3, 4]
    ancestor_lists = [[None], [0], [0], [0], [1]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_calc_polytomic_index(phylogeny_df) == 1


def test_polytomy_df2():
    """Root 0 and node 1 each have three children."""
    ids = [0, 1, 2, 3, 4, 5, 6]
    ancestor_lists = [[None], [0], [0], [0], [1], [1], [1]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_calc_polytomic_index(phylogeny_df) == 2


def test_polytomy_df3():
    """Root 0 has three children; node 1 has four children."""
    ids = [0, 1, 2, 3, 4, 5, 6, 7]
    ancestor_lists = [[None], [0], [0], [0], [1], [1], [1], [1]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_calc_polytomic_index(phylogeny_df) == 3


def test_multiple_trees_df1():
    """Two roots (0, 1) with no node having more than two children."""
    ids = [0, 1, 2, 3, 4, 5]
    ancestor_lists = [[None], [None], [0], [2], [2], [3]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_calc_polytomic_index(phylogeny_df) == 0


def test_multiple_trees_df2():
    """Two roots; root 0 has three children and root 1 has one child."""
    ids = [0, 1, 2, 3, 4, 5]
    ancestor_lists = [[None], [None], [0], [1], [0], [0]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_calc_polytomic_index(phylogeny_df) == 1


def test_multiple_trees_df3():
    """Three roots (0, 1, 6); root 0 has three children, root 6 has none."""
    ids = [0, 1, 2, 3, 4, 5, 6]
    ancestor_lists = [[None], [None], [0], [1], [0], [0], [None]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_calc_polytomic_index(phylogeny_df) == 1


def test_sexual():
    """A phylogeny where node 2 has two ancestors raises ValueError."""
    ids = [0, 1, 2, 3, 5]
    ancestor_lists = [[None], [None], [0, 1], [1], [0]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    with pytest.raises(ValueError):
        alifestd_calc_polytomic_index(phylogeny_df)
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import pandas as pd

from hstrat._auxiliary_lib import (
alifestd_count_inner_nodes,
alifestd_make_empty,
)


def test_empty_df():
    """An empty phylogeny contains no inner nodes."""
    empty_df = alifestd_make_empty()
    assert alifestd_count_inner_nodes(empty_df) == 0


def test_singleton_df():
    """A lone root node is not counted as an inner node."""
    ids = [0]
    ancestor_lists = [[None]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_count_inner_nodes(phylogeny_df) == 0


def test_sexual_df1():
    """String-encoded ancestor lists; node 3 has two ancestors."""
    ids = [0, 1, 2, 3, 4]
    ancestor_lists = ["[None]", "[0]", "[0]", "[1, 0]", "[1]"]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_count_inner_nodes(phylogeny_df) == 2


def test_sexual_df2():
    """List-valued ancestor lists; node 3 has two ancestors."""
    ids = [0, 1, 2, 3, 4]
    ancestor_lists = [[None], [0], [0], [1, 0], [1]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_count_inner_nodes(phylogeny_df) == 2


def test_polytomy_df():
    """Root 0 has three children; nodes 0 and 1 are the inner nodes."""
    ids = [0, 1, 2, 3, 4]
    ancestor_lists = [[None], [0], [0], [0], [1]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_count_inner_nodes(phylogeny_df) == 2


def test_multiple_trees_df1():
    """Two roots; nodes 0, 2 and 3 have descendants, so three inner nodes."""
    ids = [0, 1, 2, 3, 4, 5]
    ancestor_lists = [[None], [None], [0], [2], [2], [3]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_count_inner_nodes(phylogeny_df) == 3


def test_multiple_trees_df2():
    """Two chains rooted at 0 and 1; nodes 0 through 3 are inner nodes."""
    ids = [0, 1, 2, 3, 4, 5]
    ancestor_lists = [[None], [None], [0], [1], [2], [3]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_count_inner_nodes(phylogeny_df) == 4


def test_strictly_bifurcating_df():
    """Root 0 and node 1 each have exactly two children."""
    ids = [0, 1, 2, 3, 4]
    ancestor_lists = [[None], [0], [0], [1], [1]]
    phylogeny_df = pd.DataFrame({"id": ids, "ancestor_list": ancestor_lists})
    assert alifestd_count_inner_nodes(phylogeny_df) == 2
Loading

0 comments on commit 8089ca5

Please sign in to comment.