Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve documentation and code of ancestry readers/writers. Solve variants_alt inconsistency. Rename phased to sum_strands and set default to False #12

Merged
merged 21 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
adc7c90
Implement save and save_pickle for LocalAncestryObject. Add support f…
miriambt Nov 18, 2024
b5a7db6
Improve docstring and code in LAIReader. Add support for .msp.tsv
miriambt Nov 18, 2024
576a3c5
Improve docstring and comments. Add expected MSP content
miriambt Nov 18, 2024
fb9721a
Improve local ancestry readers
miriambt Nov 18, 2024
f3bae66
Improve docstrings in local ancestry readers/writers
miriambt Nov 18, 2024
3494f91
Improve flexibility of MSPWriter. Only write available attributes. So…
miriambt Nov 18, 2024
9b0c19a
Make chm and lai/samples optional in MSPWriter
miriambt Nov 18, 2024
d1c1e4e
Write missing attributes as NaN for MSP and read as None
miriambt Nov 19, 2024
8aebefa
Improve AdmixtureMappingVCFWriter
miriambt Nov 19, 2024
3d75031
Improve global ancestry readers and writers
miriambt Nov 19, 2024
6edefd5
Add save and save_pickle to GlobalAncestryObject. Improve AdmixtureWr…
miriambt Nov 19, 2024
e1b10b3
Merge remote-tracking branch 'origin/main' into readers-writers
miriambt Nov 20, 2024
7603094
Fix inconsistency in variants_alt. (read always as 1-dimensional)
miriambt Nov 20, 2024
be9e7cc
Rename phased to sum_strands. Set default to False
miriambt Nov 20, 2024
75aaf0c
Fix bug accessing variants_alt in BEDWriter
miriambt Nov 20, 2024
29f7bc4
Update variants_alt in AdmixtureMappingVCFWriter
davidbonet Nov 20, 2024
dd82fae
variants_alt-related typos in SNPObject
davidbonet Nov 20, 2024
d5834f6
Uncomment tests for variants_alt
davidbonet Nov 20, 2024
6923775
Add test for variants_alt to match snpobj's n_snps
davidbonet Nov 20, 2024
60402b9
control number of variant alts in allel.read_vcf with alt_number
davidbonet Nov 20, 2024
dc1888a
Use append_biallelic in PGENWriter when summed_strands
davidbonet Nov 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/read_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def read_bed_snputils(path):
"""Read BED fileset using snputils"""
import snputils
return snputils.read_bed(path, phased=False, fields=["GT"]).calldata_gt
return snputils.read_bed(path, sum_strands=True, fields=["GT"]).calldata_gt


def read_bed_pgenlib(path):
Expand Down
2 changes: 1 addition & 1 deletion benchmark/read_pgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def read_pgen_snputils(path):
"""Read PGEN file using snputils"""
import snputils
return snputils.read_pgen(path, phased=False, fields=["GT"]).calldata_gt
return snputils.read_pgen(path, sum_strands=True, fields=["GT"]).calldata_gt


def read_pgen_pgenlib(path):
Expand Down
4 changes: 2 additions & 2 deletions benchmark/read_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
def read_vcf_snputils(path):
"""Read VCF file using snputils"""
import snputils
return snputils.read_vcf(path, phased=False).calldata_gt
return snputils.read_vcf(path, sum_strands=True).calldata_gt


def read_vcf_snputils_polars(path):
"""Read VCF file using snputils and polars"""
from snputils.snp.io.read.vcf import VCFReaderPolars
return VCFReaderPolars(path).read(fields=[], phased=False).calldata_gt
return VCFReaderPolars(path).read(fields=[], sum_strands=True).calldata_gt


def read_vcf_scikit_allel(path):
Expand Down
5 changes: 2 additions & 3 deletions demos/LAI_visualization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "50752deb-9a50-4725-90dc-0d10005e1bbb",
"metadata": {},
"outputs": [
Expand All @@ -27,7 +27,6 @@
"source": [
"import os\n",
"import sys\n",
"import matplotlib.pyplot as plt\n",
"import logging\n",
"\n",
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
Expand Down Expand Up @@ -59,7 +58,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.ancestry.io.local.read.msp:Reading msp file from '/home/miriam/Documents/snputils/data/lai.msp'...\n"
"INFO:snputils.ancestry.io.local.read.msp:Reading '/home/miriam/Documents/snputils/data/lai.msp'...\n"
]
}
],
Expand Down
12 changes: 6 additions & 6 deletions demos/PhenObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "325c0d03-45cb-4b5f-9420-439ad354ab4c",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -100,7 +100,7 @@
"3 HG00100 AFR"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -132,7 +132,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "9f7ea6c5-a53d-43c1-b998-83b446644572",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -182,7 +182,7 @@
"1 HG00097 EUR"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -203,7 +203,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "a3e4b46c-dc1b-471e-830f-d133816e74b4",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -253,7 +253,7 @@
"1 HG00099 AFR"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
70 changes: 53 additions & 17 deletions demos/SNPObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
"\n",
"# Read VCF into SNPObject with the standard reader\n",
"reader = VCFReader(query_path)\n",
"snpobj = reader.read(phased=True)\n",
"snpobj = reader.read(sum_strands=False)\n",
"\n",
"print(\"Attributes of the SNPObject:\", snpobj.keys())\n",
"\n",
Expand Down Expand Up @@ -248,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "dcf2c069",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -281,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "7c64f7ca",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -311,15 +311,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "30807670",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values before renaming missings: [0 1]\n",
"Unique genotype values before renaming missings: [0 1]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values after renaming missings: [0 1]\n"
]
}
Expand All @@ -346,7 +352,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "9beadb42",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -384,7 +390,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "4cf4d34e",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -416,7 +422,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"id": "89ae6d84",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -452,7 +458,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"id": "4fa41771",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -490,7 +496,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"id": "ba4d2066",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -722,7 +728,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 variant positions after shuffling: [23614817 40647956 44287560 41652214 42829912]\n"
"First 5 variant positions after shuffling: [23057061 26864842 29640589 30592114 33893258]\n"
]
}
],
Expand Down Expand Up @@ -790,7 +796,31 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 4,
"id": "3ebf53e5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n"
]
}
],
"source": [
"# Define the path to the VCF file\n",
"query_path = '../data/subset.vcf'\n",
"\n",
"# Read VCF into SNPObject with the standard reader\n",
"reader = VCFReader(query_path)\n",
"snpobj = reader.read(sum_strands=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "544b2955",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -827,15 +857,21 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 6,
"id": "08eff0b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pgen\n",
"SNPObject saved to ../data/output.pgen\n",
Expand Down Expand Up @@ -870,7 +906,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 7,
"id": "fbca1fd8",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -921,7 +957,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 8,
"id": "cea856ad",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -951,7 +987,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "5b3a0360",
"id": "300d5cae",
"metadata": {},
"outputs": [],
"source": []
Expand Down
2 changes: 1 addition & 1 deletion demos/SNP_PCA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"source": [
"# Load genotype data from VCF file\n",
"reader = VCFReader('../data/subset.vcf')\n",
"snpobj = reader.read(phased=True)"
"snpobj = reader.read(sum_strands=False)"
]
},
{
Expand Down
16 changes: 8 additions & 8 deletions demos/TorchPCA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 1,
"id": "2fd07cb3",
"metadata": {
"scrolled": true
Expand Down Expand Up @@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 2,
"id": "abc98c7c",
"metadata": {
"scrolled": true
Expand All @@ -67,7 +67,7 @@
"source": [
"# Load genotype data using VCFReader\n",
"reader = VCFReader('../data/subset.vcf')\n",
"snpobj = reader.read(phased=True)\n",
"snpobj = reader.read(sum_strands=False)\n",
"\n",
"# Extract genotype call data (GT) and display shape and data type\n",
"gt = snpobj.calldata_gt\n",
Expand Down Expand Up @@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 3,
"id": "fa643a80",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -137,7 +137,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 4,
"id": "dfd07a81",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -188,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 5,
"id": "0e1a6754",
"metadata": {},
"outputs": [
Expand All @@ -197,7 +197,7 @@
"output_type": "stream",
"text": [
"Using device: cpu\n",
"PCA completed. Data shape: torch.Size([4, 976599]), Time taken: 0.078 seconds\n",
"PCA completed. Data shape: torch.Size([4, 976599]), Time taken: 0.093 seconds\n",
"PCA result shape: torch.Size([4, 2])\n"
]
}
Expand Down Expand Up @@ -233,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"id": "65b4a232",
"metadata": {
"scrolled": true
Expand Down
2 changes: 1 addition & 1 deletion demos/admixture_mapping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "admixture",
"display_name": "galaxybio",
"language": "python",
"name": "python3"
},
Expand Down
Loading