Skip to content

Commit

Permalink
Rename missing values to '.' by default in BED, PGEN, and VCF writing
Browse files Browse the repository at this point in the history
  • Loading branch information
miriambt committed Dec 28, 2024
1 parent 4e71335 commit fc9a5da
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 67 deletions.
90 changes: 30 additions & 60 deletions demos/SNPObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "56fe8a53",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -155,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "f85111f1",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -191,7 +191,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "cd2e476c-e47e-44ea-b6fa-fa6ebfdc29b9",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -220,7 +220,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"id": "1e1f08b5",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -248,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "dcf2c069",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -281,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "7c64f7ca",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -311,21 +311,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"id": "30807670",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values before renaming missings: [0 1]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values before renaming missings: [0 1]\n",
"Unique genotype values after renaming missings: [0 1]\n"
]
}
Expand All @@ -352,21 +346,15 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"id": "9beadb42",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering: 976599\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering: 976599\n",
"Number of SNPs after filtering: 0\n"
]
}
Expand All @@ -390,7 +378,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"id": "4cf4d34e",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -428,7 +416,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"id": "89ae6d84",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -464,21 +452,15 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"id": "4fa41771",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n",
"Samples after filtering: ['sample_A' 'sample_B']\n"
]
}
Expand All @@ -502,21 +484,15 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"id": "ba4d2066",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n",
"Samples after filtering: ['sample_B' 'sample_C']\n"
]
}
Expand Down Expand Up @@ -544,7 +520,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"id": "b6951a98",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -580,7 +556,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"id": "ee7a73f1",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -612,7 +588,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"id": "889be954",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -653,7 +629,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"id": "b956538b",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -686,7 +662,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"id": "850ea83e",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -725,7 +701,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"id": "aadc12bf",
"metadata": {},
"outputs": [
Expand All @@ -734,7 +710,7 @@
"output_type": "stream",
"text": [
"First 5 variant positions before shuffling: [5033871 5033884 5033887 5035658 5038298]\n",
"First 5 variant positions after shuffling: [16943014 15089462 45897576 28971451 36528369]\n"
"First 5 variant positions after shuffling: [40064132 19950752 21721370 46148286 36956322]\n"
]
}
],
Expand All @@ -758,7 +734,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 22,
"id": "17a44f89",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -808,7 +784,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 23,
"id": "3ebf53e5",
"metadata": {},
"outputs": [
Expand All @@ -832,7 +808,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 24,
"id": "544b2955",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -889,21 +865,15 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 25,
"id": "08eff0b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pgen\n",
"SNPObject saved to ../data/output.pgen\n",
Expand Down Expand Up @@ -938,7 +908,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 26,
"id": "fbca1fd8",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -989,7 +959,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 27,
"id": "cea856ad",
"metadata": {},
"outputs": [
Expand Down
27 changes: 24 additions & 3 deletions snputils/snp/io/write/bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pgenlib as pg
from pathlib import Path
from typing import Union

from snputils.snp.genobj import SNPObject

Expand All @@ -22,15 +23,35 @@ def __init__(self, snpobj: SNPObject, filename: str):
self.__snpobj = snpobj.copy()
self.__filename = Path(filename)

def write(self):
"""Writes the SNPObject to bed/bim/fam formats."""

def write(
self,
rename_missing_values: bool = True,
before: Union[int, float, str] = -1,
after: Union[int, float, str] = '.'
):
"""
Writes the SNPObject to bed/bim/fam formats.
Args:
rename_missing_values (bool, optional):
If True, renames potential missing values in `snpobj.calldata_gt` before writing.
Defaults to True.
before (int, float, or str, default=-1):
The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
Default is -1.
after (int, float, or str, default='.'):
The value that will replace `before`. Default is '.'.
"""
# Save .bed file
if self.__filename.suffix != '.bed':
self.__filename = self.__filename.with_suffix('.bed')

log.info(f"Writing .bed file: {self.__filename}")

# Optionally rename potential missing values in `snpobj.calldata_gt` before writing
if rename_missing_values:
self.__snpobj.rename_missings(before=before, after=after, inplace=True)

# If the input matrix has three dimensions, it indicates that the data is divided into two strands.
if len(self.__snpobj.calldata_gt.shape) == 3:
# Sum the two strands
Expand Down
24 changes: 22 additions & 2 deletions snputils/snp/io/write/pgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pgenlib as pg
from pathlib import Path
import zstandard as zstd
from typing import Union

from snputils.snp.genobj.snpobj import SNPObject

Expand All @@ -26,17 +27,36 @@ def __init__(self, snpobj: SNPObject, filename: str):
self.__snpobj = snpobj
self.__filename = Path(filename)

def write(self, vzs: bool = False):
def write(
self,
vzs: bool = False,
rename_missing_values: bool = True,
before: Union[int, float, str] = -1,
after: Union[int, float, str] = '.'
):
"""
Writes the SNPObject data to .pgen, .psam, and .pvar files.
Args:
vzs (bool, optional): If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
vzs (bool, optional):
If True, compresses the .pvar file using zstd and saves it as .pvar.zst. Defaults to False.
rename_missing_values (bool, optional):
If True, renames potential missing values in `snpobj.calldata_gt` before writing.
Defaults to True.
before (int, float, or str, default=-1):
The current representation of missing values in `calldata_gt`. Common values might be -1, '.', or NaN.
Default is -1.
after (int, float, or str, default='.'):
The value that will replace `before`. Default is '.'.
"""
file_extensions = (".pgen", ".psam", ".pvar", ".pvar.zst")
if self.__filename.suffix in file_extensions:
self.__filename = self.__filename.with_suffix('')

# Optionally rename potential missing values in `snpobj.calldata_gt` before writing
if rename_missing_values:
self.__snpobj.rename_missings(before=before, after=after, inplace=True)

self.write_pvar(vzs=vzs)
self.write_psam()
self.write_pgen()
Expand Down
Loading

0 comments on commit fc9a5da

Please sign in to comment.