-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8423ed4
commit 0c28230
Showing
6 changed files
with
379 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
# MolDF | ||
# Author: Ruibin Liu <ruibinliuphd@gmail.com> | ||
# License: MIT | ||
# Code Repository: https://github.com/Ruibin-Liu/MolDF | ||
"""JCSV format reading. | ||
Reads a JCSV file into a dict of ``Pandas DataFrame`` s. | ||
It is not limited to any molecular format. | ||
""" | ||
from __future__ import annotations | ||
|
||
import csv | ||
import os | ||
import warnings | ||
from collections import defaultdict | ||
|
||
import pandas as pd # type: ignore | ||
|
||
|
||
def read_jcsv(
    jcsv_file: str | os.PathLike,
    category_names: list | None = None,
) -> dict[str, pd.DataFrame]:
    """Reads a JCSV file by name.

    Currently no molecular file repository has JCSV files so we can only read from a
    file name/path.

    If the first line of the file is ``#jcsv_meta``, the meta table that follows
    (columns ``category`` and ``start_line_index``) is used to locate each category
    block, and every selected block is parsed with ``pandas.read_csv``. Otherwise
    the file is read line by line with ``_read_jcsv_by_line``.

    Args:
        jcsv_file (required): JCSV file name/path.
        category_names (optional): a list of category names. If ``None``, all categories
            are read. Defaults to **None**.

    Returns:
        a dict of Pandas DataFrames for each category.

    Raises:
        TypeError: if ``category_names`` is not a list of strings.
        ValueError: if any of the ``category_names`` has double quotes or
            if the number of items in any line does not match the number of
            column names in the same category.

    Warns:
        RuntimeWarning: if a requested category is not listed in the meta table.
    """
    read_all = category_names is None
    if category_names is not None:
        if not isinstance(category_names, list):
            raise TypeError(f"{category_names} is not a list")
        for cat in category_names:
            if not isinstance(cat, str):
                raise TypeError(f"{cat} is not a str")
            if '"' in cat:
                raise ValueError(f"{cat} has double quotes.")

    meta_found = False
    meta_col_names: list[str] | None = None
    # Initialised up front so a file holding only the ``#jcsv_meta`` line still works.
    col_data: dict[str, list] = defaultdict(list)
    with open(jcsv_file, "r", encoding="utf-8") as jf:
        jf_reader = csv.reader(jf, delimiter=",", quotechar='"')
        for i, row in enumerate(jf_reader):
            if i == 0:
                if not row or row[0] != "#jcsv_meta":
                    # No meta block: fall back to the sequential reader.
                    return _read_jcsv_by_line(
                        jcsv_file, category_names=category_names
                    )
                meta_found = True
                continue
            if not row:
                # csv.reader yields [] for blank lines; they carry no meta data.
                continue
            if meta_col_names is None:
                # First non-blank row after the marker names the meta columns.
                meta_col_names = row
                continue
            if row[0].startswith("#"):
                # Start of the first category block: the meta table is over.
                break
            if len(row) != len(meta_col_names):
                message = "Meta data has unmatched number"
                message += f" of items in row '{row}' with the column"
                message += f" names: {meta_col_names}"
                raise ValueError(message)
            for col_name, value in zip(meta_col_names, row):
                if col_name == "start_line_index":
                    col_data[col_name].append(int(value))
                else:
                    col_data[col_name].append(value)

    results: dict[str, pd.DataFrame] = {}
    if not meta_found:
        # Empty file: nothing to read.
        return results

    meta = list(zip(col_data["category"], col_data["start_line_index"]))
    # The first meta entry is skipped; in the layout this reader expects it is the
    # ``meta`` row that describes the meta table itself.
    blocks = meta[1:]
    for k, (category_name, start_line_index) in enumerate(blocks):
        if not (read_all or category_name in category_names):
            continue
        # ``start_line_index`` is the 0-based line of the block's column-name row.
        # The block stops right before the ``#<category>`` line of the next block,
        # which sits one line above that block's own column-name row.
        stop_line_index = blocks[k + 1][1] - 1 if k + 1 < len(blocks) else None
        # A callable ``skiprows`` needs no total line count, so the final line is
        # excluded correctly even when the file lacks a trailing newline.
        results[category_name] = pd.read_csv(
            jcsv_file,
            sep=",",
            quotechar='"',
            skiprows=lambda j, lo=start_line_index, hi=stop_line_index: (
                j < lo or (hi is not None and j >= hi)
            ),
        )

    if category_names is not None:
        for requested in category_names:
            if requested not in results:
                warnings.warn(
                    f"Category {requested} not in {jcsv_file}, not read",
                    RuntimeWarning,
                    stacklevel=2,
                )

    return results
|
||
|
||
def _read_jcsv_by_line( | ||
jcsv_file: str | os.PathLike, | ||
category_names: list | None = None, | ||
) -> dict[str, pd.DataFrame]: | ||
"""Reads JCSV file line by line when the file has no meta data to select blocks. | ||
Args: | ||
jcsv_file (required): JCSV file name/path. | ||
category_names (optional): a list of category names. If ``None``, all categories | ||
are read. Defaults to **None**. It is passed by the ``read_jcsv`` caller, so | ||
it is not sanitized here. | ||
Returns: | ||
a dict of Pandas DataFrames for each category. | ||
Raises: | ||
ValueError: if the number of items in any line does not match the number of | ||
column names in the same category. | ||
""" | ||
results: dict[str, pd.DataFrame] = {} | ||
|
||
read_all = False | ||
if category_names is None: | ||
read_all = True | ||
with open(jcsv_file, "r", encoding="utf-8") as jf: | ||
jf_reader = csv.reader(jf, delimiter=",", quotechar='"') | ||
req_cat_name_found: int | bool = False | ||
category_name = "" | ||
cols_data: dict[str, list] = defaultdict(list) | ||
if isinstance(category_names, list): | ||
category_names = list(category_names) | ||
for i, row in enumerate(jf_reader): | ||
if row[0][0] == "#": | ||
if read_all or ( | ||
isinstance(category_names, list) and row[0][1:] in category_names | ||
): | ||
if category_name: | ||
results[category_name] = pd.DataFrame(cols_data) | ||
|
||
category_name = row[0][1:] | ||
cols_data = defaultdict(list) | ||
req_cat_name_found = i + 1 | ||
else: | ||
req_cat_name_found = False | ||
elif req_cat_name_found and i == req_cat_name_found: | ||
col_names = row | ||
elif req_cat_name_found: | ||
if len(row) != len(col_names): | ||
message = f"Category {category_name} has unmatched number" | ||
message += f" of items in row '{row}' with the column" | ||
message += f" names: {col_names}" | ||
raise ValueError(message) | ||
for col_name, value in zip(col_names, row): | ||
cols_data[col_name].append(value) | ||
|
||
if category_name: | ||
results[category_name] = pd.DataFrame(cols_data) | ||
|
||
if category_names is not None: | ||
for category_name in category_names: | ||
if category_name not in results: | ||
warnings.warn( | ||
"Category {category_name} not in {jcsv_file}, not read", | ||
RuntimeWarning, | ||
stacklevel=2, | ||
) | ||
|
||
return results | ||
|
||
|
||
def _count_n_lines(file_name: str | os.PathLike): | ||
"""Gets the number of lines in a file. | ||
From https://stackoverflow.com/a/68385697/10094189 | ||
Args: | ||
file_name (required): file name or path. | ||
""" | ||
|
||
def _make_gen(reader): | ||
while True: | ||
b = reader(2**16) | ||
if not b: | ||
break | ||
yield b | ||
|
||
with open(file_name, "rb") as f: | ||
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read)) | ||
|
||
return count |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
#jcsv_meta | ||
category,start_line_index | ||
meta,1 | ||
_chem_comp,10 | ||
_chem_comp_atom,13 | ||
_chem_comp_bond,36 | ||
_pdbx_chem_comp_descriptor,59 | ||
_pdbx_chem_comp_identifier,68 | ||
_pdbx_chem_comp_audit,72 | ||
#_chem_comp | ||
id,name,type,pdbx_type,formula,mon_nstd_parent_comp_id,pdbx_synonyms,pdbx_formal_charge,pdbx_initial_date,pdbx_modified_date,pdbx_ambiguous_flag,pdbx_release_status,pdbx_replaced_by,pdbx_replaces,formula_weight,one_letter_code,three_letter_code,pdbx_model_coordinates_details,pdbx_model_coordinates_missing_flag,pdbx_ideal_coordinates_details,pdbx_ideal_coordinates_missing_flag,pdbx_model_coordinates_db_code,pdbx_subcomponent_list,pdbx_processing_site | ||
HIS,HISTIDINE,L-PEPTIDE LINKING,ATOMP,C6 H10 N3 O2,?,?,1,1999-07-08,2011-06-04,N,REL,?,?,156.162,H,HIS,?,N,OpenEye/OEToolkits V1.4.2,N,?,?,EBI | ||
#_chem_comp_atom | ||
comp_id,atom_id,alt_atom_id,type_symbol,charge,pdbx_align,pdbx_aromatic_flag,pdbx_leaving_atom_flag,pdbx_stereo_config,model_Cartn_x,model_Cartn_y,model_Cartn_z,pdbx_model_Cartn_x_ideal,pdbx_model_Cartn_y_ideal,pdbx_model_Cartn_z_ideal,pdbx_component_atom_id,pdbx_component_comp_id,pdbx_ordinal | ||
HIS,N,N,N,0,1,N,N,N,33.472,42.685,-4.610,-0.040,-1.210,0.053,N,HIS,1 | ||
HIS,CA,CA,C,0,1,N,N,S,33.414,41.686,-5.673,1.172,-1.709,0.652,CA,HIS,2 | ||
HIS,C,C,C,0,1,N,N,N,33.773,42.279,-7.040,1.083,-3.207,0.905,C,HIS,3 | ||
HIS,O,O,O,0,1,N,N,N,33.497,43.444,-7.337,0.040,-3.770,1.222,O,HIS,4 | ||
HIS,CB,CB,C,0,1,N,N,N,32.005,41.080,-5.734,1.484,-0.975,1.962,CB,HIS,5 | ||
HIS,CG,CG,C,0,1,Y,N,N,31.888,39.902,-6.651,2.940,-1.060,2.353,CG,HIS,6 | ||
HIS,ND1,ND1,N,1,1,Y,N,N,32.539,38.710,-6.414,3.380,-2.075,3.129,ND1,HIS,7 | ||
HIS,CD2,CD2,C,0,1,Y,N,N,31.199,39.734,-7.804,3.960,-0.251,2.046,CD2,HIS,8 | ||
HIS,CE1,CE1,C,0,1,Y,N,N,32.251,37.857,-7.382,4.693,-1.908,3.317,CE1,HIS,9 | ||
HIS,NE2,NE2,N,0,1,Y,N,N,31.439,38.453,-8.237,5.058,-0.801,2.662,NE2,HIS,10 | ||
HIS,OXT,OXT,O,0,1,N,Y,N,34.382,41.455,-7.879,2.247,-3.882,0.744,OXT,HIS,11 | ||
HIS,H,H,H,0,1,N,N,N,33.485,42.227,-3.721,-0.102,-1.155,-0.950,H,HIS,12 | ||
HIS,H2,HN2,H,0,1,N,Y,N,34.301,43.234,-4.714,-0.715,-0.741,0.634,H2,HIS,13 | ||
HIS,HA,HA,H,0,1,N,N,N,34.155,40.908,-5.439,1.965,-1.558,-0.089,HA,HIS,14 | ||
HIS,HB2,1HB,H,0,1,N,N,N,31.733,40.750,-4.721,1.215,0.087,1.879,HB2,HIS,15 | ||
HIS,HB3,2HB,H,0,1,N,N,N,31.337,41.860,-6.127,0.859,-1.368,2.775,HB3,HIS,16 | ||
HIS,HD1,HD1,H,0,1,N,N,N,33.135,38.521,-5.633,2.828,-2.838,3.511,HD1,HIS,17 | ||
HIS,HD2,HD2,H,0,1,N,N,N,30.577,40.470,-8.292,4.108,0.647,1.479,HD2,HIS,18 | ||
HIS,HE1,HE1,H,0,1,N,N,N,32.618,36.844,-7.461,5.340,-2.550,3.892,HE1,HIS,19 | ||
HIS,HE2,HE2,H,0,1,N,N,N,31.061,38.039,-9.065,6.002,-0.428,2.627,HE2,HIS,20 | ||
HIS,HXT,HXT,H,0,1,N,Y,N,34.553,41.905,-8.698,2.188,-4.848,0.901,HXT,HIS,21 | ||
#_chem_comp_bond | ||
comp_id,atom_id_1,atom_id_2,value_order,pdbx_aromatic_flag,pdbx_stereo_config,pdbx_ordinal | ||
HIS,N,CA,SING,N,N,1 | ||
HIS,N,H,SING,N,N,2 | ||
HIS,N,H2,SING,N,N,3 | ||
HIS,CA,C,SING,N,N,4 | ||
HIS,CA,CB,SING,N,N,5 | ||
HIS,CA,HA,SING,N,N,6 | ||
HIS,C,O,DOUB,N,N,7 | ||
HIS,C,OXT,SING,N,N,8 | ||
HIS,CB,CG,SING,N,N,9 | ||
HIS,CB,HB2,SING,N,N,10 | ||
HIS,CB,HB3,SING,N,N,11 | ||
HIS,CG,ND1,SING,Y,N,12 | ||
HIS,CG,CD2,DOUB,Y,N,13 | ||
HIS,ND1,CE1,DOUB,Y,N,14 | ||
HIS,ND1,HD1,SING,N,N,15 | ||
HIS,CD2,NE2,SING,Y,N,16 | ||
HIS,CD2,HD2,SING,N,N,17 | ||
HIS,CE1,NE2,SING,Y,N,18 | ||
HIS,CE1,HE1,SING,N,N,19 | ||
HIS,NE2,HE2,SING,N,N,20 | ||
HIS,OXT,HXT,SING,N,N,21 | ||
#_pdbx_chem_comp_descriptor | ||
comp_id,type,program,program_version,descriptor | ||
HIS,SMILES,ACDLabs,10.04,O=C(O)C(N)Cc1cnc[nH+]1 | ||
HIS,SMILES_CANONICAL,CACTVS,3.341,N[C@@H](Cc1c[nH]c[nH+]1)C(O)=O | ||
HIS,SMILES,CACTVS,3.341,N[CH](Cc1c[nH]c[nH+]1)C(O)=O | ||
HIS,SMILES_CANONICAL,OpenEye OEToolkits,1.5.0,c1c([nH+]c[nH]1)C[C@@H](C(=O)O)N | ||
HIS,SMILES,OpenEye OEToolkits,1.5.0,c1c([nH+]c[nH]1)CC(C(=O)O)N | ||
HIS,InChI,InChI,1.03,"InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h2-3,5H,1,7H2,(H,8,9)(H,10,11)/p+1/t5-/m0/s1" | ||
HIS,InChIKey,InChI,1.03,HNDVDQJCIGZPNO-YFKPBYRVSA-O | ||
#_pdbx_chem_comp_identifier | ||
comp_id,type,program,program_version,identifier | ||
HIS,SYSTEMATIC NAME,ACDLabs,10.04,3-(1H-imidazol-3-ium-4-yl)-L-alanine | ||
HIS,SYSTEMATIC NAME,OpenEye OEToolkits,1.5.0,(2S)-2-amino-3-(1H-imidazol-3-ium-4-yl)propanoic acid | ||
#_pdbx_chem_comp_audit | ||
comp_id,action_type,date,processing_site | ||
HIS,Create component,1999-07-08,EBI | ||
HIS,Modify descriptor,2011-06-04,RCSB |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# MolDF | ||
# Author: Ruibin Liu <ruibinliuphd@gmail.com> | ||
# License: MIT | ||
# Code Repository: https://github.com/Ruibin-Liu/MolDF | ||
"""Tests for reading jcsv files.""" | ||
import os | ||
import sys | ||
|
||
from moldf.read_jcsv import read_jcsv | ||
|
||
# Add the parent of the working directory to the module search path.
sys.path.append("..")
# Absolute directory of this test module, used to locate the fixture files.
# ``abspath`` keeps it valid even when ``__file__`` is a bare relative file name,
# for which ``dirname`` alone would return an empty string.
CFD = os.path.dirname(os.path.abspath(__file__))
|
||
|
||
def test_read_jcsv_nometa():
    """
    Test read_jcsv function without meta in jcsv
    """
    # ``os.path.join`` stays correct even if ``CFD`` is an empty string, whereas
    # joining with ``os.sep`` would then produce a path anchored at the root.
    test_file = os.path.join(CFD, "test_files", "pdbx_moldf.jcsv")
    jcsv = read_jcsv(test_file)
    # Expected (rows, columns) shape of each category in the fixture.
    cat_size = {
        "_chem_comp": (1, 24),
        "_chem_comp_atom": (21, 18),
        "_chem_comp_bond": (21, 7),
        "_pdbx_chem_comp_descriptor": (7, 5),
        "_pdbx_chem_comp_identifier": (2, 5),
        "_pdbx_chem_comp_audit": (2, 4),
    }
    # Comparing the key sets first means an unexpected category fails the
    # assertion instead of raising KeyError in the loop below.
    assert set(jcsv) == set(cat_size), "Not all categories are read."
    for k, expected_shape in cat_size.items():
        assert expected_shape == jcsv[k].shape, f"Category {k} read incorrectly"
|
||
|
||
def test_read_jcsv_meta():
    """
    Test read_jcsv function with meta in jcsv
    """
    # ``os.path.join`` stays correct even if ``CFD`` is an empty string, whereas
    # joining with ``os.sep`` would then produce a path anchored at the root.
    test_file = os.path.join(CFD, "test_files", "pdbx_moldf_meta.jcsv")
    jcsv = read_jcsv(test_file)
    # Expected (rows, columns) shape of each category in the fixture.
    cat_size = {
        "_chem_comp": (1, 24),
        "_chem_comp_atom": (21, 18),
        "_chem_comp_bond": (21, 7),
        "_pdbx_chem_comp_descriptor": (7, 5),
        "_pdbx_chem_comp_identifier": (2, 5),
        "_pdbx_chem_comp_audit": (2, 4),
    }
    # Comparing the key sets first means an unexpected category fails the
    # assertion instead of raising KeyError in the loop below.
    assert set(jcsv) == set(cat_size), "Not all categories are read."
    for k, expected_shape in cat_size.items():
        assert expected_shape == jcsv[k].shape, f"Category {k} read incorrectly"
Oops, something went wrong.