
Commit

read and write to JCSV
Ruibin-Liu committed Oct 9, 2023
1 parent 8423ed4 commit 0c28230
Showing 6 changed files with 379 additions and 3 deletions.
7 changes: 7 additions & 0 deletions docs/api.rst
@@ -26,6 +26,13 @@ MOL2 Reader
:members:
:private-members:

JCSV Reader
-------------------------

.. automodule:: moldf.read_jcsv
:members:
:private-members:

PDBDataFrame Class
-----------------------------
.. _PDBDataFrame:
191 changes: 191 additions & 0 deletions moldf/read_jcsv.py
@@ -0,0 +1,191 @@
# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""JCSV format reading.
Reads a JCSV file into a dict of ``Pandas DataFrame`` s.
It is not limited to any molecular format.
"""
from __future__ import annotations

import csv
import os
import warnings
from collections import defaultdict

import pandas as pd # type: ignore


def read_jcsv(
jcsv_file: str | os.PathLike,
category_names: list | None = None,
) -> dict[str, pd.DataFrame]:
"""Reads a JCSV file by name.
Currently no molecular file repository has JCSV files so we can only read from a
file name/path.
Args:
jcsv_file (required): JCSV file name/path.
category_names (optional): a list of category names. If ``None``, all categories
are read. Defaults to **None**.
Returns:
a dict of Pandas DataFrames for each category.
Raises:
TypeError: if ``category_names`` is not a list of strings.
ValueError: if any of the ``category_names`` has double quotes or
if the number of items in any line does not match the number of
column names in the same category.
"""
read_all = False
if category_names is not None:
if not isinstance(category_names, list):
raise TypeError(f"{category_names} is not a list")
for cat in category_names:
if not isinstance(cat, str):
raise TypeError(f"{cat} is not a str")
elif '"' in cat:
raise ValueError(f"{cat} has double quotes.")
else:
read_all = True

results: dict[str, pd.DataFrame] = {}
meta_found: bool | int = False
with open(jcsv_file, "r") as jf:
jf_reader = csv.reader(jf, delimiter=",", quotechar='"')
for i, row in enumerate(jf_reader):
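            # A leading "#jcsv_meta" row means the file carries a meta block
            # that maps each category to its start line in the file.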
if i == 0 and row[0] == "#jcsv_meta":
meta_found = i + 1
n_lines = _count_n_lines(jcsv_file)
elif not meta_found:
results = _read_jcsv_by_line(jcsv_file, category_names=category_names)
break
elif meta_found and i == meta_found:
meta_col_names = row
col_data: dict[str, list] = defaultdict(list)
elif row[0][0] == "#":
break
elif meta_found:
if len(row) != len(meta_col_names):
message = "Meta data has unmatched number"
message += f" of items in row '{row}' with the column"
message += f" names: {meta_col_names}"
raise ValueError(message)
value: str | int = ""
for col_name, value in zip(meta_col_names, row):
if col_name == "start_line_index":
value = int(value)
col_data[col_name].append(value)
if meta_found:
meta = list(zip(col_data["category"], col_data["start_line_index"]))
start_line_index: int | str = 0
for i, (category_name, start_line_index) in enumerate(meta[1:]):
if read_all or (
isinstance(category_names, list) and category_name in category_names
):
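                # Skip every row before this category's column-name row (the meta
                # block, earlier categories, and the "#category" header itself);
                # rows from the next category's header onward are skipped below
                # when a later category follows.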
start_line_index = int(start_line_index)
skip_rows = [j for j in range(start_line_index)]
if i < len(meta) - 2:
next_start = int(meta[i + 2][1])
ending_rows = [j for j in range(n_lines) if j > (next_start - 2)]
skip_rows.extend(ending_rows)
results[category_name] = pd.read_csv(
jcsv_file, sep=",", quotechar='"', skiprows=skip_rows
)

return results


def _read_jcsv_by_line(
jcsv_file: str | os.PathLike,
category_names: list | None = None,
) -> dict[str, pd.DataFrame]:
"""Reads JCSV file line by line when the file has no meta data to select blocks.
Args:
jcsv_file (required): JCSV file name/path.
category_names (optional): a list of category names. If ``None``, all categories
are read. Defaults to **None**. It is passed by the ``read_jcsv`` caller, so
it is not sanitized here.
Returns:
a dict of Pandas DataFrames for each category.
Raises:
ValueError: if the number of items in any line does not match the number of
column names in the same category.
"""
results: dict[str, pd.DataFrame] = {}

read_all = False
if category_names is None:
read_all = True
with open(jcsv_file, "r", encoding="utf-8") as jf:
jf_reader = csv.reader(jf, delimiter=",", quotechar='"')
req_cat_name_found: int | bool = False
category_name = ""
cols_data: dict[str, list] = defaultdict(list)
if isinstance(category_names, list):
category_names = list(category_names)
for i, row in enumerate(jf_reader):
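            # A row starting with "#" marks the start of a new category block.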
if row[0][0] == "#":
if read_all or (
isinstance(category_names, list) and row[0][1:] in category_names
):
if category_name:
results[category_name] = pd.DataFrame(cols_data)

category_name = row[0][1:]
cols_data = defaultdict(list)
req_cat_name_found = i + 1
else:
req_cat_name_found = False
elif req_cat_name_found and i == req_cat_name_found:
col_names = row
elif req_cat_name_found:
if len(row) != len(col_names):
message = f"Category {category_name} has unmatched number"
message += f" of items in row '{row}' with the column"
message += f" names: {col_names}"
raise ValueError(message)
for col_name, value in zip(col_names, row):
cols_data[col_name].append(value)

if category_name:
results[category_name] = pd.DataFrame(cols_data)

if category_names is not None:
for category_name in category_names:
if category_name not in results:
warnings.warn(
"Category {category_name} not in {jcsv_file}, not read",
RuntimeWarning,
stacklevel=2,
)

return results


def _count_n_lines(file_name: str | os.PathLike):
"""Gets the number of lines in a file.
From https://stackoverflow.com/a/68385697/10094189
Args:
file_name (required): file name or path.
"""

def _make_gen(reader):
while True:
b = reader(2**16)
if not b:
break
yield b

with open(file_name, "rb") as f:
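        # Sum newline bytes over 64 KiB chunks; a file without a trailing
        # newline reports one line fewer.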
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))

return count
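
A minimal usage sketch for the reader above; the path points at the JCSV fixture added in this commit (relative to the repository root), and the printed shape matches the expectations in the new tests:

from moldf.read_jcsv import read_jcsv

# Read every category in the file into a dict of DataFrames.
all_categories = read_jcsv("tests/test_files/pdbx_moldf_meta.jcsv")

# Or read only selected categories.
bonds_only = read_jcsv(
    "tests/test_files/pdbx_moldf_meta.jcsv",
    category_names=["_chem_comp_bond"],
)
print(bonds_only["_chem_comp_bond"].shape)  # (21, 7) for the HIS fixture
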
22 changes: 22 additions & 0 deletions moldf/write_jcsv.py
@@ -13,6 +13,7 @@
def write_jcsv(
data: dict[str, pd.DataFrame],
file_name: str | os.PathLike | None = None,
write_meta: bool = True,
**kwargs,
) -> None:
"""Write a dict of ``Pandas DataFrame`` s into a JCSV file.
@@ -23,11 +24,15 @@ def write_jcsv(
file_name (optional): file name to write a JCSV file. If ``None``,
``moldf_output.jcsv`` will be used as the file name if ``path_or_buf`` is not
specified in ``**kwargs``. Defaults to **None**.
        write_meta (optional): whether to write a metadata block as the **first**
            category. Currently, only the starting line number of each category
            is recorded. Defaults to **True**.
**kwargs: keyword arguments for ``pd.DataFrame.to_csv``. Invalid ones are ignored.
Check https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
Raises:
TypeError: if ``data`` is not a valid dict of ``DataFrame``.
ValueError: if ``"`` in any of the column names.
"""
if file_name is None:
file_name = kwargs.get("path_or_buf")
@@ -55,6 +60,23 @@
line_terminator = os.linesep

with open(file_name, "w", encoding="utf-8") as out_file:
if write_meta:
meta_line = f"#jcsv_meta{line_terminator}"
out_file.write(meta_line)
out_file.write(f"category,start_line_index{line_terminator}")
out_file.write(f"meta,1{line_terminator}") # just to be self-consistent
last_cat_line_length = len(data) + 3
for key, df in data.items():
if "'" in key:
key = f'"{key}"'
elif '"' in key:
message = '" is not supported in category names, but'
message += f" {key} has it."
raise ValueError(message)

out_file.write(f"{key},{last_cat_line_length+1}{line_terminator}")
last_cat_line_length += len(df) + 2

for key, df in data.items():
key_line = f"#{key}{line_terminator}"
out_file.write(key_line)
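
A minimal round-trip sketch of the new write_meta option, assuming ``write_jcsv`` and ``read_jcsv`` are imported from the modules changed in this commit; the file name and DataFrame contents are illustrative, and default ``to_csv`` options are assumed:

import pandas as pd
from moldf.read_jcsv import read_jcsv
from moldf.write_jcsv import write_jcsv

# Two illustrative categories; any dict of DataFrames works.
data = {
    "atoms": pd.DataFrame({"atom_id": ["N", "CA"], "charge": [0, 0]}),
    "bonds": pd.DataFrame({"atom_id_1": ["N"], "atom_id_2": ["CA"]}),
}
write_jcsv(data, file_name="example.jcsv", write_meta=True)  # meta block written first
round_trip = read_jcsv("example.jcsv")
assert set(round_trip) == {"atoms", "bonds"}
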
75 changes: 75 additions & 0 deletions tests/test_files/pdbx_moldf_meta.jcsv
@@ -0,0 +1,75 @@
#jcsv_meta
category,start_line_index
meta,1
_chem_comp,10
_chem_comp_atom,13
_chem_comp_bond,36
_pdbx_chem_comp_descriptor,59
_pdbx_chem_comp_identifier,68
_pdbx_chem_comp_audit,72
#_chem_comp
id,name,type,pdbx_type,formula,mon_nstd_parent_comp_id,pdbx_synonyms,pdbx_formal_charge,pdbx_initial_date,pdbx_modified_date,pdbx_ambiguous_flag,pdbx_release_status,pdbx_replaced_by,pdbx_replaces,formula_weight,one_letter_code,three_letter_code,pdbx_model_coordinates_details,pdbx_model_coordinates_missing_flag,pdbx_ideal_coordinates_details,pdbx_ideal_coordinates_missing_flag,pdbx_model_coordinates_db_code,pdbx_subcomponent_list,pdbx_processing_site
HIS,HISTIDINE,L-PEPTIDE LINKING,ATOMP,C6 H10 N3 O2,?,?,1,1999-07-08,2011-06-04,N,REL,?,?,156.162,H,HIS,?,N,OpenEye/OEToolkits V1.4.2,N,?,?,EBI
#_chem_comp_atom
comp_id,atom_id,alt_atom_id,type_symbol,charge,pdbx_align,pdbx_aromatic_flag,pdbx_leaving_atom_flag,pdbx_stereo_config,model_Cartn_x,model_Cartn_y,model_Cartn_z,pdbx_model_Cartn_x_ideal,pdbx_model_Cartn_y_ideal,pdbx_model_Cartn_z_ideal,pdbx_component_atom_id,pdbx_component_comp_id,pdbx_ordinal
HIS,N,N,N,0,1,N,N,N,33.472,42.685,-4.610,-0.040,-1.210,0.053,N,HIS,1
HIS,CA,CA,C,0,1,N,N,S,33.414,41.686,-5.673,1.172,-1.709,0.652,CA,HIS,2
HIS,C,C,C,0,1,N,N,N,33.773,42.279,-7.040,1.083,-3.207,0.905,C,HIS,3
HIS,O,O,O,0,1,N,N,N,33.497,43.444,-7.337,0.040,-3.770,1.222,O,HIS,4
HIS,CB,CB,C,0,1,N,N,N,32.005,41.080,-5.734,1.484,-0.975,1.962,CB,HIS,5
HIS,CG,CG,C,0,1,Y,N,N,31.888,39.902,-6.651,2.940,-1.060,2.353,CG,HIS,6
HIS,ND1,ND1,N,1,1,Y,N,N,32.539,38.710,-6.414,3.380,-2.075,3.129,ND1,HIS,7
HIS,CD2,CD2,C,0,1,Y,N,N,31.199,39.734,-7.804,3.960,-0.251,2.046,CD2,HIS,8
HIS,CE1,CE1,C,0,1,Y,N,N,32.251,37.857,-7.382,4.693,-1.908,3.317,CE1,HIS,9
HIS,NE2,NE2,N,0,1,Y,N,N,31.439,38.453,-8.237,5.058,-0.801,2.662,NE2,HIS,10
HIS,OXT,OXT,O,0,1,N,Y,N,34.382,41.455,-7.879,2.247,-3.882,0.744,OXT,HIS,11
HIS,H,H,H,0,1,N,N,N,33.485,42.227,-3.721,-0.102,-1.155,-0.950,H,HIS,12
HIS,H2,HN2,H,0,1,N,Y,N,34.301,43.234,-4.714,-0.715,-0.741,0.634,H2,HIS,13
HIS,HA,HA,H,0,1,N,N,N,34.155,40.908,-5.439,1.965,-1.558,-0.089,HA,HIS,14
HIS,HB2,1HB,H,0,1,N,N,N,31.733,40.750,-4.721,1.215,0.087,1.879,HB2,HIS,15
HIS,HB3,2HB,H,0,1,N,N,N,31.337,41.860,-6.127,0.859,-1.368,2.775,HB3,HIS,16
HIS,HD1,HD1,H,0,1,N,N,N,33.135,38.521,-5.633,2.828,-2.838,3.511,HD1,HIS,17
HIS,HD2,HD2,H,0,1,N,N,N,30.577,40.470,-8.292,4.108,0.647,1.479,HD2,HIS,18
HIS,HE1,HE1,H,0,1,N,N,N,32.618,36.844,-7.461,5.340,-2.550,3.892,HE1,HIS,19
HIS,HE2,HE2,H,0,1,N,N,N,31.061,38.039,-9.065,6.002,-0.428,2.627,HE2,HIS,20
HIS,HXT,HXT,H,0,1,N,Y,N,34.553,41.905,-8.698,2.188,-4.848,0.901,HXT,HIS,21
#_chem_comp_bond
comp_id,atom_id_1,atom_id_2,value_order,pdbx_aromatic_flag,pdbx_stereo_config,pdbx_ordinal
HIS,N,CA,SING,N,N,1
HIS,N,H,SING,N,N,2
HIS,N,H2,SING,N,N,3
HIS,CA,C,SING,N,N,4
HIS,CA,CB,SING,N,N,5
HIS,CA,HA,SING,N,N,6
HIS,C,O,DOUB,N,N,7
HIS,C,OXT,SING,N,N,8
HIS,CB,CG,SING,N,N,9
HIS,CB,HB2,SING,N,N,10
HIS,CB,HB3,SING,N,N,11
HIS,CG,ND1,SING,Y,N,12
HIS,CG,CD2,DOUB,Y,N,13
HIS,ND1,CE1,DOUB,Y,N,14
HIS,ND1,HD1,SING,N,N,15
HIS,CD2,NE2,SING,Y,N,16
HIS,CD2,HD2,SING,N,N,17
HIS,CE1,NE2,SING,Y,N,18
HIS,CE1,HE1,SING,N,N,19
HIS,NE2,HE2,SING,N,N,20
HIS,OXT,HXT,SING,N,N,21
#_pdbx_chem_comp_descriptor
comp_id,type,program,program_version,descriptor
HIS,SMILES,ACDLabs,10.04,O=C(O)C(N)Cc1cnc[nH+]1
HIS,SMILES_CANONICAL,CACTVS,3.341,N[C@@H](Cc1c[nH]c[nH+]1)C(O)=O
HIS,SMILES,CACTVS,3.341,N[CH](Cc1c[nH]c[nH+]1)C(O)=O
HIS,SMILES_CANONICAL,OpenEye OEToolkits,1.5.0,c1c([nH+]c[nH]1)C[C@@H](C(=O)O)N
HIS,SMILES,OpenEye OEToolkits,1.5.0,c1c([nH+]c[nH]1)CC(C(=O)O)N
HIS,InChI,InChI,1.03,"InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h2-3,5H,1,7H2,(H,8,9)(H,10,11)/p+1/t5-/m0/s1"
HIS,InChIKey,InChI,1.03,HNDVDQJCIGZPNO-YFKPBYRVSA-O
#_pdbx_chem_comp_identifier
comp_id,type,program,program_version,identifier
HIS,SYSTEMATIC NAME,ACDLabs,10.04,3-(1H-imidazol-3-ium-4-yl)-L-alanine
HIS,SYSTEMATIC NAME,OpenEye OEToolkits,1.5.0,(2S)-2-amino-3-(1H-imidazol-3-ium-4-yl)propanoic acid
#_pdbx_chem_comp_audit
comp_id,action_type,date,processing_site
HIS,Create component,1999-07-08,EBI
HIS,Modify descriptor,2011-06-04,RCSB
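
For reference, ``start_line_index`` in the meta block above is the 1-based line of each ``#category`` header, which is what lets the reader hand ``pandas.read_csv`` a ``skiprows`` list instead of scanning the whole file. A hand-rolled sketch of that lookup for ``_chem_comp`` in this 75-line fixture (the row numbers are specific to this file):

import pandas as pd

# "_chem_comp" starts at line 10 and "_chem_comp_atom" at line 13 (1-based),
# so keep only 0-based rows 10-11: the column names plus the single HIS row.
skip = list(range(10)) + list(range(12, 75))
chem_comp = pd.read_csv(
    "tests/test_files/pdbx_moldf_meta.jcsv", sep=",", quotechar='"', skiprows=skip
)
print(chem_comp.shape)  # (1, 24)
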
53 changes: 53 additions & 0 deletions tests/test_read_jcsv.py
@@ -0,0 +1,53 @@
# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""Tests for reading jcsv files."""
import os
import sys

from moldf.read_jcsv import read_jcsv

sys.path.append("..")
CFD = os.path.dirname(__file__)


def test_read_jcsv_nometa():
"""
Test read_jcsv function without meta in jcsv
"""
# without meta
    test_file = os.path.join(CFD, "test_files", "pdbx_moldf.jcsv")
jcsv = read_jcsv(test_file)
cat_size = {
"_chem_comp": (1, 24),
"_chem_comp_atom": (21, 18),
"_chem_comp_bond": (21, 7),
"_pdbx_chem_comp_descriptor": (7, 5),
"_pdbx_chem_comp_identifier": (2, 5),
"_pdbx_chem_comp_audit": (2, 4),
}
assert len(jcsv) == 6, "Not all categories are read."
for k, v in jcsv.items():
assert cat_size[k] == v.shape, f"Category {k} read incorrectly"


def test_read_jcsv_meta():
"""
Test read_jcsv function with meta in jcsv
"""
    test_file = os.path.join(CFD, "test_files", "pdbx_moldf_meta.jcsv")
jcsv = read_jcsv(test_file)
cat_size = {
"_chem_comp": (1, 24),
"_chem_comp_atom": (21, 18),
"_chem_comp_bond": (21, 7),
"_pdbx_chem_comp_descriptor": (7, 5),
"_pdbx_chem_comp_identifier": (2, 5),
"_pdbx_chem_comp_audit": (2, 4),
}
assert len(jcsv) == 6, "Not all categories are read."
for k, v in jcsv.items():
assert cat_size[k] == v.shape, f"Category {k} read incorrectly"