
Commit

read and write to JCSV
Ruibin-Liu committed Oct 9, 2023
1 parent 8423ed4 commit 0c28230
Showing 6 changed files with 379 additions and 3 deletions.
7 changes: 7 additions & 0 deletions docs/api.rst
@@ -26,6 +26,13 @@ MOL2 Reader
:members:
:private-members:

JCSV Reader
-------------------------

.. automodule:: moldf.read_jcsv
:members:
:private-members:

PDBDataFrame Class
-----------------------------
.. _PDBDataFrame:
191 changes: 191 additions & 0 deletions moldf/read_jcsv.py
@@ -0,0 +1,191 @@
# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""JCSV format reading.
Reads a JCSV file into a dict of ``Pandas DataFrame`` s.
It is not limited to any molecular format.
"""
from __future__ import annotations

import csv
import os
import warnings
from collections import defaultdict

import pandas as pd # type: ignore


def read_jcsv(
jcsv_file: str | os.PathLike,
category_names: list | None = None,
) -> dict[str, pd.DataFrame]:
"""Reads a JCSV file by name.
Currently no molecular file repository has JCSV files so we can only read from a
file name/path.
Args:
jcsv_file (required): JCSV file name/path.
category_names (optional): a list of category names. If ``None``, all categories
are read. Defaults to **None**.
Returns:
a dict of Pandas DataFrames for each category.
Raises:
TypeError: if ``category_names`` is not a list of strings.
ValueError: if any of the ``category_names`` has double quotes or
if the number of items in any line does not match the number of
column names in the same category.
"""
read_all = False
if category_names is not None:
if not isinstance(category_names, list):
raise TypeError(f"{category_names} is not a list")
for cat in category_names:
if not isinstance(cat, str):
raise TypeError(f"{cat} is not a str")
elif '"' in cat:
raise ValueError(f"{cat} has double quotes.")
else:
read_all = True

results: dict[str, pd.DataFrame] = {}
meta_found: bool | int = False
with open(jcsv_file, "r") as jf:
jf_reader = csv.reader(jf, delimiter=",", quotechar='"')
for i, row in enumerate(jf_reader):
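            # A leading "#jcsv_meta" row means the file carries a meta block
            # that maps each category to its start line in the file.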
if i == 0 and row[0] == "#jcsv_meta":
meta_found = i + 1
n_lines = _count_n_lines(jcsv_file)
elif not meta_found:
results = _read_jcsv_by_line(jcsv_file, category_names=category_names)
break
elif meta_found and i == meta_found:
meta_col_names = row
col_data: dict[str, list] = defaultdict(list)
elif row[0][0] == "#":
break
elif meta_found:
if len(row) != len(meta_col_names):
message = "Meta data has unmatched number"
message += f" of items in row '{row}' with the column"
message += f" names: {meta_col_names}"
raise ValueError(message)
value: str | int = ""
for col_name, value in zip(meta_col_names, row):
if col_name == "start_line_index":
value = int(value)
col_data[col_name].append(value)
if meta_found:
meta = list(zip(col_data["category"], col_data["start_line_index"]))
start_line_index: int | str = 0
for i, (category_name, start_line_index) in enumerate(meta[1:]):
if read_all or (
isinstance(category_names, list) and category_name in category_names
):
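                # Skip every row before this category's column-name row (the meta
                # block, earlier categories, and the "#category" header itself);
                # rows from the next category's header onward are skipped below
                # when a later category follows.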
start_line_index = int(start_line_index)
skip_rows = [j for j in range(start_line_index)]
if i < len(meta) - 2:
next_start = int(meta[i + 2][1])
ending_rows = [j for j in range(n_lines) if j > (next_start - 2)]
skip_rows.extend(ending_rows)
results[category_name] = pd.read_csv(
jcsv_file, sep=",", quotechar='"', skiprows=skip_rows
)

return results


def _read_jcsv_by_line(
jcsv_file: str | os.PathLike,
category_names: list | None = None,
) -> dict[str, pd.DataFrame]:
"""Reads JCSV file line by line when the file has no meta data to select blocks.
Args:
jcsv_file (required): JCSV file name/path.
category_names (optional): a list of category names. If ``None``, all categories
are read. Defaults to **None**. It is passed by the ``read_jcsv`` caller, so
it is not sanitized here.
Returns:
a dict of Pandas DataFrames for each category.
Raises:
ValueError: if the number of items in any line does not match the number of
column names in the same category.
"""
results: dict[str, pd.DataFrame] = {}

read_all = False
if category_names is None:
read_all = True
with open(jcsv_file, "r", encoding="utf-8") as jf:
jf_reader = csv.reader(jf, delimiter=",", quotechar='"')
req_cat_name_found: int | bool = False
category_name = ""
cols_data: dict[str, list] = defaultdict(list)
if isinstance(category_names, list):
category_names = list(category_names)
for i, row in enumerate(jf_reader):
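            # A row starting with "#" marks the start of a new category block.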
if row[0][0] == "#":
if read_all or (
isinstance(category_names, list) and row[0][1:] in category_names
):
if category_name:
results[category_name] = pd.DataFrame(cols_data)

category_name = row[0][1:]
cols_data = defaultdict(list)
req_cat_name_found = i + 1
else:
req_cat_name_found = False
elif req_cat_name_found and i == req_cat_name_found:
col_names = row
elif req_cat_name_found:
if len(row) != len(col_names):
message = f"Category {category_name} has unmatched number"
message += f" of items in row '{row}' with the column"
message += f" names: {col_names}"
raise ValueError(message)
for col_name, value in zip(col_names, row):
cols_data[col_name].append(value)

if category_name:
results[category_name] = pd.DataFrame(cols_data)

if category_names is not None:
for category_name in category_names:
if category_name not in results:
warnings.warn(
"Category {category_name} not in {jcsv_file}, not read",
RuntimeWarning,
stacklevel=2,
)

return results


def _count_n_lines(file_name: str | os.PathLike):
"""Gets the number of lines in a file.
From https://stackoverflow.com/a/68385697/10094189
Args:
file_name (required): file name or path.
"""

def _make_gen(reader):
while True:
b = reader(2**16)
if not b:
break
yield b

with open(file_name, "rb") as f:
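        # Sum newline bytes over 64 KiB chunks; a file without a trailing
        # newline reports one line fewer.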
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))

return count
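
A minimal usage sketch for the reader above; the path points at the JCSV fixture added in this commit (relative to the repository root), and the printed shape matches the expectations in the new tests:

from moldf.read_jcsv import read_jcsv

# Read every category in the file into a dict of DataFrames.
all_categories = read_jcsv("tests/test_files/pdbx_moldf_meta.jcsv")

# Or read only selected categories.
bonds_only = read_jcsv(
    "tests/test_files/pdbx_moldf_meta.jcsv",
    category_names=["_chem_comp_bond"],
)
print(bonds_only["_chem_comp_bond"].shape)  # (21, 7) for the HIS fixture
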
22 changes: 22 additions & 0 deletions moldf/write_jcsv.py
@@ -13,6 +13,7 @@
def write_jcsv(
data: dict[str, pd.DataFrame],
file_name: str | os.PathLike | None = None,
write_meta: bool = True,
**kwargs,
) -> None:
"""Write a dict of ``Pandas DataFrame`` s into a JCSV file.
@@ -23,11 +24,15 @@ def write_jcsv(
file_name (optional): file name to write a JCSV file. If ``None``,
``moldf_output.jcsv`` will be used as the file name if ``path_or_buf`` is not
specified in ``**kwargs``. Defaults to **None**.
        write_meta (optional): whether to write a metadata block as the **first**
            category. Currently, only the starting line number of each category
            is recorded. Defaults to **True**.
**kwargs: keyword arguments for ``pd.DataFrame.to_csv``. Invalid ones are ignored.
Check https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
Raises:
TypeError: if ``data`` is not a valid dict of ``DataFrame``.
ValueError: if ``"`` in any of the column names.
"""
if file_name is None:
file_name = kwargs.get("path_or_buf")
@@ -55,6 +60,23 @@
line_terminator = os.linesep

with open(file_name, "w", encoding="utf-8") as out_file:
if write_meta:
meta_line = f"#jcsv_meta{line_terminator}"
out_file.write(meta_line)
out_file.write(f"category,start_line_index{line_terminator}")
out_file.write(f"meta,1{line_terminator}") # just to be self-consistent
last_cat_line_length = len(data) + 3
for key, df in data.items():
if "'" in key:
key = f'"{key}"'
elif '"' in key:
message = '" is not supported in category names, but'
message += f" {key} has it."
raise ValueError(message)

out_file.write(f"{key},{last_cat_line_length+1}{line_terminator}")
last_cat_line_length += len(df) + 2

for key, df in data.items():
key_line = f"#{key}{line_terminator}"
out_file.write(key_line)
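
A minimal round-trip sketch of the new write_meta option, assuming ``write_jcsv`` and ``read_jcsv`` are imported from the modules changed in this commit; the file name and DataFrame contents are illustrative, and default ``to_csv`` options are assumed:

import pandas as pd
from moldf.read_jcsv import read_jcsv
from moldf.write_jcsv import write_jcsv

# Two illustrative categories; any dict of DataFrames works.
data = {
    "atoms": pd.DataFrame({"atom_id": ["N", "CA"], "charge": [0, 0]}),
    "bonds": pd.DataFrame({"atom_id_1": ["N"], "atom_id_2": ["CA"]}),
}
write_jcsv(data, file_name="example.jcsv", write_meta=True)  # meta block written first
round_trip = read_jcsv("example.jcsv")
assert set(round_trip) == {"atoms", "bonds"}
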
75 changes: 75 additions & 0 deletions tests/test_files/pdbx_moldf_meta.jcsv
@@ -0,0 +1,75 @@
#jcsv_meta
category,start_line_index
meta,1
_chem_comp,10
_chem_comp_atom,13
_chem_comp_bond,36
_pdbx_chem_comp_descriptor,59
_pdbx_chem_comp_identifier,68
_pdbx_chem_comp_audit,72
#_chem_comp
id,name,type,pdbx_type,formula,mon_nstd_parent_comp_id,pdbx_synonyms,pdbx_formal_charge,pdbx_initial_date,pdbx_modified_date,pdbx_ambiguous_flag,pdbx_release_status,pdbx_replaced_by,pdbx_replaces,formula_weight,one_letter_code,three_letter_code,pdbx_model_coordinates_details,pdbx_model_coordinates_missing_flag,pdbx_ideal_coordinates_details,pdbx_ideal_coordinates_missing_flag,pdbx_model_coordinates_db_code,pdbx_subcomponent_list,pdbx_processing_site
HIS,HISTIDINE,L-PEPTIDE LINKING,ATOMP,C6 H10 N3 O2,?,?,1,1999-07-08,2011-06-04,N,REL,?,?,156.162,H,HIS,?,N,OpenEye/OEToolkits V1.4.2,N,?,?,EBI
#_chem_comp_atom
comp_id,atom_id,alt_atom_id,type_symbol,charge,pdbx_align,pdbx_aromatic_flag,pdbx_leaving_atom_flag,pdbx_stereo_config,model_Cartn_x,model_Cartn_y,model_Cartn_z,pdbx_model_Cartn_x_ideal,pdbx_model_Cartn_y_ideal,pdbx_model_Cartn_z_ideal,pdbx_component_atom_id,pdbx_component_comp_id,pdbx_ordinal
HIS,N,N,N,0,1,N,N,N,33.472,42.685,-4.610,-0.040,-1.210,0.053,N,HIS,1
HIS,CA,CA,C,0,1,N,N,S,33.414,41.686,-5.673,1.172,-1.709,0.652,CA,HIS,2
HIS,C,C,C,0,1,N,N,N,33.773,42.279,-7.040,1.083,-3.207,0.905,C,HIS,3
HIS,O,O,O,0,1,N,N,N,33.497,43.444,-7.337,0.040,-3.770,1.222,O,HIS,4
HIS,CB,CB,C,0,1,N,N,N,32.005,41.080,-5.734,1.484,-0.975,1.962,CB,HIS,5
HIS,CG,CG,C,0,1,Y,N,N,31.888,39.902,-6.651,2.940,-1.060,2.353,CG,HIS,6
HIS,ND1,ND1,N,1,1,Y,N,N,32.539,38.710,-6.414,3.380,-2.075,3.129,ND1,HIS,7
HIS,CD2,CD2,C,0,1,Y,N,N,31.199,39.734,-7.804,3.960,-0.251,2.046,CD2,HIS,8
HIS,CE1,CE1,C,0,1,Y,N,N,32.251,37.857,-7.382,4.693,-1.908,3.317,CE1,HIS,9
HIS,NE2,NE2,N,0,1,Y,N,N,31.439,38.453,-8.237,5.058,-0.801,2.662,NE2,HIS,10
HIS,OXT,OXT,O,0,1,N,Y,N,34.382,41.455,-7.879,2.247,-3.882,0.744,OXT,HIS,11
HIS,H,H,H,0,1,N,N,N,33.485,42.227,-3.721,-0.102,-1.155,-0.950,H,HIS,12
HIS,H2,HN2,H,0,1,N,Y,N,34.301,43.234,-4.714,-0.715,-0.741,0.634,H2,HIS,13
HIS,HA,HA,H,0,1,N,N,N,34.155,40.908,-5.439,1.965,-1.558,-0.089,HA,HIS,14
HIS,HB2,1HB,H,0,1,N,N,N,31.733,40.750,-4.721,1.215,0.087,1.879,HB2,HIS,15
HIS,HB3,2HB,H,0,1,N,N,N,31.337,41.860,-6.127,0.859,-1.368,2.775,HB3,HIS,16
HIS,HD1,HD1,H,0,1,N,N,N,33.135,38.521,-5.633,2.828,-2.838,3.511,HD1,HIS,17
HIS,HD2,HD2,H,0,1,N,N,N,30.577,40.470,-8.292,4.108,0.647,1.479,HD2,HIS,18
HIS,HE1,HE1,H,0,1,N,N,N,32.618,36.844,-7.461,5.340,-2.550,3.892,HE1,HIS,19
HIS,HE2,HE2,H,0,1,N,N,N,31.061,38.039,-9.065,6.002,-0.428,2.627,HE2,HIS,20
HIS,HXT,HXT,H,0,1,N,Y,N,34.553,41.905,-8.698,2.188,-4.848,0.901,HXT,HIS,21
#_chem_comp_bond
comp_id,atom_id_1,atom_id_2,value_order,pdbx_aromatic_flag,pdbx_stereo_config,pdbx_ordinal
HIS,N,CA,SING,N,N,1
HIS,N,H,SING,N,N,2
HIS,N,H2,SING,N,N,3
HIS,CA,C,SING,N,N,4
HIS,CA,CB,SING,N,N,5
HIS,CA,HA,SING,N,N,6
HIS,C,O,DOUB,N,N,7
HIS,C,OXT,SING,N,N,8
HIS,CB,CG,SING,N,N,9
HIS,CB,HB2,SING,N,N,10
HIS,CB,HB3,SING,N,N,11
HIS,CG,ND1,SING,Y,N,12
HIS,CG,CD2,DOUB,Y,N,13
HIS,ND1,CE1,DOUB,Y,N,14
HIS,ND1,HD1,SING,N,N,15
HIS,CD2,NE2,SING,Y,N,16
HIS,CD2,HD2,SING,N,N,17
HIS,CE1,NE2,SING,Y,N,18
HIS,CE1,HE1,SING,N,N,19
HIS,NE2,HE2,SING,N,N,20
HIS,OXT,HXT,SING,N,N,21
#_pdbx_chem_comp_descriptor
comp_id,type,program,program_version,descriptor
HIS,SMILES,ACDLabs,10.04,O=C(O)C(N)Cc1cnc[nH+]1
HIS,SMILES_CANONICAL,CACTVS,3.341,N[C@@H](Cc1c[nH]c[nH+]1)C(O)=O
HIS,SMILES,CACTVS,3.341,N[CH](Cc1c[nH]c[nH+]1)C(O)=O
HIS,SMILES_CANONICAL,OpenEye OEToolkits,1.5.0,c1c([nH+]c[nH]1)C[C@@H](C(=O)O)N
HIS,SMILES,OpenEye OEToolkits,1.5.0,c1c([nH+]c[nH]1)CC(C(=O)O)N
HIS,InChI,InChI,1.03,"InChI=1S/C6H9N3O2/c7-5(6(10)11)1-4-2-8-3-9-4/h2-3,5H,1,7H2,(H,8,9)(H,10,11)/p+1/t5-/m0/s1"
HIS,InChIKey,InChI,1.03,HNDVDQJCIGZPNO-YFKPBYRVSA-O
#_pdbx_chem_comp_identifier
comp_id,type,program,program_version,identifier
HIS,SYSTEMATIC NAME,ACDLabs,10.04,3-(1H-imidazol-3-ium-4-yl)-L-alanine
HIS,SYSTEMATIC NAME,OpenEye OEToolkits,1.5.0,(2S)-2-amino-3-(1H-imidazol-3-ium-4-yl)propanoic acid
#_pdbx_chem_comp_audit
comp_id,action_type,date,processing_site
HIS,Create component,1999-07-08,EBI
HIS,Modify descriptor,2011-06-04,RCSB
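
For reference, ``start_line_index`` in the meta block above is the 1-based line of each ``#category`` header, which is what lets the reader hand ``pandas.read_csv`` a ``skiprows`` list instead of scanning the whole file. A hand-rolled sketch of that lookup for ``_chem_comp`` in this 75-line fixture (the row numbers are specific to this file):

import pandas as pd

# "_chem_comp" starts at line 10 and "_chem_comp_atom" at line 13 (1-based),
# so keep only 0-based rows 10-11: the column names plus the single HIS row.
skip = list(range(10)) + list(range(12, 75))
chem_comp = pd.read_csv(
    "tests/test_files/pdbx_moldf_meta.jcsv", sep=",", quotechar='"', skiprows=skip
)
print(chem_comp.shape)  # (1, 24)
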
53 changes: 53 additions & 0 deletions tests/test_read_jcsv.py
@@ -0,0 +1,53 @@
# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""Tests for reading jcsv files."""
import os
import sys

from moldf.read_jcsv import read_jcsv

sys.path.append("..")
CFD = os.path.dirname(__file__)


def test_read_jcsv_nometa():
"""
Test read_jcsv function without meta in jcsv
"""
# without meta
    test_file = os.path.join(CFD, "test_files", "pdbx_moldf.jcsv")
jcsv = read_jcsv(test_file)
cat_size = {
"_chem_comp": (1, 24),
"_chem_comp_atom": (21, 18),
"_chem_comp_bond": (21, 7),
"_pdbx_chem_comp_descriptor": (7, 5),
"_pdbx_chem_comp_identifier": (2, 5),
"_pdbx_chem_comp_audit": (2, 4),
}
assert len(jcsv) == 6, "Not all categories are read."
for k, v in jcsv.items():
assert cat_size[k] == v.shape, f"Category {k} read incorrectly"


def test_read_jcsv_meta():
"""
Test read_jcsv function with meta in jcsv
"""
    test_file = os.path.join(CFD, "test_files", "pdbx_moldf_meta.jcsv")
jcsv = read_jcsv(test_file)
cat_size = {
"_chem_comp": (1, 24),
"_chem_comp_atom": (21, 18),
"_chem_comp_bond": (21, 7),
"_pdbx_chem_comp_descriptor": (7, 5),
"_pdbx_chem_comp_identifier": (2, 5),
"_pdbx_chem_comp_audit": (2, 4),
}
assert len(jcsv) == 6, "Not all categories are read."
for k, v in jcsv.items():
assert cat_size[k] == v.shape, f"Category {k} read incorrectly"