diff --git a/README.rst b/README.rst index fbabf26..52b899e 100644 --- a/README.rst +++ b/README.rst @@ -42,6 +42,25 @@ the different use cases can be found in this documentation. If you are here from one of the packages using CGSmiles check out the GettingStarted section to learn the syntax. +Installation +============ + +The easiest way to install **cgsmiles** is using pip: + +.. code:: bash + + pip install git+https://github.com/gruenewald-lab/CGsmiles.git + +In the future we will also distribute it through the PyPI +package index but that is currently not supported. Note that the drawing module +depends on the `scipy <https://scipy.org/>`__ and `matplotlib <https://matplotlib.org/>`__ +packages. These need to be installed before the module can be used. + +.. code:: bash + + pip install scipy + pip install matplotlib + Examples ======== @@ -65,18 +84,6 @@ Martini 3 Benzene # Draw molecule at different resolutions ax, pos = draw_molecule(mol_graph) -Installation -============ - -The easiest ways to install **cgsmiles** is using pip: - -.. code:: bash - - pip install git+https://github.com/gruenewald-lab/CGsmiles.git - -In the future we will also distribute it through the Pypi -package index but that is currently not supported. 
- Related Tools ============= diff --git a/cgsmiles/dialects.py b/cgsmiles/dialects.py index 9ba9eab..2dfc089 100644 --- a/cgsmiles/dialects.py +++ b/cgsmiles/dialects.py @@ -139,7 +139,7 @@ def create_dialect(default_attributes, # KNOWN DIALECTS # ########################################################## # this one is for global use -# it is the base CGSmiles dialect +# it is the base CGsmiles dialect CGSMILES_DEFAULT_DIALECT = create_dialect({"fragname": (None, str), "q": (0.0, float), "w": (1.0, float)}) diff --git a/cgsmiles/pysmiles_utils.py b/cgsmiles/pysmiles_utils.py index 4062c9e..d8cef62 100644 --- a/cgsmiles/pysmiles_utils.py +++ b/cgsmiles/pysmiles_utils.py @@ -77,7 +77,7 @@ def rebuild_h_atoms(mol_graph, "show delocalization-induced molecular equivalency and thus " "is not considered aromatic. For example, 4-methyl imidazole " "is often written as [nH]1cc(nc1)C, but should be written as " - "[NH]1C=C(N=C1)C. A corresponding CGSmiles string would be " + "[NH]1C=C(N=C1)C. A corresponding CGsmiles string would be " "{[#A]1[#B][#C]1}.{#A=[>][<]N,#B=[$]N=C[>],#C=[$]C(C)=C[<]}") raise SyntaxError(msg) nx.set_node_attributes(mol_graph, 0, 'hcount') @@ -126,7 +126,7 @@ def read_fragment_smiles(smiles_str, ez_isomers={}, attributes={}): """ - Read a smiles_str corresponding to a CGSmiles fragment and + Read a smiles_str corresponding to a CGsmiles fragment and annotate bonding descriptors, isomers, as well as any other attributes. diff --git a/cgsmiles/read_fragments.py b/cgsmiles/read_fragments.py index 59d7328..f91a82d 100644 --- a/cgsmiles/read_fragments.py +++ b/cgsmiles/read_fragments.py @@ -104,21 +104,21 @@ def collect_ring_number(smile_iter, token, node_count, rings): def strip_bonding_descriptors(fragment_string): """ - Processes a CGSmiles fragment string by + Processes a CGsmiles fragment string by stripping the bonding descriptors and storing them in a dict with reference to the atom they - refer to. 
Furthermore, a cleaned SMILES or CGSmiles + refer to. Furthermore, a cleaned SMILES or CGsmiles string is returned. Parameters ---------- fragment_string: str - a CGSmiles fragment string + a CGsmiles fragment string Returns ------- str: - a canonical SMILES or CGSmiles string + a canonical SMILES or CGsmiles string dict: a dict mapping bonding descriptors to the nodes within the string @@ -255,19 +255,19 @@ def fragment_iter(fragment_str, all_atom=True): def read_fragments(fragment_str, all_atom=True, fragment_dict=None): """ - Collects the fragments defined in a CGSmiles fragment string + Collects the fragments defined in a CGsmiles fragment string as networkx.Graph and returns a dict of them. Bonding descriptors are annotated as node attribtues. Parameters ---------- fragment_str: str - string using CGSmiles fragment syntax + string using CGsmiles fragment syntax all_atom: bool If the fragment strings are all-atom following the OpenSmiles syntax. Default is True but if - set to False fragments follow the CGSmiles + set to False fragments follow the CGsmiles syntax. fragment_dict: dict diff --git a/cgsmiles/resolve.py b/cgsmiles/resolve.py index cb592d0..da89145 100644 --- a/cgsmiles/resolve.py +++ b/cgsmiles/resolve.py @@ -14,7 +14,7 @@ def compatible(left, right, legacy=False): """ Check bonding descriptor compatibility according - to the CGSmiles syntax conventions. With legacy + to the CGsmiles syntax conventions. With legacy the BigSmiles convention can be used. Parameters @@ -87,27 +87,27 @@ def match_bonding_descriptors(source, target, bond_attribute="bonding", legacy=F class MoleculeResolver: """ - Resolve the molecule(s) described by a CGSmiles string and return a networkx.Graph + Resolve the molecule(s) described by a CGsmiles string and return a networkx.Graph of the molecule. First, this class has to be initiated using one of three class construction - methods. When trying to read a CGSmiles string always use the first method. + methods. 
When trying to read a CGsmiles string always use the first method. The other constructors can be used in case fragments or the lowest resolution molecule are defined by graphs that come from elsewhere. `self.from_string`: use when fragments and lowest resolution are - described in one CGSmiles string. - `self.from_graph`: use when fragments are described by CGSmiles + described in one CGsmiles string. + `self.from_graph`: use when fragments are described by CGsmiles strings but the lowest resolution is given as nx.Graph `self.from_fragment_dicts`: use when fragments are given as nx.Graphs and the lowest resolution is provided as - CGSmiles string + CGsmiles string Once the `MoleculeResolver` is initiated you can call the `resolve_iter` to loop over the different levels of resolution. The resolve iter will always return the previous lower resolution graph as well as the current higher - resolution graph. For example, if the CGSmiles string describes a monomer + resolution graph. For example, if the CGsmiles string describes a monomer sequence of a regular polymer, the lower resolution graph will be the graph of this monomer sequence and the higher resolution graph the full molecule. @@ -129,6 +129,7 @@ class MoleculeResolver: --------------------- Alternatively, one could have gotten the block level graph from somewhere else defined as `nx.Graph` in that case: + >>> # the string only defines the fragments >>> cgsmiles_str = "{#B1=[#PEO]|4,#B2=[#PE]|2}.{#PEO=[>]COC[<],#PE=[>]CC[<]}" >>> block_graph = nx.Graph() @@ -137,10 +138,11 @@ class MoleculeResolver: >>> resolver = MoleculeResolver.from_graph(cgsmiles_str, block_graph) Finally, there is the option of having the fragments from elsewhere for - example a library. Then only the graph defined as CGSmiles string. In this + example a library. Then only the graph defined as CGsmiles string. In this case the `from_fragment_dicts` method can be used. 
Please note that the fragment graphs need to have the following attributes as a graph returned by the `cgsmiles.read_fragments` function. + >>> fragment_dicts = [] >>> for frag_string in ["{#B1=[#PEO]|4,#B2=[#PE]|2}", "{#PEO=[>]COC[<],#PE=[>]CC[<]}"]: >>> frag_dict = read_fragments(frag_string) @@ -178,11 +180,11 @@ def __init__(self, legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax - adheres to CGSmiles convention where bonding descriptors '$' match + adheres to CGsmiles convention where bonding descriptors '$' match with every '$' and every '<' matches every '>'. With the BigSmiles convention a alphanumeric string may be provided that distinguishes these connectors. For example, '$A' would not match '$B'. However, - such use cases should be rare and the CGSmiles convention facilitates + such use cases should be rare and the CGsmiles convention facilitates usage of bonding descriptors in the Sampler where the labels are used to assign different probabilities. """ @@ -199,7 +201,7 @@ def __init__(self, @staticmethod def read_fragment_strings(fragment_strings, last_all_atom=True): """ - Read a list of CGSmiles fragment_strings and return a list + Read a list of CGsmiles fragment_strings and return a list of dicts with the fragment graphs. If `last_all_atom` is True then pysmiles is used to read the last fragment string provided in the list. @@ -207,7 +209,7 @@ def read_fragment_strings(fragment_strings, last_all_atom=True): Parameters ---------- fragment_strings: list[str] - list of CGSmiles fragment strings + list of CGsmiles fragment strings last_all_atom: bool if the last string in the list is an all atom string and should be read using pysmiles. @@ -348,7 +350,7 @@ def squash_atoms(self): def resolve(self): """ - Resolve a CGSmiles string once and return the next resolution. + Resolve a CGsmiles string once and return the next resolution. 
""" # check if this is an all-atom level resolution all_atom = (self.resolution_counter == self.resolutions - 1 and self.last_all_atom) @@ -429,7 +431,7 @@ def from_string(cls, cgsmiles_str, last_all_atom=True, legacy=False): legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax - adheres to CGSmiles convention. A more detailed explanation can be + adheres to CGsmiles convention. A more detailed explanation can be found in the MoleculeResolver.__init__ method. Returns @@ -466,7 +468,7 @@ def from_graph(cls, cgsmiles_str, meta_graph, last_all_atom=True, legacy=False): legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax - adheres to CGSmiles convention. A more detailed explanation can be + adheres to CGsmiles convention. A more detailed explanation can be found in the MoleculeResolver.__init__ method. Returns @@ -507,7 +509,7 @@ def from_fragment_dicts(cls, cgsmiles_str, fragment_dicts, last_all_atom=True, l legacy: bool which syntax convention to use for matching the bonding descriptors. Legacy syntax adheres to the BigSmiles convention. Default syntax - adheres to CGSmiles convention. A more detailed explanation can be + adheres to CGsmiles convention. A more detailed explanation can be found in the MoleculeResolver.__init__ method. Returns diff --git a/cgsmiles/sample.py b/cgsmiles/sample.py index adf4436..041e0d2 100644 --- a/cgsmiles/sample.py +++ b/cgsmiles/sample.py @@ -42,12 +42,12 @@ def _set_bond_order_defaults(bonding): class MoleculeSampler: """ - Given a fragment string in CGSmiles format and probabilities for residues + Given a fragment string in CGsmiles format and probabilities for residues to occur, return a random molecule with target molecular weight. 
First, this class has to be initiated using the class construction method `from_string`, which makes sure to read and resolve the fragment - graphs provided in the CGSmiles string. + graphs provided in the CGsmiles string. Once the `MoleculeSampler` is initiated you can call the `sampler` method in order to generate a new random polymer molecule from the fragment string @@ -124,7 +124,7 @@ class MoleculeSampler: can be provided. For example, To generate a bottle brush polymer that has PMA in the backbone - and PEG as side-chain terminated with an OH group the following CGSmiles string + and PEG as side-chain terminated with an OH group the following CGsmiles string in combination with the above mentioned probabilities can be provided. Note that in this case we declare '$A' and '$B' to be terminal bonding diff --git a/cgsmiles/tests/test_utils.py b/cgsmiles/tests/test_utils.py index b485e58..93f94fe 100644 --- a/cgsmiles/tests/test_utils.py +++ b/cgsmiles/tests/test_utils.py @@ -6,7 +6,7 @@ "show delocalization-induced molecular equivalency and thus " "is not considered aromatic. For example, 4-methyl imidazole " "is often written as [nH]1cc(nc1)C, but should be written as " - "[NH]1C=C(N=C1)C. A corresponding CGSmiles string would be " + "[NH]1C=C(N=C1)C. A corresponding CGsmiles string would be " "{[#A]1[#B][#C]1}.{#A=[>][<]N,#B=[$]N=C[>],#C=[$]C(C)=C[<]}") @pytest.mark.parametrize('frag_str, hatoms_ref, error_type, err_msg', ( diff --git a/cgsmiles/write_cgsmiles.py b/cgsmiles/write_cgsmiles.py index 1b871a3..5ebdcb5 100644 --- a/cgsmiles/write_cgsmiles.py +++ b/cgsmiles/write_cgsmiles.py @@ -11,7 +11,7 @@ def format_node(molecule, current): """ Format a node from a `molecule` graph according to - the CGSmiles syntax. The attribute fragname has to + the CGsmiles syntax. The attribute fragname has to be set for the `current` node. 
Parameters @@ -68,7 +68,7 @@ def write_graph(molecule, smiles_format=False, default_element='*'): Returns ------- str - The CGSmiles string describing `molecule`. + The CGsmiles string describing `molecule`. """ start = min(molecule) dfs_successors = nx.dfs_successors(molecule, source=start) @@ -161,19 +161,19 @@ def write_graph(molecule, smiles_format=False, default_element='*'): def write_cgsmiles_graph(molecule): """ - Write a CGSmiles graph sans fragments at + Write a CGsmiles graph sans fragments at different resolution. Parameters ---------- molecule: networkx.Graph a molecule where each node as a fragname attribute - that is used as name in the CGSmiles string. + that is used as name in the CGsmiles string. Returns ------- str - the CGSmiles string + the CGsmiles string """ cgsmiles_str = write_graph(molecule) @@ -192,7 +192,7 @@ def write_cgsmiles_fragments(fragment_dict, smiles_format=True): a dict of fragment graphs smiles_format: bool write all atom SMILES if True (default) otherwise - write CGSmiles + write CGsmiles Returns ------- @@ -208,7 +208,7 @@ def write_cgsmiles_fragments(fragment_dict, smiles_format=True): def write_cgsmiles(molecule_graph, fragments, last_all_atom=True): """ - Write a CGSmiles string given a low resolution molecule graph + Write a CGsmiles string given a low resolution molecule graph and any number of higher resolutions provided as fragment dicts. Parameters @@ -222,7 +222,7 @@ def write_cgsmiles(molecule_graph, fragments, last_all_atom=True): Returns ------- str - CGSmiles string + CGsmiles string """ final_str = write_cgsmiles_graph(molecule_graph) for layer, fragment in enumerate(fragments): diff --git a/docs/source/conf.py b/docs/source/conf.py index e95b313..dc1ec8a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,7 +25,7 @@ # -- Project information ----------------------------------------------------- -project = 'CGSmiles' +project = 'CGsmiles' copyright = '2024, Dr. F Gruenewald' author = 'F. 
Gruneewald and P. C. Kroon' @@ -132,7 +132,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'CGSmilesdoc' +htmlhelp_basename = 'CGsmilesdoc' # -- Options for LaTeX output ------------------------------------------------ @@ -159,7 +159,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'CGSmiles.tex', 'CGSmiles Documentation', + (master_doc, 'CGsmiles.tex', 'CGsmiles Documentation', author, 'manual'), ] @@ -169,7 +169,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'cgsmiles', 'CGSmiles Documentation', + (master_doc, 'cgsmiles', 'CGsmiles Documentation', [author], 1) ] @@ -180,8 +180,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'CGSmiles', 'CGSmiles Documentation', - author, 'CGSmiles', 'One line description of project.', + (master_doc, 'CGsmiles', 'CGsmiles Documentation', + author, 'CGsmiles', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/source/gettingstarted/installation.rst b/docs/source/gettingstarted/installation.rst index 88db138..ac75d0f 100644 --- a/docs/source/gettingstarted/installation.rst +++ b/docs/source/gettingstarted/installation.rst @@ -9,3 +9,6 @@ The easiest ways to install **cgsmiles** is using pip: In the future we will also distribute it through the Pypi package index but that is currently not supported. + +Note that the drawing module depends on the `scipy <https://scipy.org/>`__ and `matplotlib <https://matplotlib.org/>`__ +packages. These need to be installed before the module can be used. 
diff --git a/docs/source/gettingstarted/syntax_examples.rst b/docs/source/gettingstarted/syntax_examples.rst index f07052c..c99c418 100644 --- a/docs/source/gettingstarted/syntax_examples.rst +++ b/docs/source/gettingstarted/syntax_examples.rst @@ -1,18 +1,18 @@ Syntax Examples =============== -This page collects examples of CGSmiles string of increasing +This page collects examples of CGsmiles string of increasing complexity. They are seperated into the following categories: -- CGSmiles without fragments -- CGSmiles with all-atom fragments -- CGSmiles with coarse-grained fragments +- CGsmiles without fragments +- CGsmiles with all-atom fragments +- CGsmiles with coarse-grained fragments -CGSmiles without fragments +CGsmiles without fragments -------------------------- If one just seeks to describe a graph at abitrary level of -complexity CGSmiles notation can be used. Each of the smiles +complexity CGsmiles notation can be used. Each of the smiles listed below can be read and converted using the `read_cgsmiles` function of the package: @@ -57,7 +57,7 @@ function of the package: "{[#nodeA]([#nodeAB][#nodeAB])|5}" -CGSmiles with all-atom fragments +CGsmiles with all-atom fragments -------------------------------- - simple linear graph describing PEO with two OH end-groups diff --git a/docs/source/api/overview.rst b/docs/source/gettingstarted/tutorials/drawing.rst similarity index 54% rename from docs/source/api/overview.rst rename to docs/source/gettingstarted/tutorials/drawing.rst index 5744b21..41b1261 100644 --- a/docs/source/api/overview.rst +++ b/docs/source/gettingstarted/tutorials/drawing.rst @@ -1,68 +1,9 @@ -Overview -======== -The API is designed to read, write, and interpret CGSmiles string. -Detailed information can be found in the module documentation. -This overview page provides some quick tutorial style explanation -of the main functionalities. 
- -Reading CGSmiles ----------------- -A CGSmiles string can contain a base-graph (see Syntax Rules) and -multiple enumerations of fragment graphs each corresonding to a -different resolution. The base graph can be read using the -``read_cgsmiles`` function, while the fragments can be read using -the ``read_fragments`` function. However, most user will find it -convienient to directly read the entire string and resolve the -different resolutions. This is done using the ``MoleculeResolver`` -class. - -First we need to import the ``MoleculeResolver`` and initate it -using the ``from_string`` or one of the other initator methods. -Note that we can specify if the last resolution is at the atomic -level by providing ``last_all_atom=True`` argument. - -.. code-block:: python - - from cgsmiles import MoleculeResolver - cgsmiles_string = '{[#TC5]1[#TC5][#TC5]1}.{#TC5=[$]cc[$]}' - resolver = MoleculeResolver.from_string(cgsmiles_string, - last_all_atom=True) - -Next we can resolve the atomic resolution from the CG graph by -running the ``.resolve`` function once. - -.. code-block:: python - - cg_graph, aa_graph = resolver.resolve() - -For multiple resolutions we can run the ``resolver`` function -multiple times. Each time a new set of graphs at a coarse level -and the next finer level is returned. Alternatively, the -``resolve_iter`` can be used to loop over all resolutions. Let's -take the molecule in Figure 3 of the main paper: - -.. 
code-block:: python - - from cgsmiles import MoleculeResolver - # CGSmiles string with 3 resolutions - cgsmiles_str = "{[#hphilic][#hdphob]|3[#hphilic]}.\ - {#hphilic=[<][#PEO][>]|3,#hdphob=[<][#PMA][>]([#BUT])}.\ - {#PEO=[<][#SN3r][>],#PMA=[<][#TC3][>][#SN4a][$],#BUT=[$][#SC3][$]}.\ - {#SN3r=[<]COC[>],#TC3=[<]CC[>][$1],#SN4a=[$1]C(=O)OC[$2],#SC3=[$2]CCC}" - # Generate the MoleculeResolver - resolver = MoleculeResolver.from_string(cgsmiles_str, last_all_atom=True) - - # Now we can loop over all resolutions using - for coarse_graph, finer_graph in resolver.resolve_iter(): - print(coarse_graph.nodes(data='fragname')) - print(finer_graph.nodes(data='atomname')) - -Alternatively, we could just have gotten the final two pairs by calling -``.resolve_all()``. - -Drawing CGSmiles ----------------- -It is very easy to check the correctness of a CGSmiles string by +Drawing +======= +Note that this tutorial requires the scipy and matplotlib packages +to be installed. See installation instructions. + +It is very easy to check the correctness of a CGsmiles string by simply drawing the molecule and the mapping to the coarser level. Drawing molecules can be accomplished using the drawing module. @@ -121,10 +62,10 @@ illustrates this for poly(ethylene) glycol. ax, pos = draw_molecule(mol_graph, labels=labels, scale=1) ax.set_frame_on('True') -Likley you will see that not the entire molecule fits in the bounding box as +Likely, you will see that not the entire molecule fits in the bounding box as indicated by the frame. The reason is that the drawing function does not automatically scale the image. You have two choices now. You can use the scale -keywod to shrink the molecule image until it fits (e.g. ``scale=0.5``) or you +keyword to shrink the molecule image until it fits (e.g. ``scale=0.5``) or you can provide a larger canvas. .. code:: python @@ -142,8 +83,8 @@ terms of labels, bonds, and atoms. 
Thus you only have to find a visually pleasin canvas size once and can draw a large collection of molecules. One added bonus feature of the drawing utility is that it will draw cis/trans -isomers correctly accoding to the cgsmiles string the user has provided. You -can see a simple exmaple below. +isomers correctly according to the cgsmiles string the user has provided. You +can see a simple example below. .. code:: python @@ -154,10 +95,10 @@ can see a simple exmaple below. fig, axes = plt.subplots(1,2, figsize=(6, 6)) # trans butene - cgsmiles_str_tans = "{[#A][#B]}.{#A=c\c[$],#B=[$]c\c}" + cgsmiles_str_tans = "{[#A][#B]}.{#A=C\C=[$],#B=[$]=C\C}" # cis butene - cgsmiles_str_cis = "{[#A][#B]}.{#A=c\c[$],#B=[$]c/c}" + cgsmiles_str_cis = "{[#A][#B]}.{#A=C\C=[$],#B=[$]=C/C}" # Resolve molecule into networkx graphs for ax, cgstr in zip(axes, [cgsmiles_str_tans, cgsmiles_str_cis]): diff --git a/docs/source/gettingstarted/tutorials/index.rst b/docs/source/gettingstarted/tutorials/index.rst new file mode 100644 index 0000000..6ce6ea3 --- /dev/null +++ b/docs/source/gettingstarted/tutorials/index.rst @@ -0,0 +1,11 @@ +Tutorials +========= + +.. toctree:: + :maxdepth: 2 + + resolving.rst + drawing.rst + martini.rst + polymers.rst + mapping.rst diff --git a/docs/source/gettingstarted/api_examples.rst b/docs/source/gettingstarted/tutorials/mapping.rst similarity index 50% rename from docs/source/gettingstarted/api_examples.rst rename to docs/source/gettingstarted/tutorials/mapping.rst index f3320c0..1e4fd63 100644 --- a/docs/source/gettingstarted/api_examples.rst +++ b/docs/source/gettingstarted/tutorials/mapping.rst @@ -1,37 +1,3 @@ -API Examples -============ - -The following tutorials illustrate how to use read, -draw, and manipulate CGSmiles using the package API. -For more detailed information on the syntax please -consult the examples and Syntax documentation. 
- -Read and draw CGSmile of Polystyrene ------------------------------------- - -If one just seeks to describe a graph at abitrary level of -complexity CGSmiles notation can be used. - -.. code:: python - - import matplotlib.pyplot as plt - import networkx as nx - import cgsmiles - - # Express 5 units of Polystyrene in CGSmiles - cgsmiles_str = "{[#PS]|5}.{#PS=[$]CC[$](c1ccccc1)}" - - # Resolve molecule into networkx graphs - res_graph, mol_graph = cgsmiles.MoleculeResolver(cgsmiles_str).resolve() - - # Draw molecule at different resolutions - for g in [res_graph, mol_graph]: - nx.draw_networkx(g) - plt.show() - - # Get fragment corresponding to first residue - fragment_1 = res_graph.nodes[0]['graph'] - Map all-atom structure to CG resolution --------------------------------------- @@ -48,7 +14,7 @@ BENZ.pdb from this repository. # Read pdb of Benzen mol = vermouth.pdb.read_pdb("BENZ.pdb") - # Express the mapping as CGSmiles string + # Express the mapping as CGsmiles string cgsmiles_str = "{[#R]1[#R][#R]1}.{#R=[$]cc[$]}" # Resolve molecule into networkx graphs @@ -68,9 +34,3 @@ BENZ.pdb from this repository. pos += mol.nodes[mapping[all_atom_node]]['position'] final_pos = pos / len(fragement) res_graph.nodes[node][final_pos] - -Searching the Martini databse of small molecules ------------------------------------------------- - -Here goes some example on how to lookup molecules from the Martini -Database using CGSmiles diff --git a/docs/source/gettingstarted/tutorials/martini.rst b/docs/source/gettingstarted/tutorials/martini.rst new file mode 100644 index 0000000..5ed8071 --- /dev/null +++ b/docs/source/gettingstarted/tutorials/martini.rst @@ -0,0 +1,57 @@ +Martini Mappings +================ + +CGsmiles can be used to define Martini mappings. Currently there are no canonical +rules on how the CGsmiles string needs to be formatted. 
However, we recommend +following the rules listed below to store all information relevant for forward +mapping, bead assignment, and backwards mapping. + +Design Guidelines +----------------- + +- use the bead type as fragment name in the CGsmiles string +- if there are multiple fragments that have the same bead type append letters + A-Z (e.g. TC5 -> TC5A) +- if an atom is part of more than one fragment use the [!] bonding operator +- annotate chiral atoms in the fragment part +- annotate cis/trans isomers in the fragment part +- annotate weights if applicable +- annotate charges in the Martini resolution part if there are any +- draw your string to check its correctness + +Consider the difference between the Martini 3 mappings for Toluene and +2,4-dichlorotoluene shown below. + +.. list-table:: Martini Mappings of Toluene and 2,4-dichlorotoluene + :widths: auto + :header-rows: 0 + + * - .. image:: /images/toluene.jpeg + :width: 400px + - .. image:: /images/dichlorotoluene.jpeg + :width: 400px + +In Toluene the two TC5 beads are equivalent and connect the same way to the SC4 +bead. Therefore the CGsmiles string below is valid. + +.. code:: + + Toluene + {[#SC4]1[#TC5][#TC5]1}.{#SC4=Cc(c[!])c[!],#TC5=[!]ccc[!]} + +However, in 2,4-dichlorotoluene there are two SX3 beads which are equivalent +except for the fact that they connect to the SC4 beads at different carbons. Once +the carbon with the chlorine connects to the SC4 and once the carbon without the +chlorine connects. To represent this connectivity correctly you need to use two +different fragments and two differently labeled bonding operators in the CGsmiles +string as shown below: + +.. code:: + + 2,4-dichlorotoluene + {[#SC4]1[#SX3][#SX3A]1}.{#SC4=Cc[$a]c[$],#SX3=Clc[$a]c[$b],#SX3A=Clc[$b]c[$]} + + +Examples +-------- +An extensive list with examples can be found in the publication. 
diff --git a/docs/source/gettingstarted/tutorials/polymers.rst b/docs/source/gettingstarted/tutorials/polymers.rst new file mode 100644 index 0000000..34ee560 --- /dev/null +++ b/docs/source/gettingstarted/tutorials/polymers.rst @@ -0,0 +1,87 @@ +Polymers +======== + +Here is a collection of CGsmiles applications for +polymer molecules. + +Linear Polymer Polystyrene +-------------------------- + +Polystyrene (PS) is one of the most common commodity +polymers. In the following example we resolve the +CGsmiles string and draw the polymer graphs. + +.. code:: python + + import networkx as nx + import pysmiles + import cgsmiles + from cgsmiles.drawing import draw_molecule, FRAGID_TO_COLOR + + # Express 5 units of Polystyrene in CGsmiles + cgsmiles_str = "{[#PS]|5}.{#PS=[$]CC[$](c1ccccc1)}" + + # Resolve molecule into networkx graphs + res_graph, mol_graph = cgsmiles.MoleculeResolver.from_string(cgsmiles_str).resolve() + + # Draw graph at the monomer resolution + labels = nx.get_node_attributes(res_graph, 'fragname') + ax, pos = draw_molecule(res_graph, + colors=FRAGID_TO_COLOR, + cg_mapping=False, + labels=labels) + + # Draw graph at the atomic resolution + pysmiles.remove_explicit_hydrogens(mol_graph) + ax, pos = draw_molecule(mol_graph, scale=0.7) + +Graft Polymer mPEG Acrylate +--------------------------- + +mPEG Acrylate is a branched graft polymer that +contains PEG units attached to a poly(methyl acrylate) +backbone. In CGsmiles we can represent this polymer +in multiple equivalent ways. + +.. 
code:: python + + import matplotlib.pyplot as plt + import networkx as nx + import pysmiles + import cgsmiles + from cgsmiles.drawing import draw_molecule + + # Using 2 resolutions + cgsmiles_str_two = "{[#PMA]([#PEG]|3)|5}.{#PMA=[<]CC[>]C(=O)OC[$],#PEG=[$]COC[$]}" + + # Using 3 resolutions + cgsmiles_str_three = "{[#mPEG]|5}.{#mPEG=[$][#PMA][$]([#PEG]|3)}.{#PMA=[<]CC[>]C(=O)OC[$],#PEG=[$]COC[$]}" + + # Resolve molecule into networkx graphs + # Using the resolve_all method we directly jump to the last level + # which means that res_graph is the graph of the monomeric repeat units + res_graph_two, _ = cgsmiles.MoleculeResolver.from_string(cgsmiles_str_two).resolve_all() + res_graph_three, _ = cgsmiles.MoleculeResolver.from_string(cgsmiles_str_three).resolve_all() + + # Let's make a custom coloring function that colors by fragment name + def custom_colors_names(graph): + fragname_colors = {"PMA": "tab:blue", "PEG": "tab:red"} + fragnames = nx.get_node_attributes(graph, "fragname") + colors = {node: fragname_colors[fragname] for node, fragname in fragnames.items()} + return colors, fragnames + + colors_two, labels_two = custom_colors_names(res_graph_two) + colors_three, labels_three = custom_colors_names(res_graph_three) + + # Draw the residue graphs only + fig, axes = plt.subplots(1, 2, figsize=(10, 6)) + draw_molecule(res_graph_two, + ax=axes[0], + cg_mapping=False, + colors=colors_two, + scale=0.75) + draw_molecule(res_graph_three, + ax=axes[1], + cg_mapping=False, + colors=colors_three, + scale=0.75) diff --git a/docs/source/gettingstarted/tutorials/resolving.rst b/docs/source/gettingstarted/tutorials/resolving.rst new file mode 100644 index 0000000..a45ac41 --- /dev/null +++ b/docs/source/gettingstarted/tutorials/resolving.rst @@ -0,0 +1,55 @@ +Reading & Resolving +=================== + +A CGsmiles string can contain a base-graph (see Syntax Rules) and +multiple enumerations of fragment graphs each corresponding to a +different resolution. 
The base graph can be read using the +``read_cgsmiles`` function, while the fragments can be read using +the ``read_fragments`` function. However, most users will find it +convenient to directly read the entire string and resolve the +different resolutions. This is done using the ``MoleculeResolver`` +class. + +First we need to import the ``MoleculeResolver`` and initiate it +using the ``from_string`` or one of the other initiator methods. +Note that we can specify if the last resolution is at the atomic +level by providing the ``last_all_atom=True`` argument. + +.. code-block:: python + + from cgsmiles import MoleculeResolver + cgsmiles_string = '{[#TC5]1[#TC5][#TC5]1}.{#TC5=[$]cc[$]}' + resolver = MoleculeResolver.from_string(cgsmiles_string, + last_all_atom=True) + +Next we can resolve the atomic resolution from the CG graph by +running the ``.resolve`` function once. + +.. code-block:: python + + cg_graph, aa_graph = resolver.resolve() + +For multiple resolutions we can run the ``.resolve`` function +multiple times. Each time a new set of graphs at a coarse level +and the next finer level is returned. Alternatively, the +``resolve_iter`` can be used to loop over all resolutions. Let's +take the molecule in Figure 3 of the main paper: + +.. 
code-block:: python + + from cgsmiles import MoleculeResolver + # CGsmiles string with 3 resolutions + cgsmiles_str = "{[#hphilic][#hdphob]|3[#hphilic]}.\ + {#hphilic=[<][#PEO][>]|3,#hdphob=[<][#PMA][>]([#BUT])}.\ + {#PEO=[<][#SN3r][>],#PMA=[<][#TC3][>][#SN4a][$],#BUT=[$][#SC3][$]}.\ + {#SN3r=[<]COC[>],#TC3=[<]CC[>][$1],#SN4a=[$1]C(=O)OC[$2],#SC3=[$2]CCC}" + # Generate the MoleculeResolver + resolver = MoleculeResolver.from_string(cgsmiles_str, last_all_atom=True) + + # Now we can loop over all resolutions using + for coarse_graph, finer_graph in resolver.resolve_iter(): + print(coarse_graph.nodes(data='fragname')) + print(finer_graph.nodes(data='atomname')) + +Alternatively, we could just have gotten the final pair of graphs by calling +``.resolve_all()``. diff --git a/docs/source/images/dichlorotoluene.jpeg b/docs/source/images/dichlorotoluene.jpeg new file mode 100644 index 0000000..5be30e8 Binary files /dev/null and b/docs/source/images/dichlorotoluene.jpeg differ diff --git a/docs/source/images/toluene.jpeg b/docs/source/images/toluene.jpeg new file mode 100644 index 0000000..b01b1e7 Binary files /dev/null and b/docs/source/images/toluene.jpeg differ diff --git a/docs/source/index.rst b/docs/source/index.rst index c72f144..1926816 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -9,7 +9,7 @@ Table of Contents gettingstarted/installation gettingstarted/syntax_examples - gettingstarted/api_examples + gettingstarted/tutorials/index.rst .. 
toctree:: :maxdepth: 2 @@ -25,7 +25,6 @@ Table of Contents :maxdepth: 2 :caption: API - api/overview.rst api/cgsmiles.resolve api/cgsmiles.drawing api/cgsmiles.sample diff --git a/docs/source/syntax/basic_graph_description.rst b/docs/source/syntax/basic_graph_description.rst index 09bce26..cbe4200 100644 --- a/docs/source/syntax/basic_graph_description.rst +++ b/docs/source/syntax/basic_graph_description.rst @@ -3,11 +3,11 @@ General Graph Syntax Overview -------- -The first resolution of the CGSmiles notation captures the coarsest representation +The first resolution of the CGsmiles notation captures the coarsest representation of a molecule. The syntax is adapted from the SMILES notation and can be used to represent arbitrary graphs. These graphs do not need to be molecules but the syntax is geared towards molecules. The basic syntax features are sufficient to -write a CGSmiles string for any (connected) graph. The advanced syntax features +write a CGsmiles string for any (connected) graph. The advanced syntax features can be used to reduce the verbosity through use of a multiplication operator, allow annotation of bond orders, which are important for atomic resolutions and resolving multiple resolutions, as well as a general annotation syntax that @@ -15,7 +15,7 @@ permits writing of node labels. Basic Syntax Features ----------------------- -The basic structure of CGSmiles involves describing each node within a graph +The basic structure of CGsmiles involves describing each node within a graph using a specific notation that identifies connections and relationships between nodes. Here’s how the nodes and their connections are represented: @@ -63,7 +63,7 @@ connecting nodes A, B, and C would be written as ``[#A]1[#B][#C]1``. String Encapsulation ^^^^^^^^^^^^^^^^^^^^ -For clarity and to define boundaries, CGSmiles strings are enclosed in curly braces. +For clarity and to define boundaries, CGsmiles strings are enclosed in curly braces. .. 
code-block:: none @@ -117,7 +117,7 @@ Annotations ^^^^^^^^^^^ Some important information are are not encoded by the graph representation of a molecule. Such information are for examples charges or chirality. -CGSmiles supports a general annotation syntax, which allows users to store +CGsmiles supports a general annotation syntax, which allows users to store this kind of information in the form of ``symbol=value`` pairs. Any node name may be followed by one or more of these ``symbol=value`` pairs separated by a semi-colon. For example, to specify that node a has a charge of 1 but node @@ -151,7 +151,7 @@ symbols is named a `dialect` and can be specified using the functionality in the dialect module. Note that currently dialects are not easily accessible for modification. -CGSmiles comes with two sets of predefined dialects. One is used for the coarse +CGsmiles comes with two sets of predefined dialects. One is used for the coarse resolution fragments / graphs and the other for those which are of atomic resolution. The table below lists the specifications of those keywords. Note that it is always permissible to use the keyword explicitly. @@ -175,7 +175,7 @@ Reserved Annotation Symbols Multiplication Operator ^^^^^^^^^^^^^^^^^^^^^^^ To efficiently represent repeated units in large molecules, such as polymers, -CGSmiles syntax includes a multiplication operator ``|``. This operator can be +CGsmiles syntax includes a multiplication operator ``|``. This operator can be applied after a node or a branch to repeat it a specified number of times. - **Node Multiplication:** The multiplication operator is placed after a node @@ -200,7 +200,7 @@ applied after a node or a branch to repeat it a specified number of times. 
Syntax Features Lookup Table ---------------------------- Below is the updated quick reference table for the essential features of -CGSmiles syntax: +CGsmiles syntax: +----------------+----------------------------------------------+------------------------------------------------+ | Feature | Description | Example | diff --git a/docs/source/syntax/chirality.rst b/docs/source/syntax/chirality.rst index 80f4c80..798f6cd 100644 --- a/docs/source/syntax/chirality.rst +++ b/docs/source/syntax/chirality.rst @@ -6,7 +6,7 @@ features have no direct counterparts in CG models and require special treatment. Implicit Hydrogen ^^^^^^^^^^^^^^^^^ The simplest case is the treatment of implicit hydrogen atoms. SMILES allows for -shorthand notation where hydrogen atoms can be omitted and CGSmiles adopts this +shorthand notation where hydrogen atoms can be omitted and CGsmiles adopts this approach. Hydrogen atoms are automatically assigned once the full atomistic molecule is resolved. This procedure ensures proper handling of any unconsumed bonding operators, which are interpreted as additional hydrogen atoms where @@ -25,7 +25,7 @@ constructing the complete SMILES string. Chirality ^^^^^^^^^ -CGSmiles adopts an explicit method of chirality assignment using annotations. A +CGsmiles adopts an explicit method of chirality assignment using annotations. A chiral atom can be annotated using the ``x`` keyword as shorthand for chirality. For example, S-Alanine is represented as ``C[C;x=S]C(=O)ON``, while R-Alanine is written as ``C[C;x=R]C(=O)ON``. The ``x`` may be omitted if a weight is defined @@ -35,7 +35,7 @@ general annotation syntax for more information. Aromaticity ^^^^^^^^^^^ In SMILES, aromaticity is encoded using lowercase letters as a shorthand for -aromatic atoms or a colon as a marker for aromatic bonds. CGSmiles utilizes the +aromatic atoms or a colon as a marker for aromatic bonds. CGsmiles utilizes the same convention. 
In addition, aromatic systems may also be split across multiple fragments by simply keeping the shorthand. For example, Martini Benzene is represented as: @@ -46,13 +46,13 @@ represented as: Although the shorthand for aromaticity is well-defined, its interpretation in SMILES remains somewhat ambiguous. To ensure unambiguous valance assignment, -necessary for tasks like adding hydrogen atoms, CGSmiles employs the following +necessary for tasks like adding hydrogen atoms, CGsmiles employs the following definition: only atoms capable of participating in delocalization-induced molecular equivalence (i.e., systems where multiple resonance structures can be drawn without introducing charges) are considered aromatic. By this definition Benzene is aromatic but thiophene is not. CGsmiles uses the same definition as Pysmiles package, which provides a more detailed discussion of this topic. To -enhance user-friendliness, the CGSmiles API automatically corrects strings with +enhance user-friendliness, the CGsmiles API automatically corrects strings with incorrectly assigned aromaticity at the time of reading. If corrections cannot be made unambiguously, an error is raised, ensuring robust and accurate handling of aromaticity. diff --git a/docs/source/syntax/fragments.rst b/docs/source/syntax/fragments.rst index 48d7692..310e8fa 100644 --- a/docs/source/syntax/fragments.rst +++ b/docs/source/syntax/fragments.rst @@ -3,7 +3,7 @@ General Fragment Syntax Overview -------- -CGSmiles supports the representation of molecular structures at different +CGsmiles supports the representation of molecular structures at different resolutions through a fragment replacement syntax. This allows users to specify more detailed molecular structures connected to a coarse graph representation. @@ -23,7 +23,7 @@ syntax to define an atomic resolution fragment. 
Bond Operators ^^^^^^^^^^^^^^ To define how two consecutive fragments at a finer resolution are connected, -CGSmiles builds upon the bonding connector syntax established in BigSMILES to +CGsmiles builds upon the bonding connector syntax established in BigSMILES to avoid ambiguity. Any node or atom that connects to a neighboring fragment is followed by one of four bonding connectors (‘$’, ‘>’, ‘<’, ‘!’) enclosed in square brackets. In addition, any operator may be combined with an alphanumeric @@ -52,7 +52,7 @@ label to distinguish non-equivalent operators of the same type. - **Shared Bonding Operator !** To address a common scenario in CG force fields where an atom is distributed - between two finer resolution nodes, CGSmiles introduces the shared bonding + between two finer resolution nodes, CGsmiles introduces the shared bonding operator ‘!’. In the case of toluene represented at the Martini 3 level, some of the ring atoms are shared between the two CG beads. When two fragments are connected using the shared bonding operator, the atoms at the connection point @@ -78,7 +78,7 @@ double bond between ethane and propane fragment. 
Updated Bonding Descriptors Lookup Table ---------------------------------------- -This table now includes the squash descriptor, summarizing all the bonding descriptors used in CGSmiles: +This table now includes the squash descriptor, summarizing all the bonding descriptors used in CGsmiles: +----------------+---------------------------+--------------------------------------------------------------------+ | Descriptor | Symbol | Description | diff --git a/docs/source/syntax/introduction.rst b/docs/source/syntax/introduction.rst index e17881c..1e13b5c 100644 --- a/docs/source/syntax/introduction.rst +++ b/docs/source/syntax/introduction.rst @@ -1,7 +1,7 @@ Introduction ============ -The CGSmiles line notation encodes arbitrary resolutions of molecules and +The CGsmiles line notation encodes arbitrary resolutions of molecules and defines the conversion between these resolutions unambiguously. Each resolution is explicitly defined and multiple resolutions may be layered together using this notation. @@ -12,7 +12,7 @@ polymer, which represent a coarser resolution compared to the next (all-atom) representation. Edges in the graph describe chemical connections between these (groups of) atoms. -With this premise, the first resolution of the CGSmiles notation describes +With this premise, the first resolution of the CGsmiles notation describes the molecule graph at the coarsest level. Subsequent resolutions define fragments that specify how each node is represented at the next finer resolution (e.g. residue to coarse-grained beads, or coarse-grained beads @@ -25,6 +25,6 @@ below: In the remainder of this section we first explain the syntax to describe a general graph, which can represent a molecule at any resolution in -CGSmiles. Subsequently, the description is extended to define fragments. +CGsmiles. Subsequently, the description is extended to define fragments. 
Finally, it is show how to deal with special issues that can arise when converting a coarse resolution graph to atomic representation. diff --git a/docs/source/syntax/multiple_resolutions.rst b/docs/source/syntax/multiple_resolutions.rst index c879c5e..bea6c1b 100644 --- a/docs/source/syntax/multiple_resolutions.rst +++ b/docs/source/syntax/multiple_resolutions.rst @@ -1,7 +1,7 @@ Layering of Resolutions ======================= -CGSmiles enables the representation of molecular graphs at arbitrary resolutions +CGsmiles enables the representation of molecular graphs at arbitrary resolutions and their connection to progressively finer resolutions, allowing for the hierarchical layering of multiple levels of details. @@ -14,7 +14,7 @@ The notation starts with the coarsest representation of the system – the base graph. This graph is enclosed in curly braces. Each additional resolution is represented as a list of fragment graphs, also enclosed in curly braces and separated from the preceding resolution graph by a period. If the final resolution -graph is at the atomic level, either CGSmiles or OpenSMILES syntax can be used +graph is at the atomic level, either CGsmiles or OpenSMILES syntax can be used to describe the fragment graph. This dual approach allows seamless conversion to atomistic resolution using established standards, while also supporting intermediate coarse-grained representations. @@ -30,7 +30,7 @@ Linearizing Rings ^^^^^^^^^^^^^^^^^ Rings at the atomistic resolution can often be mapped into linear structures at the CG level, a common practice in chemically specific force fields such -as Martini. In the CGSmiles notation, bond orders at the coarser resolution are +as Martini. In the CGsmiles notation, bond orders at the coarser resolution are utilized to describe such a case. For example, cyclohexane is represented at the Martini 3 level with a bond @@ -61,7 +61,7 @@ correspond to any finer-resolution nodes or atoms. 
For example, at the Martini 3 resolution glucose is represented by three CG particles splitting the sugar ring and one additional virtual particle. The TC4 bead captures the hydrophobic interactions at the ring center but lacks any corresponding fragments at finer -resolution. To accommodate such particles, the CGSmiles notation employs zero +resolution. To accommodate such particles, the CGsmiles notation employs zero bond order edges, referred to as virtual edges. .. code-block:: none @@ -83,9 +83,9 @@ the fine-grained resolution because of a loss in resolution at the CG level. An example are Martini lipids such as POPC. POPC can describe lipids with a tail length of 16 or 18 carbons and thus represents at least four molecules when accounting for the position for the double bond. To capture this feature -CGSmiles allows to overload the wildcard (*) syntax using annotations. In +CGsmiles allows to overload the wildcard (*) syntax using annotations. In OpenSMILES a wildcard means any atom can be placed at the wildcard position. -To specify a selection of atoms CGSmiles allows to annotate a wildcard using the +To specify a selection of atoms CGsmiles allows to annotate a wildcard using the select keyword abbreviated as ‘s’. Thus, a tail bead in POPC could be written as ``C1=CCCC[*;s=C,0][*;s=C,0]``. Note that the current molecule resolver is not able to handle wildcard overloading.