From 3586276eef77effce61808c25d10c82d6212d513 Mon Sep 17 00:00:00 2001 From: TieuLongPhan Date: Mon, 22 Apr 2024 10:27:22 +0200 Subject: [PATCH] update curate oxidation and reduction --- Test/SynChemImputer/test_curate_oxidation.py | 63 +++++ Test/SynChemImputer/test_curate_reduction.py | 165 +++++++++++++ synrbl/SynChemImputer/compounds_template.json | 23 +- synrbl/SynChemImputer/curate_oxidation.py | 186 +++++++++++++++ synrbl/SynChemImputer/curate_reduction.py | 198 +++++++++++++--- .../functional_group_checker.py | 219 ------------------ synrbl/SynChemImputer/post_process.py | 82 +++++++ synrbl/SynChemImputer/reaction_template.json | 70 ++++++ synrbl/SynChemImputer/reduction_template.py | 102 -------- .../SynChemImputer/reduction_templates.json | 42 ---- synrbl/SynUtils/chem_utils.py | 89 ++++++- synrbl/SynUtils/common.py | 1 - 12 files changed, 827 insertions(+), 413 deletions(-) create mode 100644 Test/SynChemImputer/test_curate_oxidation.py create mode 100644 Test/SynChemImputer/test_curate_reduction.py create mode 100644 synrbl/SynChemImputer/curate_oxidation.py delete mode 100644 synrbl/SynChemImputer/functional_group_checker.py create mode 100644 synrbl/SynChemImputer/post_process.py create mode 100644 synrbl/SynChemImputer/reaction_template.json delete mode 100644 synrbl/SynChemImputer/reduction_template.py delete mode 100644 synrbl/SynChemImputer/reduction_templates.json diff --git a/Test/SynChemImputer/test_curate_oxidation.py b/Test/SynChemImputer/test_curate_oxidation.py new file mode 100644 index 0000000..e34bca2 --- /dev/null +++ b/Test/SynChemImputer/test_curate_oxidation.py @@ -0,0 +1,63 @@ +import unittest +from synrbl.SynChemImputer.curate_oxidation import CurationOxidation + + +class TestCurationOxidation(unittest.TestCase): + + def setUp(self): + self.curate = CurationOxidation() + self.data = [ + {"R-id": "alcohol_carbonyl", "reactions": "CCO.[O]>>CC=O"}, + {"R-id": "alcohol_acid", "reactions": "CCO.[O]>>CC(=O)O"}, + {"R-id": 
"carbonyl_acid", "reactions": "CC=O.[O]>>CC(=O)O"}, + ] + + def test_alcohol_carbonyl(self): + result = self.curate.process_dict( + self.data[0], + "reactions", + return_all=True, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result["curated_reaction"][0], + "CCO.O=[Cr](Cl)(-[O-])=O.c1cc[nH+]cc1>>CC=O.O=[Cr](O)O.c1cc[nH+]cc1.[Cl-]", + ) + self.assertEqual(result["stoichiometric"][0], [1]) + + def test_alcohol_acid(self): + result = self.curate.process_dict( + self.data[1], + "reactions", + return_all=True, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + print(result["curated_reaction"][0]) + self.assertEqual( + result["curated_reaction"][0], + ( + "CCO.[K][O][Mn](=O)(=O)=O.OS(=O)(=O)O>>" + + "CC(=O)O.[K][O]S(=O)(=O)[O][K].[Mn]1[O]S(=O)(=O)[O]1" + ), + ) + self.assertEqual(result["stoichiometric"][0], [5, 4, 6, 5, 11, 2, 4]) + + def test_aldehyde_acid(self): + result = self.curate.process_dict( + self.data[2], + "reactions", + return_all=True, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result["curated_reaction"][0], + "CC=O.O=[Mn](=O)(=O)O[K].O>>CC(=O)O.O=[Mn]=O.O[K]", + ) + self.assertEqual(result["stoichiometric"][0], [5, 4, 6, 5, 4, 1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/Test/SynChemImputer/test_curate_reduction.py b/Test/SynChemImputer/test_curate_reduction.py new file mode 100644 index 0000000..7254217 --- /dev/null +++ b/Test/SynChemImputer/test_curate_reduction.py @@ -0,0 +1,165 @@ +import unittest +from synrbl.SynChemImputer.curate_reduction import CurationReduction + + +class TestCurationReduction(unittest.TestCase): + + def setUp(self): + self.curate = CurationReduction() + self.data = [ + {"R-id": "aldehyde", "reactions": "CC=O.[H].[H]>>CCO"}, + {"R-id": "ketone", "reactions": 
"CC(=O)C.[H].[H]>>CC(O)C"}, + {"R-id": "acid", "reactions": "CC(=O)O.[H].[H].[H].[H]>>CCO.O"}, + {"R-id": "ester", "reactions": "CC(=O)OC.[H].[H].[H].[H]>>CCO.CO"}, + ] + + def test_aldehyde(self): + result_ion = self.curate.process_dict( + self.data[0], + "reactions", + return_all=True, + neutralize=False, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result_ion["curated_reaction"][1], "CC=O.[BH4-].[Na+].[H+]>>CCO.[BH3].[Na+]" + ) + + result_neutral = self.curate.process_dict( + self.data[0], + "reactions", + return_all=True, + neutralize=True, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result_neutral["curated_reaction"][1], + "CC=O.[BH4-].[Na+].Cl>>CCO.[BH3].[Na][Cl]", + ) + + def test_ketone(self): + result_ion = self.curate.process_dict( + self.data[1], + "reactions", + return_all=True, + neutralize=False, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result_ion["curated_reaction"][2], + "CC(=O)C.[BH3-]C#N.[Na+].[H+]>>CC(O)C.[BH2]C#N.[Na+]", + ) + + result_neutral = self.curate.process_dict( + self.data[1], + "reactions", + return_all=True, + neutralize=True, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result_neutral["curated_reaction"][2], + "CC(=O)C.[BH3-]C#N.[Na+].Cl>>CC(O)C.[BH2]C#N.[Na][Cl]", + ) + + def test_acid(self): + result_ion = self.curate.process_dict( + self.data[2], + "reactions", + return_all=True, + neutralize=False, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result_ion["curated_reaction"][0], + ( + "CC(=O)O.[AlH4-].[Li+].[H+].[AlH4-].[Li+].[H+]>>" + + "CCO.O.[AlH3].[Li+].[AlH3].[Li+]", + ), + ) + + result_neutral 
= self.curate.process_dict( + self.data[2], + "reactions", + return_all=True, + neutralize=True, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result_neutral["curated_reaction"][0], + ( + "CC(=O)O.[AlH4-].[Li+].Cl.[AlH4-].[Li+].Cl>>" + + "CCO.O.[AlH3].[Li][Cl].[AlH3].[Li][Cl]" + ), + ) + + def test_ester(self): + result_ion = self.curate.process_dict( + self.data[3], + "reactions", + return_all=True, + neutralize=False, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + self.assertEqual( + result_ion["curated_reaction"][0], + ( + "CC(=O)OC.[BH4-].[Na+].[H+].[BH4-].[Na+].[H+]>>" + + "CCO.CO.[BH3].[Na+].[BH3].[Na+]" + ), + ) + + result_neutral = self.curate.process_dict( + self.data[3], + "reactions", + return_all=True, + neutralize=True, + compounds_template=self.curate.compounds_template, + reaction_templates=self.curate.reaction_templates, + ) + print(result_neutral) + self.assertEqual( + result_neutral["curated_reaction"][0], + ( + "CC(=O)OC.[BH4-].[Na+].Cl.[BH4-].[Na+].Cl>>" + + "CCO.CO.[BH3].[Na][Cl].[BH3].[Na][Cl]" + ), + ) + + def test_parallel_curate(self): + result = self.curate.parallel_curate( + self.data, n_jobs=2, verbose=2, return_all=True, neutralize=False + ) + + self.assertEqual( + result[0]["curated_reaction"][1], "CC=O.[BH4-].[Na+].[H+]>>CCO.[BH3].[Na+]" + ) + self.assertEqual( + result[1]["curated_reaction"][2], + "CC(=O)C.[BH3-]C#N.[Na+].[H+]>>CC(O)C.[BH2]C#N.[Na+]", + ) + self.assertEqual( + result[2]["curated_reaction"][0], + ( + "CC(=O)O.[AlH4-].[Li+].[H+].[AlH4-].[Li+].[H+]>>" + + "CCO.O.[AlH3].[Li+].[AlH3].[Li+]", + ), + ) + self.assertEqual( + result[3]["curated_reaction"][0], + ( + "CC(=O)OC.[BH4-].[Na+].[H+].[BH4-].[Na+].[H+]>>" + + "CCO.CO.[BH3].[Na+].[BH3].[Na+]", + ), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/synrbl/SynChemImputer/compounds_template.json 
b/synrbl/SynChemImputer/compounds_template.json index 36a2ef1..c0040f3 100644 --- a/synrbl/SynChemImputer/compounds_template.json +++ b/synrbl/SynChemImputer/compounds_template.json @@ -1,9 +1,18 @@ { - "aldehyde": ["template_1", "template_2", "template_3", "template_4"], - "ketone": ["template_1", "template_2", "template_3", "template_4"], - "ester": ["template_2", "template_3", "template_4"], - "acyl_chloride": ["template_2", "template_3", "template_4"], - "amid": ["template_4"], - "carboxylic_acid": ["template_4"], - "other": ["template_1"] + "reduction": { + "aldehyde": ["template_1", "template_2", "template_3", "template_4"], + "ketone": ["template_1", "template_2", "template_3", "template_4"], + "ester": ["template_2", "template_3", "template_4"], + "acyl_chloride": ["template_2", "template_3", "template_4"], + "amid": ["template_4"], + "carboxylic_acid": ["template_4"], + "other": ["template_1"] + }, + "oxidation": { + "primary_alcohol>>aldehyde": ["template_1"], + "secondary_alcohol>>ketone": ["template_1"], + "primary_alcohol>>carboxylic_acid": ["template_2"], + "aldehyde>>carboxylic_acid": ["template_3"], + "other": [] + } } diff --git a/synrbl/SynChemImputer/curate_oxidation.py b/synrbl/SynChemImputer/curate_oxidation.py new file mode 100644 index 0000000..09a627b --- /dev/null +++ b/synrbl/SynChemImputer/curate_oxidation.py @@ -0,0 +1,186 @@ +from typing import Dict, List, Tuple, Optional +from synrbl.SynUtils.chem_utils import ( + find_functional_reactivity, + check_for_isolated_atom, + count_radical_atoms, +) +from joblib import Parallel, delayed +import synrbl.SynChemImputer +from synrbl.SynUtils.rsmi_utils import load_database +import rdkit.RDLogger as RDLogger +import importlib.resources + +RDLogger.DisableLog("rdApp.*") + + +compounds_template = load_database( + importlib.resources.files(synrbl.SynChemImputer).joinpath("compounds_template.json") +) + +reaction_templates = load_database( + 
importlib.resources.files(synrbl.SynChemImputer).joinpath("reaction_template.json") +) + + +class CurationOxidation: + def __init__( + self, + compounds_template: Dict = compounds_template, + reaction_templates: Dict = reaction_templates, + ): + self.compounds_template = compounds_template + self.reaction_templates = reaction_templates + + @staticmethod + def find_oxidation_pattern(reaction_smiles: str) -> List[str]: + """ + Determines the predominant oxidation pattern from a SMILES string + by identifying functional group changes. + + Parameters: + reaction_smiles (str): SMILES string representing the chemical reaction. + + Returns: + List[str]: A list containing the identified oxidation pattern + described as 'reactant>>product'. + """ + reactant_fg, product_fg = find_functional_reactivity(reaction_smiles) + if reactant_fg and product_fg: + return [f"{reactant_fg[0]}>>{product_fg[0]}"] + return [] + + @staticmethod + def process_ox_template( + reaction_smiles: str, compounds_template: Dict, reaction_templates: Dict + ) -> Tuple[str, Optional[bool]]: + """ + Processes an oxidation template based on the given SMILES string of the reaction. + + Parameters: + reaction_smiles (str): The reaction SMILES string + compounds_template (Dict): A dictionary containing compounds templates + reaction_templates (Dict): A dictionary containing reaction templates + + Returns: + Tuple[str, Optional[bool]]: A tuple containing the modified SMILES string + and a boolean indicating if the process was stoichiometric. 
+ """ + reaction_list = [] + stoichiometry_list = [] + try: + cp_temp = CurationOxidation.find_oxidation_pattern(reaction_smiles)[0] + temps = compounds_template["oxidation"].get( + cp_temp, compounds_template["oxidation"]["other"] + ) + except IndexError: + # print("No oxidation pattern found.") + return [reaction_smiles], [None] + + reactant, product = reaction_smiles.split(">>") + o_count = count_radical_atoms(reactant, 8) + reactant = [x for x in reactant.split(".") if x != "[O]"] + product = product.split(".") + if len(temps) == 0: + return [reaction_smiles], [None] + + for temp in temps: + + if cp_temp in [ + "primary_alcohol>>aldehyde", + "secondary_alcohol>>ketone", + "aldehyde>>carboxylic_acid", + ]: + for _ in range(o_count): + reactant.extend(reaction_templates["oxidation"][temp]["reactants"]) + product.extend(reaction_templates["oxidation"][temp]["products"]) + stoichiometry = reaction_templates["oxidation"][temp][ + "stoichiometric" + ] + elif cp_temp == "primary_alcohol>>carboxylic_acid": + reactant.extend(reaction_templates["oxidation"][temp]["reactants"]) + product.extend(reaction_templates["oxidation"][temp]["products"]) + stoichiometry = reaction_templates["oxidation"][temp]["stoichiometric"] + else: + return [reaction_smiles], [None] + + reactant = ".".join(reactant) + product = ".".join(product) + rsmi = f"{reactant}>>{product}" + reaction_list.append(rsmi) + stoichiometry_list.append(stoichiometry) + + return reaction_list, stoichiometry_list + + @staticmethod + def process_dict( + reaction_dict: Dict, + reaction_columns: str = "reactions", + compounds_template: Dict = None, + reaction_templates: Dict = None, + return_all: bool = False, + ) -> Dict: + """ + Processes a single reaction dictionary and updates it with the results of + oxidation reaction curation. + + Parameters: + reaction_dict (Dict): The dictionary containing the reaction data. + reaction_columns (str): The key where the reaction SMILES string is stored. 
+ Defaults to 'reactions'. + compounds_template (Dict, optional): A dictionary of compounds templates. + reaction_templates (Dict, optional): A dictionary of reaction templates. + return_all (bool): A flag to determine if all results or only the first result + should be returned. + + Returns: + Dict: The updated dictionary with additional fields for curated reaction, + stoichiometry, and radical presence. + """ + reaction = reaction_dict[reaction_columns] + new_reaction, stoichiometry = CurationOxidation.process_ox_template( + reaction, compounds_template, reaction_templates + ) + reaction_dict["curated_reaction"] = ( + new_reaction if return_all else new_reaction[0] + ) + reaction_dict["stoichiometric"] = ( + stoichiometry if return_all else stoichiometry[0] + ) + reaction_dict["radical"] = check_for_isolated_atom(new_reaction[0], "O") + return reaction_dict + + def parallel_curate( + self, + data: List[Dict], + n_jobs: int = 4, + verbose: int = 1, + return_all: bool = False, + ) -> List[Dict]: + """ + Curates a list of reaction dictionaries in parallel, updating each + with oxidation reaction curation results. + + Parameters: + data (List[Dict]): A list of dictionaries, each containing + reaction data to be curated. + n_jobs (int): The number of parallel jobs to run. Defaults to 4. + verbose (int): The verbosity level for parallel processing. Defaults to 1. + return_all (bool): A flag to determine if all results or only the first result + should be returned for each reaction. + + Returns: + List[Dict]: A list of the curated dictionaries, each updated with + additional fields for curated reaction, + stoichiometry, and radical presence. 
+ """ + results = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(CurationOxidation.process_dict)( + reaction, + "reactions", + self.compounds_template, + self.reaction_templates, + return_all=return_all, + ) + for reaction in data + ) + return results diff --git a/synrbl/SynChemImputer/curate_reduction.py b/synrbl/SynChemImputer/curate_reduction.py index a87685f..ad1d431 100644 --- a/synrbl/SynChemImputer/curate_reduction.py +++ b/synrbl/SynChemImputer/curate_reduction.py @@ -1,63 +1,185 @@ -import re -import copy -from typing import List, Dict, Any +from typing import Dict, List, Tuple, Optional +from synrbl.SynUtils.chem_utils import ( + find_functional_reactivity, + check_for_isolated_atom, + count_radical_atoms, +) from joblib import Parallel, delayed -from synrbl.SynChemImputer.reduction_template import ReductionTemplate +import synrbl.SynChemImputer +from synrbl.SynUtils.rsmi_utils import load_database import rdkit.RDLogger as RDLogger +import importlib.resources RDLogger.DisableLog("rdApp.*") -class CurateReduction: +compounds_template = load_database( + importlib.resources.files(synrbl.SynChemImputer).joinpath("compounds_template.json") +) + +reaction_templates = load_database( + importlib.resources.files(synrbl.SynChemImputer).joinpath("reaction_template.json") +) + + +class CurationReduction: + def __init__( + self, + compounds_template: Dict = compounds_template, + reaction_templates: Dict = reaction_templates, + ): + self.compounds_template = compounds_template + self.reaction_templates = reaction_templates @staticmethod - def check_for_isolated_hydrogen(smiles: str) -> bool: + def find_reduction_pattern(reaction_smiles: str) -> List[str]: + """ + Determines the predominant oxidation pattern from a SMILES string + by identifying functional group changes. + + Parameters: + reaction_smiles (str): SMILES string representing the chemical reaction. 
- pattern = r"\[H\](?![^[]*\])" - return bool(re.search(pattern, smiles)) + Returns: + List[str]: A list containing the identified oxidation pattern + described as 'reactant>>product'. + """ + reactant_fg, _ = find_functional_reactivity(reaction_smiles) + if reactant_fg: + return reactant_fg + return [] @staticmethod - def curate( - reaction_dict: Dict[str, Any], - reaction_column: str = "reactions", - compound_template: Dict[str, Any] = None, - all_templates: Dict = None, + def process_reduct_template( + reaction_smiles: str, + compounds_template: Dict, + reaction_templates: Dict, + neutralize: bool = False, + ) -> Tuple[str, Optional[bool]]: + """ + Processes an oxidation template based on the given SMILES string of the reaction. + + Parameters: + reaction_smiles (str): The SMILES string representing the reaction. + compounds_template (Dict): A dictionary containing compounds templates. + reaction_templates (Dict): A dictionary containing reaction templates. + + Returns: + Tuple[str, Optional[bool]]: A tuple containing the modified SMILES string and + a boolean indicating if the process was stoichiometric + """ + reaction_list = [] + stoichiometry_list = [] + compounds_template = compounds_template["reduction"] + reaction_templates = reaction_templates["reduction"] + try: + cp_temp = CurationReduction.find_reduction_pattern(reaction_smiles)[0] + temps = compounds_template.get(cp_temp, compounds_template["other"]) + + if len(temps) == 0: + return [reaction_smiles], [None] + except IndexError: + # print("No reduction pattern found.") + temps = compounds_template["other"] + # return [reaction_smiles], [None] + reactant, product = reaction_smiles.split(">>") + h_count = count_radical_atoms(reactant, 1) + if h_count % 2 != 0: + return [reaction_smiles], [None] + reactant = [x for x in reactant.split(".") if x != "[H]"] + product = product.split(".") + + for temp in temps: + reactant_copy = reactant.copy() + product_copy = product.copy() + hh_count = h_count // 2 + 
template_type = "neutral" if neutralize else "ion" + for _ in range(hh_count): + reactant_copy.extend( + reaction_templates[temp][template_type]["reactants"] + ) + product_copy.extend(reaction_templates[temp][template_type]["products"]) + stoichiometry_list.append( + reaction_templates[temp][template_type]["stoichiometric"] + ) + updated_reactants = ".".join(reactant_copy) + updated_products = ".".join(product_copy) + curated_reaction = f"{updated_reactants}>>{updated_products}" + # print(curated_reaction) + reaction_list.append(curated_reaction) + return reaction_list, stoichiometry_list + + @staticmethod + def process_dict( + reaction_dict: Dict, + reaction_columns: str = "reactions", + compounds_template: Dict = None, + reaction_templates: Dict = None, return_all: bool = False, - ) -> Dict[str, Any]: + neutralize: bool = False, + ) -> Dict: + """ + Processes a single reaction dictionary and updates it with the results of + oxidation reaction curation. - new_reaction_dict = copy.deepcopy(reaction_dict) - reactions = reaction_dict.get(reaction_column, []) - # print(reactions) - if not reactions: - return reaction_dict # Early return if no reactions are found + Parameters: + reaction_dict (Dict): The dictionary containing the reaction data. + reaction_columns (str): The key where the reaction SMILES string is stored. + Defaults to 'reactions'. + compounds_template (Dict, optional): A dictionary of compounds templates + reaction_templates (Dict, optional): A dictionary of reaction templates + return_all (bool): A flag to determine if all results + or only the first result should be returned. - # Process the first reaction for simplification - curate_reaction = ReductionTemplate.reduction_template( - reactions, compound_template, all_templates, return_all + Returns: + Dict: The updated dictionary with additional fields for curated reaction, + stoichiometry, and radical presence. 
+ """ + reaction = reaction_dict[reaction_columns] + new_reaction, stoichiometry = CurationReduction.process_reduct_template( + reaction, compounds_template, reaction_templates, neutralize ) - new_reaction_dict["curated_reaction"] = curate_reaction - new_reaction_dict["radical"] = CurateReduction.check_for_isolated_hydrogen( - curate_reaction[0] if curate_reaction else "" + reaction_dict["curated_reaction"] = ( + new_reaction if return_all else new_reaction[0] ) + reaction_dict["stoichiometric"] = ( + stoichiometry if return_all else stoichiometry[0] + ) + reaction_dict["radical"] = check_for_isolated_atom(new_reaction[0], "H") + return reaction_dict - return new_reaction_dict - - @classmethod def parallel_curate( - cls, - reaction_list: List[Dict[str, Any]], - reaction_column: str = "reactions", - compound_template: Dict[str, Any] = None, - all_templates: Dict = None, - return_all: bool = False, + self, + data: List[Dict], n_jobs: int = 4, verbose: int = 1, - ) -> List[Dict[str, Any]]: + return_all: bool = False, + neutralize: bool = False, + ) -> List[Dict]: + """ + Curates a list of oxidation reaction dictionaries in parallel. + + Parameters: + data (List[Dict]): A list of dictionaries + n_jobs (int): The number of parallel jobs to run. Defaults to 4. + verbose (int): The verbosity level for parallel processing. Defaults to 1. + return_all (bool): A flag to determine if all results + or only the first result should be returned for each reaction. + Returns: + List[Dict]: A list of the curated dictionaries, each updated + with additional fields for curated reaction, + stoichiometry, and radical presence. 
+ """ results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(cls.curate)( - reaction, reaction_column, compound_template, all_templates, return_all + delayed(CurationReduction.process_dict)( + reaction, + "reactions", + self.compounds_template, + self.reaction_templates, + return_all=return_all, + neutralize=neutralize, ) - for reaction in reaction_list + for reaction in data ) return results diff --git a/synrbl/SynChemImputer/functional_group_checker.py b/synrbl/SynChemImputer/functional_group_checker.py deleted file mode 100644 index 680e0bd..0000000 --- a/synrbl/SynChemImputer/functional_group_checker.py +++ /dev/null @@ -1,219 +0,0 @@ -from rdkit import Chem - - -class FunctionalGroupChecker: - # 1. peroxid group - @staticmethod - def check_peroxide(smiles: str) -> bool: - """ - Check for the presence of a peroxide substructure in a molecule. - """ - peroxide_pattern = Chem.MolFromSmarts("OO") - mol = Chem.MolFromSmiles(smiles) - return ( - mol.HasSubstructMatch(peroxide_pattern) - if mol and not FunctionalGroupChecker.check_peracid(smiles) - else False - ) - - @staticmethod - def check_peracid(smiles: str) -> bool: - """ - Check for the presence of a peracid substructure in a molecule. - """ - peracid_pattern = Chem.MolFromSmarts("C(OO)=O") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(peracid_pattern) if mol else False - - # 2. Alcohol group - @staticmethod - def check_alcohol(smiles: str) -> bool: - """ - Check for the presence of an alcohol functional group in a molecule. - """ - alcohol_pattern = Chem.MolFromSmarts("CO") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(alcohol_pattern) if mol else False - - @staticmethod - def check_enol(smiles: str) -> bool: - """ - Check for the presence of an enol functional group in a molecule. 
- """ - enol_pattern = Chem.MolFromSmarts("C=C(O)") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(enol_pattern) if mol else False - - @staticmethod - def check_phenol(smiles: str) -> bool: - """ - Check for the presence of a phenol functional group in a molecule. - """ - phenol_pattern = Chem.MolFromSmarts("[c]O") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(phenol_pattern) if mol else False - - @staticmethod - def check_vicinal_diol(smiles: str) -> bool: - """ - Check for the presence of a vicinal diol functional group in a molecule. - """ - vicinal_diol_pattern = Chem.MolFromSmarts("OCO") - mol = Chem.MolFromSmiles(smiles) - return ( - mol.HasSubstructMatch(vicinal_diol_pattern) - if mol - and not FunctionalGroupChecker.check_hemiacetal(smiles) - and not FunctionalGroupChecker.check_carbonate(smiles) - and not FunctionalGroupChecker.check_carboxylic_acid(smiles) - and not FunctionalGroupChecker.check_ester(smiles) - else False - ) - - @staticmethod - def check_gem_diol(smiles: str) -> bool: - """ - Check for the presence of a gem diol functional group in a molecule. - """ - gem_diol_pattern = Chem.MolFromSmarts("OCCO") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(gem_diol_pattern) if mol else False - - @staticmethod - def check_ether(smiles: str) -> bool: - """ - Check for the presence of an ether functional group in a molecule. - """ - ether_pattern = Chem.MolFromSmarts("COC") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(ether_pattern) if mol else False - - # 3. Carbonyl group - @staticmethod - def check_aldehyde(smiles: str) -> bool: - """ - Check for the presence of an aldehyde functional group in a molecule. 
- """ - aldehyde_pattern = Chem.MolFromSmarts("[CX3H1](=O)[#6]") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(aldehyde_pattern) if mol else False - - @staticmethod - def check_ketone(smiles: str) -> bool: - """ - Check for the presence of a ketone functional group in a molecule. - """ - ketone_pattern = Chem.MolFromSmarts("[#6][CX3](=O)[#6]") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(ketone_pattern) if mol else False - - @staticmethod - def check_acetal(smiles: str) -> bool: - """ - Check for the presence of an acetal functional group in a molecule. - """ - acetal_pattern = Chem.MolFromSmarts("[CX4][OX2][CX4]") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(acetal_pattern) if mol else False - - @staticmethod - def check_hemiacetal(smiles: str) -> bool: - """ - Check for the presence of a hemiacetal functional group in a molecule. - """ - hemiacetal_pattern = Chem.MolFromSmarts("COCO") - mol = Chem.MolFromSmiles(smiles) - return ( - mol.HasSubstructMatch(hemiacetal_pattern) - if mol - and not FunctionalGroupChecker.check_carbonate(smiles) - and not FunctionalGroupChecker.check_carboxylic_acid(smiles) - and not FunctionalGroupChecker.check_ester(smiles) - else False - ) - - # 4. Carboxylic group - @staticmethod - def check_carboxylic_acid(smiles: str) -> bool: - """ - Check for the presence of a carboxylic acid functional group in a molecule. - """ - carboxylic_acid_pattern = Chem.MolFromSmarts("C(=O)O") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(carboxylic_acid_pattern) if mol else False - - @staticmethod - def check_ester(smiles: str) -> bool: - """ - Check for the presence of an ester functional group in a molecule. 
- """ - ester_pattern = Chem.MolFromSmarts("C(=O)OC") - mol = Chem.MolFromSmiles(smiles) - return ( - mol.HasSubstructMatch(ester_pattern) - if mol and not FunctionalGroupChecker.check_carbonate(smiles) - else False - ) - - @staticmethod - def check_amide(smiles: str) -> bool: - """ - Check for the presence of an amide functional group in a molecule. - """ - amide_pattern = Chem.MolFromSmarts("NC=O") - mol = Chem.MolFromSmiles(smiles) - return ( - mol.HasSubstructMatch(amide_pattern) - if mol and not FunctionalGroupChecker.check_urea(smiles) - else False - ) - - @staticmethod - def check_cyanide(smiles: str) -> bool: - """ - Check for the presence of a cyanide functional group in a molecule. - """ - cyanide_pattern = Chem.MolFromSmarts("[C-]#[N+]") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(cyanide_pattern) if mol else False - - @staticmethod - def check_urea(smiles: str) -> bool: - """ - Check for the presence of a urea functional group in a molecule. - """ - urea_pattern = Chem.MolFromSmarts("NC(=O)N") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(urea_pattern) if mol else False - - @staticmethod - def check_carbonate(smiles: str) -> bool: - """ - Check for the presence of a carbonate functional group in a molecule. - """ - carbonate_pattern = Chem.MolFromSmarts("OC(=O)O") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(carbonate_pattern) if mol else False - - # 5. Amine group - @staticmethod - def check_amine(smiles: str) -> bool: - """ - Check for the presence of an amine functional group in a molecule. - """ - amine_pattern = Chem.MolFromSmarts("CN") - mol = Chem.MolFromSmiles(smiles) - return ( - mol.HasSubstructMatch(amine_pattern) - if mol and not FunctionalGroupChecker.check_amide(smiles) - else False - ) - - @staticmethod - def check_nitro(smiles: str) -> bool: - """ - Check for the presence of a nitro functional group in a molecule. 
- """ - nitro_pattern = Chem.MolFromSmarts("[N+](=O)[O-]") - mol = Chem.MolFromSmiles(smiles) - return mol.HasSubstructMatch(nitro_pattern) if mol else False diff --git a/synrbl/SynChemImputer/post_process.py b/synrbl/SynChemImputer/post_process.py new file mode 100644 index 0000000..c5905e4 --- /dev/null +++ b/synrbl/SynChemImputer/post_process.py @@ -0,0 +1,82 @@ +from rdkit import Chem +from typing import Dict, List +from synrbl.SynChemImputer.curate_oxidation import CurationOxidation +from synrbl.SynChemImputer.curate_reduction import CurationReduction +from joblib import Parallel, delayed + + +class PostProcess: + + def __init__(self, data: List[dict]): + self.data = data + + @staticmethod + def label_reactions( + reaction_dict: Dict, id_column: str = "R-id", reaction_column: str = "reactions" + ) -> Dict: + """ + Labels chemical reactions based on their reactants, indicating whether they + are oxidation or reduction reactions, and canonicalizes the SMILES strings. + + Parameters: + - reaction_list (List[Dict]): A list of dictionaries, each representing a reaction + with keys 'R-id' and 'new_reaction'. + + Returns: + - List[Dict]: A list of dictionaries, each augmented with a 'label', 'reactants', + and 'products' keys, where 'reactants' and 'products' are canonicalized SMILES. 
+ """ + + label = "unspecified" + r_id = reaction_dict.get("R-id", "N/A") + new_reaction = reaction_dict.get(reaction_column, "") + + try: + reactants, products = new_reaction.split(">>", 1) + except ValueError: + reactants, products = "", "" + + labeling_criteria = { + ".[O]": "Oxidation", + ".[H]": "Reduction", + } + + for marker, reaction_label in labeling_criteria.items(): + if marker in reactants: + label = reaction_label + break + + reactants_smiles = Chem.CanonSmiles(reactants) if reactants else "" + products_smiles = Chem.CanonSmiles(products) if products else "" + + new_dict = { + id_column: r_id, + reaction_column: new_reaction, + "label": label, + "reactants": reactants_smiles, + "products": products_smiles, + } + + return new_dict + + def fit(self, n_jobs=4, verbose=1): + label_data = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(PostProcess.label_reactions)(d) for d in self.data + ) + + reduction_data = [ + value for value in label_data if value["label"] == "Reduction" + ] + oxidation_data = [ + value for value in label_data if value["label"] == "Oxidation" + ] + + curate_reduction = CurationReduction() + curate_oxidation = CurationOxidation() + result_reduction = curate_reduction.parallel_curate( + reduction_data, n_jobs=n_jobs, verbose=verbose + ) + result_oxidation = curate_oxidation.parallel_curate( + oxidation_data, n_jobs=n_jobs, verbose=verbose + ) + return result_reduction, result_oxidation diff --git a/synrbl/SynChemImputer/reaction_template.json b/synrbl/SynChemImputer/reaction_template.json new file mode 100644 index 0000000..2e367be --- /dev/null +++ b/synrbl/SynChemImputer/reaction_template.json @@ -0,0 +1,70 @@ +{ + "reduction":{ + "template_1": { + "neutral": { + "reactants": ["[HH]"], + "products": [], + "stoichiometric": [1] + }, + "ion": { + "reactants": ["[HH]"], + "products": [], + "stoichiometric": [1] + } + }, + "template_2": { + "neutral": { + "reactants": ["[BH4-]", "[Na+]", "Cl"], + "products": ["[BH3]", "[Na][Cl]"], + 
"stoichiometric": [1] + }, + "ion": { + "reactants": ["[BH4-]", "[Na+]", "[H+]"], + "products": ["[BH3]", "[Na+]"], + "stoichiometric": [1] + } + }, + "template_3": { + "neutral": { + "reactants": ["[BH3-]C#N", "[Na+]", "Cl"], + "products": ["[BH2]C#N", "[Na][Cl]"], + "stoichiometric": [1] + }, + "ion": { + "reactants": ["[BH3-]C#N", "[Na+]", "[H+]"], + "products": ["[BH2]C#N", "[Na+]"], + "stoichiometric": [1] + } + }, + "template_4": { + "neutral": { + "reactants": ["[AlH4-]", "[Li+]", "Cl"], + "products": ["[AlH3]", "[Li][Cl]"], + "stoichiometric": [1] + }, + "ion": { + "reactants": ["[AlH4-]", "[Li+]", "[H+]"], + "products": ["[AlH3]", "[Li+]"], + "stoichiometric": [1] + } + } + }, + "oxidation": { + "template_1": { + "reactants": ["O=[Cr](Cl)(-[O-])=O", "c1cc[nH+]cc1"], + "products": ["O=[Cr](O)O", "c1cc[nH+]cc1","[Cl-]"], + "stoichiometric": [1] + }, + "template_2": { + "reactants": ["[K][O][Mn](=O)(=O)=O", "OS(=O)(=O)O"], + "products": ["[K][O]S(=O)(=O)[O][K]", "[Mn]1[O]S(=O)(=O)[O]1"], + "stoichiometric": [5,4,6,5,11,2,4] + }, + "template_3": { + "reactants": ["O=[Mn](=O)(=O)O[K]", "O"], + "products": ["O=[Mn]=O", "O[K]"], + "stoichiometric": [5,4,6,5,4,1] + } + +} +} diff --git a/synrbl/SynChemImputer/reduction_template.py b/synrbl/SynChemImputer/reduction_template.py deleted file mode 100644 index 0f8745c..0000000 --- a/synrbl/SynChemImputer/reduction_template.py +++ /dev/null @@ -1,102 +0,0 @@ -from typing import List, Dict, Union -from rdkit import Chem -from fgutils import FGQuery -import rdkit.RDLogger as RDLogger - -RDLogger.DisableLog("rdApp.*") - - -class ReductionTemplate: - @staticmethod - def count_radical_isolated_hydrogens(smiles): - - mol = Chem.MolFromSmiles(smiles) - - # Initialize count for isolated radical hydrogens - hydrogen_count = 0 - - # Iterate over all atoms in the molecule - for atom in mol.GetAtoms(): - # Check if the atom is a hydrogen atom - if atom.GetAtomicNum() == 1: - # Check if the hydrogen atom is isolated (has no 
neighbors) - if len(atom.GetNeighbors()) == 0: - # Check if the hydrogen is a radical (has unpaired electrons) - if atom.GetNumRadicalElectrons() > 0: - hydrogen_count += 1 - - return hydrogen_count - - @staticmethod - def find_reactive_functional_groups(reaction_smiles: str) -> List[str]: - query = FGQuery(use_smiles=True) - reactant, product = reaction_smiles.split(">>") - fg_reactant = query.get(reactant) - fg_product = query.get(product) - fg_reactant = [value[0] for value in fg_reactant] - fg_product = [value[0] for value in fg_product] - return [fg for fg in fg_reactant if fg not in fg_product] - - @staticmethod - def process_template( - reaction_smiles: str, - neutralize: bool = False, - all_templates: Dict = None, - template: str = None, - ) -> str: - if template is None: - selected_template = all_templates[ - 0 - ] # Default to template_1 if none provided - else: - selected_template = all_templates[template] - reactants, products = reaction_smiles.split(">>") - hydrogen_count = ReductionTemplate.count_radical_hydrogens(reactants) - if hydrogen_count % 2 != 0: - return reaction_smiles - hh_count = hydrogen_count // 2 - reactant_list = [x for x in reactants.split(".") if x != "[H]"] - product_list = products.split(".") - template_type = "neutral" if neutralize else "ion" - for _ in range(hh_count): - reactant_list.extend(selected_template[template_type]["reactants"]) - product_list.extend(selected_template[template_type]["products"]) - updated_reactants = ".".join(reactant_list) - updated_products = ".".join(product_list) - return f"{updated_reactants}>>{updated_products}" - - @classmethod - def reduction_template( - cls, - reaction_smiles: str, - compound_template: Dict, - all_templates: Dict = None, - return_all: bool = False, - ) -> Union[str, List[str]]: - try: - fg_reactive = cls.find_reactive_functional_groups(reaction_smiles) - if len(fg_reactive) == 0: - fg_reactive = ["other"] - processed_smiles = [] - for group, templates in 
compound_template.items(): - if group in fg_reactive: - # print(f"Processing {group} with template {templates}") - processed_smiles.extend( - [ - cls.process_template( - reaction_smiles, - neutralize=False, - all_templates=all_templates, - template=tpl, - ) - for tpl in templates - ] - ) - return ( - processed_smiles - if return_all - else (processed_smiles[0] if processed_smiles else None) - ) - except Exception as e: - print(e) - return [reaction_smiles] diff --git a/synrbl/SynChemImputer/reduction_templates.json b/synrbl/SynChemImputer/reduction_templates.json deleted file mode 100644 index 0cf5899..0000000 --- a/synrbl/SynChemImputer/reduction_templates.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "template_1": { - "neutral": { - "reactants": ["[HH]"], - "products": [] - }, - "ion": { - "reactants": ["[HH]"], - "products": [] - } - }, - "template_2": { - "neutral": { - "reactants": ["[BH4-]", "[Na+]", "Cl"], - "products": ["[BH3]", "[Na][Cl]"] - }, - "ion": { - "reactants": ["[BH4-]", "[Na+]", "[H+]"], - "products": ["[BH3]", "[Na+]"] - } - }, - "template_3": { - "neutral": { - "reactants": ["[BH3-]C#N", "[Na+]", "Cl"], - "products": ["[BH2]C#N", "[Na][Cl]"] - }, - "ion": { - "reactants": ["[BH3-]C#N", "[Na+]", "[H+]"], - "products": ["[BH2]C#N", "[Na+]"] - } - }, - "template_4": { - "neutral": { - "reactants": ["[AlH4-]", "[Li+]", "Cl"], - "products": ["[AlH3]", "[Li][Cl]"] - }, - "ion": { - "reactants": ["[AlH4-]", "[Na+]", "[H+]"], - "products": ["[AlH3]", "[Na+]"] - } - } -} diff --git a/synrbl/SynUtils/chem_utils.py b/synrbl/SynUtils/chem_utils.py index 85a24a4..54a527b 100644 --- a/synrbl/SynUtils/chem_utils.py +++ b/synrbl/SynUtils/chem_utils.py @@ -6,10 +6,9 @@ import rdkit.Chem.rdFingerprintGenerator as rdFingerprintGenerator import rdkit.Chem.AllChem as AllChem import rdkit.Chem.rdmolfiles as rdmolfiles - - -from typing import List, Dict -from typing import Union +from collections import Counter +from typing import List, Dict, Tuple, Optional, Union +from 
def check_for_isolated_atom(smiles: str, atom: Optional[str] = "H") -> bool:
    """
    Checks whether a bracketed atom token such as ``[H]`` or ``[O]`` occurs
    in a SMILES string.

    Parameters:
    smiles (str): SMILES (or reaction SMILES) string to search.
    atom (Optional[str]): Text expected between the brackets; hydrogen by
        default. ``None`` is treated like the default.

    Returns:
    bool: True if ``[atom]`` is found and is not followed by a ``]`` before
        the next ``[``.

    NOTE(review): the pattern works on the text only; it does not verify that
    the token is a separate dot-delimited component.
    """
    if atom is None:
        atom = "H"
    # Escape the symbol so that characters like "+" (e.g. "Na+") are matched
    # literally instead of being interpreted as regex operators.
    pattern = rf"\[{re.escape(atom)}\](?![^[]*\])"
    return bool(re.search(pattern, smiles))


def count_radical_atoms(smiles: str, atomic_num: int) -> int:
    """
    Counts isolated radical atoms of one element in a SMILES string.

    Parameters:
    smiles (str): SMILES string to analyse.
    atomic_num (int): Atomic number of the element to count.

    Returns:
    int: Number of atoms with that atomic number that carry at least one
        radical electron and have no neighbours. 0 if the SMILES cannot be
        parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles yields None for unparsable input; there is nothing
        # to count in that case.
        return 0

    return sum(
        1
        for atom in mol.GetAtoms()
        if atom.GetAtomicNum() == atomic_num
        and atom.GetNumRadicalElectrons() > 0
        and atom.GetDegree() == 0  # isolated: no bonded neighbours
    )


def list_difference(
    list1: List[str], list2: List[str]
) -> Tuple[Dict[str, int], Dict[str, int]]:
    """
    Compares two lists as multisets and returns the surplus of each side.

    Parameters:
    list1 (List[str]): First list of items for comparison.
    list2 (List[str]): Second list of items for comparison.

    Returns:
    Tuple[Dict[str, int], Dict[str, int]]: A tuple of two dictionaries:
        - First dictionary: items occurring more often in the first list,
          mapped to how many more times they occur there.
        - Second dictionary: the same for the second list.
    """
    count1 = Counter(list1)
    count2 = Counter(list2)
    # Counter subtraction keeps only strictly positive differences, which is
    # exactly the "occurs more often on this side" relation.
    return dict(count1 - count2), dict(count2 - count1)


def find_functional_reactivity(reaction_smiles: str) -> Tuple[List[str], List[str]]:
    """
    Determines which functional groups differ between reactants and products.

    Parameters:
    reaction_smiles (str): Reaction SMILES of the form "reactants>>products".

    Returns:
    Tuple[List[str], List[str]]: Functional groups that occur more often on
        the reactant side and those that occur more often on the product
        side, respectively.
    """
    query = FGQuery(use_smiles=True)
    reactant, product = reaction_smiles.split(">>")
    # Only the first element of each query hit is compared
    # (presumably the functional-group name; verify against fgutils).
    fg_reactant = [value[0] for value in query.get(reactant)]
    fg_product = [value[0] for value in query.get(product)]
    reactant_fg, product_fg = list_difference(fg_reactant, fg_product)
    return list(reactant_fg.keys()), list(product_fg.keys())