From 9dafd639143ed347f4fe18507cc036808fc2c954 Mon Sep 17 00:00:00 2001
From: Usman Jamshed <usman.jamshedk@gmail.com>
Date: Fri, 13 Oct 2023 23:58:47 -0400
Subject: [PATCH 1/3] Initial investigation completed and handling molecules
 without PCA or UMAP data

---
 .../DB_investigation-checkpoint.ipynb         | 798 ++++++++++++++++++
 DB_investigation.ipynb                        | 798 ++++++++++++++++++
 backend/app/app/api/v2/endpoints/molecule.py  |   9 +-
 duplicates.pkl                                | Bin 0 -> 9364 bytes
 frontend/src/pages/Molecule.jsx               |  26 +-
 5 files changed, 1617 insertions(+), 14 deletions(-)
 create mode 100644 .ipynb_checkpoints/DB_investigation-checkpoint.ipynb
 create mode 100644 DB_investigation.ipynb
 create mode 100644 duplicates.pkl
diff --git a/.ipynb_checkpoints/DB_investigation-checkpoint.ipynb b/.ipynb_checkpoints/DB_investigation-checkpoint.ipynb
new file mode 100644
index 0000000..025dc89
--- /dev/null
+++ b/.ipynb_checkpoints/DB_investigation-checkpoint.ipynb
@@ -0,0 +1,798 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9368bb63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start by checking which molecules are duplicates.\n",
+    "# For example, molecule_ids 241905 and 1497 appear to be the same molecule."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e28921c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import psycopg2\n",
+    "import pandas as pd\n",
+    "from rdkit import Chem"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "214277f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a128c9e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_556/3175014960.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
+      "  df = pd.read_sql_query(query, connection)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the molecule table from the local PostgreSQL instance.\n",
+    "# You must have the DB container running to run this cell successfully.\n",
+    "db_params = {\n",
+    "    'dbname': 'postgres',\n",
+    "    'user': 'postgres',\n",
+    "    'password': '',\n",
+    "    'host': '127.0.0.1',\n",
+    "    'port': '5432'\n",
+    "}\n",
+    "\n",
+    "query = \"SELECT molecule_id, smiles, molecular_weight FROM molecule\"\n",
+    "\n",
+    "# Establish a connection to the PostgreSQL database\n",
+    "connection = psycopg2.connect(**db_params)\n",
+    "try:\n",
+    "    # Read the full result set into a DataFrame\n",
+    "    df = pd.read_sql_query(query, connection)\n",
+    "finally:\n",
+    "    # Close the connection even if the query fails\n",
+    "    connection.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "686a1a35",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule_id</th>\n",
+       "      <th>smiles</th>\n",
+       "      <th>molecular_weight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>331406</td>\n",
+       "      <td>COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC</td>\n",
+       "      <td>398.441986</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>140360</td>\n",
+       "      <td>COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1</td>\n",
+       "      <td>398.532990</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>331409</td>\n",
+       "      <td>C1CCC(CC1)P(c1ccccc1)C1CCCCC1</td>\n",
+       "      <td>274.388000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2027</td>\n",
+       "      <td>CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...</td>\n",
+       "      <td>497.707001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2036</td>\n",
+       "      <td>CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...</td>\n",
+       "      <td>1049.558960</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330962</th>\n",
+       "      <td>608</td>\n",
+       "      <td>Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1</td>\n",
+       "      <td>346.410004</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330963</th>\n",
+       "      <td>461</td>\n",
+       "      <td>CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC</td>\n",
+       "      <td>376.509003</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330964</th>\n",
+       "      <td>1064</td>\n",
+       "      <td>Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...</td>\n",
+       "      <td>462.666992</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330965</th>\n",
+       "      <td>523</td>\n",
+       "      <td>CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...</td>\n",
+       "      <td>490.664001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330966</th>\n",
+       "      <td>1817</td>\n",
+       "      <td>COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC</td>\n",
+       "      <td>442.447998</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>330967 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        molecule_id                                             smiles  \\\n",
+       "0            331406          COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC   \n",
+       "1            140360             COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1   \n",
+       "2            331409                      C1CCC(CC1)P(c1ccccc1)C1CCCCC1   \n",
+       "3              2027  CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...   \n",
+       "4              2036  CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...   \n",
+       "...             ...                                                ...   \n",
+       "330962          608         Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1   \n",
+       "330963          461             CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC   \n",
+       "330964         1064  Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...   \n",
+       "330965          523  CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...   \n",
+       "330966         1817      COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC   \n",
+       "\n",
+       "        molecular_weight  \n",
+       "0             398.441986  \n",
+       "1             398.532990  \n",
+       "2             274.388000  \n",
+       "3             497.707001  \n",
+       "4            1049.558960  \n",
+       "...                  ...  \n",
+       "330962        346.410004  \n",
+       "330963        376.509003  \n",
+       "330964        462.666992  \n",
+       "330965        490.664001  \n",
+       "330966        442.447998  \n",
+       "\n",
+       "[330967 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "149e8d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule_id</th>\n",
+       "      <th>smiles</th>\n",
+       "      <th>molecular_weight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>143223</th>\n",
+       "      <td>241905</td>\n",
+       "      <td>[H]P([H])C</td>\n",
+       "      <td>48.025002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>329868</th>\n",
+       "      <td>1497</td>\n",
+       "      <td>CP</td>\n",
+       "      <td>48.025002</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        molecule_id      smiles  molecular_weight\n",
+       "143223       241905  [H]P([H])C         48.025002\n",
+       "329868         1497          CP         48.025002"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Sanity check to see if the data is correct\n",
+    "df[(df[\"molecule_id\"]==241905) | (df[\"molecule_id\"]==1497)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "9596cb12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# These two molecules are the same, so let's check whether RDKit returns the same SMILES string when canonicalizing them\n",
+    "mol_241905 = df[df[\"molecule_id\"]==241905][\"smiles\"].to_list()[0]\n",
+    "mol_1497 = df[df[\"molecule_id\"]==1497][\"smiles\"].to_list()[0]\n",
+    "\n",
+    "# Double-check with the molecular weight: the two weights should differ by less than a tolerance of 1e-6.\n",
+    "mol_241905_weight = df[df[\"molecule_id\"]==241905][\"molecular_weight\"].to_list()[0]\n",
+    "mol_1497_weight = df[df[\"molecule_id\"]==1497][\"molecular_weight\"].to_list()[0]\n",
+    "\n",
+    "a = Chem.CanonSmiles(mol_241905)\n",
+    "b = Chem.CanonSmiles(mol_1497)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "9c94d293",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CP 48.025001525878906\n",
+      "CP 48.025001525878906\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(a, mol_241905_weight)\n",
+    "print(b, mol_1497_weight)\n",
+    "print(a == b)\n",
+    "print(abs(mol_241905_weight - mol_1497_weight) < 0.000001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "5cca5c81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_data_dict = df.to_dict(orient=\"records\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "7ea5a56b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                        | 0/330967 [00:00<?, ?it/s][23:43:15] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+      "  0%|▏                                                                           | 653/330967 [00:00<01:38, 3339.89it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "c1ccc(cc1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  1%|▌                                                                          | 2331/330967 [00:00<01:20, 4099.64it/s][23:43:15] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+      "  1%|▋                                                                          | 3150/330967 [00:00<01:24, 3897.22it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "c1ccc(cc1)P([C]1234[BH]567[BH]891[BH]1%103[BH]3%112[BH]245[BH]456[CH]678[BH]791[BH]1%10%11[BH]324[BH]5671)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  2%|█▎                                                                         | 5580/330967 [00:01<01:22, 3940.69it/s][23:43:16] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+      "[23:43:16] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+      "  2%|█▍                                                                         | 6369/330967 [00:01<01:23, 3905.70it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "C1CCC(CC1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)C1CCCCC1\n",
+      "c1ccc(cc1)P([C]1234[CH]567[BH]893[BH]3%102[BH]2%111[BH]145[BH]456[BH]678[BH]79%10[BH]83%11[BH]214[BH]5678)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████| 330967/330967 [02:02<00:00, 2711.21it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Map each CANONICAL SMILES string to the first record seen with it, stored as a (molecule_id, smiles, molecular_weight) tuple.\n",
+    "# When a later record canonicalizes to a key that is already present, append the (first_seen, current) pair to `duplicates`.\n",
+    "# The molecular weights are carried along for later inspection; they are not compared in this loop.\n",
+    "hashmap = {}\n",
+    "duplicates = []\n",
+    "for entry in tqdm(all_data_dict):\n",
+    "    record = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
+    "    try:\n",
+    "        # Canonicalize smiles\n",
+    "        canonical_smile = Chem.CanonSmiles(entry[\"smiles\"])\n",
+    "    except Exception:\n",
+    "        # Canonicalization failed for this SMILES; print it so it can be dealt with later\n",
+    "        print(entry[\"smiles\"])\n",
+    "        continue\n",
+    "    if canonical_smile not in hashmap:\n",
+    "        # First occurrence: remember it\n",
+    "        hashmap[canonical_smile] = record\n",
+    "    else:\n",
+    "        # Seen before: pair the first-seen record with this one\n",
+    "        duplicates.append((hashmap[canonical_smile], record))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "361ec9af",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "78"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(duplicates)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "158c8d2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[((1519, 'COP', 64.02400207519531),\n",
+       "  (241851, '[H]P([H])OC', 64.02400207519531)),\n",
+       " ((241739, '[H]P([H])C(C)C', 76.0790023803711),\n",
+       "  (1495, 'CC(C)P', 76.0790023803711)),\n",
+       " ((2045, 'C1CC2CCCC(C1)P2B1Nc2ccccc2c2c1cccc2', 319.1969909667969),\n",
+       "  (1006, 'c1ccc2c(c1)NB(P1C3CCCC1CCC3)c1ccccc1-2', 319.1969909667969)),\n",
+       " ((1521, 'CSP', 80.09200286865234),\n",
+       "  (241909, '[H]P([H])SC', 80.09200286865234)),\n",
+       " ((2063,\n",
+       "   'COC1=CC=C(OC)C(C2=C(C(C)C)C=C(C(C)C)C=C2C(C)C)=C1P3C(C)(C)CC4(OCCO4)CC3(C)C',\n",
+       "   554.7520141601562),\n",
+       "  (1900,\n",
+       "   'COc1ccc(OC)c(P2C(C)(C)CC3(CC2(C)C)OCCO3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+       "   554.7520141601562)),\n",
+       " ((1494, 'CC(C)PC(C)C', 118.16000366210938),\n",
+       "  (41951, '[H]P(C(C)C)C(C)C', 118.16000366210938)),\n",
+       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+       "  (1296, 'PC12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+       " ((820,\n",
+       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@]57)OC8=[C@]7C(C=CC=C9)=C9C=C8',\n",
+       "   598.6380004882812),\n",
+       "  (821,\n",
+       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@@]57)OC8=[C@@]7C(C=CC=C9)=C9C=C8',\n",
+       "   598.6380004882812)),\n",
+       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+       "  (1295, 'C1C2CC3CC1CC(PC14CC5CC(CC(C5)C1)C4)(C2)C3', 302.4419860839844)),\n",
+       " ((213971, '[H]P(F)F', 69.97799682617188), (1293, 'FPF', 69.97799682617188)),\n",
+       " ((150328, '[H]P([Si](C)(C)C)[Si](C)(C)C', 178.36399841308594),\n",
+       "  (1520, 'C[Si](C)(C)P[Si](C)(C)C', 178.36399841308594)),\n",
+       " ((2020,\n",
+       "   'c1ccc(cc1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   454.63800048828125),\n",
+       "  (306,\n",
+       "   'c1ccc(-c2ccccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   454.63800048828125)),\n",
+       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+       "  (252420,\n",
+       "   '[H]P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   302.4419860839844)),\n",
+       " ((1491, 'Pc1ccco1', 100.05699920654297),\n",
+       "  (242228, '[H]P([H])c1ccco1', 100.05699920654297)),\n",
+       " ((2033,\n",
+       "   'CCCCc1c(F)c(F)c(c(c1F)F)c1c(cc(c(c1C(C)C)c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC)C(C)C)C(C)C',\n",
+       "   815.072998046875),\n",
+       "  (369,\n",
+       "   'CCCCc1c(F)c(F)c(-c2c(C(C)C)cc(C(C)C)c(-c3cccc(OC)c3P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)c2C(C)C)c(F)c1F',\n",
+       "   815.072998046875)),\n",
+       " ((1297, 'CC(C)(C)PC(C)(C)C', 146.21400451660156),\n",
+       "  (20221, '[H]P(C(C)(C)C)C(C)(C)C', 146.21400451660156)),\n",
+       " ((2044, 'CC(P(C(C)(C)C)C1C2CC3CC1CC(C2)C3)(C)C', 280.4360046386719),\n",
+       "  (952, 'CC(C)(C)P(C1C2CC3CC(C2)CC1C3)C(C)(C)C', 280.4360046386719)),\n",
+       " ((331415, 'P(CCCCCCCC)(CCCCCCCC)CCCCCCCC', 370.64599609375),\n",
+       "  (239, 'CCCCCCCCP(CCCCCCCC)CCCCCCCC', 370.64599609375)),\n",
+       " ((1493, 'CCP', 62.051998138427734),\n",
+       "  (241815, '[H]P([H])CC', 62.051998138427734)),\n",
+       " ((1518, 'CN(C)PN(C)C', 120.13600158691406),\n",
+       "  (91817, '[H]P(N(C)C)N(C)C', 120.13600158691406)),\n",
+       " ((1294, 'FP', 51.987998962402344),\n",
+       "  (242039, '[H]P([H])F', 51.987998962402344)),\n",
+       " ((2038, 'C1C2CC3CC1CC(C2)(C3)P(c1ccccc1)c1ccccc1', 320.4159851074219),\n",
+       "  (576, 'c1ccc(P(c2ccccc2)C23CC4CC(CC(C4)C2)C3)cc1', 320.4159851074219)),\n",
+       " ((1487, 'Pc1ccc2c(c1)CCCC2', 164.18800354003906),\n",
+       "  (242188, '[H]P([H])c1ccc2c(c1)CCCC2', 164.18800354003906)),\n",
+       " ((2034,\n",
+       "   'O1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   463.64599609375),\n",
+       "  (371,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCOCC2)c1',\n",
+       "   463.64599609375)),\n",
+       " ((331412, 'CC(C=CC=C1C)=C1P(C2=CC=CC=C2)C3=CC=CC=C3', 290.34600830078125),\n",
+       "  (203, 'Cc1cccc(C)c1P(c1ccccc1)c1ccccc1', 290.34600830078125)),\n",
+       " ((2043, 'CC(P(C12CC3CC(C2)CC(C1)C3)C(C)(C)C)(C)C', 280.4360046386719),\n",
+       "  (951, 'CC(C)(C)P(C(C)(C)C)C12CC3CC(CC(C3)C1)C2', 280.4360046386719)),\n",
+       " ((1492, 'CCPCC', 90.10600280761719),\n",
+       "  (85517, '[H]P(CC)CC', 90.10600280761719)),\n",
+       " ((1488, 'PC12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281),\n",
+       "  (242107, '[H]P([H])C12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281)),\n",
+       " ((2064,\n",
+       "   'COC1=CC=C(C2=C(OC)C=CC3=C2C=CC=C3)C(OC)=C1P(C4CCCCC4)C5CCCCC5',\n",
+       "   490.6239929199219),\n",
+       "  (1901,\n",
+       "   'COc1ccc(-c2c(OC)ccc3ccccc23)c(OC)c1P(C1CCCCC1)C1CCCCC1',\n",
+       "   490.6239929199219)),\n",
+       " ((331406, 'COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC', 398.4419860839844),\n",
+       "  (7, 'COc1cccc(OC)c1-c1ccccc1P(c1ccccc1)c1ccccc1', 398.4419860839844)),\n",
+       " ((1485, 'Pc1ccc2ccccc2c1', 160.156005859375),\n",
+       "  (242192, '[H]P([H])c1ccc2ccccc2c1', 160.156005859375)),\n",
+       " ((1496, 'CPC', 62.051998138427734), (137130, '[H]P(C)C', 62.051998138427734)),\n",
+       " ((2052,\n",
+       "   'CC(c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C(C)C)C',\n",
+       "   462.7019958496094),\n",
+       "  (1274,\n",
+       "   'CC(C)c1cccc(C(C)C)c1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   462.7019958496094)),\n",
+       " ((2025,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1cccc2c1cccc2',\n",
+       "   456.6099853515625),\n",
+       "  (348,\n",
+       "   'O=C(c1cccc2ccccc12)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   456.6099853515625)),\n",
+       " ((575, 'Pc1ccccc1', 110.09600067138672),\n",
+       "  (242207, '[H]P([H])c1ccccc1', 110.09600067138672)),\n",
+       " ((2026,\n",
+       "   'CC([Si](C(C)C)(C(C)C)Oc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   550.8839721679688),\n",
+       "  (349,\n",
+       "   'CC(C)[Si](Oc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2)(C(C)C)C(C)C',\n",
+       "   550.8839721679688)),\n",
+       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+       "  (242106, '[H]P([H])C12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+       " ((2048, 'COCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3', 360.5220031738281),\n",
+       "  (1140,\n",
+       "   'COCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   360.5220031738281)),\n",
+       " ((2051, 'O=Cc1ccc(cc1)P(c1ccc(cc1)C=O)C1C2CC3CC1CC(C2)C3', 376.4360046386719),\n",
+       "  (1241,\n",
+       "   'O=Cc1ccc(P(c2ccc(C=O)cc2)C2C3CC4CC(C3)CC2C4)cc1',\n",
+       "   376.4360046386719)),\n",
+       " ((2024,\n",
+       "   'COc1ccc(c(c1c1c(cc(cc1C(C)C)C(C)C)C(C)C)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC',\n",
+       "   640.9329833984375),\n",
+       "  (347,\n",
+       "   'COc1ccc(OC)c(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+       "   640.9329833984375)),\n",
+       " ((2047,\n",
+       "   'COc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   408.5660095214844),\n",
+       "  (1137,\n",
+       "   'COc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   408.5660095214844)),\n",
+       " ((2016,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)Cc1ccccc1',\n",
+       "   392.5669860839844),\n",
+       "  (64,\n",
+       "   'c1ccc(CP(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   392.5669860839844)),\n",
+       " ((2022,\n",
+       "   'c1ccc(cc1)c1nn(c(c1n1nccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1)c1ccccc1',\n",
+       "   662.8619995117188),\n",
+       "  (340,\n",
+       "   'c1ccc(-c2nn(-c3ccccc3)c(-c3ccccc3)c2-n2nccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   662.8619995117188)),\n",
+       " ((2014,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   436.66400146484375),\n",
+       "  (6,\n",
+       "   'C1C2CC3CC1CC(P(C14CC5CC(CC(C5)C1)C4)C14CC5CC(CC(C5)C1)C4)(C2)C3',\n",
+       "   436.66400146484375)),\n",
+       " ((2037,\n",
+       "   'CN(c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   421.6090087890625),\n",
+       "  (527,\n",
+       "   'CN(C)c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   421.6090087890625)),\n",
+       " ((1490, 'Pc1cccs1', 116.125), (242229, '[H]P([H])c1cccs1', 116.125)),\n",
+       " ((331421, 'CCO[Si](OCC)(OCC)CCCP(c1ccccc1)c2ccccc2', 390.5360107421875),\n",
+       "  (728, 'CCO[Si](CCCP(c1ccccc1)c1ccccc1)(OCC)OCC', 390.5360107421875)),\n",
+       " ((1486, 'Pc1cccc2ccccc12', 160.156005859375),\n",
+       "  (242206, '[H]P([H])c1cccc2ccccc12', 160.156005859375)),\n",
+       " ((2021,\n",
+       "   'C1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   461.67401123046875),\n",
+       "  (338,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCCCC2)c1',\n",
+       "   461.67401123046875)),\n",
+       " ((2015,\n",
+       "   'CCCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   358.54998779296875),\n",
+       "  (10, 'CCCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2', 358.54998779296875)),\n",
+       " ((2065,\n",
+       "   'C12=CC=CC=C1C(C3=C(C4=C(P(C5CCCCC5)C6CCCCC6)C=CC=C4)C(C=CC=C7)=C7C=C3)=C8C(C=CC=C8)=C2',\n",
+       "   576.7639770507812),\n",
+       "  (1902,\n",
+       "   'c1ccc(P(C2CCCCC2)C2CCCCC2)c(-c2c(-c3c4ccccc4cc4ccccc34)ccc3ccccc23)c1',\n",
+       "   576.7639770507812)),\n",
+       " ((2027,\n",
+       "   'CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   497.7070007324219),\n",
+       "  (350,\n",
+       "   'CN(C)c1ccccc1-c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   497.7070007324219)),\n",
+       " ((331409, 'C1CCC(CC1)P(c1ccccc1)C1CCCCC1', 274.38800048828125),\n",
+       "  (68, 'c1ccc(P(C2CCCCC2)C2CCCCC2)cc1', 274.38800048828125)),\n",
+       " ((2017,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C(=O)Nc1ccccc1)Nc1ccccc1',\n",
+       "   406.46600341796875),\n",
+       "  (259,\n",
+       "   'O=C(Nc1ccccc1)P(C(=O)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2',\n",
+       "   406.46600341796875)),\n",
+       " ((2023,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+       "   406.54998779296875),\n",
+       "  (346,\n",
+       "   'O=C(c1ccccc1)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   406.54998779296875)),\n",
+       " ((574, 'c1ccc(Pc2ccccc2)cc1', 186.19400024414062),\n",
+       "  (310083, '[H]P(c1ccccc1)c1ccccc1', 186.19400024414062)),\n",
+       " ((42, 'CC(C)(C)P(c1ccccc1-c1ccccc1)C(C)(C)C', 298.4100036621094),\n",
+       "  (331407, 'CC(P(C(C)(C)C)c1ccccc1c1ccccc1)(C)C', 298.4100036621094)),\n",
+       " ((103,\n",
+       "   'COc1ccc(C)c(-c2c(C(C)C)cc(C(C)C)cc2C(C)C)c1P(C(C)(C)C)C(C)(C)C',\n",
+       "   468.70599365234375),\n",
+       "  (331410,\n",
+       "   'COc1ccc(c(c1P(C(C)(C)C)C(C)(C)C)c1c(cc(cc1C(C)C)C(C)C)C(C)C)C',\n",
+       "   468.70599365234375)),\n",
+       " ((116, 'CN(C)c1ccc(P(c2ccccc2)c2ccccc2)cc1', 305.3609924316406),\n",
+       "  (331411, 'CN(c1ccc(cc1)P(c1ccccc1)c1ccccc1)C', 305.3609924316406)),\n",
+       " ((246,\n",
+       "   'COc1c(C)cc(P(c2cc(C)c(OC)c(C)c2)c2cc(C)c(OC)c(C)c2)cc1C',\n",
+       "   436.5320129394531),\n",
+       "  (331416,\n",
+       "   'P(c1cc(c(c(c1)C)OC)C)(c1cc(c(c(c1)C)OC)C)c1cc(c(c(c1)C)OC)C',\n",
+       "   436.5320129394531)),\n",
+       " ((234,\n",
+       "   'Fc1c(F)c(F)c(P(c2ccccc2)c2c(F)c(F)c(F)c(F)c2F)c(F)c1F',\n",
+       "   442.1919860839844),\n",
+       "  (331414,\n",
+       "   'Fc1c(c(c(c(c1F)F)P(c1c(c(c(c(c1F)F)F)F)F)c1ccccc1)F)F',\n",
+       "   442.1919860839844)),\n",
+       " ((356, 'C/C=C/CP(C(C)(C)C)C(C)(C)C', 200.30599975585938),\n",
+       "  (331419, 'P(C(C)(C)C)(C(C)(C)C)C/C=C/C', 200.30599975585938)),\n",
+       " ((487,\n",
+       "   'FC(F)(F)C(F)(F)C(F)(F)C(Cc1ccc(P(c2ccccc2)c2ccc(CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)cc2)cc1)(C(F)(F)F)C(F)(F)F',\n",
+       "   926.4099731445312),\n",
+       "  (331420,\n",
+       "   'FC(F)(F)C(F)(F)C(F)(F)C(C(F)(F)F)(C(F)(F)F)Cc1ccc(cc1)P(c3ccc(cc3)CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)c2ccccc2',\n",
+       "   926.4099731445312)),\n",
+       " ((216, 'CN(C)c1ccc(P(C(C)(C)C)C(C)(C)C)cc1', 265.3810119628906),\n",
+       "  (331413, 'CC(P(C1=CC=C(N(C)C)C=C1)C(C)(C)C)(C)C', 265.3810119628906)),\n",
+       " ((298, 'c1ccc(P2Cc3cccc4c3C3(CC4)CCc4cccc(c43)C2)cc1', 354.4330139160156),\n",
+       "  (331417, 'P1(Cc2c3c(ccc2)CCC23CCc3c2c(ccc3)C1)c1ccccc1', 354.4330139160156)),\n",
+       " ((320, 'COc1cc(OC)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(OC)c1', 440.5639953613281),\n",
+       "  (331418,\n",
+       "   'P(C1CCCCC1)(C1CCCCC1)c1c(cccc1)c1c(cc(cc1OC)OC)OC',\n",
+       "   440.5639953613281)),\n",
+       " ((3, 'COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1', 410.5379943847656),\n",
+       "  (331405, 'COc1cccc(c1c1ccccc1P(C1CCCCC1)C1CCCCC1)OC', 410.5379943847656)),\n",
+       " ((60,\n",
+       "   'Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   388.5350036621094),\n",
+       "  (331408,\n",
+       "   'Cc1cc(C)cc(c1P(c1c(C)cc(cc1C)C)c1c(C)cc(cc1C)C)C',\n",
+       "   388.5350036621094)),\n",
+       " ((241905, '[H]P([H])C', 48.025001525878906),\n",
+       "  (1497, 'CP', 48.025001525878906)),\n",
+       " ((771,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   378.5400085449219),\n",
+       "  (2040,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+       "   378.5400085449219)),\n",
+       " ((779,\n",
+       "   'c1ccc(-n2cccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   443.614990234375),\n",
+       "  (2041,\n",
+       "   'c1ccc(cc1)n1cccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   443.614990234375)),\n",
+       " ((729, 'CCP(CC)c1ccccc1-n1c2ccccc2c2ccccc21', 331.39898681640625),\n",
+       "  (331422, 'CCP(CC)c1ccccc1n2c3ccccc3c4c2cccc4', 331.39898681640625)),\n",
+       " ((417,\n",
+       "   'CC(C)(C)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   358.54998779296875),\n",
+       "  (2035,\n",
+       "   'CC(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)(C)C',\n",
+       "   358.54998779296875)),\n",
+       " ((783,\n",
+       "   'c1ccc(-n2c(P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)cc3ccccc32)cc1',\n",
+       "   493.67498779296875),\n",
+       "  (2042,\n",
+       "   'c1ccc(cc1)n1c2ccccc2cc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   493.67498779296875)),\n",
+       " ((1201,\n",
+       "   'Cc1cc(C)c(-c2cccc3c2CP(C2CCCCC2)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   530.7360229492188),\n",
+       "  (2049,\n",
+       "   'Cc1cc(C)cc(c1c1cccc2c1CP(Cc1c2cccc1c1c(C)cc(cc1C)C)C1CCCCC1)C',\n",
+       "   530.7360229492188)),\n",
+       " ((1202,\n",
+       "   'Cc1cc(C)c(-c2cccc3c2CP(C(C)C)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   490.6709899902344),\n",
+       "  (2050,\n",
+       "   'CC(P1Cc2c(c3c(C1)c(ccc3)c1c(C)cc(cc1C)C)cccc2c1c(C)cc(cc1C)C)C',\n",
+       "   490.6709899902344)),\n",
+       " ((241701, '[H]P([H])C(C)(C)C', 90.10600280761719),\n",
+       "  (1298, 'CC(C)(C)P', 90.10600280761719)),\n",
+       " ((242088, '[H]P([H])[H]', 33.99800109863281), (1299, 'P', 33.99800109863281))]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "duplicates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "1f1bd039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the object\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "3c5ac97b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('duplicates.pkl', 'wb') as file:\n",
+    "    pickle.dump(duplicates, file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a187e06d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/DB_investigation.ipynb b/DB_investigation.ipynb
new file mode 100644
index 0000000..025dc89
--- /dev/null
+++ b/DB_investigation.ipynb
@@ -0,0 +1,798 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9368bb63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Want to start by checking which molecules are duplicates.\n",
+    "# For example, we have 241905 and 1497"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e28921c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import psycopg2\n",
+    "import pandas as pd\n",
+    "from rdkit import Chem"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "214277f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a128c9e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_556/3175014960.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
+      "  df = pd.read_sql_query(query, connection)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Establish a connection\n",
+    "# You must have the DB container running to run this cell successfully.\n",
+    "# Connection parameters\n",
+    "db_params = {\n",
+    "    'dbname': 'postgres',\n",
+    "    'user': 'postgres',\n",
+    "    'password': '',\n",
+    "    'host': '127.0.0.1',\n",
+    "    'port': '5432'\n",
+    "}\n",
+    "\n",
+    "# Establish a connection to the PostgreSQL database\n",
+    "connection = psycopg2.connect(**db_params)\n",
+    "\n",
+    "# Execute an SQL statement\n",
+    "query = \"SELECT molecule_id, smiles, molecular_weight FROM molecule\"\n",
+    "df = pd.read_sql_query(query, connection)\n",
+    "\n",
+    "# Close the connection\n",
+    "connection.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "686a1a35",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule_id</th>\n",
+       "      <th>smiles</th>\n",
+       "      <th>molecular_weight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>331406</td>\n",
+       "      <td>COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC</td>\n",
+       "      <td>398.441986</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>140360</td>\n",
+       "      <td>COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1</td>\n",
+       "      <td>398.532990</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>331409</td>\n",
+       "      <td>C1CCC(CC1)P(c1ccccc1)C1CCCCC1</td>\n",
+       "      <td>274.388000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2027</td>\n",
+       "      <td>CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...</td>\n",
+       "      <td>497.707001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2036</td>\n",
+       "      <td>CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...</td>\n",
+       "      <td>1049.558960</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330962</th>\n",
+       "      <td>608</td>\n",
+       "      <td>Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1</td>\n",
+       "      <td>346.410004</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330963</th>\n",
+       "      <td>461</td>\n",
+       "      <td>CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC</td>\n",
+       "      <td>376.509003</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330964</th>\n",
+       "      <td>1064</td>\n",
+       "      <td>Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...</td>\n",
+       "      <td>462.666992</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330965</th>\n",
+       "      <td>523</td>\n",
+       "      <td>CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...</td>\n",
+       "      <td>490.664001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330966</th>\n",
+       "      <td>1817</td>\n",
+       "      <td>COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC</td>\n",
+       "      <td>442.447998</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>330967 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        molecule_id                                             smiles  \\\n",
+       "0            331406          COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC   \n",
+       "1            140360             COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1   \n",
+       "2            331409                      C1CCC(CC1)P(c1ccccc1)C1CCCCC1   \n",
+       "3              2027  CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...   \n",
+       "4              2036  CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...   \n",
+       "...             ...                                                ...   \n",
+       "330962          608         Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1   \n",
+       "330963          461             CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC   \n",
+       "330964         1064  Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...   \n",
+       "330965          523  CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...   \n",
+       "330966         1817      COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC   \n",
+       "\n",
+       "        molecular_weight  \n",
+       "0             398.441986  \n",
+       "1             398.532990  \n",
+       "2             274.388000  \n",
+       "3             497.707001  \n",
+       "4            1049.558960  \n",
+       "...                  ...  \n",
+       "330962        346.410004  \n",
+       "330963        376.509003  \n",
+       "330964        462.666992  \n",
+       "330965        490.664001  \n",
+       "330966        442.447998  \n",
+       "\n",
+       "[330967 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "149e8d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule_id</th>\n",
+       "      <th>smiles</th>\n",
+       "      <th>molecular_weight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>143223</th>\n",
+       "      <td>241905</td>\n",
+       "      <td>[H]P([H])C</td>\n",
+       "      <td>48.025002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>329868</th>\n",
+       "      <td>1497</td>\n",
+       "      <td>CP</td>\n",
+       "      <td>48.025002</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        molecule_id      smiles  molecular_weight\n",
+       "143223       241905  [H]P([H])C         48.025002\n",
+       "329868         1497          CP         48.025002"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Sanity check to see if the data is correct\n",
+    "df[(df[\"molecule_id\"]==241905) | (df[\"molecule_id\"]==1497)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "9596cb12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# These two molecules are the same, so let's check whether RDKit returns the same SMILES string when canonicalizing them\n",
+    "mol_241905 = df[df[\"molecule_id\"]==241905][\"smiles\"].to_list()[0]\n",
+    "mol_1497 = df[df[\"molecule_id\"]==1497][\"smiles\"].to_list()[0]\n",
+    "\n",
+    "# Double-check with the molecular weight: the two weights should differ by less than a tolerance of 1e-6.\n",
+    "mol_241905_weight = df[df[\"molecule_id\"]==241905][\"molecular_weight\"].to_list()[0]\n",
+    "mol_1497_weight = df[df[\"molecule_id\"]==1497][\"molecular_weight\"].to_list()[0]\n",
+    "\n",
+    "a = Chem.CanonSmiles(mol_241905)\n",
+    "b = Chem.CanonSmiles(mol_1497)  # canonicalize the second molecule, not the first one again"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "9c94d293",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CP 48.025001525878906\n",
+      "CP 48.025001525878906\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(a, mol_241905_weight)\n",
+    "print(b, mol_1497_weight)\n",
+    "print(a == b)\n",
+    "print(abs(mol_241905_weight - mol_1497_weight) < 0.000001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "5cca5c81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_data_dict = df.to_dict(orient=\"records\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "7ea5a56b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                        | 0/330967 [00:00<?, ?it/s][23:43:15] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+      "  0%|▏                                                                           | 653/330967 [00:00<01:38, 3339.89it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "c1ccc(cc1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  1%|▌                                                                          | 2331/330967 [00:00<01:20, 4099.64it/s][23:43:15] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+      "  1%|▋                                                                          | 3150/330967 [00:00<01:24, 3897.22it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "c1ccc(cc1)P([C]1234[BH]567[BH]891[BH]1%103[BH]3%112[BH]245[BH]456[CH]678[BH]791[BH]1%10%11[BH]324[BH]5671)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  2%|█▎                                                                         | 5580/330967 [00:01<01:22, 3940.69it/s][23:43:16] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+      "[23:43:16] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+      "  2%|█▍                                                                         | 6369/330967 [00:01<01:23, 3905.70it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "C1CCC(CC1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)C1CCCCC1\n",
+      "c1ccc(cc1)P([C]1234[CH]567[BH]893[BH]3%102[BH]2%111[BH]145[BH]456[BH]678[BH]79%10[BH]83%11[BH]214[BH]5678)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████| 330967/330967 [02:02<00:00, 2711.21it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Store each record in a hashmap keyed by its CANONICAL SMILES; the value is a tuple of (molecule_id, original smiles, molecular_weight).\n",
+    "# If the CANONICAL SMILES is already in the hashmap, append the pair (first-seen tuple, current tuple) to duplicates.\n",
+    "# The weights are not compared here; they are kept in the tuples so each pair can be double-checked afterwards.\n",
+    "hashmap = {}\n",
+    "duplicates = []\n",
+    "for entry in tqdm(all_data_dict):\n",
+    "    try:\n",
+    "        # Canonicalize smiles\n",
+    "        canonical_smile = Chem.CanonSmiles(entry[\"smiles\"])\n",
+    "        # Check if it's already in the hashmap\n",
+    "        if canonical_smile not in hashmap:\n",
+    "            # First occurrence: remember it\n",
+    "            hashmap[canonical_smile] = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
+    "        else:\n",
+    "            a = hashmap[canonical_smile] # Get matched molecule tuple data\n",
+    "            b = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
+    "            duplicates.append((a, b))\n",
+    "    except Exception:  # not a bare except, so KeyboardInterrupt can still stop this long loop\n",
+    "        # Print the SMILES that caused an error so they can be dealt with later\n",
+    "        print(entry[\"smiles\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "361ec9af",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "78"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(duplicates)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "158c8d2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[((1519, 'COP', 64.02400207519531),\n",
+       "  (241851, '[H]P([H])OC', 64.02400207519531)),\n",
+       " ((241739, '[H]P([H])C(C)C', 76.0790023803711),\n",
+       "  (1495, 'CC(C)P', 76.0790023803711)),\n",
+       " ((2045, 'C1CC2CCCC(C1)P2B1Nc2ccccc2c2c1cccc2', 319.1969909667969),\n",
+       "  (1006, 'c1ccc2c(c1)NB(P1C3CCCC1CCC3)c1ccccc1-2', 319.1969909667969)),\n",
+       " ((1521, 'CSP', 80.09200286865234),\n",
+       "  (241909, '[H]P([H])SC', 80.09200286865234)),\n",
+       " ((2063,\n",
+       "   'COC1=CC=C(OC)C(C2=C(C(C)C)C=C(C(C)C)C=C2C(C)C)=C1P3C(C)(C)CC4(OCCO4)CC3(C)C',\n",
+       "   554.7520141601562),\n",
+       "  (1900,\n",
+       "   'COc1ccc(OC)c(P2C(C)(C)CC3(CC2(C)C)OCCO3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+       "   554.7520141601562)),\n",
+       " ((1494, 'CC(C)PC(C)C', 118.16000366210938),\n",
+       "  (41951, '[H]P(C(C)C)C(C)C', 118.16000366210938)),\n",
+       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+       "  (1296, 'PC12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+       " ((820,\n",
+       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@]57)OC8=[C@]7C(C=CC=C9)=C9C=C8',\n",
+       "   598.6380004882812),\n",
+       "  (821,\n",
+       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@@]57)OC8=[C@@]7C(C=CC=C9)=C9C=C8',\n",
+       "   598.6380004882812)),\n",
+       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+       "  (1295, 'C1C2CC3CC1CC(PC14CC5CC(CC(C5)C1)C4)(C2)C3', 302.4419860839844)),\n",
+       " ((213971, '[H]P(F)F', 69.97799682617188), (1293, 'FPF', 69.97799682617188)),\n",
+       " ((150328, '[H]P([Si](C)(C)C)[Si](C)(C)C', 178.36399841308594),\n",
+       "  (1520, 'C[Si](C)(C)P[Si](C)(C)C', 178.36399841308594)),\n",
+       " ((2020,\n",
+       "   'c1ccc(cc1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   454.63800048828125),\n",
+       "  (306,\n",
+       "   'c1ccc(-c2ccccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   454.63800048828125)),\n",
+       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+       "  (252420,\n",
+       "   '[H]P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   302.4419860839844)),\n",
+       " ((1491, 'Pc1ccco1', 100.05699920654297),\n",
+       "  (242228, '[H]P([H])c1ccco1', 100.05699920654297)),\n",
+       " ((2033,\n",
+       "   'CCCCc1c(F)c(F)c(c(c1F)F)c1c(cc(c(c1C(C)C)c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC)C(C)C)C(C)C',\n",
+       "   815.072998046875),\n",
+       "  (369,\n",
+       "   'CCCCc1c(F)c(F)c(-c2c(C(C)C)cc(C(C)C)c(-c3cccc(OC)c3P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)c2C(C)C)c(F)c1F',\n",
+       "   815.072998046875)),\n",
+       " ((1297, 'CC(C)(C)PC(C)(C)C', 146.21400451660156),\n",
+       "  (20221, '[H]P(C(C)(C)C)C(C)(C)C', 146.21400451660156)),\n",
+       " ((2044, 'CC(P(C(C)(C)C)C1C2CC3CC1CC(C2)C3)(C)C', 280.4360046386719),\n",
+       "  (952, 'CC(C)(C)P(C1C2CC3CC(C2)CC1C3)C(C)(C)C', 280.4360046386719)),\n",
+       " ((331415, 'P(CCCCCCCC)(CCCCCCCC)CCCCCCCC', 370.64599609375),\n",
+       "  (239, 'CCCCCCCCP(CCCCCCCC)CCCCCCCC', 370.64599609375)),\n",
+       " ((1493, 'CCP', 62.051998138427734),\n",
+       "  (241815, '[H]P([H])CC', 62.051998138427734)),\n",
+       " ((1518, 'CN(C)PN(C)C', 120.13600158691406),\n",
+       "  (91817, '[H]P(N(C)C)N(C)C', 120.13600158691406)),\n",
+       " ((1294, 'FP', 51.987998962402344),\n",
+       "  (242039, '[H]P([H])F', 51.987998962402344)),\n",
+       " ((2038, 'C1C2CC3CC1CC(C2)(C3)P(c1ccccc1)c1ccccc1', 320.4159851074219),\n",
+       "  (576, 'c1ccc(P(c2ccccc2)C23CC4CC(CC(C4)C2)C3)cc1', 320.4159851074219)),\n",
+       " ((1487, 'Pc1ccc2c(c1)CCCC2', 164.18800354003906),\n",
+       "  (242188, '[H]P([H])c1ccc2c(c1)CCCC2', 164.18800354003906)),\n",
+       " ((2034,\n",
+       "   'O1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   463.64599609375),\n",
+       "  (371,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCOCC2)c1',\n",
+       "   463.64599609375)),\n",
+       " ((331412, 'CC(C=CC=C1C)=C1P(C2=CC=CC=C2)C3=CC=CC=C3', 290.34600830078125),\n",
+       "  (203, 'Cc1cccc(C)c1P(c1ccccc1)c1ccccc1', 290.34600830078125)),\n",
+       " ((2043, 'CC(P(C12CC3CC(C2)CC(C1)C3)C(C)(C)C)(C)C', 280.4360046386719),\n",
+       "  (951, 'CC(C)(C)P(C(C)(C)C)C12CC3CC(CC(C3)C1)C2', 280.4360046386719)),\n",
+       " ((1492, 'CCPCC', 90.10600280761719),\n",
+       "  (85517, '[H]P(CC)CC', 90.10600280761719)),\n",
+       " ((1488, 'PC12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281),\n",
+       "  (242107, '[H]P([H])C12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281)),\n",
+       " ((2064,\n",
+       "   'COC1=CC=C(C2=C(OC)C=CC3=C2C=CC=C3)C(OC)=C1P(C4CCCCC4)C5CCCCC5',\n",
+       "   490.6239929199219),\n",
+       "  (1901,\n",
+       "   'COc1ccc(-c2c(OC)ccc3ccccc23)c(OC)c1P(C1CCCCC1)C1CCCCC1',\n",
+       "   490.6239929199219)),\n",
+       " ((331406, 'COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC', 398.4419860839844),\n",
+       "  (7, 'COc1cccc(OC)c1-c1ccccc1P(c1ccccc1)c1ccccc1', 398.4419860839844)),\n",
+       " ((1485, 'Pc1ccc2ccccc2c1', 160.156005859375),\n",
+       "  (242192, '[H]P([H])c1ccc2ccccc2c1', 160.156005859375)),\n",
+       " ((1496, 'CPC', 62.051998138427734), (137130, '[H]P(C)C', 62.051998138427734)),\n",
+       " ((2052,\n",
+       "   'CC(c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C(C)C)C',\n",
+       "   462.7019958496094),\n",
+       "  (1274,\n",
+       "   'CC(C)c1cccc(C(C)C)c1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   462.7019958496094)),\n",
+       " ((2025,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1cccc2c1cccc2',\n",
+       "   456.6099853515625),\n",
+       "  (348,\n",
+       "   'O=C(c1cccc2ccccc12)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   456.6099853515625)),\n",
+       " ((575, 'Pc1ccccc1', 110.09600067138672),\n",
+       "  (242207, '[H]P([H])c1ccccc1', 110.09600067138672)),\n",
+       " ((2026,\n",
+       "   'CC([Si](C(C)C)(C(C)C)Oc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   550.8839721679688),\n",
+       "  (349,\n",
+       "   'CC(C)[Si](Oc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2)(C(C)C)C(C)C',\n",
+       "   550.8839721679688)),\n",
+       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+       "  (242106, '[H]P([H])C12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+       " ((2048, 'COCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3', 360.5220031738281),\n",
+       "  (1140,\n",
+       "   'COCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   360.5220031738281)),\n",
+       " ((2051, 'O=Cc1ccc(cc1)P(c1ccc(cc1)C=O)C1C2CC3CC1CC(C2)C3', 376.4360046386719),\n",
+       "  (1241,\n",
+       "   'O=Cc1ccc(P(c2ccc(C=O)cc2)C2C3CC4CC(C3)CC2C4)cc1',\n",
+       "   376.4360046386719)),\n",
+       " ((2024,\n",
+       "   'COc1ccc(c(c1c1c(cc(cc1C(C)C)C(C)C)C(C)C)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC',\n",
+       "   640.9329833984375),\n",
+       "  (347,\n",
+       "   'COc1ccc(OC)c(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+       "   640.9329833984375)),\n",
+       " ((2047,\n",
+       "   'COc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   408.5660095214844),\n",
+       "  (1137,\n",
+       "   'COc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   408.5660095214844)),\n",
+       " ((2016,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)Cc1ccccc1',\n",
+       "   392.5669860839844),\n",
+       "  (64,\n",
+       "   'c1ccc(CP(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   392.5669860839844)),\n",
+       " ((2022,\n",
+       "   'c1ccc(cc1)c1nn(c(c1n1nccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1)c1ccccc1',\n",
+       "   662.8619995117188),\n",
+       "  (340,\n",
+       "   'c1ccc(-c2nn(-c3ccccc3)c(-c3ccccc3)c2-n2nccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   662.8619995117188)),\n",
+       " ((2014,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   436.66400146484375),\n",
+       "  (6,\n",
+       "   'C1C2CC3CC1CC(P(C14CC5CC(CC(C5)C1)C4)C14CC5CC(CC(C5)C1)C4)(C2)C3',\n",
+       "   436.66400146484375)),\n",
+       " ((2037,\n",
+       "   'CN(c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   421.6090087890625),\n",
+       "  (527,\n",
+       "   'CN(C)c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   421.6090087890625)),\n",
+       " ((1490, 'Pc1cccs1', 116.125), (242229, '[H]P([H])c1cccs1', 116.125)),\n",
+       " ((331421, 'CCO[Si](OCC)(OCC)CCCP(c1ccccc1)c2ccccc2', 390.5360107421875),\n",
+       "  (728, 'CCO[Si](CCCP(c1ccccc1)c1ccccc1)(OCC)OCC', 390.5360107421875)),\n",
+       " ((1486, 'Pc1cccc2ccccc12', 160.156005859375),\n",
+       "  (242206, '[H]P([H])c1cccc2ccccc12', 160.156005859375)),\n",
+       " ((2021,\n",
+       "   'C1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   461.67401123046875),\n",
+       "  (338,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCCCC2)c1',\n",
+       "   461.67401123046875)),\n",
+       " ((2015,\n",
+       "   'CCCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   358.54998779296875),\n",
+       "  (10, 'CCCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2', 358.54998779296875)),\n",
+       " ((2065,\n",
+       "   'C12=CC=CC=C1C(C3=C(C4=C(P(C5CCCCC5)C6CCCCC6)C=CC=C4)C(C=CC=C7)=C7C=C3)=C8C(C=CC=C8)=C2',\n",
+       "   576.7639770507812),\n",
+       "  (1902,\n",
+       "   'c1ccc(P(C2CCCCC2)C2CCCCC2)c(-c2c(-c3c4ccccc4cc4ccccc34)ccc3ccccc23)c1',\n",
+       "   576.7639770507812)),\n",
+       " ((2027,\n",
+       "   'CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   497.7070007324219),\n",
+       "  (350,\n",
+       "   'CN(C)c1ccccc1-c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   497.7070007324219)),\n",
+       " ((331409, 'C1CCC(CC1)P(c1ccccc1)C1CCCCC1', 274.38800048828125),\n",
+       "  (68, 'c1ccc(P(C2CCCCC2)C2CCCCC2)cc1', 274.38800048828125)),\n",
+       " ((2017,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C(=O)Nc1ccccc1)Nc1ccccc1',\n",
+       "   406.46600341796875),\n",
+       "  (259,\n",
+       "   'O=C(Nc1ccccc1)P(C(=O)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2',\n",
+       "   406.46600341796875)),\n",
+       " ((2023,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+       "   406.54998779296875),\n",
+       "  (346,\n",
+       "   'O=C(c1ccccc1)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   406.54998779296875)),\n",
+       " ((574, 'c1ccc(Pc2ccccc2)cc1', 186.19400024414062),\n",
+       "  (310083, '[H]P(c1ccccc1)c1ccccc1', 186.19400024414062)),\n",
+       " ((42, 'CC(C)(C)P(c1ccccc1-c1ccccc1)C(C)(C)C', 298.4100036621094),\n",
+       "  (331407, 'CC(P(C(C)(C)C)c1ccccc1c1ccccc1)(C)C', 298.4100036621094)),\n",
+       " ((103,\n",
+       "   'COc1ccc(C)c(-c2c(C(C)C)cc(C(C)C)cc2C(C)C)c1P(C(C)(C)C)C(C)(C)C',\n",
+       "   468.70599365234375),\n",
+       "  (331410,\n",
+       "   'COc1ccc(c(c1P(C(C)(C)C)C(C)(C)C)c1c(cc(cc1C(C)C)C(C)C)C(C)C)C',\n",
+       "   468.70599365234375)),\n",
+       " ((116, 'CN(C)c1ccc(P(c2ccccc2)c2ccccc2)cc1', 305.3609924316406),\n",
+       "  (331411, 'CN(c1ccc(cc1)P(c1ccccc1)c1ccccc1)C', 305.3609924316406)),\n",
+       " ((246,\n",
+       "   'COc1c(C)cc(P(c2cc(C)c(OC)c(C)c2)c2cc(C)c(OC)c(C)c2)cc1C',\n",
+       "   436.5320129394531),\n",
+       "  (331416,\n",
+       "   'P(c1cc(c(c(c1)C)OC)C)(c1cc(c(c(c1)C)OC)C)c1cc(c(c(c1)C)OC)C',\n",
+       "   436.5320129394531)),\n",
+       " ((234,\n",
+       "   'Fc1c(F)c(F)c(P(c2ccccc2)c2c(F)c(F)c(F)c(F)c2F)c(F)c1F',\n",
+       "   442.1919860839844),\n",
+       "  (331414,\n",
+       "   'Fc1c(c(c(c(c1F)F)P(c1c(c(c(c(c1F)F)F)F)F)c1ccccc1)F)F',\n",
+       "   442.1919860839844)),\n",
+       " ((356, 'C/C=C/CP(C(C)(C)C)C(C)(C)C', 200.30599975585938),\n",
+       "  (331419, 'P(C(C)(C)C)(C(C)(C)C)C/C=C/C', 200.30599975585938)),\n",
+       " ((487,\n",
+       "   'FC(F)(F)C(F)(F)C(F)(F)C(Cc1ccc(P(c2ccccc2)c2ccc(CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)cc2)cc1)(C(F)(F)F)C(F)(F)F',\n",
+       "   926.4099731445312),\n",
+       "  (331420,\n",
+       "   'FC(F)(F)C(F)(F)C(F)(F)C(C(F)(F)F)(C(F)(F)F)Cc1ccc(cc1)P(c3ccc(cc3)CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)c2ccccc2',\n",
+       "   926.4099731445312)),\n",
+       " ((216, 'CN(C)c1ccc(P(C(C)(C)C)C(C)(C)C)cc1', 265.3810119628906),\n",
+       "  (331413, 'CC(P(C1=CC=C(N(C)C)C=C1)C(C)(C)C)(C)C', 265.3810119628906)),\n",
+       " ((298, 'c1ccc(P2Cc3cccc4c3C3(CC4)CCc4cccc(c43)C2)cc1', 354.4330139160156),\n",
+       "  (331417, 'P1(Cc2c3c(ccc2)CCC23CCc3c2c(ccc3)C1)c1ccccc1', 354.4330139160156)),\n",
+       " ((320, 'COc1cc(OC)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(OC)c1', 440.5639953613281),\n",
+       "  (331418,\n",
+       "   'P(C1CCCCC1)(C1CCCCC1)c1c(cccc1)c1c(cc(cc1OC)OC)OC',\n",
+       "   440.5639953613281)),\n",
+       " ((3, 'COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1', 410.5379943847656),\n",
+       "  (331405, 'COc1cccc(c1c1ccccc1P(C1CCCCC1)C1CCCCC1)OC', 410.5379943847656)),\n",
+       " ((60,\n",
+       "   'Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   388.5350036621094),\n",
+       "  (331408,\n",
+       "   'Cc1cc(C)cc(c1P(c1c(C)cc(cc1C)C)c1c(C)cc(cc1C)C)C',\n",
+       "   388.5350036621094)),\n",
+       " ((241905, '[H]P([H])C', 48.025001525878906),\n",
+       "  (1497, 'CP', 48.025001525878906)),\n",
+       " ((771,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   378.5400085449219),\n",
+       "  (2040,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+       "   378.5400085449219)),\n",
+       " ((779,\n",
+       "   'c1ccc(-n2cccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   443.614990234375),\n",
+       "  (2041,\n",
+       "   'c1ccc(cc1)n1cccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   443.614990234375)),\n",
+       " ((729, 'CCP(CC)c1ccccc1-n1c2ccccc2c2ccccc21', 331.39898681640625),\n",
+       "  (331422, 'CCP(CC)c1ccccc1n2c3ccccc3c4c2cccc4', 331.39898681640625)),\n",
+       " ((417,\n",
+       "   'CC(C)(C)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   358.54998779296875),\n",
+       "  (2035,\n",
+       "   'CC(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)(C)C',\n",
+       "   358.54998779296875)),\n",
+       " ((783,\n",
+       "   'c1ccc(-n2c(P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)cc3ccccc32)cc1',\n",
+       "   493.67498779296875),\n",
+       "  (2042,\n",
+       "   'c1ccc(cc1)n1c2ccccc2cc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   493.67498779296875)),\n",
+       " ((1201,\n",
+       "   'Cc1cc(C)c(-c2cccc3c2CP(C2CCCCC2)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   530.7360229492188),\n",
+       "  (2049,\n",
+       "   'Cc1cc(C)cc(c1c1cccc2c1CP(Cc1c2cccc1c1c(C)cc(cc1C)C)C1CCCCC1)C',\n",
+       "   530.7360229492188)),\n",
+       " ((1202,\n",
+       "   'Cc1cc(C)c(-c2cccc3c2CP(C(C)C)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   490.6709899902344),\n",
+       "  (2050,\n",
+       "   'CC(P1Cc2c(c3c(C1)c(ccc3)c1c(C)cc(cc1C)C)cccc2c1c(C)cc(cc1C)C)C',\n",
+       "   490.6709899902344)),\n",
+       " ((241701, '[H]P([H])C(C)(C)C', 90.10600280761719),\n",
+       "  (1298, 'CC(C)(C)P', 90.10600280761719)),\n",
+       " ((242088, '[H]P([H])[H]', 33.99800109863281), (1299, 'P', 33.99800109863281))]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "duplicates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "1f1bd039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the object\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "3c5ac97b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('duplicates.pkl', 'wb') as file:\n",
+    "    pickle.dump(duplicates, file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a187e06d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/backend/app/app/api/v2/endpoints/molecule.py b/backend/app/app/api/v2/endpoints/molecule.py
index fa29c9b..930ed40 100644
--- a/backend/app/app/api/v2/endpoints/molecule.py
+++ b/backend/app/app/api/v2/endpoints/molecule.py
@@ -424,8 +424,13 @@ def search_neighbors(
         raise HTTPException(
             status_code=400, detail="No molecule with the id provided was found!"
         )
-
-    return results
+    # If a molecule has no UMAP or PCA data, the query still returns rows, but they are just the molecules from id 1 up to the limit.
+    # The easiest way to detect this is to check whether the first molecule returned is the requested one, since a molecule's distance to itself should be 0.
+    if (results[0][1] == molecule_id):
+        return results
+    else:
+        # Return 204 (No Content): the query executed, but the molecule has no UMAP or PCA data, so the rows are not real neighbors.
+        return Response(status_code=204)
 
 
 @router.get("/dimensions/", response_model=List[schemas.MoleculeComponents])
diff --git a/duplicates.pkl b/duplicates.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..a878a6a40dd0c7c60c497220cf369566d6959c60
GIT binary patch
literal 9364
zcmb_hYmA&#8Q$61>1<o*1%g#*WreoRG%RH<+bw}@#~xkkOlD`aqy*L_8@@t|!fG$+
z#n!qYSzU7!l`U2AQq!n~UqTcwHKy^0G2SC8ZGXrbtSCgVHV`QSKF@nD-#4@0?lQsI
z%bYpid7t<Gp7)#??Km{PyiHz16PfA@DV<cc`os;TdSbkU=}%6SkAElGrsofC9I9vV
zFI!X5r(+Z43ZF0V&s9ccxnxsn!LR^SpG@gFiqpjP(_<6WSJV1Rl~XFO@XDxMww~XR
z8wm0NUwQoGWB{t2=T=S{u;-I{rOfApOpwbCY{=Aesz3k$sY2G!1i9{flnl(yQngJK
z;l|X{!va=*u~D^6l-oZA&B9JyQ8kt8Q>ss8YEUer@)%JW{6t52GuNkb^#X@np^BhU
zwIW6Zeg^tIn>)IMclEBc?pHMtm&ie;E)YV8)hf>stgUKPsv8R}+aMg|$f*GqLjNhH
z=Nogor$e9a8#PRyI;t0mr{odU5pur(<a0XpC3Tg9PheH33*qS+Re<!jr{|Rn(4ths
zDl92vxr$cztAJKa>JNzp2UV#wB-Vr3@SDe9E-SNzK2-#OOc%5AR;a@!J;v<j!J4dE
z(}(3lJ-sl>S{e2_s%t@YE!M3C_j|v0=@BDiPg39gf8`xzUK*O2J9JNVYS@I)4ND?A
zX-wK_I_vIu=AvOE`NC8aB}j@&^$-KUJ!0x2z~CMF#Zw|3pHN80PH}R7wjU4QckuA2
zfxQqZ&|hzu1Hsx#dz)Tr@^{mn@R7uBb}$?R<c`WWCk@EIQhJFBAJ)Sm1JSR8{(nvD
zJ`>*oGSww!9nmy3xS-M0N#Gs$U{A^L%_sCagVv48AcX-?d6*WK5wjrAXb=34nS=;Y
zxUq?CtI8dxVXCW5K6v)PvY{N2DVzjI#M;#-Qo6G)VtttSHz%!`%EiZkZE_6^jP{L7
zV4<^VeHTj?05bh30sO;H4vr&52C2@E;EqxtT==mC3nEoalKWI2-+O4(7<79=k2Eu<
zX{VH#Om|SAy`@YcGgYB67GW-!K5jCwRZ%p{G4MQlXumk67t?i&x4M024Bgg$s>_J@
z>VSTWagU42xMhI)Z(6Sa6lC)Yn-ULB#UsLRAA5Py!2cjAR7L=VrA<{yD|~`p7A;DJ
zj5)j@)uxw$nSBA-jobhfdsb}*8|<K!7xZOrK0@TCqD!I5AGuE{?o+in-<}kJ^5MpP
zN>L5}6g1DX<PSjDI{)$Mt)1U38QO>MNVq_hhuJAh0I(pX+xyA$y|3*)Vd(ZeiU6Kx
z$^rtI&hrmJ@lsm9(_1WB+z_CWongj<a+g!3T58uOlHn|u$1`QDrv%U6B4A9ina!UT
z<c70l%x<dX3z&6p^FoU#)2s#FzfquSkR2AmUgYG4i+nK%cfV}hBzPY<Z@paInb7M6
zgJBp~f?_9w%m4}>w)s2+h~gp6CKzW@+GHopDQt7hezYKFQ)IbwW7!8**ru)WOMN-3
z3bR%L7`Be_$_}?2ws?j9lnjbh-X_A-9QuAz=v)rDb)mJ8L$;nmJD+>t_=l8EA<!k5
zH;?_?hTVk&2v-w!3Zcwc_76eyyOe$h_fJ!i0<>Cg6grS0vHi@PE)&Pc8OK>460OjJ
z-=a?47j}Ts{IR6M=rBBc82qwMVAA*or4cXGLu}S~^+1+`55F=gvQ+O*>ou<DlTypZ
zFR&Wq(dS8tl>#r+Xs8gLcrh|O3?k#RjEt;>3g}$x-WW%lj(x9dE-Gog$`g{m^p=d2
zX`;ka!W_R&>4h!>NrDy~l5OdD4f^q{u!#A$#2Z_ra_xM>N?@I2p{qBPvhpKK(2JPn
z1y%-hw90jK>h%awqblB7NL0580`b6e*9{wQobM1knt-~veXjvm`RIi!T!#`)M)9E1
zKhk<#4X1)uBs0t-XPpD%KOHp!egG{G;oK5RnagLR=%Fan%IDx9-}-hv*SHTrkbQ7$
z`J}KaUrxs@ZggDJ8qhzR#(79(Ov}fe2K7MR%sECvC^XY7#Z&xyPp|8FhZ2IVOyd~~
zIE%7Q7!e8Dd2BR$UF9FK4lMH;hGK@OpoyMc+B)@WR_3U^XVD1=yFYbHmvP^&j;Zt^
zv+4Zh>0$3ABtQU&|IrNMfNWwx#;UId3=|<19xvoHu*Wwzryzl+xBD#)S~V0MD3T|k
zG|#AD_*BF+qr%feg1@KrW>>KSGPhl6V4G60wau;;>||=veQDclR0k9KCdY;~skJS1
zw$ria1BG5s>mFDBS}Gwh%)W7VrLnzZI+WQR>`@@iWLg)SeaMP4oheNv5HIZ=x0zEZ
z=`|)o7^!c(@n}SWFn>$yn+y;`Ey#_GWa#pd+z9K=bRzkx7)cE$rmd{P>AuI$44b^T
zF`)+yAMAl(VviRA_rJ(E-#wBi=X~p0d0x=!&uLWorZkz!#4-B7_4~_TKOxdr<}|_4
zE{*3Q?AqA><0j=~!A8x)LBJQ&dM#Q|XR}u9%pLNhH~(bR*mYsM03m&RGZ0thAX@B3
z=7Av7@BOiMCo95GCpgFk-xG7JZCrxx<g4sX&`s84qd*1CqOZYj!BXmE%c6JSi_O~J
zZ4ZnJeD!2|NY}L7^{)a2xF{VY{*CWUEO1Uv2EH?SdrW5%yAoJua4tZ|yt#A1Y1`3O
z-~T4g1+alT7dSk~_f#4ii<l>LWhDFJCSk4J+kSOItX!Elm3;QH<;fwy;!f-eIe!e{
zUVwHuql%{Yv8^v|>?oze)t&UoND6z)*NbMkS2}0h*}2KJn556{owM|aiH4E1evc^>
zECGTg+ZIPK3yKBKcB~jh2_^hDg93LlO!LLAnj!2_A^vk|U5aF!3&~b9@8g%%CXJxC
zCiJ@)Ek5&jUCUQP)r6w<Bs5(LdPF4A=jD=tVF)@LPa~-#ljcyceWkr0tGrt;o55CM
z1HTwH_)A(BW@=S4=n@8;7U2k%+_!bt38Qv0VKBmVFkvHM7>l9yQ4C1(4C>!ZlK5&B
z32`SFp<Gy3Z%G&+_po6X4cSj;d=CVf*0t-4tu<Ypvdz)ferxcuI^NM{uctgmH-KDz
z;n+lFm0nIS^7I_K(Z`6iPi&(>wc}g&4~ucj_tXDZf{Zp5Qsu+Tv2n+7PX%6E^*Yx{
zfxzSl0q`LZBo5rY9OKi0y8)#5SfyfIT>b*3zzNtGgY7^J1?7^v${?mIq*fje^6k=B
zgsB4MLi^ec!5Vt!B|kf2Eb#zkAdwX!?1N38Vlqw4=ZV1RCEY7>LqBM47;!9TYz|OD
zz_NJ?IzZz68$Z!yxPOt{uQP6AKLjb{dl?>nC{WYf)Tt;M#Clfu^wVDMzPHf=e%e0H
z+jqG<-G9`!eB}ozl)-TV@XlDIGy*|3&X^xD9F2vK6vyEp(zhh^rD`>5@oILDFb00{
zV+UA#=iFbMFqU|TOqWJhMb;aH=yGOuLhtEUPzk@yZ%SmE;(xjAG87=QL{q5+5~iUK
zq1Kdz)KZ_j;fN9ZD-?WsNn+R;;Rp?raw9=_^N<uW3P;VFIoZPg#!CYCpKJ<$XK8P8
z8~z`8@U2$*LzLd$Zrg7n%|$MphEQ}aqu-#lr-S`hCG=I+%z0(@!(ve2cadyRmDw#b
zK@pmX9YMZ(<v%UkFOlt4^;||FOA0gqjLnS~hemwnt*Y20ymyO|gL^5VuQ7$gWE^*E
zzNDWA5chE9c6EF61&ICQ?&j{xlt6Y+Sxq<F+b~;j=pX23OApSC@*Bl~!ILvwcX>QF
z0PVhG4_OO*hWz6%+1&c9AIR6@o)&zs*RVP3gUZ_K!;yRp@)*H@4WIdmjmqAQZU^`r
z`CsEfpeKw$eT5FnD;D*p&x`>Nfi~hSzd8u7@%eaTvQTPcH(x!4Yk&Ew!B4?&$)xU~
z;4L1)Mfg3(j!qir|DeBsaT;zXW;k{UL#PDtJO*T$pLAa#;xOog$6Z?}bes%bwpDTG
zUW2#)hfhKzs{3+UUvD(Slk5m}Z0yh?9Y)x=gp&}?gF(1c?cASH55)@t)(ub_<?fv-
z?RxySl88|L8hv_26BSf5%_=)tAumRW!0H?5w89P|lnoY`rKR+3d-WVWBJD(6M-^g*
z`pRAaNjwH5S(tP^IX%MWB<_p9*h!QN8Yw8aVsr4cJ&{z;r}c&g&p0Et>K_jYQ6KHl
z8x&EQFwzK-d61WD5-BNA7545HvGQ`7z)D$;RiMEi-8XvZ>0xnSHNmd~93&G`ga$^A
zp!gti%PkpbS(zkBkqCUDT1U|!;$ysmc6d!9f;)cRh<pbh`LUI+y%XIZe4Ha4cdiVW
zQVi@~VAEJRFw1M_hDi$O;ZQzSuzZB~!U~Hi7gqUiWvUmY^jsgD(I=ok##`45{H-en
XSWtT1U0Gb5(g}X;=x4?zZom3}JWMxC

literal 0
HcmV?d00001

diff --git a/frontend/src/pages/Molecule.jsx b/frontend/src/pages/Molecule.jsx
index 90e2bb7..fe6e020 100644
--- a/frontend/src/pages/Molecule.jsx
+++ b/frontend/src/pages/Molecule.jsx
@@ -46,7 +46,10 @@ async function dimensionality(molecule_id, type, components, signal, limit=10) {
       if (!response.ok) {
          throw new Error('Invalid Molecule Id')
       }
-   
+      else if (response.status == 204) {
+         // A 204 response means the molecule has no PCA or UMAP data, so return an empty array.
+         return [];
+      }
       else {
          return await response.json()
       }
@@ -169,7 +172,9 @@ export default function MoleculeInfo() {
 
    return (
       <Container maxWidth="xl">
-         <Grid container alignItems="center" justifyContent="center" spacing={2}>
+         
+         { Object.keys(molData).length > 0 ?
+            <Grid container alignItems="center" justifyContent="center" spacing={2}>
             <Grid item xs={(width > 1366) ? 6 : 12} sx={{mt: 3}}>
             {Object.keys(svg).length > 0 && 
                <Box 
@@ -195,7 +200,7 @@ export default function MoleculeInfo() {
             <Grid item xs={(width > 1366) ? 6 : 12}>
                {Object.keys(molData).length > 0 && <MoleculeDataTable molecule_id={molData.molecule_id} initial_data_type="ml" />}
             </Grid>
-            {(width > 768) && allConformers.length > 0 && conformer.length > 0 && <Grid item xs={(width > 1366) ? 6 : 12}>
+            {(width > 768) && allConformers.length > 0 && conformer.length > 0 && <Grid item xs={(width > 1366) && Object.keys(neighborData).length > 0 ? 6 : 12}>
                <Container>
                   <FormControl fullWidth variant="standard">
                      <InputLabel id="conformer">Conformer</InputLabel>
@@ -219,8 +224,8 @@ export default function MoleculeInfo() {
                   </Box>
                </Container>
             </Grid>}
-            {(width > 768) && <Grid item xs={(width > 1366) && allConformers.length > 0 && conformer.length > 0 ? 6 : 12}>
-               {Object.keys(neighborData).length > 0 ? 
+            {(width > 768) && Object.keys(neighborData).length > 0 && <Grid item xs={(width > 1366) && allConformers.length > 0 && conformer.length > 0 ? 6 : 12}>
+               {Object.keys(neighborData).length > 0 &&
                <Box display="flex" flexDirection="column" justifyContent="center" alignItems="center">
                   <TextField
                   select
@@ -236,16 +241,13 @@ export default function MoleculeInfo() {
                      <Graph molData={neighborData} componentArray={components} type={type} neighborSearch={true}></Graph>
                   </Container>
                </Box>
-               :
-               <Box display="flex" flexDirection="column" justifyContent="center" alignItems="center">
-                  <CircularProgress />
-               </Box>
                }
             </Grid>
             }
-            {Object.keys(molData).length > 0 && (width > 768) && <Grid item xs={12}>
-            </Grid>}
-         </Grid>
+         </Grid> :
+         <Box display="flex" flexDirection="column" justifyContent="center" alignItems="center">
+            <CircularProgress />
+         </Box>}
       </Container>
    )
 }

From 1773ec75297f6dcc3313aee2a2532f88ffa71d26 Mon Sep 17 00:00:00 2001
From: Usman Jamshed <usman.jamshedk@gmail.com>
Date: Sat, 14 Oct 2023 16:45:11 -0400
Subject: [PATCH 2/3] Removing ipynb checkpoints

---
 .../DB_investigation-checkpoint.ipynb         | 798 ------------------
 1 file changed, 798 deletions(-)
 delete mode 100644 .ipynb_checkpoints/DB_investigation-checkpoint.ipynb

diff --git a/.ipynb_checkpoints/DB_investigation-checkpoint.ipynb b/.ipynb_checkpoints/DB_investigation-checkpoint.ipynb
deleted file mode 100644
index 025dc89..0000000
--- a/.ipynb_checkpoints/DB_investigation-checkpoint.ipynb
+++ /dev/null
@@ -1,798 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9368bb63",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Want to start by checking which molecules are duplicates.\n",
-    "# For examples we have 241905 and 1497"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "e28921c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import psycopg2\n",
-    "import pandas as pd\n",
-    "from rdkit import Chem"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "214277f9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tqdm import tqdm"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "a128c9e3",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_556/3175014960.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
-      "  df = pd.read_sql_query(query, connection)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Establish a connection\n",
-    "# You must have the DB container running to run this cell successfully.\n",
-    "# Connection parameters\n",
-    "db_params = {\n",
-    "    'dbname': 'postgres',\n",
-    "    'user': 'postgres',\n",
-    "    'password': '',\n",
-    "    'host': '127.0.0.1',\n",
-    "    'port': '5432'\n",
-    "}\n",
-    "\n",
-    "# Establish a connection to the PostgreSQL database\n",
-    "connection = psycopg2.connect(**db_params)\n",
-    "\n",
-    "# Execute an SQL statement\n",
-    "query = \"SELECT molecule_id, smiles, molecular_weight FROM molecule\"\n",
-    "df = pd.read_sql_query(query, connection)\n",
-    "\n",
-    "# Close the connection\n",
-    "connection.close()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "686a1a35",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>molecule_id</th>\n",
-       "      <th>smiles</th>\n",
-       "      <th>molecular_weight</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>331406</td>\n",
-       "      <td>COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC</td>\n",
-       "      <td>398.441986</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>140360</td>\n",
-       "      <td>COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1</td>\n",
-       "      <td>398.532990</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>331409</td>\n",
-       "      <td>C1CCC(CC1)P(c1ccccc1)C1CCCCC1</td>\n",
-       "      <td>274.388000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>2027</td>\n",
-       "      <td>CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...</td>\n",
-       "      <td>497.707001</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>2036</td>\n",
-       "      <td>CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...</td>\n",
-       "      <td>1049.558960</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330962</th>\n",
-       "      <td>608</td>\n",
-       "      <td>Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1</td>\n",
-       "      <td>346.410004</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330963</th>\n",
-       "      <td>461</td>\n",
-       "      <td>CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC</td>\n",
-       "      <td>376.509003</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330964</th>\n",
-       "      <td>1064</td>\n",
-       "      <td>Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...</td>\n",
-       "      <td>462.666992</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330965</th>\n",
-       "      <td>523</td>\n",
-       "      <td>CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...</td>\n",
-       "      <td>490.664001</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330966</th>\n",
-       "      <td>1817</td>\n",
-       "      <td>COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC</td>\n",
-       "      <td>442.447998</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>330967 rows × 3 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "        molecule_id                                             smiles  \\\n",
-       "0            331406          COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC   \n",
-       "1            140360             COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1   \n",
-       "2            331409                      C1CCC(CC1)P(c1ccccc1)C1CCCCC1   \n",
-       "3              2027  CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...   \n",
-       "4              2036  CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...   \n",
-       "...             ...                                                ...   \n",
-       "330962          608         Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1   \n",
-       "330963          461             CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC   \n",
-       "330964         1064  Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...   \n",
-       "330965          523  CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...   \n",
-       "330966         1817      COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC   \n",
-       "\n",
-       "        molecular_weight  \n",
-       "0             398.441986  \n",
-       "1             398.532990  \n",
-       "2             274.388000  \n",
-       "3             497.707001  \n",
-       "4            1049.558960  \n",
-       "...                  ...  \n",
-       "330962        346.410004  \n",
-       "330963        376.509003  \n",
-       "330964        462.666992  \n",
-       "330965        490.664001  \n",
-       "330966        442.447998  \n",
-       "\n",
-       "[330967 rows x 3 columns]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "149e8d9b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>molecule_id</th>\n",
-       "      <th>smiles</th>\n",
-       "      <th>molecular_weight</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>143223</th>\n",
-       "      <td>241905</td>\n",
-       "      <td>[H]P([H])C</td>\n",
-       "      <td>48.025002</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>329868</th>\n",
-       "      <td>1497</td>\n",
-       "      <td>CP</td>\n",
-       "      <td>48.025002</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "        molecule_id      smiles  molecular_weight\n",
-       "143223       241905  [H]P([H])C         48.025002\n",
-       "329868         1497          CP         48.025002"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Sanity check to see if the data is correct\n",
-    "df[(df[\"molecule_id\"]==241905) | (df[\"molecule_id\"]==1497)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "9596cb12",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# These two molecules are the same so lets check if rdkit will return the same smiles string when canonicalizing them\n",
-    "mol_241905 = df[df[\"molecule_id\"]==241905][\"smiles\"].to_list()[0]\n",
-    "mol_1497 = df[df[\"molecule_id\"]==1497][\"smiles\"].to_list()[0]\n",
-    "\n",
-    "# Double check with the molecular weight. Use difference is less than some tolerance 1e-6.\n",
-    "mol_241905_weight = df[df[\"molecule_id\"]==241905][\"molecular_weight\"].to_list()[0]\n",
-    "mol_1497_weight = df[df[\"molecule_id\"]==1497][\"molecular_weight\"].to_list()[0]\n",
-    "\n",
-    "a = Chem.CanonSmiles(mol_241905)\n",
-    "b = Chem.CanonSmiles(mol_241905)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "9c94d293",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CP 48.025001525878906\n",
-      "CP 48.025001525878906\n",
-      "True\n",
-      "True\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(a, mol_241905_weight)\n",
-    "print(b, mol_1497_weight)\n",
-    "print(a == b)\n",
-    "print(abs(mol_241905_weight - mol_1497_weight) < 0.000001)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "5cca5c81",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "all_data_dict = df.to_dict(orient=\"records\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "7ea5a56b",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  0%|                                                                                        | 0/330967 [00:00<?, ?it/s][23:43:15] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
-      "  0%|▏                                                                           | 653/330967 [00:00<01:38, 3339.89it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "c1ccc(cc1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)c1ccccc1\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  1%|▌                                                                          | 2331/330967 [00:00<01:20, 4099.64it/s][23:43:15] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
-      "  1%|▋                                                                          | 3150/330967 [00:00<01:24, 3897.22it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "c1ccc(cc1)P([C]1234[BH]567[BH]891[BH]1%103[BH]3%112[BH]245[BH]456[CH]678[BH]791[BH]1%10%11[BH]324[BH]5671)c1ccccc1\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  2%|█▎                                                                         | 5580/330967 [00:01<01:22, 3940.69it/s][23:43:16] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
-      "[23:43:16] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
-      "  2%|█▍                                                                         | 6369/330967 [00:01<01:23, 3905.70it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "C1CCC(CC1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)C1CCCCC1\n",
-      "c1ccc(cc1)P([C]1234[CH]567[BH]893[BH]3%102[BH]2%111[BH]145[BH]456[BH]678[BH]79%10[BH]83%11[BH]214[BH]5678)c1ccccc1\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|█████████████████████████████████████████████████████████████████████████| 330967/330967 [02:02<00:00, 2711.21it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Store each record in a hashmap with the CANONICAL smile as the key, and value as a tuple of molecule_id and molecular_weight\n",
-    "# When storing, check to see if the CANONICAL smile is already there, if so check its weight and add the tuple of molecule_ids\n",
-    "# the original  \n",
-    "hashmap = {}\n",
-    "duplicates = []\n",
-    "for entry in tqdm(all_data_dict):\n",
-    "    try:\n",
-    "        # Canonicalize smiles\n",
-    "        canonical_smile = Chem.CanonSmiles(entry[\"smiles\"])\n",
-    "        # Check if its in hashmap\n",
-    "        if canonical_smile not in hashmap:\n",
-    "            # add it\n",
-    "            hashmap[canonical_smile] = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
-    "        else:\n",
-    "            a = hashmap[canonical_smile] # Get matched molecule tuple data\n",
-    "            b = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
-    "            duplicates.append((a, b))\n",
-    "    except:\n",
-    "        # See which molecules cause issues if any, deal with them later\n",
-    "        print(entry[\"smiles\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "361ec9af",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "78"
-      ]
-     },
-     "execution_count": 43,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(duplicates)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "158c8d2c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[((1519, 'COP', 64.02400207519531),\n",
-       "  (241851, '[H]P([H])OC', 64.02400207519531)),\n",
-       " ((241739, '[H]P([H])C(C)C', 76.0790023803711),\n",
-       "  (1495, 'CC(C)P', 76.0790023803711)),\n",
-       " ((2045, 'C1CC2CCCC(C1)P2B1Nc2ccccc2c2c1cccc2', 319.1969909667969),\n",
-       "  (1006, 'c1ccc2c(c1)NB(P1C3CCCC1CCC3)c1ccccc1-2', 319.1969909667969)),\n",
-       " ((1521, 'CSP', 80.09200286865234),\n",
-       "  (241909, '[H]P([H])SC', 80.09200286865234)),\n",
-       " ((2063,\n",
-       "   'COC1=CC=C(OC)C(C2=C(C(C)C)C=C(C(C)C)C=C2C(C)C)=C1P3C(C)(C)CC4(OCCO4)CC3(C)C',\n",
-       "   554.7520141601562),\n",
-       "  (1900,\n",
-       "   'COc1ccc(OC)c(P2C(C)(C)CC3(CC2(C)C)OCCO3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
-       "   554.7520141601562)),\n",
-       " ((1494, 'CC(C)PC(C)C', 118.16000366210938),\n",
-       "  (41951, '[H]P(C(C)C)C(C)C', 118.16000366210938)),\n",
-       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
-       "  (1296, 'PC12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
-       " ((820,\n",
-       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@]57)OC8=[C@]7C(C=CC=C9)=C9C=C8',\n",
-       "   598.6380004882812),\n",
-       "  (821,\n",
-       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@@]57)OC8=[C@@]7C(C=CC=C9)=C9C=C8',\n",
-       "   598.6380004882812)),\n",
-       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
-       "  (1295, 'C1C2CC3CC1CC(PC14CC5CC(CC(C5)C1)C4)(C2)C3', 302.4419860839844)),\n",
-       " ((213971, '[H]P(F)F', 69.97799682617188), (1293, 'FPF', 69.97799682617188)),\n",
-       " ((150328, '[H]P([Si](C)(C)C)[Si](C)(C)C', 178.36399841308594),\n",
-       "  (1520, 'C[Si](C)(C)P[Si](C)(C)C', 178.36399841308594)),\n",
-       " ((2020,\n",
-       "   'c1ccc(cc1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   454.63800048828125),\n",
-       "  (306,\n",
-       "   'c1ccc(-c2ccccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   454.63800048828125)),\n",
-       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
-       "  (252420,\n",
-       "   '[H]P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   302.4419860839844)),\n",
-       " ((1491, 'Pc1ccco1', 100.05699920654297),\n",
-       "  (242228, '[H]P([H])c1ccco1', 100.05699920654297)),\n",
-       " ((2033,\n",
-       "   'CCCCc1c(F)c(F)c(c(c1F)F)c1c(cc(c(c1C(C)C)c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC)C(C)C)C(C)C',\n",
-       "   815.072998046875),\n",
-       "  (369,\n",
-       "   'CCCCc1c(F)c(F)c(-c2c(C(C)C)cc(C(C)C)c(-c3cccc(OC)c3P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)c2C(C)C)c(F)c1F',\n",
-       "   815.072998046875)),\n",
-       " ((1297, 'CC(C)(C)PC(C)(C)C', 146.21400451660156),\n",
-       "  (20221, '[H]P(C(C)(C)C)C(C)(C)C', 146.21400451660156)),\n",
-       " ((2044, 'CC(P(C(C)(C)C)C1C2CC3CC1CC(C2)C3)(C)C', 280.4360046386719),\n",
-       "  (952, 'CC(C)(C)P(C1C2CC3CC(C2)CC1C3)C(C)(C)C', 280.4360046386719)),\n",
-       " ((331415, 'P(CCCCCCCC)(CCCCCCCC)CCCCCCCC', 370.64599609375),\n",
-       "  (239, 'CCCCCCCCP(CCCCCCCC)CCCCCCCC', 370.64599609375)),\n",
-       " ((1493, 'CCP', 62.051998138427734),\n",
-       "  (241815, '[H]P([H])CC', 62.051998138427734)),\n",
-       " ((1518, 'CN(C)PN(C)C', 120.13600158691406),\n",
-       "  (91817, '[H]P(N(C)C)N(C)C', 120.13600158691406)),\n",
-       " ((1294, 'FP', 51.987998962402344),\n",
-       "  (242039, '[H]P([H])F', 51.987998962402344)),\n",
-       " ((2038, 'C1C2CC3CC1CC(C2)(C3)P(c1ccccc1)c1ccccc1', 320.4159851074219),\n",
-       "  (576, 'c1ccc(P(c2ccccc2)C23CC4CC(CC(C4)C2)C3)cc1', 320.4159851074219)),\n",
-       " ((1487, 'Pc1ccc2c(c1)CCCC2', 164.18800354003906),\n",
-       "  (242188, '[H]P([H])c1ccc2c(c1)CCCC2', 164.18800354003906)),\n",
-       " ((2034,\n",
-       "   'O1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   463.64599609375),\n",
-       "  (371,\n",
-       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCOCC2)c1',\n",
-       "   463.64599609375)),\n",
-       " ((331412, 'CC(C=CC=C1C)=C1P(C2=CC=CC=C2)C3=CC=CC=C3', 290.34600830078125),\n",
-       "  (203, 'Cc1cccc(C)c1P(c1ccccc1)c1ccccc1', 290.34600830078125)),\n",
-       " ((2043, 'CC(P(C12CC3CC(C2)CC(C1)C3)C(C)(C)C)(C)C', 280.4360046386719),\n",
-       "  (951, 'CC(C)(C)P(C(C)(C)C)C12CC3CC(CC(C3)C1)C2', 280.4360046386719)),\n",
-       " ((1492, 'CCPCC', 90.10600280761719),\n",
-       "  (85517, '[H]P(CC)CC', 90.10600280761719)),\n",
-       " ((1488, 'PC12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281),\n",
-       "  (242107, '[H]P([H])C12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281)),\n",
-       " ((2064,\n",
-       "   'COC1=CC=C(C2=C(OC)C=CC3=C2C=CC=C3)C(OC)=C1P(C4CCCCC4)C5CCCCC5',\n",
-       "   490.6239929199219),\n",
-       "  (1901,\n",
-       "   'COc1ccc(-c2c(OC)ccc3ccccc23)c(OC)c1P(C1CCCCC1)C1CCCCC1',\n",
-       "   490.6239929199219)),\n",
-       " ((331406, 'COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC', 398.4419860839844),\n",
-       "  (7, 'COc1cccc(OC)c1-c1ccccc1P(c1ccccc1)c1ccccc1', 398.4419860839844)),\n",
-       " ((1485, 'Pc1ccc2ccccc2c1', 160.156005859375),\n",
-       "  (242192, '[H]P([H])c1ccc2ccccc2c1', 160.156005859375)),\n",
-       " ((1496, 'CPC', 62.051998138427734), (137130, '[H]P(C)C', 62.051998138427734)),\n",
-       " ((2052,\n",
-       "   'CC(c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C(C)C)C',\n",
-       "   462.7019958496094),\n",
-       "  (1274,\n",
-       "   'CC(C)c1cccc(C(C)C)c1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   462.7019958496094)),\n",
-       " ((2025,\n",
-       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1cccc2c1cccc2',\n",
-       "   456.6099853515625),\n",
-       "  (348,\n",
-       "   'O=C(c1cccc2ccccc12)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   456.6099853515625)),\n",
-       " ((575, 'Pc1ccccc1', 110.09600067138672),\n",
-       "  (242207, '[H]P([H])c1ccccc1', 110.09600067138672)),\n",
-       " ((2026,\n",
-       "   'CC([Si](C(C)C)(C(C)C)Oc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
-       "   550.8839721679688),\n",
-       "  (349,\n",
-       "   'CC(C)[Si](Oc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2)(C(C)C)C(C)C',\n",
-       "   550.8839721679688)),\n",
-       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
-       "  (242106, '[H]P([H])C12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
-       " ((2048, 'COCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3', 360.5220031738281),\n",
-       "  (1140,\n",
-       "   'COCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   360.5220031738281)),\n",
-       " ((2051, 'O=Cc1ccc(cc1)P(c1ccc(cc1)C=O)C1C2CC3CC1CC(C2)C3', 376.4360046386719),\n",
-       "  (1241,\n",
-       "   'O=Cc1ccc(P(c2ccc(C=O)cc2)C2C3CC4CC(C3)CC2C4)cc1',\n",
-       "   376.4360046386719)),\n",
-       " ((2024,\n",
-       "   'COc1ccc(c(c1c1c(cc(cc1C(C)C)C(C)C)C(C)C)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC',\n",
-       "   640.9329833984375),\n",
-       "  (347,\n",
-       "   'COc1ccc(OC)c(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
-       "   640.9329833984375)),\n",
-       " ((2047,\n",
-       "   'COc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   408.5660095214844),\n",
-       "  (1137,\n",
-       "   'COc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   408.5660095214844)),\n",
-       " ((2016,\n",
-       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)Cc1ccccc1',\n",
-       "   392.5669860839844),\n",
-       "  (64,\n",
-       "   'c1ccc(CP(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   392.5669860839844)),\n",
-       " ((2022,\n",
-       "   'c1ccc(cc1)c1nn(c(c1n1nccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1)c1ccccc1',\n",
-       "   662.8619995117188),\n",
-       "  (340,\n",
-       "   'c1ccc(-c2nn(-c3ccccc3)c(-c3ccccc3)c2-n2nccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   662.8619995117188)),\n",
-       " ((2014,\n",
-       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   436.66400146484375),\n",
-       "  (6,\n",
-       "   'C1C2CC3CC1CC(P(C14CC5CC(CC(C5)C1)C4)C14CC5CC(CC(C5)C1)C4)(C2)C3',\n",
-       "   436.66400146484375)),\n",
-       " ((2037,\n",
-       "   'CN(c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
-       "   421.6090087890625),\n",
-       "  (527,\n",
-       "   'CN(C)c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   421.6090087890625)),\n",
-       " ((1490, 'Pc1cccs1', 116.125), (242229, '[H]P([H])c1cccs1', 116.125)),\n",
-       " ((331421, 'CCO[Si](OCC)(OCC)CCCP(c1ccccc1)c2ccccc2', 390.5360107421875),\n",
-       "  (728, 'CCO[Si](CCCP(c1ccccc1)c1ccccc1)(OCC)OCC', 390.5360107421875)),\n",
-       " ((1486, 'Pc1cccc2ccccc12', 160.156005859375),\n",
-       "  (242206, '[H]P([H])c1cccc2ccccc12', 160.156005859375)),\n",
-       " ((2021,\n",
-       "   'C1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   461.67401123046875),\n",
-       "  (338,\n",
-       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCCCC2)c1',\n",
-       "   461.67401123046875)),\n",
-       " ((2015,\n",
-       "   'CCCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   358.54998779296875),\n",
-       "  (10, 'CCCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2', 358.54998779296875)),\n",
-       " ((2065,\n",
-       "   'C12=CC=CC=C1C(C3=C(C4=C(P(C5CCCCC5)C6CCCCC6)C=CC=C4)C(C=CC=C7)=C7C=C3)=C8C(C=CC=C8)=C2',\n",
-       "   576.7639770507812),\n",
-       "  (1902,\n",
-       "   'c1ccc(P(C2CCCCC2)C2CCCCC2)c(-c2c(-c3c4ccccc4cc4ccccc34)ccc3ccccc23)c1',\n",
-       "   576.7639770507812)),\n",
-       " ((2027,\n",
-       "   'CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
-       "   497.7070007324219),\n",
-       "  (350,\n",
-       "   'CN(C)c1ccccc1-c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   497.7070007324219)),\n",
-       " ((331409, 'C1CCC(CC1)P(c1ccccc1)C1CCCCC1', 274.38800048828125),\n",
-       "  (68, 'c1ccc(P(C2CCCCC2)C2CCCCC2)cc1', 274.38800048828125)),\n",
-       " ((2017,\n",
-       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C(=O)Nc1ccccc1)Nc1ccccc1',\n",
-       "   406.46600341796875),\n",
-       "  (259,\n",
-       "   'O=C(Nc1ccccc1)P(C(=O)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2',\n",
-       "   406.46600341796875)),\n",
-       " ((2023,\n",
-       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
-       "   406.54998779296875),\n",
-       "  (346,\n",
-       "   'O=C(c1ccccc1)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   406.54998779296875)),\n",
-       " ((574, 'c1ccc(Pc2ccccc2)cc1', 186.19400024414062),\n",
-       "  (310083, '[H]P(c1ccccc1)c1ccccc1', 186.19400024414062)),\n",
-       " ((42, 'CC(C)(C)P(c1ccccc1-c1ccccc1)C(C)(C)C', 298.4100036621094),\n",
-       "  (331407, 'CC(P(C(C)(C)C)c1ccccc1c1ccccc1)(C)C', 298.4100036621094)),\n",
-       " ((103,\n",
-       "   'COc1ccc(C)c(-c2c(C(C)C)cc(C(C)C)cc2C(C)C)c1P(C(C)(C)C)C(C)(C)C',\n",
-       "   468.70599365234375),\n",
-       "  (331410,\n",
-       "   'COc1ccc(c(c1P(C(C)(C)C)C(C)(C)C)c1c(cc(cc1C(C)C)C(C)C)C(C)C)C',\n",
-       "   468.70599365234375)),\n",
-       " ((116, 'CN(C)c1ccc(P(c2ccccc2)c2ccccc2)cc1', 305.3609924316406),\n",
-       "  (331411, 'CN(c1ccc(cc1)P(c1ccccc1)c1ccccc1)C', 305.3609924316406)),\n",
-       " ((246,\n",
-       "   'COc1c(C)cc(P(c2cc(C)c(OC)c(C)c2)c2cc(C)c(OC)c(C)c2)cc1C',\n",
-       "   436.5320129394531),\n",
-       "  (331416,\n",
-       "   'P(c1cc(c(c(c1)C)OC)C)(c1cc(c(c(c1)C)OC)C)c1cc(c(c(c1)C)OC)C',\n",
-       "   436.5320129394531)),\n",
-       " ((234,\n",
-       "   'Fc1c(F)c(F)c(P(c2ccccc2)c2c(F)c(F)c(F)c(F)c2F)c(F)c1F',\n",
-       "   442.1919860839844),\n",
-       "  (331414,\n",
-       "   'Fc1c(c(c(c(c1F)F)P(c1c(c(c(c(c1F)F)F)F)F)c1ccccc1)F)F',\n",
-       "   442.1919860839844)),\n",
-       " ((356, 'C/C=C/CP(C(C)(C)C)C(C)(C)C', 200.30599975585938),\n",
-       "  (331419, 'P(C(C)(C)C)(C(C)(C)C)C/C=C/C', 200.30599975585938)),\n",
-       " ((487,\n",
-       "   'FC(F)(F)C(F)(F)C(F)(F)C(Cc1ccc(P(c2ccccc2)c2ccc(CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)cc2)cc1)(C(F)(F)F)C(F)(F)F',\n",
-       "   926.4099731445312),\n",
-       "  (331420,\n",
-       "   'FC(F)(F)C(F)(F)C(F)(F)C(C(F)(F)F)(C(F)(F)F)Cc1ccc(cc1)P(c3ccc(cc3)CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)c2ccccc2',\n",
-       "   926.4099731445312)),\n",
-       " ((216, 'CN(C)c1ccc(P(C(C)(C)C)C(C)(C)C)cc1', 265.3810119628906),\n",
-       "  (331413, 'CC(P(C1=CC=C(N(C)C)C=C1)C(C)(C)C)(C)C', 265.3810119628906)),\n",
-       " ((298, 'c1ccc(P2Cc3cccc4c3C3(CC4)CCc4cccc(c43)C2)cc1', 354.4330139160156),\n",
-       "  (331417, 'P1(Cc2c3c(ccc2)CCC23CCc3c2c(ccc3)C1)c1ccccc1', 354.4330139160156)),\n",
-       " ((320, 'COc1cc(OC)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(OC)c1', 440.5639953613281),\n",
-       "  (331418,\n",
-       "   'P(C1CCCCC1)(C1CCCCC1)c1c(cccc1)c1c(cc(cc1OC)OC)OC',\n",
-       "   440.5639953613281)),\n",
-       " ((3, 'COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1', 410.5379943847656),\n",
-       "  (331405, 'COc1cccc(c1c1ccccc1P(C1CCCCC1)C1CCCCC1)OC', 410.5379943847656)),\n",
-       " ((60,\n",
-       "   'Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(C)cc(C)cc2C)c(C)c1',\n",
-       "   388.5350036621094),\n",
-       "  (331408,\n",
-       "   'Cc1cc(C)cc(c1P(c1c(C)cc(cc1C)C)c1c(C)cc(cc1C)C)C',\n",
-       "   388.5350036621094)),\n",
-       " ((241905, '[H]P([H])C', 48.025001525878906),\n",
-       "  (1497, 'CP', 48.025001525878906)),\n",
-       " ((771,\n",
-       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   378.5400085449219),\n",
-       "  (2040,\n",
-       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
-       "   378.5400085449219)),\n",
-       " ((779,\n",
-       "   'c1ccc(-n2cccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   443.614990234375),\n",
-       "  (2041,\n",
-       "   'c1ccc(cc1)n1cccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   443.614990234375)),\n",
-       " ((729, 'CCP(CC)c1ccccc1-n1c2ccccc2c2ccccc21', 331.39898681640625),\n",
-       "  (331422, 'CCP(CC)c1ccccc1n2c3ccccc3c4c2cccc4', 331.39898681640625)),\n",
-       " ((417,\n",
-       "   'CC(C)(C)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   358.54998779296875),\n",
-       "  (2035,\n",
-       "   'CC(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)(C)C',\n",
-       "   358.54998779296875)),\n",
-       " ((783,\n",
-       "   'c1ccc(-n2c(P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)cc3ccccc32)cc1',\n",
-       "   493.67498779296875),\n",
-       "  (2042,\n",
-       "   'c1ccc(cc1)n1c2ccccc2cc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   493.67498779296875)),\n",
-       " ((1201,\n",
-       "   'Cc1cc(C)c(-c2cccc3c2CP(C2CCCCC2)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
-       "   530.7360229492188),\n",
-       "  (2049,\n",
-       "   'Cc1cc(C)cc(c1c1cccc2c1CP(Cc1c2cccc1c1c(C)cc(cc1C)C)C1CCCCC1)C',\n",
-       "   530.7360229492188)),\n",
-       " ((1202,\n",
-       "   'Cc1cc(C)c(-c2cccc3c2CP(C(C)C)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
-       "   490.6709899902344),\n",
-       "  (2050,\n",
-       "   'CC(P1Cc2c(c3c(C1)c(ccc3)c1c(C)cc(cc1C)C)cccc2c1c(C)cc(cc1C)C)C',\n",
-       "   490.6709899902344)),\n",
-       " ((241701, '[H]P([H])C(C)(C)C', 90.10600280761719),\n",
-       "  (1298, 'CC(C)(C)P', 90.10600280761719)),\n",
-       " ((242088, '[H]P([H])[H]', 33.99800109863281), (1299, 'P', 33.99800109863281))]"
-      ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "duplicates"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "id": "1f1bd039",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Save the object\n",
-    "import pickle"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "id": "3c5ac97b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('duplicates.pkl', 'wb') as file:\n",
-    "    pickle.dump(duplicates, file)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a187e06d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

From d56d017941a6991c5262a24aa489bcb3264e3043 Mon Sep 17 00:00:00 2001
From: Usman Jamshed <usman.jamshedk@gmail.com>
Date: Sun, 19 Nov 2023 00:04:41 -0500
Subject: [PATCH 3/3] removing db_investigation and duplicates.pkl

---
 DB_investigation.ipynb | 798 -----------------------------------------
 duplicates.pkl         | Bin 9364 -> 0 bytes
 2 files changed, 798 deletions(-)
 delete mode 100644 DB_investigation.ipynb
 delete mode 100644 duplicates.pkl

diff --git a/DB_investigation.ipynb b/DB_investigation.ipynb
deleted file mode 100644
index 025dc89..0000000
--- a/DB_investigation.ipynb
+++ /dev/null
@@ -1,798 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9368bb63",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Want to start by checking which molecules are duplicates.\n",
-    "# For examples we have 241905 and 1497"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "e28921c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import psycopg2\n",
-    "import pandas as pd\n",
-    "from rdkit import Chem"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "214277f9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tqdm import tqdm"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "a128c9e3",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_556/3175014960.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
-      "  df = pd.read_sql_query(query, connection)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Establish a connection\n",
-    "# You must have the DB container running to run this cell successfully.\n",
-    "# Connection parameters\n",
-    "db_params = {\n",
-    "    'dbname': 'postgres',\n",
-    "    'user': 'postgres',\n",
-    "    'password': '',\n",
-    "    'host': '127.0.0.1',\n",
-    "    'port': '5432'\n",
-    "}\n",
-    "\n",
-    "# Establish a connection to the PostgreSQL database\n",
-    "connection = psycopg2.connect(**db_params)\n",
-    "\n",
-    "# Execute an SQL statement\n",
-    "query = \"SELECT molecule_id, smiles, molecular_weight FROM molecule\"\n",
-    "df = pd.read_sql_query(query, connection)\n",
-    "\n",
-    "# Close the connection\n",
-    "connection.close()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "686a1a35",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>molecule_id</th>\n",
-       "      <th>smiles</th>\n",
-       "      <th>molecular_weight</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>331406</td>\n",
-       "      <td>COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC</td>\n",
-       "      <td>398.441986</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>140360</td>\n",
-       "      <td>COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1</td>\n",
-       "      <td>398.532990</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>331409</td>\n",
-       "      <td>C1CCC(CC1)P(c1ccccc1)C1CCCCC1</td>\n",
-       "      <td>274.388000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>2027</td>\n",
-       "      <td>CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...</td>\n",
-       "      <td>497.707001</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>2036</td>\n",
-       "      <td>CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...</td>\n",
-       "      <td>1049.558960</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330962</th>\n",
-       "      <td>608</td>\n",
-       "      <td>Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1</td>\n",
-       "      <td>346.410004</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330963</th>\n",
-       "      <td>461</td>\n",
-       "      <td>CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC</td>\n",
-       "      <td>376.509003</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330964</th>\n",
-       "      <td>1064</td>\n",
-       "      <td>Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...</td>\n",
-       "      <td>462.666992</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330965</th>\n",
-       "      <td>523</td>\n",
-       "      <td>CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...</td>\n",
-       "      <td>490.664001</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>330966</th>\n",
-       "      <td>1817</td>\n",
-       "      <td>COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC</td>\n",
-       "      <td>442.447998</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>330967 rows × 3 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "        molecule_id                                             smiles  \\\n",
-       "0            331406          COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC   \n",
-       "1            140360             COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1   \n",
-       "2            331409                      C1CCC(CC1)P(c1ccccc1)C1CCCCC1   \n",
-       "3              2027  CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...   \n",
-       "4              2036  CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...   \n",
-       "...             ...                                                ...   \n",
-       "330962          608         Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1   \n",
-       "330963          461             CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC   \n",
-       "330964         1064  Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...   \n",
-       "330965          523  CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...   \n",
-       "330966         1817      COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC   \n",
-       "\n",
-       "        molecular_weight  \n",
-       "0             398.441986  \n",
-       "1             398.532990  \n",
-       "2             274.388000  \n",
-       "3             497.707001  \n",
-       "4            1049.558960  \n",
-       "...                  ...  \n",
-       "330962        346.410004  \n",
-       "330963        376.509003  \n",
-       "330964        462.666992  \n",
-       "330965        490.664001  \n",
-       "330966        442.447998  \n",
-       "\n",
-       "[330967 rows x 3 columns]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "149e8d9b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>molecule_id</th>\n",
-       "      <th>smiles</th>\n",
-       "      <th>molecular_weight</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>143223</th>\n",
-       "      <td>241905</td>\n",
-       "      <td>[H]P([H])C</td>\n",
-       "      <td>48.025002</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>329868</th>\n",
-       "      <td>1497</td>\n",
-       "      <td>CP</td>\n",
-       "      <td>48.025002</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "        molecule_id      smiles  molecular_weight\n",
-       "143223       241905  [H]P([H])C         48.025002\n",
-       "329868         1497          CP         48.025002"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Sanity check to see if the data is correct\n",
-    "df[(df[\"molecule_id\"]==241905) | (df[\"molecule_id\"]==1497)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "9596cb12",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# These two molecules are the same so lets check if rdkit will return the same smiles string when canonicalizing them\n",
-    "mol_241905 = df[df[\"molecule_id\"]==241905][\"smiles\"].to_list()[0]\n",
-    "mol_1497 = df[df[\"molecule_id\"]==1497][\"smiles\"].to_list()[0]\n",
-    "\n",
-    "# Double check with the molecular weight. Use difference is less than some tolerance 1e-6.\n",
-    "mol_241905_weight = df[df[\"molecule_id\"]==241905][\"molecular_weight\"].to_list()[0]\n",
-    "mol_1497_weight = df[df[\"molecule_id\"]==1497][\"molecular_weight\"].to_list()[0]\n",
-    "\n",
-    "a = Chem.CanonSmiles(mol_241905)\n",
-    "b = Chem.CanonSmiles(mol_241905)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "9c94d293",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CP 48.025001525878906\n",
-      "CP 48.025001525878906\n",
-      "True\n",
-      "True\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(a, mol_241905_weight)\n",
-    "print(b, mol_1497_weight)\n",
-    "print(a == b)\n",
-    "print(abs(mol_241905_weight - mol_1497_weight) < 0.000001)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "5cca5c81",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "all_data_dict = df.to_dict(orient=\"records\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "7ea5a56b",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  0%|                                                                                        | 0/330967 [00:00<?, ?it/s][23:43:15] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
-      "  0%|▏                                                                           | 653/330967 [00:00<01:38, 3339.89it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "c1ccc(cc1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)c1ccccc1\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  1%|▌                                                                          | 2331/330967 [00:00<01:20, 4099.64it/s][23:43:15] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
-      "  1%|▋                                                                          | 3150/330967 [00:00<01:24, 3897.22it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "c1ccc(cc1)P([C]1234[BH]567[BH]891[BH]1%103[BH]3%112[BH]245[BH]456[CH]678[BH]791[BH]1%10%11[BH]324[BH]5671)c1ccccc1\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  2%|█▎                                                                         | 5580/330967 [00:01<01:22, 3940.69it/s][23:43:16] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
-      "[23:43:16] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
-      "  2%|█▍                                                                         | 6369/330967 [00:01<01:23, 3905.70it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "C1CCC(CC1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)C1CCCCC1\n",
-      "c1ccc(cc1)P([C]1234[CH]567[BH]893[BH]3%102[BH]2%111[BH]145[BH]456[BH]678[BH]79%10[BH]83%11[BH]214[BH]5678)c1ccccc1\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|█████████████████████████████████████████████████████████████████████████| 330967/330967 [02:02<00:00, 2711.21it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Store each record in a hashmap with the CANONICAL smile as the key, and value as a tuple of molecule_id and molecular_weight\n",
-    "# When storing, check to see if the CANONICAL smile is already there, if so check its weight and add the tuple of molecule_ids\n",
-    "# the original  \n",
-    "hashmap = {}\n",
-    "duplicates = []\n",
-    "for entry in tqdm(all_data_dict):\n",
-    "    try:\n",
-    "        # Canonicalize smiles\n",
-    "        canonical_smile = Chem.CanonSmiles(entry[\"smiles\"])\n",
-    "        # Check if its in hashmap\n",
-    "        if canonical_smile not in hashmap:\n",
-    "            # add it\n",
-    "            hashmap[canonical_smile] = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
-    "        else:\n",
-    "            a = hashmap[canonical_smile] # Get matched molecule tuple data\n",
-    "            b = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
-    "            duplicates.append((a, b))\n",
-    "    except:\n",
-    "        # See which molecules cause issues if any, deal with them later\n",
-    "        print(entry[\"smiles\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "361ec9af",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "78"
-      ]
-     },
-     "execution_count": 43,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(duplicates)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "158c8d2c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[((1519, 'COP', 64.02400207519531),\n",
-       "  (241851, '[H]P([H])OC', 64.02400207519531)),\n",
-       " ((241739, '[H]P([H])C(C)C', 76.0790023803711),\n",
-       "  (1495, 'CC(C)P', 76.0790023803711)),\n",
-       " ((2045, 'C1CC2CCCC(C1)P2B1Nc2ccccc2c2c1cccc2', 319.1969909667969),\n",
-       "  (1006, 'c1ccc2c(c1)NB(P1C3CCCC1CCC3)c1ccccc1-2', 319.1969909667969)),\n",
-       " ((1521, 'CSP', 80.09200286865234),\n",
-       "  (241909, '[H]P([H])SC', 80.09200286865234)),\n",
-       " ((2063,\n",
-       "   'COC1=CC=C(OC)C(C2=C(C(C)C)C=C(C(C)C)C=C2C(C)C)=C1P3C(C)(C)CC4(OCCO4)CC3(C)C',\n",
-       "   554.7520141601562),\n",
-       "  (1900,\n",
-       "   'COc1ccc(OC)c(P2C(C)(C)CC3(CC2(C)C)OCCO3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
-       "   554.7520141601562)),\n",
-       " ((1494, 'CC(C)PC(C)C', 118.16000366210938),\n",
-       "  (41951, '[H]P(C(C)C)C(C)C', 118.16000366210938)),\n",
-       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
-       "  (1296, 'PC12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
-       " ((820,\n",
-       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@]57)OC8=[C@]7C(C=CC=C9)=C9C=C8',\n",
-       "   598.6380004882812),\n",
-       "  (821,\n",
-       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@@]57)OC8=[C@@]7C(C=CC=C9)=C9C=C8',\n",
-       "   598.6380004882812)),\n",
-       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
-       "  (1295, 'C1C2CC3CC1CC(PC14CC5CC(CC(C5)C1)C4)(C2)C3', 302.4419860839844)),\n",
-       " ((213971, '[H]P(F)F', 69.97799682617188), (1293, 'FPF', 69.97799682617188)),\n",
-       " ((150328, '[H]P([Si](C)(C)C)[Si](C)(C)C', 178.36399841308594),\n",
-       "  (1520, 'C[Si](C)(C)P[Si](C)(C)C', 178.36399841308594)),\n",
-       " ((2020,\n",
-       "   'c1ccc(cc1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   454.63800048828125),\n",
-       "  (306,\n",
-       "   'c1ccc(-c2ccccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   454.63800048828125)),\n",
-       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
-       "  (252420,\n",
-       "   '[H]P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   302.4419860839844)),\n",
-       " ((1491, 'Pc1ccco1', 100.05699920654297),\n",
-       "  (242228, '[H]P([H])c1ccco1', 100.05699920654297)),\n",
-       " ((2033,\n",
-       "   'CCCCc1c(F)c(F)c(c(c1F)F)c1c(cc(c(c1C(C)C)c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC)C(C)C)C(C)C',\n",
-       "   815.072998046875),\n",
-       "  (369,\n",
-       "   'CCCCc1c(F)c(F)c(-c2c(C(C)C)cc(C(C)C)c(-c3cccc(OC)c3P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)c2C(C)C)c(F)c1F',\n",
-       "   815.072998046875)),\n",
-       " ((1297, 'CC(C)(C)PC(C)(C)C', 146.21400451660156),\n",
-       "  (20221, '[H]P(C(C)(C)C)C(C)(C)C', 146.21400451660156)),\n",
-       " ((2044, 'CC(P(C(C)(C)C)C1C2CC3CC1CC(C2)C3)(C)C', 280.4360046386719),\n",
-       "  (952, 'CC(C)(C)P(C1C2CC3CC(C2)CC1C3)C(C)(C)C', 280.4360046386719)),\n",
-       " ((331415, 'P(CCCCCCCC)(CCCCCCCC)CCCCCCCC', 370.64599609375),\n",
-       "  (239, 'CCCCCCCCP(CCCCCCCC)CCCCCCCC', 370.64599609375)),\n",
-       " ((1493, 'CCP', 62.051998138427734),\n",
-       "  (241815, '[H]P([H])CC', 62.051998138427734)),\n",
-       " ((1518, 'CN(C)PN(C)C', 120.13600158691406),\n",
-       "  (91817, '[H]P(N(C)C)N(C)C', 120.13600158691406)),\n",
-       " ((1294, 'FP', 51.987998962402344),\n",
-       "  (242039, '[H]P([H])F', 51.987998962402344)),\n",
-       " ((2038, 'C1C2CC3CC1CC(C2)(C3)P(c1ccccc1)c1ccccc1', 320.4159851074219),\n",
-       "  (576, 'c1ccc(P(c2ccccc2)C23CC4CC(CC(C4)C2)C3)cc1', 320.4159851074219)),\n",
-       " ((1487, 'Pc1ccc2c(c1)CCCC2', 164.18800354003906),\n",
-       "  (242188, '[H]P([H])c1ccc2c(c1)CCCC2', 164.18800354003906)),\n",
-       " ((2034,\n",
-       "   'O1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   463.64599609375),\n",
-       "  (371,\n",
-       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCOCC2)c1',\n",
-       "   463.64599609375)),\n",
-       " ((331412, 'CC(C=CC=C1C)=C1P(C2=CC=CC=C2)C3=CC=CC=C3', 290.34600830078125),\n",
-       "  (203, 'Cc1cccc(C)c1P(c1ccccc1)c1ccccc1', 290.34600830078125)),\n",
-       " ((2043, 'CC(P(C12CC3CC(C2)CC(C1)C3)C(C)(C)C)(C)C', 280.4360046386719),\n",
-       "  (951, 'CC(C)(C)P(C(C)(C)C)C12CC3CC(CC(C3)C1)C2', 280.4360046386719)),\n",
-       " ((1492, 'CCPCC', 90.10600280761719),\n",
-       "  (85517, '[H]P(CC)CC', 90.10600280761719)),\n",
-       " ((1488, 'PC12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281),\n",
-       "  (242107, '[H]P([H])C12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281)),\n",
-       " ((2064,\n",
-       "   'COC1=CC=C(C2=C(OC)C=CC3=C2C=CC=C3)C(OC)=C1P(C4CCCCC4)C5CCCCC5',\n",
-       "   490.6239929199219),\n",
-       "  (1901,\n",
-       "   'COc1ccc(-c2c(OC)ccc3ccccc23)c(OC)c1P(C1CCCCC1)C1CCCCC1',\n",
-       "   490.6239929199219)),\n",
-       " ((331406, 'COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC', 398.4419860839844),\n",
-       "  (7, 'COc1cccc(OC)c1-c1ccccc1P(c1ccccc1)c1ccccc1', 398.4419860839844)),\n",
-       " ((1485, 'Pc1ccc2ccccc2c1', 160.156005859375),\n",
-       "  (242192, '[H]P([H])c1ccc2ccccc2c1', 160.156005859375)),\n",
-       " ((1496, 'CPC', 62.051998138427734), (137130, '[H]P(C)C', 62.051998138427734)),\n",
-       " ((2052,\n",
-       "   'CC(c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C(C)C)C',\n",
-       "   462.7019958496094),\n",
-       "  (1274,\n",
-       "   'CC(C)c1cccc(C(C)C)c1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   462.7019958496094)),\n",
-       " ((2025,\n",
-       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1cccc2c1cccc2',\n",
-       "   456.6099853515625),\n",
-       "  (348,\n",
-       "   'O=C(c1cccc2ccccc12)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   456.6099853515625)),\n",
-       " ((575, 'Pc1ccccc1', 110.09600067138672),\n",
-       "  (242207, '[H]P([H])c1ccccc1', 110.09600067138672)),\n",
-       " ((2026,\n",
-       "   'CC([Si](C(C)C)(C(C)C)Oc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
-       "   550.8839721679688),\n",
-       "  (349,\n",
-       "   'CC(C)[Si](Oc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2)(C(C)C)C(C)C',\n",
-       "   550.8839721679688)),\n",
-       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
-       "  (242106, '[H]P([H])C12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
-       " ((2048, 'COCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3', 360.5220031738281),\n",
-       "  (1140,\n",
-       "   'COCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   360.5220031738281)),\n",
-       " ((2051, 'O=Cc1ccc(cc1)P(c1ccc(cc1)C=O)C1C2CC3CC1CC(C2)C3', 376.4360046386719),\n",
-       "  (1241,\n",
-       "   'O=Cc1ccc(P(c2ccc(C=O)cc2)C2C3CC4CC(C3)CC2C4)cc1',\n",
-       "   376.4360046386719)),\n",
-       " ((2024,\n",
-       "   'COc1ccc(c(c1c1c(cc(cc1C(C)C)C(C)C)C(C)C)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC',\n",
-       "   640.9329833984375),\n",
-       "  (347,\n",
-       "   'COc1ccc(OC)c(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
-       "   640.9329833984375)),\n",
-       " ((2047,\n",
-       "   'COc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   408.5660095214844),\n",
-       "  (1137,\n",
-       "   'COc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   408.5660095214844)),\n",
-       " ((2016,\n",
-       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)Cc1ccccc1',\n",
-       "   392.5669860839844),\n",
-       "  (64,\n",
-       "   'c1ccc(CP(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   392.5669860839844)),\n",
-       " ((2022,\n",
-       "   'c1ccc(cc1)c1nn(c(c1n1nccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1)c1ccccc1',\n",
-       "   662.8619995117188),\n",
-       "  (340,\n",
-       "   'c1ccc(-c2nn(-c3ccccc3)c(-c3ccccc3)c2-n2nccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   662.8619995117188)),\n",
-       " ((2014,\n",
-       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   436.66400146484375),\n",
-       "  (6,\n",
-       "   'C1C2CC3CC1CC(P(C14CC5CC(CC(C5)C1)C4)C14CC5CC(CC(C5)C1)C4)(C2)C3',\n",
-       "   436.66400146484375)),\n",
-       " ((2037,\n",
-       "   'CN(c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
-       "   421.6090087890625),\n",
-       "  (527,\n",
-       "   'CN(C)c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   421.6090087890625)),\n",
-       " ((1490, 'Pc1cccs1', 116.125), (242229, '[H]P([H])c1cccs1', 116.125)),\n",
-       " ((331421, 'CCO[Si](OCC)(OCC)CCCP(c1ccccc1)c2ccccc2', 390.5360107421875),\n",
-       "  (728, 'CCO[Si](CCCP(c1ccccc1)c1ccccc1)(OCC)OCC', 390.5360107421875)),\n",
-       " ((1486, 'Pc1cccc2ccccc12', 160.156005859375),\n",
-       "  (242206, '[H]P([H])c1cccc2ccccc12', 160.156005859375)),\n",
-       " ((2021,\n",
-       "   'C1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   461.67401123046875),\n",
-       "  (338,\n",
-       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCCCC2)c1',\n",
-       "   461.67401123046875)),\n",
-       " ((2015,\n",
-       "   'CCCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   358.54998779296875),\n",
-       "  (10, 'CCCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2', 358.54998779296875)),\n",
-       " ((2065,\n",
-       "   'C12=CC=CC=C1C(C3=C(C4=C(P(C5CCCCC5)C6CCCCC6)C=CC=C4)C(C=CC=C7)=C7C=C3)=C8C(C=CC=C8)=C2',\n",
-       "   576.7639770507812),\n",
-       "  (1902,\n",
-       "   'c1ccc(P(C2CCCCC2)C2CCCCC2)c(-c2c(-c3c4ccccc4cc4ccccc34)ccc3ccccc23)c1',\n",
-       "   576.7639770507812)),\n",
-       " ((2027,\n",
-       "   'CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
-       "   497.7070007324219),\n",
-       "  (350,\n",
-       "   'CN(C)c1ccccc1-c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   497.7070007324219)),\n",
-       " ((331409, 'C1CCC(CC1)P(c1ccccc1)C1CCCCC1', 274.38800048828125),\n",
-       "  (68, 'c1ccc(P(C2CCCCC2)C2CCCCC2)cc1', 274.38800048828125)),\n",
-       " ((2017,\n",
-       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C(=O)Nc1ccccc1)Nc1ccccc1',\n",
-       "   406.46600341796875),\n",
-       "  (259,\n",
-       "   'O=C(Nc1ccccc1)P(C(=O)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2',\n",
-       "   406.46600341796875)),\n",
-       " ((2023,\n",
-       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
-       "   406.54998779296875),\n",
-       "  (346,\n",
-       "   'O=C(c1ccccc1)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   406.54998779296875)),\n",
-       " ((574, 'c1ccc(Pc2ccccc2)cc1', 186.19400024414062),\n",
-       "  (310083, '[H]P(c1ccccc1)c1ccccc1', 186.19400024414062)),\n",
-       " ((42, 'CC(C)(C)P(c1ccccc1-c1ccccc1)C(C)(C)C', 298.4100036621094),\n",
-       "  (331407, 'CC(P(C(C)(C)C)c1ccccc1c1ccccc1)(C)C', 298.4100036621094)),\n",
-       " ((103,\n",
-       "   'COc1ccc(C)c(-c2c(C(C)C)cc(C(C)C)cc2C(C)C)c1P(C(C)(C)C)C(C)(C)C',\n",
-       "   468.70599365234375),\n",
-       "  (331410,\n",
-       "   'COc1ccc(c(c1P(C(C)(C)C)C(C)(C)C)c1c(cc(cc1C(C)C)C(C)C)C(C)C)C',\n",
-       "   468.70599365234375)),\n",
-       " ((116, 'CN(C)c1ccc(P(c2ccccc2)c2ccccc2)cc1', 305.3609924316406),\n",
-       "  (331411, 'CN(c1ccc(cc1)P(c1ccccc1)c1ccccc1)C', 305.3609924316406)),\n",
-       " ((246,\n",
-       "   'COc1c(C)cc(P(c2cc(C)c(OC)c(C)c2)c2cc(C)c(OC)c(C)c2)cc1C',\n",
-       "   436.5320129394531),\n",
-       "  (331416,\n",
-       "   'P(c1cc(c(c(c1)C)OC)C)(c1cc(c(c(c1)C)OC)C)c1cc(c(c(c1)C)OC)C',\n",
-       "   436.5320129394531)),\n",
-       " ((234,\n",
-       "   'Fc1c(F)c(F)c(P(c2ccccc2)c2c(F)c(F)c(F)c(F)c2F)c(F)c1F',\n",
-       "   442.1919860839844),\n",
-       "  (331414,\n",
-       "   'Fc1c(c(c(c(c1F)F)P(c1c(c(c(c(c1F)F)F)F)F)c1ccccc1)F)F',\n",
-       "   442.1919860839844)),\n",
-       " ((356, 'C/C=C/CP(C(C)(C)C)C(C)(C)C', 200.30599975585938),\n",
-       "  (331419, 'P(C(C)(C)C)(C(C)(C)C)C/C=C/C', 200.30599975585938)),\n",
-       " ((487,\n",
-       "   'FC(F)(F)C(F)(F)C(F)(F)C(Cc1ccc(P(c2ccccc2)c2ccc(CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)cc2)cc1)(C(F)(F)F)C(F)(F)F',\n",
-       "   926.4099731445312),\n",
-       "  (331420,\n",
-       "   'FC(F)(F)C(F)(F)C(F)(F)C(C(F)(F)F)(C(F)(F)F)Cc1ccc(cc1)P(c3ccc(cc3)CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)c2ccccc2',\n",
-       "   926.4099731445312)),\n",
-       " ((216, 'CN(C)c1ccc(P(C(C)(C)C)C(C)(C)C)cc1', 265.3810119628906),\n",
-       "  (331413, 'CC(P(C1=CC=C(N(C)C)C=C1)C(C)(C)C)(C)C', 265.3810119628906)),\n",
-       " ((298, 'c1ccc(P2Cc3cccc4c3C3(CC4)CCc4cccc(c43)C2)cc1', 354.4330139160156),\n",
-       "  (331417, 'P1(Cc2c3c(ccc2)CCC23CCc3c2c(ccc3)C1)c1ccccc1', 354.4330139160156)),\n",
-       " ((320, 'COc1cc(OC)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(OC)c1', 440.5639953613281),\n",
-       "  (331418,\n",
-       "   'P(C1CCCCC1)(C1CCCCC1)c1c(cccc1)c1c(cc(cc1OC)OC)OC',\n",
-       "   440.5639953613281)),\n",
-       " ((3, 'COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1', 410.5379943847656),\n",
-       "  (331405, 'COc1cccc(c1c1ccccc1P(C1CCCCC1)C1CCCCC1)OC', 410.5379943847656)),\n",
-       " ((60,\n",
-       "   'Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(C)cc(C)cc2C)c(C)c1',\n",
-       "   388.5350036621094),\n",
-       "  (331408,\n",
-       "   'Cc1cc(C)cc(c1P(c1c(C)cc(cc1C)C)c1c(C)cc(cc1C)C)C',\n",
-       "   388.5350036621094)),\n",
-       " ((241905, '[H]P([H])C', 48.025001525878906),\n",
-       "  (1497, 'CP', 48.025001525878906)),\n",
-       " ((771,\n",
-       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   378.5400085449219),\n",
-       "  (2040,\n",
-       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
-       "   378.5400085449219)),\n",
-       " ((779,\n",
-       "   'c1ccc(-n2cccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
-       "   443.614990234375),\n",
-       "  (2041,\n",
-       "   'c1ccc(cc1)n1cccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   443.614990234375)),\n",
-       " ((729, 'CCP(CC)c1ccccc1-n1c2ccccc2c2ccccc21', 331.39898681640625),\n",
-       "  (331422, 'CCP(CC)c1ccccc1n2c3ccccc3c4c2cccc4', 331.39898681640625)),\n",
-       " ((417,\n",
-       "   'CC(C)(C)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
-       "   358.54998779296875),\n",
-       "  (2035,\n",
-       "   'CC(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)(C)C',\n",
-       "   358.54998779296875)),\n",
-       " ((783,\n",
-       "   'c1ccc(-n2c(P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)cc3ccccc32)cc1',\n",
-       "   493.67498779296875),\n",
-       "  (2042,\n",
-       "   'c1ccc(cc1)n1c2ccccc2cc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
-       "   493.67498779296875)),\n",
-       " ((1201,\n",
-       "   'Cc1cc(C)c(-c2cccc3c2CP(C2CCCCC2)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
-       "   530.7360229492188),\n",
-       "  (2049,\n",
-       "   'Cc1cc(C)cc(c1c1cccc2c1CP(Cc1c2cccc1c1c(C)cc(cc1C)C)C1CCCCC1)C',\n",
-       "   530.7360229492188)),\n",
-       " ((1202,\n",
-       "   'Cc1cc(C)c(-c2cccc3c2CP(C(C)C)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
-       "   490.6709899902344),\n",
-       "  (2050,\n",
-       "   'CC(P1Cc2c(c3c(C1)c(ccc3)c1c(C)cc(cc1C)C)cccc2c1c(C)cc(cc1C)C)C',\n",
-       "   490.6709899902344)),\n",
-       " ((241701, '[H]P([H])C(C)(C)C', 90.10600280761719),\n",
-       "  (1298, 'CC(C)(C)P', 90.10600280761719)),\n",
-       " ((242088, '[H]P([H])[H]', 33.99800109863281), (1299, 'P', 33.99800109863281))]"
-      ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "duplicates"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "id": "1f1bd039",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Save the object\n",
-    "import pickle"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "id": "3c5ac97b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('duplicates.pkl', 'wb') as file:\n",
-    "    pickle.dump(duplicates, file)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a187e06d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/duplicates.pkl b/duplicates.pkl
deleted file mode 100644
index a878a6a40dd0c7c60c497220cf369566d6959c60..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9364
zcmb_hYmA&#8Q$61>1<o*1%g#*WreoRG%RH<+bw}@#~xkkOlD`aqy*L_8@@t|!fG$+
z#n!qYSzU7!l`U2AQq!n~UqTcwHKy^0G2SC8ZGXrbtSCgVHV`QSKF@nD-#4@0?lQsI
z%bYpid7t<Gp7)#??Km{PyiHz16PfA@DV<cc`os;TdSbkU=}%6SkAElGrsofC9I9vV
zFI!X5r(+Z43ZF0V&s9ccxnxsn!LR^SpG@gFiqpjP(_<6WSJV1Rl~XFO@XDxMww~XR
z8wm0NUwQoGWB{t2=T=S{u;-I{rOfApOpwbCY{=Aesz3k$sY2G!1i9{flnl(yQngJK
z;l|X{!va=*u~D^6l-oZA&B9JyQ8kt8Q>ss8YEUer@)%JW{6t52GuNkb^#X@np^BhU
zwIW6Zeg^tIn>)IMclEBc?pHMtm&ie;E)YV8)hf>stgUKPsv8R}+aMg|$f*GqLjNhH
z=Nogor$e9a8#PRyI;t0mr{odU5pur(<a0XpC3Tg9PheH33*qS+Re<!jr{|Rn(4ths
zDl92vxr$cztAJKa>JNzp2UV#wB-Vr3@SDe9E-SNzK2-#OOc%5AR;a@!J;v<j!J4dE
z(}(3lJ-sl>S{e2_s%t@YE!M3C_j|v0=@BDiPg39gf8`xzUK*O2J9JNVYS@I)4ND?A
zX-wK_I_vIu=AvOE`NC8aB}j@&^$-KUJ!0x2z~CMF#Zw|3pHN80PH}R7wjU4QckuA2
zfxQqZ&|hzu1Hsx#dz)Tr@^{mn@R7uBb}$?R<c`WWCk@EIQhJFBAJ)Sm1JSR8{(nvD
zJ`>*oGSww!9nmy3xS-M0N#Gs$U{A^L%_sCagVv48AcX-?d6*WK5wjrAXb=34nS=;Y
zxUq?CtI8dxVXCW5K6v)PvY{N2DVzjI#M;#-Qo6G)VtttSHz%!`%EiZkZE_6^jP{L7
zV4<^VeHTj?05bh30sO;H4vr&52C2@E;EqxtT==mC3nEoalKWI2-+O4(7<79=k2Eu<
zX{VH#Om|SAy`@YcGgYB67GW-!K5jCwRZ%p{G4MQlXumk67t?i&x4M024Bgg$s>_J@
z>VSTWagU42xMhI)Z(6Sa6lC)Yn-ULB#UsLRAA5Py!2cjAR7L=VrA<{yD|~`p7A;DJ
zj5)j@)uxw$nSBA-jobhfdsb}*8|<K!7xZOrK0@TCqD!I5AGuE{?o+in-<}kJ^5MpP
zN>L5}6g1DX<PSjDI{)$Mt)1U38QO>MNVq_hhuJAh0I(pX+xyA$y|3*)Vd(ZeiU6Kx
z$^rtI&hrmJ@lsm9(_1WB+z_CWongj<a+g!3T58uOlHn|u$1`QDrv%U6B4A9ina!UT
z<c70l%x<dX3z&6p^FoU#)2s#FzfquSkR2AmUgYG4i+nK%cfV}hBzPY<Z@paInb7M6
zgJBp~f?_9w%m4}>w)s2+h~gp6CKzW@+GHopDQt7hezYKFQ)IbwW7!8**ru)WOMN-3
z3bR%L7`Be_$_}?2ws?j9lnjbh-X_A-9QuAz=v)rDb)mJ8L$;nmJD+>t_=l8EA<!k5
zH;?_?hTVk&2v-w!3Zcwc_76eyyOe$h_fJ!i0<>Cg6grS0vHi@PE)&Pc8OK>460OjJ
z-=a?47j}Ts{IR6M=rBBc82qwMVAA*or4cXGLu}S~^+1+`55F=gvQ+O*>ou<DlTypZ
zFR&Wq(dS8tl>#r+Xs8gLcrh|O3?k#RjEt;>3g}$x-WW%lj(x9dE-Gog$`g{m^p=d2
zX`;ka!W_R&>4h!>NrDy~l5OdD4f^q{u!#A$#2Z_ra_xM>N?@I2p{qBPvhpKK(2JPn
z1y%-hw90jK>h%awqblB7NL0580`b6e*9{wQobM1knt-~veXjvm`RIi!T!#`)M)9E1
zKhk<#4X1)uBs0t-XPpD%KOHp!egG{G;oK5RnagLR=%Fan%IDx9-}-hv*SHTrkbQ7$
z`J}KaUrxs@ZggDJ8qhzR#(79(Ov}fe2K7MR%sECvC^XY7#Z&xyPp|8FhZ2IVOyd~~
zIE%7Q7!e8Dd2BR$UF9FK4lMH;hGK@OpoyMc+B)@WR_3U^XVD1=yFYbHmvP^&j;Zt^
zv+4Zh>0$3ABtQU&|IrNMfNWwx#;UId3=|<19xvoHu*Wwzryzl+xBD#)S~V0MD3T|k
zG|#AD_*BF+qr%feg1@KrW>>KSGPhl6V4G60wau;;>||=veQDclR0k9KCdY;~skJS1
zw$ria1BG5s>mFDBS}Gwh%)W7VrLnzZI+WQR>`@@iWLg)SeaMP4oheNv5HIZ=x0zEZ
z=`|)o7^!c(@n}SWFn>$yn+y;`Ey#_GWa#pd+z9K=bRzkx7)cE$rmd{P>AuI$44b^T
zF`)+yAMAl(VviRA_rJ(E-#wBi=X~p0d0x=!&uLWorZkz!#4-B7_4~_TKOxdr<}|_4
zE{*3Q?AqA><0j=~!A8x)LBJQ&dM#Q|XR}u9%pLNhH~(bR*mYsM03m&RGZ0thAX@B3
z=7Av7@BOiMCo95GCpgFk-xG7JZCrxx<g4sX&`s84qd*1CqOZYj!BXmE%c6JSi_O~J
zZ4ZnJeD!2|NY}L7^{)a2xF{VY{*CWUEO1Uv2EH?SdrW5%yAoJua4tZ|yt#A1Y1`3O
z-~T4g1+alT7dSk~_f#4ii<l>LWhDFJCSk4J+kSOItX!Elm3;QH<;fwy;!f-eIe!e{
zUVwHuql%{Yv8^v|>?oze)t&UoND6z)*NbMkS2}0h*}2KJn556{owM|aiH4E1evc^>
zECGTg+ZIPK3yKBKcB~jh2_^hDg93LlO!LLAnj!2_A^vk|U5aF!3&~b9@8g%%CXJxC
zCiJ@)Ek5&jUCUQP)r6w<Bs5(LdPF4A=jD=tVF)@LPa~-#ljcyceWkr0tGrt;o55CM
z1HTwH_)A(BW@=S4=n@8;7U2k%+_!bt38Qv0VKBmVFkvHM7>l9yQ4C1(4C>!ZlK5&B
z32`SFp<Gy3Z%G&+_po6X4cSj;d=CVf*0t-4tu<Ypvdz)ferxcuI^NM{uctgmH-KDz
z;n+lFm0nIS^7I_K(Z`6iPi&(>wc}g&4~ucj_tXDZf{Zp5Qsu+Tv2n+7PX%6E^*Yx{
zfxzSl0q`LZBo5rY9OKi0y8)#5SfyfIT>b*3zzNtGgY7^J1?7^v${?mIq*fje^6k=B
zgsB4MLi^ec!5Vt!B|kf2Eb#zkAdwX!?1N38Vlqw4=ZV1RCEY7>LqBM47;!9TYz|OD
zz_NJ?IzZz68$Z!yxPOt{uQP6AKLjb{dl?>nC{WYf)Tt;M#Clfu^wVDMzPHf=e%e0H
z+jqG<-G9`!eB}ozl)-TV@XlDIGy*|3&X^xD9F2vK6vyEp(zhh^rD`>5@oILDFb00{
zV+UA#=iFbMFqU|TOqWJhMb;aH=yGOuLhtEUPzk@yZ%SmE;(xjAG87=QL{q5+5~iUK
zq1Kdz)KZ_j;fN9ZD-?WsNn+R;;Rp?raw9=_^N<uW3P;VFIoZPg#!CYCpKJ<$XK8P8
z8~z`8@U2$*LzLd$Zrg7n%|$MphEQ}aqu-#lr-S`hCG=I+%z0(@!(ve2cadyRmDw#b
zK@pmX9YMZ(<v%UkFOlt4^;||FOA0gqjLnS~hemwnt*Y20ymyO|gL^5VuQ7$gWE^*E
zzNDWA5chE9c6EF61&ICQ?&j{xlt6Y+Sxq<F+b~;j=pX23OApSC@*Bl~!ILvwcX>QF
z0PVhG4_OO*hWz6%+1&c9AIR6@o)&zs*RVP3gUZ_K!;yRp@)*H@4WIdmjmqAQZU^`r
z`CsEfpeKw$eT5FnD;D*p&x`>Nfi~hSzd8u7@%eaTvQTPcH(x!4Yk&Ew!B4?&$)xU~
z;4L1)Mfg3(j!qir|DeBsaT;zXW;k{UL#PDtJO*T$pLAa#;xOog$6Z?}bes%bwpDTG
zUW2#)hfhKzs{3+UUvD(Slk5m}Z0yh?9Y)x=gp&}?gF(1c?cASH55)@t)(ub_<?fv-
z?RxySl88|L8hv_26BSf5%_=)tAumRW!0H?5w89P|lnoY`rKR+3d-WVWBJD(6M-^g*
z`pRAaNjwH5S(tP^IX%MWB<_p9*h!QN8Yw8aVsr4cJ&{z;r}c&g&p0Et>K_jYQ6KHl
z8x&EQFwzK-d61WD5-BNA7545HvGQ`7z)D$;RiMEi-8XvZ>0xnSHNmd~93&G`ga$^A
zp!gti%PkpbS(zkBkqCUDT1U|!;$ysmc6d!9f;)cRh<pbh`LUI+y%XIZe4Ha4cdiVW
zQVi@~VAEJRFw1M_hDi$O;ZQzSuzZB~!U~Hi7gqUiWvUmY^jsgD(I=ok##`45{H-en
XSWtT1U0Gb5(g}X;=x4?zZom3}JWMxC


	molecule_id	smiles	molecular_weight
0	331406	COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC	398.441986
1	140360	COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1	398.532990
2	331409	C1CCC(CC1)P(c1ccccc1)C1CCCCC1	274.388000
3	2027	CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...	497.707001
4	2036	CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...	1049.558960
...	...	...	...
330962	608	Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1	346.410004
330963	461	CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC	376.509003
330964	1064	Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...	462.666992
330965	523	CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...	490.664001
330966	1817	COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC	442.447998