Adding DMSO scores into plots (WayScience#23)
* init

* Add new mAP score figures and update analysis utilities for improved data handling

* nb converted notebooks

* edited docs
axiomcura authored Feb 19, 2025
1 parent 0f466a2 commit 4936072
Showing 35 changed files with 2,066 additions and 1,326 deletions.
82 changes: 39 additions & 43 deletions notebooks/1.map-analysis/1.run-map.ipynb
@@ -75,6 +75,8 @@
"source": [
"# Setting the base data directory and ensure it exists (raises an error if it doesn't)\n",
"data_dir = pathlib.Path(\"../data/\").resolve(strict=True)\n",
"agg_data_dir = (data_dir / \"agg_fs_profiles\").resolve(strict=True)\n",
"fs_profiles_paths = list((data_dir / \"agg_fs_profiles\").resolve(strict=True).glob(\"*.parquet\"))\n",
"\n",
"# Setting the metadata directory for updated plate maps and ensure it exists\n",
"metadata_dir = pathlib.Path(\"../data/metadata/updated_platemaps\").resolve(strict=True)\n",
@@ -122,21 +124,23 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total amount of shared columns among all profiles:\n",
"482\n"
]
}
],
"source": [
"shared_cols = None\n",
"for aggregated_profile in list(data_dir.glob(\"*.parquet\")):\n",
" # read aggreagated profiled and column names\n",
" agg_df = pd.read_parquet(aggregated_profile)\n",
" columns = list(agg_df.columns)\n",
"# finding shared features while deleting duplicate column names\n",
"shared_cols = data_utils.find_shared_features(profile_paths=fs_profiles_paths, delete_dups=True)\n",
"\n",
" # Update the shared_columns set\n",
" if shared_cols is None:\n",
" # Initialize shared columns with the first profile's columns, preserving order\n",
" shared_cols = columns\n",
" else:\n",
" # Retain only the columns present in both the current profile and shared columns\n",
" shared_cols = [col for col in shared_cols if col in columns]\n"
"# total amount of shared columns among all profiles in batch 1\n",
"print(\"Total amount of shared columns among all profiles:\")\n",
"print(len(shared_cols))"
]
},
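The loop removed above pins down the contract of the new `data_utils.find_shared_features` helper: intersect column names across all aggregated profiles while preserving the first profile's column order. A minimal sketch under that reading, where `delete_dups=True` is assumed to mean dropping repeated column names within each profile (the repository's actual implementation may differ):

import pathlib

import pandas as pd


def find_shared_features(
    profile_paths: list[pathlib.Path], delete_dups: bool = True
) -> list[str]:
    """Return the columns shared by every profile, preserving first-seen order."""
    shared_cols = None
    for path in profile_paths:
        columns = list(pd.read_parquet(path).columns)
        if delete_dups:
            # drop repeated column names, keeping the first occurrence
            columns = list(dict.fromkeys(columns))
        if shared_cols is None:
            # initialize with the first profile's columns, preserving order
            shared_cols = columns
        else:
            # retain only the columns present in both lists
            shared_cols = [col for col in shared_cols if col in columns]
    return shared_cols or []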
{
@@ -152,67 +156,66 @@
"metadata": {},
"outputs": [],
"source": [
"# Suffix for aggregated profiles\n",
"# suffix for aggregated profiles\n",
"aggregated_file_suffix = \"aggregated_post_fs.parquet\"\n",
"\n",
"# Dictionary to store loaded plate data grouped by batch\n",
"# dictionary to store loaded plate data grouped by batch\n",
"loaded_plate_batches = {}\n",
"loaded_shuffled_plate_batches = {}\n",
"\n",
"# Iterate over unique platemap files and their associated plates\n",
"# iterate over unique platemap files and their associated plates\n",
"for batch_index, (platemap_filename, associated_plates_df) in enumerate(\n",
" barcode.groupby(\"platemap_file\")\n",
"):\n",
" # Generate a unique batch ID\n",
" # generate a unique batch ID\n",
" batch_id = f\"batch_{batch_index + 1}\"\n",
" shuffled_batch_id = f\"shuffled_batch_{batch_index + 1}\"\n",
"\n",
" # Load the platemap CSV file\n",
" # load the platemap CSV file\n",
" platemap_path = (metadata_dir / f\"{platemap_filename}.csv\").resolve(strict=True)\n",
" platemap_data = pd.read_csv(platemap_path)\n",
"\n",
" # Extract all plate names associated with the current platemap\n",
" # extract all plate names associated with the current platemap\n",
" plate_barcodes = associated_plates_df[\"plate_barcode\"].tolist()\n",
"\n",
" # List to store all loaded and processed aggregated plates for the current batch\n",
" # list to store all loaded and processed aggregated plates for the current batch\n",
" loaded_aggregated_plates = []\n",
" loaded_shuffled_aggregated_plates = []\n",
"\n",
" for plate_barcode in plate_barcodes:\n",
" # Resolve the file path for the aggregated plate data\n",
" # resolve the file path for the aggregated plate data\n",
" plate_file_path = (\n",
" data_dir / f\"{plate_barcode}_{aggregated_file_suffix}\"\n",
" agg_data_dir / f\"{plate_barcode}_{aggregated_file_suffix}\"\n",
" ).resolve(strict=True)\n",
"\n",
" # Load the aggregated profile data for the current plate\n",
" # load the aggregated profile data for the current plate\n",
" aggregated_data = load_profiles(plate_file_path)\n",
"\n",
" # Update loaded data frame with only shared features\n",
" # update loaded data frame with only shared features\n",
" aggregated_data = aggregated_data[shared_cols]\n",
"\n",
" # Add a new column indicating the source plate for each row\n",
" # add a new column indicating the source plate for each row\n",
" aggregated_data.insert(0,\"Metadata_plate_barcode\" , plate_barcode)\n",
"\n",
" # Append the processed aggregated data for this plate to the batch list\n",
" # append the processed aggregated data for this plate to the batch list\n",
" loaded_aggregated_plates.append(aggregated_data)\n",
"\n",
" # adding shuffled aggregated profiles\n",
" shuffled_aggregated_data = data_utils.shuffle_features(aggregated_data)\n",
"\n",
" # Append the processed and shuffled aggregated data for this plate to the batch list\n",
" # append the processed and shuffled aggregated data for this plate to the batch list\n",
" loaded_shuffled_aggregated_plates.append(shuffled_aggregated_data)\n",
"\n",
" # Combine all processed plates for the current batch into a single DataFrame\n",
" # combine all processed plates for the current batch into a single DataFrame\n",
" combined_aggregated_data = pd.concat(loaded_aggregated_plates)\n",
" meta_concat, feats_concat = data_utils.split_meta_and_features(combined_aggregated_data)\n",
"\n",
" # Combine all shuffled and processed plates for the current batch into a single DataFrame\n",
" # combine all shuffled and processed plates for the current batch into a single DataFrame\n",
" shuffled_combined_aggregated_data = pd.concat(loaded_shuffled_aggregated_plates)\n",
" meta_concat, feats_concat = data_utils.split_meta_and_features(shuffled_combined_aggregated_data)\n",
"\n",
" # Store the combined DataFrame in the loaded_plate_batches dictionary\n",
" # store the combined DataFrame in the loaded_plate_batches dictionary\n",
" loaded_plate_batches[batch_id] = combined_aggregated_data\n",
" loaded_shuffled_plate_batches[shuffled_batch_id] = shuffled_combined_aggregated_data"
" loaded_shuffled_plate_batches[batch_id] = shuffled_combined_aggregated_data"
]
},
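`data_utils.shuffle_features` is not shown in this diff. For the shuffled batches to serve as a null baseline, each feature column plausibly gets permuted independently, which breaks well-to-measurement associations while preserving each feature's marginal distribution; the metadata columns must stay intact so treatments can still be grouped. A hedged sketch under those assumptions (the `Metadata_` prefix convention and the seed parameter are guesses):

import numpy as np
import pandas as pd


def shuffle_features(profile: pd.DataFrame, seed: int = 0) -> pd.DataFrame:
    """Independently permute every non-metadata column of a profile."""
    rng = np.random.default_rng(seed)
    shuffled = profile.copy()
    feature_cols = [col for col in shuffled.columns if not col.startswith("Metadata_")]
    for col in feature_cols:
        # a separate permutation per column destroys row-wise feature structure
        shuffled[col] = rng.permutation(shuffled[col].to_numpy())
    return shuffled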
{
@@ -234,13 +237,6 @@
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"name": "stderr",
"output_type": "stream",
@@ -289,7 +285,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -301,20 +297,20 @@
}
],
"source": [
"# here we execute map pipeline with with the original \n",
"# Here we execute mAP pipeline with with the original \n",
"analysis_utils.calculate_trt_map_batch_profiles(\n",
" batched_profiles=loaded_plate_batches,\n",
" configs=configs,\n",
" outdir_path=results_dir,\n",
" shuffled=False\n",
")\n",
"\n",
"# here we execute map pipeline with with the shuffled dataset \n",
"# Here we execute mAP pipeline with with the shuffled dataset \n",
"analysis_utils.calculate_trt_map_batch_profiles(\n",
" batched_profiles=loaded_shuffled_plate_batches,\n",
" configs=configs,\n",
" outdir_path=results_dir,\n",
" shuffled=False\n",
" shuffled=True\n",
")"
]
}
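`analysis_utils.calculate_trt_map_batch_profiles` wraps the mAP computation itself, which this diff does not show. Conceptually, each treatment well serves as a query, the remaining wells are ranked by profile similarity, and the average precision of same-treatment wells in that ranking is recorded; mAP is the mean of those scores. A self-contained sketch of that score (illustrative only, not the repository's implementation):

import numpy as np


def mean_average_precision(feats: np.ndarray, labels: np.ndarray) -> float:
    """mAP over wells: rank other wells by cosine similarity, score same-label hits."""
    normed = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    sims = normed @ normed.T  # cosine similarity between all profile pairs
    ap_scores = []
    for i in range(len(labels)):
        order = np.argsort(-np.delete(sims[i], i))        # rank the other wells
        rel = (np.delete(labels, i) == labels[i])[order]  # same-treatment mask
        if not rel.any():
            continue  # no replicates to retrieve for this well
        hits = np.cumsum(rel)
        ranks = np.flatnonzero(rel) + 1  # 1-indexed ranks of the hits
        ap_scores.append(float(np.mean(hits[rel] / ranks)))
    return float(np.mean(ap_scores)) if ap_scores else float("nan")

Against this backdrop, the `shuffled=False` to `shuffled=True` fix in the second call matters: the shuffled batches presumably produce the null scores that the real mAP values are compared against, so mislabeling them would write the baseline into the wrong outputs.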