Adding DMSO scores into plots (WayScience#23)
* init

* Add new mAP score figures and update analysis utilities for improved data handling

* nb converted notebooks

* edited docs
axiomcura authored Feb 19, 2025
1 parent 0f466a2 commit 4936072
Showing 35 changed files with 2,066 additions and 1,326 deletions.
82 changes: 39 additions & 43 deletions notebooks/1.map-analysis/1.run-map.ipynb
@@ -75,6 +75,8 @@
"source": [
"# Setting the base data directory and ensure it exists (raises an error if it doesn't)\n",
"data_dir = pathlib.Path(\"../data/\").resolve(strict=True)\n",
"agg_data_dir = (data_dir / \"agg_fs_profiles\").resolve(strict=True)\n",
"fs_profiles_paths = list((data_dir / \"agg_fs_profiles\").resolve(strict=True).glob(\"*.parquet\"))\n",
"\n",
"# Setting the metadata directory for updated plate maps and ensure it exists\n",
"metadata_dir = pathlib.Path(\"../data/metadata/updated_platemaps\").resolve(strict=True)\n",
@@ -122,21 +124,23 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total amount of shared columns among all profiles:\n",
"482\n"
]
}
],
"source": [
"shared_cols = None\n",
"for aggregated_profile in list(data_dir.glob(\"*.parquet\")):\n",
" # read aggreagated profiled and column names\n",
" agg_df = pd.read_parquet(aggregated_profile)\n",
" columns = list(agg_df.columns)\n",
"# finding shared features while deleting duplicate column names\n",
"shared_cols = data_utils.find_shared_features(profile_paths=fs_profiles_paths, delete_dups=True)\n",
"\n",
" # Update the shared_columns set\n",
" if shared_cols is None:\n",
" # Initialize shared columns with the first profile's columns, preserving order\n",
" shared_cols = columns\n",
" else:\n",
" # Retain only the columns present in both the current profile and shared columns\n",
" shared_cols = [col for col in shared_cols if col in columns]\n"
"# total amount of shared columns among all profiles in batch 1\n",
"print(\"Total amount of shared columns among all profiles:\")\n",
"print(len(shared_cols))"
]
},
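The loop removed above pins down the contract of the new `data_utils.find_shared_features` helper: intersect column names across all aggregated profiles while preserving the first profile's column order. A minimal sketch under that reading, where `delete_dups=True` is assumed to mean dropping repeated column names within each profile (the repository's actual implementation may differ):

import pathlib

import pandas as pd


def find_shared_features(
    profile_paths: list[pathlib.Path], delete_dups: bool = True
) -> list[str]:
    """Return the columns shared by every profile, preserving first-seen order."""
    shared_cols = None
    for path in profile_paths:
        columns = list(pd.read_parquet(path).columns)
        if delete_dups:
            # drop repeated column names, keeping the first occurrence
            columns = list(dict.fromkeys(columns))
        if shared_cols is None:
            # initialize with the first profile's columns, preserving order
            shared_cols = columns
        else:
            # retain only the columns present in both lists
            shared_cols = [col for col in shared_cols if col in columns]
    return shared_cols or []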
{
@@ -152,67 +156,66 @@
"metadata": {},
"outputs": [],
"source": [
"# Suffix for aggregated profiles\n",
"# suffix for aggregated profiles\n",
"aggregated_file_suffix = \"aggregated_post_fs.parquet\"\n",
"\n",
"# Dictionary to store loaded plate data grouped by batch\n",
"# dictionary to store loaded plate data grouped by batch\n",
"loaded_plate_batches = {}\n",
"loaded_shuffled_plate_batches = {}\n",
"\n",
"# Iterate over unique platemap files and their associated plates\n",
"# iterate over unique platemap files and their associated plates\n",
"for batch_index, (platemap_filename, associated_plates_df) in enumerate(\n",
" barcode.groupby(\"platemap_file\")\n",
"):\n",
" # Generate a unique batch ID\n",
" # generate a unique batch ID\n",
" batch_id = f\"batch_{batch_index + 1}\"\n",
" shuffled_batch_id = f\"shuffled_batch_{batch_index + 1}\"\n",
"\n",
" # Load the platemap CSV file\n",
" # load the platemap CSV file\n",
" platemap_path = (metadata_dir / f\"{platemap_filename}.csv\").resolve(strict=True)\n",
" platemap_data = pd.read_csv(platemap_path)\n",
"\n",
" # Extract all plate names associated with the current platemap\n",
" # extract all plate names associated with the current platemap\n",
" plate_barcodes = associated_plates_df[\"plate_barcode\"].tolist()\n",
"\n",
" # List to store all loaded and processed aggregated plates for the current batch\n",
" # list to store all loaded and processed aggregated plates for the current batch\n",
" loaded_aggregated_plates = []\n",
" loaded_shuffled_aggregated_plates = []\n",
"\n",
" for plate_barcode in plate_barcodes:\n",
" # Resolve the file path for the aggregated plate data\n",
" # resolve the file path for the aggregated plate data\n",
" plate_file_path = (\n",
" data_dir / f\"{plate_barcode}_{aggregated_file_suffix}\"\n",
" agg_data_dir / f\"{plate_barcode}_{aggregated_file_suffix}\"\n",
" ).resolve(strict=True)\n",
"\n",
" # Load the aggregated profile data for the current plate\n",
" # load the aggregated profile data for the current plate\n",
" aggregated_data = load_profiles(plate_file_path)\n",
"\n",
" # Update loaded data frame with only shared features\n",
" # update loaded data frame with only shared features\n",
" aggregated_data = aggregated_data[shared_cols]\n",
"\n",
" # Add a new column indicating the source plate for each row\n",
" # add a new column indicating the source plate for each row\n",
" aggregated_data.insert(0,\"Metadata_plate_barcode\" , plate_barcode)\n",
"\n",
" # Append the processed aggregated data for this plate to the batch list\n",
" # append the processed aggregated data for this plate to the batch list\n",
" loaded_aggregated_plates.append(aggregated_data)\n",
"\n",
" # adding shuffled aggregated profiles\n",
" shuffled_aggregated_data = data_utils.shuffle_features(aggregated_data)\n",
"\n",
" # Append the processed and shuffled aggregated data for this plate to the batch list\n",
" # append the processed and shuffled aggregated data for this plate to the batch list\n",
" loaded_shuffled_aggregated_plates.append(shuffled_aggregated_data)\n",
"\n",
" # Combine all processed plates for the current batch into a single DataFrame\n",
" # combine all processed plates for the current batch into a single DataFrame\n",
" combined_aggregated_data = pd.concat(loaded_aggregated_plates)\n",
" meta_concat, feats_concat = data_utils.split_meta_and_features(combined_aggregated_data)\n",
"\n",
" # Combine all shuffled and processed plates for the current batch into a single DataFrame\n",
" # combine all shuffled and processed plates for the current batch into a single DataFrame\n",
" shuffled_combined_aggregated_data = pd.concat(loaded_shuffled_aggregated_plates)\n",
" meta_concat, feats_concat = data_utils.split_meta_and_features(shuffled_combined_aggregated_data)\n",
"\n",
" # Store the combined DataFrame in the loaded_plate_batches dictionary\n",
" # store the combined DataFrame in the loaded_plate_batches dictionary\n",
" loaded_plate_batches[batch_id] = combined_aggregated_data\n",
" loaded_shuffled_plate_batches[shuffled_batch_id] = shuffled_combined_aggregated_data"
" loaded_shuffled_plate_batches[batch_id] = shuffled_combined_aggregated_data"
]
},
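`data_utils.shuffle_features` is not shown in this diff. For the shuffled batches to serve as a null baseline, each feature column plausibly gets permuted independently, which breaks well-to-measurement associations while preserving each feature's marginal distribution; the metadata columns must stay intact so treatments can still be grouped. A hedged sketch under those assumptions (the `Metadata_` prefix convention and the seed parameter are guesses):

import numpy as np
import pandas as pd


def shuffle_features(profile: pd.DataFrame, seed: int = 0) -> pd.DataFrame:
    """Independently permute every non-metadata column of a profile."""
    rng = np.random.default_rng(seed)
    shuffled = profile.copy()
    feature_cols = [col for col in shuffled.columns if not col.startswith("Metadata_")]
    for col in feature_cols:
        # a separate permutation per column destroys row-wise feature structure
        shuffled[col] = rng.permutation(shuffled[col].to_numpy())
    return shuffled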
{
@@ -234,13 +237,6 @@
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"name": "stderr",
"output_type": "stream",
@@ -289,7 +285,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -301,20 +297,20 @@
}
],
"source": [
"# here we execute map pipeline with with the original \n",
"# Here we execute mAP pipeline with with the original \n",
"analysis_utils.calculate_trt_map_batch_profiles(\n",
" batched_profiles=loaded_plate_batches,\n",
" configs=configs,\n",
" outdir_path=results_dir,\n",
" shuffled=False\n",
")\n",
"\n",
"# here we execute map pipeline with with the shuffled dataset \n",
"# Here we execute mAP pipeline with with the shuffled dataset \n",
"analysis_utils.calculate_trt_map_batch_profiles(\n",
" batched_profiles=loaded_shuffled_plate_batches,\n",
" configs=configs,\n",
" outdir_path=results_dir,\n",
" shuffled=False\n",
" shuffled=True\n",
")"
]
}
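`analysis_utils.calculate_trt_map_batch_profiles` wraps the mAP computation itself, which this diff does not show. Conceptually, each treatment well serves as a query, the remaining wells are ranked by profile similarity, and the average precision of same-treatment wells in that ranking is recorded; mAP is the mean of those scores. A self-contained sketch of that score (illustrative only, not the repository's implementation):

import numpy as np


def mean_average_precision(feats: np.ndarray, labels: np.ndarray) -> float:
    """mAP over wells: rank other wells by cosine similarity, score same-label hits."""
    normed = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    sims = normed @ normed.T  # cosine similarity between all profile pairs
    ap_scores = []
    for i in range(len(labels)):
        order = np.argsort(-np.delete(sims[i], i))        # rank the other wells
        rel = (np.delete(labels, i) == labels[i])[order]  # same-treatment mask
        if not rel.any():
            continue  # no replicates to retrieve for this well
        hits = np.cumsum(rel)
        ranks = np.flatnonzero(rel) + 1  # 1-indexed ranks of the hits
        ap_scores.append(float(np.mean(hits[rel] / ranks)))
    return float(np.mean(ap_scores)) if ap_scores else float("nan")

Against this backdrop, the `shuffled=False` to `shuffled=True` fix in the second call matters: the shuffled batches presumably produce the null scores that the real mAP values are compared against, so mislabeling them would write the baseline into the wrong outputs.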