Skip to content

Commit

Permalink
bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
KanduriC committed Jan 14, 2025
1 parent 4dcf407 commit fefcbbe
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ def concatenate_repertoire_components(self, file_number):
concatenated_df = pd.DataFrame()
# concatenated_df.columns = ['junction', 'junction_aa', 'v_call', 'j_call']
if self.components_type == "public_private":
n_seq = np.random.poisson(self.n_sequences)
concatenated_df = concatenated_df.head(n_seq)
# n_seq = np.random.poisson(self.n_sequences)
concatenated_df = concatenated_df.head(self.n_sequences)
concatenated_df = concatenated_df.sample(frac=1).reset_index(drop=True)
concatenated_df.to_csv(concat_fn, header=is_head, index=None, sep='\t')

Expand All @@ -98,6 +98,7 @@ def multi_concatenate_repertoire_components(self):
subject_ids = [fn.split(".")[0] for fn in file_names]
metadata_dict = {'subject_id': subject_ids, 'filename': file_names, 'label_positive': labels}
metadata_df = pd.DataFrame.from_dict(metadata_dict)
metadata_df = metadata_df.sample(frac=1).reset_index(drop=True)
metadata_df.to_csv(os.path.join(self.super_path, "metadata.csv"))
metadata_df.to_csv(os.path.join(self.concatenated_reps_path, "metadata.csv"))
else:
Expand Down
2 changes: 1 addition & 1 deletion simAIRR/pgen_count_map/PgenCountMap.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def get_absolute_number_of_repertoires(self, seq_pgen_bin: tuple):
implant_rate = self._get_implantation_rate(seq_pgen_bin)
absolute_number_of_repertoires = round(implant_rate * self.number_of_repertoires)
if absolute_number_of_repertoires < 2:
absolute_number_of_repertoires = 2
absolute_number_of_repertoires = 1
return absolute_number_of_repertoires


15 changes: 14 additions & 1 deletion simAIRR/util/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,20 @@ def concatenate_dataframes_with_replacement(dfs_list):
for idx, row in df_b.iterrows():
match_idx = df_a[(df_a['v_call'] == row['v_call']) & (df_a['j_call'] == row['j_call'])].index
if not match_idx.empty:
df_a = df_a.drop(match_idx[0])
drop_index = df_a.loc[match_idx].sample().index
df_a = df_a.drop(drop_index)
else:
match_idx_v_call = df_a[(df_a['v_call'] == row['v_call'])].index
match_idx_j_call = df_a[(df_a['j_call'] == row['j_call'])].index
if not match_idx_v_call.empty:
drop_index = df_a.loc[match_idx_v_call].sample().index
df_a = df_a.drop(drop_index)
elif not match_idx_j_call.empty:
drop_index = df_a.loc[match_idx_j_call].sample().index
df_a = df_a.drop(drop_index)
else:
drop_index = df_a.sample().index
df_a = df_a.drop(drop_index)
df = pd.concat([df_a, df_b], ignore_index=True)
return df

Expand Down
18 changes: 8 additions & 10 deletions tests/util/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,20 @@ def test_count_lines(tmp_path):

def test_concatenate_dataframes_with_replacement():
df_a = pd.DataFrame({
'junction': ['j1', 'j2', 'j3', 'j4', 'j5'],
'junction_aa': ['aa1', 'aa2', 'aa3', 'aa4', 'aa5'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-3', 'TRBV20-1', 'TRBV20-2'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-3', 'TRBJ2-1', 'TRBJ2-2']
'junction': ['j1', 'j2', 'j3', 'j4', 'j5', 'j16'],
'junction_aa': ['aa1', 'aa2', 'aa3', 'aa4', 'aa5', 'aa16'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-3', 'TRBV20-1', 'TRBV20-2', 'TRBV20-3'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-3', 'TRBJ2-3', 'TRBJ2-2', 'TRBJ2-1']
})

df_b = pd.DataFrame({
'junction': ['j6', 'j7', 'j8'],
'junction_aa': ['aa5', 'aa6', 'aa7'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-2'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-5']
'junction': ['j6', 'j7', 'j8', 'j9', 'j10'],
'junction_aa': ['aa5', 'aa6', 'aa7', 'aa8', 'aa9'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-2', 'TRBV20-5', 'TRBV20-5'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-5', 'TRBJ2-1', 'TRBJ2-5']
})

df = concatenate_dataframes_with_replacement([df_a, df_b])
assert df.shape == (6, 4)
assert df['junction'].tolist() == ['j3', 'j4', 'j5', 'j6', 'j7', 'j8']

def test_get_legal_vj_pairs(tmp_path):
user_config_dict = {'mode': 'baseline_repertoire_generation',
Expand Down
35 changes: 34 additions & 1 deletion tests/workflows/test_Workflows.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import pandas as pd
import pytest

from simAIRR.workflows.Workflows import Workflows


Expand Down Expand Up @@ -124,4 +126,35 @@ def test__parse_and_validate_user_signal(tmp_path):
user_config_dict['output_path'] = out_path
desired_workflow = Workflows(**user_config_dict)
user_signal = desired_workflow._parse_and_validate_user_signal()
print(user_signal)
print(user_signal)


@pytest.fixture
def user_config_dict():
    """Baseline simAIRR configuration for the signal_implantation workflow tests.

    Path-valued entries are left as None so individual tests can point them
    at their own inputs before instantiating a Workflows object.
    """
    return {
        'mode': 'signal_implantation',
        'olga_model': 'humanTRB',
        'output_path': None,
        'n_repertoires': 40,
        'seed': 298,
        'n_sequences': 100,
        'n_threads': 4,
        'noise_rate': 0.2,
        'phenotype_burden': 5,
        'phenotype_pool_size': 100,
        'store_intermediate_files': True,
        'allow_closer_phenotype_burden': True,
        'export_nt': False,
        'signal_pgen_count_mapping_file': None,
        'signal_sequences_file': None,
        'positive_label_rate': 0.5,
        'public_seq_proportion': 0.1,
    }

def test_debug(user_config_dict):
    """Manual end-to-end run of the signal_implantation workflow on local data.

    NOTE(review): this is committed debug scaffolding — it depends on absolute
    paths that exist only on the original author's machine. Rather than
    erroring out for everyone else, skip gracefully when the inputs are absent.
    Consider removing this test or shipping small fixture files instead.
    """
    output_path = "/Users/skanduri/Documents/Projects/airr_ml_bm/27-02-2024"
    signal_pgen_map = "/Users/skanduri/Documents/Projects/airr_ml_bm/27-02-2024/signal_seq_pgen_count_map_burden_2_more_freq.tsv"
    signal_seqs = "/Users/skanduri/Documents/Projects/airr_ml_bm/27-02-2024/HCV_specific_sequences_vdjdb_04082023_processed_deduplicated_genenames_replaced.tsv"
    public_pgen_map = "/Users/skanduri/PycharmProjects/simAIRR/simAIRR/config_validator/public_seq_pgen_count_map.tsv"
    # Skip (rather than fail) on machines lacking the author's local datasets.
    if not all(os.path.exists(p) for p in (output_path, signal_pgen_map, signal_seqs, public_pgen_map)):
        pytest.skip("local debug datasets not available on this machine")
    user_config_dict['output_path'] = os.path.join(output_path, "debug_output")
    user_config_dict['signal_pgen_count_mapping_file'] = signal_pgen_map
    user_config_dict['signal_sequences_file'] = signal_seqs
    user_config_dict['public_seq_pgen_count_mapping_file'] = public_pgen_map
    desired_workflow = Workflows(**user_config_dict)
    desired_workflow.execute()

0 comments on commit fefcbbe

Please sign in to comment.