Skip to content

Commit

Permalink
bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
KanduriC committed Jan 14, 2025
1 parent 4dcf407 commit fefcbbe
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ def concatenate_repertoire_components(self, file_number):
concatenated_df = pd.DataFrame()
# concatenated_df.columns = ['junction', 'junction_aa', 'v_call', 'j_call']
if self.components_type == "public_private":
n_seq = np.random.poisson(self.n_sequences)
concatenated_df = concatenated_df.head(n_seq)
# n_seq = np.random.poisson(self.n_sequences)
concatenated_df = concatenated_df.head(self.n_sequences)
concatenated_df = concatenated_df.sample(frac=1).reset_index(drop=True)
concatenated_df.to_csv(concat_fn, header=is_head, index=None, sep='\t')

Expand All @@ -98,6 +98,7 @@ def multi_concatenate_repertoire_components(self):
subject_ids = [fn.split(".")[0] for fn in file_names]
metadata_dict = {'subject_id': subject_ids, 'filename': file_names, 'label_positive': labels}
metadata_df = pd.DataFrame.from_dict(metadata_dict)
metadata_df = metadata_df.sample(frac=1).reset_index(drop=True)
metadata_df.to_csv(os.path.join(self.super_path, "metadata.csv"))
metadata_df.to_csv(os.path.join(self.concatenated_reps_path, "metadata.csv"))
else:
Expand Down
2 changes: 1 addition & 1 deletion simAIRR/pgen_count_map/PgenCountMap.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def get_absolute_number_of_repertoires(self, seq_pgen_bin: tuple):
implant_rate = self._get_implantation_rate(seq_pgen_bin)
absolute_number_of_repertoires = round(implant_rate * self.number_of_repertoires)
if absolute_number_of_repertoires < 2:
absolute_number_of_repertoires = 2
absolute_number_of_repertoires = 1
return absolute_number_of_repertoires


15 changes: 14 additions & 1 deletion simAIRR/util/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,20 @@ def concatenate_dataframes_with_replacement(dfs_list):
for idx, row in df_b.iterrows():
match_idx = df_a[(df_a['v_call'] == row['v_call']) & (df_a['j_call'] == row['j_call'])].index
if not match_idx.empty:
df_a = df_a.drop(match_idx[0])
drop_index = df_a.loc[match_idx].sample().index
df_a = df_a.drop(drop_index)
else:
match_idx_v_call = df_a[(df_a['v_call'] == row['v_call'])].index
match_idx_j_call = df_a[(df_a['j_call'] == row['j_call'])].index
if not match_idx_v_call.empty:
drop_index = df_a.loc[match_idx_v_call].sample().index
df_a = df_a.drop(drop_index)
elif not match_idx_j_call.empty:
drop_index = df_a.loc[match_idx_j_call].sample().index
df_a = df_a.drop(drop_index)
else:
drop_index = df_a.sample().index
df_a = df_a.drop(drop_index)
df = pd.concat([df_a, df_b], ignore_index=True)
return df

Expand Down
18 changes: 8 additions & 10 deletions tests/util/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,20 @@ def test_count_lines(tmp_path):

def test_concatenate_dataframes_with_replacement():
df_a = pd.DataFrame({
'junction': ['j1', 'j2', 'j3', 'j4', 'j5'],
'junction_aa': ['aa1', 'aa2', 'aa3', 'aa4', 'aa5'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-3', 'TRBV20-1', 'TRBV20-2'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-3', 'TRBJ2-1', 'TRBJ2-2']
'junction': ['j1', 'j2', 'j3', 'j4', 'j5', 'j16'],
'junction_aa': ['aa1', 'aa2', 'aa3', 'aa4', 'aa5', 'aa16'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-3', 'TRBV20-1', 'TRBV20-2', 'TRBV20-3'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-3', 'TRBJ2-3', 'TRBJ2-2', 'TRBJ2-1']
})

df_b = pd.DataFrame({
'junction': ['j6', 'j7', 'j8'],
'junction_aa': ['aa5', 'aa6', 'aa7'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-2'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-5']
'junction': ['j6', 'j7', 'j8', 'j9', 'j10'],
'junction_aa': ['aa5', 'aa6', 'aa7', 'aa8', 'aa9'],
'v_call': ['TRBV20-1', 'TRBV20-2', 'TRBV20-2', 'TRBV20-5', 'TRBV20-5'],
'j_call': ['TRBJ2-1', 'TRBJ2-2', 'TRBJ2-5', 'TRBJ2-1', 'TRBJ2-5']
})

df = concatenate_dataframes_with_replacement([df_a, df_b])
assert df.shape == (6, 4)
assert df['junction'].tolist() == ['j3', 'j4', 'j5', 'j6', 'j7', 'j8']

def test_get_legal_vj_pairs(tmp_path):
user_config_dict = {'mode': 'baseline_repertoire_generation',
Expand Down
35 changes: 34 additions & 1 deletion tests/workflows/test_Workflows.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import pandas as pd
import pytest

from simAIRR.workflows.Workflows import Workflows


Expand Down Expand Up @@ -124,4 +126,35 @@ def test__parse_and_validate_user_signal(tmp_path):
user_config_dict['output_path'] = out_path
desired_workflow = Workflows(**user_config_dict)
user_signal = desired_workflow._parse_and_validate_user_signal()
print(user_signal)
print(user_signal)


@pytest.fixture
def user_config_dict():
    """Baseline simAIRR configuration for the signal_implantation workflow tests.

    Path-valued entries are left as None so individual tests can point them
    at their own inputs before instantiating a Workflows object.
    """
    return {
        'mode': 'signal_implantation',
        'olga_model': 'humanTRB',
        'output_path': None,
        'n_repertoires': 40,
        'seed': 298,
        'n_sequences': 100,
        'n_threads': 4,
        'noise_rate': 0.2,
        'phenotype_burden': 5,
        'phenotype_pool_size': 100,
        'store_intermediate_files': True,
        'allow_closer_phenotype_burden': True,
        'export_nt': False,
        'signal_pgen_count_mapping_file': None,
        'signal_sequences_file': None,
        'positive_label_rate': 0.5,
        'public_seq_proportion': 0.1,
    }

def test_debug(user_config_dict):
    """Manual end-to-end run of the signal_implantation workflow on local data.

    NOTE(review): this is committed debug scaffolding — it depends on absolute
    paths that exist only on the original author's machine. Rather than
    erroring out for everyone else, skip gracefully when the inputs are absent.
    Consider removing this test or shipping small fixture files instead.
    """
    output_path = "/Users/skanduri/Documents/Projects/airr_ml_bm/27-02-2024"
    signal_pgen_map = "/Users/skanduri/Documents/Projects/airr_ml_bm/27-02-2024/signal_seq_pgen_count_map_burden_2_more_freq.tsv"
    signal_seqs = "/Users/skanduri/Documents/Projects/airr_ml_bm/27-02-2024/HCV_specific_sequences_vdjdb_04082023_processed_deduplicated_genenames_replaced.tsv"
    public_pgen_map = "/Users/skanduri/PycharmProjects/simAIRR/simAIRR/config_validator/public_seq_pgen_count_map.tsv"
    # Skip (rather than fail) on machines lacking the author's local datasets.
    if not all(os.path.exists(p) for p in (output_path, signal_pgen_map, signal_seqs, public_pgen_map)):
        pytest.skip("local debug datasets not available on this machine")
    user_config_dict['output_path'] = os.path.join(output_path, "debug_output")
    user_config_dict['signal_pgen_count_mapping_file'] = signal_pgen_map
    user_config_dict['signal_sequences_file'] = signal_seqs
    user_config_dict['public_seq_pgen_count_mapping_file'] = public_pgen_map
    desired_workflow = Workflows(**user_config_dict)
    desired_workflow.execute()

0 comments on commit fefcbbe

Please sign in to comment.