Commit: small changes
ypriverol committed Sep 26, 2024
1 parent 3909487 commit 07cb771
Showing 3 changed files with 46 additions and 99 deletions.
62 changes: 0 additions & 62 deletions examples/loom2parquetmerge.py

This file was deleted.

58 changes: 21 additions & 37 deletions fslite/fs/fdataframe.py
@@ -38,14 +38,12 @@ class FSDataFrame:
     """

     def __init__(
-            self,
-            df: pd.DataFrame,
-            sample_col: Optional[str] = None,
-            label_col: Optional[str] = None,
-            sparse_threshold: float = 0.7,  # Threshold for sparsity
-            memory_threshold: Optional[
-                float
-            ] = 0.75,  # Proportion of system memory to use for dense arrays
+        self,
+        df: pd.DataFrame,
+        sample_col: Optional[str] = None,
+        label_col: Optional[str] = None,
+        sparse_threshold: float = 0.7,  # Threshold for sparsity
+        memory_threshold: Optional[float] = 0.75,  # Proportion of system memory to use for dense arrays
     ):
"""
Create an instance of FSDataFrame.
Expand All @@ -61,21 +59,15 @@ def __init__(
in the feature matrix exceeds this value, the matrix is stored in a sparse format unless memory allows.
:param memory_threshold: Proportion of system memory available to use before deciding on sparse/dense.
"""
# TODO: We are loading full data into memory, look for other options. Maybe Dask?
self.__df = df.copy()

# Check for necessary columns
columns_to_drop = []
# Copy the DataFrame for internal usage
self.__df = df

# Handle sample column
if sample_col:
if sample_col not in df.columns:
raise ValueError(
f"Sample column '{sample_col}' not found in DataFrame."
)
raise ValueError(f"Sample column '{sample_col}' not found in DataFrame.")
self.__sample_col = sample_col
self.__samples = df[sample_col].tolist()
columns_to_drop.append(sample_col)
else:
self.__sample_col = None
self.__samples = []
Expand All @@ -90,55 +82,47 @@ def __init__(
@@ -90,55 +82,47 @@ def __init__(
             self.__label_col = label_col
             self.__labels = df[label_col].tolist()

-            # Encode labels
-            # TODO: Check if labels are categorical or continuous? For now, assume categorical
+            # Encode labels (assume categorical for now)
             label_encoder = LabelEncoder()
             self.__labels_matrix = label_encoder.fit_transform(df[label_col]).tolist()
-            columns_to_drop.append(label_col)

-        # Drop both sample and label columns in one step
-        self.__df = self.__df.drop(columns=columns_to_drop)
+        # Select only numerical columns, excluding sample_col and label_col
+        feature_columns = df.select_dtypes(include=[np.number]).columns.tolist()
+        self.__original_features = [col for col in feature_columns if col not in [sample_col, label_col]]

-        # Extract features
-        self.__original_features = self.__df.columns.tolist()
+        # Select only the feature columns directly (no drop)
+        numerical_df = df[self.__original_features]

-        # Ensure only numerical features are retained
-        numerical_df = self.__df.select_dtypes(include=[np.number])
         if numerical_df.empty:
             raise ValueError("No numerical features found in the DataFrame.")

-        # Check sparsity
+        # Calculate sparsity
         num_elements = numerical_df.size
-        num_zeros = np.count_nonzero(numerical_df == 0)
+        num_zeros = (numerical_df == 0).sum().sum()
         sparsity = num_zeros / num_elements

         # Estimate memory usage
         dense_matrix_size = numerical_df.memory_usage(deep=True).sum()  # In bytes
         available_memory = psutil.virtual_memory().available  # In bytes

         # Handle sparse or dense matrix based on sparsity and available memory
         if sparsity > sparse_threshold:
             if dense_matrix_size < memory_threshold * available_memory:
                 # Use dense matrix if enough memory is available
                 logging.info(
                     f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. "
                     f"Using a dense matrix."
                 )
                 self.__matrix = numerical_df.to_numpy(dtype=np.float32)
                 self.__is_sparse = False
             else:
                 # Use sparse matrix due to memory constraints
                 logging.info(
                     f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. "
                     f"Using a sparse matrix representation."
                 )
-                self.__matrix = sparse.csr_matrix(
-                    numerical_df.to_numpy(dtype=np.float32)
-                )
+                self.__matrix = sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32))
                 self.__is_sparse = True
         else:
             # Use dense matrix since it's not sparse
-            logging.info(
-                f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix."
-            )
+            logging.info(f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix.")
             self.__matrix = numerical_df.to_numpy(dtype=np.float32)
             self.__is_sparse = False
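Note: the reworked __init__ boils down to a single sparsity/memory heuristic. Below is a minimal standalone sketch of that heuristic, assuming pandas, numpy, scipy, and psutil are installed; the helper name choose_matrix and the demo frame are illustrative, not part of fslite.

import numpy as np
import pandas as pd
import psutil
from scipy import sparse


def choose_matrix(numerical_df: pd.DataFrame,
                  sparse_threshold: float = 0.7,
                  memory_threshold: float = 0.75):
    # Fraction of zero-valued entries in the numeric feature matrix
    sparsity = (numerical_df == 0).sum().sum() / numerical_df.size
    dense_matrix_size = numerical_df.memory_usage(deep=True).sum()  # bytes
    available_memory = psutil.virtual_memory().available  # bytes
    if sparsity > sparse_threshold and dense_matrix_size >= memory_threshold * available_memory:
        # Sparse data that would not fit comfortably in RAM: store as CSR
        return sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32)), True
    # Either mostly dense, or sparse but small enough to keep dense
    return numerical_df.to_numpy(dtype=np.float32), False


# Example: a 1,000 x 100 all-zero frame is fully sparse but tiny, so it stays dense
demo = pd.DataFrame(np.zeros((1000, 100), dtype=np.float32))
matrix, is_sparse = choose_matrix(demo)
print(type(matrix), is_sparse)

Keeping the matrix dense whenever it fits within the memory budget avoids the overhead of CSR indexing during downstream feature selection.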
25 changes: 25 additions & 0 deletions fslite/tests/test_univariate_methods.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import psutil

 from fslite.fs.fdataframe import FSDataFrame
 from fslite.fs.univariate import FSUnivariate
@@ -29,6 +30,30 @@ def test_univariate_filter_corr():
     df_filtered = fsdf_filtered.to_pandas()
     df_filtered.to_csv("filtered_tnbc_data.csv", index=False)

+
+def test_univariate_filter_big_corr():
+    # load the GSE156793 single-cell dataset from parquet as a pandas DataFrame
+    df = pd.read_parquet(path="../../examples/GSE156793.parquet")
+    df.drop(columns=["development_day", "assay_id"], inplace=True)
+    print(df.shape[1])
+
+    dense_matrix_size = df.memory_usage(deep=True).sum() / 1e6  # In megabytes
+    available_memory = psutil.virtual_memory().available / 1e6  # In megabytes
+    print(f"Dense matrix size: {dense_matrix_size:.1f} MB; available memory: {available_memory:.1f} MB")
+
+    # create FSDataFrame instance
+    fs_df = FSDataFrame(df=df, sample_col="sample_id", label_col="cell_cluster_id")
+
+    # create FSUnivariate instance
+    fs_univariate = FSUnivariate(fs_method="u_corr", selection_threshold=0.3)
+
+    fsdf_filtered = fs_univariate.select_features(fs_df)
+
+    assert fs_df.count_features() == 500
+    assert fsdf_filtered.count_features() == 211
+
+    # Export the filtered DataFrame as Pandas DataFrame
+    df_filtered = fsdf_filtered.to_pandas()
+    df_filtered.to_csv("single_cell_output.csv", index=False)
+
+
 # test the univariate_filter method with 'anova' method
 def test_univariate_filter_anova():
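Note: test_univariate_filter_big_corr reads ../../examples/GSE156793.parquet via a relative path, so it fails on machines without that file. A sketch of a guard, assuming pytest is the test runner (the constant name is hypothetical and the test body is elided):

import os

import pytest

# Path assumed from the test above; adjust to the local checkout layout.
GSE156793_PARQUET = "../../examples/GSE156793.parquet"


@pytest.mark.skipif(
    not os.path.exists(GSE156793_PARQUET),
    reason="GSE156793.parquet not available locally",
)
def test_univariate_filter_big_corr():
    ...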
