loom2parquet examples
ypriverol committed Sep 24, 2024
1 parent 5cbd7da commit cc493f6
Showing 4 changed files with 74 additions and 58 deletions.
53 changes: 0 additions & 53 deletions examples/2.concat_parquet_files.py

This file was deleted.

@@ -7,7 +7,7 @@
import loompy

# define the path to the loom file
loom_file = "/path/to/loom/GSE156793_S3_gene_count.loom"
loom_file = "GSE156793_S3_gene_count.loom"

# connect to the loom file
ds = loompy.connect(loom_file)
@@ -31,7 +31,7 @@
assay = ds.ca["Assay"]
development_day = ds.ca["Development_day"]

-# make a dataframe with the sample metadata
+# make a dataframe with the sample metadata and define the column types
sample_df = pd.DataFrame({"sample_id": sample_id,
"cell_cluster": cell_cluster,
"assay": assay,
@@ -57,7 +57,7 @@
# Save the sample metadata to parquet
(sample_df
.reset_index()
.to_parquet("sample_metadata.parquet.gz",
.to_parquet("sample_metadata.parquet",
index=False,
engine="auto",
compression="gzip")
@@ -67,6 +67,8 @@
# transpose dataset and convert to parquet.
# process the data in chunks.
chunk_size = 2000
+number_chunks = 1000  # number of chunks to process; if None, all chunks are processed
+count = 0
for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size):
# retrieve the chunk
matrix_chunk = view[:, :]
@@ -99,9 +101,14 @@
df_chunk = df_chunk.rename(columns={"index": "sample_id"})

# save the chunk to parquet
df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet.gz",
df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet",
index=False,
engine="fastparquet",
engine="pyarrow",
compression="gzip")

print(f"Chunk {ix} saved")
+count = count + 1
+
+# break the loop if the number of chunks is reached
+if number_chunks is not None and count >= number_chunks:
+    break
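
A minimal read-back check for the chunk files, assuming pandas is available. The file name below is hypothetical: the ix in each name is the chunk's starting column index yielded by ds.scan (0, 2000, 4000, ... for chunk_size = 2000).

import pandas as pd

# Read one chunk back; parquet's gzip compression is transparent to the reader.
# "gene_count_chunk_0.parquet" is an assumed name for the first chunk (ix = 0).
chunk = pd.read_parquet("gene_count_chunk_0.parquet", engine="pyarrow")

print(chunk.shape)                # rows = cells in this chunk, columns = genes + sample_id
print(chunk["sample_id"].head())  # sample identifiers carried over from the loom metadata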
62 changes: 62 additions & 0 deletions examples/loom2parquetmerge.py
@@ -0,0 +1,62 @@
import os
import pyarrow.parquet as pq
import pyarrow as pa


# get all absolute paths of files in a directory
def get_files_paths(directory, extension: str = "parquet"):
    """
    Get all file paths in a directory.
    :param directory: str, directory path.
    :param extension: str, file extension.
    :return: list, list of file paths.
    """
    files_paths = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(extension):
                files_paths.append(os.path.join(root, file))
    return files_paths


def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000):
    """
    Concatenate multiple parquet files in an incremental fashion to avoid memory overload.
    :param files_paths: list of parquet file paths.
    :param output_path: path to the output parquet file.
    :param batch_size: number of rows to read from each file at a time.
    """
    writer = None

    for file_path in files_paths:
        print(f"Processing file: {file_path}")
        parquet_file = pq.ParquetFile(file_path)

        # Read the file in batches to avoid memory overload
        for batch in parquet_file.iter_batches(batch_size=batch_size):
            # Convert the batch to a PyArrow Table
            table = pa.Table.from_batches([batch])

            # If the writer is not initialized, create a new Parquet writer
            if writer is None:
                writer = pq.ParquetWriter(output_path, table.schema, compression='gzip')

            # Write the batch to the output Parquet file
            writer.write_table(table)

    # Close the writer after all batches are written
    if writer is not None:
        writer.close()
        print(f"Concatenated parquet file written to {output_path}")


# Get all file paths
files_paths = get_files_paths(directory="./", extension="parquet")

# Output path for the final concatenated parquet file
output_path = "GSE156793.parquet"

# Concatenate the parquet files and write to a single file incrementally
concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000)
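
As a quick sanity check on the merged output: the parquet footer already carries the schema and total row count, so the file can be inspected without loading any rows. A minimal sketch using the same pyarrow API as above:

import pyarrow.parquet as pq

# Only footer metadata is read here; no row data is loaded into memory.
merged = pq.ParquetFile("GSE156793.parquet")
print(merged.metadata.num_rows)  # total rows across all concatenated chunks
print(merged.schema_arrow)       # column names and types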
Empty file removed: examples/sc_human_atlas.py
