Showing 4 changed files with 74 additions and 58 deletions.
import os

import pyarrow as pa
import pyarrow.parquet as pq


def get_files_paths(directory, extension: str = "parquet"):
    """
    Recursively collect the paths of all files in a directory that end
    with the given extension.

    :param directory: str, directory path to walk.
    :param extension: str, file extension to match.
    :return: list, list of matching file paths.
    """
    files_paths = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(extension):
                files_paths.append(os.path.join(root, file))
    return files_paths
def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000):
    """
    Concatenate multiple parquet files in an incremental fashion to avoid
    memory overload.

    :param files_paths: list of parquet file paths.
    :param output_path: path to the output parquet file.
    :param batch_size: number of rows to read from each file at a time.
    """
    writer = None

    for file_path in files_paths:
        # Skip the output file itself, in case a previous run left it
        # inside the scanned directory.
        if os.path.abspath(file_path) == os.path.abspath(output_path):
            continue

        print(f"Processing file: {file_path}")
        parquet_file = pq.ParquetFile(file_path)

        # Read the file in batches to avoid memory overload
        for batch in parquet_file.iter_batches(batch_size=batch_size):
            # Convert the batch to a PyArrow Table
            table = pa.Table.from_batches([batch])

            # If the writer is not initialized, create a new Parquet writer
            # locked to the schema of the first batch; later files must match it
            if writer is None:
                writer = pq.ParquetWriter(output_path, table.schema, compression="gzip")

            # Write the batch to the output Parquet file
            writer.write_table(table)

    # Close the writer after all batches are written
    if writer is not None:
        writer.close()
        print(f"Concatenated parquet file written to {output_path}")
# Get all parquet file paths under the current directory
files_paths = get_files_paths(directory="./", extension="parquet")

# Output path for the final concatenated parquet file
output_path = "GSE156793.parquet"

# Concatenate the parquet files and write them to a single file incrementally
concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000)
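
# A minimal post-run sanity check -- a sketch, assuming files_paths and
# output_path are still in scope from the script above. The row count
# recorded in the output file's metadata should equal the sum of the input
# files' row counts; parquet metadata is read without loading any row data,
# so this check stays memory-friendly.
input_rows = sum(
    pq.ParquetFile(path).metadata.num_rows
    for path in files_paths
    if os.path.abspath(path) != os.path.abspath(output_path)
)
output_rows = pq.ParquetFile(output_path).metadata.num_rows
assert input_rows == output_rows, f"Row count mismatch: {input_rows} != {output_rows}"
print(f"Verified {output_rows} rows in {output_path}")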