Skip to content

Commit

Permalink
🔧 Fixes issue #51 (index of partition elem)
Browse files Browse the repository at this point in the history
- Modify `create_variants` in `_metafile.py` to use `create_monoincreasing_index`
- Implement `create_monoincreasing_index` to ensure a continuous index
  • Loading branch information
horta committed Oct 28, 2024
1 parent bd6eebb commit 9c54211
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion bgen_reader/_metafile.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def create_variants(filepath: Path, nvariants: int, npartitions: int, part_size:
("vaddr", int),
]
df = dd.from_delayed(dfs, meta=dd.utils.make_meta(meta), divisions=divisions)
return df
return create_monoincreasing_index(df)


cache = LRUCache(maxsize=3)
Expand All @@ -145,3 +145,14 @@ def _read_partition(filepath: Path, partition: int) -> DataFrame:
}
df = DataFrame(data)
return df[["id", "rsid", "chrom", "pos", "nalleles", "allele_ids", "vaddr"]]


# Source: https://stackoverflow.com/a/66320758
def create_monoincreasing_index(df):
    """Shift each partition's index by the row count of the partitions before it.

    The length of every partition is computed up front (this calls
    ``.compute()`` on the per-partition lengths), turned into running
    totals, and the i-th partition's index is offset by the total number
    of rows held by partitions ``0..i-1``. The shifted partitions are
    then concatenated with ``dd.concat`` and returned.

    Rows are assumed to be already ordered across partitions.
    NOTE(review): the result is only a continuous index if every
    partition's own index restarts from the same base (presumably a
    default 0-based index) -- confirm against the caller.
    """
    partition_sizes = df.map_partitions(len).compute()

    # offsets[i] is the number of rows that precede partition i; the
    # trailing grand total is never paired with a partition.
    offsets = [0]
    offsets.extend(partition_sizes.cumsum())

    shifted = []
    for offset, part in zip(offsets, df.partitions):
        part.index = part.index + offset
        shifted.append(part)
    return dd.concat(shifted)

0 comments on commit 9c54211

Please sign in to comment.