Merge pull request #835 from uclahs-cds/czhu-fix-parse-circexplorer

Fix circRNA variant ID exon/intron order
uclahs-cds · Jan 16, 2024 · aea4b9b · aea4b9b
2 parents ebe509c + 3c84824
commit aea4b9b
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 7 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -26,7 +26,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install biopython pathos pytest psutil six matplotlib regex
+          pip install biopython==1.82 pathos pytest psutil six matplotlib regex
 
       - name: Run Unit Tests
         run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,7 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
-## [1.2.2] - 2023-12-22
+## [1.2.2] - 2024-01-16
 
 ### Fixed:
 
@@ -20,6 +20,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 - Updated `splitFasta` and `summarizeFasta` to accept source combinations in `--order-source`.
 
+- Fixed parseCIRCexplorer so the exon/intron indices in variant IDs are sorted correctly.
+
 ## [1.2.1] - 2023-10-05
 
 ### Add

diff --git a/moPepGen/parser/CIRCexplorerParser.py b/moPepGen/parser/CIRCexplorerParser.py
@@ -57,7 +57,7 @@ def convert_to_circ_rna(self, anno:gtf.GenomicAnnotation,
         else:
             raise ValueError(f'circRNA type unsupported: {self.circ_type}')
 
-        fragment_ids = []
+        fragment_ids:List[Tuple(fragment, str, int)] = []
 
         for i, exon_size in enumerate(self.exon_sizes):
             exon_offset = self.exon_offsets[i]
@@ -81,21 +81,21 @@ def convert_to_circ_rna(self, anno:gtf.GenomicAnnotation,
 
             if fragment_type == 'exon':
                 exon_index = anno.find_exon_index(tx_id, fragment)
-                fragment_ids.append( f"E{exon_index + 1}")
+                fragment_ids.append((fragment, 'E', exon_index))
 
             elif fragment_type == 'intron':
                 intron_index = anno.find_intron_index(
                     tx_id, fragment,
                     intron_start_range=intron_start_range,
                     intron_end_range=intron_end_range
                 )
-                fragment_ids.append(f"I{intron_index + 1}")
+                fragment_ids.append((fragment, 'I', intron_index))
                 intron.append(i)
 
             fragments.append(fragment)
 
-        fragment_ids.sort()
-        circ_id += f'-{tx_id}-' + '-'.join(fragment_ids)
+        fragment_ids.sort(key=lambda x: x[0])
+        circ_id += f'-{tx_id}-' + '-'.join([f"{t}{i+1}" for _,t,i in fragment_ids])
 
         genomic_location = f"{self.chrom}:{self.start}"