Skip to content

Commit

Permalink
Merge pull request #54 from 4dn-dcic/29
Browse files Browse the repository at this point in the history
29
  • Loading branch information
Carl Vitzthum authored Mar 15, 2018
2 parents 4e79f36 + 74e17b9 commit a181178
Show file tree
Hide file tree
Showing 27 changed files with 21,161 additions and 26 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,9 @@ ulimit -n 2000

## Version history

### 0.3.4
* The maximum chromosome size allowed is now 2^30 instead of 2^29 with new index. *Index structure changed.*

### 0.3.3
* The problem of `pypairix` `get_blocknames` crashing python when called twice now fixed.

Expand Down Expand Up @@ -671,7 +674,7 @@ ulimit -n 2000
* `pairix` has now option `-w` which specifies region split character (default '|') during indexing. A query string should use this character as a separater.
* `pypairix` also now has a parameter `region_split_character` in function `build_index` (default '|')
* `juicer_shortform2pairs.pl` is now available in the `util` folder.
* Index structure changed - please re-index if you're using an older version of index.
* *Index structure changed* - please re-index if you're using an older version of index.

### 0.2.4
* Updated magic number for the new index, to avoid crash caused by different index structure.
Expand Down
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.3
0.3.4
21,006 changes: 21,006 additions & 0 deletions samples/4dn.bsorted.chr21_22_only.largechr.pairs

Large diffs are not rendered by default.

Binary file modified samples/4dn.bsorted.chr21_22_only.nontriangle.pairs.gz.px2
Binary file not shown.
Binary file modified samples/4dn.bsorted.chr21_22_only.pairs.gz.px2
Binary file not shown.
Binary file modified samples/SRR1171591.variants.snp.vqsr.p.vcf.gz.px2
Binary file not shown.
Binary file modified samples/merged_nodup.tab.chrblock_sorted.txt.gz.px2
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added samples/mock.largechr.pairs.gz
Binary file not shown.
Binary file added samples/mock.largechr.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_4dn.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_4dn_2.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_juicer_shortform.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_merged_nodups.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_merged_nodups.txt.bsorted.gz.px2
Binary file not shown.
Binary file modified samples/test_old_merged_nodups.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_old_merged_nodups.txt.bsorted.gz.px2
Binary file not shown.
50 changes: 28 additions & 22 deletions src/index.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
#include <time.h>

#define TAD_MIN_CHUNK_GAP 32768
// 1<<14 is the size of minimum bin.
#define TAD_LIDX_SHIFT 14
// 1<<15 is the size of minimum bin.
#define TAD_LIDX_SHIFT 15
#define DEFAULT_DELIMITER '\t'
#define MAX_REGION_STR_LEN 10000

Expand Down Expand Up @@ -101,11 +101,11 @@ int ti_readline(BGZF *fp, kstring_t *str)
static inline int ti_reg2bin(uint32_t beg, uint32_t end)
{
--end;
if (beg>>14 == end>>14) return 4681 + (beg>>14);
if (beg>>17 == end>>17) return 585 + (beg>>17);
if (beg>>20 == end>>20) return 73 + (beg>>20);
if (beg>>23 == end>>23) return 9 + (beg>>23);
if (beg>>26 == end>>26) return 1 + (beg>>26);
if (beg>>15 == end>>15) return 4681 + (beg>>15);
if (beg>>18 == end>>18) return 585 + (beg>>18);
if (beg>>21 == end>>21) return 73 + (beg>>21);
if (beg>>24 == end>>24) return 9 + (beg>>24);
if (beg>>27 == end>>27) return 1 + (beg>>27);
return 0;
}

Expand Down Expand Up @@ -471,7 +471,7 @@ void ti_index_save(const ti_index_t *idx, BGZF *fp)
int32_t i, size, ti_is_be;
khint_t k;
ti_is_be = bam_is_big_endian();
bgzf_write(fp, "PX2.002\1", 8);
bgzf_write(fp, "PX2.003\1", 8);
if (ti_is_be) {
uint32_t x = idx->n;
bgzf_write(fp, bam_swap_endian_4p(&x), 4);
Expand Down Expand Up @@ -560,7 +560,7 @@ static ti_index_t *ti_index_load_core(BGZF *fp)
return 0;
}
bgzf_read(fp, magic, 8);
if (strncmp(magic, "PX2.002\1", 8)) {
if (strncmp(magic, "PX2.003\1", 8)) {
fprintf(stderr, "[ti_index_load] wrong magic number. Re-index if your index file was created by an earlier version of pairix.\n");
return 0;
}
Expand Down Expand Up @@ -806,15 +806,15 @@ int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin
return -1;
}
if (i == k) { /* dump the whole sequence */
*begin = 0; *end = 1<<29; free(s);
*begin = 0; *end = 1<<30; free(s);
return 0;
}
for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
*begin = atoi(p);
if (i < k) {
p = s + i + 1;
*end = atoi(p);
} else *end = 1<<29;
} else *end = 1<<30;
if (*begin > 0) --*begin;
free(s);
if (*begin > *end) return -2;
Expand Down Expand Up @@ -884,29 +884,29 @@ int ti_parse_region2d(const ti_index_t *idx, const char *str, int *tid, int *beg

/* parsing pos1 */
if (pos1s-1 == coord1e) { /* dump the whole sequence */
*begin = 0; *end = 1<<29;
*begin = 0; *end = 1<<30;
} else {
p = s + pos1s;
for (i = pos1s ; i != coord1e; ++i) if (s[i] == '-') break;
*begin = atoi(p);
if (i < coord1e) {
p = s + i + 1;
*end = atoi(p);
} else *end = 1<<29;
} else *end = 1<<30;
if (*begin > 0) --*begin;
}

/* parsing pos2 */
if (pos2s-1 == coord2e) { /* dump the whole sequence */
*begin2 = 0; *end2 = 1<<29;
*begin2 = 0; *end2 = 1<<30;
} else {
p = s + pos2s;
for (i = pos2s ; i != coord2e; ++i) if (s[i] == '-') break;
*begin2 = atoi(p);
if (i < coord2e) {
p = s + i + 1;
*end2 = atoi(p);
} else *end2 = 1<<29;
} else *end2 = 1<<30;
if (*begin2 > 0) --*begin2;
}

Expand All @@ -924,19 +924,25 @@ int ti_parse_region2d(const ti_index_t *idx, const char *str, int *tid, int *beg
*******************************/

#define MAX_BIN 37450 // =(8^6-1)/7+1
// #define MAX_BIN 74898
// #define MAX_BIN 149794
// #define MAX_BIN 299594

static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
{
int i = 0, k;
if (beg >= end) return 0;
if (end >= 1u<<29) end = 1u<<29;
if (end > 1u<<30) {
end = 1u<<30;
fprintf(stderr, "Warning: maximum chromosome size is 2^30.\n");
}
--end;
list[i++] = 0;
for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
for (k = 1 + (beg>>27); k <= 1 + (end>>27); ++k) list[i++] = k;
for (k = 9 + (beg>>24); k <= 9 + (end>>24); ++k) list[i++] = k;
for (k = 73 + (beg>>21); k <= 73 + (end>>21); ++k) list[i++] = k;
for (k = 585 + (beg>>18); k <= 585 + (end>>18); ++k) list[i++] = k;
for (k = 4681 + (beg>>15); k <= 4681 + (end>>15); ++k) list[i++] = k;
return i;
}

Expand Down Expand Up @@ -1902,7 +1908,7 @@ sequential_iter_t *querys_2D_wrapper(pairix_t *tb, const char *reg, int flip)

int get_nblocks(ti_index_t *idx, int tid, BGZF *fp)
{
ti_iter_t iter = ti_iter_query(idx, tid, 0, 1<<29, 0, 1<<29);
ti_iter_t iter = ti_iter_query(idx, tid, 0, 1<<30, 0, 1<<30);
int64_t start_block_address = iter->off[0].u>>16; // in bytes
int64_t end_block_offset = iter->off[0].v;
int nblocks=0;
Expand Down
2 changes: 1 addition & 1 deletion src/pairix.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#ifndef __TABIDX_H
#define __TABIDX_H

#define PACKAGE_VERSION "0.3.3"
#define PACKAGE_VERSION "0.3.4"

#include <stdint.h>
#include "kstring.h"
Expand Down
55 changes: 54 additions & 1 deletion test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
TEST_FILE_2D_4DN_NOT_TRIANGLE = 'samples/4dn.bsorted.chr21_22_only.nontriangle.pairs.gz'
TEST_FILE_1D = 'samples/SRR1171591.variants.snp.vqsr.p.vcf.gz'
TEST_FILE_2D_SPACE = 'samples/merged_nodups.space.chrblock_sorted.subsample1.txt.gz'

TEST_FILE_LARGE_CHR = 'samples/mock.largechr.pairs.gz'

def get_header(filename, meta_char='#'):
"""Read gzipped file and retrieve lines beginning with '#'."""
Expand Down Expand Up @@ -400,6 +400,59 @@ def test_build_index_with_force_merged_nodups_with_no_preset(self): ## recogniz
self.assertEqual(self.result, pr2_result)


## 2D query on 2D indexed file with chromosomes using a pairs file with large chromosomes
class PairixTest2D_LargeChr(unittest.TestCase):
f_type = find_pairs_type(TEST_FILE_LARGE_CHR)
regions = read_pairs(TEST_FILE_LARGE_CHR, f_type)
chrom = 'chr21'
start = 1
end = 1073741824
chrom2 = 'chr22'
start2 = 1
end2 = 1073741824
# reverse reversed results to get them in the required order here
result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
pr = pypairix.open(TEST_FILE_LARGE_CHR)

def test_query2_4dn(self):
it = self.pr.query2D(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
pr_result = build_it_result(it, self.f_type)
self.assertEqual(self.result, pr_result)

def test_querys_2_4dn(self):
query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
it = self.pr.querys2D(query)
pr_result = build_it_result(it, self.f_type)
self.assertEqual(self.result, pr_result)

def test_build_index_without_force(self):
# expect an error here... the px2 file already exists
with self.assertRaises(pypairix.PairixError) as error:
pypairix.build_index(TEST_FILE_LARGE_CHR)
# errors are handled differently in python 2 and python 3
if sys.version_info > (3,0):
self.assertEqual(error.exception.__str__(), "The index file exists. Please use force=1 to overwrite.")
else:
self.assertEqual(error.exception.message, "The index file exists. Please use force=1 to overwrite.")

def test_build_index_with_region_split_character(self):
pypairix.build_index(TEST_FILE_LARGE_CHR, region_split_character="^", force=1)
pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
query = '{}:{}-{}^{}:{}-{}'.format(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
it2 = pr2.querys2D(query)
pr2_result = build_it_result(it2, self.f_type)
pypairix.build_index(TEST_FILE_LARGE_CHR, force=1) # revert
self.assertEqual(self.result, pr2_result)

def test_build_index_with_force(self): ## recognizing file extension pairs.gz
pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)
pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
it2 = pr2.querys2D(query)
pr2_result = build_it_result(it2, self.f_type)
self.assertEqual(self.result, pr2_result)


## 1D query on 2D indexed file
class PairixTest_1_on_2(unittest.TestCase):
f_type='4DN'
Expand Down
Loading

0 comments on commit a181178

Please sign in to comment.