Skip to content

Commit

Permalink
Merge pull request #57 from 4dn-dcic/0.3.6
Browse files Browse the repository at this point in the history
0.3.6
  • Loading branch information
Carl Vitzthum authored Apr 30, 2018
2 parents e32ff38 + dfd49e2 commit 07bcab0
Show file tree
Hide file tree
Showing 56 changed files with 746 additions and 32 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ matrix:
- sudo python setup.py install
- python test/test.py
- python test/test_oldindex.py
- python test/test_oldindex2.py
- source test/test_c.sh # valgrind test is applied when VALGRIND_TEST_ON=1
- source test/test_c_oldindex.sh # test for old index
- source test/test_c_oldindex2.sh # test for old index
- language: c
sudo: required
compiler: gcc
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,9 @@ ulimit -n 2000

## Version history

### 0.3.6
* The integer overflow in the line count (`pairix -n`) has been fixed. The line count is now stored as a 64-bit integer, which changes the index structure. Indices generated by previous versions (0.2.5~0.3.3, 0.3.4~0.3.5) are auto-detected and can still be used (backward compatible).

### 0.3.5
* Backward compatibility has been added: an index generated by a previous version (0.2.5 ~ 0.3.3) is now auto-detected and can be used by Pairix.

Expand Down
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.5
0.3.6
Binary file modified samples/4dn.bsorted.chr21_22_only.nontriangle.pairs.gz.px2
Binary file not shown.
Binary file modified samples/4dn.bsorted.chr21_22_only.pairs.gz.px2
Binary file not shown.
Binary file modified samples/SRR1171591.variants.snp.vqsr.p.vcf.gz.px2
Binary file not shown.
Binary file modified samples/merged_nodup.tab.chrblock_sorted.txt.gz.px2
Binary file not shown.
Binary file modified samples/merged_nodups.space.chrblock_sorted.subsample1.txt.gz.px2
Binary file not shown.
Binary file modified samples/merged_nodups.space.chrblock_sorted.subsample2.txt.gz.px2
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified samples/merged_nodups.space.chrblock_sorted.subsample3.txt.gz.px2
Binary file not shown.
Binary file modified samples/mock.largechr.pairs.gz.px2
Binary file not shown.
Binary file modified samples/old_index/SRR1171591.variants.snp.vqsr.p.vcf.gz.px2
Binary file not shown.
Binary file modified samples/old_index/merged_nodup.tab.chrblock_sorted.txt.gz.px2
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified samples/old_index/test_4dn.pairs.gz.px2
Binary file not shown.
Binary file modified samples/old_index/test_juicer_shortform.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/old_index/test_merged_nodups.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/old_index/test_merged_nodups.txt.bsorted.gz.px2
Binary file not shown.
Binary file modified samples/old_index/test_old_merged_nodups.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/old_index/test_old_merged_nodups.txt.bsorted.gz.px2
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added samples/old_index2/mock.largechr.pairs.gz
Binary file not shown.
Binary file added samples/old_index2/mock.largechr.pairs.gz.px2
Binary file not shown.
Binary file added samples/old_index2/test_4dn.pairs.gz
Binary file not shown.
Binary file added samples/old_index2/test_4dn.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_4dn.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_4dn_2.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_juicer_shortform.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_merged_nodups.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_merged_nodups.txt.bsorted.gz.px2
Binary file not shown.
Binary file modified samples/test_old_merged_nodups.bsorted.pairs.gz.px2
Binary file not shown.
Binary file modified samples/test_old_merged_nodups.txt.bsorted.gz.px2
Binary file not shown.
41 changes: 27 additions & 14 deletions src/index.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
int TAD_LIDX_SHIFT = TAD_LIDX_SHIFT_LARGE_CHR;
int MAX_CHR = MAX_CHR_LARGE_CHR;

#define MAGIC_NUMBER "PX2.003\1"
#define MAGIC_NUMBER "PX2.004\1"
#define OLD_MAGIC_NUMBER2 "PX2.003\1" // magic number for older version of pairix (0.3.4 - 0.3.5)
#define OLD_MAGIC_NUMBER "PX2.002\1" // magic number for older version of pairix (up to 0.3.3)


Expand Down Expand Up @@ -56,7 +57,7 @@ struct __ti_index_t {
khash_t(s) *tname;
khash_t(i) **index;
ti_lidx_t *index2;
int linecount;
uint64_t linecount;
};

struct __ti_iter_t {
Expand Down Expand Up @@ -490,9 +491,9 @@ void ti_index_save(const ti_index_t *idx, BGZF *fp)
bgzf_write(fp, bam_swap_endian_4p(&x), 4);
} else bgzf_write(fp, &idx->n, 4);
if (ti_is_be) {
uint32_t x = idx->linecount;
bgzf_write(fp, bam_swap_endian_4p(&x), 4);
} else bgzf_write(fp, &idx->linecount, 4);
uint64_t x = idx->linecount;
bgzf_write(fp, bam_swap_endian_8p(&x), 8);
} else bgzf_write(fp, &idx->linecount, 8);
assert(sizeof(ti_conf_t) == 40);
if (ti_is_be) { // write ti_conf_t;
uint32_t x[6];
Expand Down Expand Up @@ -574,19 +575,28 @@ static ti_index_t *ti_index_load_core(BGZF *fp)
}
bgzf_read(fp, magic, 8);
if (strncmp(magic, MAGIC_NUMBER, 8)) {
if (strncmp(magic, OLD_MAGIC_NUMBER, 8)) {
fprintf(stderr, "[ti_index_load] wrong magic number. Re-index if your index file was created by an earlier version of pairix.\n");
return 0;
} else {
if (strncmp(magic, OLD_MAGIC_NUMBER, 8)==0) {
TAD_LIDX_SHIFT = TAD_LIDX_SHIFT_ORIGINAL;
MAX_CHR = MAX_CHR_ORIGINAL;
}
else if(strncmp(magic, OLD_MAGIC_NUMBER2, 8)==0) {
}
else {
fprintf(stderr, "[ti_index_load] wrong magic number. Re-index if your index file was created by an earlier version of pairix.\n");
return 0;
}
}
idx = (ti_index_t*)calloc(1, sizeof(ti_index_t));
bgzf_read(fp, &idx->n, 4);
if (ti_is_be) bam_swap_endian_4p(&idx->n);
bgzf_read(fp, &idx->linecount, 4);
if (ti_is_be) bam_swap_endian_4p(&idx->linecount);
if(strncmp(magic, MAGIC_NUMBER, 8)==0) {
bgzf_read(fp, &idx->linecount, 8);
if (ti_is_be) bam_swap_endian_8p(&idx->linecount);
}
else if(strncmp(magic, OLD_MAGIC_NUMBER2, 8)==0 || strncmp(magic, OLD_MAGIC_NUMBER, 8)==0) {
bgzf_read(fp, &idx->linecount, 4);
if (ti_is_be) bam_swap_endian_4p(&idx->linecount);
}
idx->tname = kh_init(s);
idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
idx->index2 = (ti_lidx_t*)calloc(idx->n, sizeof(ti_lidx_t));
Expand Down Expand Up @@ -845,9 +855,10 @@ int ti_parse_region(const ti_index_t *idx, const char *str, int *tid, int *begin
// if 1d, begin2 and end2 will have value -1.
// query string error: -1
// region_split_character not matching error: -2
// memory allocation error: -3
int ti_parse_region2d(const ti_index_t *idx, const char *str, int *tid, int *begin, int *end, int *begin2, int *end2)
{
char *s, *p, *sname;
char *s, *p, *sname, *tmp_s;
int i, l, k, h;
int coord1s, coord1e, coord2s, coord2e, pos1s, pos2s;
char region_split_character = ti_get_region_split_character(idx);
Expand All @@ -872,7 +883,9 @@ int ti_parse_region2d(const ti_index_t *idx, const char *str, int *tid, int *beg
free(s); return (res);
}
if(i == k && dim == 2) { //1d query on 2d data : interprete query 'x' as 'x|x'
s = (char*)realloc(s, k*2+2);
tmp_s = (char*)realloc(s, k*2+2);
if(tmp_s) s = tmp_s;
else return(-3); // memory alloc error
strcpy(s+i+1, s);
s[i] = region_split_character;
k = k*2+1;
Expand Down Expand Up @@ -975,7 +988,7 @@ ti_iter_t ti_iter_first()
}


int get_linecount(const ti_index_t *idx)
uint64_t get_linecount(const ti_index_t *idx)
{
return(idx->linecount);
}
Expand Down
4 changes: 2 additions & 2 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <sys/stat.h>
#include <errno.h>
#include <getopt.h>
#include <inttypes.h>
#include "bgzf.h"
#include "pairix.h"
#include "knetfile.h"
Expand Down Expand Up @@ -220,8 +221,7 @@ int main(int argc, char *argv[])
fprintf(stderr, "[main] fail to load the index file.\n");
return 1;
}
int linecount = get_linecount(idx);
printf("%d\n", linecount);
printf("%"PRIu64"\n", get_linecount(idx));
ti_index_destroy(idx);
return 0;
}
Expand Down
4 changes: 2 additions & 2 deletions src/pairix.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#ifndef __TABIDX_H
#define __TABIDX_H

#define PACKAGE_VERSION "0.3.5"
#define PACKAGE_VERSION "0.3.6"

#include <stdint.h>
#include "kstring.h"
Expand Down Expand Up @@ -151,7 +151,7 @@ extern "C" {
const char **ti_seqname(const ti_index_t *idx, int *n);

/* get linecount */
int get_linecount(const ti_index_t *idx);
uint64_t get_linecount(const ti_index_t *idx);

/* get file offset
* returns number of bgzf blocks spanning a sequence (pair) */
Expand Down
12 changes: 6 additions & 6 deletions test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,10 +253,10 @@ def test_querys_2_bad_order(self):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
# trigger a warning
it = self.pr.querys2D(query)
self.pr.querys2D(query)
# verify some things about the warning
assert len(w) == 1
assert issubclass(w[-1].category, pypairix.PairixWarning)
self.assertEqual(len(w), 1)
self.assertTrue(issubclass(w[-1].category, pypairix.PairixWarning))

def test_build_index_with_force_merged_nodups_tab(self): ## recognizing custom set
pypairix.build_index(TEST_FILE_2D, sc=2, bc=3, ec=3, sc2=6, bc2=7, ec2=7, force=1)
Expand Down Expand Up @@ -300,10 +300,10 @@ def test_query2_rev_fail(self):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
# trigger a warning
it = self.pr.query2D(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
self.pr.query2D(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
# verify some things about the warning
assert len(w) == 1
assert issubclass(w[-1].category, pypairix.PairixWarning)
self.assertEqual(len(w), 1)
self.assertTrue(issubclass(w[-1].category, pypairix.PairixWarning))


## 2D query on 2D indexed file with chromosomes using a 4DN pairs file
Expand Down
2 changes: 2 additions & 0 deletions test/test_c.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

PATH=./bin:$PATH

if [ $VALGRIND_TEST_ON -eq 1 ]; then
Expand Down
2 changes: 2 additions & 0 deletions test/test_c_oldindex.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash

PATH=./bin:$PATH

if [ $VALGRIND_TEST_ON -eq 1 ]; then
Expand Down
172 changes: 172 additions & 0 deletions test/test_c_oldindex2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/bin/bash
#
# Regression tests for the pairix command-line tool against the sample files
# under samples/old_index2/.
#
# Each query test writes the pairix output to log1, builds the expected
# output in log2 (usually with gunzip + awk on the same data file) and fails
# when the two files differ.
#
# The script reports failure with `return 1`, so it is meant to be sourced
# (e.g. `source test/test_c_oldindex2.sh`), not executed directly.
#
# Environment:
#   VALGRIND_TEST_ON  when set to 1, the pairix commands prefixed with
#                     $VALGRIND are run under valgrind; unset is treated as 0.

PATH=./bin:$PATH

# Quote the variable and default it to 0 so that an unset VALGRIND_TEST_ON
# does not turn the test into the malformed expression `[ -eq 1 ]`.
if [ "${VALGRIND_TEST_ON:-0}" -eq 1 ]; then
    VALGRIND="valgrind --error-exitcode=42 --leak-check=full"
else
    VALGRIND=""
fi

# NOTE: $VALGRIND is intentionally left unquoted below so that it splits into
# the valgrind command plus its options (or disappears when empty).

## 2D
echo "test 1"
$VALGRIND pairix samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz '10:1-1000000|20' > log1
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 1 failed"
    return 1;
fi

# Same query as test 1, with the -a option.
echo "test 1b"
$VALGRIND pairix -a samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz '10:1-1000000|20' > log1
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 1b failed"
    return 1;
fi

# With -a, the query given in the flipped order is expected to produce the
# same records as test 1b.
echo "test 1c"
$VALGRIND pairix -a samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz '20|10:1-1000000' > log1
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 1c failed"
    return 1;
fi

# A 1D query 'x' on a 2D-indexed file must give the same result as 'x|x'.
echo "test 1d"
$VALGRIND pairix samples/old_index2/test_4dn.pairs.gz 'chr22:50000000-60000000' > log1
$VALGRIND pairix samples/old_index2/test_4dn.pairs.gz 'chr22:50000000-60000000|chr22:50000000-60000000' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 1d failed"
    return 1;
fi

echo "test 1e"
$VALGRIND pairix samples/old_index2/test_4dn.pairs.gz 'chrY:1-2000000' > log1
$VALGRIND pairix samples/old_index2/test_4dn.pairs.gz 'chrY:1-2000000|chrY:1-2000000' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 1e failed"
    return 1;
fi

echo "test 2"
$VALGRIND pairix samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz '10:1-1000000|20:50000000-60000000' > log1
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20" && $7>=50000000 && $7<=60000000' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 2 failed"
    return 1;
fi

# Two queries in one call: the expected output is the concatenation of the
# two individual results, in query order.
echo "test 3"
$VALGRIND pairix samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz '1:1-10000000|20:50000000-60000000' '3:5000000-9000000|X:70000000-90000000' > log1
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="1" && $3>=1 && $3<=10000000 && $6=="20" && $7>=50000000 && $7<=60000000' > log2
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="3" && $3>=5000000 && $3<=9000000 && $6=="X" && $7>=70000000 && $7<=90000000' >> log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 3 failed"
    return 1;
fi

# Wildcard on the first mate.
echo "test 4"
$VALGRIND pairix samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz '*|1:0-100000' > log1
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$6=="1" && $7>=0 && $7<=100000' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 4 failed"
    return 1;
fi

# Wildcard on the second mate.
echo "test 5"
$VALGRIND pairix samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz '1:0-100000|*' > log1
gunzip -c samples/old_index2/merged_nodup.tab.chrblock_sorted.txt.gz | awk '$2=="1" && $3>=0 && $3<=100000' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 5 failed"
    return 1;
fi


## 1D
echo "test 6"
$VALGRIND pairix samples/old_index2/SRR1171591.variants.snp.vqsr.p.vcf.gz chr10:1-4000000 > log1
gunzip -c samples/old_index2/SRR1171591.variants.snp.vqsr.p.vcf.gz | awk '$1=="chr10" && $2>=1 && $2<=4000000' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 6 failed"
    return 1;
fi


## 2D, space-delimited
echo "test 7"
$VALGRIND pairix samples/old_index2/merged_nodups.space.chrblock_sorted.subsample1.txt.gz '10:1-1000000|20' > log1
gunzip -c samples/old_index2/merged_nodups.space.chrblock_sorted.subsample1.txt.gz | awk '$2=="10" && $3>=1 && $3<=1000000 && $6=="20"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 7 failed"
    return 1;
fi


## preset for pairs.gz
echo "test 8"
$VALGRIND pairix samples/old_index2/test_4dn.pairs.gz 'chr10|chr20' > log1
gunzip -c samples/old_index2/test_4dn.pairs.gz | awk '$2=="chr10" && $4=="chr20"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test 8 failed"
    return 1;
fi

## linecount
# `pairix -n` must report the same number of lines as the data file holds;
# sed strips the padding spaces that some wc implementations emit.
echo "test linecount"
$VALGRIND pairix -n samples/old_index2/test_4dn.pairs.gz > log1
gunzip -c samples/old_index2/test_4dn.pairs.gz |wc -l | sed "s/ //g" > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "linecount test failed"
    return 1;
fi


## bgzf block count (currently no auto test for the accuracy of the result)
echo "test bgzf block count"
$VALGRIND pairix -B samples/old_index2/test_4dn.pairs.gz

## check triangle
# The first two runs only exercise the command (under valgrind when enabled);
# the message itself is checked on a plain pairix run below.
echo "test check triangle"
$VALGRIND pairix -Y samples/old_index2/4dn.bsorted.chr21_22_only.pairs.gz
$VALGRIND pairix -Y samples/old_index2/4dn.bsorted.chr21_22_only.nontriangle.pairs.gz
res=$(pairix -Y samples/old_index2/4dn.bsorted.chr21_22_only.nontriangle.pairs.gz)
if [ "$res" != "The file is not a triangle." ]; then
    echo "test check triangle failed"
    return 1;
fi

echo "test check triangle #2"
res=$(pairix -Y samples/old_index2/4dn.bsorted.chr21_22_only.pairs.gz)
if [ "$res" != "The file is a triangle." ]; then
    echo "test check triangle #2 failed"
    return 1;
fi


# test large chromosome
# NOTE(review): the awk filter here uses strict bounds (> and <) while every
# other test uses inclusive bounds (>= and <=) - confirm this is intended.
echo "test large chr"
$VALGRIND pairix samples/old_index2/mock.largechr.pairs.gz 'chr21:800000000-900000000|chr22' > log1
gunzip -c samples/old_index2/mock.largechr.pairs.gz | awk '$2=="chr21" && $3>800000000 && $3<900000000 && $4=="chr22"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test large chromosome failed"
    return 1;
fi

# test large chromosome
echo "test large chr2"
$VALGRIND pairix samples/old_index2/mock.largechr.pairs.gz 'chr22:800000000-997027270|chr22' > log1
gunzip -c samples/old_index2/mock.largechr.pairs.gz | awk '$2=="chr22" && $3>=800000000 && $3<=997027270 && $4=="chr22"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test large chromosome2 failed"
    return 1;
fi

# test large chromosome
echo "test large chr3"
$VALGRIND pairix samples/old_index2/mock.largechr.pairs.gz 'chr22:1073741820-1073741824|chr22' > log1
gunzip -c samples/old_index2/mock.largechr.pairs.gz | awk '$2=="chr22" && $3>=1073741820 && $3<=1073741824 && $4=="chr22"' > log2
if [ -n "$(diff log1 log2)" ]; then
    echo "test large chromosome3 failed"
    return 1;
fi
12 changes: 6 additions & 6 deletions test/test_oldindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,10 @@ def test_querys_2_bad_order(self):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
# trigger a warning
it = self.pr.querys2D(query)
self.pr.querys2D(query)
# verify some things about the warning
assert len(w) == 1
assert issubclass(w[-1].category, pypairix.PairixWarning)
self.assertEqual(len(w), 1)
self.assertTrue(issubclass(w[-1].category, pypairix.PairixWarning))


## 2D query on 2D indexed file with chromosomes input in reverse order
Expand Down Expand Up @@ -283,10 +283,10 @@ def test_query2_rev_fail(self):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
# trigger a warning
it = self.pr.query2D(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
self.pr.query2D(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
# verify some things about the warning
assert len(w) == 1
assert issubclass(w[-1].category, pypairix.PairixWarning)
self.assertEqual(len(w), 1)
self.assertTrue(issubclass(w[-1].category, pypairix.PairixWarning))


## 2D query on 2D indexed file with chromosomes using a 4DN pairs file
Expand Down
Loading

0 comments on commit 07bcab0

Please sign in to comment.