Merge pull request #17 from 4dn-dcic/dev

Dev
4dn-dcic · Apr 5, 2017 · 9b089b8 · 9b089b8
2 parents 7a7cbcc + 3c80792
commit 9b089b8
Show file tree

Hide file tree

Showing 37 changed files with 177 additions and 60 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: Rpairix
 Title: Rpairix
-Version: 0.1.1
+Version: 0.1.2
 Authors@R: person("Soo", "Lee", email = "duplexa@gmail.com", role = c("aut", "cre"))
 Description: R binder for pairix, tool for querying a pair of genomic ranges in a pairs file (pairix-indexed bgzipped text file)
 Depends:

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,7 @@ export(px_endpos1_col)
 export(px_endpos2_col)
 export(px_exists)
 export(px_getChar)
+export(px_get_column_names)
 export(px_keylist)
 export(px_query)
 export(px_seq1list)
@@ -21,6 +22,7 @@ useDynLib(Rpairix,check_1d_vs_2d)
 useDynLib(Rpairix,getChar)
 useDynLib(Rpairix,get_chr1_col)
 useDynLib(Rpairix,get_chr2_col)
+useDynLib(Rpairix,get_column_names)
 useDynLib(Rpairix,get_endpos1_col)
 useDynLib(Rpairix,get_endpos2_col)
 useDynLib(Rpairix,get_keylist)

diff --git a/R/px_chr1_col.R b/R/px_chr1_col.R
@@ -12,7 +12,7 @@
 #' res = px_chr1_col(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_chr1_col(filename)
 #' print(res)
 #'

diff --git a/R/px_chr2_col.R b/R/px_chr2_col.R
@@ -12,7 +12,7 @@
 #' res = px_chr2_col(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_chr2_col(filename)
 #' print(res)
 #'

diff --git a/R/px_endpos1_col.R b/R/px_endpos1_col.R
@@ -12,7 +12,7 @@
 #' res = px_endpos1_col(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_endpos1_col(filename)
 #' print(res)
 #'

diff --git a/R/px_endpos2_col.R b/R/px_endpos2_col.R
@@ -12,7 +12,7 @@
 #' res = px_endpos2_col(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_endpos2_col(filename)
 #' print(res)
 #'

diff --git a/R/px_exists.R b/R/px_exists.R
@@ -13,7 +13,7 @@
 #' res = px_exists(filename, key)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' key = "10|20"
 #' res = px_exists(filename, key)
 #' print(res)

diff --git a/R/px_get_column_names.R b/R/px_get_column_names.R
@@ -0,0 +1,17 @@
+#' Column name retrieval function on pairix-indexed pairs file.
+#'
+#' This function returns a vector of column names for a pairs format.
+#'
+#' @param filename a pairs file, or a bgzipped text file (sometextfile.gz) with an index file sometextfile.gz.px2 in the same folder.
+#' @return a character vector of column names, or NULL if the underlying call returns NULL (no column names available).
+#' @keywords pairix names
+#' @export px_get_column_names
+#' @examples
+#' filename = system.file(".","test_4dn.pairs.gz", package="Rpairix")
+#' cols = px_get_column_names(filename)
+#' print(cols)
+#' @useDynLib Rpairix get_column_names
+px_get_column_names<-function(filename){
+  out = .Call("get_column_names", filename)
+  if(is.null(out)) NULL else strsplit(out, ' ')[[1]][-1]  ## drop the first token; [-1] stays empty for <2 tokens, whereas 2:length(cols) would count backwards and yield NA
+}
diff --git a/R/px_keylist.R b/R/px_keylist.R
@@ -11,7 +11,7 @@
 #' res = px_keylist(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_keylist(filename)
 #' print(res)
 #'

diff --git a/R/px_query.R b/R/px_query.R
@@ -9,6 +9,7 @@
 #' @param linecount.only If TRUE, the function returns an integer corresponding to the number of output lines instead of the actual query result. (default FALSE) 
 #' @param autoflip If TRUE, the function will rerun on a flipped query (mate1 and mate2 swapped) if the original query results in an empty output. (default FALSE). If linecount.only option is used in combination with autoflip, the result count is on the flipped query in case the query gets flipped.
 #'
+#' @return data frame containing the query result. Column names are added if indexing was done with a pairs preset.
 #' @keywords pairix query 2D
 #' @export px_query
 #' @examples
@@ -31,7 +32,7 @@
 #' res = px_query(filename, querystr, autoflip=TRUE)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz", package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz", package="Rpairix")
 #' querystr = "10:1-1000000|20"
 #' res = px_query(filename, querystr)
 #' print(res)
@@ -96,7 +97,11 @@ px_query<-function(filename, querystr, max_mem=100000000, stringsAsFactors=FALSE
   if(out2[[2]][1] == -1) return(NULL)  ## error
 
   ## tabularize
-  res.table = as.data.frame(do.call("rbind",strsplit(out2[[1]],'\t')),stringsAsFactors=stringsAsFactors)
+  ##res.table = as.data.frame(do.call("rbind",strsplit(out2[[1]],'\t')),stringsAsFactors=stringsAsFactors)
+  res.table = as.data.frame(do.call("rbind",out2[[1]]),stringsAsFactors=stringsAsFactors)
+  cols = px_get_column_names(filename)
+  if(!is.null(cols) && length(cols)==ncol(res.table)) colnames(res.table)=cols; 
+
   return (res.table)
 }
 

diff --git a/R/px_seq1list.R b/R/px_seq1list.R
@@ -11,7 +11,7 @@
 #' res = px_seq1list(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_seq1list(filename)
 #' print(res)
 px_seq1list<-function(filename){

diff --git a/R/px_seq2list.R b/R/px_seq2list.R
@@ -11,7 +11,7 @@
 #' res = px_seq2list(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_seq2list(filename)
 #' print(res)
 px_seq2list<-function(filename){

diff --git a/R/px_seqlist.R b/R/px_seqlist.R
@@ -11,7 +11,7 @@
 #' res = px_seqlist(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_seqlist(filename)
 #' print(res)
 px_seqlist<-function(filename){

diff --git a/R/px_startpos1_col.R b/R/px_startpos1_col.R
@@ -12,7 +12,7 @@
 #' res = px_startpos1_col(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_startpos1_col(filename)
 #' print(res)
 #'

diff --git a/R/px_startpos2_col.R b/R/px_startpos2_col.R
@@ -12,7 +12,7 @@
 #' res = px_startpos2_col(filename)
 #' print(res)
 #'
-#' filename = system.file(".","merged_nodup.tab.chrblock_sorted.txt.gz",package="Rpairix")
+#' filename = system.file(".","merged_nodups.space.chrblock_sorted.subsample1.txt.gz",package="Rpairix")
 #' res = px_startpos2_col(filename)
 #' print(res)
 #'

diff --git a/README.md b/README.md
@@ -34,12 +34,12 @@ R --no-site-file --no-environ --no-save --no-restore CMD INSTALL --install-tests
 To install a specific version,
 ```
 library(devtools)
-install_url("https://github.com/4dn-dcic/Rpairix/archive/0.1.1.zip")
+install_url("https://github.com/4dn-dcic/Rpairix/archive/0.1.2.zip")
 ```
 
 
 ## Available R functions
-`px_build_index`, `px_query`, `px_keylist`, `px_seqlist`, `px_seq1list`, `px_seq2list`, `px_exists`, `px_chr1_col`, `px_chr2_col`, `px_startpos1_col`, `px_startpos2_col`, `px_endpos1_col`, `px_endpos2_col`, `px_check_dim`
+`px_build_index`, `px_query`, `px_keylist`, `px_seqlist`, `px_seq1list`, `px_seq2list`, `px_exists`, `px_chr1_col`, `px_chr2_col`, `px_startpos1_col`, `px_startpos2_col`, `px_endpos1_col`, `px_endpos2_col`, `px_check_dim`, `px_get_column_names` 
 
 ## Usage
 ```
@@ -59,6 +59,7 @@ px_startpos2_col(filename) # 1-based column index for mate2 start position
 px_endpos1_col(filename) # 1-based column index for mate1 end position
 px_endpos2_col(filename) # 1-based column index for mate2 end position
 px_check_dim(filename) # returns 1 if the file is 1D-indexed, 2 if 2D-indexed. -1 if error.
+px_get_column_names(filename) # returns a vector of column names, if available. (works only for pairs format)
 ```
 
 ### Indexing
@@ -147,6 +148,16 @@ px_check_dim(filename)
 * `filename` is sometextfile.gz and an index file sometextfile.gz.px2 must exist.
 * The return value is an integer; 1 if the input file is 1D-indexed, 2 if 2D-indexed, -1 if an error occurred.
 
+### Getting column names
+```
+px_get_column_names(filename)
+```
+* `filename` is sometextfile.gz and an index file sometextfile.gz.px2 must exist.
+* The return value is a vector of column names.
+* Returns values only if the indexing was done with the 'pairs' preset (either explicitly by setting a preset or by file extension recognition) and if the column heading information is available.
+
+***
+
 ## Example run
 ```
 > library(Rpairix)
@@ -162,9 +173,9 @@ px_check_dim(filename)
 > querystr = "chr10:1-3000000|chr20"
 > res = px_query(filename,querystr)
 > print(res)
-                   V1    V2      V3    V4      V5 V6 V7
-1 SRR1658581.51740952 chr10  157600 chr20  167993  -  -
-2 SRR1658581.33457260 chr10 2559777 chr20 7888262  -  +
+               readID  chr1    pos1  chr2    pos2 strand1 strand2
+1 SRR1658581.51740952 chr10  157600 chr20  167993       -       -
+2 SRR1658581.33457260 chr10 2559777 chr20 7888262       -       +
 >
 > # line-count-only
 > n = px_query(filename,querystr, linecount.only=TRUE)
@@ -175,9 +186,9 @@ px_check_dim(filename)
 > px_query("inst/test_4dn.pairs.gz","chr20|chr10:1-3000000")
 data frame with 0 columns and 0 rows
 > px_query("inst/test_4dn.pairs.gz","chr20|chr10:1-3000000", autoflip=TRUE)
-                   V1    V2      V3    V4      V5 V6 V7
-1 SRR1658581.51740952 chr10  157600 chr20  167993  -  -
-2 SRR1658581.33457260 chr10 2559777 chr20 7888262  -  +
+               readID  chr1    pos1  chr2    pos2 strand1 strand2
+1 SRR1658581.51740952 chr10  157600 chr20  167993       -       -
+2 SRR1658581.33457260 chr10 2559777 chr20 7888262       -       +
 > px_query("inst/test_4dn.pairs.gz","chr20|chr10:1-3000000", linecount.only=TRUE)
 [1] 0
 > px_query("inst/test_4dn.pairs.gz","chr20|chr10:1-3000000", autoflip=TRUE, linecount.only=TRUE)
@@ -227,6 +238,10 @@ data frame with 0 columns and 0 rows
 > # checking if the file is 1D-indexed or 2D-indexed
 > px_check_dim("inst/test_4dn.pairs.gz")
 [1] 2
+>
+> # get column names
+> px_get_column_names("inst/test_4dn.pairs.gz")
+[1] "readID"  "chr1"    "pos1"    "chr2"    "pos2"    "strand1" "strand2"
 ```
 
 
@@ -241,6 +256,11 @@ Individual R functions are written and documented in `R/`. The `src/rpairixlib.c
 
 
 ## Version history
+### 0.1.2
+* Function `px_get_column_names` is now added.
+* `px_query` now adds column names for the query result if indexing was done with pairs preset.
+* `px_query`: problem of merged_nodups query result not splitting by space is now fixed.
+
 ### 0.1.1
 * `px_build_index`: When neither `preset` nor a custom set of columns is given, file extensions are automatically recognized for indexing.
 

diff --git a/Rpairix.Rproj b/Rpairix.Rproj
@@ -1,4 +1,4 @@
-Version: 0.1.1
+Version: 0.1.2
 
 RestoreWorkspace: No
 SaveWorkspace: No

diff --git a/inst/merged_nodup.tab.chrblock_sorted.txt.gz b/inst/merged_nodup.tab.chrblock_sorted.txt.gz
diff --git a/inst/merged_nodup.tab.chrblock_sorted.txt.gz.px2 b/inst/merged_nodup.tab.chrblock_sorted.txt.gz.px2
diff --git a/inst/test_4dn.pairs.gz.px2 b/inst/test_4dn.pairs.gz.px2
diff --git a/man/px_chr1_col.Rd b/man/px_chr1_col.Rd
diff --git a/man/px_chr2_col.Rd b/man/px_chr2_col.Rd
diff --git a/man/px_endpos1_col.Rd b/man/px_endpos1_col.Rd
diff --git a/man/px_endpos2_col.Rd b/man/px_endpos2_col.Rd
diff --git a/man/px_exists.Rd b/man/px_exists.Rd
diff --git a/man/px_get_column_names.Rd b/man/px_get_column_names.Rd
diff --git a/man/px_keylist.Rd b/man/px_keylist.Rd
diff --git a/man/px_query.Rd b/man/px_query.Rd
diff --git a/man/px_seq1list.Rd b/man/px_seq1list.Rd
diff --git a/man/px_seq2list.Rd b/man/px_seq2list.Rd