diff --git a/src/cli/Sequence_Analysis/FASTAExtractCLI.java b/src/cli/Sequence_Analysis/FASTAExtractCLI.java index b033e82c8..9cfc2f389 100644 --- a/src/cli/Sequence_Analysis/FASTAExtractCLI.java +++ b/src/cli/Sequence_Analysis/FASTAExtractCLI.java @@ -42,7 +42,7 @@ public Integer call() throws Exception { System.exit(1); } - FASTAExtract script_obj = new FASTAExtract(genomeFASTA, bedFile, output, forceStrand, bedHeader, null); + FASTAExtract script_obj = new FASTAExtract(genomeFASTA, bedFile, output, forceStrand, bedHeader, System.err); script_obj.run(); System.err.println("Extraction Complete."); diff --git a/src/scripts/Sequence_Analysis/FASTAExtract.java b/src/scripts/Sequence_Analysis/FASTAExtract.java index 8a09782a6..5255d2e67 100644 --- a/src/scripts/Sequence_Analysis/FASTAExtract.java +++ b/src/scripts/Sequence_Analysis/FASTAExtract.java @@ -10,10 +10,9 @@ import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Scanner; import util.FASTAUtilities; +import util.BEDUtilities; public class FASTAExtract { private File GENOME = null; @@ -41,11 +40,8 @@ public FASTAExtract(File gen, File b, File out, boolean str, boolean head, Print } public void run() throws IOException, InterruptedException { - - if (PS == null) - PS = System.err; - System.out.println("STRAND:" + STRAND); - System.out.println("COORD:" + HEADER); + PS.println("STRAND:" + STRAND); + PS.println("COORD:" + HEADER); try { IndexedFastaSequenceFile QUERY = new IndexedFastaSequenceFile(GENOME); @@ -53,7 +49,7 @@ public void run() throws IOException, InterruptedException { // Open Output File OUT = new PrintStream(OUTFILE); - ArrayList BED_Coord = loadCoord(BED); + ArrayList BED_Coord = BEDUtilities.loadCoord(BED, HEADER); for (int y = 0; y < BED_Coord.size(); y++) { try { @@ -76,47 +72,5 @@ public void run() throws IOException, InterruptedException { } catch (SAMException e) { PS.println(e.getMessage()); } - - } - - public 
ArrayList loadCoord(File INPUT) throws FileNotFoundException { - Scanner scan = new Scanner(INPUT); - ArrayList COORD = new ArrayList(); - while (scan.hasNextLine()) { - String[] temp = scan.nextLine().split("\t"); - if (temp.length > 2) { - if (!temp[0].contains("track") && !temp[0].contains("#")) { - String name = ""; - - if (!HEADER) { // create genomic coordinate name if requested - if (temp.length > 5) { - name = temp[0] + ":" + temp[1] + "-" + temp[2] + "(" + temp[5] + ")"; - } else { - name = temp[0] + ":" + temp[1] + "-" + temp[2] + "(.)"; - } - } else { // else create name based on BED file name or create one if non-existent - if (temp.length > 3) { - name = temp[3]; - } else { - name = temp[0] + ":" + temp[1] + "-" + temp[2] + "(" + temp[5] + ")"; - } - } - - if (Integer.parseInt(temp[1]) >= 0) { - if (temp[5].equals("+")) { - COORD.add(new BEDCoord(temp[0], Integer.parseInt(temp[1]), Integer.parseInt(temp[2]), "+", - name)); - } else { - COORD.add(new BEDCoord(temp[0], Integer.parseInt(temp[1]), Integer.parseInt(temp[2]), "-", - name)); - } - } else { - System.out.println("Invalid Coordinate in File!!!\n" + Arrays.toString(temp)); - } - } - } - } - scan.close(); - return COORD; } } \ No newline at end of file diff --git a/src/util/BEDUtilities.java b/src/util/BEDUtilities.java new file mode 100644 index 000000000..21f2cf816 --- /dev/null +++ b/src/util/BEDUtilities.java @@ -0,0 +1,82 @@ +package util; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.zip.GZIPInputStream; + +import objects.CoordinateObjects.BEDCoord; + +/** + * Class containing a set of shared methods to be used across script classes. 
+ * + * @author Olivia Lang + * @see scripts.Sequence_Analysis.FASTAExtract + */ +public class BEDUtilities { + + /** + * Load a list of BEDCoord objects from a file. + * + * @param input the BED-formatted input file to load + * @param HEADER the style of FASTA-header to use for the output (true = BED + * coord name, false = use Genomic Coordinate) + * @return the list of BEDCoord objects parsed from the file + * @throws IOException if the file cannot be opened or read + * @throws UnsupportedEncodingException if the UTF-8 charset is not supported + */ + public static ArrayList loadCoord(File input, boolean HEADER) throws UnsupportedEncodingException, IOException { + ArrayList COORD = new ArrayList(); + // Check if file is gzipped and instantiate appropriate BufferedReader + BufferedReader br; + if (GZipUtilities.isGZipped(input)) { + br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(input)), "UTF-8")); + } else { + br = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8")); + } + // Initialize line variable to loop through + String line = br.readLine(); + while (line != null) { + String[] temp = line.split("\t"); + if (temp.length > 2) { + if (!temp[0].contains("track") && !temp[0].contains("#")) { + String name = ""; + + if (!HEADER) { // create genomic coordinate name if requested + if (temp.length > 5) { + name = temp[0] + ":" + temp[1] + "-" + temp[2] + "(" + temp[5] + ")"; + } else { + name = temp[0] + ":" + temp[1] + "-" + temp[2] + "(.)"; + } + } else { // else create name based on BED file name or create one if non-existent + if (temp.length > 3) { + name = temp[3]; + } else { + name = temp[0] + ":" + temp[1] + "-" + temp[2] + "(" + temp[5] + ")"; + } + } + + if (Integer.parseInt(temp[1]) >= 0) { + if (temp[5].equals("+")) { + COORD.add(new BEDCoord(temp[0], Integer.parseInt(temp[1]), Integer.parseInt(temp[2]), "+", + name)); + } else { + COORD.add(new BEDCoord(temp[0], Integer.parseInt(temp[1]), Integer.parseInt(temp[2]), "-", + name)); + } + } else { + System.out.println("Invalid Coordinate in File!!!\n" + 
Arrays.toString(temp)); + } + } + } + line = br.readLine(); + } + br.close(); + return COORD; + } +}