lib/perl/Bio/EnsEMBL/EGPipeline/DNAFeatures/ClusterRepeatLib.pm

=head1 LICENSE

See the NOTICE file distributed with this work for additional information
regarding copyright ownership.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut


=pod

=head1 NAME

Bio::EnsEMBL::EGPipeline::DNAFeatures::ClusterRepeatLib

=head1 DESCRIPTION

Cluster a repeat library generated by RepeatModeler using cd-hit-est.

=head1 Author

Vasily Sitnik

=cut

package Bio::EnsEMBL::EGPipeline::DNAFeatures::ClusterRepeatLib;

use strict;
use warnings;

use Bio::SeqIO;
use Capture::Tiny ':all';
use File::Copy qw(copy);
use File::Spec::Functions qw(catfile);
use File::Basename qw(fileparse);

use base qw(Bio::EnsEMBL::Production::Pipeline::Common::Base);

# Default values for the cd-hit-est tuning parameters. run_cdhit_est() reads
# them back through $self->param() and turns them into the -c, -aL/-aS, -n,
# -T and -M options respectively.
#
# Returns a hash reference keyed by parameter name.
sub param_defaults {
  my ($self) = @_;

  my %defaults = (
    'cdhit_identity_threshold' => 0.8,
    'cdhit_alignment_coverage' => 0.9,
    'cdhit_word_len'           => 10,
    'cdhit_num_threads'        => 1,
    'cdhit_mem'                => 0,
  );

  return \%defaults;
}

# Entry point of the analysis: back up the repeat library, preprocess it,
# cluster it with cd-hit-est and write the cluster representatives.
#
# Parameters (read with $self->param):
#   lib_file - FASTA repeat library to cluster
#   out_file - FASTA file receiving the final, renamed sequences
#   wd_path  - working directory for the backup and the intermediate files
#
# Throws (via $self->throw) when a working directory cannot be created or
# when the backup copy of the library fails.
sub run {
  my ($self) = @_;
  my $lib_file = $self->param('lib_file');
  my $out_file = $self->param('out_file');
  my $wd_path  = $self->param('wd_path');

  # Create a directory unless it is already there. A failed mkdir is only
  # fatal when the directory still does not exist afterwards, so a job that
  # loses a creation race against another job carries on.
  my $ensure_dir = sub {
    my ($dir) = @_;
    return if -d $dir;
    unless (mkdir $dir) {
      my $mkdir_error = "$!";    # saved before -d can overwrite $!
      $self->throw("Failed to create directory '$dir': $mkdir_error")
        unless -d $dir;
    }
    return;
  };

  $ensure_dir->($wd_path);

  # Keep an untouched copy of the input library in the working directory.
  my $origin_suffix = ".before_cluster";
  my ($in_filename) = fileparse($lib_file);
  my $backup_file = catfile($wd_path, $in_filename . $origin_suffix);
  # File::Copy reports the reason of a failure in $!, not in $@.
  copy($lib_file, $backup_file)
    or $self->throw("Failed to backup '$lib_file' to $wd_path: $!");

  # Remove hanging N and add seq number to id to avoid duplicates
  my $noN_fasta = catfile($wd_path, "noN.fasta");
  $self->fasta_preprocess($lib_file, $noN_fasta);

  # Clustering
  my $cdhit_output_dir = catfile($wd_path, "cdhit");
  $ensure_dir->($cdhit_output_dir);
  my $cdhit_output = catfile($cdhit_output_dir, "out");
  $self->run_cdhit_est($noN_fasta, $cdhit_output);

  # The cluster listing is read from '<output>.clstr'.
  my $clusters_raw_file = "${cdhit_output}.clstr";
  my $clusters = {};
  $self->process_clusters($clusters_raw_file, $clusters);

  # Restore the repeat types of clusters
  $self->gen_final_fasta($clusters, $noN_fasta, $out_file);
}


# Copy a FASTA file while making the ids unique and trimming N runs.
#
#   $infile  - FASTA file to read
#   $outfile - FASTA file to write
#
# Every record gets '_<n>' appended to its display id, where n counts the
# records from 0 in input order. Runs of N (either case) at the start and at
# the end of the sequence are removed before the record is written.
sub fasta_preprocess {
  my ($self, $infile, $outfile) = @_;

  my $reader = Bio::SeqIO->new(-file => "$infile",   -format => 'Fasta');
  my $writer = Bio::SeqIO->new(-file => ">$outfile", -format => 'Fasta');

  my $index = 0;
  while ( my $record = $reader->next_seq() ) {
    # Unique id first: '<original id>_<record number>'.
    my $original_id = $record->display_id();
    $record->display_id($original_id . "_$index");
    $index++;

    # Then strip the hanging N from both ends of the sequence.
    my $residues = $record->seq();
    for ($residues) {
      s/^N+//i;
      s/N+$//i;
    }
    $record->seq($residues);

    $writer->write_seq($record);
  }
}


# Assemble the cd-hit-est command line and run it.
#
#   $input_fasta  - FASTA file to cluster (given to cd-hit-est as -i)
#   $cdhit_output - output path (given to cd-hit-est as -o)
#
# The executable and the tuning values are read with $self->param(). The
# command is logged with $self->warning() and then handed to
# $self->_execute() as one space-joined string.
sub run_cdhit_est {
  my ($self, $input_fasta, $cdhit_output) = @_;

  my @cmd_parts = (
    $self->param('cdhit_est_exe'),
    "-c "  . $self->param('cdhit_identity_threshold'),
    "-aL " . $self->param('cdhit_alignment_coverage'),    # coverage for the longer sequence
    "-aS " . $self->param('cdhit_alignment_coverage'),    # coverage for the shorter sequence
    "-n "  . $self->param('cdhit_word_len'),
    "-g 1 ",     # 'accurate' mode
    "-p 1 ",     # print .clstr alignments
    "-d 0 ",     # .clstr length of description (0 -- not trimmed)
    "-sc 1 ",    # sort clusters by size
    "-sf 1 ",    # sort fastas by size
    "-T "  . $self->param('cdhit_num_threads'),
    "-M "  . $self->param('cdhit_mem'),
    "-i "  . $input_fasta,     # input
    "-o "  . $cdhit_output,    # output
  );

  my $command_line = join(' ', @cmd_parts);
  $self->warning("Trying to run CDHIT EST clustering: $command_line");
  $self->_execute($command_line);
}

# Run a command, capturing its output, and throw if it did not succeed.
#
#   $cmd - command line, passed to system() as a single string
#
# Throws (via $self->throw) when the command cannot be started, is killed by
# a signal or exits with a non-zero status. The message names the reason and
# includes whatever the command wrote to STDERR.
sub _execute {
  my $self = shift;
  my ($cmd) = @_;

  my $os_error = '';
  my (undef, $stderr, $status) = capture {
    my $rc = system($cmd);
    # For a failed launch the reason is in $!; save it straight away, before
    # anything else can overwrite it.
    $os_error = "$!" if $rc == -1;
    $rc;
  };

  if ($status) {
    # Decode the wait status returned by system().
    my $reason;
    if ($status == -1) {
      $reason = "failed to start: $os_error";
    }
    elsif ($status & 127) {
      $reason = sprintf("died with signal %d", $status & 127);
    }
    else {
      $reason = sprintf("exited with status %d", $status >> 8);
    }
    $self->throw("Cannot execute $cmd ($reason):\n$stderr");
  }
}


# Get the cluster families and the representative of every cluster from a
# '.clstr' file.
#
#   $file - path of the '.clstr' file to read
#   $out  - hash reference that _process() fills with one entry per cluster
#
# A line starting with '>' opens a new cluster. Member lines are expected to
# look like '<n> <length>, ><id>... at <details>' or '<n> <length>, ><id>... *'
# and the member flagged with '*' is taken as the cluster representative.
# An id is split at the first '#' into a family id and a family name; the
# trailing '_<n>' that fasta_preprocess() appended is removed from the name
# and kept as the suffix of the representative.
#
# Throws (via $self->throw) when the file cannot be opened.
sub process_clusters {
  my ($self, $file, $out) = @_;

  # Three-argument open, so the file name is never parsed for a mode.
  open(my $fh, '<', $file)
    or $self->throw("Cannot open clusters file '$file':\n$!");

  my %cluster_data;
  while (my $line = <$fh>) {
    chomp $line;

    # A cluster header closes the previous cluster and starts a fresh one.
    if ($line =~ m/^>/) {
      _process(\%cluster_data, $out);
      %cluster_data = (families => {}, chosen => '', chosen_pfx => '', chosen_sfx => '');
      next;
    }

    next unless $line =~ m/^\d+\s+[^,]+,\s+>([^\s]+)\s+(at|\*)(?:\s+|$)/;
    my ($id, $chosen) = ($1, $2);
    $id =~ s/\.\.\.$//;

    # Split at the first '#' only, so a further '#' in the family name
    # cannot cut off the numeric suffix at the end of the id.
    my ($family_id, $family_name) = split(/#/, $id, 2);
    $family_id = '' unless defined $family_id;

    # Read $1 only when the substitution matched: after a failed match the
    # capture variables still hold the result of an earlier match.
    my $ln_sfx = '';
    if (defined $family_name) {
      $ln_sfx = $1 if $family_name =~ s/_(\d+)$//;
    }
    else {
      # Id without a family part: the suffix sits on the bare id and the
      # member counts as unclassified.
      $ln_sfx = $1 if $family_id =~ s/_(\d+)$//;
      $family_name = 'Unknown';
    }
    $cluster_data{families}->{$family_name}++;

    if ($chosen eq "*") {
      $cluster_data{chosen}     = $id;
      $cluster_data{chosen_pfx} = $family_id;
      $cluster_data{chosen_sfx} = $ln_sfx;
    }
  }

  # The last cluster has no following header to flush it.
  _process(\%cluster_data, $out);
  close($fh);
}

# Record the replacement id of one cluster's representative sequence.
#
#   $data - hash reference describing the cluster:
#             families   - member count per family name
#             chosen     - id of the representative sequence
#             chosen_pfx - part of that id before the '#'
#             chosen_sfx - numeric suffix taken from that id
#   $out  - hash reference; gains the entry
#             chosen => '<chosen_pfx>_<chosen_sfx>#<family>'
#
# <family> is the most frequent family name other than 'Unknown', or
# 'Unknown' when there is no other. Families with equal counts are ordered
# by name, so the result does not depend on hash iteration order.
# Does nothing when $data is empty or has no representative.
sub _process {
  my ($data, $out) = @_;

  if ($data && %$data && $data->{chosen}) {
    my $fm = $data->{families} || {};

    my @ranked = sort { $fm->{$b} <=> $fm->{$a} or $a cmp $b }
                 grep { $_ ne 'Unknown' } keys %$fm;
    my $chosen_family = @ranked ? $ranked[0] : "Unknown";

    $out->{ $data->{chosen} } =
      $data->{chosen_pfx} . '_' . $data->{chosen_sfx} . '#' . $chosen_family;
  }
  return;
}


# Write the final clustered FASTA file with the updated family names.
#
#   $clusters - hash reference mapping a sequence id to its replacement id
#   $infile   - FASTA file to read
#   $outfile  - FASTA file to write
#
# Only records whose display id is a key of $clusters are written, each with
# its display id replaced by the mapped value; all other records are dropped.
sub gen_final_fasta {
  my ($self, $clusters, $infile, $outfile) = @_;

  my $reader = Bio::SeqIO->new(-file => "$infile",   -format => 'Fasta');
  my $writer = Bio::SeqIO->new(-file => ">$outfile", -format => 'Fasta');

  while ( my $record = $reader->next_seq() ) {
    my $current_id = $record->display_id();
    if (exists $clusters->{$current_id}) {
      $record->display_id($clusters->{$current_id});
      $writer->write_seq($record);
    }
  }
}


1;