#!/usr/bin/env perl

=head1 NAME

locarnate

=head1 SYNOPSIS

locarnate [--pairwise_parameters <parameters> --tcoffee_parameters <parameters>
           --pp_preprocessing_parameters <parameters> --help --pairwise_aligner <locarna or exparnap>
           --output <outputFolder> --library-only --library-name <name>
           --additional-libraries <list_of_libraries>] <inputFile>

Creates MSAs using different pairwise aligners and TCoffee as
a multiple alignment tool

=head1 AUTHORS

Niklas Meinzer <meinzern@informatik.uni-freiburg.de>

=head1 OPTIONS

=over 4

=item B<--pairwise_parameters>

parameters for the pairwise alignment step

=item B<--tcoffee_parameters>

parameters for tcoffee

=item B<--pp_preprocessing_parameters>

parameters for locarna_rnafold_pp

=item B<--output>

path to output folder

=item B<--library-only>

if given, only the t-coffee library file is written; the final
multiple alignment step (the t-coffee call) is skipped

=item B<--library-name>

the file name of the produced t-coffee library
                            (default: "input.lib")

=item B<--additional-libraries>

a comma separated list of paths to t-coffee library files that should
be given to t-coffee in addition to the one generated during this run

=item B<--pairwise_aligner>

which pairwise aligner to use (default: locarna)

=item B<--scaling_factor>

each edge from each EPM gets the same weight (exparnap only!)

=item B<--help>

display this help message and exit

=back

=cut

use strict;
use warnings;


# Locate the installation relative to this script: the private Perl modules
# are added to @INC from ../lib/perl, and $prefix / $bindir are used further
# down to build the paths of helper binaries and data files.
use FindBin;
use lib "$FindBin::Bin/../lib/perl";
my $prefix = "$FindBin::Bin/..";
my $bindir = "$FindBin::Bin";

## the t_coffee executable to use (looked up in the search path)
my $t_coffee = "t_coffee";

## vienna package programs (looked up in the search path)
my $RNAfold = "RNAfold";
my $RNAalifold = "RNAalifold";


# command line handling and usage output generated from the POD above
use Pod::Usage;
use Getopt::Long;

use FileHandle;
use DirHandle;
use FindBin;

# project modules, found through the "use lib" line above
use MLocarna;
use MLocarna::PairwiseAligners;

use lib $FindBin::RealBin;

### global variables ###
my $locarna = "$bindir/locarna";
my $base_path = 'results';
my $ribosum_file = "$prefix/share/locarna/Matrices/RIBOSUM85_60";

###########################################################################
# main
###########################################################################

### options ###

my $version = '0.9';
my $description = 'Calculates a multiple local RNA sequence structure'
    .' alignment of the sequences given by INPUT';

# All options are collected in the options hash. The assignments below are
# the defaults that stay in effect when the matching switch is not given.
my %options;
$options{"BP-Probability-Calculator"} = "locarna_rnafold_pp";
$options{TCoffee} = $t_coffee;
$options{Locarna} = $bindir . "/locarna";

# extra command line arguments handed through to the external tools
$options{tcoffee_params} = "";
$options{pairwise_aligner_params} = "";
$options{pp_calculator_params} = "";

$options{pairwise_aligner} = "locarna";

$options{library_only} = 0;
$options{library_name} = "input.lib";
$options{additional_libraries} = "";
# -1 means "no scaling factor requested"
$options{scaling_factor} = -1;

# The short spellings after "|" are aliases, so that the option names shown
# in the usage text are accepted as well as the long ones.
# A false return value means that Getopt::Long already reported an unknown
# or malformed option; stop with the usage text instead of silently
# continuing without the parameters the user intended to pass.
GetOptions("help" => \$options{help},
           "man" => \$options{man},
           "pairwise_parameters|pairwise_params=s" => \$options{pairwise_aligner_params},
           "tcoffee_parameters|tcoffee_params=s" => \$options{tcoffee_params},
           "pp_preprocessing_parameters|pp_calculator_params=s" => \$options{pp_calculator_params},
           "library-only" => \$options{library_only},
           "output=s" => \$options{output_path},
           "pairwise_aligner=s" => \$options{pairwise_aligner},
           "library-name=s" => \$options{library_name},
           "additional-libraries=s" => \$options{additional_libraries},
           "scaling_factor=i" => \$options{scaling_factor}
           ) or pod2usage(2);

pod2usage(1) if ($options{help});
pod2usage(-exitstatus => 0, -verbose => 2) if $options{man};

# only the two aligners handled further down are supported
if ( ($options{pairwise_aligner} ne "locarna") && ($options{pairwise_aligner} ne "exparnap")){
    print STDERR "Unsupported pairwise aligner: " . $options{pairwise_aligner} . "\n";
    exit(1);
}

# the scaling factor is an integer option, so compare it numerically
if($options{pairwise_aligner} eq "locarna" && $options{scaling_factor} != -1){
    print STDERR "scaling factor not implemented for locarna \n";
    exit(1);
}

## Require that an executable program exists in the search path.
##
## $prg  name of the program; a name that contains a directory part is
##       checked as given instead of being looked up in PATH
##
## Returns normally when an executable file is found. Otherwise an error is
## printed to STDERR and the script terminates with exit status 1.
sub require_prog_exists_path {
    my ($prg) = @_;

    # core module, loaded here so that the sub does not depend on the
    # file's import section
    require File::Spec;

    # Search PATH directly instead of asking the external "which" through a
    # shell: no dependency on that tool and no shell interpretation of $prg.
    my @candidates =
        $prg =~ m{/}
        ? ($prg)
        : map { File::Spec->catfile($_, $prg) } File::Spec->path();

    for my $candidate (@candidates) {
        return if -f $candidate && -x _;
    }

    print STDERR "ERROR: Programm $prg cannot be found in search path!\n";
    print STDERR "    Please install and/or set path.\n";
    # a missing prerequisite is a failure, so do not report success
    exit(1);
}
## Require that an executable program exists in a given directory.
##
## $dir  directory that is expected to contain the program
## $prg  file name of the program inside $dir
##
## Returns normally when $dir/$prg is an executable file. Otherwise an error
## is printed to STDERR and the script terminates with exit status 1.
sub require_prog_exists {
    my ($dir, $prg) = @_;
    my $prgpath = $dir."/".$prg;

    # test the file itself; the concatenated path string is never empty, so
    # comparing it with "" could not detect a missing program
    return if -f $prgpath && -x _;

    print STDERR "ERROR: Programm $prg cannot be found in directory $dir!\n";
    print STDERR "    Please check your installation or (re)install.\n";
    # a missing prerequisite is a failure, so do not report success
    exit(1);
}

# all external programs are checked up front, before any work is done
require_prog_exists_path($RNAfold);
require_prog_exists_path($RNAalifold);
require_prog_exists($bindir,"locarna");
require_prog_exists($bindir,"exparna_p");
require_prog_exists_path($options{TCoffee});


### calculations ###

# The input argument is mandatory. Validate it first, so that a wrong call
# neither starts the external tools nor leaves an empty output directory.
pod2usage("Not enough arguments!\n") if scalar(@ARGV) < 1;

# sequence input is either a fasta file or a folder containing the
# .pp files, if preprocessing should be skipped
my $sequence_input = $ARGV[-1];

if (!-e $sequence_input) {
    print STDERR "No such file or directory: " . $sequence_input . "\n";
    exit(1);
}

# check t_coffee parameters
my $checkresult = check_tcoffee_params(\%options, $options{tcoffee_params});
if ($checkresult ne "") {
    print STDERR "Tcoffee parameters invalid. Tcoffee says: " . $checkresult . "\n";
    exit(1);
}

# check pairwise aligner parameters
if ($options{pairwise_aligner} eq "locarna") {
    $checkresult = check_locarna_params(\%options, $options{pairwise_aligner_params});
    if ($checkresult ne "") {
        print STDERR "Locarna parameters invalid. Locarna says: " . $checkresult . "\n";
        exit(1);
    }
}

# check locarna_rnafold_pp parameters
$checkresult = check_pp_parameters(\%options, $options{pp_calculator_params});
if ($checkresult ne "") {
    print STDERR "Locarna_rnafold_pp parameters invalid. Locarna_rnafold_pp says: " . $checkresult . "\n";
    exit(1);
}

# all results are written below this directory
my $results_path = "test_results";
if(defined $options{output_path}) {
    $results_path = $options{output_path};
}

# an already existing directory is reused; any other failure is fatal,
# because every later step writes into it
unless (-d $results_path) {
    mkdir($results_path)
        or die "Cannot create output directory $results_path: $!\n";
}

# if the given file is not a directory, assume it's a fasta file and
# proceed with preprocessing
my $sequences;
# pp_dir is the directory the .pp files can be found
my $pp_dir;
if (!(-d $sequence_input)) {
    $sequences = MLocarna::read_fasta($sequence_input);

    calculate_basepair_probabilities(\%options, $results_path, $sequences);
    # pp files are now under results_path/pp
    $pp_dir = $results_path . "/pp";
} else {
    $sequences = parse_pp_directory($sequence_input);
    $pp_dir = $sequence_input;
}

# directory passed (via $results_path) to the pairwise alignment step
my $pair_dir = $results_path . "/pair";
unless (-d $pair_dir) {
    mkdir($pair_dir)
        or die "Cannot create directory $pair_dir: $!\n";
}

### Calculate pairwise alignments
my $libfile = ($results_path . "/" . $options{library_name});
if ($options{pairwise_aligner} eq "locarna") {
    my $pair_alignments =
      MLocarna::PairwiseAligners::locarna_compute_pairwise_alignments(
        \%options, $results_path, $sequences, $bindir, $pp_dir);

    # write the t-coffee library file
    MLocarna::write_tcoffee_lib_file($libfile, $pair_alignments);
} elsif ($options{pairwise_aligner} eq "exparnap") {
    my $pair_alignments =
      MLocarna::PairwiseAligners::exparna_compute_pairwise_alignments(
        \%options, $results_path, $sequences, $bindir, $pp_dir);

    MLocarna::PairwiseAligners::exparna_result_to_tcoffee_lib_file(
        $libfile, $pair_alignments, $sequences);
}

# finally call tcoffee to calculate the multiple alignment
if(not $options{library_only}) {
    calculate_mult_alignment(\%options, $results_path, $libfile);
} else {
    print "Tcoffee library file written to $libfile\n";
    exit(0);
}
#~
#~ my $mult_alignment = parse_mult_alignment($options, $results_path,
                                          #~ $sequences);
#~
#~ my $cons_rates = determine_cons_rates($options, $sequences,
                                      #~ $pair_alignments, $mult_alignment);
#~ # \@cons_rates
#~ # []  $column_rate
#~
#~ # determine_exclusions($options, $sequences, $pair_alignments,
#~ #                      $mult_alignment, $cons_rates);
#~
#~ calculate_cons_data($options, $results_path,
                    #~ $mult_alignment, $cons_rates);
#~
#~ my $cons_data = parse_cons_data($options, $results_path, $cons_rates);
#~ # \%cons_data
#~ # {'seq'}    $seq
#~ # {'struc'}  $struc
#~
#~ # join_exclusions($options, $sequences, $mult_alignment,
#~ #                 $cons_rates, $cons_data);
#~
#~ write_clustal($options, $results_path, $sequences,
              #~ $mult_alignment, $cons_rates);
#~ write_fasta($options, $results_path, $sequences,
            #~ $mult_alignment, $cons_rates);
#~
#~ write_stockholm($options, $results_path, $sequences,
                #~ $mult_alignment, $cons_rates, $cons_data);
#~
#~ write_flat($options, $results_path, $sequences,
           #~ $mult_alignment, $cons_rates, $cons_data);
#~
#~ cleanup($options, $results_path, $sequences);


###########################################################################
# subs
###########################################################################

# Calculates the base pair probabilities for all input sequences.
#
# Every sequence is written in fasta format (tag ">S<n>", n counted from 1)
# to the standard input of the tool configured as "BP-Probability-Calculator"
# in the options, which is asked to write its result to
# <results_path>/pp/S<n>.pp.
#
# $options       reference to the options hash; the entries
#                "BP-Probability-Calculator" and pp_calculator_params are read
# $results_path  output directory; the sub directory "pp" is created in it
# $sequences     reference to an array of hashes with a "seq" entry
#
# Dies if the pp directory cannot be created, if the tool cannot be started
# or if the tool reports a failure for one of the sequences.
sub calculate_basepair_probabilities {
  my ($options, $results_path, $sequences) = @_;

  print STDERR 'Calculate pair probabilities...'."\r";

  my $pp_path = $results_path.'/pp';
  unless (-d $pp_path) {
    mkdir($pp_path) or die "Cannot create directory $pp_path: $!\n";
  }

  # the parameters are one user supplied string, so the call is a shell
  # command line and not an argument list
  my $pp_tool_call = $bindir . "/" . $options->{"BP-Probability-Calculator"}
    . " " . $options->{pp_calculator_params};

  # If the tool terminates early, writing to the pipe raises SIGPIPE, which
  # would end this script without any message. Ignore the signal so that the
  # failure is reported by close() below.
  local $SIG{PIPE} = 'IGNORE';

  for my $i (0 .. $#{$sequences}) {
    my $number = $i + 1;
    my $tag = '>S'.$number;

    # the filename of the pp file
    my $pp_file = $results_path . "/pp/S" . $number . ".pp";

    my $sequence = $sequences->[$i]->{seq};

    # open pipe to pp tool
    open (my $PIPE, "|-", "$pp_tool_call -o $pp_file")
      or die "Cannot start $pp_tool_call: $!\n";
    print {$PIPE} $tag . "\n" . $sequence . "\n";

    # close() waits for the tool; it fails on a write error ($! is set) or
    # when the tool ends with a non-zero status (status is in $?)
    close($PIPE)
      or die "Base pair probability calculation failed for sequence S$number"
           . ($! ? " (closing pipe: $!)" : " (exit status " . ($? >> 8) . ")")
           . "\n";
  }

  print STDERR 'Calculate pair probabilities... done!'."\n";
}

# Reads the pairwise alignment files of all sequence pairs.
#
# For every pair (i, j) with i < j the file
# <results_path>/pair/S<i>_S<j>.aln (numbers counted from 1) is read with
# MLocarna::read_clustalw_aln.
#
# $options       not used
# $results_path  directory that contains the "pair" sub directory
# $sequences     reference to the array of sequences; only its length is used
#
# Returns a reference to an array with one hash per pair, in the order
# (1,2), (1,3), ..., (2,3), ...: "rows" is the alignment as returned by the
# reader, "score" the value found after "Score: " in the file header (undef
# if the header has no such entry).
sub parse_pair_alignments {
  my (undef, $results_path, $sequences) = @_;

  my $last_index = $#{$sequences};

  print STDERR 'parse pairwise alignments...'."\r";

  my @parsed;

  for my $first (0 .. $last_index - 1) {
    for my $second ($first + 1 .. $last_index) {
      my $aln_path = join('', $results_path, '/pair/S', $first + 1,
                          '_S', $second + 1, '.aln');

      my ($header, $rows) = MLocarna::read_clustalw_aln($aln_path);

      my $score = ($header =~ m/Score: ([\d|-]+)/) ? $1 : undef;

      push @parsed, { "rows" => $rows, "score" => $score };
    }
  }

  print STDERR 'Parse pairwise alignments... done!'."\n";

  return \@parsed;
}

# Calls t_coffee to calculate the multiple alignment from library files.
#
# $options       reference to the options hash; the entries TCoffee,
#                tcoffee_params and additional_libraries are read
# $results_path  output directory; the sub directory "mult" is created in it
# $libFile       path of the t-coffee library file written by this run
#
# The alignment is written to <results_path>/mult/tcoffee.aln, the standard
# output and standard error of t_coffee to tcoffee.out and tcoffee.err in the
# same directory. The command line is printed to standard output.
#
# Dies if the mult directory cannot be created or if the t_coffee call
# fails.
sub calculate_mult_alignment {
  my ($options, $results_path, $libFile) = @_;

  my $mult_path = $results_path.'/mult';
  unless (-d $mult_path) {
    mkdir($mult_path) or die "Cannot create directory $mult_path: $!\n";
  }

  print STDERR 'Calculate multiple alignment...'."\r";

  my $aln_file_name = $results_path.'/mult/tcoffee.aln';
  my $out_file_name = $results_path.'/mult/tcoffee.out';
  my $err_file_name = $results_path.'/mult/tcoffee.err';

  # The library of this run comes first, followed by the additional
  # libraries given as comma separated list. Each entry of the -in list is
  # prefixed with "L".
  my @libraries = ($libFile);
  my $additional = $options->{additional_libraries};
  if (defined($additional) && length($additional) > 0) {
    push @libraries, split(/,/, $additional);
  }
  my $in_list = join(',', map { 'L' . $_ } @libraries);

  my $call = $options->{TCoffee} . " " . $options->{tcoffee_params} . ' -in='.$in_list.
      ' -outfile='.$aln_file_name.' 1>'.$out_file_name.
      ' 2>'.$err_file_name;

  print "\n$call\n";

  # system returns the wait status of the command, or -1 if it could not
  # be started at all
  my $code = system($call);
  if ($code != 0) {
    my $reason = $code == -1 ? "could not be started: $!"
               : ($code & 127) ? "was terminated by signal " . ($code & 127)
               : "ended with exit status " . ($code >> 8);
    die "Calculation of the multiple alignment failed: t_coffee $reason"
      . " (see $err_file_name)\n";
  }

  print STDERR 'Calculate multiple alignment... done!'."\n";
}


# Checks the parameters for the pairwise aligner locarna.
#
# Locarna is run without input files, but with all given parameters. If the
# first line of its output is the complaint about the missing first input,
# all other parameters were accepted.
#
# $options     reference to the options hash; the entry Locarna (the
#              command to run) is read
# $parameters  parameter string to check
#
# Returns "" if the parameters are ok, otherwise the first output line with
# the text before the first colon removed (an attempt to drop the path of
# the binary), or a fixed message if locarna printed nothing at all.
sub check_locarna_params {
    my ($options, $parameters) = @_;

    my $call = $options->{Locarna} . " $parameters 2>&1";

    my @output = `$call`;

    # without any output there is no complaint about the missing input, so
    # nothing shows that the parameters were accepted
    return "locarna produced no output" unless @output;

    my $error = $output[0];
    if ($error =~ /ERROR --- Mandatory option and\/or argument missing: <Input 1>/) {
        return "";
    }

    # Try to remove the binary path from the error message. The limit of 2
    # cuts at the first colon only, so that the rest of a message that
    # contains further colons is kept.
    my @errorsplit = split /:/, $error, 2;
    if (scalar(@errorsplit) > 1) {
        $error = $errorsplit[1];
    }
    chomp $error;
    return $error;
}

# Checks the parameters for t_coffee.
#
# The command stored in the TCoffee entry of the options hash is run with
# the given parameter string and its output (standard output and standard
# error) is searched for a complaint about the parameters.
#
# Returns the first output line that contains "needs a value" or
# "IS NOT A PARAMETER" (without trailing newline), or "" if there is no
# such line.
sub check_tcoffee_params {
    my ($options, $parameters) = @_;

    my $command = $options->{TCoffee} . " $parameters 2>&1";

    for my $message (`$command`) {
        next unless $message =~ /needs a value/
                 || $message =~ /IS NOT A PARAMETER/;
        chomp $message;
        return $message;
    }

    return "";
}

# Checks the parameters for the base pair probability calculator.
#
# The tool named by the "BP-Probability-Calculator" entry of the options
# hash is run from $bindir with all given parameters and the name of an
# input file that is not expected to exist. If the first line of its output
# contains "Error in input format", the parameters themselves were accepted.
#
# $options     reference to the options hash
# $parameters  parameter string to check
#
# Returns "" if the parameters are ok, otherwise the first output line with
# the text before the first colon removed (an attempt to drop the path of
# the binary), or a fixed message if the tool printed nothing at all.
sub check_pp_parameters {
    my ($options, $parameters) = @_;

    my $tool = $options->{"BP-Probability-Calculator"};
    my $call = $bindir . "/" . $tool . " $parameters 2>&1";

    my @output = `$call hopefully_this_file_does_not_exist.pdf.txt`;

    # without any output there is no complaint about the input file, so
    # nothing shows that the parameters were accepted
    return "$tool produced no output" unless @output;

    my $error = $output[0];

    if ($error =~ /Error in input format/) {
        return "";
    }

    # Try to remove the binary path from the error message. The limit of 2
    # cuts at the first colon only, so that the rest of a message that
    # contains further colons is kept.
    my @errorsplit = split /:/, $error, 2;
    if (scalar(@errorsplit) > 1) {
        $error = $errorsplit[1];
    }
    chomp $error;
    return $error;
}

# Collects the sequences of all .pp files in a directory.
#
# $dir  directory that contains the .pp files
#
# Returns a reference to an array of hashes with the entries "name" (file
# name without the extension .pp) and "seq" (sequence read from the file by
# get_sequence_from_pp_file). The entries are sorted numerically by the name
# without its first character.
# NOTE(review): the sort assumes names of the form S1, S2, ... Sn; other
# names compare as numbers as far as Perl can convert them.
#
# Dies if the directory cannot be read.
sub parse_pp_directory {
    my ($dir) = @_;

    opendir(my $DIR, $dir) or die "Cannot open directory $dir: $!\n";

    my @result;

    # iterate over files in dir
    while (my $file = readdir($DIR)) {
        # The pattern is anchored: only names that end in .pp are accepted,
        # not e.g. backup copies like S1.pp.bak.
        next unless $file =~ /\A(.+)\.pp\z/;
        my $name = $1;

        push(@result, { "name" => $name,
                        "seq" => get_sequence_from_pp_file($dir . "/" . $file) });
    }
    closedir($DIR);

    ## sort sequences according to the order in the original fasta file (S1,S2,...Sn)
    ## (names with the same number are ordered by name, so that the result
    ## does not depend on the order in which the directory is read)
    @result = sort {
        substr($a->{name}, 1) <=> substr($b->{name}, 1)
            or $a->{name} cmp $b->{name}
    } @result;
    return \@result;
}


# Reads the sequence from a .pp file.
#
# The sequence is expected on the third line of the file, as second
# whitespace separated field (after the name of the sequence).
#
# $file  path of the .pp file
#
# Returns the sequence, or "" if the file has fewer than three lines or the
# third line does not have two fields. Dies if the file cannot be opened.
sub get_sequence_from_pp_file {
    my ($file) = @_;

    open(my $FILE, "<", $file) or die "Cannot open pp file $file: $!\n";

    # only the third line is needed, so do not read the rest of the file
    my $seqline;
    for (1 .. 3) {
        $seqline = <$FILE>;
        last unless defined $seqline;
    }
    close($FILE);

    # file too short
    return "" unless defined $seqline;

    if ($seqline =~ /\S+\s+(\S+)/) {
        return $1;
    }
    return "";
}
