#!/usr/bin/perl -w
use Getopt::Long;
use Data::Dumper;

# File-level state shared by the subs in this script.  The scalar options are
# filled in by options() from the command line; the two hashes are filled in
# by novo2paf() and read by scoreHist().

# -out flag: when true, writeopts() sends the converted output to
# "<input>.<ext>" instead of STDOUT.
my $out;
# -gff flag: when true, GFF lines are written and the output extension is "gff".
my $gff;
# -t value.  NOTE(review): nothing else in this file reads $type.
my $type="novopaired";
# -max value: initialize_scores() creates one histogram row for every score
# from 0 up to and including this number.
my $scoremax=500;
# -l value.  NOTE(review): novo2paf() declares its own $readlength from the
# length of each read, so this file-level value does not reach the conversion.
my $readlength = 36;
# -sedfile value: path handed to "sed -f" by writeopts() to post-process output.
my $sedfile;
# -help flag: makes options() show the POD and exit.
my $help;
# -scoresdist flag: when true, scoreHist() is called after all files are done.
my $scores;
# $Scores{$file}{$mapqual}: number of records seen with that mapping quality.
my %Scores;
# $Scorecount{$file}: number of records counted for that file.
my %Scorecount;


=head1 SYNOPSIS

	Zayed Albertyn 2008

	Convert Novo paired native format to PAF/GFF format

=head2 Discussion

	This program processes one or more files given on the command line and converts each of them to PAF.
	Gzip-compressed input files (names ending in .gz) are accepted.


=head2 Usage

	-gff Boolean. Alternatively convert the novo format to General Feature Format (GFF)

	
	-sedfile Provide a sed-style file for converting output e.g. chromosome headers
		e.g. 

		s/A/B/;
		s/NC_00343/chr2/
		s/|ref|20223/chrX/;
		
	-scoresdist  Print a distribution of the mapping quality scores from the novo report (may be abbreviated to -scores)

	-max The maximum score to print if -scoresdist is used. Default is 500.

	-o  Automatically write to an output file with a .paf extension (or .gff if gff is selected)

	-pafeval  Do pafeval. Must be called with -o or the program ignores it. paf_utils.pl needs to be in your PATH env variable


=cut

# -pafeval flag (set by options()): run "paf_utils.pl pafeval" on each PAF
# file that was written with -out.
my $pafeval;

# Parse the command line; the input file names are what is left in @ARGV.
options();

# $file and %scorerange are file-level on purpose: the subs defined further
# down in this file refer to them.
my $file;
my %scorerange;
initialize_scores();

# Convert every input file named on the command line.
foreach my $input (@ARGV) {
	$file = $input;
	print STDERR "Input:$file\n";
	my $outfile = novo2paf($file);

	# pafeval only applies to PAF output that was written to a file (-out).
	if (!$gff && $out && $pafeval) {
		# List-form system(): the file name is passed as a single argument
		# and never interpreted by a shell.
		my $status = system('paf_utils.pl', 'pafeval', $outfile);
		if ($status == 0) {
			# The PAF file was only an intermediate for pafeval.
			unlink($outfile) or warn "Could not remove $outfile: $!\n";
		}
		else {
			# Keep the PAF file so the conversion result is not lost when
			# pafeval could not be run or reported a failure.
			warn "paf_utils.pl pafeval failed for $outfile (status $status); keeping $outfile\n";
		}
	}
}

# Print the mapping-quality histogram collected while converting.
if ($scores) {
	scoreHist();
}


# varstring(\@list)
#
# Dereferences the array reference given as the first argument and copies its
# elements into a local array.  That copy is the last expression evaluated, so
# the caller gets the elements in list context and their count in scalar
# context.
sub varstring {
	my ($list_ref) = @_;
	return my @copy = @{$list_ref};
}

# options()
#
# Parses the command line with Getopt::Long into the file-level option
# variables and leaves the remaining arguments (the input files) in @ARGV.
#
# Shows this script's POD through perldoc and exits when -help was given,
# when no input file remains, or when the command line could not be parsed.
# The exit status is 0 for the help / no-input cases and 1 for a parse error.
sub options {
	# GetOptions returns false when it met an unknown option or a bad value;
	# it has already reported the details on STDERR at that point.
	my $parsed_ok = GetOptions(
		'gff!'        => \$gff,
		'max=n'       => \$scoremax,
		'pafeval!'    => \$pafeval,
		'help!'       => \$help,
		'sedfile=s'   => \$sedfile,
		'scoresdist!' => \$scores,
		'l=n'         => \$readlength,
		'out!'        => \$out,
		't=s'         => \$type,
	);

	if (!$parsed_ok || scalar @ARGV == 0 || $help) {
		# List form: the script path is handed to perldoc as one argument
		# and is never interpreted by a shell.
		system('perldoc', $0);
		exit($parsed_ok ? 0 : 1);
	}
}


# writeopts($file)
#
# Works out where the converted output for $file goes.
#
# Returns ($cmd, $outfile):
#   $outfile  "$file.gff" when -gff is set, otherwise "$file.paf".
#   $cmd      a two-argument open() specification for the output handle:
#               " | sed -f SEDFILE > OUTFILE"  -sedfile and -out
#               " | sed -f SEDFILE"            -sedfile only
#               "+>OUTFILE"                    -out only
#               "+>&STDOUT"                    neither
#
# $outfile is returned even when nothing is written to it (no -out).
sub writeopts {
	my $file = shift;

	# $ext is lexical so it no longer leaks into a package global.
	my $ext     = $gff ? "gff" : "paf";
	my $outfile = "$file.$ext";

	my $cmd;
	if ($sedfile) {
		# Output is filtered through sed; with -out, sed's own output is
		# redirected into the output file.
		$cmd = " | sed -f $sedfile";
		$cmd .= " > $outfile" if $out;
	}
	else {
		$cmd = $out ? "+>$outfile" : "+>&STDOUT";
	}

	return ($cmd, $outfile);
}

# initialize_scores()
#
# Creates one %scorerange key, with value 1, for every integer score from 0 up
# to and including $scoremax.  scoreHist() prints one row per key.
sub initialize_scores {
	foreach my $score (0 .. $scoremax) {
		$scorerange{$score} = 1;
	}
}

# scoreHist()
#
# Prints the mapping-quality histogram to STDOUT as tab-separated text:
#   "# Score Distribution"
#   "#No. Records" followed by the record count of each file
#   "#Files"       followed by each file name
#   one row per key of %scorerange in ascending numeric order: the score,
#   then for every file the number of records with that score (0 when there
#   are none) and that file's record count.
#
# The header lines carry one column per file while the data rows carry two
# (count and total); that layout is kept as it was.
sub scoreHist {
	# One sorted file list drives both header lines and every data row.
	# Iterating %Scorecount for the header and %Scores for the rows is not
	# safe: Perl does not promise the same key order for two hashes, so the
	# columns could end up under the wrong file name.
	my @files = sort keys %Scores;

	print "# Score Distribution\n";
	printf "#No. Records\t%s\n", join("\t", @Scorecount{@files});
	printf "#Files\t%s\n", join("\t", @files);

	foreach my $val (sort { $a <=> $b } keys %scorerange) {
		print "$val\t";
		foreach my $file (@files) {
			my $total = $Scorecount{$file};
			# Scores that never occurred in this file are reported as 0.
			my $count = $Scores{$file}{$val} || 0;
			print "$count\t$total\t";
		}
		print "\n";
	}
}


# novo2paf($file)
#
# Converts one novo native report to PAF lines, or to GFF lines when -gff is
# set.  The destination (file, sed pipe or STDOUT) is chosen by writeopts().
#
#   $file   input path; a name ending in ".gz" is read through zcat and "-"
#           reads STDIN.
# Returns the output file name computed by writeopts().
# Dies when the input or the output cannot be opened or the output cannot be
# closed cleanly.
#
# Side effects: adds one count per converted record to $Scores{$file}{mapqual}
# and $Scorecount{$file}.  Records whose status field is 'R' are written as a
# fixed ".repeat" PAF line (PAF mode only) and are not counted.
sub novo2paf {
	my $file = shift;
	my ($cmd, $outfile) = writeopts($file);

	# Lexical handles with explicit modes: the input name is never parsed for
	# mode characters and never passes through a shell.
	my $in;
	if ($file =~ /\.gz$/) {
		open($in, '-|', 'zcat', $file) or die "Cannot run zcat on $file: $!";
	}
	elsif ($file eq '-') {
		# Two-argument open used to treat "-" as STDIN; keep that working.
		open($in, '<&', \*STDIN) or die "Cannot read STDIN: $!";
	}
	else {
		open($in, '<', $file) or die "Cannot open $file: $!";
	}

	# $cmd is a ready-made two-argument open() specification from writeopts().
	open(my $out_fh, $cmd) or die "Cannot open output '$cmd': $!";

	my $qual = ".";     # no quality string is written to the PAF qQual column
	my $prog = "nv";    # replaced by "name.version" when a novo header is seen

	# A named line variable keeps the caller's $_ (aliased to an @ARGV
	# element in the main loop) from being overwritten.
	while (my $line = <$in>) {
		$prog = "$1.$2" if $line =~ /^#\s(novo\S+)\s+\((\S+)\)/;
		if ($line =~ /^#/) {
			# Header/comment lines are copied through unchanged.
			print {$out_fh} $line;
			next;
		}
		$line =~ s/^@/>/;
		$line =~ s/m#\s+/m#/;
		next if $line =~ /\sNM\s|\sQL\s|\sQC\s/;

		my @F = split ' ', $line;
		next unless @F;    # a blank line carries no record

		my $readlength = length($F[2]);
		my $cigar      = $readlength . "M";
		my $qname      = $F[0];

		# Sample record with a quality column, as indexed below:
		# @22_35348060_35348242_0/2  R  TCAAG...  IIIII...  U  60  61  >22  20070357  R
		# F[1] category, F[2] sequence, F[3] quality, F[4] status,
		# F[5] score, F[6] mapping quality, F[7] target, F[8] start, F[9] strand.
		if ($F[4] eq 'R') {
			print {$out_fh} "$qname\t1\t-\t1111\t\t00\t0\t33M\t.\t$qual\t$prog.repeat\tR\n"
				unless $gff;
			next;
		}

		my $sequence = $F[2];
		my $mapqual  = $F[6];
		my ($tname, $tstart, $direction);
		if (defined $F[7] && $F[7] =~ /^>/) {
			($tname, $tstart, $direction) = @F[ 7 .. 9 ];
		}
		else {
			# One extra column (the hairpin score) sits at F[7], so the
			# target fields start one place later.  The slice used to stop
			# at F[9], which left the strand undefined for these records.
			($tname, $tstart, $direction) = @F[ 8 .. 10 ];
		}

		####### Add scores to histogram
		$Scores{$file}{$mapqual}++;
		$Scorecount{$file}++;
		##############################

		my $tstop  = $tstart + $readlength - 1;
		my $strand = ($direction eq "F") ? "+" : "-";

		# NOTE(review): this test looks at F[6], which holds the mapping
		# quality in the layout above, so it presumably only fires for
		# reports without a quality column (sample below) — confirm.
		# @22_22999804_22999920_1/1  S  AACCC...  U  92  4  >22  21896892  F  28
		my ($extra, $mutstr) = ("", "");
		if (scalar @F >= 10 && $F[6] =~ />/) {
			my @rest = @F[ 9 .. $#F ];
			$extra  = join "\t", @rest;
			$mutstr = join ",",  @rest;
		}

		if ($gff) {
			print {$out_fh} "$tname\t$prog\t$file\t$tstart\t$tstop\t$mapqual\t$strand\t.\t$qname ; $mutstr\n";
		}
		else {
			# qName qBegin strand tName tBegin score CIGAR qSeq/. qQual/.
			print {$out_fh} "$qname\t1\t$strand\t$tname\t$tstart\t$mapqual\t$cigar\t$sequence\t$qual\t$prog\t$extra\n";
		}
	}

	close($in) or warn "Problem closing input $file: " . ($! ? $! : "exit status $?") . "\n";

	# Closing the output flushes it and, for a sed pipe, waits for sed to
	# finish, so the file is complete before the caller hands it to pafeval.
	close($out_fh)
		or die "Error closing output for $file: " . ($! ? $! : "exit status $?");

	return $outfile;
}
