#!/usr/bin/perl --
use warnings;
use strict;
use English;
use FindBin qw($RealBin);
use Getopt::Long;
use IPC::Open3;

# Quick check that they've supplied some arguments
unless (@ARGV) {
	die "SeqMonk non-interactive project creator\nUsage is seqmonk_import --outfile [out.smk] --genome [Species/assembly] [file1.bam] [file2.bam]\nRun seqmonk_import --help for more options\n";
}

# Check to see if they've mistakenly downloaded the source distribution
# since several people have made this mistake

if (-e "$RealBin/uk/ac/babraham/SeqMonk/SeqMonkApplication.java") {
	die "This is the source distribution of SeqMonk.  You need to get the compiled version if you want to run the program\n";
}


if ($ENV{CLASSPATH}) {
	$ENV{CLASSPATH} .= ":$RealBin:$RealBin/htsjdk.jar:$RealBin/Jama-1.0.2.jar:$RealBin/commons-math3-3.5.jar";
}
else {
	$ENV{CLASSPATH} = "$RealBin:$RealBin/htsjdk.jar:$RealBin/Jama-1.0.2.jar:$RealBin/commons-math3-3.5.jar";
}

# Temp for windows stuff...
#$ENV{CLASSPATH} =~ s/:/;/g;

# We need to make sure the language is English so we can parse the output of some
# standard commands, however if we leave this in place it can break things for people
# in other locales so we need to restore the original value.

my $original_lang = undef;

if (exists $ENV{LANG}) {
	$original_lang = $ENV{LANG};
}	

$ENV{LANG}='C';

my @java_args= ("-Djava.awt.headless=true");

# We now need to scan the command line for switches which we're going
# to pass on to the main java program.

my $help;
my $memory;
my $mapq;
my $genome;
my $outfile;
my $spliced;
my $introns;
my $unspliced;

my $result = GetOptions('help' => \$help,
						'memory=i' => \$memory,
						'genome=s' => \$genome,
						'outfile=s' => \$outfile,
						'spliced' => \$spliced,
						'introns' => \$introns,
						'unspliced' => \$unspliced,
						'mapq=i' => \$mapq,
						 );

my @files = @ARGV;

# Check the simple stuff first

if ($help) {
	# Just print the help and exit
	print while(<DATA>);
	exit;
}

unless (@files) {
	die "No files to import specified\n";
}

unless ($outfile) {
	die "No output file (--outfile) supplied\n";
}

unless ($genome) {
	die "No genome (--genome) supplied\n";
}

unless ($genome =~ /^[^\/]+\/[^\/]+$/) {
	die "Genome '$genome' didn't appear to have the correct 'species/assembly' structure\n";
}

if (defined $mapq) {
	if ($mapq < 0 or $mapq > 255) {
		die "MAPQ ($mapq) must be in the range 0-255\n";
	}
}
else {
	# We set this automatically
	$mapq = -1;
}

if ($unspliced and $spliced) {
	die "You can't set both spliced and unspliced on the same import\n";
}

if ($unspliced and $introns) {
	die "You can't set both introns and unspliced on the same import\n";
}

# Introns implies spliced even if they didn't say it
if ($introns) {
	$spliced = 1;
}


if ($memory) {
	if ($memory < 500) {
		die "Memory allocation must be at least 500M";
	}
}
else {
	$memory = determine_optimal_memory();
}



# We now need to correct for the fact that java doesn't honour the
# heap size you set
my $actual_memory = correct_memory($memory);

warn "Correcting for VM actual requested allocation for $memory is $actual_memory\n";

unshift @java_args,"-Xmx${actual_memory}m";

# Expand the default stack size to allow for large recursive sorts
unshift @java_args,"-Xss4m";

# Work out the value we're using for splicing, 0 = auto, 1 = unspliced, 2 = spliced, 3=spliced_introns
my $spliced_value = 0;
$spliced_value = 1 if ($unspliced);
if ($spliced) {
	if ($introns) {
		$spliced_value = 3;	
	}
	else {
		$spliced_value = 2;
	}
}


# Put the environment back to how we found it
if (defined $original_lang) {
	$ENV{LANG} = $original_lang;
}
else {
	delete $ENV{LANG};
}

exec "java",@java_args, "uk.ac.babraham.SeqMonk.Importer.SeqMonkImporter", $genome, $outfile, $spliced_value,$mapq, @files;

sub print_error {
	
	# We wrap errors like this so we can keep a windows shell window open
	# so the user can see any errors we generate
	
	my ($error) = @_;
	
	warn $error;
	
	$_ = <STDIN>;
	
	exit 1;
}

sub correct_memory {
	
	my ($requested_memory) = @_;
	
	my $actual_memory = `java -Xmx${requested_memory}m uk.ac.babraham.SeqMonk.Utilities.ReportMemoryUsage`;
	
	if ($actual_memory =~ /^(\d+)/) {
		$actual_memory = $1;
		
		$actual_memory = int($requested_memory * ($requested_memory/$actual_memory));
		
		return $actual_memory;
	}
	else {
		warn "Failed to correct requested memory. Request output was '$actual_memory'";
		return $requested_memory;
	}
}

sub determine_optimal_memory {
	
	# We'll set a ceiling for the memory allocation.  On a 32-bit OS this is going
	# to be 1536m (the max it can safely handle), on a 64-bit OS we won't take more
	# than 6GB
	my $max_memory = 1331;
	
	# We need not only a 64 bit OS but 64 bit java as well. It's easiest to just test
	# java since the OS support must be there if you have a 64 bit JRE.
	
	my $is_java_7 = 0;
	
	my ($in,$out);
	open3(\*IN,\*OUT,\*OUT,"java -version") or print_error("Can't find java");
	close IN;
	while (<OUT>) {
		if (/64-Bit/) {
			$max_memory = 10240;
		}
		if (/build 1\.7/) {
			$is_java_7 = 1;
		}
	}
	close OUT;
		
	warn "Memory ceiling is $max_memory\n";
	
	# The way we determine the amount of physical memory is OS dependent.
	my $os = $^O;
	
	my $physical;
	if ($os =~ /Win/) {
		$physical = get_windows_memory($max_memory);
	}
	elsif ($os =~/darwin/) {
		$physical = get_osx_memory($max_memory);
		
		# Add the OSX specific options to use a standard OSX menu bar
		# and set the program name to something sensible.
				
		push @java_args, '-Xdock:name=SeqMonk';
		push @java_args, "-Xdock:icon=$RealBin/../Resources/seqmonk.icns";

		# Java7 on OSX has a bug in the apple screen bar which prevents the
		# menu items from being updated correctly resulting in them staying
		# disabled when they should be enabled.  We'll only enable this view
		# when the JRE is an older version.
		
		# Update - this seems to be fixed in newer versions of java 1.7 so 
		# we'll revert this
		
#		unless ($is_java_7) {		
			push @java_args, '-Dapple.laf.useScreenMenuBar=true';
#		}
	}
	elsif ($os =~ /bsd/) {
		$physical = get_osx_memory($max_memory);	
	}
	else {
		$physical = get_linux_memory($max_memory);
	}
	
	warn "Raw physical memory is $physical\n";
	
	# We then set the memory to be the minimum of 2/3 of the physical
	# memory or the ceiling, whichever is lower.
	$physical = int(($physical/3)*2);
	
	if ($max_memory < $physical) {
		return $max_memory;
	}
	
	warn "Using $physical MB of RAM to launch seqmonk\n";
	return $physical;
	
}

sub get_linux_memory {
	# We get the amount of physical memory on linux by parsing the output of free
	
	open (MEM,"free -m |") or print_error("Can't launch free on linux: $!");
	
	while (<MEM>) {
		if (/^Mem:\s+(\d+)/) {
			return $1;
		}
	}
	
	close MEM;
	
	print_error("Couldn't parse physical memory from the output of free");
}

sub get_osx_memory {
	
	# We get the amount of physical memory on OSX by parsing the output of top
	
	open (MEM,"top -l 1 -n 0 |") or print_error("Can't get amount of memory on OSX: $!");
	
	# Output from pre-mavericks OSX looks like this:
	#
	# Processes: 143 total, 5 running, 2 stuck, 136 sleeping, 842 threads
	# 2013/10/24 08:22:58
	# Load Avg: 1.47, 1.12, 0.90
	# CPU usage: 2.12% user, 10.63% sys, 87.23% idle
	# SharedLibs: 1040K resident, 0B data, 0B linkedit.
 	# MemRegions: 33232 total, 2277M resident, 99M private, 781M shared.
	# PhysMem: 1650M wired, 3229M active, 1137M inactive, 6016M used, 2168M free.
	# VM: 306G vsize, 1026M framework vsize, 5152376(0) pageins, 0(0) pageouts.
	# Networks: packets: 3155246/3187M in, 2197390/1031M out.
	# Disks: 214828/7466M read, 673736/15G written.
	#
	# In Mavericks it changed to look like this:
	#
	# Processes: 172 total, 3 running, 6 stuck, 163 sleeping, 817 threads 
	# 2013/10/24 08:27:53
	# Load Avg: 1.42, 1.78, 2.59 
	# CPU usage: 11.66% user, 11.66% sys, 76.66% idle 
	# SharedLibs: 12M resident, 14M data, 0B linkedit.
	# MemRegions: 28889 total, 2123M resident, 104M private, 371M shared.
	# PhysMem: 6442M used (1287M wired), 1743M unused.
	# VM: 388G vsize, 1065M framework vsize, 104(0) swapins, 126(0) swapouts.
	# Networks: packets: 2711222/3332M in, 387599/29M out.
	# Disks: 340088/8060M read, 174291/11G written.
	#
	# We're parsing the PhyMem line in each case to get the full amount of
	# system memory.
	
	my $total_mem = 0;
	
	while (<MEM>) {
		if (/^PhysMem:.*?(\d+)([MG])\s+used.*?(\d+)([MG])\s+(free|unused)/) {
		
			if ($2 eq 'G') {
				$total_mem += $1 * 1024;
			}
			else {
				$total_mem += $1;			
			}
			if ($4 eq 'G') {
				$total_mem += $3 * 1024;
			}
			else {
				$total_mem += $3;
			}
		}	

	}
	
	close MEM;
	
	unless ($total_mem) {
		print_error("Could't parse physical memory from the output of top");
	}
	
	return $total_mem;
	
}

sub get_windows_memory {
	
	warn "Getting windows physical memory\n";
	
	# This code was adapted from an answer posted by Tom Feiner on
	# stackoverflow
	#
	# http://stackoverflow.com/questions/423797/how-can-i-find-the-exact-amount-of-physical-memory-on-windows-x86-32bit-using-per
	
	my ($max_memory) = @_;
	
	eval {
		require Win32::OLE;
		Win32::OLE->import (qw( EVENTS HRESULT in ));
		1;
	} or do {
		print_error("Couldn't load Win32 module to determine windows memory");
	};
	
    my $WMI = Win32::OLE->GetObject( "winmgmts:{impersonationLevel=impersonate,(security)}//./" ) || print_error ("Could not get Win32 object: $OS_ERROR");
    my $total_capacity = 0;

	foreach my $object (in($WMI->InstancesOf( 'Win32_PhysicalMemory' ))) {
		$total_capacity += $object->{Capacity};
	}

    my $total_capacity_in_mb = $total_capacity / (1024*1024);
        
    return $total_capacity_in_mb;
}

__DATA__

      SeqMonk Importer - Creating SeqMonk Projects from the command line

SYNOPSIS

	seqmonk_import [--(un)spliced] [--mapq=20] --genome "Mus musculus/GRCm38" --outfile out.smk *.bam


DESCRIPTION

	This script allows you to run seqmonk in a non-interactive mode to read in
	a number of BAM or Bismark coverage files and save these into a single project 
	file which you can then transfer to an interactive server to do further 
	downstream analysis.
    
    The options for the program as as follows:

    --genome        The genome to use for the import.  This is specified as
                    species/assembly and must match an existing genome in your
                    seqmonk genomes folder.
	               
    --outfile       The name of the file you want to write the project to

    --spliced       Split spliced reads so you only see the exonic parts. Will
                    be added by default if any of the first 100,000 reads in
                    the first imported BAM file have a splice site in them. 
                    Adding this flag overrides the auto-detection.
	                
    --unspliced     No not split spliced reads even if they are present.  This
                    flag overrides the default auto-detection.

    --introns     	When importing with --spliced import introns rather than exons
	                
    --mapq          Value to use as a MAPQ cutoff for imported reads.  Defaults
                    to 20 if any of the first 100,000 reads has a value above 20.

    -h --help       Print this help file and exit
    
    -m --memory     Set the starting memory allocation in megabytes. Defaults
                    to 1300. Minimum allowed value is 500 and values above
                    1300 should only be set on systems running a 64-bit JRE
                        
BUGS

    Any bugs in SeqMonk should be reported either to simon.andrews@babraham.ac.uk
    or in the github issue tracker.
                   
    