#!/usr/bin/perl -w

#qrna2gff.pl

use strict;
use vars qw ($opt_c $opt_I $opt_J $opt_G $opt_H $opt_g $opt_l $opt_m $opt_n $opt_q $opt_s $opt_u $opt_w $opt_x $opt_z);  # required if strict used
use Getopt::Std;
use constant GNUPLOT => '/usr/bin/gnuplot';

getopts ('c:I:J:G:H:g:l:m:n:q:s:u:w:x:z:');     # ('aci:p:o:') means 'ac' are flags, 'i:p:o:' gets following scalar.


# Print a helpful message if the user provides no input file.
if (!@ARGV) { 
        print "usage:  qrna2gff.pl [options] file.qrna\n\n";
	print "options:\n";
        print "-c <case>          :  cases (default is case = 1)\n";
        print "                        possible cases are:\n";
        print "                        0=GLOBAL\n";  
        print "                        1=LOCAL_DIAG_VITERBI 2=LOCAL_DIAG_FORWARD\n";
        print "                        3=LOCAL_SEMI_VITERBI 4=LOCAL_SEMI_FORWARD\n";
        print "                        5=LOCAL_FULL_VITERBI 6=LOCAL_FULL_FORWARD\n";
	print "-I <min_id>        : min ID for analysis                      [default min_id = 0]\n";
	print "-J <max_id>        : max ID for analysis                      [default max_id = 100]\n";
	print "-G <min_gc>        : min GC for analysis                      [default min_gc = 0]\n";
	print "-H <max_gc>        : max GC for analysis                      [default max_gc = 100]\n";
	print "-g <typetarget>    :  which type of loci you want to analyze (default is all)\n";
        print "                        possible types of loci are:\n";
        print "                        OTH | COD | RNA \n";
        print "-l <lambda>        : lambda parameter of an EVD fit\n";
        print "-m <mu>            : mu parameter of an EVD fit\n";
        print "-n <size>          : size of database\n";
	print "-q <file.q>        : include qfile to get the actual ends of the qrna call\n";
        print "-s <type_of_score> : type of score (sigmoidal | simple)       [default = sigmoidal]\n";
	print "-u <cutoff>        : default is cutoff = 0\n";
	print "-w <whichorg>      : default is whichorg = 1  (use 1-for-org1 2-for-org2 12-for-both)\n";
	print "-x <name>          : ignore given name, use this one for gff outputs\n\n";
	print "-z <file>          : give a table of cutoffs for %gc ranges\n\n";
       exit;
}
my $verbose;
undef $verbose;

my $file = shift;
my $tag;
my $type;

my $dir;
my $filename;

if ($file =~ /^(\S+)\/([^\/]+)$/) {
    $dir  = $1;
    $filename = $2;
}
else {
    $dir  = "";
    $filename = $file;
}
print "file: $filename\n";
print "dir:  $dir/\n";

my $output   = "$dir";


my $n_in_ali = 0; 
my $file_q = $opt_q;

my $typetarget; 
if ($opt_g) { $typetarget = $opt_g; } 
else        { $typetarget = "all";  }
$verbose = $typetarget;


my $whichorg ;
if ($opt_w) { $whichorg = $opt_w; }
else        { $whichorg = 1;      }
if ($whichorg != 1 && $whichorg != 2 && $whichorg != 12 ) { print "organism has to be '1' or '2' or '12'\n"; die }

my $usename;
if ($opt_x) { $usename = $opt_x; }

my $fix_cutoff;
if (defined($opt_u)) { $fix_cutoff = $opt_u; }
else                 { $fix_cutoff = 0;     }

my $id_min;
if ($opt_I) { $id_min = $opt_I; }
else        { $id_min = 0;      }
my $id_max;
if ($opt_J) { $id_max = $opt_J; }
else        { $id_max = 100;    }

my $gc_min;
if ($opt_G) { $gc_min = $opt_G; }
else        { $gc_min = 0;      }
my $gc_max;
if ($opt_H) { $gc_max = $opt_H; }
else        { $gc_max = 100;    }

my $fix_lambda = -1;
if ($opt_l) { $fix_lambda = $opt_l; }

my $fix_mu = -1;
if ($opt_m) { $fix_mu = $opt_m; }

my $fix_size;
if ($opt_n) { $fix_size = $opt_n; }

my @cutoff;
my %gc_min_cutoff;
my %gc_max_cutoff;
my %lambda;
my %mu;
my %size;

read_gc_cutoff_file("$opt_z", \@cutoff, \%gc_min_cutoff, \%gc_max_cutoff, \%lambda, \%mu, \%size);

my $spec = "ID[$id_max:$id_min].GC[$gc_max:$gc_min]";

my $type_of_score;
if ($opt_s) { $type_of_score = $opt_s;      }
else        { $type_of_score = "sigmoidal"; }
if ($type_of_score =~ /^simple$/ || $type_of_score =~ /^sigmoidal$/) { ;}
else { print "wrong type of score. options are: 'simple' or 'sigmoidal'"; die; }

my $case;
if ($opt_c) { $case = $opt_c; }
else        { $case = 1;      }

if   ($case==0) { $tag = "GLOBAL";        }
elsif($case==1) { $tag = "LOCAL_DIAG_VITERBI"; }
elsif($case==2) { $tag = "LOCAL_DIAG_FORWARD"; }
elsif($case==3) { $tag = "LOCAL_SEMI_VITERBI"; }
elsif($case==4) { $tag = "LOCAL_SEMI_FORWARD"; }
elsif($case==5) { $tag = "LOCAL_FULL_VITERBI"; }
elsif($case==6) { $tag = "LOCAL_FULL_FORWARD"; }

my $sqrt2 = sqrt(2.0);

my $gff_file = "$file.$typetarget.";
if ($opt_z) { $gff_file .= "CUTOFFvar.$spec.gff";      }
else        { $gff_file .= "CUTOFF$fix_cutoff.$spec.gff";  }

my $num = 0;

my $coor1;
my $coor2;

my $othsc;
my $rnasc;
my $codsc;

my $rnalod;
my $codlod;

my $othlodsigm;
my $rnalodsigm;
my $codlodsigm;

my $rna;
my $cod;

my $name1;
my $name2;

my @name1;
my @name2;
my $rest;

my @lloci1;
my @lloci2;

my @rloci1;
my @rloci2;

my @type1;
my @type2;

my @codsc1;
my @codsc2;

my @rnasc1;
my @rnasc2;

my $startblast1;
my $startblast2;

my $seq = 0;

my $idx = 0;

my $len;
my $len_new;
my $id;
my $id_new;
my $gc;

my $score;

my $shuffle = 0;

my $time;

my $coor1l;
my $coor1r;
my $coor2l;
my $coor2r;
my $startwin1;
my $startwin2;
my $endwin1;
my $endwin2;
my $new = 1;

my $abs1;
my $abs2;
my $whole_name1;
my $whole_name2;
my $name1_quote;
my $name2_quote;

my $ali1;
my $ali2;

my $nnamesseq = 0;

my $qrna;

my $evalue;

my $oth_lend;
my $oth_rend;
my $cod_lend;
my $cod_rend;
my $rna_lend;
my $rna_rend;
my $motif_lend;
my $motif_rend;

my $total_len;

my $ali_num;

open (GFF,">$gff_file") || die;

open (FILE,"$file") || die;
while (<FILE>) {
    
    if (/^Divergence time \(\S+\):\s+(\S+)/) {
	$time = $1;
    }

    elsif (/^length of whole alignment after removing common gaps: (\d+)/) { 
	$total_len = $1;
    }
    
    elsif (/^\#\s+(\d+)\s+/) { 
	$ali_num = $1;

	if ($_ =~ /shuffled/) { $shuffle = 1; }
    }
    
    elsif (/^>(\S+)/ && $seq == 0) {
 
	$whole_name1 = $1;
	$whole_name1 =~ s/\\//g;

	if ($whole_name1 =~ /^(\S+)[\-\:](\d+[><]\d+)\-(.*)$/) { 
	    
	    $name1 = $1;
	    $startblast1 = $2; 
	    $rest = $3;
	
	    # regex metacharacters: \ | ( ) [ { ^ $ * + ? .
	    #
	    $name1_quote = quotemeta $name1;
	    
	    $abs1 =0;	
	    
	    if ($name1 =~ /^(\S+)\/frag\d+(.+)$/) {
		$name1 = $1; 
		$rest  = $2;
		$name1 =~ s/\\//g;
		
		if ($rest =~ /(\S+)\-\S+/) {
		    $rest = $1;
		    $rest =~ s/\///g; $rest =~ s/\\//g; 
		    $abs1 = $rest-1;
		    
		}
	    }
	    $name1 =~ s/\\//g;
	    if ($name1 =~ /^(\S+)[\/\-\:](\d+)\-(\d+)/) { 
		$name1 = $1;
		$rest  = $4;
		if ($2 < $3) { $abs1 += $2-1; }
		else         { $abs1 += $3-1; }
	    }
	}

	$seq = 1; 
	$nnamesseq++; 
    }
    
    elsif (/^>(\S+)/ && $seq == 1) { 
	$whole_name2 = $1;
	$whole_name2 =~ s/\\//g;
	
	if ($whole_name2 =~ /^(\S+)[\-\:](\d+[><]\d+)\-(.*)$/) { 
	    $name2 = $1;
	    $startblast2 = $2;
	    $rest = $3;
	    
	    # regex metacharacters: \ | ( ) [ { ^ $ * + ? .
	    #
	    $name2_quote = quotemeta $name2;
	    
	    $abs2 = 0;
	    if ($name2 =~ /^(\S+)\/frag\d+(.+)$/) {
		$name2 = $1; 
		$rest  = $2;
		$name2 =~ s/\\//g;
		
		if ($rest =~ /(\S+)\-\S+/) {
		    $rest = $1;
		    $rest =~ s/\///g; $rest =~ s/\\//g; 
		    $abs2 = $rest-1;
		}
	    }
	    $name2 =~ s/\\//g;
	    if ($name2 =~ /^(\S+)[\-\:](\d+)\-(\d+)/) { 
		$name2 = $1;
		$rest = $4;
		if ($2 < $3) { $abs2 += $2-1; }
		else         { $abs2 += $3-1; }
	    }	    	    
	}

	if ($opt_q) { get_ali_from_qfile ($file_q, $whole_name1, $whole_name2, \$ali1, \$ali2); }
	else        { $ali1 = ""; $ali2 = ""; }
	
	$seq = 0; 
	$nnamesseq++; 
    } 
    
    elsif (/^length alignment:\s+(\S+) \(id=(\d+\.\d+)\)/) { 
	$len = $1;
	$id  = $2;
    }
    
    elsif (/^posX: (.+)$/ ) { 
	$coor1 = $1;
	
	if ($coor1 =~ /\(\S+\s(\S+)\s(\S+)\s\S+\)/) { $gc = $1 + $2; }
    }
    
    elsif (/^posY: (.+)$/) { 
	$coor2 = $1;
	
	if ($coor2 =~ /\(\S+\s(\S+)\s(\S+)\s\S+\)/) { $gc += $1 + $2; $gc *= 50; }
    } 
    
    elsif (/^$tag/) { 
	$num = 1; 
    }

    elsif (/^OTH ends \*\([\-\+]\)\s+=\s+\((\d+)\.\.\[\d+\]\.\.(\d+)\)/) { 
	$oth_lend = $1; 
	$oth_rend = $2; 
    }
    
    elsif (/^COD ends \*\([\-\+]\)\s+=\s+\((\d+)\.\.\[\d+\]\.\.(\d+)\)/) { 
	$cod_lend = $1; 
	$cod_rend = $2; 
    }
    
    elsif (/^RNA ends \*\([\-\+]\)\s+=\s+\((\d+)\.\.\[\d+\]\.\.(\d+)\)/) { 
	$rna_lend = $1; 
	$rna_rend = $2; 
    }
    
    #this is to accomodate version 1.2b now obsolete
    elsif (/^OTH\s+ends = (\d+)\s+(\d+)/) { 
	$oth_lend = $1; 
	$oth_rend = $2; 
    }
    elsif (/^COD\s+ends = (\d+)\s+(\d+)/) { 
	$cod_lend = $1; 
	$cod_rend = $2; 
    }
    elsif (/^RNA\s+ends = (\d+)\s+(\d+)/) { 
	$rna_lend = $1; 
	$rna_rend = $2; 
    }
    
    elsif (/winner = (\S+)/) { $type = $1; }
    
    elsif (/^\s+ OTH = \s+(\S+)\s+ COD = \s+(\S+)\s+ RNA = \s+(\S+)/ && $num == 1) { 
	$othsc = $1; 
	$codsc = $2; 
	$rnasc = $3; 

	$codlod = $codsc - $othsc;
	$rnalod = $rnasc - $othsc;
	
	if ($othsc < - 5000.0 || $codsc < - 5000.0 || $rnasc < - 5000.0) { 
	    $othlodsigm = -5000.0;
	    $codlodsigm = -5000.0;
	    $rnalodsigm = -5000.0;
	}
	else {
	    $othlodsigm = -log(exp(log(2.0)*($codsc-$othsc)) + exp(log(2.0)*($rnasc-$othsc)))/log(2.0);
	    $codlodsigm = -log(exp(log(2.0)*($othsc-$codsc)) + exp(log(2.0)*($rnasc-$codsc)))/log(2.0);
	    $rnalodsigm = -log(exp(log(2.0)*($othsc-$rnasc)) + exp(log(2.0)*($codsc-$rnasc)))/log(2.0);
	}
	
	undef($score);
	if    ($type =~/^RNA$/) { $score = $rnalodsigm; $motif_lend = $rna_lend; $motif_rend = $rna_rend; }
	elsif ($type =~/^COD$/) { $score = $codlodsigm; $motif_lend = $cod_lend; $motif_rend = $cod_rend; }
	elsif ($type =~/^OTH$/) { $score = $othlodsigm; $motif_lend = $oth_lend; $motif_rend = $oth_rend; }
	else                    { print "wrong type ($type)\n"; die; }
	
	if ($motif_lend > $motif_rend) { switch(\$motif_lend, \$motif_rend); }
	
	# there is a bug in reporting bugs of version 1.2b sometimes they reach the end of the
	# sequence. At this stage, I'm not going to fix the bug, but deal with it.
	if ($motif_lend <  0         ) { print "laggg $motif_lend\n"; die; }
	if ($motif_lend >= $total_len) { $motif_lend = $total_len-1; }
	if ($motif_rend >= $total_len) { $motif_rend = $total_len-1; }

	$qrna = "QRNA";
	if ($shuffle == 1) { $type = "sh$type"; $qrna = "shQRNA"; }
	
	$num = 0;
	
	if ($score) {
	    
	    if ($typetarget =~ /^all$/ || $type =~ /$typetarget$/) 
	    {
		if ($id >= $id_min && $id <= $id_max && $gc >= $gc_min && $gc <= $gc_max) {
		    
		    foreach my $cut (@cutoff) {
			if ($score >= $cut && $gc >= $gc_min_cutoff{$cut} && $gc < $gc_max_cutoff{$cut}) {

		    if ($opt_q) {
			get_coords_from_ali ($ali1, $motif_lend, $motif_rend, \$coor1l, \$coor1r);
			get_coords_from_ali ($ali2, $motif_lend, $motif_rend, \$coor2l, \$coor2r);
		    }
		    else {   
			#remember conventions for qrna output:
			#
			#      posX: 0-62 [0-59](60) 
			#
			# is an alignment of 63 positions with 3 gaps. 
			# So the actual positions are from 0 to 59 not to 62.
			#
			#
			if ($coor1 =~ /^\d+-\d+\s+\[(\d+)-(\d+)\]\((\d+)\)/) { $coor1l = $1; $coor1r = $2; }
			if ($coor2 =~ /^\d+-\d+\s+\[(\d+)-(\d+)\]\((\d+)\)/) { $coor2l = $1; $coor2r = $2; }
		    }
		    
		    #get the ENDS depending on the strand
		    #	
		    if ($startblast1 =~ /(\d+)>(\d+)/) {
			$startwin1 = $1 + $coor1l;
			$endwin1   = $1 + $coor1r;
		    }
		    elsif ($startblast1 =~ /(\d+)<(\d+)/) {
			$startwin1 = $2 - $coor1r;
			$endwin1   = $2 - $coor1l;
		    }
		    else { print "identify_loci(): fasta name has to indicate strand (1): $startblast1\n"; die; }
		    
		    $startwin1 += $abs1; 
		    $endwin1   += $abs1;

		    if ($startblast2 =~ /(\d+)>(\d+)/) {
			$startwin2 = $1 + $coor2l;
			$endwin2   = $1 + $coor2r;
		    }
		    elsif ($startblast2 =~ /(\d+)<(\d+)/) {
			$startwin2 = $2 - $coor2r;
			$endwin2   = $2 - $coor2l;
		    }
		    else { print "identify_loci(): fasta name has to indicate strand (2): $startblast2\n"; die; }
		    
		    $startwin2 += $abs2; 
		    $endwin2   += $abs2;
		    
		    #paranoia
		    if ( ($startwin2 > $endwin2) || $startwin2 < 0 || $endwin2 < 0 ) { print "got ends of the window wrong\n"; die; }
		    if ( ($startwin1 > $endwin1) || $startwin1 < 0 || $endwin1 < 0 ) { print "got ends of the window wrong\n"; die; }
		    
		    $evalue = evalue ($score, $size{$cut}, $lambda{$cut}, $mu{$cut});

		    if ($whichorg == 1) {
			if (defined($opt_x)) { print GFF "$usename\t$qrna\t$type\t$startwin1\t$endwin1\t$score\t.\t\.\tgene '$name1' id '$id' eval '$evalue'\n"; }
			else                 { print GFF "$name1\t$qrna\t$type\t$startwin1\t$endwin1\t$score\t.\t\.\tgene '$name1' id '$id' eval '$evalue\n";   }
		    }
		    if ($whichorg == 2) {
			if (defined($opt_x)) { print GFF "$usename\t$qrna\t$type\t$startwin2\t$endwin2\t$score\t.\t\.\tgene '$name2' id '$id' eval '$evalue'\n"; }
			else                 { print GFF "$name2\t$qrna\t$type\t$startwin2\t$endwin2\t$score\t.\t\.\tgene '$name2' id '$id' eval '$evalue\n";   } 
		    }
		}
		    }
		}
	    }
	}
    }
    
    else  { next; }
    
}
close (FILE);


#
# subroutines
#

sub evalue {

    my ($score, $size, $lambda, $mu) = @_; 

    my $evalue = -1;

    if ($lambda > 0) { 

	$evalue = 1.0 - exp(-exp(-$lambda*($score-$mu)));

	$evalue *= $size; 
    }

    return $evalue;
}
sub get_ali_from_qfile {

    my ($qfile, $name1, $name2, $ali1_ref, $ali2_ref) = @_; 

    my $idx = 0;
    my $ali1;
    my $ali2;

    my $flag1 = 0;
    my $flag2 = 0;

    my $name;

    open (QFILE,"$qfile") || die;
    while (<QFILE>) {

	if (/^>(\S+)/ && $idx == 0) 
	{ 
	    $name = $1;

	    if ($flag1 == 1 && $flag2 == 1) { last; }

	    $ali1 = "";
	    $ali2 = "";
	    $flag1 = 0;
	    $flag2 = 0;

	    if ($name1 =~ $name) { $flag1 = 1; }
	    $idx = 1;
	}

	elsif (/^[^\>]/ && $idx == 1) 
	{
	    if ($flag1 == 1) { $ali1 .= $_; }

	}

	elsif (/^>(\S+)/ && $idx == 1) 
	{ 
	    $name = $1;
	    if ($name2 =~ $name) { $flag2 = 1; }
	    else                 { $flag1 = 0; $ali1 = ""; }
	    $idx = 0;
	}
	
	elsif (/^[^\>]/ && $idx == 0) 
	{
	    if ($flag1 == 1 && $flag2 == 1) { $ali2 .= $_; }
	}

    }
    close (QFILE);
   
    $$ali1_ref = $ali1;
    $$ali2_ref = $ali2;
}

sub get_coords_from_ali {

    my ($ali, $ali_lend, $ali_rend, $coorl_ref, $coorr_ref) = @_;

    my $cali = $ali;
    my $coorl = -1;
    my $coorr = -1;

    my $prev_ali;
    my $prev;
    my $post;

    if ($ali_lend == $ali_rend) { $$coorl_ref = 0; $$coorr_ref = 0; return; }

    $cali =~ s/\s+//g; $cali =~ s/\n//g;

    my $len_ali = length($cali);

    #paranoia test
    #
    if ($ali_lend >= $len_ali ||
	$ali_rend >= $len_ali   ) { print "bad motif? lend=$ali_lend rend=$ali_rend total=$len_ali\n $ali\n\n"; die; }

    $prev = $ali_lend;
    $post = $len_ali - $ali_rend - 1;

    if ($post < 0) { print "wrong ends? len_ali=$len_ali $ali_lend $ali_rend \n"; die; }

    if ($prev > 0) {
	$cali =~ s/^(.{$prev})//;
	
	$prev_ali = $1;
	$prev_ali =~ s/\-//g;
	$prev_ali =~ s/\.//g;
	$coorl= length($prev_ali);
    }
    else {
	$coorl = 0;
    }


    $cali =~ s/.{$post}$//;
    $cali =~ s/\-//g;
    $cali =~ s/\.//g;

    if (length($cali) == 0) { $coorr = $coorl;                     }
    else                    { $coorr = $coorl + length($cali) - 1; }

    $$coorl_ref = $coorl;
    $$coorr_ref = $coorr;
}


sub read_gc_cutoff_file {
    my ($gc_cutoff_file, $cutoff_ref, $gc_min_cutoff_ref, $gc_max_cutoff_ref, $lambda_ref, $mu_ref, $size_ref) = @_;

    my $x = 0;
    my $cut;
    my $min_gc;
    my $max_gc;

    my $lambda;
    my $mu;
    my $size;

    if ($gc_cutoff_file) {
	open (GC,"$gc_cutoff_file") || die;
	while (<GC>) {
	    if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
		$cut = $1;
		$min_gc = $2;
		$max_gc = $3;
		$lambda = $4;
		$mu     = $5;
		$size   = $6;

		$cutoff_ref->[$x] = $cut;
		$gc_min_cutoff_ref->{$cut} = $min_gc;
		$gc_max_cutoff_ref->{$cut} = $max_gc;

		$lambda_ref->{$cut} = $lambda;
		$mu_ref->{$cut}     = $mu;
		$size_ref->{$cut}   = $size;

		$x++;
	    }
	}
	close (GC);
    }
    else {
	$cutoff_ref->[0] = $fix_cutoff;
	$gc_min_cutoff_ref->{$fix_cutoff} = 0;
	$gc_max_cutoff_ref->{$fix_cutoff} = 100;

	$lambda_ref->{$fix_cutoff} = $fix_lambda;
	$mu_ref->{$fix_cutoff}     = $fix_mu;
	$size_ref->{$fix_cutoff}   = $fix_size;
    }
   
}

sub switch {

    my ($x1_ref, $x2_ref) = @_;

    my $x1 = $$x1_ref;
    my $x2 = $$x2_ref;
    my $x;

    $x  = $x1;
    $x1 = $x2;
    $x2 = $x;

    $$x1_ref = $x1;
    $$x2_ref = $x2;

    if ($$x1_ref > $$x2_ref) { print "bad switch\n"; die; }
}
