#!/usr/bin/perl
print "Program start\n";
use strict;

# input files 
my $vectorFile = "vector.fasta"; # This file should contain ONLY ONE sequence, STARTING with the restriction site at the 3' END of the insert.
my $vectorAttachSeq = "vector_start.fasta"; # sequence that is joined to the predicted sequence at the start.
my $ensemblCdna = "database.fasta";
my $crossmatch_enst = "crossmatch_database";
my $crossmatch_vector = "crossmatch_vector";




my $ensemblCdnaList = $ensemblCdna.".sublist";
my $specialHex1Feature = 1;
#my $pQE_ATG = 3419; # Expressions vector feature: where is the start codon?


my $seqFasta = "seq.fasta"; # Were the ESTs live. Script will ALSO NEED $seqFasta.screen.qual !!

# parameters

my $altSpliceDetectWindow =  10; # bp, high abundance of errors in this sequence window suggests alternative splicing
my $altSpliceDetectErrorRate = 0.5; # This error rate suggests alternative splicing
my $DIoffset = 10; # how far may deletions/insertion be separated to keep the reading frame intact
my $seqJoinResult = "seqjoin.result"; # Here you will find the results upon executing emboss.script
my $seqJoinPepResult = "seqjoin.pep"; 
unlink $seqJoinResult if (-f $seqJoinResult); # say good-bye to any old versions of this file
open (STATA, "> seqjoin.stat.all"); # Output file
open (STATP, "> seqjoin.stat.predict"); # Output file
open (EMBOSS, "> emboss.script"); # Output file, enter at the prompt: sh -v emboss.script
open (SUBSET, "> $ensemblCdnaList");
my $minAvgQual = 15; # cross_match does not take base caller qualities into account for alignments. 
# Alignments might extend into low quality regions. The average quality around the position of interest
# is determined by the function avgQualWindow(). $minAvgQual is the minimal avg quality returned by this function
# which is still considered.
my $aheadBp = 4; # window size for avgQual()
my $behindBp = 4; # window size for avgQual()
my %minqual;
$minqual{mismatch} = 20; 
$minqual{S} = 18; # was 15, check 800H12507 
$minqual{I} = 15;
$minqual{D} = $minqual{I};
$minqual{EST} = 15; # the minimal quality of the peace of the EST sequence that is used to join the vector and ENST sequences.
my $minPercId = 0.95;
my $EmbossPath = "/package/emboss/bin";
my $tempSeq = "seqJoinTempSeq";
my @veccross;
my @enstcross;
my %descr; # a hash of DISCREPANCY lines from a reformatted cross_match output
my %matchRec; 
my $lastvec;
my $pQELen;
my $pQEAttLen = 0;
my %vecBuf; 

reformatCrossmOutput($crossmatch_enst);
reformatCrossmOutput($crossmatch_vector);
die "Please run:\ncross_match -alignments -tags -discrep_lists $seqFasta $vectorFile > $crossmatch_vector\n"
  if (not -f  $crossmatch_vector);
die "Please run:\ncross_match -alignments -tags -discrep_lists $seqFasta.screen $ensemblCdna > $crossmatch_enst\n"
	if (not -f $crossmatch_enst);
#open (ENST,"$crossmatch_format_script $crossmatch_enst |") ;
#open (VEC,"$crossmatch_format_script $crossmatch_vector |") ;
open (ENST,"$crossmatch_enst".".reformat") || die "cannot open "."$crossmatch_enst".".reformat";
open (VEC,"$crossmatch_vector".".reformat") || die "cannot open "."$crossmatch_vector".".reformat";

print "Read alignments with vector sequence(s) in $vectorFile from "."$crossmatch_vector".".reformat .... ";
while (<VEC>) {
	my @vecline = split;
	next if (scalar @vecline < 12);
	next if ($vecline[10] < 2000); # match with far end of vector
	$vecBuf{$vecline[4]} = [@vecline];
#	print ".";
}
print "done.\n\n";
close VEC;

print "Vector sequence file $vectorFile\n";

# qual tests
# print " avgQualWindow(800A02507.SCF,10) ",avgQualWindow("800A02507.SCF",10),"\n";
# print	" avgQualWindow(800A06507.SCF,10) ",avgQualWindow("800A06507.SCF",10),"\n";
# print " minQuality(800A06507.SCF,20) " ,minQuality("800A06507.SCF",10,20),"\n";
# print " minQuality(800a11555.SCF,100,101) ." ,minQuality("800a11555.SCF",100,101),".\n";
my $i;
LINES: while (<ENST>){
#	print;
	my @line = split;
	if (scalar @line == 12 and $line[8] =~ /^ENST\d+$/) {
		analyseCrossMatch() if (scalar @enstcross > 0 and scalar @veccross > 0); 
		@enstcross = @line;
		@veccross = (); 
		%descr = ();
		#my $x = `$crossmatch_format_script $crossmatch_vector | \grep -w $enstcross[4]`;
		#my @vecline = @{$vecBuf{$enstcross[4]}};
		#split(" ",$x);}
		#print "\$x = $x\n\$vecline[4] = $vecline[4]\n";
		if (defined $vecBuf{$enstcross[4]}) {
			@veccross = @{$vecBuf{$enstcross[4]}} ;
		} else {
			#print STDERR "\nWarning: no vector alignment for EST $enstcross[4].\n";
		}
	} elsif (scalar @line == 7 and scalar @enstcross > 0) {
			$descr{$line[2]} = [@line]; 
			#print "DESCR $descr{$line[2]}[6] $descr{$line[2]}[2] $descr{$line[2]}[4]\n";
	} else {
			#print " Never should get here...\n$_\n";
			analyseCrossMatch() if (scalar @enstcross > 0 and scalar @veccross > 0);
			@enstcross = ();
	}
}

analyseCrossMatch() if (scalar @enstcross > 0 and scalar @veccross > 0);


my %clonecount;
foreach my $est (sort keys %matchRec){
	my $enstCount;
	my %enstToJoin = (); 
	#print "\$est $est\n";
	foreach my $enst (sort keys %{$matchRec{$est}} ) {
		$enstCount += 1;
		printStat(\*STATA, $enstCount, '', %{$matchRec{$est}{$enst}});
		if ($matchRec{$est}{$enst}{altSplice} == 0 and
				$matchRec{$est}{$enst}{countDIcodon}{D} == 0 and
				$matchRec{$est}{$enst}{countDIcodon}{I} == 0 and
				$matchRec{$est}{$enst}{minESTqual} >= $minqual{EST}) {
			if (scalar keys %enstToJoin == 0) {
				%enstToJoin = %{$matchRec{$est}{$enst}};
			} elsif (
				$enstToJoin{minESTqual} >  $matchRec{$est}{$enst}{minESTqual} or (
				$enstToJoin{minESTqual} == $matchRec{$est}{$enst}{minESTqual} and
				$enstToJoin{matchLength} < $matchRec{$est}{$enst}{matchLength})) {
					%enstToJoin = %{$matchRec{$est}{$enst}};
			}
		}
	}
	if (scalar keys %enstToJoin > 0) {
		#print "Ich mache eine Sequenz mit $est und $enstToJoin{enst}.\n";
		print EMBOSS $enstToJoin{emboss};
		$clonecount{$enstToJoin{clone}} += 1;
		printStat(\*STATP, $enstCount, $clonecount{$enstToJoin{clone}}, %enstToJoin);
	}
}
print EMBOSS "transeq $seqJoinResult sp\n";
#print EMBOSS qq{perl -ne 'if(/^>/){$w=0;}if($_=~s/\*.*//){if($w==0 and length($_)>1){print;}$w=1;}if($w==0){print;}' sp > $seqJoinPepResult\n};
print EMBOSS "rm seqJoinTempSeq.* sp\n";
print "Run sh -v emboss.script\n";
print "\nFinished. Look for results in the files seqjoin.stat.all seqjoin.stat.predicted and emboss.script.\n\n";
close SUBSET;
close EMBOSS;
`sort $ensemblCdnaList | uniq | sort -o $ensemblCdnaList`;

sub bpFromEst {
	my %h = @_;
	return $h{useESTend} - $h{useESTstart};
}

sub stopInFrame {
	my $seq = shift;
	$seq = uc $seq;
	while (length($seq) >= 3) {
		$seq = /(\w\w\w)(\w+)/;
		my $codon = $1;
		$seq = $2;
		if ($codon eq "TGA" or $codon eq "TAG" or $codon eq "TAA") {
			return 1;
		} 
	}
	return 0;
}

# column	field
# 1 clone
# 2 clone sequence
# 3 ENST-ID
# 4 ENST count
# 5 clone count
# 6 alternative splicing
# 7 perc ident
# 8 match length
# 9 use EST start
# 10 use EST end
# 11 min EST quality
# 12 ENST start
# 13 substitutions
# 14 insertions + deletions
# 15 comment

sub printStat{
	my $fh = shift;
	my $enstCount = shift;
#	my $est = shift;
	my $cloneCount = shift;
	my %h = @_;
	printf $fh "$h{clone}\t%-25s\t$h{enst}\t$enstCount\t$clonecount{$h{clone}}\t$h{altSplice}\t%0.4f\t",$h{est},$h{percIdent};
	print $fh "$h{matchLength}\t$h{useESTstart}\t$h{useESTend}\t$h{minESTqual}\t$h{enstStart}\t$h{countDI}{S}\t",
			$h{countDIcodon}{I} + $h{countDIcodon}{D},"\t",
			$h{comment},"\n";
}
	
sub analyseCrossMatch {
	my $sysCmd = '';
	my $altSplice = 0;
	my $mismatches = mismatch_count();
	my $percIdent = 1 - ($mismatches / ($enstcross[6] - $enstcross[5]));
	my $comment;
  #printf "Loop $enstcross[4] $enstcross[8] $mismatches %0.0f%\n",$percIdent*100;
	#print "ENST: $enstcross[8]\n";
	#print "Vector: $veccross[8] $veccross[4]\n";
	my $estref = $enstcross[4];
	#if ($specialHex1Feature) {
		#if ($estref =~ /(^800\w\d+)/) {
			#$estref = uc($1);
		#}			
	#}	
	if ($percIdent >= $minPercId) {
   if (defined $matchRec{$estref}{$enstcross[8]} ) {
				#$matchRec{$estref} = (); # alternative Splicing
				$matchRec{$estref}{$enstcross[8]}{altSplice} = 1;
				$matchRec{$estref}{$enstcross[8]}{comment} .= 
					"Alternative Splicing? cross_match finds two alignments with this ENST.";							
	 } else {
		#unlink $tempSeq if (-f $tempSeq);
		if (not $lastvec eq $veccross[8]) {
			`infoseq -auto $vectorFile:$veccross[8] | grep -vE "^\#" ` =~ /\bN\s+(\d+)/;
			$pQELen = $1; # Vector length
			print "Check the vector length for $vectorFile:$veccross[8]. Found $pQELen\n"; 
			if (-f $vectorAttachSeq) {
				`infoseq -auto $vectorAttachSeq:$veccross[8] | grep -vE "^\#" ` =~ /\bN\s+(\d+)/;
				$pQEAttLen = $1;
				print "Check the length of $vectorAttachSeq:$veccross[8]. Found $pQEAttLen\n"; 
			}
			$lastvec = $veccross[8];
		} 
		my %rec = (
			matchLength => $enstcross[6] - $enstcross[5],
			estStart => $enstcross[5],
			estEnd => $enstcross[6],
			enstStart => $enstcross[9],
			enst => $enstcross[8],
			est => $enstcross[4],
			percIdent => $percIdent,
			minESTqual => 99,
			clone => 'unknown',
		);	
		if ($specialHex1Feature) {
			$rec{clone} = $enstcross[4];		
			if ($rec{clone} =~ /(^800\w\d+)/) {
				$rec{clone} = uc($1);
			}			
		} 	
		if (-f $vectorAttachSeq) {
			$sysCmd .= "seqret $vectorAttachSeq:$veccross[8] $tempSeq.1 -supper\n";
		} else {
			$sysCmd .= "echo \\>empty > $tempSeq.1\n";
		}
		my $seqname = "$rec{est}_$rec{enst}";
		$rec{useESTstart} = $veccross[6] + $pQELen-$veccross[10] + 1;
		$rec{useESTend} = $enstcross[5] - 1;
		if ($rec{useESTstart} <= $rec{useESTend}) {
			$sysCmd .= "seqret fasta::$seqFasta:$veccross[4] -out $tempSeq.2 -sbegin ".
				$rec{useESTstart}." -send ".$rec{useESTend}." -slower\n";
			$sysCmd .= "pasteseq $tempSeq.2 $tempSeq.1 $tempSeq.1 -pos 0\n";
			$rec{minESTqual} = minQuality($rec{est},$rec{useESTstart},$rec{useESTend});
		}
		print SUBSET "$ensemblCdna:$enstcross[8]\n";		
		$sysCmd .= "seqret $ensemblCdna:$enstcross[8] $tempSeq.2 -supper\n";
		#jetzt die Unterschiede einbauen...
		my %countDI = (D => 0,I => 0,S => 0);
		my %countDIcodon = ();
		my %before = ();
		my %altSpliceMem = ();
		foreach my $d (sort keys %descr) {
			#print "avgQualWindow($enstcross[4],$descr{$d}[2]) = ",avgQualWindow($enstcross[4],$descr{$d}[2])," \$descr{$d}[3] = $descr{$d}[3]\n";
			if ($descr{$d}[4] >= $minqual{$descr{$d}[0]} 
					and avgQualWindow($enstcross[4],$descr{$d}[2]) > $minAvgQual
					and not uc($descr{$d}[3]) eq 'N' ) {
				if ($descr{$d}[0] eq 'S' and uc($descr{$d}[3]) =~ /^[ACGT]$/ ) {
					$countDI{S} += 1;
					$comment = "Found reliable substitutions [pos(quality seq.)]: " if (length($comment) == 0);
					$comment .= " ".$descr{$d}[5]."(".$descr{$d}[4]." ".$descr{$d}[6]."),";
					$sysCmd .= "cutseq $tempSeq.2 $tempSeq.2 -from $descr{$d}[5] -to $descr{$d}[5]\n";
					$sysCmd .= "pasteseq $tempSeq.2 asis::". lc($descr{$d}[3]). " $tempSeq.2 -pos ".($descr{$d}[5] - 1)."\n";
				} elsif ($descr{$d}[0] =~ /^[DI]$/) {
				  foreach my $abiPos ($descr{$d}[5] .. ($descr{$d}[5] + $descr{$d}[1] - 1)) {
						#print "\$descr{$d}[1] $descr{$d}[1] \$abiPos $abiPos \n";
						#$countDI{$descr{$d}[0]} += $descr{$d}[1];
						$countDI{$descr{$d}[0]} += 1;
						if (($before{$descr{$d}[0]}[2] >= $descr{$d}[5] - $DIoffset) and ($before{$descr{$d}[0]}[2] > 0)) {
							# found a reliable insertion or deletion
							# EMBOSS not implemented yet
							$before{$descr{$d}[0]} = ();
							$countDIcodon{$descr{$d}[0]} += 1;
						} else {
							$before{$descr{$d}[0]}[2] = ($before{$descr{$d}[0]}[1] >= $descr{$d}[5] - $DIoffset) ?
										$before{$descr{$d}[0]}[1] : 0;
							#$before{$descr{$d}[0]}[1] = $descr{$d}[5];
							$before{$descr{$d}[0]}[1] = $abiPos;
						}
					}
				}
			}
		}
		$comment =~ s/,$/\./;
		my $subtext = "Found $countDI{S} reliable substitution";
		$subtext .= ($countDI{S} > 1) ? 's' : '';
		$comment =~ s/Found reliable substitutions/$subtext/;
		#print "$enstcross[4] $enstcross[8]  \$countDI{D,I,S} I $countDI{I} D $countDI{D} S $countDI{S} ",
		#	"\$countDIcodon{D,I} I $countDIcodon{I} D $countDIcodon{D} altSplice = ",altSpliceDetect(),"\n";
		my $cutEnstToPos = 
			($veccross[6] + ($pQELen-$veccross[10]) < $enstcross[5]) ? 
			$enstcross[9] - 1 : 
			$enstcross[9] + $veccross[6] + ($pQELen-$veccross[10]) - $enstcross[5];
		$sysCmd .= "cutseq $tempSeq.2 $tempSeq.2 -from 1 -to ".$cutEnstToPos."\n" 
			if ($cutEnstToPos > 0);
		$sysCmd .= "pasteseq $tempSeq.2 $tempSeq.1 $tempSeq.1 -pos 0\n";
		$sysCmd .= "descseq $tempSeq.1 $tempSeq.1 -name $seqname\n";
		$sysCmd .= "cat $tempSeq.1 >> $seqJoinResult\n";
		my $altSplice = altSpliceDetect();
		if ($countDIcodon{D} + $countDIcodon{I} > 0) {
			$comment = "Found deletion or insertion of complete codons. ".$comment;
		}
		if ($altSplice > 0) {
			$comment = "Alternative Splicing? Cluster of mismatches found at ".$altSplice.". ".$comment;
		}
		$rec{emboss} = $sysCmd;
		$rec{altSplice} = ($altSplice > 0) ? 1 : 0;
		$rec{countDIcodon} = {%countDIcodon};
		$rec{countDI} = {%countDI};
		$rec{comment} = $comment;
		$matchRec{$estref}{$enstcross[8]} = {%rec};	
	}
 }
 print ".";
}

sub altSpliceDetect {
	foreach my $d1 (sort keys %descr) {
		my $errCount = 0;
		foreach my $d2 (sort keys %descr) {
			$errCount += $descr{$d2}[1] 
				if (
					$d2 <= $d1 
					and $d2 > $d1 - $altSpliceDetectWindow 
					and $descr{$d2}[4] >= $minqual{$descr{$d2}[0]}
				);
		}
		return $d1 if $errCount >= $altSpliceDetectWindow * $altSpliceDetectErrorRate;
	}
	return 0;
}
		

sub mismatch_count {
	my $counter = 0;
	foreach my $desckey (sort keys %descr) {
		#print "\$counter $counter \$descr{$desckey}[4] $descr{$desckey}[4]\n";
		$counter = $counter + 1 if $descr{$desckey}[4] >= $minqual{$descr{$desckey}[0]};
	}
	return $counter;
}

#sub avg_qual {
##	my @tempdesc = @descr;
#	#print "\$descr[538][6] $descr{538}[4] \$descr[235][6] $descr{235}[4]\n";
#	#print "\$tempdesc[538][6] $tempdesc[538][4] \$tempdesc[235][6] $tempdesc[235][4]\n";
#	my $counter = 0;
#	my $qualsum = 0;
#	foreach my $desckey (keys %descr) {
#		
#	foreach my @arr (@descr) {
#	$counter += 1;
#		#print $arr[4],"\n";
#		$qualsum += $descr{$desckey}[4];
#	}
#	#print "\$qualsum $qualsum \$counter $counter\n";
#	return $qualsum / $counter;
#}

sub BEGIN {
	my @quals;
	my $storeSeqID = '';	

	# getQualities extracts the base call quality values for a sequence from the file
	# $seqFasta.".screen.qual" 
	# $storeSeqID the ID of the last quality file the function worked with
	# @quals	the content of the (last) quality data set.
	sub getQualities {
		my $seqId = shift;
		#print "getQualities \$seqId $seqId \$storeSeqID $storeSeqID \$quals[100] $quals[100]\n";
		if (not $seqId eq $storeSeqID) {
			$storeSeqID = $seqId;
			@quals = (0);
			open (I,$seqFasta.".screen.qual") or return;
			COMMENTS: while (<I>) {
				if (/^>$seqId\b/) {
					#print;
					while (<I>) {
						if (/^[\d\s]+$/) {
							push @quals, split;
						} else {
							last COMMENTS;
						}
					}
				}
			}
		}
		#print "getQualities \$seqId $seqId \$storeSeqID $storeSeqID \$quals[100] $quals[100]\n";
		return @quals;
	}
}

# determines the minimal base call quality in a part of a chromatogramm
# $seqid	id of the sequence
# $start, $end	sequence interval, for which the minimal quality will be determined
sub minQuality {
	my ($seqId, $start, $end) = @_;
	my @quals = getQualities($seqId); # array of the base call qualities of this sequence
	my $minQual = 99;
	foreach my $i ($start .. $end) {
		#print "\$i $i \$quals[$i] $quals[$i]\n";
		$minQual = $quals[$i] if ($quals[$i] < $minQual);
	}
	return $minQual;
}

# avgQual calculates the average base calling quality _around_ the bp at
# position @_[1] (Argument 2) in the sequence with the name @_[0] (Argument 1). 
# Optional arguments 3 and 4 determine how far to look ahead and behind for
# quality values.
sub avgQualWindow {
	#print "ich habe ", scalar @_, " Argumente bekommen.\n";
	my $seqId = shift;	
	my $pos = shift;
	$aheadBp = shift if (scalar @_ > 0);
	$behindBp = shift if (scalar @_ > 0);
	#my @quals = (0); #start with 1
	return '' if ($seqId eq '');
	my @quals = getQualities($seqId);
	my ($counter, $qualSum);
	foreach my $i (($pos-$aheadBp>0 ? $pos-$aheadBp : 1 ) .. ($pos-1), 
								($pos+1) .. ($pos+$behindBp)) {
		#print "\$pos $pos \$quals[$i] = $quals[$i]\n";
		$qualSum += $quals[$i];
		$counter += 1;
	}
	return $qualSum / $counter;
}
# reformatCrossmOutput re-formats the output of cross_match to allow for more easy parsing.
# A new file with the suffic .reformat is created by this procedure.
sub reformatCrossmOutput {
	my $fileName = shift;
	my $newName = $fileName.".reformat";
	open(I, $fileName) || die "cannot open $fileName";
	open(O, "> $newName");
	while (<I>) {
		if (/^ALIGNMENT\b/) {
			s/\ +/\t/g;
      #s/ALIGNMENT\t//;
			s/gi\|(\d+)[\|\w\.]+/$1/;
			s/[\(\)]//g;
			s/\t$//;
			my @fields = split;
			if ($fields[9] eq 'C') {
				foreach $i(1..8,10,12,13,11) {
					print O "$fields[$i]\t";
				}
				print O "\n";
			} else {
				foreach $i(1..8,9,10,11,12) {
					print O "$fields[$i]\t";
				}
				print O "\n";
			}
		} elsif (/^DISCREPANCY\b/) {
			my @fields = split;
			if ($fields[1] =~ /^([DSI])$/) {
				print O "$1\t1\t";
			} elsif ($fields[1] =~ /^([DSI])-(\d+)\b/) {
				print O "$1\t$2\t";
			}
			print O "$fields[2]\t";
			if ($fields[3] =~ /^(\w)\((\d+)\)/) {
				print O "$1\t$2\t";
			} else {
				print O "\t\t";
			}
			print O "$fields[4]\t$fields[5]\n";
		}
	}
	close O;
}



