#!/usr/bin/perl
use lib "/usr/lib";


# $Id$
#
# untangle.pl takes in the output of grommit together with the input
# to grommit to disambiguate the scaffold.  The output is consistent with the 
# .out.xml file format.
#
#  Copyright @ 2002, 2003 The Institute for Genomic Research (TIGR).  All
#  rights reserved.

use strict;
use XML::Parser;
use TIGR::Foundation;

my $base = new TIGR::Foundation;
if (! defined $base){
    die ("A horrible death");
}

my $VERSION = "1.0";
my $VERSION_STRING = "$VERSION (" . '$Revision$ ' . ")";

$base->setVersionInfo($VERSION_STRING);

my $HELPTEXT = q~
    Take a file of evidence to disambiguate a scaffold.

    untangle -e <evidencefile> -s <scaffile> -o <outfile>
    ~;

$base->setHelpInfo($HELPTEXT);

my $xmlfile;
my $evidencefile;
my $outfile;

my $err = $base->TIGR_GetOptions("e=s" => \$evidencefile,
				 "s=s" => \$xmlfile,
				 "o=s" => \$outfile);

$base->bail("Option processing failed") unless $err;

if (! defined $evidencefile || ! defined $xmlfile || ! defined $outfile) {
    $base->bail("You need to provide all options listed in the help (try -h)");
}

open(OUT, ">$outfile") || $base->bail("Cannot open $outfile: $!\n");

print OUT "<GROUPING>\n";

my $xml = new XML::Parser(Style => 'Stream');

# parser definitions.  These values are filled as a side-effect of
# calling $xml->parsefile
my $inEvidence = 0;  # we are in the evidence section

my $inLib = 0;       # we are in a library
my $libId;           # ID of current library
my %libraries;       # mapping from lib ids to size range

my $inInsert = 0;    # we are in an insert
my $insId;           # ID of current insert
my %inserts;         # insert ID to sequence list mapping
my %insertLib;       # mapping from insert ID to library ID

my $inContig = 0;    # we are in a contig
my $conId;           # ID of current contig
my %contigs;         # mapping from contig ID to contig size

my $scaffId;         # ID of current scaffold
my @scaffolds;       # list of all scaffold IDs
my %scaffSize;       # sum of lengths of all contigs in scaffold
my %scaffContig;     # mapping from scaffold ID to list of contigs
my %scaffLink;       # mapping from scaffold ID to list of links

my %contigX;         # coordinate of contig beginning in scaffold
my %contigStart;     # start of contig in scaffold
my %contigEnd;       # end of contig in scaffold
my %contigOri;       # orientation of contig in scaffold
my %contigScaff;     # mapping from contig  ID to scaffold ID

my $inLink = 0;      # we are in a link
my $id;              # current link ID
my %linkCtg;         # mapping from link ID to list of contigs
my %linkValid;       # valid codes for all links

my %seqName;         # mapping from sequence ID to sequence name
my %seqCtg;          # mapping from sequence ID to contig ID
my %seqOri;          # orientation of sequence within contig
my %seqLend;         # coordinates of sequence within contig
my %seqRend;

my $inUnused = 0;    # we are in the unused link section
my @unseen;          # list of unseen links

# now we parse the files
$xml->parsefile($evidencefile);
$xml->parsefile($xmlfile);

# Now we go through each scaffold
for (my $s = 0 ; $s <= $#scaffolds ; $s++){
    my $scaffId = $scaffolds[$s];

    my @ctgs = split(' ', $scaffContig{$scaffId});
    @ctgs = sort {$contigStart{$a} <=> $contigStart{$b}} @ctgs;

    my %conflicts = ();   # all conflicts between contigs
    my %nconflicts = ();

    print STDERR "doing scaffold $scaffId with ", $#ctgs + 1, " contigs\n";
    for (my $c = 0; $c <= $#ctgs; $c++){
	my $l = $contigStart{$ctgs[$c]}; 
	my $r = $contigEnd{$ctgs[$c]};
	# now I go for all the contigs that start before the current one
	my $b = $c - 1;
	while ($b >= 0 && $contigEnd{$ctgs[$b]} > $l){
	    $conflicts{$ctgs[$c]} .= "$ctgs[$b] ";
	    $nconflicts{$ctgs[$c]}++;
	    $b--;
	}

	# then the ones afterwards
	$b = $c + 1;
	while ($b <= $#ctgs && $contigStart{$ctgs[$b]} < $r){
	    $conflicts{$ctgs[$c]} .= "$ctgs[$b] ";
	    $nconflicts{$ctgs[$c]}++;
	    $b++;
	}

#	for (my $cc = 0; $cc <= $#ctgs; $cc++){
#	    if ($ctgs[$cc] eq $ctgs[$c]) { next;} # avoid self matches
#	    my $st = $contigStart{$ctgs[$cc]};
#	    my $e = $contigEnd{$ctgs[$cc]};
#	    if ($e > $l && $st < $r) {
#		$conflicts{$ctgs[$c]} .= "$ctgs[$cc] ";
#		$nconflicts{$ctgs[$c]}++;
#		$conflicts = 1;
#	    }
#	} # for each contig
	if (exists $nconflicts{$ctgs[$c]}){
	    print STDERR "contig $ctgs[$c] has $nconflicts{$ctgs[$c]} conflicts\n";
	}
    } # for each contig

    # now %conflicts holds a list of all the contigs conflicting with the key

    # for each contig, we assign it to a cluster
#     my %contigClust;
#     my $clustId = 0;

#     for (my $c = 0; $c <= $#ctgs; $c++){
# 	my @cluster = ();
# 	my @stack = ();
# 	my %seen = ();
# 	push(@stack, $ctgs[$c]);
# 	$clustId++;
# 	while (my $contig = pop @stack){
# 	    $seen{$contig} = 1;
# 	    $contigClust{$contig} = $clustId;
# 	    print STDERR "contig $contig is in cluster $clustId\n";
# 	    print STDERR "checking $contig for conflicts\n";
# 	    if (exists $nconflicts{$contig}){
# 		my @cflcts = split(' ', $conflicts{$contig});
# 		for (my $flct = 0; $flct <= $#cflcts; $flct++){
# 		    if (! exists $seen{$cflcts[$flct]}){
# 			push(@stack, $cflcts[$flct]);
# 		    }
# 		}
# 	    }
# 	    push(@cluster, $contig);
# 	}

# 	# now cluster contains a cluster of potentially conflicting contigs
# 	if ($#cluster >=  1){
# 	    print STDERR "found a cluster with ", $#cluster + 1, " contigs\n";
# 	}

# 	# now we try to find the longest contig in this cluster
# 	my $best = 0;
# 	my $bestlen = 0;
# 	for (my $cc = 0; $cc <= $#cluster; $cc++){
# 	    if ($contigs{$cluster[$cc]} > $bestlen){
# 		$bestlen = $contigs{$cluster[$cc]};
# 		$best = $cluster[$cc];
# 	    }
# 	}
# 	$chosen{$best} = 1; # mark the chosen contig
#     } # for each contig
    
    print STDERR "getting ready to build scaffolds\n";
#    @ctgs = sort {$contigStart{$a} <=> $contigStart{$b}} @ctgs; # sort by start
#    print STDERR "done sorting\n";

    # the basic algorithm goes like this:
    #  starting with the leftmost contig, assign it number 1
    #  find all contigs linked to it

    # from within each group of conflicting contigs, we'll choose one, namely
    # the longest one


    # now the algorithm is pretty simple.  Start building "groups" of contigs
    # by following clone links and stopping when we find a non-chosen contig
    # that has conflicts

    # first we need to find all contig mates
    my %valids;    # number of valid links
    my %validLs;   # list of valid links
    my %invalids;  # number of invalid links
    my %invalidLs; # list of invalid links
    my %neighbors; # right neighbors of contig

    my @links = split(' ', $scaffLink{$scaffId});

    for (my $l = 0; $l <= $#links; $l++){
	my $lnk = $links[$l];
	
	my ($ctgA, $sa, $ctgB, $sb) = ctgPair($lnk);
	
	if ($sa != $scaffId || $sb != $scaffId){
	    $base->logError("Wierd link $lnk in scaffold $scaffId connects two scaffolds $sa and $sb", 1);
	    next;
	}
	
	my $edge;
	if (($contigStart{$ctgA} <= $contigStart{$ctgB})){
	    $edge = "$ctgA $ctgB" ;
	} else { 
	    $edge = "$ctgB $ctgA";
	}

	$neighbors{$ctgA} .= "$ctgB ";
	$neighbors{$ctgB} .= "$ctgA ";
	
	if ($linkValid{$lnk} eq "VALID"){
	    $valids{$edge}++;
	    $validLs{$edge} .= " $lnk";
	} else {
	    $invalids{$edge}++;
	    $invalidLs{$edge} .= " $lnk";
	} 
    } # for all links


    my $chunk = 0;  # which scaffold chunk

    my $busy = 1;   # we are not done
    my %added = (); # contigs added to current scaffold

    while ($busy) {
	$busy = 0;
	for (my $con = 0; $con <= $#ctgs; $con++){
	    print STDERR "trying node $ctgs[$con]\n";
	    if (!exists $added{$ctgs[$con]}){
		print STDERR "contig $ctgs[$con] was not added\n";
		print STDERR "it is the seed\n";
		$busy = 1;
		$chunk++; # start a new scaffold chunk
		my @scf = ();
		my @scflinks = ();
		my @stk = ();
		my %instack = ();
		push (@stk, $ctgs[$con]);
		$instack{$ctgs[$con]} = 1;
		print STDERR "Adding $ctgs[$con] to stack\n";

		# we add contigs to the scaffold until we meet one that
		# belongs to a cluster we've already seen

		while ($#stk >= 0){
		    my $ctg = pop(@stk);
		    if (! exists $added{$ctg}){
			print STDERR "contig $ctg was not added yet\n";
			push(@scf, $ctg);
			$added{$ctg} = 1;
			
			my @nbrs = split(' ', $neighbors{$ctg});
			print STDERR "and has ", $#nbrs + 1, " neighbors\n";
			for (my $nb = 0; $nb <= $#nbrs; $nb++){
			    if (exists $added{$nbrs[$nb]}){
				# if already in scaffold skip it
				next;
			    }
			    if (exists $instack{$nbrs[$nb]}){
				# we've already added it to the todo pile
				next;
			    }

			    # now check all it's conflicts
			    my @cf = split(' ', $conflicts{$nbrs[$nb]});
			    my $conflict = 0;
			    my $conflictId;
			    for (my $cfi = 0; $cfi <= $#cf; $cfi++){
				if (exists $instack{$cf[$cfi]}){
				    $conflict = 1;
				    $conflictId = $cf[$cfi];
				    last;
				}
			    }
				    
			    my $e;
			    if (exists $validLs{"$ctg $nbrs[$nb]"}){
				$e = "$ctg $nbrs[$nb]";
			    } else {
				$e = "$nbrs[$nb] $ctg";
			    }

			    my @vals = split(' ', $validLs{$e});
			    my @invals = split(' ', $invalidLs{$e});			    
			    if ($conflict){
				print STDERR "skipping $nbrs[$nb] because we already saw $conflictId\n";
				# we don't want this neighborhood relationship

				for (my $ln = 0 ; $ln <= $#vals; $ln++){
				    push(@unseen, $vals[$ln]);
				}
				
				for (my $ln = 0; $ln <= $#invals; $ln++){
				    push(@unseen, $invals[$ln]);
				}
			    } else {
				# we want this neighbor
				push (@stk, $nbrs[$nb]);
				$instack{$nbrs[$nb]} = 1;
				print STDERR "adding $nbrs[$nb] to the stack\n";
				for (my $ln = 0 ; $ln <= $#vals; $ln++){
				    push(@scflinks, $vals[$ln]);
				}
				
				for (my $ln = 0; $ln <= $#invals; $ln++){
				    push(@scflinks, $invals[$ln]);
				}
			    }
			} # for each neighbor
		    } # if ! exists added
		} # while stack not empy

		# now we simply print out the scaffold chunk;
		my $newId = $scaffId . "_$chunk";
		
		print OUT "<SCAFFOLD ID = \"$newId\">\n";

		for (my $c = 0; $c <= $#scf; $c++){
		    my $conId = $scf[$c];
		    print OUT "<CONTIG ID = \"$conId\"\n";
		    print OUT "  X = \"$contigX{$conId}\"\n";
		    print OUT "  ORI = \"$contigOri{$conId}\"\n";
		    print OUT "></CONTIG>\n";
		    
		} # for each contig

		for (my $l = 0; $l <= $#scflinks; $l++){
		    my $lnId = $scflinks[$l];

		    print OUT "<LINK ID = \"$lnId\"\n";
		    print OUT "  VALID = \"$linkValid{$lnId}\"\n";
		    print OUT "  TAG = \"T\"\n";
		    print OUT "></LINK>\n";
		} # for each link

		print OUT "</SCAFFOLD>\n";

	    } # if ! exists $added
	} # for each contig
    } # while ($busy);

} # for each scaffold
print OUT "<UNUSED>\n";

for (my $l = 0; $l <= $#unseen; $l++){
    my $lnId = $unseen[$l];
    
    print OUT "<LINK ID = \"$lnId\"\n";
    print OUT "  VALID = \"$linkValid{$lnId}\"\n";
    print OUT "  TAG = \"T\"\n";
    print OUT "></LINK>\n";
} # for each link

print OUT "</UNUSED>\n";

# done with the work
print OUT "</GROUPING>\n";

exit(0);

###############################
# Functions
###############################

# returns the contig pair linked by the specified link
sub ctgPair
{
    my $lnk = shift;
    my $ctgA;
    my $ctgB;
    my $scaffA;
    my $scaffB;

    if (exists $inserts{$lnk}){
	my @seqs = split(' ', $inserts{$lnk});
	if ($#seqs != 1){
	    $base->logError("Link $lnk has less than 2 seqs", 1);
	} else {
	    my $seqA = $seqs[0];
	    my $seqB = $seqs[1];
	    
	    $ctgA = $seqCtg{$seqA};
	    $ctgB = $seqCtg{$seqB};
	}
    } else { # ! exists $inserts... link is MUMmer
	my @ctgs = split(' ', $linkCtg{$lnk});
	if ($#ctgs != 1){
	    $base->logError("Link $lnk has less than 2 contigs", 1);
	    return (undef, undef, undef, undef);
	}
	$ctgA = $ctgs[0];
	$ctgB = $ctgs[1];
    }

    $scaffA = $contigScaff{$ctgA};
    $scaffB = $contigScaff{$ctgB};
    
    return ($ctgA, $scaffA, $ctgB, $scaffB);
}

## XML functions ##
sub StartDocument
{
    print STDERR "starting\n";
}

sub StartTag
{
    my $tag = $_[1];
   
    if ($tag eq "EVIDENCE"){
	$inEvidence = 1;
    } elsif ($tag eq "LIBRARY") {
	$libId = $_{ID};
	$libraries{$libId} = "$_{MIN} $_{MAX}";
	$inLib = 1;
    } elsif ($tag eq "INSERT"){
	if (! $inLib){
	    $base->bail("Cannot have insert outside of library");
	}
	$inInsert = 1;
	$insId = $_{ID};
	$inserts{$insId} = "";
	$insertLib{$insId} = $libId;
    } elsif ($tag eq "CONTIG"){
	$conId = $_{ID};
	if ($inEvidence){
	    if ($inLink){
		$inContig = 1;
		$linkCtg{$id} .= "$conId ";
	    } else {
		$contigs{$conId} = $_{LEN};
	    }
	} else {
	    $contigX{$conId} = $_{X};
	    $contigStart{$conId} = ($_{ORI} eq "BE") ? 
		$_{X} : ($_{X} - $contigs{$conId});
	    $contigEnd{$conId} = ($_{ORI} eq "BE") ?
		($_{X} + $contigs{$conId}) : $_{X};
	    $contigOri{$conId} = $_{ORI};
	    $contigScaff{$conId} = $scaffId;
	    $scaffContig{$scaffId} .= "$conId ";
	    $scaffSize{$scaffId} += $contigs{$conId};
	}
    } elsif ($tag eq "SEQUENCE"){
	my $id = $_{ID};
	if ($inInsert){
	    $inserts{$insId} .= "$id ";
	    $seqName{$id} = $_{NAME};
	} else { # we are in contigs
	    $seqCtg{$id} = $conId;
	    $seqOri{$id} = $_{ORI};
	    $seqLend{$id} = $_{ASM_LEND};
	    $seqRend{$id} = $_{ASM_REND};
	}
    } elsif ($tag eq "SCAFFOLD") {
	$scaffId = $_{ID};
	push(@scaffolds, $scaffId);
    } elsif ($tag eq "LINK") {
	$id = $_{ID};
	if ($inEvidence){ #&& $_{TYPE} eq "MUMmer"){
	    if (! exists $libraries{$_{TYPE}}){
		$libraries{$_{TYPE}} = "0 0";
	    }
	    $insertLib{$id} = $_{TYPE}; # all MUMmer links belong to the MUMmer library
	    $inLink = 1;
	} else {
	    if ($inUnused) {
		push(@unseen, $id);
	    } else {
		$scaffLink{$scaffId} .= "$id ";
	    }
	    $linkValid{$id} = $_{VALID};
	}
    } elsif ($tag eq "UNUSED") {
	$inUnused = 1;
    } else {
#	print "Unknown tag: $tag\n";
    }
}

sub EndTag
{
    my $tag = $_[1];
    if ($tag eq "INSERT"){
	$inInsert = 0;
    } elsif ($tag eq "LIBRARY"){
	$inLib = 0;
    } elsif ($tag eq "UNUSED"){
	$inUnused = 0;
    } elsif ($tag eq "EVIDENCE"){
	$inEvidence = 0;
    } elsif ($tag eq "LINK"){
	if ($inEvidence){
	    $inLink = 0;
	}
    } elsif ($tag eq "CONTIG"){
	if ($inLink){
	    $inContig = 0;
	}
    }
}

sub Text
{
    if ($inContig){
#	$conEv{$conId} = $_;
    }
}

sub pi
{
}

sub EndDocument
{
    print STDERR "Done\n";
}
