#!/usr/bin/perl -w
#
# $Id: amberdash 125 2017-11-06 14:16:36Z dw $
#
# AMBERDASH: mdash interface for Amber trajectories.
#
# Copyright (C) 2013,2015 Centre for Molecular Design, University of Portsmouth.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

# If these programs are not on the PATH, specify their full pathnames.
# $PTRAJ may specify ptraj, which calculates statistics for the torsion
# angles, or cpptraj, which reads trajectories faster.
my $DASH  = 'mdash';
my $PTRAJ = 'cpptraj';

=head1 NAME

AMBERDASH - mdash interface for Amber trajectories

=head1 SYNOPSIS

B<amberdash>  [options] I<seed> [-- I<dash_options>]

Run B<mdash> on trajectories generated by the Amber molecular dynamics package (http://ambermd.org/).

=head1 OPTIONS

=over

=item B<-help>

Print help message and exit.

=item B<-version>

Print version number and exit.

=item B<-debug>

Print progress messages on B<stderr>.

=item B<-keep>

Keep all the intermediate files that are normally deleted when the
program exits.

=item B<-keep-dash-input>

Keep the B<dash> input file I<seed>.dash.in which is normally deleted
when the program exits.

=item B<-keep-ptraj-input>

Keep the B<ptraj> input file I<seed>.ptraj.in which is normally deleted
when the program exits.

=item B<-no-dash>

Generate (and keep) the B<dash> input file I<seed>.dash.in but do not
run B<mdash>.

=item B<-progress>

Print output from the B<ptraj> command on B<stdout>. Useful for
monitoring progress when reading large trajectories.

=item B<-snap>

Write PDB files containing snapshots representing the B<dash> states.

=item B<-backbone> I<r1[T]>B<:>I<r2[T]>

Analyze the sequence of backbone phi/psi torsion angles from residue
I<r1> to residue I<r2>. If I<T> is specified the residue is treated as
a terminal residue and only one torsion angle is included (psi for
I<r1>, phi for I<r2>).

=back

=head1 DASH OPTIONS

The I<dash_options> are described in the B<mdash> documentation.

=head1 INPUT FILES

Several input files are required, specified by the I<seed> prefix, in
order to identify the topology, the torsion angles and the trajectory.

The topology is always specified by an Amber topology file I<seed>.top.

The torsion angles may be specified either by the B<-backbone> option or
by a file I<seed>.tor. The B<-backbone> option takes precedence.

The trajectory may be specified either as a single Amber trajectory
file I<seed>.trj, or by a sequence of B<trajin> commands in a text
file I<seed>.trajin. If I<seed>.trajin exists, it is used and
I<seed>.trj is ignored; otherwise I<seed>.trj is used. The
I<seed>.trajin approach is necessary if the trajectory spans several
files or a subset of the trajectory is required via I<start>, I<stop>
and I<offset> arguments to B<trajin>.

If snapshots are required (B<-snap>) and the trajectory is specified
in I<seed>.trajin, the trajin commands in I<seed>.trajin must include
start, stop and offset fields. Otherwise the script is unable to
locate the representative B<dash> states in the trajectory and the
snapshots are omitted.

=over

=item I<seed>.top

An Amber topology file corresponding to the trajectory to be analyzed.

=item I<seed>.trajin

A text file containing B<trajin> commands to extract the trajectory to
be analyzed. Any lines not containing B<trajin> commands are ignored.

=item I<seed>.trj

An Amber trajectory file.

=item I<seed>.tor

A text file defining the torsion angles to be analyzed. Each torsion
angle is specified by a whitespace-separated line with five fields:

    name mask1 mask2 mask3 mask4

Here I<name> is an identifier and I<mask1>, ..., I<mask4> are Amber
atom masks defining the torsion angle. Lines starting with '#' are
treated as comments and ignored. This file must be prepared manually
by the user.

=back

=head1 OUTPUT FILES

=over

=item I<seed>.dash.out

The B<dash> output file.

=item I<seed>.ptraj.out

The output from the B<ptraj> command to extract the torsions.

=back

If B<-snap> is specified, the following files are written for each B<dash> state:

=over

=over

=item I<seed>.stateI<N>.I<frame>

The PDB file containing the representative frame I<frame> for B<dash> state I<N>.

=item I<seed>.ptraj.stateI<N>.out

The output from the ptraj command to generate the PDB file for B<dash> state I<N>.

=back

=back

If B<-keep> is specified, the following intermediate files are retained:

=over

=over

=item I<seed>.ptraj.in

The input file for the B<ptraj> command to extract the torsions.

=item I<seed>.I<name>

The torsion angles for each torsion I<name>.

=item I<seed>.dash.in

The dash input file obtained by joining the torsion angle files.

=back

=back

If B<-keep-ptraj> and B<-snap> are specified, the following files are retained:

=over

=over

=item I<seed>.ptraj.stateI<N>.in

The input file for the ptraj command to generate the PDB file for
B<dash> state I<N>.

=back

=back

=head1 INSTALLATION

The programs B<mdash> and either B<ptraj> or B<cpptraj> are
required. If they are not on the B<PATH> their full pathnames must be
specified at the top of the B<amberdash> script. B<cpptraj> reads
large trajectories faster than B<ptraj>, while B<ptraj> calculates
statistics for the torsion angles.

=head1 NOTE

All the output files will be clobbered by the next run of the script for the same I<seed>.

=head1 REFERENCE

 D. W. Salt, B. D. Hudson, L. Banting, M. J. Ellis and M. G. Ford
 DASH: A novel analysis method for molecular dynamics simulation data.
 Analysis of ligands of PPAR-gamma, J. Med. Chem., 48, 3214-3220, 2005.

=head1 ACKNOWLEDGEMENT

This work was funded by the Peptide Research Network of Excellence
(PeReNE) (http://www.perene-project.eu/) as part of the INTERREG IV A
France (Channel) England Programme (http://www.interreg4a-manche.eu/).

=head1 AUTHOR

David Whitley, University of Portsmouth <david.whitley@port.ac.uk>.

=cut

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

sub write_ptraj_input($);
sub run_ptraj($$);
sub find_local_frame($);

$| = 1;

# command line options
my $progname = basename $0;
my $help = 0;
my $version = 0;
my $debug = 0;
my $keep = 0;
my $keep_dash_input = 0;
my $keep_ptraj_input = 0;
my $keep_angles = 0;
my $progress = 0;
my $no_dash = 0;
my $snap = 0;
my $backbone = "";
my ($bb1, $bb2, $bbt1, $bbt2);

# check dash runs and extract version numbers
my ($dash_major_version, $dash_minor_version) = find_dash_version();

# svn revision number
my @revision = split(/\s+/,'$Revision: 125 $');

my $title = "AMBERDASH - mdash interface for Amber trajectories, revision $revision[1].\n";

my $copyright =
"Copyright (C) 2013,2015 Centre for Molecular Design, University of Portsmouth.
This program is free software; see the GNU General Public License for more details.
This program is distributed WITH ABSOLUTELY NO WARRANTY.\n";

my $usage =
"Usage:
   $progname [options] seed [-- dash_options]

Options:
   -help             : print help message and exit
   -version          : print version number and exit
   -debug            : print progress messages on stderr
   -keep             : keep all intermediate files
   -keep-dash-input  : keep the dash input file
   -keep-ptraj-input : keep the ptraj input file
   -no-dash          : keep the dash input file but do not run mdash
   -progress         : print output from the ptraj command on stdout
   -snap             : write PDB files containing snapshots representing the dash states
   -backbone r1:r2   : analyze the sequence of backbone torsion angles from residue r1 to residue r2

For full documentation, use the command \"perldoc $progname\".\n";

# process command line
GetOptions(
    "help" => \$help,
    "version" => \$version,
    "debug" => \$debug,
    "progress" => \$progress,
    "keep" => \$keep,
    "keep-dash-input" => \$keep_dash_input,
    "keep-ptraj-input" => \$keep_ptraj_input,
    "no-dash" => \$no_dash,
    "snap" => \$snap,
    "backbone=s" => \$backbone
    ) or die $usage;

if ($help) {
    print $title, $copyright, "\n", $usage;
    exit 0;
}

if ($version) {
    print $title, $copyright;
    exit 0;
}

if ($keep) {
    $keep_dash_input = $keep_ptraj_input = $keep_angles = 1;
}

if ($backbone) {
    (($bb1, $bbt1, $bb2, $bbt2) = ($backbone =~ /^(\d+)(T?):(\d+)(T?)$/))
        or die "wrong format for -backbone range\n", $usage;
    ($bb1 < $bb2)
        or die "-backbone range r1:r2 requires r1 < r2\n", $usage;
}

die $usage if ($#ARGV < 0);

my $seed = shift @ARGV;
my @dash_opts = @ARGV;

if ($debug) {
    print STDERR "command line arguments\n";
    print STDERR "seed: ", $seed, "\n";
    print STDERR "snap: ", $snap, "\n";
    print STDERR "backbone: ", $backbone, "\n";
    if ($backbone) {
        print STDERR "back1: ", $bb1, "\n";
        print STDERR "back2: ", $bb2, "\n";
    }
    print STDERR "dash_opts: ", join(" ", @dash_opts), "\n";
}

-e $seed . ".top" or die "topology file $seed.top does not exist, exiting\n";
-r $seed . ".top" or die "topology file $seed.top is not readable, exiting\n";

# are we using ptraj or cpptraj?
my $cpptraj = ($PTRAJ =~ /cpptraj$/);

my @tornames;
my %torsions;
my $use_trajin = 0; # 0 => we're using seed.trj; 1 => we're using seed.trajin
my @trajin_cmd;
my $dash_in = $seed . ".dash.in";
my $dash_out = $seed . ".dash.out";
my $nframes;
my $ntorsions;

if ($backbone) {
    generate_tor();
} else {
    read_tor();
}

create_trajin_cmd();
write_ptraj_input("$seed.ptraj.in");
run_ptraj("$seed.ptraj.in", "$seed.ptraj.out");
write_dash_input();

if (!$no_dash) {
    run_dash();

    if ($snap) {
        write_snaps();
    }
}

sub find_dash_version {
    my $dash_version = qx($DASH -v 2>&1);

    if (!defined $dash_version) {
        die "please edit $progname to specify the mdash program\nunable to execute $DASH";
    }

    my ($major, $minor) = ($dash_version =~ /^mdash (\d+).(\d+)/);

    if (defined $major && defined $minor) {
        print STDERR "found mdash $major.$minor\n" if $debug;
    } else {
        die "$progname: unknown mdash version\n";
    }

    return ($major, $minor);
}

# generate the torsion definitions for the specified range of backbone residues
# store the torsion definitions in the %torsions hash of arrays
# store the torsion names in the @tornames array (so we can retrieve
# them from the hash in their input order)
#
# for each torsion T, the pattern is
# phiT :(T-1)@C :T@N :T@CA :T@C
# psiT :T@N :T@CA :T@C :(T+1)@N
sub generate_tor {
    if ($bbt1) {
        push @tornames, "psi$bb1";
        $torsions{"psi$bb1"} = [(":$bb1\@N",":$bb1\@CA", ":$bb1\@C", ":" . ($bb1+1) . "\@N")];
        ++$bb1;
    }

    --$bb2 if ($bbt2);

    for (my $t=$bb1; $t<=$bb2; $t++) {
        push @tornames, "phi$t";
        $torsions{"phi$t"} = [(":" . ($t-1) . "\@C",":$t\@N", ":$t\@CA", ":$t\@C")];
        push @tornames, "psi$t";
        $torsions{"psi$t"} = [(":$t\@N",":$t\@CA", ":$t\@C", ":" . ($t+1) . "\@N")];
    }

    if ($bbt2) {
        ++$bb2;
        push @tornames, "phi$bb2";
        $torsions{"phi$bb2"} = [(":" . ($bb2-1) . "\@C",":$bb2\@N", ":$bb2\@CA", ":$bb2\@C")];
    }

    if ($debug) {
        print STDERR "generated ", scalar @tornames, " torsions:\n";

        foreach my $k (@tornames) {
            print STDERR "  $k: @{$torsions{$k}}\n";
        }
    }
}

# read the torsion file <seed>.tor
# store the torsion definitions in the %torsions hash of arrays
# store the torsion names in the @tornames array (so we can retrieve
# them from the hash in their input order)
sub read_tor {
    my $tor = $seed . ".tor";

    open (my $fh, '<', $tor) or die "$progname: torsion file $tor not found: $!\n";

    print STDERR "reading torsion file: $tor\n" if $debug;

    while(my $line = <$fh>) {
        chomp($line);
        print STDERR "line $.: |$line|\n" if $debug;
        $line =~ s/^\s+//; # remove leading whitespace

        if ($line =~ /^$/) {
            print STDERR "ignoring blank line\n" if $debug;
            next;
        }

        if ($line =~ /^#/) {
            print STDERR "ignoring comment line\n" if $debug;
            next;
        }

        my @tordef = split /\s+/, $line;
        my $ntor = $#tordef + 1;
        ($ntor == 5) or die "invalid torsion at line $. of $tor: expecting 5 fields, found $ntor\n";

        my $torname = shift @tordef;
        push @tornames, $torname;
        $torsions{$torname} = [@tordef];
    }

    if ($debug) {
        print STDERR "found ", scalar @tornames, " torsions:\n";

        foreach my $k (@tornames) {
            print STDERR "  $k: @{$torsions{$k}}\n";
        }
    }

    close $fh;
}

# create the trajin section of the ptraj command script
# if seed.trajin exists, use its contents, otherwise use "trajin seed.trj"
sub create_trajin_cmd {
    my $trajin = $seed . ".trajin";
    my $trj = $seed . ".trj";

    if (-e $trajin) {
        my $fh;

        open ($fh, '<', $trajin)
            or die "can't read trajin command file: $trajin: $!\n";

        # ignore lines not starting with a trajin command
        while (<$fh>) {
            if (/^\s*trajin.*/) { push @trajin_cmd, $_; }
        }

        close $fh;

        $use_trajin = 1;
        print STDERR "using trajectory from $trajin\n" if $debug;

        if ($snap) {
            check_trajin_cmd()
        }
    } elsif (-e $trj) {
        push @trajin_cmd, "trajin $trj";
        print STDERR "using trajectory $trj\n" if $debug;
    } else {
        die "can't find $trajin or $trj, exiting.\n";
   }
}

# check that trajin commands contain start, stop and offset fields
# called when snapshots are required for a trajectory specified via seed.trajin
sub check_trajin_cmd {
    for my $line (@trajin_cmd) {
        my @fields = split /\s+/, $line;

        die "To generate snapshots, trajin commands require start, stop and offset fields. Exiting.\n"
            if ($#fields < 4);
    }
}

# write the ptraj command script
sub write_ptraj_input ($) {
    my ($ptraj_in) = @_;
    my $fh;

    open ($fh, '>', $ptraj_in)
        or die "can't write ptraj command script: $ptraj_in: $!\n";

    print STDERR "writing ptraj command script: $ptraj_in\n" if $debug;
    print $fh @trajin_cmd, "\n";

    foreach my $k (@tornames) {
        print $fh "dihedral $k @{$torsions{$k}} out $seed.$k\n"
    }

    if ($cpptraj) {
        print $fh "$progress\n";
    } else {
        print $fh "analyze statistics all\n";
    }

    print $fh "go\n";
    close $fh;

    if ($debug) {
        open ($fh, '<', $ptraj_in)
            or die "can't read ptraj command script: $ptraj_in: $!\n";
        while (<$fh>) { print STDERR "  $_"; }
        close $fh;
    }
}

sub run_ptraj ($$) {
    my ($ptraj_in, $ptraj_out) = @_;
    my $top = $seed . ".top";
    my $start = (times)[0];

    open PTRAJPIPE, "$PTRAJ $top < $ptraj_in |" or die "can't open ptraj pipe: $!";
    open PTRAJOUT, ">$ptraj_out" or die "can't write to $ptraj_out: $!";

    while (<PTRAJPIPE>) {
        if ($progress) { print; }
        print PTRAJOUT $_;
    }

    my $stop =  (times)[0];

    if ($progress) {
        printf("CPU TIME: ptraj took %.2fs\n", $stop - $start);
    }

    unlink $ptraj_in unless $keep_ptraj_input;
}

# join the ptraj output files for each torsion into a single input
# file for dash
sub write_dash_input {
    my $in;
    my $out;
    my @angles;

    foreach my $k (@tornames) {
        my $torfile = $seed . "." . $k;

        open ($in, '<', $torfile)
            or die "can't read angles for torsion $k from $torfile: $!\n";

        while (<$in>) {
            my ($frame, $angle) = split;
            next if ($frame eq "#Frame"); # ignore cpptraj header line
            push @{$angles[$frame]}, $angle;
        }

        close $in;

        if (!$keep_angles) {
            unlink $torfile
                or die "failed to delete $torfile: $!\n";
        }
    }

    $nframes = $#angles;           # frame numbers start at 1
    $ntorsions = $#{$angles[1]}+1; # torsion indices start at 0

    open ($out, '>', $dash_in)
        or die "can't write dash input file: $dash_in: $!\n";

    for my $f (1 .. $nframes) {
        print $out join(" ", @{$angles[$f]}), "\n";
    }

    close $out;

    if ($debug) {
        print STDERR "nframes: $nframes\n";
        print STDERR "ntorsions: $ntorsions\n";
    }
}

sub run_dash {
    if ($dash_major_version == 2 && $dash_minor_version <= 10) {
        run_dash_1();
    } else {
        run_dash_2();
    }

    unlink $dash_in unless $keep_dash_input;
}

sub run_dash_1 {
    if ($snap) {
        push @dash_opts, "-R";
    }

    my $dash_cmd = "$DASH -N $nframes -T $ntorsions " . join(" ", @dash_opts) . " < $dash_in > $dash_out";

    print STDERR "running dash command: $dash_cmd\n" if $debug;

    system("$dash_cmd") == 0
        or die "$progname: dash command failed: $!\n";
}

sub run_dash_2 {
    my $dash_cmd = "$DASH " . join(" ", @dash_opts) . " -i $dash_in -o $dash_out";

    print STDERR "running dash command: $dash_cmd\n" if $debug;

    system("$dash_cmd")/256 == 255
        or die "$progname: dash command failed: $!\n";
}

# write pdb files for dash state representative frames
sub write_snaps {
    my ($fh, $nstates);
    my @frames;

    my (@files, @start, @stop, @step, @numframes, @cumframes);

    if ($use_trajin) {
        print STDERR "reading trajin commands\n" if $debug;

        for my $line (@trajin_cmd) {
            my @fields = split /\s+/, $line;
            shift @fields; # trajin
            push @files, shift @fields;
            push @start, shift @fields;
            push @stop, shift @fields;
            push @step, shift @fields;
        }

        for (my $i=0; $i<=$#start; $i++) {
            $numframes[$i] = int(($stop[$i] - $start[$i])/$step[$i]) + 1;
        }

        $cumframes[0] = $numframes[0];
        for (my $i=1; $i<=$#start; $i++) {
            $cumframes[$i] = $cumframes[$i-1] + $numframes[$i];
        }

        if ($debug) {
            print STDERR "files: ", join(" ", @files), "\n";
            print STDERR "start: ", join(" ", @start), "\n";
            print STDERR "stop: ", join(" ", @stop), "\n";
            print STDERR "step: ", join(" ", @step), "\n";
            print STDERR "numframes: ", join(" ", @numframes), "\n";
            print STDERR "cumframes: ", join(" ", @cumframes), "\n";
        }

        # sanity check
        # catch (some) cases where stop field in trajin command is past the end of the trajectory
        # ptraj warns about this, but the warning is easily missed
        if ($cumframes[$#start] != $nframes) {
            print STDERR "$seed.trajin specifies $nframes frames but ptraj read $cumframes[$#start] frames\n";
            die "unable to locate snapshots, exiting\n";
        }
    }

    open ($fh, '<', $dash_out)
        or die "can't open dash output file: $dash_out: $!\n";

    print STDERR "reading dash output file: $dash_out\n" if $debug;

    if ($dash_major_version == 2 && $dash_minor_version <= 10) {
        while (<$fh>) {
            # read the number of dash states for later sanity check
            if(/^final states\s+= (\d+)/) { $nstates = $1 };

            # stop at <DashStateRepresentativeFrames> block
            last if (/DashStateRepresentativeFrames/);
        }

        while (<$fh>) {
            # in section <DashStateRepresentativeFrames>
            # extract frame number from lines matching "state X: frame Y, rmsd = Z"
            if (/^state \d+: frame (\d+)/) { push @frames, $1 };

            # break at end of block
            last if (/\/DashStateRepresentativeFrames/);
        }
    } else {
        while (<$fh>) {
            # read the number of dash states for later sanity check
            if(/^final states\s+: (\d+)/) { $nstates = $1 };

            # stop at [DASH_STATE_DISTRIBUTION] block
            last if (/DASH_STATE_DISTRIBUTION/);
        }

        while (<$fh>) {
            # in section [DASH_STATE_DISTRIBUTION]
            # extract frame number from lines matching
            # State	  Frames   %Frames   Rep.Frame      RMSD
            # 1              265      1.06       9700       5.25
            if (/^\d+\s+\d+\s+\d+.\d+\s+(\d+)\s+\d+.\d+$/) { push @frames, $1 };

            # break at blank line
            last if (/^$/);
        }
    }

    close $fh;

    die "$progname: failed to read number of states from $dash_out\n"
        if (!defined $nstates);

    die "$progname: failed to read representative frames from $dash_out: expected $nstates, found " . ($#frames+1) . "\n"
        if ($nstates != $#frames+1);

    print STDERR "representative frames: ", join(" ", @frames), "\n" if $debug;

    my $state = 0;

    for my $f (@frames) {
        ++$state;

        # create ptraj command
        my $ptraj_in = $seed . ".ptraj.state$state.in";
        my $ptraj_out = $seed . ".ptraj.state$state.out";

        open ($fh, '>', $ptraj_in)
            or die "can't write ptraj snapshot command script: $ptraj_in: $!\n";

        print STDERR "writing ptraj snapshot command script: $ptraj_in\n" if $debug;

        if ($use_trajin) {
            my $found = 0;
            for (my $i=0; $i<=$#start; $i++) {
                if ($f <= $cumframes[$i]) {
                    my $s = ($i>0) ? $start[$i] + $step[$i]*($f - $cumframes[$i-1] - 1) : $start[$i] + $step[$i]*($f - 1);
                    print STDERR "global frame $f is local frame $s in file $files[$i]\n" if $debug;
                    print $fh "trajin $files[$i] $s $s\n";
                    $found = 1;
                    last;
                }
            }

            die "global frame $f is not in the trajectory, exiting.\n" if !$found;
        } else {
            print $fh "trajin $seed.trj $f $f\n";
        }

        print $fh "trajout $seed.state$state.pdb pdb\n";
        print $fh "go\n";
        close $fh;

        run_ptraj($ptraj_in, $ptraj_out);
    }
}
