#!/usr/bin/perl
# USAGE: perl nw.pl HEAGAWGHEE PAWHEAE BLOSUM50.txt -8
# See: "Biological sequence anaysis" Durbin et al. ed. CUP 1998, Pg. 19
# Needleman-Wunsch global alignment algo (GOTHO 1982 mod)
# usage statement
die "usage: $0 <sequence 1> <sequence 2> <substmatrix> <gapscore> \n" unless @ARGV == 4;
# get sequences, matrix and gapcost from command line
my ($seq1, $seq2, $smfile, $gapcost) = @ARGV;
# scoring scheme (instead of using fixed MATCH and MISMATCH scores we will use values read from BLOSUM50)
my $MATCH = 1; # +1 for letters that match
my $MISMATCH = -1; # -1 for letters that mismatch
my $GAP = $gapcost; # for any gap
my %BLOSUM50 = ();
my @aalist = ();
# read substitution matrix
open in, $smfile;
while(<in>){
if($.<2){next;}
# read columns names (aa)
if($.<3){
chop $_;
@aalist=split(/\s+/,$_);
next;
}
chop $_;
@vals=split(/\s+/,$_);
$curaaROW=$vals[0];
for($i=1;$i<=$#vals;$i++){
$curaaCOLUMN=$aalist[$i];
$BLOSUM50{$curaaROW}{$curaaCOLUMN}=$vals[$i];
}
}
close in;
# initialization
my @matrix;
$matrix[0][0]{score} = 0;
$matrix[0][0]{pointer} = "none";
for(my $j = 1; $j <= length($seq1); $j++) {
$matrix[0][$j]{score} = $GAP * $j;
$matrix[0][$j]{pointer} = "left";
}
for (my $i = 1; $i <= length($seq2); $i++) {
$matrix[$i][0]{score} = $GAP * $i;
$matrix[$i][0]{pointer} = "up";
}
# fill
for(my $i = 1; $i <= length($seq2); $i++) {
for(my $j = 1; $j <= length($seq1); $j++) {
my ($diagonal_score, $left_score, $up_score);
# calculate match score
my $letter1 = substr($seq1, $j-1, 1);
my $letter2 = substr($seq2, $i-1, 1);
if ($letter1 eq $letter2) {
$diagonal_score = $matrix[$i-1][$j-1]{score} + $BLOSUM50{$letter1}{$letter2};
}
else {
$diagonal_score = $matrix[$i-1][$j-1]{score} + $BLOSUM50{$letter1}{$letter2};
}
# calculate gap scores
$up_score = $matrix[$i-1][$j]{score} + $GAP;
$left_score = $matrix[$i][$j-1]{score} + $GAP;
# choose best score
if ($diagonal_score >= $up_score) {
if ($diagonal_score >= $left_score) {
$matrix[$i][$j]{score} = $diagonal_score;
$matrix[$i][$j]{pointer} = "diagonal";
}
else {
$matrix[$i][$j]{score} = $left_score;
$matrix[$i][$j]{pointer} = "left";
}
} else {
if ($up_score >= $left_score) {
$matrix[$i][$j]{score} = $up_score;
$matrix[$i][$j]{pointer} = "up";
}
else {
$matrix[$i][$j]{score} = $left_score;
$matrix[$i][$j]{pointer} = "left";
}
}
}
}
# trace-back
my $align1 = "";
my $align2 = "";
my $descrstr = "";
# start at last cell of matrix
my $j = length($seq1);
my $i = length($seq2);
while (1) {
last if $matrix[$i][$j]{pointer} eq "none"; # ends at first cell of matrix
if ($matrix[$i][$j]{pointer} eq "diagonal") {
$align1 .= substr($seq1, $j-1, 1);
$align2 .= substr($seq2, $i-1, 1);
if(substr($seq1, $j-1,1) eq substr($seq2, $i-1,1)){$descrstr .="|";}else{$descrstr .= ".";}
$i--;
$j--;
}
elsif ($matrix[$i][$j]{pointer} eq "left") {
$align1 .= substr($seq1, $j-1, 1);
$align2 .= "-";
$descrstr .= " ";
$j--;
}
elsif ($matrix[$i][$j]{pointer} eq "up") {
$align1 .= "-";
$align2 .= substr($seq2, $i-1, 1);
$descrstr .= " ";
$i--;
}
}
$align1 = reverse $align1;
$align2 = reverse $align2;
$descrstr = reverse $descrstr;
# print matrices:
print "\n\n";
for(my $i = 0; $i <= length($seq2); $i++) {
for(my $j = 0; $j <= length($seq1); $j++) {
printf("%2.1f", $matrix[$i][$j]{score});
print("\t");
}
print"\n";
}
print "\n\n";
# print the alignment:
print "$align1\n";
print "$descrstr\n";
print "$align2\n";
__END__
# Entries for the BLOSUM50 matrix at a scale of ln(2)/3.0.
Find matrix at http://bioinformaticsonline.com/file/view/27455/blosum50-matrix