#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;
$| = 1;
#Script to see the distance beetween all the contigs and scaffolds
#Usage: perl clustalReads.pl genome.fa > HammingDist.txt
#Dependancy: MAFFT
my ($refSeq, $SeqIds) = fastafile2hash($ARGV[0]);
my $tmpFile="tmpOutfile";
foreach my $chr (@$SeqIds) {
my $cnt=0;
my $value=scalar(@$SeqIds);
do{{
$value=$value-1;
my $ofh = write_fh($tmpFile);
next if $chr eq @$SeqIds[$cnt];
print "$chr\t@$SeqIds[$cnt]\t";
print $ofh ">$chr\n$refSeq->{$chr}{seq}\n>@$SeqIds[$cnt]\n$refSeq->{@$SeqIds[$cnt]}{seq}\n";
close $ofh or die "Could not close file '$tmpFile' $!";
$cnt++;
system ("mafft --retree 1 --maxiterate 0 --thread 40 $tmpFile > MAFFT_out");
#usage: perl clustalRead.pl MAFFT_out
my ($seqRef, $ids) = fastafile2hash('MAFFT_out');
my ($fval, $sval) = trimMSA($seqRef->{$ids->[0]}{seq}, $seqRef->{$ids->[1]}{seq});
my $seq1 = substr $seqRef->{$ids->[0]}{seq}, $fval, ($seqRef->{$ids->[0]}{len}-$sval)-$fval;
my $seq2 = substr $seqRef->{$ids->[1]}{seq}, $fval, ($seqRef->{$ids->[1]}{len}-$sval)-$fval;
my $val = hammingDist($seq1, $seq2);
print "$val\n";
}} until($value<=0);
}
#calculate the hamming distance
sub hammingDist {
return ($_[0] ^ $_[1]) =~ tr/\001-\255//;
}
#Remove '-' from both end
sub trimMSA {
my ($seq1, $seq2) = @_;
my ($substring1) = $seq1 =~ /^.{0}(.*?)[a-zA-Z_]/s;
my ($substring2) = $seq2 =~ /^.{0}(.*?)[a-zA-Z_]/s;
my $trimLen1=length($substring1); my $trimLen2=length($substring2);
my $val;
#print "$trimLen1 <= $trimLen2\n";
if ($trimLen1 >= $trimLen2) {$val = $trimLen1; } elsif ($trimLen1 <= $trimLen2) {$val = $trimLen2; }
my $seq1_rev = reverse $seq1; my $seq2_rev = reverse $seq2;
my ($substring1_rev) = $seq1_rev =~ /^.{0}(.*?)[a-zA-Z_]/s;
my ($substring2_rev) = $seq2_rev =~ /^.{0}(.*?)[a-zA-Z_]/s;
my $trimLen1_rev=length($substring1_rev); my $trimLen2_rev=length($substring2_rev);
my $val_rev;
if ($trimLen1_rev >= $trimLen2_rev) {$val_rev = $trimLen1_rev; } elsif ($trimLen1_rev <= $trimLen2_rev) {$val_rev = $trimLen2_rev; }
return $val, $val_rev;
}
#Store fasta file to hash
sub fastafile2hash {
my $fastafile = shift @_;
my %sequences;
my $fh = &read_fh($fastafile);
my $seqid; my @allIds;
while (my $line = <$fh>) {
if ($line =~ /^>(\S+)(.*)/) {
$seqid = $1;
$sequences{$seqid}{desc} = $2;
push @allIds, $seqid;
}
else {
chomp $line;
$sequences{$seqid}{seq} .= $line;
$sequences{$seqid}{len} += length $line;
$sequences{$seqid}{gc} += ($line =~ tr/gcGC/gcGC/);
$line =~ s/[^atgc]/N/ig;
$sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
}
}
close $fh;
return \%sequences, \@allIds;
}
sub read_fh {
my $filename = shift @_;
my $filehandle;
if ($filename =~ /gz$/) {
open $filehandle, "gunzip -dc $filename |" or die $!;
}
else {
open $filehandle, "<$filename" or die $!;
}
return $filehandle;
}
sub write_fh {
my $filename = shift @_;
my $filehandle;
open $filehandle, ">$filename" or die $!;
return $filehandle;
}