Perl script to find the distance beetween all the contigs and scaffolds

Jit — Tue, 26 Jun 2018 04:40:48 -0500
#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;

$| = 1;

#Script to see the distance beetween all the contigs and scaffolds
#Usage: perl clustalReads.pl genome.fa > HammingDist.txt
#Dependancy: MAFFT

my ($refSeq, $SeqIds) = fastafile2hash($ARGV[0]);
my $tmpFile="tmpOutfile";
foreach my $chr (@$SeqIds) {
	my $cnt=0;
	my $value=scalar(@$SeqIds);
	do{{ 		
		$value=$value-1;
		my $ofh = write_fh($tmpFile);
		next if $chr eq @$SeqIds[$cnt];
		print "$chr\t@$SeqIds[$cnt]\t"; 
		print $ofh ">$chr\n$refSeq->{$chr}{seq}\n>@$SeqIds[$cnt]\n$refSeq->{@$SeqIds[$cnt]}{seq}\n";
		close $ofh or die "Could not close file '$tmpFile' $!";
		$cnt++;
		system ("mafft --retree 1 --maxiterate 0 --thread 40 $tmpFile > MAFFT_out");
		#usage: perl clustalRead.pl MAFFT_out

		my ($seqRef, $ids) = fastafile2hash('MAFFT_out');

		my ($fval, $sval) = trimMSA($seqRef->{$ids->[0]}{seq}, $seqRef->{$ids->[1]}{seq});
		my $seq1  = substr $seqRef->{$ids->[0]}{seq}, $fval, ($seqRef->{$ids->[0]}{len}-$sval)-$fval;
		my $seq2  = substr $seqRef->{$ids->[1]}{seq}, $fval, ($seqRef->{$ids->[1]}{len}-$sval)-$fval;

		my $val = hammingDist($seq1, $seq2);
		print "$val\n";
		}} until($value<=0);
}


#calculate the hamming distance
sub hammingDist {
	return ($_[0] ^ $_[1]) =~ tr/\001-\255//;
}

#Remove '-' from both end
sub trimMSA {
    my ($seq1, $seq2) = @_;
    my ($substring1) = $seq1 =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my ($substring2) = $seq2 =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my $trimLen1=length($substring1); my $trimLen2=length($substring2);
    my $val;
#print "$trimLen1 <= $trimLen2\n";
    if ($trimLen1 >= $trimLen2) {$val = $trimLen1; } elsif ($trimLen1 <= $trimLen2) {$val = $trimLen2; }
    
    my $seq1_rev = reverse $seq1; my $seq2_rev = reverse $seq2;

    my ($substring1_rev) = $seq1_rev =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my ($substring2_rev) = $seq2_rev =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my $trimLen1_rev=length($substring1_rev); my $trimLen2_rev=length($substring2_rev);
    my $val_rev;
    if ($trimLen1_rev >= $trimLen2_rev) {$val_rev = $trimLen1_rev; } elsif ($trimLen1_rev <= $trimLen2_rev) {$val_rev = $trimLen2_rev; }
    
    return $val, $val_rev;
}


#Store fasta file to hash
sub fastafile2hash {
    my $fastafile = shift @_;
    my %sequences;
    my $fh = &read_fh($fastafile);
    my $seqid; my @allIds;
    while (my $line = <$fh>) {
        if ($line =~ /^>(\S+)(.*)/) {
            $seqid = $1;
            $sequences{$seqid}{desc} = $2;
	    push @allIds, $seqid;
        }
        else {
            chomp $line;
            $sequences{$seqid}{seq}     .= $line;
            $sequences{$seqid}{len}     += length $line;
            $sequences{$seqid}{gc}      += ($line =~ tr/gcGC/gcGC/);
            $line =~ s/[^atgc]/N/ig;
            $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
        }
    }
    close $fh;
    return \%sequences, \@allIds;
}


sub read_fh {
    my $filename = shift @_;
    my $filehandle;
    if ($filename =~ /gz$/) {
        open $filehandle, "gunzip -dc $filename |" or die $!;
    }
    else {
        open $filehandle, "<$filename" or die $!;
    }
    return $filehandle;
}

sub write_fh {
    my $filename = shift @_;
    my $filehandle;
    open $filehandle, ">$filename" or die $!;
    return $filehandle;
}
BOL: Perl script to find the distance beetween all the contigs and scaffolds

Perl script to find the distance beetween all the contigs and scaffolds