BOL: Perl script to find the distance beetween all the contigs and scaffolds

BioScripts
Jit
Perl script to find the distance beetween all the contigs and scaffolds
Perl script to find the distance beetween all the contigs and scaffolds

By Jit 2343 days ago
#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;

$| = 1;

#Script to see the distance beetween all the contigs and scaffolds
#Usage: perl clustalReads.pl genome.fa > HammingDist.txt
#Dependancy: MAFFT

my ($refSeq, $SeqIds) = fastafile2hash($ARGV[0]);
my $tmpFile="tmpOutfile";
foreach my $chr (@$SeqIds) {
	my $cnt=0;
	my $value=scalar(@$SeqIds);
	do{{ 		
		$value=$value-1;
		my $ofh = write_fh($tmpFile);
		next if $chr eq @$SeqIds[$cnt];
		print "$chr\t@$SeqIds[$cnt]\t"; 
		print $ofh ">$chr\n$refSeq->{$chr}{seq}\n>@$SeqIds[$cnt]\n$refSeq->{@$SeqIds[$cnt]}{seq}\n";
		close $ofh or die "Could not close file '$tmpFile' $!";
		$cnt++;
		system ("mafft --retree 1 --maxiterate 0 --thread 40 $tmpFile > MAFFT_out");
		#usage: perl clustalRead.pl MAFFT_out

		my ($seqRef, $ids) = fastafile2hash('MAFFT_out');

		my ($fval, $sval) = trimMSA($seqRef->{$ids->[0]}{seq}, $seqRef->{$ids->[1]}{seq});
		my $seq1  = substr $seqRef->{$ids->[0]}{seq}, $fval, ($seqRef->{$ids->[0]}{len}-$sval)-$fval;
		my $seq2  = substr $seqRef->{$ids->[1]}{seq}, $fval, ($seqRef->{$ids->[1]}{len}-$sval)-$fval;

		my $val = hammingDist($seq1, $seq2);
		print "$val\n";
		}} until($value<=0);
}


#calculate the hamming distance
sub hammingDist {
	return ($_[0] ^ $_[1]) =~ tr/\001-\255//;
}

#Remove '-' from both end
sub trimMSA {
    my ($seq1, $seq2) = @_;
    my ($substring1) = $seq1 =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my ($substring2) = $seq2 =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my $trimLen1=length($substring1); my $trimLen2=length($substring2);
    my $val;
#print "$trimLen1 <= $trimLen2\n";
    if ($trimLen1 >= $trimLen2) {$val = $trimLen1; } elsif ($trimLen1 <= $trimLen2) {$val = $trimLen2; }
    
    my $seq1_rev = reverse $seq1; my $seq2_rev = reverse $seq2;

    my ($substring1_rev) = $seq1_rev =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my ($substring2_rev) = $seq2_rev =~ /^.{0}(.*?)[a-zA-Z_]/s;
    my $trimLen1_rev=length($substring1_rev); my $trimLen2_rev=length($substring2_rev);
    my $val_rev;
    if ($trimLen1_rev >= $trimLen2_rev) {$val_rev = $trimLen1_rev; } elsif ($trimLen1_rev <= $trimLen2_rev) {$val_rev = $trimLen2_rev; }
    
    return $val, $val_rev;
}


#Store fasta file to hash
sub fastafile2hash {
    my $fastafile = shift @_;
    my %sequences;
    my $fh = &read_fh($fastafile);
    my $seqid; my @allIds;
    while (my $line = <$fh>) {
        if ($line =~ /^>(\S+)(.*)/) {
            $seqid = $1;
            $sequences{$seqid}{desc} = $2;
	    push @allIds, $seqid;
        }
        else {
            chomp $line;
            $sequences{$seqid}{seq}     .= $line;
            $sequences{$seqid}{len}     += length $line;
            $sequences{$seqid}{gc}      += ($line =~ tr/gcGC/gcGC/);
            $line =~ s/[^atgc]/N/ig;
            $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
        }
    }
    close $fh;
    return \%sequences, \@allIds;
}


sub read_fh {
    my $filename = shift @_;
    my $filehandle;
    if ($filename =~ /gz$/) {
        open $filehandle, "gunzip -dc $filename |" or die $!;
    }
    else {
        open $filehandle, "<$filename" or die $!;
    }
    return $filehandle;
}

sub write_fh {
    my $filename = shift @_;
    my $filehandle;
    open $filehandle, ">$filename" or die $!;
    return $filehandle;
}
BOL

Jit

Our Sponsors

Perl script to find the distance beetween all the contigs and scaffolds