BOL: All

Extract ids from file with perl

Radha Agarkar — Wed, 15 Mar 2017 05:21:59 -0500

#!/usr/bin/perl
use strict;
use warnings;

# Collect the first tab-separated column of every line in fin.txt.
# %idHash maps each id to the line number where it was last seen;
# @allIds keeps the ids in file order so the final one can be identified.
my $fh = read_fh("fin.txt");
my %idHash;
my @allIds;
while (my $line = <$fh>) {
    chomp $line;
    my @cells = split /\t/, $line;
    next unless @cells;    # a blank line carries no id
    $idHash{ $cells[0] } = $.;
    push @allIds, $cells[0];
}
close $fh;

# Delete the last id for security -- the run that wrote it might not have
# finished all steps, so that id has to be treated as not yet done.
delete $idHash{ $allIds[-1] } if @allIds;

# Usage sketch: inside the loop that walks over candidate ids, skip the ones
# that were already recorded, e.g.
#     next if exists $idHash{$look_for};


############################################################
# Open a file for reading and return the filehandle.
#
# Arguments: $filename - path of the file to read; a name ending in "gz" is
#            decompressed on the fly through an external `gunzip -dc`.
# Returns:   a lexical filehandle ready for reading.
# Dies:      when the file cannot be opened or gunzip cannot be started.
sub read_fh {
    my ($filename) = @_;
    my $filehandle;
    if ($filename =~ /gz$/) {
        # List-form pipe open: the name reaches gunzip as a single argument,
        # so spaces or shell metacharacters in it are never interpreted.
        open $filehandle, '-|', 'gunzip', '-dc', $filename
            or die "Cannot run gunzip on '$filename': $!";
    }
    else {
        # Three-argument open keeps the mode separate from the file name.
        open $filehandle, '<', $filename
            or die "Cannot open '$filename': $!";
    }
    return $filehandle;
}

Transpose the file coordinates and plot dendrogram in R

Abhimanyu Singh — Mon, 06 Mar 2017 04:57:54 -0600

#Save this as tr.awk
# Transpose a table: input row j, field i becomes output row i, column j.
# Output columns are tab-separated.
# Usage: awk -f tr.awk input.txt > bbbout.txt

# Every record: remember each field under its (row, column) position.
{
    for (i=1; i<=NF; i++) a[NR,i]=$i
    if (NF > nf) nf = NF    # the widest row decides how many output rows exist
}
# After the last record: print column i of the input as output row i.
END {
    for (i=1; i<=nf; i++) {
        for (j=1; j<=NR; j++) {
            printf "%s", a[j,i]
            if (j < NR) printf "\t"
        }
        printf "\n"
    }
}

# Plot in R: read the tab-separated table, cluster its rows and draw the tree.
# The first line is the header and the first column supplies the row names.
tetra <- read.csv(
  file = "bbbout.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
row_distances <- dist(tetra)   # distances between rows, dist() defaults
hc <- hclust(row_distances)    # hierarchical clustering, hclust() defaults
plot(hc)

Extracting FASTA sequences based on position with perl script !!

Shruti Paniwala — Wed, 01 Mar 2017 17:10:11 -0600

#!/usr/bin/env perl
 
#Usage: perl sub-seq.pl input.fasta start end
 
use strict;
use warnings;
 
# Command line: input FASTA file(s) first, then START and END (1-based,
# inclusive). Both coordinates are popped off @ARGV so that only the file
# names remain for the <> operator below.
my $end   = pop;
my $start = pop;
die "Usage: $0 input.fasta start end\n"
    unless defined $start && defined $end
        && $start =~ /^\d+$/ && $end =~ /^\d+$/
        && $start >= 1 && $end >= $start;

# Read one FASTA record per iteration by splitting the input on '>'.
local $/ = '>';

while (<>) {
    chomp;    # removes the trailing '>' record separator
    # The first line of the record is the header. /p makes ${^POSTMATCH}
    # reliable on perls older than 5.20. The empty chunk in front of the
    # first '>' does not match and is skipped.
    next unless /(.+)/p;
    my $header = "$/$1_$start-$end\n";
    my $seq = ${^POSTMATCH};
    $seq =~ s/\s//g;
    print $header;
    # substr takes a LENGTH as third argument, not an end position, so the
    # inclusive range START..END is END - START + 1 characters long.
    my $piece = $start <= length $seq
        ? substr($seq, $start - 1, $end - $start + 1)
        : '';    # range starts beyond the sequence: print an empty line
    print $piece, "\n";
}

Extract a range from genome file with perl.

Abhimanyu Singh — Fri, 24 Feb 2017 09:30:11 -0600

#!/usr/bin/perl 

use strict;
use warnings;
use Bio::SeqIO;

# Usage: perl <script> input.fasta START END
# Truncates every record of the input FASTA to START..END and writes the
# results to "<input>.out"; each id gets "_START-END" appended.
my ($in_file, $start_pos, $end_pos) = @ARGV;
die "Usage: $0 input.fasta start end\n"
    unless defined $in_file && defined $start_pos && defined $end_pos;
die "start and end must be positive integers with start <= end\n"
    unless $start_pos =~ /^\d+$/ && $end_pos =~ /^\d+$/
        && $start_pos >= 1 && $start_pos <= $end_pos;

my $in = Bio::SeqIO->new ( -file => $in_file, -format => 'fasta');
my $out = Bio::SeqIO->new( -file => ">$in_file.out", -format => 'fasta');

while (my $seq = $in->next_seq() ) {
    # Rename first so the truncated copy written below carries the new id.
    $seq->display_id( $seq->display_id() . "_$start_pos-$end_pos" );
    # NOTE(review): trunc presumably throws when END exceeds the sequence
    # length -- confirm and decide whether such records should be skipped.
    $out->write_seq( $seq->trunc($start_pos, $end_pos) );
}

Check overlapping range with Perl

Jit — Fri, 24 Feb 2017 05:09:56 -0600

#!/usr/bin/perl

use strict;
use warnings;

# Build 10001 ascending range borders: the first is 0 and each following
# border lies 1..200 above its predecessor.
my @ranges = (0);
for my $step (1 .. 10000) {
    my $gap = 1 + int(rand(200));
    push @ranges, $ranges[-1] + $gap;
}

# Draw 1000001 random integers below the last border as test values.
my @tests;
for my $draw (0 .. 1000000) {
    push @tests, int(rand($ranges[-1]));
}

# Report, for every border, the test values that fall into its range.
match(\@ranges, \@tests);

# Partition test values around a border.
#
# Arguments: $border - numeric threshold
#            $tests  - array ref of numbers
# Returns:   two array refs: the values below $border and the values
#            greater than or equal to it, each in their original order.
sub div {
    my ($border, $tests) = @_;
    my (@below, @at_or_above);
    for my $value (@$tests) {
        if ($value < $border) {
            push @below, $value;
        }
        else {
            push @at_or_above, $value;
        }
    }
    return (\@below, \@at_or_above);
}

# Assign test values to ranges by recursive halving and print the result.
#
# Arguments: $ranges - array ref of ascending range borders
#            $tests  - array ref of values, all expected to belong to one of
#                      the given ranges
# Output:    for every border one report on STDOUT: either the values that
#            ended up in its range or a note that the range is empty.
sub match {
    my ($ranges, $tests) = @_;

    # Without any border there is nothing to report. This guard also stops
    # the recursion below from looping forever on an empty list.
    return unless @$ranges;

    if (@$ranges == 1) {
        if (@$tests) {
            print "tests in range $ranges->[0]:\n", join(", ", @$tests), "\n";
        }
        else {
            print "range $ranges->[0] is empty\n";
        }
    }
    else {
        # Split the borders in two halves and hand each half the tests that
        # lie below / at-or-above the first border of the upper half.
        my $pivot = int((@$ranges + 1)/ 2);
        my ($lt, $ge) = div($ranges->[$pivot], $tests);
        match([@{$ranges}[0..$pivot-1]], $lt);
        match([@{$ranges}[$pivot..$#$ranges]], $ge);
    }
}

Check overlaps with Perl

Jit — Wed, 15 Feb 2017 04:43:39 -0600

#!/usr/bin/perl
use strict;
use warnings;

#For normal overlaps 

# Bounds of the interval every number is checked against.
my ($lower, $upper) = (40, 100);

foreach my $num (17, 42, 99, 111) {
    # Median-of-three test: $num lies between the bounds exactly when it is
    # the middle element after sorting the three values numerically.
    my @ordered = sort { $a <=> $b } ($lower, $upper, $num);

    my $negation;
    if ($ordered[1] == $num) {
        $negation = "";
    }
    else {
        $negation = " not";
    }

    printf "$num is%s between $lower and $upper\n", $negation;
}


#For range to range overlaps

#!/usr/bin/perl
use strict;
use warnings;
use 5.010;

# Bounds of the reference interval.
my ($lower, $upper) = (40, 100);

for my $range ( [10,17],
                [30,71],
                [42,99],
                [83,120],
                [101,111] ) {

    # Sort the four end points numerically. The candidate lies inside
    # [$lower, $upper] exactly when the two middle values are the
    # candidate's own end points.
    my @middle = (sort {$a <=> $b} $lower, $upper, @$range)[1,2];

    # Element-wise numeric comparison instead of the smartmatch operator
    # (~~), which is experimental/deprecated and warns under "use warnings".
    my $is_within = $middle[0] == $range->[0] && $middle[1] == $range->[1];

    printf "[@$range] is%s within [$lower $upper]\n", $is_within ? "" : " not";
}

Calculate some statistics for a DNA alignment with Perl

Abhimanyu Singh — Thu, 02 Feb 2017 04:11:54 -0600

use Bio::AlignIO;
  use Bio::Align::DNAStatistics;

  # Print every entry of a result hash whose key does not contain "Seq",
  # one "key value" row per entry, keys in sorted order.
  sub print_stat_rows {
      my ($stat_for) = @_;
      for my $key (sort keys %$stat_for) {
          next if $key =~ /Seq/;
          printf("%-9s %.4f \n", $key, $stat_for->{$key});
      }
  }

  my $stats = Bio::Align::DNAStatistics->new();

  ## Jukes-Cantor distance matrix for the first alignment of an EMBOSS file ##
  my $alignin = Bio::AlignIO->new(-file   => 't/data/insulin.water',
                                  -format => 'emboss');
  my $aln = $alignin->next_aln;
  my $jcmatrix = $stats->distance(-method => 'Jukes-Cantor',
                                  -align  => $aln);
  print $jcmatrix->print_matrix;

  ## and for measurements of synonymous /nonsynonymous substitutions ##
  my $in = Bio::AlignIO->new(-file   => 't/data/nei_gojobori_test.aln',
                             -format => 'fasta');
  my $alnobj = $in->next_aln;

  # Compare the first two sequences of the alignment, addressed by their ids.
  my @display_ids = map { $_->display_id } $alnobj->each_seq;
  my ($seq1id, $seq2id) = @display_ids;
  my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
  my $pair = $results->[0];
  print "comparing ".$pair->{'Seq1'}." and ".$pair->{'Seq2'}."\n";
  print_stat_rows($pair);

  # Same statistics for every pair returned by calc_all_KaKs_pairs.
  my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
  foreach my $an (@$results2) {
      print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
      print_stat_rows($an);
      print "\n\n";
  }

  # NOTE(review): 1000 is presumably the number of bootstrap replicates --
  # confirm against the Bio::Align::DNAStatistics documentation.
  my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
  print_stat_rows($result3);

BASH script for SelfBLAST a genome

Jit — Mon, 30 Jan 2017 09:31:33 -0600

#!/bin/bash

#Self-BLAST a genome -- expects BLAST+ and samtools to be installed on your system
#Author: Jitendra Narayan
#USAGE: ./selfBlast.sh extract <sequence_name>
#USAGE: ./selfBlast.sh all

#Common settings
FASTAFILE=MergedContigs.fasta   # genome used to build the database and as query source
MYDB=myDB                       # name (prefix) of the BLAST database files
OUTFILE=seeRES                  # file receiving the tabular BLAST hits
THREAD=20                       # number of threads handed to blastn
SEQ=""                          # query FASTA, chosen below from the run mode

echo "User $USER provided $# arguments, Detail of the arguments: $*"

# Build the nucleotide database only when its .nhr file is not there yet.
if [ -f "$MYDB.nhr" ]
then
  echo "BLAST database for MergedContigs.fasta genome exists"
else
  echo "Thanks for testing this script $USER; Me creating creating blastDB named $MYDB for you";
  makeblastdb -in "$FASTAFILE" -parse_seqids -dbtype nucl -out "$MYDB"
fi

# "${1:-}" is quoted so a missing mode reaches the final else branch cleanly
# instead of making [ fail with "unary operator expected".
if [ "${1:-}" = "extract" ]
then
  if [ -z "${2:-}" ]
  then
    echo "Mode 'extract' needs a sequence name as second argument" >&2
    exit 1
  fi
  echo "Extracting the sequence $2 for you from $FASTAFILE -- MAKE SURE U HAVE ADDED CORRECT NAME"
  samtools faidx "$FASTAFILE"              # create the .fai index
  samtools faidx "$FASTAFILE" "$2" > "$2.fa"
  SEQ=$2.fa
elif [ "${1:-}" = "all" ]
then
  echo "You want entire sequence to blast"
  SEQ=$FASTAFILE
else
  echo "Something went wrong $USER - Contact jitendra"
  # No valid mode means no query file: stop here instead of running blastn
  # with an empty -query and reporting success afterwards.
  exit 1
fi

echo "Doing alignments -- BLASting";
blastn -task megablast -query "$SEQ" -db "$MYDB" -evalue 1e-5 -num_threads "$THREAD" -max_target_seqs 1 -outfmt '6 qseqid staxid qstart qend sseqid sstart send evalue length frames qcovs' -out "$OUTFILE" || {
  echo "blastn failed" >&2
  exit 1
}

echo "DONE successfully :)"

Calculate ATGC percentage in parallel with perl

Jit — Thu, 26 Jan 2017 10:18:53 -0600

#!/usr/bin/perl

use strict;
use Parallel::ForkManager;
use Bio::SeqIO;

#usage: perl testParallel.pl input.fasta

# Load every record of the FASTA file named on the command line into memory:
# display id => sequence string. checkATCG() reads from this hash.
my %sequences;
my $fasta_file = $ARGV[0];
die "usage: perl testParallel.pl input.fasta\n" unless defined $fasta_file;
my $seqio = Bio::SeqIO->new(-file => $fasta_file, -format => "fasta");
while (my $seqobj = $seqio->next_seq) {
    my $id  = $seqobj->display_id;    # there's your key
    my $seq = $seqobj->seq;           # and there's your value
    $sequences{$id} = $seq;
}

my $max_procs = 5;    # upper limit handed to Parallel::ForkManager
my @names = keys %sequences;

# Direct class-method call; the indirect-object form "new Class(...)" is
# ambiguous to the parser and disabled by modern feature bundles.
my $pm = Parallel::ForkManager->new($max_procs);

# The three callbacks below are deliberate no-ops; uncomment the print
# statements to trace the life cycle of the child processes.
$pm->run_on_finish(
    sub {
        my ($pid, $exit_code, $ident) = @_;
        #print "** $ident just got out of the pool with PID $pid and exit code: $exit_code\n";
    }
);

$pm->run_on_start(
    sub {
        my ($pid, $ident) = @_;
        #print "** $ident started, pid: $pid\n";
    }
);

$pm->run_on_wait(
    sub {
        #print "** Have to wait for one children ...\n"
    },
    0.5
);

NAMES:
foreach my $name (@names) {
    # start() returns the child's pid in the parent, which moves on to the
    # next id, and a false value in the child, which does the work.
    $pm->start($name) and next NAMES;
    checkATCG($name);
    # Exit status 0 signals success. The loop index used before is not a
    # meaningful status and wraps once it exceeds 255.
    $pm->finish(0);
}

print "Waiting for Children...\n";
$pm->wait_all_children;
print "Everybody is out of the pool!\n";


# Print base-composition statistics for one sequence.
#
# Argument: $name - key into the file-level %sequences hash.
# Output:   one line on STDOUT with the tab-separated fields
#           name, number of A/C/G/T bases, A+T count, G+C count and the
#           GC percentage (followed by ':').
#           Upper- and lower-case bases are counted; any other symbol
#           (N, gaps, ...) is left out of every figure.
sub checkATCG {
    my ($name) = @_;
    my $DNA = $sequences{$name};
    $DNA = '' unless defined $DNA;

    # tr/// with an empty replacement list only counts, it changes nothing.
    my $count_a = ($DNA =~ tr/Aa//);
    my $count_c = ($DNA =~ tr/Cc//);
    my $count_g = ($DNA =~ tr/Gg//);
    my $count_t = ($DNA =~ tr/Tt//);

    my $Total = $count_a + $count_c + $count_g + $count_t;
    # Composition is the number of G+C (A+T) bases, not the number of
    # "GC" ("AT") dinucleotides.
    my $GC = $count_g + $count_c;
    my $AT = $count_a + $count_t;
    # A sequence without any A/C/G/T must not divide by zero.
    my $GCper = $Total ? $GC / $Total * 100 : 0;
    print "$name\t$Total\t$AT\t$GC\t$GCper:\n";
}

Perl script to insert the DNA string in genome

Shruti Paniwala — Mon, 23 Jan 2017 10:04:55 -0600

#!/usr/bin/perl

use warnings;
use strict;
use Bio::SeqIO;
use Bio::Seq;

# Usage: perl <script> input.fasta output.fasta
# Copies every record of the input genome to the output file; the record
# whose id equals $chr gets $seqI inserted after its first $pos bases.
my ($file, $out) = @ARGV;    # input fasta file (genome file), output fasta file
die "Usage: $0 input.fasta output.fasta\n"
    unless defined $file && defined $out;

my $chr  = "test";    # insertion chromosome
my $pos  = 10;        # position of the insertion (number of bases kept in front of it)
my $seqI = "AAAA";    # sequence of the insertion

my $seq_in  = Bio::SeqIO->new( -format => 'fasta', -file => $file);
my $seq_out = Bio::SeqIO->new( -format => 'fasta', -file => ">".$out);
while ( my $seq = $seq_in->next_seq() ) {
    if ($seq->primary_id eq $chr) {
        my $bases  = $seq->seq;
        my $offset = $pos;
        if ($offset > length $bases) {
            # A position behind the end appends the insertion, as before,
            # but without the "substr outside of string" warning.
            warn "Position $pos is beyond the end of $chr; appending the insertion\n";
            $offset = length $bases;
        }
        # Four-argument substr with length 0 inserts without removing bases.
        substr($bases, $offset, 0, $seqI);
        my $seq_obj = Bio::Seq->new(
            -seq        => $bases,
            -display_id => $seq->primary_id,
            -desc       => $seq->desc,    # keep the description of the FASTA header
            -alphabet   => "dna",
        );
        $seq_out->write_seq($seq_obj);
    }
    else {
        $seq_out->write_seq($seq);
    }
}