BOL: All

Mapping with BWA-mem or BWA-sampe in one go with python script !

Jit — Thu, 14 Jun 2018 06:37:37 -0500

BAM files and mapping
BESST requires sorted and indexed BAM files as input. Any read aligner + samtools can be used to obtain such files. Read pairs need to be aligned in paired-read mode. BESST provides a script (https://github.com/ksahlin/BESST/blob/master/scripts/reads_to_ctg_map.py) for obtaining sorted and indexed BAM files with BWA-mem or BWA-sampe in one go. An example call for mapping with this script is

python reads_to_ctg_map.py /path/to/lib1_A.fq /path/to/lib1_B.fq /path/to/contigs.fasta --threads N
where N is an integer specifying how many threads BWA-mem should use. --nomem can be added to the above call to use BWA-sampe as the paired-read alignment pipeline.

https://github.com/ksahlin/BESST/blob/master/scripts/reads_to_ctg_map.py

Perl script to check fastq reads qualities !

Abhimanyu Singh — Tue, 12 Jun 2018 04:42:20 -0500

#!/usr/bin/env perl

use strict;
use warnings;

# readfq - read the next FASTA or FASTQ record from a filehandle.
#
# Arguments:
#   $fh  - filehandle to read from
#   $aux - reference to a state array that must be passed unchanged to every
#          call on the same handle: $aux->[0] caches a header (or '+') line
#          that was read ahead, $aux->[1] is set to 1 once input is exhausted.
#
# Returns ($name, $seq, $qual) for a FASTQ record, ($name, $seq) for a FASTA
# record (or a FASTQ record whose quality string is cut short by end of
# input), and an empty list when there are no more records.
sub readfq {
	my ($fh, $aux) = @_;
	# The read loops below assign to $_; keep the caller's copy intact.
	local $_;
	# An empty state array means "first call". (defined(@array) is a fatal
	# error on Perl 5.22+, and the state must be a flat two-element list.)
	@$aux = (undef, 0) if (!@$aux);
	return if ($aux->[1]);
	if (!defined($aux->[0])) {
		# No header cached from the previous call: scan forward to one.
		while (<$fh>) {
			chomp;
			if (substr($_, 0, 1) eq '>' || substr($_, 0, 1) eq '@') {
				$aux->[0] = $_;
				last;
			}
		}
		if (!defined($aux->[0])) {
			$aux->[1] = 1;
			return;
		}
	}
	# Parse the name from the cached header itself rather than from whatever
	# happens to be left in the global $_.
	my $name = ($aux->[0] =~ /^.(\S+)/) ? $1 : '';
	my $seq = '';
	my $c = '';	# first character of the line that ended the sequence
	$aux->[0] = undef;
	while (<$fh>) {
		chomp;
		$c = substr($_, 0, 1);
		last if ($c eq '>' || $c eq '@' || $c eq '+');
		$seq .= $_;
	}
	# Remember the line that stopped the loop (next header or '+'); it is
	# undef when the loop stopped at end of input.
	$aux->[0] = $_;
	$aux->[1] = 1 if (!defined($aux->[0]));
	return ($name, $seq) if ($c ne '+');
	my $qual = '';
	# Quality may span several lines; stop once it is as long as the sequence.
	while (<$fh>) {
		chomp;
		$qual .= $_;
		if (length($qual) >= length($seq)) {
			$aux->[0] = undef;
			return ($name, $seq, $qual);
		}
	}
	# Input ended before the quality string was complete.
	$aux->[1] = 1;
	return ($name, $seq);
}

# Driver: stream FASTA/FASTQ records from STDIN and, after every record,
# print the running totals as "records<TAB>sequence bases<TAB>quality chars".
my @aux = undef;	# parser state shared by successive readfq() calls
my ($n, $slen, $qlen) = (0, 0, 0);
while (my ($name, $seq, $qual) = readfq(\*STDIN, \@aux)) {
	$n++;
	$slen += length($seq);
	# FASTA records carry no quality string
	if ($qual) {
		$qlen += length($qual);
	}
	print join("\t", $n, $slen, $qlen), "\n";
}
#print join("\t", $n, $slen, $qlen), "\n";

__END__
 my @aux = undef; # this is for keeping intermediate data
  # Example: echo only the records whose sequence is 21 to 25 characters long,
  # written back out in four-line FASTQ layout.
  while (my ($name, $seq, $qual) = readfq(\*STDIN, \@aux)) { 
     if( (length($seq) >= 21) && (length($seq) <= 25) ) { 
         # the '@' must be escaped: "@$name" would dereference $name as an array
         print "\@$name\n";
         print "$seq\n"; 
         print "+\n";
         print "$qual\n";
     }
  }

Perl script to find palindromic regions in DNA sequences

Abhimanyu Singh — Tue, 12 Jun 2018 04:34:12 -0500

use strict;
use warnings;

# Palindrome matcher, meant to be embedded in a capturing group as ($pp):
# there "(?1)" recurses into that outer capture (group 1) and "\g{-1}" refers
# back to the (\w) just before it, so the pattern reads "a character, a nested
# palindrome, the same character again" or "at most one middle character".
my $pp = qr/(?: (\w) (?1) \g{-1} | \w? )/ix;
my $filename = $ARGV[0];
open(my $fh, '<:encoding(UTF-8)', $filename) or die "Could not open file '$filename' $!";

# Paragraph mode: each read returns one blank-line separated record.
local $/ = '';

while (my $record = <$fh>) {
    chomp $record;

    # First line is the header, the remaining lines are joined into one string.
    my ($header, @lines) = split /\n/, $record;
    my $data = join('', @lines);

    print "$header\n$data\n";

    # The lookahead lets a match start at every offset, so overlapping hits
    # are all visited.
    while ($data =~ /(?=($pp))/g) {
        my $hit   = $1;
        my $start = $-[0];
        my $len   = length $hit;

        # Only hits longer than 100 characters are reported.
        next unless $len > 100;

        my $end      = $start + $len;
        my $midPoint = $start + int(($len + 1) / 2);    # half the length, rounded up
        print "$start\t$midPoint\t$end\t$hit\t$len\n";
    }
}

__DATA__
>TRE|Q47404|Q47404 (409 AA) Glycosyl transferase [Escherichia coli]
MIFDASLKKLRKLFVNPIGFFRDSWFFNSKNKAEELLSPLKIKSKNIFIVAHLGQLKKAE
LFIQKFSRRSNFLIVLATKKNTEMPRLILEQMNKKLFSSYKLLFIPTEPNTFSLKKVIWF
YNVYKYIVLNSKAKDAYFMSYAQHYAIFIWLFKKNNIRCSLIEEGTGTYKTEKKKPLVNI
NFYSWIINSIILFHYPDLKFENVYGTFPNLLKEKFDAKKIFEFKTIPLVKSSTRMDNLIH

>seq1
TGAATTACTAGAAGTACTTAAAATGATGGTTGGAGGAAATATTCTTGATGATCAAATTGC
CGTTAAACTAGGATTTCTTATAAAGGAGGTTGGTAGTAAAATTCATGAAGATCATTAAGT

>TRE|Q8VRL9|Q8VRL9 (492 AA) SiaD [Neisseria meningitidis]
MLQKIRKALFHPKKFFQDSQWFATPLFSSFAPKSNLFIISTFAQLNQAHSLTKMQKLKNN
LLVILYTTQNMKMPKLIQKSVDKELFSVTYMFELPRKPGIVSPKKFLYIQRGYKKLLKTI
QPAHLYVMSFAGHYSSLLSLAKKMNITTHLVEEGTATYAPLLESFTYKPTKFEQRFVGNN
LHQKGYFDKFDILHVAFPEYAKKIFNANEYHRFFAHSGGISTSQSIAKIQDKYRISQNDY

Perl script to find coding regions in DNA sequences

Shruti Paniwala — Mon, 11 Jun 2018 08:03:03 -0500

#!/usr/bin/perl -w

use strict;


# dnaloglkh.pl: print the log-likelihood ratio (coding model vs. uniform
# codon model) of a DNA sequence, read in frame from its first base.

# if the number of input arguments is lower than 2
# return a message showing the error

if (scalar(@ARGV) < 2) {
  print "dnaloglkh.pl codontable DNAsequence\n";
  exit(1);
}

# create two vars contaning the filenames

my $filecodontable = $ARGV[0];
my $filesequence = $ARGV[1];


# open the first file: table of codon usage frequencies
# (three-argument open with a lexical handle, so the file name can never be
# interpreted as an open mode)

my $propcodons_fh;
if (!open($propcodons_fh, '<', $filecodontable)) {
  print "dnaloglkh.pl: the file $filecodontable can not be opened\n";
  exit(1);
}


# load the frequencies of codon usage into the hash table %pcodons
# this hash is indexed by a triplet of nucleotides or codon

my %pcodons;

# for each line in the file (tableofcodons) do

while (my $line = <$propcodons_fh>) {

	# extract the two values or columns with a regular expression
	# A group of letters and a decimal number

	my ($codon,$freq) = ($line =~ m/(\w+) (\d+\.\d+)/);

	# lines without a "codon frequency" pair (blank lines, headers) are skipped
	next if (!defined($codon));

	# load the hash table
	$pcodons{$codon}=$freq;
}

close($propcodons_fh);

##########################################################
# open the DNA sequence (second file)

my $sequence_fh;
if (!open($sequence_fh, '<', $filesequence)) {
	print "dnaloglkh.pl: the file $filesequence can not be opened\n";
	exit(1);
}


# FASTA format: 
# >header containing a description of the sequence
# AGACTGACTTAC
# AGACTGACTTAC
# AGACTGAC

my $iden = <$sequence_fh>;  # reading the header of the sequence
my @seql = <$sequence_fh>;  # load the complete sequence into an array

close($sequence_fh);

# concat every segment (line) in the array into the var $seq,
# dropping line endings and any other whitespace, then upper-case it

my $seq = join("", @seql);
$seq =~ s/\s+//g;
$seq = uc($seq);

# we are going to use the array @codons to store all of the codons in
# the input DNA sequence. To split the sequence in segments of length
# three, a regular expression will be used searching for all of the
# possible groups of three symbols (option g)

my @codons = ($seq =~ m/.../g);

# the likelihood ratio is for a current triplet, the ratio between 
# the probability computed from the table of codon frequencies divided
# by the random probability (uniform), that is, 1/64. After this,
# the log must be applied to obtain the log-likelihood ratio.
# It is recommended the use of logs to manage calculations involving
# small numbers between 0 and 1. Log of the product is equal to the
# the sum of logs and the log of the division is the substraction of logs

my $r = 0.0;

foreach my $codon (@codons) {
  my $p = $pcodons{$codon};

  # log() dies on a missing or zero frequency; report which codon is at fault
  if (!defined($p) || $p <= 0) {
    print "dnaloglkh.pl: no usable frequency for codon $codon in $filecodontable\n";
    exit(1);
  }

  $r = $r + log($p) - log(1/64);
}

printf "%.2f\n",$r;


__END__

To test the program, save these two files in your current working directory: exon.fa and intron.fa. The files contain the sequence of a real exon and a real intron, respectively. Extremely different values of coding potential should be obtained for the two sequences: high (positive) values for protein-coding sequences and low values (zero or even negative) for intronic regions. 

 exon.fa 

>HUMHBB.2ex
GCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACT
CCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCT
TTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCT
GCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGG


 intron.fa.

>HUMHBB.2pin
ATAACAATTGTTTTCTTTTGTTTAATTCTTGCTTTCTTTTTTTTTCTTCTCCGCAATTTTT
ACTATTATACTTAATGCCTTAACATTGTGTATAACAAAAGGAAATATCTCTGAGATACATT
AAGTAACTTAAAAAAAAACTTTACACAGTCTGCCTAGTACATTACTATTTGGAATATATGT
GTGCTTATTTGCATATTCATAATCTCCCTACTTTATTTTC

Biological Sequence handling with Perl !

Rahul Nayak — Wed, 16 May 2018 08:18:12 -0500

package Sequence::Generic;
# File: Sequence/Generic.pm
#
# Abstract base class for sequence objects. Child classes supply new() and
# seq(); this class derives length, reversal, stringification and
# concatenation from them and wires those up to the "", unary minus and "."
# operators.

use strict;
use warnings;
use Carp;
use Scalar::Util ();
use overload 
  '""'        => 'asString',
  'neg'       => 'reverse',
  '.'         => 'concatenate',
  'fallback'  => 1;

# These methods should be overriden by child classes
# class constructor
sub new {
    my $class = shift;
    croak "$class must override the new() method";
}
# Return the sequence as a string
sub seq {
    my $self = shift;
    croak ref($self)," must override the seq() method";
}
# Return the type of the sequence as a human readable string
sub type {
    return 'Generic Sequence';
}
# These methods probably don't have to be overridden
# The length of the sequence
sub length {
    my $self = shift;
    # CORE:: makes it explicit that the builtin is meant, not this method
    return CORE::length($self->seq);
}
# The reverse of the sequence (returned as a plain string)
sub reverse {
    my $self = shift;
    # scalar context: reverse the characters of the string
    my $reversed = CORE::reverse(scalar $self->seq);
    return $reversed;
}
# A human-readable description of the object
sub asString {
  my $self = shift;
  return $self->type . '(' . $self->length . ' residues)';
}
# Concatenate two sequences together and return the result.
# $new_seq is either a plain string or an object derived from this class;
# a true $prepend (set by overload when the operands were swapped) puts it
# in front of this sequence instead of behind it. The result is built with
# $self->new, so it is whatever the child class constructor returns.
sub concatenate {
  my $self = shift;
  my ($new_seq,$prepend) = @_;
  my ($to_append);
  if (ref($new_seq)) {
      # blessed() first: calling ->isa on an unblessed reference would die
      # with an unrelated error instead of the message below
      croak "argument to concatenate must be a string or a Sequence object"
      unless Scalar::Util::blessed($new_seq) && $new_seq->isa(__PACKAGE__);
      $to_append = $new_seq->seq ;
  } else {
      $to_append = $new_seq;
  }
  return $self->new($prepend ? $to_append . $self->seq 
                     : $self->seq . $to_append);
}
1;

Back to Article

Listing Two
 package Sequence::Nucleotide;
# file: Sequence/Nucleotide.pm
#
# DNA/RNA sequence class built on Sequence::Generic. The sequence is stored
# upper-cased with T in place of U; seq() converts back to U for RNA objects.

use Sequence::Generic;
use Sequence::Nucleotide::Subsequence;
use Sequence::Alignment;
use Carp;

use strict;
# Inherit length, stringification and concatenation from the generic class.
our @ISA = ('Sequence::Generic');

# RNA codon => one-letter amino acid ('*' marks a stop codon)
my %CODON_TABLE = (
           UCA => 'S',UCG => 'S',UCC => 'S',UCU => 'S',
           UUU => 'F',UUC => 'F',UUA => 'L',UUG => 'L',
           UAU => 'Y',UAC => 'Y',UAA => '*',UAG => '*',
           UGU => 'C',UGC => 'C',UGA => '*',UGG => 'W',
           CUA => 'L',CUG => 'L',CUC => 'L',CUU => 'L',
           CCA => 'P',CCG => 'P',CCC => 'P',CCU => 'P',
           CAU => 'H',CAC => 'H',CAA => 'Q',CAG => 'Q',
           CGA => 'R',CGG => 'R',CGC => 'R',CGU => 'R',
           AUU => 'I',AUC => 'I',AUA => 'I',AUG => 'M',
           ACA => 'T',ACG => 'T',ACC => 'T',ACU => 'T',
           AAU => 'N',AAC => 'N',AAA => 'K',AAG => 'K',
           AGU => 'S',AGC => 'S',AGA => 'R',AGG => 'R',
           GUA => 'V',GUG => 'V',GUC => 'V',GUU => 'V',
           GCA => 'A',GCG => 'A',GCC => 'A',GCU => 'A',
           GAU => 'D',GAC => 'D',GAA => 'E',GAG => 'E',
           GGA => 'G',GGG => 'G',GGC => 'G',GGU => 'G',
          );
# complement() and reversec() are aliases of reverse().
# NOTE(review): presumably the Subsequence class complements the bases when
# start > end - confirm against Sequence::Nucleotide::Subsequence.
*complement = *reversec = \&reverse;

# Constructor. $sequence is either a string of nucleotides (g, a, c, t, n, u
# and whitespace, any case) or an object with a seq() method, whose fields are
# then copied. $type ('DNA' or 'RNA') is optional; it defaults to 'RNA' when
# the string contains a U and to 'DNA' otherwise. May also be called on an
# existing object, in which case that object's class is used.
sub new {
  my $class = shift;
  $class = ref($class) if ref($class);
  my ($sequence,$type) = @_;

  my $self = bless {},$class;
  if (ref($sequence)) {
    croak "Can't initialize sequence from non-Sequence object.\n"
      unless $sequence->can('seq');
    %{$self} = %{$sequence};  # clone operation
  } else {
    croak "Doesn't look like sequence data" 
      unless $sequence=~/^[gactnu\s]+$/i;
    $self->{'data'} = $self->_canonicalize($sequence);
    $self->{'type'} = $type || ($sequence=~/u/i ? 'RNA' : 'DNA');
  }
  return $self;
}
# Get the sequence as a string (U instead of T for RNA); with an argument,
# replace the stored sequence first.
sub seq {
    my $self = shift;
    $self->{'data'} = $self->_canonicalize($_[0])  if defined($_[0]);
    my $seq = $self->{'data'};
    return $seq unless $self->is_RNA;
    $seq=~tr/T/U/;
    return $seq;
}
# Get the sequence type; with an argument, set it and return the new value.
sub type {
    my $self = shift;
    return defined($_[0]) ? $self->{'type'} = $_[0] : $self->{'type'};
}
sub is_DNA {
    my $self = shift;
    return $self->type eq 'DNA';
}
sub is_RNA {
  my $self = shift;
  return $self->type eq 'RNA';
}
# Return a Subsequence object for positions $start..$end of this sequence.
sub subseq {
  my $self = shift;
  my ($start,$end) = @_;
  return (__PACKAGE__ . '::Subsequence')->new($self,$start,$end);
}
# Return a Subsequence object running from the last position back to 1.
sub reverse {
  my $self = shift;
  return (__PACKAGE__ . '::Subsequence')->new($self,$self->length,1);
}
# Translate in reading frame $frame (1..3, or -1..-3 to start from the
# reversed sequence; default 1) and return the protein as a string.
# Codons missing from the table become 'X'.
sub translate {
  my $self = shift;
  my $frame = shift() || 1;
  my $l = $self->length;
  # trim the end so that the translated span is a whole number of codons
  my $seq = $frame > 0 ? $self->subseq($frame,$l-($l-$frame+1)%3)
              : $self->reverse->subseq(abs($frame),$l-($l-abs($frame)+1)%3);
  my $s = $seq->seq;
  $s=~tr/T/U/;  # put it in RNA mode
  $s =~ s/(\S{3})/$CODON_TABLE{$1} || 'X'/eg;
  return $s;
}
# Find the longest stop-free stretch over all six frames and return its
# (start, end) nucleotide coordinates.
sub longest_orf {
    my $self = shift;

    # explicit starting values instead of relying on undef acting as ''/0
    my ($max,$pos,$frame) = ('',0,0);
    foreach (-3..-1,1..3) {
    my $translation = $self->translate($_);
    while ($translation=~/([^*]+)/g) {
        if (length($1) > length($max)) {
        $max = $1;
        $frame = $_;
        $pos = pos($translation) - length($max); 
        }
    }
    }
    # convert the residue offset into a nucleotide position
    $pos *= 3;
    $pos += abs($frame);
    return ($pos,$pos+3*length($max)-1) if $frame > 0;
    # NOTE(review): the reverse-strand coordinates have no "-1"/"+1"
    # adjustment matching the forward case - confirm the intended convention.
    return ($self->length-$pos,$self->length-$pos-3*length($max));
}
# Align $seq (a string or an object with a seq() method) against this
# sequence; returns the Sequence::Alignment object.
sub align {
    my $self = shift;
    my $seq = shift;
    $seq = $seq->seq if ref($seq);
    return Sequence::Alignment->new(src=>$seq,target=>$self->seq);
}
# Internal storage form: U -> T, everything except g/a/t/c/n removed,
# upper-cased.
sub _canonicalize {
  my $self = shift;
  my $seq = shift;
  $seq =~ tr/uU/tT/;
  $seq =~ s/[^gatcn]//ig;
  return uc($seq);
}
1;

Bash script to convert SAM to BAM visualization ready

Jit — Mon, 14 May 2018 07:31:10 -0500

# Convert SAM to BAM and coordinate-sort it. The output is named with -o:
# the legacy "samtools sort - file_sorted" prefix form was removed in samtools 1.3.
samtools view -bS file.sam | samtools sort -o file_sorted.bam -

# Index the sorted BAM produced above (same file name as the sort output)
samtools index file_sorted.bam file_sorted.bai

Coverage / Depth of reads !

Neel — Tue, 17 Apr 2018 14:18:44 -0500

# Count the mpileup output lines whose 4th column is at least
# MIN_COVERAGE_DEPTH, i.e. the number of bases covered at that depth or higher
samtools mpileup mapping_result_sorted.bam \
    | awk -v X="${MIN_COVERAGE_DEPTH}" '$4>=X' \
    | wc -l
32876

# get length of reference genome: sum column 3 of the tab-separated summary.
# The separator is given with -F so it is already in effect for the first
# record (assigning FS inside the main action only applies from the next line
# on); %.0f keeps totals above 2^31 from being printed in exponent form.
bowtie2-inspect -s refgenome | awk -F'\t' '{ L += $3 } END { printf "%.0f\n", L }'
45678

Genome Covered !

Neel — Tue, 17 Apr 2018 14:13:05 -0500

# Percentage of the genome covered by at least one read.
# bedtools genomecov -bga reports intervals as "chrom start end depth"; the
# interval lengths are summed separately for depth 0 and depth > 0 in a single
# pass, and the totals are printed once at the end (0 when nothing matched).
# NOTE(review): -g usually takes a chromosome-sizes file and may be
# unnecessary together with -ibam - confirm against the bedtools docs.
read -r zero nonzero < <(
    bedtools genomecov -ibam BAM -g hg38.fasta -bga |
    awk '$4 == 0 { z += $3 - $2 }
         $4 >  0 { nz += $3 - $2 }
         END     { printf "%.0f %.0f\n", z, nz }'
)

# Avoid a bc "divide by zero" error when no interval was reported at all
if [ "$zero" -eq 0 ] && [ "$nonzero" -eq 0 ]; then
    percent=0
else
    percent=$(bc <<< "scale=6; ($nonzero / ($zero + $nonzero))*100")
fi

echo "$percent"

Perl script to read multi fasta sequence one by one

Jit — Fri, 06 Apr 2018 09:01:17 -0500

#!/usr/bin/env perl

use strict;
use warnings;

#USAGE
#perl rohanRun.pl seq.fa
die "USAGE: perl rohanRun.pl seq.fa\n" unless @ARGV;

my $outfile='tmp.fa';
my $fastaSeq_ref = readfasta ($ARGV[0]);

# Handle the sequences one at a time: each pass rewrites $outfile so that it
# holds exactly one record (header line, then the sequence on one line).
# Keys are sorted so the processing order is reproducible.
foreach my $key (sort keys %$fastaSeq_ref) {
	open (my $out, '>', $outfile) or die "couldn't open the file $outfile $!";
	print {$out} "$key\n$fastaSeq_ref->{$key}\n";
	# close now so the record is flushed to disk before anything reads it
	close ($out) or die "couldn't close the file $outfile $!";

}


# readfasta - load a (multi-)FASTA file into memory.
#
# Argument: $file - path of the FASTA file.
# Returns a reference to a hash that maps each full header line (including
# the leading '>') to its sequence with all whitespace removed. A header
# that is not followed by any sequence line gets no entry. When a header
# occurs more than once a caution is printed and the later record wins.
# Dies if the file cannot be opened.
sub readfasta 
{
	my ($file) = @_;
	my %sequence;
	my $header;
	my $temp_seq = '';
	
	#suppose fasta files contains multiple sequences;
	 
	open (my $in, '<', $file) or die "couldn't open the file $file $!";
	
	while (my $line = <$in>)
	{	
		# strip the line ending only (chop would eat the last base of a
		# final line that has no trailing newline)
		$line =~ s/[\r\n]+\z//;
		next if $line =~ /^\s*$/; #skip empty line 
		if ($line =~ /^>/)  #when see head line
		{	
			$header = $line;
			if (exists $sequence{$header})
			{
				# loaded on demand: colored() lives in Term::ANSIColor
				require Term::ANSIColor;
				print Term::ANSIColor::colored("#CAUTION: SAME FASTA HAS BEEN READ MULTIPLE TIMES.\n#CAUTION: PLEASE CHECK FASTA SEQUENCE:$header\n","red");
			}
			$temp_seq = ''; # start collecting a new sequence
			
		}
		else # when see the sequence line 
		{
		   next if !defined $header; # sequence data before the first header has no key
		   $line =~ s/\s+//g;
		   $temp_seq .= $line;
		   $sequence{$header} = $temp_seq; #update the contents
		}
	
	}
	close ($in);
	
	return \%sequence;
}

Perl script to count the number of files in a directory with regex

Jit — Wed, 21 Mar 2018 05:13:53 -0500

#!/usr/bin/perl
use strict;
use warnings;
# Sample tags expected to occur in the trace file names.
my @allNames=("_D14_","_B14_","_B15_","_C1T1_","_C3T3_","_D12_","_D13_","_E1B1_","_E1B3_","_H001_","_H3-03_","_H3-04_","_HB0_","_C28_","_C33_","_C31_","_C1T2_","_D11_","_E11_","_E31_","_B11_","_B33_","_B3B1_","_C210_","_C211_","_C21_","_C24_","_C27_","_E2B2_","_H004_","_H4-28_","_Hprim34_","_Hprim53_","_HPRIM1_","_HPRIM21_","_HPRIM22_","_HPRIM36_","_A110_","_A12_","_A11_","_A111_","_A112_","_A13_","_A14_","_A15_","_A18_","_A19_","_C29_","_Hprim18_","_D22_","_D23_","_D21_","_A16_","_A17_","_B24_","_A3B1_","_C2B4_","_B22_","_H3-14_","_Hprim54_","_Hprim12_","_H4-02_","_H4-04_","_HPRIM15_","_HPRIM19_","_B39_","_C3B1_","_C3T1_","_HPRIM16_","_A21_","_A22_","_A32_","_A33_","_A24_","_A25_","_A23_","_B34_","_B38_","_E3B2_","_E3T1_","_HPRIM14_","_H158_");

# Trace files in the current directory, with either extension case.
my @files = glob("*.scf *.SCF");

# For every tag, count the files whose lower-cased name matches the
# lower-cased tag. The count is always printed; the tag is printed as well
# whenever the count is anything other than 2.
for my $nName (map { lc } @allNames) {
    my $cnt = grep { lc($_) =~ /$nName/ } @files;
    print "$cnt\n";
    print "$nName\n" if $cnt != 2;
}