BOL: Owner

Biological Sequence Handling with Perl

Rahul Nayak — Wed, 16 May 2018 08:18:12 -0500

package Sequence::Generic;
# File: Sequence/Generic.pm
#
# Abstract base class for sequence objects.  Child classes supply new() and
# seq(); this class builds length(), reverse(), asString() and concatenate()
# on top of them and wires them to the '""', unary '-' and '.' operators.

use strict;
use warnings;
use Carp;
use Scalar::Util ();
use overload
  '""'       => 'asString',
  'neg'      => 'reverse',
  '.'        => 'concatenate',
  'fallback' => 1;

# These methods should be overridden by child classes

# Class constructor.  Always croaks here: a child class must provide it.
sub new {
    my $class = shift;
    croak "$class must override the new() method";
}

# Return the sequence as a string.  Always croaks here: a child class must
# provide it.
sub seq {
    my $self = shift;
    croak ref($self), " must override the seq() method";
}

# Return the type of the sequence as a human readable string
sub type {
    return 'Generic Sequence';
}

# These methods probably don't have to be overridden

# The length of the sequence, i.e. the length of the string seq() returns.
# CORE:: is spelled out because this package defines its own length().
sub length {
    my $self = shift;
    return CORE::length($self->seq);
}

# The reverse of the sequence, returned as a plain string (not an object).
# CORE:: is spelled out because this package defines its own reverse().
sub reverse {
    my $self = shift;
    return scalar CORE::reverse($self->seq);
}

# A human-readable description of the object: "<type>(<length> residues)"
sub asString {
    my $self = shift;
    return $self->type . '(' . $self->length . ' residues)';
}

# Concatenate two sequences together and return the result.
#
#   $new_seq  - a plain string, or an object derived from this package
#   $prepend  - true to put $new_seq in front of this sequence (the overload
#               machinery passes this flag when the operands were swapped)
#
# Returns a new object built with $self->new().  Croaks when $new_seq is a
# reference that is not an object of this class hierarchy.
sub concatenate {
    my ($self, $new_seq, $prepend) = @_;
    my $to_append;
    if (ref $new_seq) {
        # blessed() first: calling ->isa on an unblessed reference (array
        # ref, hash ref, ...) would die with Perl's own "Can't call method"
        # error instead of the message below.
        croak "argument to concatenate must be a string or a Sequence object"
            unless Scalar::Util::blessed($new_seq)
                && $new_seq->isa(__PACKAGE__);
        $to_append = $new_seq->seq;
    }
    else {
        $to_append = $new_seq;
    }
    return $self->new(
        $prepend ? $to_append . $self->seq
                 : $self->seq . $to_append
    );
}
1;

Back to Article

Listing Two
 package Sequence::Nucleotide;
# file: Sequence/Nucleotide.pm
#
# Nucleotide (DNA/RNA) sequence class built on Sequence::Generic.

use Sequence::Generic;
use Sequence::Nucleotide::Subsequence;
use Sequence::Alignment;
use Carp;

use strict;
use vars '@ISA';
# Inherit from the generic base class.  This assignment was truncated to
# ":Generic';" in the published listing, which is a syntax error.
@ISA = 'Sequence::Generic';

# Maps each RNA codon (U rather than T) to a one-letter amino-acid code;
# the three codons UAA, UAG and UGA map to '*'.
my %CODON_TABLE = (
           UCA => 'S',UCG => 'S',UCC => 'S',UCU => 'S',
           UUU => 'F',UUC => 'F',UUA => 'L',UUG => 'L',
           UAU => 'Y',UAC => 'Y',UAA => '*',UAG => '*',
           UGU => 'C',UGC => 'C',UGA => '*',UGG => 'W',
           CUA => 'L',CUG => 'L',CUC => 'L',CUU => 'L',
           CCA => 'P',CCG => 'P',CCC => 'P',CCU => 'P',
           CAU => 'H',CAC => 'H',CAA => 'Q',CAG => 'Q',
           CGA => 'R',CGG => 'R',CGC => 'R',CGU => 'R',
           AUU => 'I',AUC => 'I',AUA => 'I',AUG => 'M',
           ACA => 'T',ACG => 'T',ACC => 'T',ACU => 'T',
           AAU => 'N',AAC => 'N',AAA => 'K',AAG => 'K',
           AGU => 'S',AGC => 'S',AGA => 'R',AGG => 'R',
           GUA => 'V',GUG => 'V',GUC => 'V',GUU => 'V',
           GCA => 'A',GCG => 'A',GCC => 'A',GCU => 'A',
           GAU => 'D',GAC => 'D',GAA => 'E',GAG => 'E',
           GGA => 'G',GGG => 'G',GGC => 'G',GGU => 'G',
          );

# complement() and reversec() are aliases for this package's reverse()
*complement = *reversec = \&reverse;

# Constructor.
#
#   $sequence - either a string of nucleotide letters (g, a, c, t, n, u and
#               whitespace, any case) or an existing object that has a seq()
#               method, in which case its hash contents are copied
#   $type     - optional 'DNA' / 'RNA'; when omitted the type is 'RNA' if the
#               string contains a U and 'DNA' otherwise
#
# May be called on a class name or on an existing object.  Croaks when the
# argument is a reference without a seq() method, or a string that contains
# anything other than the letters listed above.
sub new {
    my ($class, $sequence, $type) = @_;
    $class = ref($class) if ref($class);

    my $self = bless {}, $class;
    if (ref $sequence) {
        # The eval catches the fatal error Perl raises for a method call on
        # an unblessed reference, so such input gets the croak below too.
        croak "Can't initialize sequence from non-Sequence object.\n"
            unless eval { $sequence->can('seq') };
        %{$self} = %{$sequence};    # clone operation (top-level copy only)
    }
    else {
        croak "Doesn't look like sequence data"
            unless defined $sequence && $sequence =~ /^[gactnu\s]+$/i;
        $self->{'data'} = $self->_canonicalize($sequence);
        $self->{'type'} = $type || ($sequence =~ /u/i ? 'RNA' : 'DNA');
    }
    return $self;
}
# Get (and optionally set) the sequence string.
# When a defined argument is given it is canonicalized and stored first.
# The stored data is returned as-is, except that for RNA objects every T is
# shown as U in the returned copy (the stored data is not modified).
sub seq {
    my ($self, @args) = @_;
    if (defined $args[0]) {
        $self->{'data'} = $self->_canonicalize($args[0]);
    }
    my $residues = $self->{'data'};
    $residues =~ tr/T/U/ if $self->is_RNA;
    return $residues;
}
# Get (and optionally set) the stored type string.
# A defined argument replaces the stored value; the current value is returned
# either way.
sub type {
    my ($self, $new_type) = @_;
    $self->{'type'} = $new_type if defined $new_type;
    return $self->{'type'};
}
# True when the object's type() is exactly the string 'DNA'.
sub is_DNA {
    my ($self) = @_;
    my $kind = $self->type;
    return $kind eq 'DNA';
}
# True when the object's type() is exactly the string 'RNA'.
sub is_RNA {
    my ($self) = @_;
    my $kind = $self->type;
    return $kind eq 'RNA';
}
# Build a Subsequence object over this sequence.
# $start and $end are handed unchanged to the constructor of the
# '<this package>::Subsequence' class, together with the parent object.
sub subseq {
    my ($self, $start, $end) = @_;
    my $subseq_class = join '::', __PACKAGE__, 'Subsequence';
    return $subseq_class->new($self, $start, $end);
}
# Return a Subsequence object that runs from the last position of this
# sequence down to position 1.
# NOTE(review): presumably Subsequence treats start > end as the reverse
# orientation -- confirm against Sequence::Nucleotide::Subsequence.
sub reverse {
    my ($self) = @_;
    my $subseq_class = __PACKAGE__ . '::Subsequence';
    return $subseq_class->new($self, $self->length, 1);
}
# Translate the sequence into a string of one-letter amino-acid codes.
#
#   $frame - reading frame; defaults to 1 when omitted or false.  A positive
#            frame reads this sequence starting at that position; any other
#            value reads $self->reverse starting at abs($frame).
#
# The end coordinate is trimmed so the translated region is a whole number
# of codons.  Triplets missing from %CODON_TABLE are emitted as 'X'.
sub translate {
    my ($self, $frame) = @_;
    $frame ||= 1;
    my $total = $self->length;

    my $region;
    if ($frame > 0) {
        my $last = $total - ($total - $frame + 1) % 3;
        $region = $self->subseq($frame, $last);
    }
    else {
        my $offset = abs($frame);
        my $last   = $total - ($total - $offset + 1) % 3;
        $region = $self->reverse->subseq($offset, $last);
    }

    my $codons = $region->seq;
    $codons =~ tr/T/U/;    # the codon table is keyed by RNA triplets
    $codons =~ s/(\S{3})/$CODON_TABLE{$1} || 'X'/eg;
    return $codons;
}
# Find the longest stretch without a '*' in any of the six translations
# (frames -3..-1 and 1..3) and return its coordinates as a two-element list.
#
# Forward frames return (start, end) with start < end; reverse frames return
# the pair counted back from $self->length.  When no stretch is found at all
# both values equal $self->length, as before.
#
# NOTE(review): the forward span covers 3 * residues positions, whereas the
# reverse-strand pair spans 3 * residues + 1 -- confirm the intended
# coordinate convention against the Subsequence class before relying on it.
sub longest_orf {
    my $self = shift;

    # Start from defined values so the length and arithmetic below never
    # operate on undef.
    my ($max, $pos, $frame) = ('', 0, 0);
    for my $candidate_frame (-3 .. -1, 1 .. 3) {
        my $translation = $self->translate($candidate_frame);
        next unless defined $translation;
        while ($translation =~ /([^*]+)/g) {
            my $orf = $1;
            next unless length($orf) > length($max);
            $max   = $orf;
            $frame = $candidate_frame;
            # pos() is the offset just past the match; step back to its start
            $pos   = pos($translation) - length($max);
        }
    }
    $pos *= 3;               # residues -> nucleotides
    $pos += abs($frame);     # shift by the reading-frame offset
    return ($pos, $pos + 3 * length($max) - 1) if $frame > 0;
    return ($self->length - $pos, $self->length - $pos - 3 * length($max));
}
# Align another sequence against this one.
#
#   $seq - a plain string, or a reference whose seq() method supplies the
#          string
#
# Returns the object built by Sequence::Alignment->new, with the other
# sequence as 'src' and this object's seq() as 'target'.
sub align {
    my ($self, $seq) = @_;
    $seq = $seq->seq if ref($seq);
    # Direct method-call syntax: the indirect form "new Class(...)" is
    # parsed heuristically and this package defines its own new().
    return Sequence::Alignment->new(src => $seq, target => $self->seq);
}
# Normalize raw sequence text for storage: U/u become T/t, every character
# other than g, a, t, c, n (either case) is dropped, and the result is
# upper-cased.
sub _canonicalize {
    my ($self, $residues) = @_;
    $residues =~ tr/uU/tT/;
    $residues =~ tr/gatcnGATCN//cd;    # delete everything outside the set
    return uc $residues;
}
1;

Estimate Genome Size with Jellyfish and R

Rahul Nayak — Mon, 12 Mar 2018 10:11:19 -0500

# Count 19-mers in both read files.  The '?' is quoted so the shell cannot
# treat it as a filename wildcard.
jellyfish count -t 8 -C -m 19 -s 5G -o 19mer_out --min-qual-char='?' /common/Tutorial/Genome_estimation/sample_read_1.fastq /common/Tutorial/Genome_estimation/sample_read_2.fastq

#-t    --threads=uint32     Number of threads to be used in the run. eg: 1,2,3,..etc.
#-C    --canonical          Count both strands
#-m    --mer-len=uint32     Length of the k-mer
#-s    --size=uint64        Hash size / memory allocation
#-o    --output=string      Output file name
#--min-qual-char            Base quality value. Version 2.2.3 of Jellyfish uses the "Phred" score, where "?" = 30

# Write the k-mer count histogram from the counts computed above
jellyfish histo -o 19mer_out.histo 19mer_out

#Plot
dataframe19 <- read.table("19mer_out.histo") #load the data into dataframe19
plot(dataframe19[1:200,], type="l") #plots the data points 1 through 200 in the dataframe19 using a line

#same plot without the first row
plot(dataframe19[2:200,], type="l")

plot(dataframe19[2:100,], type="l") #plot line graph
points(dataframe19[2:100,]) #plot the data points from 2 through 100

#Sum of column 1 * column 2 over every row except the first.
#nrow() replaces the hard-coded 9325 so histograms of any length work.
last_row <- nrow(dataframe19)
sum(as.numeric(dataframe19[2:last_row,1]*dataframe19[2:last_row,2]))

#Inspect rows 10-20 to locate the peak (the original indexed `data`,
#which is a base R function, not the histogram)
dataframe19[10:20,]

#Divide the total by the peak position (12 here)
sum(as.numeric(dataframe19[2:last_row,1]*dataframe19[2:last_row,2]))/12

#Returns roughly 305 Mb for the tutorial data

Plot a dotplot with LAST

Rahul Nayak — Tue, 06 Mar 2018 09:21:18 -0600

# generate dotplot
# Build the LAST database.  lastdb takes the output name first and the FASTA
# file(s) second; with only one argument it reads sequences from stdin.
lastdb test/ref.fa test/ref.fa
# Align each contig set to the reference and draw one dotplot per set
lastal -f TAB test/ref.fa test/contigs.reduced.pacbio.fa | last-dotplot - test/contigs.reduced.pacbio.fa.ref.png
lastal -f TAB test/ref.fa test/contigs.reduced.nanopore.fa | last-dotplot - test/contigs.reduced.nanopore.fa.ref.png

Download genomes in batch from NCBI

Rahul Nayak — Fri, 23 Feb 2018 08:52:03 -0600

# Build a list of genomic FASTA URLs for every GenBank bacterial assembly.
#  1. fetch the assembly summary table
#  2. print column 20 of every non-comment line (-F sets the tab separator
#     before the first record is split; assigning FS inside the main action
#     only takes effect from the second record on)
#  3. append "<assembly dir>/<assembly name>_genomic.fna.gz" to each path
# The resulting URLs are written to ./genomic_file, one per line.
curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt' \
  | awk -F '\t' '!/^#/ {print $20}' \
  | sed -r 's|(ftp://ftp.ncbi.nlm.nih.gov/genomes/all/)(GCA/)([0-9]{3}/)([0-9]{3}/)([0-9]{3}/)(GCA_.+)|\1\2\3\4\5\6/\6_genomic.fna.gz|' \
  > genomic_file

Estimate Genome Size

Rahul Nayak — Thu, 22 Feb 2018 03:28:26 -0600

# Count k-mer occurrence using Jellyfish 2.2.6
# ('?' is quoted so the shell cannot expand it as a filename wildcard)
jellyfish count -t 8 -C -m 19 -s 5G -o 19mer_out --min-qual-char='?' sread_1.fastq sread_2.fastq

# points for a histogram
jellyfish histo -o 19mer_out.histo 19mer_out

#Plot results using R
##load the data into dataframe19
dataframe19 <- read.table("19mer_out.histo")
##plots the data points 1 through 200 in the dataframe19 using a line
plot(dataframe19[1:200,], type="l")
##plot the data points from 2 through 100
points(dataframe19[2:100,])

#calculate the total k-mers in the distribution
#nrow() replaces the hard-coded row count (9325 in the original data set)
last_row <- nrow(dataframe19)
sum(as.numeric(dataframe19[2:last_row,1]*dataframe19[2:last_row,2]))

#peak position and genome size
#the plotted graph gives an idea of where the peak position lies
#look at the actual rows around it
dataframe19[10:20,] #If peak more likely to be between 10-20

#If 12 is the peak
sum(as.numeric(dataframe19[2:last_row,1]*dataframe19[2:last_row,2]))/12

#Compare the peak shape with Poisson distribution in R
singleC <- sum(as.numeric(dataframe19[2:28,1]*dataframe19[2:28,2]))/12
poisdtb <- dpois(1:100,12)*singleC
plot(poisdtb, type='l', lty=2, col="green")
#NOTE(review): the next call is disabled because it selects column 12, while
#only columns 1 and 2 of dataframe19 are used anywhere else; on a two-column
#histogram it stops with "undefined columns selected". Confirm the intent.
#lines(dataframe19[1:100,12] * singleC, type = "l", col=3)#, lty=2)
lines(dataframe19[1:100,],type= "l")


#ALTERNATE WAY: use the two khmer scripts linked below instead of Jellyfish + R
#Step 1: run normalize-by-median.py on reads.fa, writing a report to report.txt
#https://github.com/dib-lab/khmer/blob/master/scripts/normalize-by-median.py
python normalize-by-median.py -x 1e8 -k 20 -C 20 -R report.txt reads.fa
#Step 2: run estimate-genome-size.py on reads.fa.keep and report.txt
#(presumably reads.fa.keep is written by step 1 -- confirm)
#https://github.com/dib-lab/khmer-recipes/blob/master/003-estimate-genome-size/estimate-genome-size.py
python estimate-genome-size.py -C 20 -k 20 reads.fa.keep report.txt

Calculate Dinucleotide Frequency with Perl

Rahul Nayak — Sun, 10 Dec 2017 05:51:42 -0600

#!/usr/bin/perl -w
use strict;

# Report single-nucleotide and dinucleotide frequencies for all sequences in
# a FASTA file.
#
# Usage: perl <this script> <fasta-file>
#
# Dinucleotides are counted inside each record only, so no pair spanning the
# boundary between two records is counted.  The observed dinucleotide
# frequency is divided by the number of pairs actually counted; the expected
# frequency is the product of the two single-nucleotide frequencies.

my $fasta_path = $ARGV[0];
die "Usage: $0 <fasta-file>\n" unless defined $fasta_path;

my (%mono_nt, %di_nt);
my $len      = 0;    # total number of residues over all records
my $di_total = 0;    # total number of dinucleotides counted

open my $fasta, '<', $fasta_path or die "Cannot open '$fasta_path': $!\n";
{
    # Read one FASTA record per iteration; the separator stays local to
    # this block.
    local $/ = ">";
    while (my $record = <$fasta>) {
        chomp $record;            # drop the trailing '>' separator
        $record =~ s/\r//g;       # tolerate DOS line endings
        $record =~ s/^\s+//;

        # First line is the header, everything after it is sequence.
        next unless $record =~ /\A[^\n]+\n(.+)/s;
        (my $seq = uc $1) =~ s/\s+//g;
        my $seq_len = length $seq;
        next unless $seq_len;

        $mono_nt{$_}++ for split //, $seq;
        $di_nt{ substr($seq, $_, 2) }++ for 0 .. $seq_len - 2;

        $len      += $seq_len;
        $di_total += $seq_len - 1;
    }
}
close $fasta;

die "No sequence data found in '$fasta_path'\n" unless $len;

print "-"x30, "\nSingle nucleotide frequency:\n";
for my $nt (sort keys %mono_nt) {
    print "$nt\t", $mono_nt{$nt} / $len, "\n";
}

print "\n", "-"x30, "\nDinucleotide frequency:\nDinucleotide\tObs. freq.\tExp. freq.\n";
for my $nt_pair (sort keys %di_nt) {
    my ($first_nt, $second_nt) = split //, $nt_pair;
    print "$nt_pair\t", $di_nt{$nt_pair} / $di_total, "\t",
        $mono_nt{$first_nt} * $mono_nt{$second_nt} / $len / $len, "\n";
}

Create a heatmap with R

Rahul Nayak — Mon, 31 Jul 2017 08:45:58 -0500

# Heatmap of the numeric columns of ppg2008.csv, one row per Name,
# rows ordered by the PTS column.

# brewer.pal() below comes from RColorBrewer, which must be attached first
library(RColorBrewer)

bio <- read.csv("ppg2008.csv", sep=",")

# Sort by PTS, label rows by Name, then keep columns 2 through 20
bio <- bio[order(bio$PTS),]
row.names(bio) <- bio$Name
bio <- bio[,2:20]

# heatmap() needs a numeric matrix; scale="column" scales each column separately
bio_matrix <- data.matrix(bio)
bio_heatmap <- heatmap(bio_matrix, Rowv=NA, Colv=NA, col = brewer.pal(9, "Blues"), scale="column", margins=c(5,10))


##
#Sample DATA
#Name  ,G,MIN,PTS,FGM,FGA,FGP,FTM,FTA,FTP,3PM,3PA,3PP,ORB,DRB,TRB,AST,STL,BLK,TO,PF
#Genome1 ,79,38.6,30.2,10.8,22,0.491,7.5,9.8,0.765,1.1,3.5,0.317,1.1,3.9,5,7.5,2.2,1.3,3.4,2.3
#Genome2 ,81,37.7,28.4,9.7,19.9,0.489,7.3,9.4,0.78,1.6,4.7,0.344,1.3,6.3,7.6,7.2,1.7,1.1,3,1.7
#Genome3,82,36.2,26.8,9.8,20.9,0.467,5.9,6.9,0.856,1.4,4.1,0.351,1.1,4.1,5.2,4.9,1.5,0.5,2.6,2.3


library(ggplot2)

# Read the tab-separated table, label rows by Contig, keep columns 2 to 256
bio <- read.delim("seeTNF_Final")
rownames(bio) <- bio$Contig
bio <- bio[, seq(2, 256)]
data <- as.matrix(bio)
head(data)

# RColorBrewer palette (only used by the commented-out call at the bottom)
library(RColorBrewer)
coul <- colorRampPalette(brewer.pal(8, "PiYG"))(25)
#heatmap(data)

# Use 'scale' to normalize each column
heatmap(data, scale = "column")
#heatmap(data, scale="column", col = coul)

Download the gff files from NCBI using bash script/command

Rahul Nayak — Thu, 08 Jun 2017 08:17:11 -0500

#!/bin/bash

# Download GFF annotation files from NCBI.
#
# For each organism group this script writes a list of *_genomic.gff.gz URLs
# (genomic_file_<group>) into ./genome_gff, then downloads and unpacks the
# first MAX_URLS entries of every list into ./genome_gff/GFF<list name>/.

# Number of URLs taken from the top of each list (the original fixed value
# was 4); override with e.g. MAX_URLS=50 ./script.sh
MAX_URLS="${MAX_URLS:-4}"

# Create the working directory (no error if it already exists) and enter it
mkdir -p genome_gff
cd genome_gff || exit 1

# write_url_list SECTION GROUP PREFIX
#   SECTION  refseq or genbank
#   GROUP    organism group directory on the NCBI FTP site
#   PREFIX   assembly accession prefix (GCF or GCA)
# Fetches the assembly summary, prints column 20 of every non-comment line
# and appends "<assembly>/<assembly>_genomic.gff.gz" to each path.
# awk -F sets the tab separator before the first record is split.
write_url_list() {
  local section="$1" group="$2" prefix="$3"
  curl "ftp://ftp.ncbi.nlm.nih.gov/genomes/${section}/${group}/assembly_summary.txt" \
    | awk -F '\t' '!/^#/ {print $20}' \
    | sed -r 's|(ftp://ftp.ncbi.nlm.nih.gov/genomes/all/.+/)('"${prefix}"'_.+)|\1\2/\2_genomic.gff.gz|' \
    > "genomic_file_${group}"
}

# Look for the genome assembly summary and extract the URLs.
# Comment out a line if you are not interested in that genome set.
write_url_list refseq  fungi                GCF
write_url_list genbank bacteria             GCA
write_url_list refseq  plant                GCF
write_url_list refseq  archaea              GCF
write_url_list refseq  protozoa             GCF
write_url_list refseq  vertebrate_mammalian GCF
write_url_list refseq  vertebrate_other     GCF
write_url_list refseq  invertebrate         GCF
write_url_list refseq  viral                GCF

# Read the URLs from each list file and download them
for f in "$PWD"/genomic_file_*
do
  # Skip anything that is not a list file, including the .head files this
  # loop itself writes (they match the pattern on a second run)
  [ -f "$f" ] || continue
  case "$f" in *.head) continue ;; esac

  echo "Processing $f file..."
  filename=$(basename "$f")
  filename="${filename%.*}"

  # One output directory per list, named GFF<list name>
  mkdir -p "GFF$filename"
  (
    # Subshell: a failed cd cannot leave the script in the wrong directory
    cd "GFF$filename" || exit 1
    # $f is an absolute path, so the .head file is written next to the list
    head -n "$MAX_URLS" "$f" > "$f.head"
    wget --input-file "$f.head"
    # Unpack whatever was downloaded; do nothing when there are no .gz files
    for gz in ./*.gz
    do
      [ -e "$gz" ] && gunzip "$gz"
    done
  )
done