BOL: All

Perl script to count the number of Adenine, Thymine, Guanine and Cytosine in your DNA Sequence

Anjana — Wed, 11 May 2016 10:34:44 -0500

#!/usr/local/bin/perl -w

# When executed, this script asks for the file name of the DNA sequence.
# If the sequence file is not in the same directory as this script, enter the file name along with its path.
# In Windows: c:\dnafile.txt
# In Linux:   /home/user/sequence/dnafile.txt

# Ask for a sequence file, echo its contents, and report how many times each
# of the four nucleotides occurs. Counting is case-insensitive; any other
# non-whitespace character is reported and tallied as an error.
print "ENTER THE FILENAME OF THE DNA SEQUENCE:= ";
my $dna_filename = <STDIN>;
die "No filename was entered\n" unless defined $dna_filename;    # EOF on STDIN
chomp $dna_filename;

# Three-argument open so characters in the typed name cannot change the mode.
my $dna_fh;
unless ( open( $dna_fh, '<', $dna_filename ) )
{
	print "Sorry the file does not exist!!! \n";
	print "Cannot open file \"$dna_filename\"\n";
	die "open failed: $!\n";
}
my @DNA = <$dna_fh>;
close $dna_fh;

my $DNA = join( '', @DNA );
print " \n The original DNA file is:\n  $DNA \n";

# Drop line breaks and any other whitespace before counting.
$DNA =~ s/\s//g;

my %count_of = ( A => 0, C => 0, G => 0, T => 0 );
my $errors   = 0;
foreach my $base ( split( '', $DNA ) ) {
    my $key = uc $base;    # 'a' and 'A' are counted together
    if ( exists $count_of{$key} ) {
        ++$count_of{$key};
    }
    else {
        print "Error - Unknown base: $base\n";
        ++$errors;
    }
}
print "Adenine = $count_of{A}\n";
print "Cytosine = $count_of{C}\n";
print "Guanine = $count_of{G}\n";
print "Thymine = $count_of{T}\n";

if ($errors) {
        print "There were $errors unrecognized bases.\n";
}

Perl script to Mutate a DNA Sequence

Anjana — Wed, 11 May 2016 10:27:58 -0500

#!/usr/local/bin/perl -w

# This script randomly mutates the DNA sequence and generates 10 successive mutation results.
# While executing this script it asks for the file name of the DNA sequence.
# If the DNA sequence file is not in the same directory of this script, enter the file name with its full path.
# Example:
# In Windows: c:\dnafile.txt
# In Linux  : /home/user/sequence/dnafile.txt

use File::Path;

# Ask for a sequence file, load the sequence, and print ten successive random
# point mutations of it. Each round mutates the result of the previous round.
print "ENTER THE FILENAME OF THE DNA SEQUENCE:= ";
my $dnafilename = <STDIN>;
die "No filename was entered\n" unless defined $dnafilename;    # EOF on STDIN
chomp $dnafilename;

my $dna_fh;
unless ( open( $dna_fh, '<', $dnafilename ) )
{
    print "Cannot open file \"$dnafilename\"\n\n";
    exit 1;    # the original jumped to a label that was never defined
}

# Read every line and strip whitespace so that a line break can never be
# picked as the position to mutate.
my $DNA = join( '', <$dna_fh> );
close $dna_fh;
$DNA =~ s/\s//g;
die "No sequence data found in \"$dnafilename\"\n" if $DNA eq '';

# The first mutation is applied silently; the ten printed below follow it.
my $mutant = mutate($DNA);
print "Mutate DNA\n\n";

print "HERE ARE THE 10 SUCCESSIVE MUTATIONS:\n\n";
for my $round ( 1 .. 10 )
  {
    $mutant = mutate($mutant);
    print "$mutant\n";
  }

# Return a copy of the given string in which one randomly chosen position has
# been replaced by a randomly drawn nucleotide that differs from the character
# previously at that position.
sub mutate {
    my ($dna) = @_;

    my $position     = randomposition($dna);
    my $current_base = substr( $dna, $position, 1 );

    # Draw at least once, and keep drawing while the candidate equals the
    # character already in place.
    my $newbase = randomnucleotide();
    while ( $newbase eq $current_base ) {
        $newbase = randomnucleotide();
    }

    substr( $dna, $position, 1, $newbase );
    return $dna;
}
# Return a random integer offset in the range 0 .. length($string) - 1.
sub randomposition {
    my ($string) = @_;

    my $span = length $string;
    return int( rand($span) );
}
# Return one element of the argument list, chosen at random.
sub randomelement {
    my @choices = @_;

    # rand() yields a fraction below the element count; int() turns it into
    # a valid index.
    my $index = int( rand( scalar @choices ) );
    return $choices[$index];
}
# Return one of the four DNA nucleotide letters (A, C, G or T) at random.
sub randomnucleotide {
    return randomelement( qw(A C G T) );
}

Check all seqs in a folder

Anjana — Wed, 11 May 2016 10:16:47 -0500

#!/usr/local/bin/perl -w

# Can be easily modified to run any command on every sequence in a folder
# Directory of sequences
# Directory of sequences
my $myDir = "/home/anjana/seqs";

# Output directory (relative to $myDir or full path)
my $outputDir = "OutDir";

# Path to pattern file
my $patFile = "/home/anjana/patterns/polyA.pat";

# Go to the sequence directory and read its contents
chdir($myDir) or die "Cannot change to $myDir: $!";
opendir( my $dir_handle, "." ) or die "Cannot open .: $!";
my @entries = sort readdir($dir_handle);
closedir($dir_handle);

foreach my $seqFile (@entries)
{
    next unless $seqFile =~ /\.fa$/;      # only files ending in .fa

    print "Processing $seqFile\n";

    # Build the output name by replacing the trailing .fa only; anchoring the
    # pattern keeps an earlier ".fa" inside the name (e.g. x.fasta.fa) intact.
    ( my $outFile = $seqFile ) =~ s/\.fa$/.polyA.out/;

    #User can process these files as per their need
    print "$patFile \t$seqFile \t $outputDir/$outFile\n";
}

BioPerl to convert between sequence formats from Fasta to Genbank

Anjana — Wed, 11 May 2016 09:49:04 -0500

#!/usr/local/bin/perl -w

# Sequence formats to choose from: Fasta, EMBL, GenBank, Swissprot, PIR and GCG

use Bio::SeqIO;

# Name of the FASTA file to convert.
$inFile = "BRCA2.fa";

# Handle that reads sequences from $inFile in Fasta format.
$in = Bio::SeqIO->newFh( -file => $inFile, -format => 'Fasta' );

# Handle that writes sequences in Genbank format. No -file is given, so the
# destination is presumably standard output; confirm against Bio::SeqIO docs.
$out = Bio::SeqIO->newFh( -format => 'Genbank' );

# Copy every sequence from the input handle to the output handle.
while ( my $seq = <$in> ) {
    print $out $seq;
}

Parse a genbank file using regular expressions

Nishi Singh — Tue, 10 May 2016 11:56:26 -0500

#! /usr/local/bin/perl -w

# Pull selected fields out of a GenBank flat file with regular expressions and
# print them: locus name, GI number, definition, organism, gene length, CDS
# coordinates and the protein translation (which may span several lines).
my $genbank = "genbank_file.txt";

open( my $genbank_fh, '<', $genbank )
    or die "cannot open $genbank for reading: $!";

# Flag for multiline translation; 1 means translation "in progress"
my $trans   = 0;
my $protein = '';

while ( my $line = <$genbank_fh> )
{
   if ($trans)  {   # translation still going on
      # Checked first so continuation lines are never mistaken for a field.
      if ( $line =~ /(.*)"/ )  {  # terminal quote; end of translation
         $protein .= $1;
         $protein =~ s/\s+//g;
         print "$protein\n";
         $trans = 0;
      }
      else  {  # no terminal quote; translation continues
         $protein .= $line;
      }
   }
   elsif ( $line =~ /LOCUS\s*(\w*)/ ) {
      print "Locus: $1\n";
   }
   elsif ( $line =~ /VERSION.*GI:(\d*)/ ) {
      print "GI: $1\n";
   }
   elsif ( $line =~ /DEFINITION\s*(.*)\./ ) {
      print "Sequence name: $1\n";
   }
   elsif ( $line =~ /ORGANISM\s*(.*)/ ) {
      print "Organism: $1\n";
   }
   elsif ( $line =~ /gene\s*(\d+)\.\.(\d+)/ ) {
      # Length is derived from both coordinates; the end coordinate alone is
      # only the length when the gene starts at position 1.
      my $gene_length = $2 - $1 + 1;
      print "Gene length: $gene_length\n";
   }
   elsif ( $line =~ /CDS\s*(\d*)\.\.(\d*)/ )  {
   # ex: CDS             357..1541
      my $cds_start = $1;
      my $cds_end   = $2;
      print "CDS: $cds_start - $cds_end\n";
   }
   elsif ( $line =~ m{/translation="(.*)} )  {  # protein product begins
      print "Translation: ";
      $protein = $1;
      if ( $protein =~ s/".*// )  {  # closing quote on the same line
         $protein =~ s/\s+//g;
         print "$protein\n";
      }
      else  {
         $trans = 1;
      }
   }
   else  {
      # Skip this data
   }
}
close $genbank_fh;

# Reaching end of file with the flag still set means the closing quote of the
# translation was never seen.
if ($trans)  {
   print "Problems: end of translation product not found.\n";
}

Check if your computer is ready to use BioPerl

Nishi Singh — Tue, 10 May 2016 11:48:50 -0500

#!/usr/bin/perl
use strict;
use warnings;

#bioperl example code
use strict;
use warnings;

#make the bioperl module (class) accessible to your program
use Bio::Seq;

# Only reached when the "use Bio::Seq" line above loaded without error.
# The trailing newline keeps the shell prompt off the message line.
print "ok - ready to use Bio::Seq\n";

Read lines from input file – print lines that match a regular expression

Nishi Singh — Tue, 10 May 2016 11:46:19 -0500

#!/usr/bin/perl
use strict;
use warnings;

# Read the records stored after the __DATA__ marker at the end of this file,
# one line at a time, and print each line that matches the pattern.
#
# The pattern requires the whole line to be: "AT", an optional "G", any number
# of "C", at least one of A/T/C/G, and finally three to ten "A". Because the
# preceding character class can itself absorb "A", the upper bound of ten does
# not limit how long a trailing run of "A" may be.
while ( my $line = <DATA> ) {
        chomp $line;
        if ( $line =~ /^ATG?C*[ATCG]+?A{3,10}$/ ) {
                print "$line\n";
        }
}

exit();

__DATA__
ATGCCCAA
ATGCCCAAAA
ATGCCCAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD

For a given DNA sequence find its RNA transcript, find its reverse complement and check if the reverse complement contains a start codon

Nishi Singh — Tue, 10 May 2016 11:42:30 -0500

#!/usr/bin/perl
use strict;
use warnings;

my $DNA = "GATTACACAT";

#transcribe DNA to RNA - T changes to U
my $RNA = $DNA;
$RNA =~ s/T/U/g;
print "RNA sequence is $RNA\n";

#find the reverse complement of $DNA using the substitution operator
#first - reverse the sequence
my $rcDNA = reverse($DNA);

#then - complement every base in a single pass. Running four separate s///g
#statements one after another does not work: each later substitution also
#rewrites the bases produced by the earlier ones (T->A followed by A->T
#turns every A and T into T).
my %complement_of = ( A => 'T', C => 'G', G => 'C', T => 'A' );
$rcDNA =~ s/([ACGT])/$complement_of{$1}/g;

print "Reverse complement of $DNA is $rcDNA\n";

#find the reverse complement of $DNA using the translation operator
#first - reverse the sequence
$rcDNA = reverse($DNA);
#then - complement the sequence
$rcDNA =~ tr/ACGT/TGCA/;
#then - print the reverse complement
print "Reverse complement of $DNA is $rcDNA\n";

#look for a start codon in the reverse complement
if ( $rcDNA =~ /ATG/ ) {
	print "Start codon found\n";
}
else {
	print "Start codon not found\n";
}

Count the frequency of base G in a given DNA sequence

Nishi Singh — Tue, 10 May 2016 11:38:32 -0500

#!/usr/bin/perl
use strict;
use warnings;

my $DNA = "GATTACACAT";

#count the G bases in one step: with an empty replacement list, tr///
#leaves $DNA unchanged and returns how many characters matched
my $countG = ( $DNA =~ tr/G// );

#report the total
print "There are $countG G bases\n";

Concatenate two given sequences, and find the length of the new sequence and also print out the second codon of the sequence

Nishi Singh — Tue, 10 May 2016 11:36:27 -0500

#!/usr/bin/perl
use strict;
use warnings;

#assign strings to variables
my $DNA   = "GATTACACAT";
my $polyA = "AAAA";

#append the poly-A tail to the sequence
my $modifiedDNA = join( '', $DNA, $polyA );

#report the combined sequence together with its length
my $DNAlength = length $modifiedDNA;
print "Modified DNA: $modifiedDNA has length $DNAlength\n";

#a codon is three bases long, so the second codon starts at offset 3
my $codon_size = 3;
my $codon      = substr( $modifiedDNA, $codon_size, $codon_size );
print "Second codon is $codon\n";