BOL: All

RPKM normalization - R script

EagleEye — Fri, 24 Jun 2016 17:45:55 -0500

# RPKM normalization of a read-count table.
#
# Input : "input_table.txt", tab-separated with a header row.
#         Column 1 holds the feature identifiers, the LAST column holds the
#         feature length, and every column in between holds the read counts
#         of one sample.
#         NOTE(review): lengths are assumed to be in bases, as the 10^9
#         scaling factor implies -- confirm against the input data.
# Output: "output_table_rpkm.txt", same layout without the length column,
#         with every count replaced by
#             RPKM = 10^9 * count / (length * total reads in that sample)

p <- read.table("input_table.txt", sep = "\t", header = TRUE)

len <- ncol(p)
if (len < 3) {
  stop("input_table.txt needs an ID column, at least one sample column and a length column")
}

rownames(p) <- p[, 1]

l <- as.numeric(p[, len]) # accessing the length column

# Start from the ID and sample columns so the output keeps the original
# column names; the sample columns are overwritten below.
rpkm <- p[, seq_len(len - 1), drop = FALSE]

# Parentheses matter: 2:len-1 would parse as (2:len)-1, i.e. 1:(len-1),
# which processes the ID column and skips the last sample.
for (i in 2:(len - 1)) {
  counts <- as.numeric(p[, i])
  cS <- sum(counts) # Total mapped reads per sample
  rpkm[[i]] <- (10^9) * counts / (l * cS)
}

write.table(rpkm, "output_table_rpkm.txt", sep = "\t", quote = FALSE, row.names = FALSE)

Perl script to extract fasta sequence by matching name/ids !!

Surabhi Chaudhary — Tue, 21 Jun 2016 09:28:19 -0500

#!/usr/bin/perl

use strict;
use warnings;
use Text::Trim qw(trim);

# Usage: perl extractSeqbyID.pl ids.txt seq.fasta Result.fasta
#
# Copies every FASTA record whose header line matches an ID from the list
# file into the output file.
#   ids.txt      one ID per line; only the first tab-separated field is
#                used and any '>' characters are removed
#   seq.fasta    FASTA file to search
#   Result.fasta output file (overwritten)
# The ID is compared with the WHOLE header line (minus '>'), not just its
# first word.

# Count the arguments instead of testing $ARGV[2] for truth, so an output
# file literally named "0" is accepted.
@ARGV >= 3 or die "use extractSeqbyID.pl LIST FASTA OUT\n";

my $list  = shift @ARGV;
my $fasta = shift @ARGV;
my $out   = shift @ARGV;
my %select;

open my $list_fh, '<', $list or die "Cannot open ID list '$list': $!\n";
while (my $line = <$list_fh>) {
    chomp $line;
    next if $line =~ /^\s*$/;
    $line =~ s/>//g;
    my ($id) = split /\t/, $line;
    $id = trim($id);    # also drops a stray "\r" from CRLF files
    next if !defined $id || $id eq '';
    $select{$id} = 1;
}
close $list_fh;

my $size = keys %select;
print "Total Ids $size\n";

open my $out_fh,   '>', $out   or die "Cannot create '$out': $!\n";
open my $fasta_fh, '<', $fasta or die "Cannot open FASTA file '$fasta': $!\n";
{
    # Read one FASTA record per iteration. The separator is localised so
    # the change cannot leak into other code.
    local $/ = "\n>";
    while (my $record = <$fasta_fh>) {
        chomp $record;                # removes the trailing "\n>" separator
        $record =~ s/\A>//;           # only the first record still starts with '>'
        $record =~ s/\s+\z//;         # the last record ends in a bare newline
        next if $record eq '';

        my ($header) = split /\n/, $record, 2;
        my $id = trim($header);
        $id =~ tr/>//d;               # IDs from the list had every '>' removed too

        # Always terminate the record so consecutive hits never run together.
        print {$out_fh} ">$record\n" if exists $select{$id};
    }
}
close $fasta_fh;
close $out_fh or die "Error while writing '$out': $!\n";

Perl script to extract lines with matching ids !!

Surabhi Chaudhary — Tue, 21 Jun 2016 09:24:46 -0500

#!/usr/bin/perl
use strict;
use warnings;

# USAGE: perl extractByIds.pl Idsfile1 file2 > Result
#
# Prints every line of file2 whose first tab-separated field is listed in
# Idsfile1 (one ID per line, compared as an exact string).

@ARGV == 2 or die "USAGE: perl extractByIds.pl Idsfile1 file2 > Result\n";
my ($ids_file, $data_file) = @ARGV;

my %patterns;

# Open the ID file and collect the IDs to search for
open(my $ids_fh, '<', $ids_file)
    or die "ERROR: Could not open ID file '$ids_file': $!\n";
while (my $id = <$ids_fh>)
{
   # Strip only the line ending; chop would eat a real character when the
   # last line has no trailing newline.
   $id =~ s/\r?\n\z//;
   next if $id eq '';
   $patterns{$id} = 1;
}
close $ids_fh;

# Now read the data file
open(my $data_fh, '<', $data_file)
    or die "ERROR: Could not open data file '$data_file': $!\n";
while (my $line = <$data_fh>)
{
   # You might need to adjust this place according to your file type.
   # The key is taken from a copy without the line ending, so a line that
   # consists of the ID alone (no tab) still matches.
   (my $record = $line) =~ s/\r?\n\z//;
   my ($key) = split /\t/, $record, 2;
   next unless defined $key;
   print $line if exists $patterns{$key};
}
close $data_fh;

Perl script to find the absolute "full" path of the file !

Jit — Fri, 17 Jun 2016 08:58:08 -0500

#!/usr/bin/perl

use strict;
use warnings;
use Cwd;
use Cwd qw/ realpath /;

# Two equivalent ways to print the absolute path of this script.

# 1) From __FILE__, the name of the source file being compiled.
my $this_file_full_path = Cwd::abs_path(__FILE__);
defined $this_file_full_path
    or die "Cannot resolve the path of " . __FILE__ . "\n";
print "$this_file_full_path\n";

# 2) From $0, the name the script was invoked with.
my $path = realpath($0);
defined $path or die "Cannot resolve the path of $0\n";
print "$path\n";

Perl script to generate a random pseudo DNA sequence !

Abhi — Mon, 13 Jun 2016 08:49:13 -0500

#!/usr/bin/perl

use strict;
use warnings;

# Prompts for a length N on standard input and prints a FASTA record named
# "Genome" holding N nucleotides drawn at random from A, T, G and C.

print "Enter a number of nucleotides: \n";
my $N = <STDIN>;
defined $N or die "No input received\n";
chomp $N;
$N =~ /^\s*(\d+)\s*$/ or die "Please enter a non-negative whole number\n";
$N = $1;

my @b = qw/A T G C/;
print ">Genome\n";
# $b[...] is a single element; the original @b[...] was a one-element slice.
print $b[ int rand @b ] for 1 .. $N;
print "\n";

Implementation of biological random mutation with Perl

Priya Singh — Thu, 26 May 2016 02:38:01 -0500

#!/usr/bin/perl -w

# Demonstrates random point mutation: mutates a DNA string once, prints the
# original and the mutant, then applies and prints ten further mutations.

use strict;
use warnings;

# A homopolymer is used so the mutated positions are easy to recognise.
my $DNA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
my $mutant;

# Seed the random number generator from the time and the process id.
srand(time | $$);

$mutant = mutate($DNA);

print "Mutate DNA\n\n";
print "Here is the original DNA:\n\n";
print "$DNA\n\n";
print "Here is the mutant DNA:\n\n";
print "$mutant\n\n";
print "Here are 10 more successive mutations:\n\n";
for my $round (1 .. 10) {
    $mutant = mutate($mutant);
    print "$mutant\n";
}

# Subroutine: return a random position inside a string.
#   $string - the string to pick a position in
# Returns an integer from 0 to length($string) - 1.
sub randomposition {
    my ($string) = @_;
    return int(rand(length($string)));
}

# Subroutine: randomly select one element from an array
sub randelement {
    # Takes a list and returns one of its elements chosen at random.
    # Returns nothing when the list is empty.
    my (@array) = @_;
    return unless @array;
    # rand @array yields a fraction in [0, number of elements); the array
    # subscript truncates it to an integer index.
    return $array[rand @array];
}

# Subroutine: return one of the four bases A, C, G, T chosen at random,
# using randelement(). Any arguments passed in are ignored.
sub randomnucleotide {
    my (@nucleotides) = ('A', 'C', 'G', 'T');
    return randelement(@nucleotides);
}

# Subroutine: apply one random point mutation to a DNA string.
#   $dna - the sequence to mutate (the caller's copy is left untouched)
# Returns the sequence with the base at one random position replaced by a
# random base. The new base can equal the old one, in which case the
# returned sequence is unchanged. An empty or undefined sequence is
# returned as is.
sub mutate {
    my ($dna) = @_;
    return $dna unless defined $dna && length $dna;

    my (@nucleotides) = ('A', 'C', 'G', 'T');
    my ($position) = randomposition($dna);
    my ($newbase) = randomnucleotide(@nucleotides);

    # substr($string, $initial_position, $length, $replacement)
    substr($dna, $position, 1, $newbase);
    return $dna;
}

Count GC Content in nucleotide sequence with Perl

Radha Agarkar — Sat, 21 May 2016 22:56:18 -0500

#!/usr/bin/perl -w

### Usage: get_gc_content.pl <fasta file>                                            ###

# Reads a FASTA file and writes one tab-separated row per sequence to
# gc_out.txt: ID, %GC, total letter count and the G, C, A and T counts.
#
# This section and process_it() communicate through the package variable
# $seq (the sequence collected so far) and the filehandle OUT, so both are
# deliberately left as globals here.

#---------------------------------------------------------------------------------------------------------------------------
#Deal with passed parameters
#---------------------------------------------------------------------------------------------------------------------------
if ($#ARGV == -1) {
    usage();
    exit;
}
$fasta_file = $ARGV[0];
$out_file = "gc_out.txt";
unless ( open(IN, "<", $fasta_file) ) {
    print STDERR "Got a bad fasta file: $fasta_file ($!)\n\n";
    exit 1;
}
unless ( open(OUT, ">", $out_file) ) {
    print STDERR "Couldn't create $out_file ($!)\n";
    exit 1;
}
print "Parameters:\nfasta file = $fasta_file\noutput file = $out_file\n\n";
#---------------------------------------------------------------------------------------------------------------------------
#The main event
#---------------------------------------------------------------------------------------------------------------------------
print OUT "ID\t% GCContent\tTotal Count\tG Count\tC Count\tA Count\tT Count\n";
$seq = "";
$have_record = 0;    # true once an ID cell has been written and its stats are still owed
while (<IN>) {
    chomp;
    s/\r$//;         # tolerate CRLF line endings
    if (/^>/) {
	#finish up the previous record, even when its sequence is empty,
	#so every ID cell is followed by its statistics.
	process_it() if $have_record;
	#start a new record: the ID is the first word of the header.
	($id) = /^>\s*(\S*)/;
	print OUT "$id\t";
	$have_record = 1;
    }
    elsif (/\S/) {
	#sequence data before any header: emit an empty ID cell so the
	#columns of that row stay aligned.
	unless ($have_record) {
	    print OUT "\t";
	    $have_record = 1;
	}
	$seq = $seq . $_;
    }
}

#finish up the last record (nothing to do for an empty file).
process_it() if $have_record;

close(IN);
unless ( close(OUT) ) {
    print STDERR "Error while writing $out_file ($!)\n";
    exit 1;
}

# Print a short usage message to standard output.
# $0 is the name the script was invoked with.
sub usage {
    print "Usage: $0 <fasta file>\n";
    print "Writes the GC content of every sequence to gc_out.txt\n";
}

# Write the statistics for the sequence currently held in the package
# variable $seq to the filehandle OUT, then reset $seq to "".
#
# The row written is
#     %GC <tab> total <tab> G <tab> C <tab> A <tab> T <newline>
# where total counts every ASCII letter (so N and other codes are included)
# and %GC is 100 * (G + C) / total, or 0 when there are no letters.
# Counting is case-insensitive.
sub process_it {
    $seq = "" unless defined $seq;

    # tr/// with an empty replacement list counts matching characters
    # without modifying the string.
    my $gcount     = ($seq =~ tr/Gg//);
    my $ccount     = ($seq =~ tr/Cc//);
    my $acount     = ($seq =~ tr/Aa//);
    my $tcount     = ($seq =~ tr/Tt//);
    my $totalcount = ($seq =~ tr/A-Za-z//);

    my $gccount   = $gcount + $ccount;
    my $gccontent = $totalcount > 0 ? (100 * $gccount) / $totalcount : 0;

    print OUT "$gccontent\t$totalcount\t$gcount\t$ccount\t$acount\t$tcount\n";
    $seq = "";
}

Needleman-Wunsch Algorithm in Perl

Radha Agarkar — Sat, 21 May 2016 22:07:06 -0500

#!/usr/bin/perl

# USAGE:   perl nw.pl HEAGAWGHEE PAWHEAE BLOSUM50.txt -8

# Global pairwise alignment of two sequences. Substitution scores are read
# from a matrix file and every gap position costs the given (linear) gap
# cost. Prints the filled score matrix followed by the alignment.

# See:     "Biological sequence analysis" Durbin et al. ed. CUP 1998, Pg. 19
# Needleman-Wunsch global alignment algo (Gotoh 1982 mod)

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# usage statement
die "usage: $0 <seq1> <seq2> <substitution matrix file> <gap cost>\n" unless @ARGV == 4;

# get sequences, matrix and gapcost from command line
my ($seq1, $seq2, $smfile, $gapcost) = @ARGV;
die "gap cost must be a number, got '$gapcost'\n" unless looks_like_number($gapcost);

# scoring scheme: substitution scores come from the matrix file, gaps are linear
my $GAP      = $gapcost; # for any gap
my %BLOSUM50 = ();
my @aalist = ();

# read substitution matrix
# Layout expected: line 1 is a title/comment line and is always skipped;
# the next non-comment line lists the column residues; every following line
# is "<row residue> <score> <score> ...".
open(my $matrix_fh, '<', $smfile) or die "Cannot open substitution matrix '$smfile': $!\n";
while (my $line = <$matrix_fh>) {
    next if $. < 2;
    next if $line =~ /^\s*(?:#|$)/;    # further comment lines and blank lines

    # split ' ' ignores leading whitespace and the line ending, so the
    # column header may be indented or not.
    my @fields = split ' ', $line;

    # read column names (aa)
    if (!@aalist) {
        @aalist = @fields;
        next;
    }

    my $curaaROW = shift @fields;
    for my $col (0 .. $#fields) {
        last if $col > $#aalist;
        $BLOSUM50{$curaaROW}{ $aalist[$col] } = $fields[$col];
    }
}
close $matrix_fh;
die "No substitution scores found in '$smfile'\n" unless %BLOSUM50;

# initialization
# $matrix[$i][$j] describes the best alignment of the first $i letters of
# seq2 with the first $j letters of seq1.
my @matrix;
$matrix[0][0]{score}   = 0;
$matrix[0][0]{pointer} = "none";
for my $j (1 .. length($seq1)) {
    $matrix[0][$j]{score}   = $GAP * $j;
    $matrix[0][$j]{pointer} = "left";
}
for my $i (1 .. length($seq2)) {
    $matrix[$i][0]{score}   = $GAP * $i;
    $matrix[$i][0]{pointer} = "up";
}

# fill
for my $i (1 .. length($seq2)) {
    for my $j (1 .. length($seq1)) {

        # calculate match score (matches and mismatches are both looked up
        # in the substitution matrix)
        my $letter1 = substr($seq1, $j-1, 1);
        my $letter2 = substr($seq2, $i-1, 1);
        my $substitution = $BLOSUM50{$letter1}{$letter2};
        die "No substitution score for '$letter1' vs '$letter2' in '$smfile'\n"
            unless defined $substitution;
        my $diagonal_score = $matrix[$i-1][$j-1]{score} + $substitution;

        # calculate gap scores
        my $up_score   = $matrix[$i-1][$j]{score} + $GAP;
        my $left_score = $matrix[$i][$j-1]{score} + $GAP;

        # choose best score; on ties diagonal wins over up and left,
        # and up wins over left
        if ($diagonal_score >= $up_score && $diagonal_score >= $left_score) {
            $matrix[$i][$j]{score}   = $diagonal_score;
            $matrix[$i][$j]{pointer} = "diagonal";
        }
        elsif ($up_score > $diagonal_score && $up_score >= $left_score) {
            $matrix[$i][$j]{score}   = $up_score;
            $matrix[$i][$j]{pointer} = "up";
        }
        else {
            $matrix[$i][$j]{score}   = $left_score;
            $matrix[$i][$j]{pointer} = "left";
        }
    }
}

# trace-back

my $align1 = "";
my $align2 = "";
my $descrstr = "";

# start at last cell of matrix
my $j = length($seq1);
my $i = length($seq2);

while (1) {
    last if $matrix[$i][$j]{pointer} eq "none"; # ends at first cell of matrix

    if ($matrix[$i][$j]{pointer} eq "diagonal") {
        my $letter1 = substr($seq1, $j-1, 1);
        my $letter2 = substr($seq2, $i-1, 1);
        $align1 .= $letter1;
        $align2 .= $letter2;
        $descrstr .= $letter1 eq $letter2 ? "|" : ".";
        $i--;
        $j--;
    }
    elsif ($matrix[$i][$j]{pointer} eq "left") {
        $align1 .= substr($seq1, $j-1, 1);
        $align2 .= "-";
        $descrstr .= " ";
        $j--;
    }
    elsif ($matrix[$i][$j]{pointer} eq "up") {
        $align1 .= "-";
        $align2 .= substr($seq2, $i-1, 1);
        $descrstr .= " ";
        $i--;
    }
}

# the alignment was collected from the end, so turn it around
$align1 = reverse $align1;
$align2 = reverse $align2;
$descrstr = reverse $descrstr;

# print matrices:
print "\n\n";

for my $row (0 .. length($seq2)) {
    for my $col (0 .. length($seq1)) {
        printf("%2.1f", $matrix[$row][$col]{score});
        print("\t");
    }
    print "\n";
}
print "\n\n";

# print the alignment:
print "$align1\n";
print "$descrstr\n";
print "$align2\n";

__END__
# Entries for the BLOSUM50 matrix at a scale of ln(2)/3.0.
Find matrix at http://bioinformaticsonline.com/file/view/27455/blosum50-matrix

Generating a random string with Perl

Abhi — Fri, 20 May 2016 05:13:20 -0500

#!/usr/bin/perl

# This function generates random strings of a given length.
#   argument: the length of the random string to generate
#   returns : a string of that many characters drawn from a-z, A-Z, 0-9
#             and '_'; the empty string when the length is missing or
#             not positive
sub generate_random_string
{
	my $length_of_randomstring=shift;

	my @chars=('a'..'z','A'..'Z','0'..'9','_');

	# Start from '' so a zero-length request returns an empty string
	# instead of undef.
	my $random_string='';
	return $random_string
		unless defined $length_of_randomstring && $length_of_randomstring > 0;

	foreach (1..$length_of_randomstring)
	{
		# rand @chars will generate a random
		# number between 0 and scalar @chars
		$random_string.=$chars[rand @chars];
	}
	return $random_string;
}

# Build an 11-character random string, then report it and its length.
my $random_string = generate_random_string(11);

print "Random string: ", $random_string, "\n";
print "Length: ", length($random_string), "\n";

Find the number of each 2 consecutive characters AA, AC,AG,AT,CC,CA... with Perl

Jit — Wed, 18 May 2016 08:50:04 -0500

#!/usr/bin/perl -w

use strict;

# Count every pair of consecutive nucleotides in a sequence.
#   $sequence - string to scan; only pairs made of A, C, G and T count
# Returns a hash mapping each dinucleotide found to its number of
# occurrences. Pairs overlap: "AAA" contains "AA" twice.
sub count_dinucleotides {
    my ($sequence) = @_;
    my %count_of;
    return %count_of unless defined $sequence;

    # The look-ahead captures two characters without consuming them, so the
    # next match starts one position later. A plain /[ACGT][ACGT]/g would
    # consume both characters and miss every second pair.
    while ($sequence =~ /(?=([ACGT]{2}))/g) {
        $count_of{$1}++;
    }
    return %count_of;
}

my $subject = "AACGTACTGACGTACTGGTTGGTACGA";
my %results = count_dinucleotides($subject);

foreach (sort keys %results) {
    print "$_ : $results{$_}\n";
}