BOL: All

String matching with Perl

Abhi — Wed, 18 May 2016 08:37:27 -0500

#!/usr/bin/perl

use strict;
use warnings;

# Demonstrates basic string handling on nucleotide sequences: testing whether
# a motif occurs, locating it with index(), and substituting it globally.
#
# The string literals use plain ASCII double quotes; typographic quotes are
# not string delimiters in Perl and make the script fail to compile.

# three nucleotide strings and the motif to search for
my $dna1    = "AAAAAAAAAAAAAAATGAAAAAAAAAAAAAAAA";
my $dna2    = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
my $dna3    = "ATGAAAAAAAAAATGAAAAAAAAAAAATGAAAA";
my $pattern = "ATG";


# match pattern to dna1
# No /g here: a yes/no test needs a single match, and /g in scalar context
# would only leave a match position behind on the string.
# The result is normalised to 1/0 so "not found" prints visibly.
my $m = ( $dna1 =~ m/$pattern/ ) ? 1 : 0;

print "Was the ATG pattern found in dna1 : $m \n";


# match pattern in dna2
my $m2 = ( $dna2 =~ m/$pattern/ ) ? 1 : 0;

print "Was the ATG pattern found in dna2 : $m2 \n";


# find the position of the pattern match in dna1
# index() returns the 0-based offset of the first occurrence, or -1 if absent
my $pos = index($dna1, $pattern);

print "The match position of ATG in dna1 is : $pos \n";


# replace the ATG sites with CCC (every occurrence, because of /g)
$dna3 =~ s/ATG/CCC/g;

print "Replaced dna3 with CCC is : $dna3 \n";

Extract sequence from UCSC

Jit — Tue, 17 May 2016 08:08:26 -0500

#!/usr/bin/env perl

use strict;
use warnings;
use LWP::Simple;
use XML::XPath;
use XML::XPath::XMLParser;

# Use DAS of UCSC to fetch specific sequence by its given chromosome position
# From here: https://www.biostars.org/p/6156/
#
# Arguments: chromosome number, centre position, window size.
# Output:    one FASTA record (header line plus sequence) on STDOUT.

my $chr  = shift;
my $pos  = shift;
my $size = shift;

my $usage = "Example: perl extract_seq_from_ucsc.pl 14 482780 1000\n";

if (! $size) {
	die "ERROR: You must pass three arguments: chr. num., position, and size.\n$usage";
}

chomp $size;

# Genome assembly queried on the DAS server; also used in the FASTA header.
my $genome = "papAnu2";

# Window centred on $pos. int() keeps the coordinates whole when $size is
# odd, so the URL never carries a fractional position.
my $half  = int($size / 2);
my $start = $pos - $half;
my $end   = $pos + $half;

# Figure out URL for the DAS server. Example:
# http://genome.ucsc.edu/cgi-bin/das/calJac3/dna?segment=chr14:482280,483280

my $URL_gene = "http://genome.ucsc.edu/cgi-bin/das/" . $genome . "/dna?segment=chr";
$URL_gene .= $chr . ":" . $start . "," . $end;

# LWP::Simple::get returns undef on any failure; stop with a clear message
# instead of handing undef to the XML parser.
my $xml = get($URL_gene);
die "ERROR: Could not retrieve $URL_gene\n" unless defined $xml;

my $xp = XML::XPath->new(xml=>$xml);

my $nodeset = $xp->find('/DASDNA/SEQUENCE/DNA/text()'); # find all sequences
# there should be only one node, anyway:
foreach my $node ($nodeset->get_nodelist) {

	my $seq = $node->getValue;
	$seq =~ s/\s//g; # remove white spaces
	print ">" . $genome . "_chr" . $chr . ":" . $start . "-" . $end . "\n";
	print $seq, "\n";

}

Perl to print an individual nucleotide from a sequence!

Jit — Fri, 13 May 2016 10:17:35 -0500

#!/usr/bin/perl
use strict;
use warnings;

# Break a nucleotide string (which may contain '?' placeholders) into
# one-character elements and report the first element.
my $sequence    = "ATGCTTGCGT?AAATG??CT?GCGTA";
my @nucleotides = split //, $sequence;

printf "First character: %s\n", $nucleotides[0];

Parse a Fasta file with Perl

Radha Agarkar — Fri, 13 May 2016 05:00:18 -0500

#!/usr/bin/env perl

# Usage:  fastaRead.pl data.fa
#
# Reads a FASTA file line by line, reports whether each line is a header
# (starts with '>') or sequence data, and accumulates all sequence lines
# into one string.

use strict;
use warnings;

my $filename = $ARGV[0];

# Without a file name open() would fail with an unhelpful message.
die "Usage: $0 data.fa\n" unless defined $filename;

# All sequence lines of the file, concatenated with line endings removed.
my $sequence = '';

# $! carries the operating-system reason (missing file, permissions, ...).
open my $fileH, "<", $filename or die "could not open $filename: $!\n";
while (my $line = <$fileH>) {
    chomp $line;
    if ($line =~ /^>/) {
        print "this line is a header: $line\n";
    }
    else {
        print "this line contains sequence data: $line\n";
        # Concatenate everything from the file into a single var
        $sequence .= $line;
    }
}
close $fileH;

Perl program to implement sliding window !

Radha Agarkar — Fri, 13 May 2016 04:28:24 -0500

#!/usr/bin/perl -w

use strict;

# Loads a lookup table from a tab-separated file (column 2 -> column 4), then
# walks a second tab-separated file of "id<TAB>value" lines and selects the
# lines whose value lies strictly within +/- margin of the table entry for
# the same id.

my $filename = 'data.txt';

# NOTE(review): the original snippet used $file2 and $margin without ever
# defining them. Taking them from the command line is an assumption; confirm
# where these values are meant to come from.
my ($file2, $margin) = @ARGV;
die "Usage: $0 <second_file> <margin>\n"
    unless defined $file2 and defined $margin;

# The original had "my TR" and an empty readline "while (my $line1=)"; the
# handle sigil and the <...> operator are restored here.
open(my $TR, '<:encoding(UTF-8)', $filename)
  or die "Could not open file '$filename' $!";

my %hash;
while (my $line1 = <$TR>)
{
    chomp($line1);
    my @ar = split(/\t/,$line1);
    next if @ar < 4;            # need both the key and the value column
    $hash{$ar[1]} = $ar[3];
}
close $TR;

open my $SC, "<", $file2 or die "Error blah blah... $!";
while (my $line2 = <$SC>)
{
    chomp($line2);
    my ($id, $val) = split /\t/, $line2;
    next unless defined $id and defined $val;   # blank or short line

    my $val_file1 = $hash{$id};
    next unless defined $val_file1;             # id absent from the first file

    if ( $val > $val_file1 - $margin and $val < $val_file1 + $margin) {
        # print out something
    }
}
close $SC;

Find and replace ambiguous characters in fasta file with Perl and Bioperl

Radha Agarkar — Fri, 13 May 2016 03:20:09 -0500

#!/usr/bin/perl -w

use strict;
use warnings;

# Reports FASTA sequences containing characters other than A, T, G, C and
# '-'. The offending characters are listed on STDERR. With -m the characters
# are replaced and all sequences are written to STDOUT in FASTA format.

my $usage="\nUsage: $0 [-h] [-m char] [fastaFileName1 ...]\n".
    "  -h: help\n".
    "  -m: missing character\n".
    "Print out the name of sequences with characters other than ATGC-.\n".
    "If -m is specified, the ambiguous characters are repleced with the\n".
    "specified character.  e.g. -m '?' will place ? to the ambigous characters.\n" .
    "If multiple files are given, sequences in all files are marged.  If no \n".
    "argument is given, it will take STDIN as the input\n";

# Getopt::Std fills these package variables.
our($opt_h, $opt_m);

use Bio::SeqIO;

use Getopt::Std;
getopts('hm:') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

my $format = "fasta";
my @seqArr = ();

# No file arguments: read from standard input.
@ARGV = ('-') unless @ARGV;

# defined() keeps a file literally named "0" from ending the loop early.
while (defined(my $file = shift @ARGV)) {
    # '-' stands for STDIN; pass it as a filehandle, the documented way to
    # read a stream with Bio::SeqIO.
    my @source = ($file eq '-') ? (-fh => \*STDIN) : (-file => $file);
    my $seqio_obj = Bio::SeqIO->new(@source, -format => $format);
    while (my $seq = $seqio_obj->next_seq()) {
        push(@seqArr, $seq);
    }
}

#@seqArr = sort { $a->id() cmp $b->id() } @seqArr;

foreach my $s (@seqArr) {
    my $thisSeq = $s->seq();
    my $ambig = AmbiguousChar($thisSeq);
    if ($ambig ne "") {
        # sequence id and the ambiguous characters found in it
        print STDERR $s->id(), "\t$ambig\n";
        if (defined($opt_m)) {
            $thisSeq = ReplaceAmbiguousChar($thisSeq, $opt_m);
            $s->seq($thisSeq);
        }
    }
}

if (defined($opt_m)) {
    # The filehandle argument of Bio::SeqIO->new is -fh (the original -fs is
    # not a recognised option).
    my $seqOut = Bio::SeqIO->new(-fh => \*STDOUT, -format => $format);
    foreach my $s (@seqArr) {
        $seqOut->write_seq($s);
    }
}
exit;


sub AmbiguousChar {
    # Return what remains of the given sequence after deleting the symbols
    # A, T, G, C, '-' and every whitespace character. An empty result means
    # nothing else was present.
    my ($residue) = @_;
    $residue =~ s/[ATGC\s-]+//g;
    return $residue;
}

sub ReplaceAmbiguousChar {
    # Substitute every character other than A, T, G, C or '-' with the
    # supplied replacement and return the edited copy of the sequence.
    my $sequence    = shift;
    my $replacement = shift;
    $sequence =~ s{[^ATGC-]}{$replacement}g;
    return $sequence;
}

Blast result parser with Perl and Bioperl

Radha Agarkar — Fri, 13 May 2016 03:15:06 -0500

#!/usr/local/bin/perl

#
#	Dr. Xiaodong Bai
#	It may be freely distributed under GNU General Public License.
#	This script will parse a NCBI blastx output file and output the top N hits of each blast search result.
#	For each hit, the following results are reported:
#	accession number, length, description, E value, bit score, query frame, query start, query end, hit start, hit end, positives, and identical
#	The results are tab-delimited and ready for import into a spreadsheet program for browsing and further analysis.
#

use strict;
use warnings;
use Bio::SearchIO;

# Usage information (the argument placeholders were missing from the message)
die "Usage: $0 <blast_report> <num_hits> <output_file>\n" if (@ARGV != 3);

my ($infile,$numHits,$outfile) = @ARGV;
print "Parsing the BLAST result ...";
my $in = Bio::SearchIO->new(-format => 'blast', -file => $infile);

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode.
open(my $out, '>', $outfile) or die "Cannot open $outfile: $!";

# print the header info for tab-delimited columns
print {$out} "query_name\tquery_length\taccession_number\tlength\tdescription\tE value\tbit score\tframe\tquery_start\t";
print {$out} "query_end\thit_start\thit_end\tpositives\tidentical\n";

# extract the information for each result in turn
while ( my $result = $in->next_result ) {
    # the name of the query sequence
    print {$out} $result->query_name . "\t";

    # the length of the query sequence
    print {$out} $result->query_length;

    # output "no hits found" if there are no hits
    if ( $result->num_hits == 0 ) {
        print {$out} "\tNo hits found\n";
        next;
    }

    my $count = 0;

    # process each hit in turn
    while (my $hit = $result->next_hit) {
        # second and later hits: extra tab so the query_name and
        # query_length columns stay empty
        print {$out} "\t" if ($count > 0);
        # get the accession number of the hit
        print {$out} "\t" . $hit->accession . "\t";
        # get the length of the hit sequence
        print {$out} $hit->length . "\t";
        # get the description of the hit sequence
        print {$out} $hit->description . "\t";
        # get the E value of the hit
        print {$out} $hit->significance . "\t";
        # get the bit score of the hit
        print {$out} $hit->bits . "\t";

        my $hspcount = 0;

        # process every HSP of this hit, one output line per HSP
        while (my $hsp = $hit->next_hsp) {
            # second and later HSPs: skip the seven query/hit columns
            print {$out} "\t\t\t\t\t\t\t" if ($hspcount > 0);
            # get the frame of the query sequence
            print {$out} $hsp->query->frame . "\t";
            # get the start and the end of the query sequence in the alignment
            print {$out} $hsp->start('query') . "\t" . $hsp->end('query'). "\t";
            # get the start and the end of the hit sequence in the alignment
            print {$out} $hsp->start('hit') . "\t" . $hsp->end('hit') . "\t";
            # get the similarity value
            printf {$out} "%.1f" , ($hsp->frac_conserved * 100);
            print {$out} "%\t";
            # get the identity value
            printf {$out} "%.1f" , ($hsp->frac_identical * 100);
            print {$out} "%\n";
            $hspcount++;
        }
        $count++;

        # flow control for the number of hits needed
        last if ($count == $numHits);
    }
}

# Buffered write errors only surface at close, so check it.
close($out) or die "Cannot close $outfile: $!";
print " DONE!!!\n";

Extract a random sequence from a file

Abhi — Thu, 12 May 2016 11:02:24 -0500

#!/usr/local/bin/perl -w

use strict;
use warnings;
use autodie;

use List::Util qw/ shuffle /;

# For every line of seq1.fa that starts with nucleotide letters (A, T, G, C,
# N), pick $count distinct random start offsets and write the $size-long
# substring at each offset to the output file, one per line.

my $outputfile = 'randomoutput.txt';

# autodie makes open/close throw on failure, so no explicit "or die" needed.
open my $in_fh,  '<', 'seq1.fa';
open my $out_fh, '>', $outputfile;

my $size       = 21;   # length of each extracted substring
my $count      = 10;   # number of substrings per input line

while (my $line = <$in_fh>) {
   # Header or otherwise non-sequence lines do not match and are skipped.
   next unless $line =~ /^([ATGCN]+)/;

   my $genome     = $1;
   my $len_genome = length $genome;

   # Every offset at which a full $size-long window still fits, shuffled.
   my @start_points = shuffle(0 .. $len_genome-$size);
   next unless @start_points >= $count;

   # Write to the output file that was opened above (the original printed to
   # STDOUT and left the file empty) and use $size rather than a literal 21.
   print {$out_fh} substr($genome, $_, $size), "\n" for @start_points[0 .. $count-1];
}

close $in_fh;
close $out_fh;

A multilayer perceptron (MLP) neural network in Perl

Jit — Wed, 11 May 2016 11:48:08 -0500

#!/usr/local/bin/perl -w

####################################################
#MLP neural network in Perl Original source code by Phil Brierley
#Translated into perl - ccolbourn Oct 2004
####################################################


#Tanh hidden neurons
#Linear output neuron

#To include an input bias create an
#extra input in the training data
#and set to 1


################ User settings #########
# Training schedule and network size chosen by the user:
# epochs to run, hidden neurons, and the two learning rates
# (input->hidden and hidden->output).
my ($numEpochs, $numHidden) = (500, 4);
my ($LR_IH, $LR_HO)         = (0.7, 0.07);

################ Data dependent settings #########
# inputs per pattern and number of training patterns
my ($numInputs, $numPatterns) = (3, 4);

########################################

# Working state shared by the subs below: current pattern index, error and
# prediction for that pattern, and the overall RMS error.
my ($patNum, $errThisPat, $outPred, $RMSerror);

# the training patterns and their target outputs
my (@trainInputs, @trainOutput);


# the outputs of the hidden neurons
my @hiddenVal;

# the weights: input->hidden (indexed [input][hidden]) and hidden->output
my (@weightsIH, @weightsHO);


main();


#==============================================================
#********** THIS IS THE MAIN PROGRAM **************************
#==============================================================

sub main
{
    # Entry point: initialise weights and data, train the network, then show
    # the fitted output for every pattern.

    # initiate the weights
    initWeights();

    # load in the data
    initData();

    # train the network
    # NOTE: the bound is inclusive, so numEpochs + 1 passes are run
    # (epochs 0 .. numEpochs), as in the original loop.
    for my $epoch (0 .. $numEpochs)
    {
        for my $step (1 .. $numPatterns)
        {
            # select a pattern at random: a whole-number index in
            # 0 .. numPatterns-1 (the original produced a fractional value
            # and relied on array subscripts truncating it)
            $patNum = int(rand($numPatterns));

            # calculate the current network output
            # and error for this pattern
            calcNet();

            # change network weights
            WeightChangesHO();
            WeightChangesIH();
        }

        # display the overall network error
        # after each epoch
        calcOverallError();

        print "epoch = ".$epoch."  RMS Error = ".$RMSerror."\n";
    }

    # training has finished
    # display the results
    displayResults();
}

#============================================================
#********** END OF THE MAIN PROGRAM **************************
#=============================================================






#***********************************
sub calcNet
{
    # Forward pass for the pattern selected by $patNum.
    # Fills @hiddenVal, then sets $outPred (network output) and $errThisPat
    # (prediction minus target).
    #
    # Declared without a prototype: the sub is called before its definition,
    # and an empty "()" prototype only produces "called too early to check
    # prototype" warnings under -w.

    # calculate the outputs of the hidden neurons
    # the hidden neurons are tanh
    for my $h (0 .. $numHidden - 1)
    {
        my $activation = 0.0;

        for my $in (0 .. $numInputs - 1)
        {
            $activation += $trainInputs[$patNum][$in] * $weightsIH[$in][$h];
        }

        $hiddenVal[$h] = tanh($activation);
    }

    # calculate the output of the network
    # the output neuron is linear
    $outPred = 0.0;

    for my $h (0 .. $numHidden - 1)
    {
        $outPred += $hiddenVal[$h] * $weightsHO[$h];
    }

    # calculate the error
    $errThisPat = $outPred - $trainOutput[$patNum];
}


#************************************
 sub WeightChangesHO
 {
    # Adjust the hidden->output weights using the error of the current
    # pattern ($errThisPat) and the hidden outputs from the last calcNet().
    #
    # No prototype (the sub is called before its definition), and
    # $weightChange is now a lexical instead of an accidental package global.
    for my $k (0 .. $numHidden - 1)
    {
        my $weightChange = $LR_HO * $errThisPat * $hiddenVal[$k];
        $weightsHO[$k] -= $weightChange;

        # regularisation on the output weights: clamp to [-5, 5]
        if ($weightsHO[$k] < -5)
        {
            $weightsHO[$k] = -5;
        }
        elsif ($weightsHO[$k] > 5)
        {
            $weightsHO[$k] = 5;
        }
    }
 }


#************************************
 sub WeightChangesIH
 {
    # Adjust the input->hidden weights for the current pattern.
    # No prototype: the sub is called before its definition.
    for my $h (0 .. $numHidden - 1)
    {
        for my $k (0 .. $numInputs - 1)
        {
            # (1 - hiddenVal^2) scaled by the output weight, the current
            # error, the learning rate and the input value, multiplied in the
            # same order as before
            my $weightChange = (1 - ($hiddenVal[$h] * $hiddenVal[$h]))
                             * $weightsHO[$h] * $errThisPat * $LR_IH
                             * $trainInputs[$patNum][$k];
            $weightsIH[$k][$h] -= $weightChange;
        }
    }
 }


#************************************
 sub initWeights
 {
    # Give every weight a small random starting value:
    # hidden->output in [-0.25, 0.25), input->hidden in [-0.1, 0.1).
    # No prototype: the sub is called before its definition.
    for my $h (0 .. $numHidden - 1)
    {
        $weightsHO[$h] = (rand() - 0.5)/2;

        for my $in (0 .. $numInputs - 1)
        {
            $weightsIH[$in][$h] = (rand() - 0.5)/5;
        }
    }
 }


#************************************
 sub initData
 {
    # Load the training set into @trainInputs / @trainOutput.
    # No prototype: the sub is called before its definition.

    print "initialising data\n";

    # the data here is the XOR data
    # it has been rescaled to the range [-1, 1]
    # an extra input valued 1 is also added
    # to act as the bias
    # the output must lie in the range -1 to 1

    #                 in1  in2  bias
    @trainInputs = (
                    [  1,  -1,  1 ],
                    [ -1,   1,  1 ],
                    [  1,   1,  1 ],
                    [ -1,  -1,  1 ],
                   );

    # target output for each pattern above, in the same order
    @trainOutput = ( 1, 1, -1, -1 );
 }


#************************************
 sub tanh
 {
    # Hyperbolic tangent of the single numeric argument.
    # Inputs beyond +/-20 return exactly 1 or -1 without calling exp().
    #
    # The original was declared "sub tanh()": an empty prototype states the
    # sub takes no arguments although it is always called with one. It only
    # worked because the call site is compiled before the definition.
    my $x = shift;

    return 1  if $x > 20;
    return -1 if $x < -20;

    # $a and $b are avoided as names: they are Perl's special sort variables.
    my $pos_exp = exp($x);
    my $neg_exp = exp(-$x);
    return ($pos_exp - $neg_exp)/($pos_exp + $neg_exp);
 }


#************************************
 sub displayResults
 {
    # Run the network on every training pattern and print the target value
    # next to the network's prediction.
    # No prototype: the sub is called before its definition.
    for my $i (0 .. $numPatterns - 1)
    {
        # calcNet() reads the shared $patNum, so it must be assigned here
        $patNum = $i;
        calcNet();
        print "pat = ".($patNum+1)." actual = ".$trainOutput[$patNum]." neural model = ".$outPred."\n";
    }
 }


#************************************
sub calcOverallError
{
    # Store in $RMSerror the root-mean-square of the per-pattern errors over
    # the whole training set.
    # No prototype: the sub is called before its definition.
    my $sum_sq = 0.0;

    for my $i (0 .. $numPatterns - 1)
    {
        # calcNet() reads the shared $patNum and updates $errThisPat
        $patNum = $i;
        calcNet();
        $sum_sq += $errThisPat * $errThisPat;
    }

    $RMSerror = sqrt($sum_sq/$numPatterns);
}

Retrieve NCBI GenBank records with a range of accession numbers

Anjana — Wed, 11 May 2016 11:02:40 -0500

#!/usr/bin/perl

#FILE: ncbi_search.pl
#AUTH: Paul Stothard (paul.stothard@gmail.com)

use warnings;
use strict;
use Getopt::Long;
use LWP::Simple;
use URI::Escape;

use LWP::UserAgent;
use HTTP::Request::Common;

# Settings for the search. The undef entries are filled from the command
# line below; url, retries and maxRetries are fixed defaults.
my %param = (
    query      => undef,
    outputFile => undef,
    database   => undef,
    returnType => undef,
    maxRecords => undef,
    format     => undef,
    verbose    => undef,
    # E-utilities base URL. NCBI serves the E-utilities over HTTPS from the
    # eutils host; the older plain-http www address is no longer the
    # documented endpoint.
    url        => 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils',
    retries    => 0,
    maxRetries => 5,
    help       => undef
);

Getopt::Long::Configure('bundling');
GetOptions(
    'q|query=s'       => \$param{query},
    'o|output_file=s' => \$param{outputFile},
    'd|database=s'    => \$param{database},
    'r|return_type=s' => \$param{returnType},
    'm|max_records=i' => \$param{maxRecords},
    'verbose|v'       => \$param{verbose},
    'h|help'          => \$param{help}
);

if ( defined( $param{help} ) ) {
    print_usage();
    exit(0);
}

# query, output file, database and return type are all mandatory
if (   !( defined( $param{query} ) )
    or !( defined( $param{outputFile} ) )
    or !( defined( $param{database} ) )
    or !( defined( $param{returnType} ) ) )
{
    print_usage();
    exit(1);
}

$param{returnType} = lc( $param{returnType} );

# the query is placed in a URL, so escape it first
$param{query} = uri_escape( $param{query} );

_doSearch(%param);

# Runs an ESearch for the query, then downloads the matching records with
# EFetch in batches and writes them to the output file.
#
# Takes the parameter hash as a flat key/value list. Keys read here: url,
# database, query, returnType, maxRecords, outputFile, verbose, retries,
# maxRetries.
# Dies when ESearch reports an error, when too many requests fail, or when
# the output file cannot be opened or closed.
sub _doSearch {
    my %param = @_;

    # Pulls the record count, query key and WebEnv token out of the ESearch
    # XML. The element names had been stripped from the original patterns;
    # without them the greedy ".*" left only the last character of the
    # WebEnv value in the capture.
    my $esearch_re
        = qr{<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>}s;

    my $esearch = "$param{url}/esearch.fcgi?db=$param{database}"
        . "&retmax=1&usehistory=y&term=$param{query}";
    my $esearch_result = get($esearch);

    # Retry until a parsable response arrives or the retry budget is spent.
    while ( !defined($esearch_result) || $esearch_result !~ $esearch_re ) {

        # get() returns undef on failure, so test definedness before matching
        if ( defined($esearch_result)
            && $esearch_result =~ m{<ERROR>(.*)</ERROR>}is )
        {
            die("ESearch returned an error: $1");
        }
        message( $param{verbose},
            "ESearch results could not be parsed. Resubmitting query.\n" );
        sleep(10);
        if ( $param{retries} >= $param{maxRetries} ) {
            die("Too many failures--giving up search.");
        }

        $esearch_result = get($esearch);
        $param{retries}++;
    }

    $param{retries} = 0;

    # The loop above only exits once the pattern matches.
    my ( $count, $query_key, $web_env ) = $esearch_result =~ $esearch_re;

    if ( defined( $param{maxRecords} ) && $count > $param{maxRecords} ) {
        message( $param{verbose},
            "Retrieving $param{maxRecords} records out of $count available records.\n"
        );
        $count = $param{maxRecords};
    }
    else {
        message( $param{verbose},
            "Retrieving $count records out of $count available records.\n" );
    }

    # largest number of records requested per EFetch call
    my $batch_size = 500;

    # three-argument open: the file name cannot be read as an open mode
    open( my $OUTFILE, '>', $param{outputFile} )
        or die("Error: Cannot open $param{outputFile} : $!");

    for (
        my $retstart = 0;
        $retstart < $count;
        $retstart = $retstart + $batch_size
        )
    {
        # Ask only for what is still wanted. A fixed batch size on the last
        # request would fetch records beyond the requested maximum.
        my $remaining = $count - $retstart;
        my $retmax = $remaining < $batch_size ? $remaining : $batch_size;

        message( $param{verbose},
                  "Downloading records "
                . ( $retstart + 1 ) . " to "
                . ( $retstart + $retmax )
                . "\n" );
        my $efetch
            = "$param{url}/efetch.fcgi?rettype=$param{returnType}&retmode=text&retstart=$retstart&retmax=$retmax&db=$param{database}&query_key=$query_key&WebEnv=$web_env";
        my $efetch_result = get($efetch);

        # the retry counter is shared by all batches, as in the original
        while ( !defined($efetch_result) ) {
            message( $param{verbose},
                "EFetch results could not be parsed. Resubmitting query.\n" );
            sleep(10);
            if ( $param{retries} >= $param{maxRetries} ) {
                die("Too many failures--giving up search.");
            }

            $efetch_result = get($efetch);
            $param{retries}++;
        }

        print {$OUTFILE} $efetch_result;

        # pause between requests, except when a single record was asked for
        unless (
            ( defined( $param{maxRecords} ) && ( $param{maxRecords} == 1 ) ) )
        {
            sleep(3);
        }
    }

    close($OUTFILE) or die("Error: Cannot close $param{outputFile} file: $!");
}

sub message {
    # Print the text (second argument) to the currently selected output
    # handle, but only when the verbose flag (first argument) is true.
    my ( $is_verbose, $text ) = @_;
    print $text if $is_verbose;
}

sub print_usage {
    print <