BOL: Owner

Retrieve NCBI GenBank records with a range of accession numbers

Anjana — Wed, 11 May 2016 11:02:40 -0500

#!/usr/bin/perl

#FILE: ncbi_search.pl
#AUTH: Paul Stothard (paul.stothard@gmail.com)

use warnings;
use strict;
use Getopt::Long;
use LWP::Simple;
use URI::Escape;

use LWP::UserAgent;
use HTTP::Request::Common;

# Settings for the search. Entries that start out undef are filled in from
# the command line; retries/maxRetries control resubmission of failed
# requests inside _doSearch.
my %param = (
    query      => undef,
    outputFile => undef,
    database   => undef,
    returnType => undef,
    maxRecords => undef,
    format     => undef,
    verbose    => undef,

    # E-utilities base URL as given in the NCBI documentation (HTTPS).
    url        => 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils',
    retries    => 0,
    maxRetries => 5,
    help       => undef
);

Getopt::Long::Configure('bundling');

# GetOptions returns false when it meets an unknown or malformed option;
# keep the result so such a command line is rejected instead of ignored.
my $options_ok = GetOptions(
    'q|query=s'       => \$param{query},
    'o|output_file=s' => \$param{outputFile},
    'd|database=s'    => \$param{database},
    'r|return_type=s' => \$param{returnType},
    'm|max_records=i' => \$param{maxRecords},
    'verbose|v'       => \$param{verbose},
    'h|help'          => \$param{help}
);

# An explicit request for help always succeeds.
if ( defined( $param{help} ) ) {
    print_usage();
    exit(0);
}

# The query, output file, database and return type are all mandatory.
if (   !$options_ok
    or !( defined( $param{query} ) )
    or !( defined( $param{outputFile} ) )
    or !( defined( $param{database} ) )
    or !( defined( $param{returnType} ) ) )
{
    print_usage();
    exit(1);
}

$param{returnType} = lc( $param{returnType} );

# The query is placed directly into the ESearch URL, so escape it here.
$param{query} = uri_escape( $param{query} );

_doSearch(%param);

# Runs an ESearch for the query, then downloads the matching records with
# EFetch in batches and writes the returned text to the output file.
#
# Arguments: a flat key/value list (copied into a local hash) with
#   url, database, query  - used to build the request URLs; query must
#                           already be URI-escaped
#   returnType            - sent to EFetch as rettype
#   outputFile            - file that is created/overwritten with the records
#   maxRecords            - optional upper limit on the records downloaded
#   verbose               - when true, progress is reported via message()
#   retries, maxRetries   - resubmission counter and its limit
#
# Dies when ESearch reports an error, when a request still fails after
# maxRetries resubmissions, or when the output file cannot be opened,
# written or closed.
sub _doSearch {
    my %param = @_;

    # Each value is captured together with its opening tag. Without the
    # opening tags a greedy ".*" swallows all but the last character of
    # the QueryKey and WebEnv values.
    my $esearch_pattern = qr{
        <Count>(\d+)</Count>          .*?
        <QueryKey>(\d+)</QueryKey>    .*?
        <WebEnv>([^<\s]+)</WebEnv>
    }xs;

    my $esearch_url = "$param{url}/esearch.fcgi?db=$param{database}"
        . "&retmax=1&usehistory=y&term=$param{query}";

    my ( $count, $query_key, $web_env );

    while (1) {
        my $esearch_result = get($esearch_url);

        # get() returns undef on failure, so only inspect a real response.
        if ( defined($esearch_result) ) {
            if ( $esearch_result =~ $esearch_pattern ) {
                ( $count, $query_key, $web_env ) = ( $1, $2, $3 );
                last;
            }
            if ( $esearch_result =~ m{<ERROR>(.*?)</ERROR>}is ) {
                die("ESearch returned an error: $1");
            }
        }

        message( $param{verbose},
            "ESearch results could not be parsed. Resubmitting query.\n" );

        # Give up before sleeping: there is nothing left to wait for.
        if ( $param{retries} >= $param{maxRetries} ) {
            die("Too many failures--giving up search.");
        }
        sleep(10);
        $param{retries}++;
    }

    # Apply the optional cap, remembering how many records exist in total.
    my $available = $count;
    if ( defined( $param{maxRecords} ) && $count > $param{maxRecords} ) {
        $count = $param{maxRecords};
    }
    message( $param{verbose},
        "Retrieving $count records out of $available available records.\n" );

    my $batch_size = 500;

    open( my $OUTFILE, '>', $param{outputFile} )
        or die("Error: Cannot open $param{outputFile} : $!");

    for (
        my $retstart = 0;
        $retstart < $count;
        $retstart = $retstart + $batch_size
        )
    {
        # Shrink the final batch so that no more than $count records are
        # requested in total.
        my $remaining = $count - $retstart;
        my $retmax = $remaining < $batch_size ? $remaining : $batch_size;

        message( $param{verbose},
                  "Downloading records "
                . ( $retstart + 1 ) . " to "
                . ( $retstart + $retmax )
                . "\n" );

        my $efetch_url
            = "$param{url}/efetch.fcgi?rettype=$param{returnType}&retmode=text&retstart=$retstart&retmax=$retmax&db=$param{database}&query_key=$query_key&WebEnv=$web_env";

        # Every batch gets its own retry budget.
        $param{retries} = 0;
        my $efetch_result = get($efetch_url);

        while ( !defined($efetch_result) ) {
            message( $param{verbose},
                "EFetch results could not be parsed. Resubmitting query.\n" );
            if ( $param{retries} >= $param{maxRetries} ) {
                die("Too many failures--giving up search.");
            }
            sleep(10);

            $efetch_result = get($efetch_url);
            $param{retries}++;
        }

        print {$OUTFILE} $efetch_result
            or die("Error: Cannot write to $param{outputFile} : $!");

        # Pause between requests; no pause is needed after the last batch.
        if ( $retstart + $batch_size < $count ) {
            sleep(3);
        }
    }

    close($OUTFILE) or die("Error: Cannot close $param{outputFile} file: $!");
}

# Prints a progress message, but only when the verbose flag is true.
#   $verbose - flag controlling whether anything is printed
#   $message - text to print (no newline is added)
sub message {
    my ( $verbose, $message ) = @_;
    print $message if $verbose;
}

sub print_usage {
    print <



Perl script to count the number of Adenine, Thymine, Guanine and Cytosine in your DNA Sequence
Anjana — Wed, 11 May 2016 10:34:44 -0500
#!/usr/local/bin/perl -w

# Counts the Adenine, Cytosine, Guanine and Thymine bases (upper or lower
# case) in a DNA sequence file and reports every other non-whitespace
# character as an unknown base.
#
# While executing, this script asks for the file name of the DNA sequence.
# If the sequence file is not in the same directory as this script, enter
# the name of the file along with its path.
# In Windows: c:\dnafile.txt
# In Linux:   /home/user/sequence/dnafile.txt

use strict;
use warnings;

print "ENTER THE FILENAME OF THE DNA SEQUENCE:= ";
my $dna_filename = <STDIN>;
die "No file name was entered\n" unless defined $dna_filename;
chomp $dna_filename;

my $dna_fh;
unless ( open( $dna_fh, '<', $dna_filename ) )
{
    my $reason = $!;    # save before the prints can overwrite it
    print "Sorry the file does not exist!!! \n";
    print "Cannot open file \"$dna_filename\"\n";
    die "$reason\n";
}

# Read the whole file as one string.
my $DNA = do { local $/; <$dna_fh> };
close $dna_fh;
$DNA = '' unless defined $DNA;

print " \n The original DNA file is:\n  $DNA \n";

# Whitespace (including line breaks) is not part of the sequence.
$DNA =~ s/\s//g;

my %count_of = ( A => 0, C => 0, G => 0, T => 0 );
my $errors   = 0;

foreach my $base ( split( '', $DNA ) ) {

    # Upper and lower case letters count as the same base.
    my $key = uc $base;
    if ( exists $count_of{$key} ) {
        ++$count_of{$key};
    }
    else {
        print "Error - Unknown base: $base\n";
        ++$errors;
    }
}

print "Adenine = $count_of{A}\n";
print "Cytosine = $count_of{C}\n";
print "Guanine = $count_of{G}\n";
print "Thymine = $count_of{T}\n";

if ($errors) {
        print "There were $errors unrecognized bases.\n";
}


Perl script to Mutate a DNA Sequence
Anjana — Wed, 11 May 2016 10:27:58 -0500
#!/usr/local/bin/perl -w

# This script randomly mutates the DNA sequence and generates 10 successive mutation results.
# While executing this script it asks for the file name of the DNA sequence.
# If the DNA sequence file is not in the same directory of this script, enter the file name with its full path.
# Example:
# In Windows:  c:\dnafile.txt
# In Linux  : /home/user/sequence/dnafile.txt

use File::Path;

use strict;
use warnings;

# Ask for the sequence file, read its first line as the DNA sequence and
# print ten successive random point mutations of it.
print "ENTER THE FILENAME OF THE DNA SEQUENCE:= ";
my $dnafilename = <STDIN>;
die "No file name was entered\n" unless defined $dnafilename;
chomp $dnafilename;

my $dna_fh;
unless ( open( $dna_fh, '<', $dnafilename ) )
{
    print "Cannot open file \"$dnafilename\"\n\n";

    # The script cannot continue without a sequence.
    exit 1;
}

# Only the first line of the file is used as the sequence.
my $DNA = <$dna_fh>;
close $dna_fh;
die "File \"$dnafilename\" contains no sequence\n" unless defined $DNA;

# Drop the line terminator so that it can never be picked as the position
# to mutate.
$DNA =~ s/\s+\z//;
die "File \"$dnafilename\" contains no sequence\n" unless length $DNA;

my $mutant = mutate($DNA);
print "Mutate DNA\n\n";

print "HERE ARE THE 10 SUCCESSIVE MUTATIONS:\n\n";
for my $round ( 1 .. 10 )
  {
    $mutant = mutate($mutant);

    # Results go to standard output only; no output file is opened anywhere
    # in this script, so there is no file handle to write them to.
    print "$mutant\n";
  }

# Returns a copy of the given DNA string in which one randomly chosen
# position has been replaced by a different random nucleotide.
#   $dna - sequence to mutate
# An undefined or empty sequence has no position that could be mutated and
# is returned unchanged.
sub mutate
  {
        my($dna) = @_;

        # Without this guard an empty string would come back one base long.
        return $dna unless defined $dna && length $dna;

        my($position) = randomposition($dna);
        my $current_base = substr($dna, $position, 1);
        my $newbase;

        # Keep drawing until the new base really differs from the old one.
        do
          {
            $newbase = randomnucleotide();
          }
        until ($newbase ne $current_base);

        # Four-argument substr replaces the character in place.
        substr($dna,$position,1,$newbase);
        return $dna;
  }
# Picks a random zero-based index into the given string.
#   $string - string whose length bounds the returned index
sub randomposition
  {
    my ($string) = @_;
    my $string_length = length $string;
    return int( rand($string_length) );
  }
# Returns one randomly chosen element of the argument list.
sub randomelement
  {
    my @choices = @_;
    my $index   = int( rand( scalar @choices ) );
    return $choices[$index];
  }
# Returns one of the four DNA bases 'A', 'C', 'G' or 'T', chosen by
# randomelement().
sub randomnucleotide
  {
    return randomelement( 'A', 'C', 'G', 'T' );
  }


Check all seqs in a folder
Anjana — Wed, 11 May 2016 10:16:47 -0500
#!/usr/local/bin/perl -w

# Lists every FASTA file (*.fa) in a folder together with the pattern file
# and the output file name that belongs to it.
# Can be easily modified to run any command on every sequence in a folder

use strict;
use warnings;

# Directory of sequences
my $myDir = "/home/anjana/seqs";

# Output directory (relative to $myDir or full path)
my $outputDir = "OutDir";

# Path to pattern file
my $patFile = "/home/anjana/patterns/polyA.pat";

# Go to sequence directory and read its contents
chdir($myDir) || die "Cannot change to $myDir: $!";      # Go to $myDir
opendir( my $dir_handle, "." ) || die "Cannot open .: $!";      # Open $myDir
my @entries = sort readdir($dir_handle);
closedir($dir_handle);

foreach my $seqFile (@entries)
{
    next unless $seqFile =~ /\.fa\z/;      # only files ending in .fa

    print "Processing $seqFile\n";

    # Replace only the trailing .fa; an unanchored pattern would rewrite the
    # first ".fa" anywhere in the name (e.g. in "x.fasta.fa").
    ( my $outFile = $seqFile ) =~ s/\.fa\z/.polyA.out/;

    #User can process these files as per their need
    print "$patFile \t$seqFile \t $outputDir/$outFile\n";
}


BioPerl to convert between sequence formats from Fasta to Genbank
Anjana — Wed, 11 May 2016 09:49:04 -0500
#!/usr/local/bin/perl -w

# Sequence formats to choose: Fasta, EMBL, GenBank, Swissprot, PIR and GCG

use Bio::SeqIO;

# Name of the FASTA file to convert.
my $inFile = "BRCA2.fa";

# Input stream: sequences read from $inFile in Fasta format.
my $fasta_in = Bio::SeqIO->newFh(
    '-file'   => $inFile,
    '-format' => 'Fasta',
);

# Output stream in Genbank format. No '-file' is given here, so this
# presumably writes to standard output (BioPerl default) - TODO confirm.
my $genbank_out = Bio::SeqIO->newFh( '-format' => 'Genbank' );

# Copy every sequence from the input stream to the output stream.
while ( defined( my $sequence = <$fasta_in> ) ) {
    print $genbank_out $sequence;
}