Our Sponsors



Download BioinformaticsOnline(BOL) Apps in your chrome browser.




Retrieve NCBI GenBank records with a range of accession numbers

  • Public
By Anjana 2879 days ago
#!/usr/bin/perl #FILE: ncbi_search.pl #AUTH: Paul Stothard (paul.stothard@gmail.com) use warnings; use strict; use Getopt::Long; use LWP::Simple; use URI::Escape; use LWP::UserAgent; use HTTP::Request::Common; my %param = ( query => undef, outputFile => undef, database => undef, returnType => undef, maxRecords => undef, format => undef, verbose => undef, url => 'http://www.ncbi.nlm.nih.gov/entrez/eutils', retries => 0, maxRetries => 5, help => undef ); Getopt::Long::Configure('bundling'); GetOptions( 'q|query=s' => \$param{query}, 'o|output_file=s' => \$param{outputFile}, 'd|database=s' => \$param{database}, 'r|return_type=s' => \$param{returnType}, 'm|max_records=i' => \$param{maxRecords}, 'verbose|v' => \$param{verbose}, 'h|help' => \$param{help} ); if ( defined( $param{help} ) ) { print_usage(); exit(0); } if ( !( defined( $param{query} ) ) or !( defined( $param{outputFile} ) ) or !( defined( $param{database} ) ) or !( defined( $param{returnType} ) ) ) { print_usage(); exit(1); } $param{returnType} = lc( $param{returnType} ); $param{query} = uri_escape( $param{query} ); _doSearch(%param); sub _doSearch { my %param = @_; my $esearch = "$param{url}/esearch.fcgi?db=$param{database}" . "&retmax=1&usehistory=y&term=$param{query}"; my $esearch_result = get($esearch); while ( ( !defined($esearch_result) ) || (!( $esearch_result =~ m/<Count>(\d+)<\/Count>.*<QueryKey>(\d+)<\/QueryKey>.*<WebEnv>(\S+)<\/WebEnv>/s ) ) ) { if ($esearch_result =~ m/<ERROR>(.*)<\/ERROR>/is) { die("ESearch returned an error: $1"); } message( $param{verbose}, "ESearch results could not be parsed. Resubmitting query.\n" ); sleep(10); if ( $param{retries} >= $param{maxRetries} ) { die("Too many failures--giving up search."); } $esearch_result = get($esearch); $param{retries}++; } $param{retries} = 0; $esearch_result =~ m/<Count>(\d+)<\/Count>.*<QueryKey>(\d+)<\/QueryKey>.*<WebEnv>(\S+)<\/WebEnv>/s; my $count = $1; my $query_key = $2; my $web_env = $3; if ( defined( $param{maxRecords} ) ) { if ( $count > $param{maxRecords} ) { message( $param{verbose}, "Retrieving $param{maxRecords} records out of $count available records.\n" ); $count = $param{maxRecords}; } else { message( $param{verbose}, "Retrieving $count records out of $count available records.\n" ); } } else { message( $param{verbose}, "Retrieving $count records out of $count available records.\n" ); } my $retmax = 500; if ( $retmax > $count ) { $retmax = $count; } open( my $OUTFILE, ">" . $param{outputFile} ) or die("Error: Cannot open $param{outputFile} : $!"); for ( my $retstart = 0; $retstart < $count; $retstart = $retstart + $retmax ) { message( $param{verbose}, "Downloading records " . ( $retstart + 1 ) . " to " . ( $retstart + $retmax ) . "\n" ); my $efetch = "$param{url}/efetch.fcgi?rettype=$param{returnType}&retmode=text&retstart=$retstart&retmax=$retmax&db=$param{database}&query_key=$query_key&WebEnv=$web_env"; my $efetch_result = get($efetch); while ( !defined($efetch_result) ) { message( $param{verbose}, "EFetch results could not be parsed. Resubmitting query.\n" ); sleep(10); if ( $param{retries} >= $param{maxRetries} ) { die("Too many failures--giving up search."); } $efetch_result = get($efetch); $param{retries}++; } print( $OUTFILE $efetch_result ); unless ( ( defined( $param{maxRecords} ) && ( $param{maxRecords} == 1 ) ) ) { sleep(3); } } close($OUTFILE) or die("Error: Cannot close $param{outputFile} file: $!"); } sub message { my $verbose = shift; my $message = shift; if ($verbose) { print $message; } } sub print_usage { print <<BLOCK; USAGE: perl ncbi_search.pl -q STRING -o FILE -d STRING -r STRING [Options] DESCRIPTION: Uses NCBI's eSearch to download collections of sequences. REQUIRED ARGUMENTS: -q, --query [STRING] Raw query text. -o, --output [FILE] Output file to create. -d, --database [STRING] Name of the NCBI database to search, such as 'nucleotide', 'protein', or 'gene'. -r, --return_type [STRING] The type of information requested. For sequences 'fasta' is often used. The accepted formats vary depending on the database being queried. -m, --max_records [INTEGER] The maximum number of records to return (default is to return all matches satisfying the query). -v, --verbose Provide progress messages. -h, --help Show this message. EXAMPLE: perl ncbi_search.pl -q 'dysphagia AND homo sapiens[ORGN]' \\ -o results.txt -d pubmed -r uilist -m 100 BLOCK }