BOL: Download the genome from NCBI using bash script/command

BioScripts
Jit
Download the genome from NCBI using bash script/command
Download the genome from NCBI using bash script/command

By Jit 2773 days ago
#!/bin/bash

# Download the genome from NCBI using command
#
# This part builds one URL list per NCBI genome group inside ./genome.
# Each list holds one *_genomic.fna.gz URL per assembly listed in that
# group's assembly_summary.txt.

# Make a pipeline report failure when curl (not only the last stage) fails.
set -o pipefail

# Create the working directory. -p lets the script be re-run, and a failed
# cd aborts so the lists are never written into an unrelated directory.
mkdir -p genome || exit 1
cd genome || exit 1

#######################################
# Fetch an assembly summary and write the genomic FASTA URL of every
# assembly it lists.
# Arguments:
#   $1 - URL of an assembly_summary.txt file
#   $2 - path of the URL list to write
# Outputs:
#   Writes one URL per line to $2; diagnostics go to stderr.
# Returns:
#   0 on success, 1 if the summary could not be fetched or parsed.
#######################################
build_url_list() {
  local summary_url=$1
  local out_file=$2

  # Column 20 of the tab-separated summary is the assembly directory. The
  # sequence file inside it is named after the directory itself, so the URL
  # is "<dir>/<dir basename>_genomic.fna.gz". Deriving it from the last path
  # component works for any URL scheme and for both GCF_ (RefSeq) and GCA_
  # (GenBank) accessions. Rows without a usable path ("na" or empty) are
  # skipped.
  if ! curl --fail "$summary_url" \
    | awk -F '\t' '
        /^#/ { next }
        {
          path = $20
          sub(/\/+$/, "", path)
          if (path == "" || path == "na") next
          n = split(path, parts, "/")
          print path "/" parts[n] "_genomic.fna.gz"
        }' > "$out_file"; then
    printf 'warning: could not build %s from %s\n' \
      "$out_file" "$summary_url" >&2
    # Do not leave a truncated list behind for the download step.
    rm -f -- "$out_file"
    return 1
  fi
}

# Look for genome assembly summary and extract the URL.
# Each call below fetches one genome group. Comment out the line of any
# group you are not interested in.

# -for fungi
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/assembly_summary.txt' genomic_file_fungi

# -for bacteria
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt' genomic_file_bacteria

# -for plant
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/assembly_summary.txt' genomic_file_plant

# -for archaea
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/assembly_summary.txt' genomic_file_archaea

# -for protozoa
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/protozoa/assembly_summary.txt' genomic_file_protozoa

# -for vertebrate_mammalian
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/assembly_summary.txt' genomic_file_vertebrate_mammalian

# -for vertebrate_other
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_other/assembly_summary.txt' genomic_file_vertebrate_other

# -for invertebrate
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/assembly_summary.txt' genomic_file_invertebrate

# -for viral
build_url_list 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/viral/assembly_summary.txt' genomic_file_viral
# Read the URLs from each list file in the current directory and download
# them into a per-list directory.

# Glob directly with a quoted $PWD so a working directory whose path
# contains spaces is not word-split.
for f in "$PWD"/*
do
  # Only non-empty regular files can be URL lists. This skips the G*
  # directories created by an earlier run, lists left empty by a failed
  # fetch, and the literal pattern when the glob matches nothing.
  if [[ ! -f "$f" || ! -s "$f" ]]; then
    continue
  fi

  echo "Processing $f file..."
  filename=$(basename "$f")
  filename="${filename%.*}"

  # Create a directory named after the list, prefixed with G.
  # -p keeps a re-run from failing on a directory that already exists.
  if ! mkdir -p "G$filename"; then
    echo "Cannot create directory G$filename, skipping $f" >&2
    continue
  fi

  # Download inside a subshell: the directory change cannot leak into the
  # next iteration, and wget never runs if the cd fails. $f is an absolute
  # path, so it stays valid after the cd.
  (
    cd "G$filename" || exit 1
    wget --input-file="$f"
  )
done

#Reference
#ftp://ftp.ncbi.nlm.nih.gov/pub/factsheets/HowTo_Downloading_Genomic_Data.pdf
BOL

Jit

Our Sponsors

Download the genome from NCBI using bash script/command