#!/bin/bash
# Download genome annotation (GFF) files from NCBI from the command line.
# Create the working directory and enter it.
# -p lets a re-run reuse a directory left by an earlier run; the script stops
# if the directory cannot be created or entered, so that nothing below writes
# its output into the wrong place.
mkdir -p genome_gff || exit 1
cd genome_gff || exit 1
# Fetch the genome assembly summary of each genome set and extract the download URLs.
# The user needs to supply the right summary file for each genome set.
# Comment out the line of any genome set you are not interested in.
#######################################
# Write the *_genomic.gff.gz URL of every assembly listed in one NCBI
# assembly summary file.
#
# The summary is fetched with curl, comment lines (starting with '#') are
# dropped, tab-separated field 20 is taken as the assembly directory URL, and
# "<dir>" is rewritten to "<dir>/<assembly>/<assembly>_genomic.gff.gz".
#
# Arguments:
#   $1 - summary location below genomes/ (e.g. refseq/fungi)
#   $2 - accession prefix expected in the assembly name (GCF or GCA)
#   $3 - output file receiving one URL per line
# Outputs:
#   Overwrites $3; prints a warning on stderr when no URL was produced.
# Returns:
#   0 if at least one URL was written, 1 otherwise.
#######################################
build_gff_url_list() {
  local summary_dir=$1
  local prefix=$2
  local out_file=$3

  # -F sets the tab separator before the first record is split; assigning FS
  # inside a rule would leave the first record split on whitespace.
  # sed -n ... p keeps only the lines that were rewritten, so entries that are
  # not assembly URLs never reach the download list.
  # NOTE(review): http(s):// is accepted next to ftp:// in case the summary
  # lists the same host with that scheme — confirm against a current summary.
  curl "ftp://ftp.ncbi.nlm.nih.gov/genomes/${summary_dir}/assembly_summary.txt" \
    | awk -F '\t' '!/^#/ {print $20}' \
    | sed -n -E "s#^((ftp|https?)://ftp\.ncbi\.nlm\.nih\.gov/genomes/all/.+/)(${prefix}_.+)\$#\1\3/\3_genomic.gff.gz#p" \
    > "$out_file"

  if [[ ! -s "$out_file" ]]; then
    printf 'warning: no GFF URLs written to %s\n' "$out_file" >&2
    return 1
  fi
}

# One line per genome set; comment a line out to skip that set.
# -for fungi
build_gff_url_list refseq/fungi GCF genomic_file_fungi
# -for bacteria
build_gff_url_list genbank/bacteria GCA genomic_file_bacteria
# -for plant
build_gff_url_list refseq/plant GCF genomic_file_plant
# -for archaea
build_gff_url_list refseq/archaea GCF genomic_file_archaea
# -for protozoa
build_gff_url_list refseq/protozoa GCF genomic_file_protozoa
# -for vertebrate_mammalian
build_gff_url_list refseq/vertebrate_mammalian GCF genomic_file_vertebrate_mammalian
# -for vertebrate_other
build_gff_url_list refseq/vertebrate_other GCF genomic_file_vertebrate_other
# -for invertebrate
build_gff_url_list refseq/invertebrate GCF genomic_file_invertebrate
# -for viral
build_gff_url_list refseq/viral GCF genomic_file_viral
# Read the URLs from each file and download them
#######################################
# Download and unpack the first 4 files named in one URL list.
#
# Arguments:
#   $1 - path to a URL list file (one URL per line)
# Outputs:
#   Creates GFF<list name without extension>/ in the current directory and
#   downloads into it; writes the first 4 URLs to "<list>.head" next to the
#   list. Progress goes to stdout, warnings to stderr.
# Returns:
#   Non-zero if the output directory or the .head file could not be prepared,
#   otherwise the status of the download/unpack step.
#######################################
download_gff_list() {
  local list=$1
  local name out_dir

  # The download runs from inside the output directory, so the list has to be
  # addressed by an absolute path.
  [[ "$list" == /* ]] || list="$PWD/$list"

  echo "Processing $list file..."
  name="${list##*/}"
  name="${name%.*}"
  out_dir="GFF${name}"

  # -p: a re-run reuses the directory created by an earlier run.
  mkdir -p -- "$out_dir" || return 1
  # Only the first 4 URLs of the list are downloaded.
  head -n 4 "$list" > "$list.head" || return 1

  # Subshell: the directory change cannot leak out, so a failed cd can never
  # make the caller climb out of its working directory.
  (
    cd -- "$out_dir" || exit 1
    wget --input-file="$list.head" \
      || printf 'warning: wget reported errors for %s\n' "$list" >&2
    archives=( *.gz )
    if [[ -e "${archives[0]}" ]]; then
      # -f: overwrite files unpacked by an earlier run instead of prompting.
      gunzip -f -- "${archives[@]}"
    else
      printf 'warning: nothing downloaded for %s\n' "$list" >&2
    fi
  )
}

# Only the genomic_file_* URL lists are processed, so GFF* output directories
# and *.head files left by an earlier run are not mistaken for URL lists.
for list in "$PWD"/genomic_file_*; do
  [[ -f "$list" && "$list" != *.head ]] || continue
  download_gff_list "$list"
done