BOL: All

Perl one-liner to check the ids in two files !

Surabhi Chaudhary — Thu, 21 Oct 2021 07:21:10 -0500

# Flag every line of ids1.txt by whether its first whitespace-separated field
# is listed in ids2.txt (one id per line).
# Output (aaa.xls, tab-separated): original line, matched id or NA, 1 or 0.
# ids2.txt is loaded into the %k hash once in BEGIN; the open is checked so a
# missing file aborts instead of silently reporting every id as absent.
perl -lane 'BEGIN { open(my $fh, "<", "ids2.txt") or die "ids2.txt: $!\n"; while (<$fh>) { chomp; $k{$_}++ } close($fh) } if (defined($k{$F[0]})) { print "$_\t$F[0]\t1" } else { print "$_\tNA\t0" }' ids1.txt > aaa.xls

One-liner to convert multi-line FASTA to single-line FASTA !

Abhi — Wed, 20 Oct 2021 05:00:38 -0500

# Join the wrapped sequence lines of each record.
# A header line (starting with ">") is printed on its own line, preceded by a
# newline that terminates the previous record's sequence; every other line is
# appended without a line break; END closes the last sequence.
awk '{
  if ($0 ~ /^>/)
    printf "\n%s\n", $0
  else
    printf "%s", $0
}
END { printf "\n" }' < file.fa > fileres.fa

# The first header was also given a leading newline, so drop that empty
# first line.
sed '1d' fileres.fa > fileout.fa

Simulate the reads !

Surabhi Chaudhary — Wed, 20 Oct 2021 04:52:40 -0500

# Build the reference used by randomreads.sh.
# randomreads.sh is part of BBTools/BBMap: https://sourceforge.net/projects/bbmap/
# Only 100 reads are requested and the output goes to /dev/null, so the
# simulated reads from this call are discarded.
# NOTE(review): presumably the purpose is the side effect of indexing the
# reference as build 1 so the later calls can reuse it -- confirm against the
# BBMap documentation.
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
seed=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
pacbio=t pbmin=0.13 pbmax=0.17 \
reads=100 paired=f \
gaussianlength=t \
minlength=1000 midlength=20000 maxlength=100000 \
out=/dev/null




# Simulate paired-end Illumina reads at 60x haploid coverage.
# coverage=30 is requested against the diploid reference FASTA.
# NOTE(review): presumably 30x over both haplotypes is what yields 60x per
# haploid genome -- confirm.
# Insert sizes are limited to 450-550 bp. The two mates are written to
# illumina1.fastq.gz and illumina2.fastq.gz; stdout and stderr of the tool are
# both captured in random_reads_illumina.log.
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
coverage=30 paired=t maxinsert=550 mininsert=450 \
out1=illumina1.fastq.gz out2=illumina2.fastq.gz > random_reads_illumina.log 2>&1




# Interleave the two paired-end read files into one file, illumina.int.fastq.
# reformat.sh is part of BBTools/BBMap: https://sourceforge.net/projects/bbmap/
# stderr is discarded here, so any message reformat.sh prints there
# (including errors) will not be visible.
/genetics/elbers/bbmap-38.86/reformat.sh \
in=illumina1.fastq.gz in2=illumina2.fastq.gz out=illumina.int.fastq 2>/dev/null




# Use KmerGenie 1.7051 (http://kmergenie.bx.psu.edu/) to get an idea of the
# k-mer size that produces the longest N50.
mkdir -p /genetics/elbers/test/fly2/kmergenie-illumina-raw-reads

# Stop if the directory cannot be entered: every path below is relative, so
# continuing would run KmerGenie and the rm against whatever directory we
# happen to be in.
cd /genetics/elbers/test/fly2/kmergenie-illumina-raw-reads || exit 1
/genetics/elbers/kmergenie-1.7051/kmergenie ../illumina.int.fastq \
> kmergenie-illumina-raw-reads.log 2>&1
# The interleaved file was only needed as KmerGenie input.
rm ../illumina.int.fastq

# Extract the number from the "best k:" line of the KmerGenie log.
k=$(grep "^best k:" kmergenie-illumina-raw-reads.log | grep -Po "\d+")
echo "best k=${k}"




# Simulate PacBio CLR reads at 30x haploid coverage (coverage=15 against the
# diploid reference FASTA).
# Error rate 13-15 %; read length minimum 1000 bp, middle 20000 bp,
# maximum 30000 bp.
# Abort if the working directory cannot be entered, so the output and log
# files below are never written into an unintended directory.
cd /genetics/elbers/test/fly2 || exit 1

/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
ow=t seed=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
pacbio=t pbmin=0.13 pbmax=0.15 \
coverage=15 paired=f \
gaussianlength=t \
minlength=1000 midlength=20000 maxlength=30000 \
out=pacbio.fastq.gz > random_reads_pacbio.log 2>&1



# Simulate PacBio HiFi reads at 30x haploid coverage (coverage=15 against the
# diploid reference FASTA).
# Error rate 0.1-1 % (pbmin=0.001, pbmax=0.01); read length minimum 9000 bp,
# middle 10000 bp, maximum 12000 bp.
# Reads go to hifi.fastq.gz; stdout and stderr of the tool are captured in
# random_reads_pacbio_hifi.log.
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
ow=t seed=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
pacbio=t pbmin=0.001 pbmax=0.01 \
coverage=15 paired=f \
gaussianlength=t \
minlength=9000 midlength=10000 maxlength=12000 \
out=hifi.fastq.gz > random_reads_pacbio_hifi.log 2>&1

Downloading mmseqs databases !

Abhi — Wed, 06 Oct 2021 06:25:12 -0500

# mmseqs databases
Usage: mmseqs databases <name> <o:sequenceDB> <tmpDir> [options]

  Name                	Type      	Taxonomy	Url
- UniRef100           	Aminoacid 	     yes	https://www.uniprot.org/help/uniref
- UniRef90            	Aminoacid 	     yes	https://www.uniprot.org/help/uniref
- UniRef50            	Aminoacid 	     yes	https://www.uniprot.org/help/uniref
- UniProtKB           	Aminoacid 	     yes	https://www.uniprot.org/help/uniprotkb
- UniProtKB/TrEMBL    	Aminoacid 	     yes	https://www.uniprot.org/help/uniprotkb
- UniProtKB/Swiss-Prot	Aminoacid 	     yes	https://uniprot.org
- NR                  	Aminoacid 	       -	https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA
- NT                  	Nucleotide	       -	https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA
- PDB                 	Aminoacid 	       -	https://www.rcsb.org
- PDB70               	Profile   	       -	https://github.com/soedinglab/hh-suite
- Pfam-A.full         	Profile   	       -	https://pfam.xfam.org
- Pfam-A.seed         	Profile   	       -	https://pfam.xfam.org
- Pfam-B              	Profile   	       -	https://xfam.wordpress.com/2020/06/30/a-new-pfam-b-is-released
- eggNOG              	Profile   	       -	http://eggnog5.embl.de
- dbCAN2              	Profile   	       -	http://bcb.unl.edu/dbCAN2
- Resfinder           	Nucleotide	       -	https://cge.cbs.dtu.dk/services/ResFinder
- Kalamari            	Nucleotide	     yes	https://github.com/lskatz/Kalamari

# Example: download and set up Swiss-Prot at the output path outpath/swissprot.
# NOTE(review): the trailing "tmp" argument is presumably a scratch directory
# for the download -- confirm with the mmseqs documentation.

mmseqs databases UniProtKB/Swiss-Prot outpath/swissprot tmp

# Swiss-Prot has the value "yes" in the Taxonomy column of the table above, so
# the databases command also downloads and prepares all files needed to use
# the result as a valid seqTaxDB.

More information @ https://github.com/soedinglab/mmseqs2/wiki#downloading-databases

Install hhsuite using conda !

Abhi — Wed, 06 Oct 2021 05:06:43 -0500

(base) [abhi@hn1 bin]$ conda install -c conda-forge -c bioconda hhsuite
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/abhi/anaconda3

  added / updated specs:
    - hhsuite


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _openmp_mutex-4.5          |            1_gnu          22 KB
    conda-4.10.3               |   py38h578d9bd_2         3.0 MB  conda-forge
    hhsuite-3.3.0              |py38pl526h6ed170a_1        26.6 MB  bioconda
    libgomp-9.3.0              |      h5101ec6_17         311 KB
    ------------------------------------------------------------
                                           Total:        30.0 MB

The following NEW packages will be INSTALLED:

  _openmp_mutex      pkgs/main/linux-64::_openmp_mutex-4.5-1_gnu
  hhsuite            bioconda/linux-64::hhsuite-3.3.0-py38pl526h6ed170a_1
  libgomp            pkgs/main/linux-64::libgomp-9.3.0-h5101ec6_17

The following packages will be UPDATED:

  conda                               4.10.3-py38h578d9bd_1 --> 4.10.3-py38h578d9bd_2


Proceed ([y]/n)? y


Downloading and Extracting Packages
_openmp_mutex-4.5    | 22 KB     | ######################################################################################################## | 100%
conda-4.10.3         | 3.0 MB    | ######################################################################################################## | 100%
hhsuite-3.3.0        | 26.6 MB   | ######################################################################################################## | 100%
libgomp-9.3.0        | 311 KB    | ######################################################################################################## | 100%
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Download the desired version of the BLAST software !

Abhi — Wed, 06 Oct 2021 02:55:15 -0500

# Fetch the BLAST+ 2.6.0 Linux x64 tarball from the NCBI FTP server.
# Run this from the directory in which BLAST should be unpacked.
wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-x64-linux.tar.gz'

# Unpack the gzip-compressed archive, listing each file as it is extracted.
tar --extract --gzip --verbose --file=ncbi-blast-2.6.0+-x64-linux.tar.gz

#Slurm template

#!/bin/bash
#SBATCH --partition=longjobs
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=32
#SBATCH --time=1:00:00
#SBATCH --job-name=vsearch
#SBATCH -o result_%N_%j.out
#SBATCH -e result_%N_%j.err
# The directives above request one node on the "longjobs" partition with 32
# tasks per node and a 1:00:00 time limit, and send stdout and stderr to
# separate result_%N_%j files.
# NOTE(review): the job name is "vsearch" although the module loaded below is
# BLAST -- rename it when adapting this template.

export SBATCH_EXPORT=NONE
# NOTE(review): "???" looks like a placeholder -- replace it with the intended
# thread count before submitting.
export OMP_NUM_THREADS=???

# Load the ncbi-blast 2.6.0 module provided by the cluster.
module load ncbi-blast/2.6.0_x86_64

Command line to download blast database / protein

Abhi — Tue, 05 Oct 2021 00:06:08 -0500

#download all available nr - protein database as a single file 

#Database location - NCBI where all databases are available
ftp://ftp.ncbi.nlm.nih.gov/blast/db/
https://ftp.ncbi.nlm.nih.gov/blast/db/

# Database detail / description 
nr.*tar.gz | Non-redundant protein sequences from GenPept, Swissprot, PIR, PDF, PDB, and NCBI RefSeq

# Download every nr volume. The single quotes stop the local shell from
# expanding the wildcard, so wget receives the pattern itself.
wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*.tar.gz'

# Stream all downloaded volumes into one tar process. --ignore-zeros lets tar
# read past the end-of-archive blocks that separate the concatenated archives.
cat nr.*.tar.gz | tar --extract --gzip --verbose --ignore-zeros --file=- --directory=.

Get the Linux system information !

Surabhi Chaudhary — Thu, 30 Sep 2021 06:37:45 -0500

#!/bin/bash

# while-menu-dialog: menu-driven viewer for basic system information.

# Dimensions handed to the menu box.
HEIGHT=0
WIDTH=0
# Exit statuses the main loop compares against after each menu call.
DIALOG_CANCEL=1
DIALOG_ESC=255

# Show the text held in the global $result in a dialog message box.
# Globals:   result (read)
# Arguments: $1 - title of the message box
display_result() {
  local box_title=$1
  dialog --title "$box_title" --no-collapse --msgbox "$result" 0 0
}

# Main menu loop: show the menu, act on the selection, repeat until the user
# chooses Exit or presses ESC.

# Fail fast when dialog is unavailable. Without this check every iteration
# would fail with a status that no branch handles and the loop would never end.
if ! command -v dialog >/dev/null 2>&1; then
  echo "dialog is required but was not found." >&2
  exit 1
fi

while true; do
  # Capture what dialog writes to stderr (the selected tag) in $selection,
  # while its stdout still reaches the original stdout through fd 3.
  exec 3>&1
  selection=$(dialog \
    --backtitle "System Information" \
    --title "Menu" \
    --clear \
    --cancel-label "Exit" \
    --menu "Please select:" "$HEIGHT" "$WIDTH" 4 \
    "1" "Display System Information" \
    "2" "Display Disk Space" \
    "3" "Display Home Space Utilization" \
    2>&1 1>&3)
  exit_status=$?
  exec 3>&-
  case $exit_status in
    0)
      # An entry was chosen; handled below.
      ;;
    "$DIALOG_CANCEL")
      clear
      echo "Program terminated."
      exit
      ;;
    "$DIALOG_ESC")
      clear
      echo "Program aborted." >&2
      exit 1
      ;;
    *)
      # Any other status means dialog itself failed; stop instead of looping.
      echo "dialog exited with unexpected status $exit_status." >&2
      exit 1
      ;;
  esac
  case $selection in
    1 )
      result=$(echo "Hostname: $HOSTNAME"; uptime)
      display_result "System Information"
      ;;
    2 )
      result=$(df -h)
      display_result "Disk Space"
      ;;
    3 )
      # root sees every home directory; other users only their own.
      if [[ $(id -u) -eq 0 ]]; then
        result=$(du -sh /home/* 2> /dev/null)
        display_result "Home Space Utilization (All Users)"
      else
        result=$(du -sh "$HOME" 2> /dev/null)
        display_result "Home Space Utilization ($USER)"
      fi
      ;;
  esac
done

Bash script for getopts

Surabhi Chaudhary — Wed, 29 Sep 2021 04:53:14 -0500

# A ':' after an option letter means that option requires an argument
# (e.g. "t:" needs a value after -t, while "h" takes none).
# usage() is expected to be defined elsewhere in the script.
# The option string must be wrapped in plain ASCII quotes; typographic quotes
# are not shell quoting and would become part of the option string.
while getopts "ht:r:p:v" OPTION
do
     case "$OPTION" in
         h)
             usage
             exit 1
             ;;
         t)
             TEST=$OPTARG
             ;;
         r)
             SERVER=$OPTARG
             ;;
         p)
             PASSWD=$OPTARG
             ;;
         v)
             VERBOSE=1
             ;;
         *)
             # Unknown option or missing argument: report failure explicitly
             # rather than exiting with whatever status usage returned.
             usage
             exit 1
             ;;
     esac
done

# -t, -r and -p are all mandatory.
if [[ -z $TEST || -z $SERVER || -z $PASSWD ]]
then
     usage
     exit 1
fi

Interactive SCP / File transfer !

Neel — Tue, 28 Sep 2021 08:14:04 -0500

#!/bin/bash
# Interactive helper that copies files or directories to a remote host with
# scp. It asks once for the destination server and path, then keeps asking for
# source paths until an empty line is entered.

# Print the header of the script.
echo "Interactive Script to Copy File (files) / Directory using scp"

# Ask until a non-empty destination server is entered. If input ends before
# anything was entered, stop instead of prompting forever.
while [[ -z "$desthost" ]]; do
  if ! read -r -p "Destination Server Name : " desthost && [[ -z "$desthost" ]]; then
    echo "No destination server given." >&2
    exit 1
  fi
done

# Same for the destination path on that server.
while [[ -z "$destpath" ]]; do
  if ! read -r -p "Destination Path : " destpath && [[ -z "$destpath" ]]; then
    echo "No destination path given." >&2
    exit 1
  fi
done

# Start with a non-empty placeholder so the loop below runs at least once.
filename='null'
# Keep asking for sources; an empty answer ends the loop.
while [[ -n "$filename" ]]; do
  read -r -p "Path to source directory / file : " filename
  if [[ -n "$filename" ]]; then
    echo -n "Copying $filename ... "
    # -r so that directories are copied recursively as well as single files.
    scp -r "$filename" "$desthost":"$destpath"
  fi
done