BOL: Owner

Perl one-liner to check the IDs in two files!

Surabhi Chaudhary — Thu, 21 Oct 2021 07:21:10 -0500

# For every line of ids1.txt, report whether its first whitespace-separated
# field occurs as a whole-line ID in ids2.txt. Output is tab-separated:
#   <original line>  <matched ID, or NA>  <1 if found, 0 otherwise>
# The ID list has to be read with the readline operator (<$ids>); a bare
# `while ()` never reads anything from the file. The open is checked so a
# missing ids2.txt is reported instead of silently marking every ID as absent.
perl -lane 'BEGIN { open(my $ids, "<", "ids2.txt") or die "Cannot open ids2.txt: $!\n"; while (<$ids>) { chomp; $k{$_}++ } close($ids) } if (defined($k{$F[0]})) { print "$_\t$F[0]\t1" } else { print "$_\tNA\t0" }' ids1.txt > aaa.xls

Simulate the reads !

Surabhi Chaudhary — Wed, 20 Oct 2021 04:52:40 -0500

# Make the reference for randomreads.sh (build=1).
# randomreads.sh is part of BBTools/BBMap https://sourceforge.net/projects/bbmap/
# Only 100 reads are requested and they are written to /dev/null, so the
# simulated reads from this first call are discarded.
reference_build_args=(
  build=1
  seed=1
  ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz
  illuminanames=t addslash=t
  pacbio=t pbmin=0.13 pbmax=0.17
  reads=100 paired=f
  gaussianlength=t
  minlength=1000 midlength=20000 maxlength=100000
  out=/dev/null
)
/genetics/elbers/bbmap-38.86/randomreads.sh "${reference_build_args[@]}"




# Simulate paired-end Illumina reads (target stated in these notes: 60x
# haploid coverage). coverage=30 is requested against the diploid reference.
# NOTE(review): presumably 30x over both haplotypes is what yields the 60x
# haploid figure -- confirm.
# All output of the tool (stdout and stderr) goes to random_reads_illumina.log.
illumina_sim_args=(
  build=1
  ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz
  illuminanames=t addslash=t
  coverage=30 paired=t maxinsert=550 mininsert=450
  out1=illumina1.fastq.gz out2=illumina2.fastq.gz
)
/genetics/elbers/bbmap-38.86/randomreads.sh "${illumina_sim_args[@]}" \
  > random_reads_illumina.log 2>&1




# Interleave the two paired-end files into a single FASTQ file.
# reformat.sh is part of BBTools/BBMap https://sourceforge.net/projects/bbmap/
# Anything reformat.sh prints on stderr is discarded.
interleave_args=(
  in=illumina1.fastq.gz
  in2=illumina2.fastq.gz
  out=illumina.int.fastq
)
/genetics/elbers/bbmap-38.86/reformat.sh "${interleave_args[@]}" 2>/dev/null




# Use KmerGenie 1.7051 to get an idea of the k-mer size that produces the
# longest N50.
# http://kmergenie.bx.psu.edu/
kmergenie_dir=/genetics/elbers/test/fly2/kmergenie-illumina-raw-reads
mkdir -p "$kmergenie_dir"

# Stop here if the directory cannot be entered: the steps below use paths
# relative to it, including an `rm`, and must not run from another directory.
cd "$kmergenie_dir" || exit 1
/genetics/elbers/kmergenie-1.7051/kmergenie ../illumina.int.fastq \
  > kmergenie-illumina-raw-reads.log 2>&1
# The interleaved FASTQ was only needed as KmerGenie input.
rm ../illumina.int.fastq

# Pull the number out of the "best k:" line of the KmerGenie log.
k=$(grep "^best k:" kmergenie-illumina-raw-reads.log | grep -Po "\d+")
echo "best k=${k}"




# make 30x haploid coverage for PacBio CLR reads
# error rate from 13 - 15 % minimum 1000bp midlength 20000bp maximum 30000bp
# Stop if the working directory is unavailable, so that the reads and the log
# are not written into whatever directory happens to be current.
cd /genetics/elbers/test/fly2 || exit 1

# ow=t is passed so an existing pacbio.fastq.gz may be replaced
# (NOTE(review): "ow" is presumably BBTools' overwrite flag -- confirm).
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
  ow=t seed=1 \
  ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
  illuminanames=t addslash=t \
  pacbio=t pbmin=0.13 pbmax=0.15 \
  coverage=15 paired=f \
  gaussianlength=t \
  minlength=1000 midlength=20000 maxlength=30000 \
  out=pacbio.fastq.gz > random_reads_pacbio.log 2>&1



# Simulate PacBio HiFi reads (target stated in these notes: 30x haploid
# coverage): error rate between 0.1 % and 1 %, read lengths between 9000 bp
# and 12000 bp with a middle length of 10000 bp.
# All output of the tool (stdout and stderr) goes to random_reads_pacbio_hifi.log.
hifi_sim_args=(
  build=1
  ow=t seed=1
  ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz
  illuminanames=t addslash=t
  pacbio=t pbmin=0.001 pbmax=0.01
  coverage=15 paired=f
  gaussianlength=t
  minlength=9000 midlength=10000 maxlength=12000
  out=hifi.fastq.gz
)
/genetics/elbers/bbmap-38.86/randomreads.sh "${hifi_sim_args[@]}" \
  > random_reads_pacbio_hifi.log 2>&1

Get the Linux system information !

Surabhi Chaudhary — Thu, 30 Sep 2021 06:37:45 -0500

#!/bin/bash

# while-menu-dialog: a menu driven system information program

# Exit statuses of `dialog` that the menu loop compares against:
# DIALOG_CANCEL selects the branch that prints "Program terminated."
# (the cancel button is labelled "Exit"), DIALOG_ESC the branch that
# prints "Program aborted." and exits with status 1.
DIALOG_CANCEL=1
DIALOG_ESC=255
# Height and width handed to the --menu box.
# NOTE(review): 0 presumably lets dialog choose the size itself -- confirm
# against dialog(1).
HEIGHT=0
WIDTH=0

#######################################
# Show the text stored in $result in a dialog message box.
# Globals:   result (read)
# Arguments: $1 - title of the message box
#######################################
display_result() {
  local box_title=$1
  dialog --title "$box_title" --no-collapse --msgbox "$result" 0 0
}

# Main loop: show the menu, act on the chosen entry, repeat until the user
# leaves through the "Exit" button or ESC.
while true; do
  # fd 3 is a copy of stdout. Inside $( ), dialog's stdout is sent to fd 3
  # while its stderr is what the command substitution captures into
  # $selection.
  exec 3>&1
  selection=$(dialog \
    --backtitle "System Information" \
    --title "Menu" \
    --clear \
    --cancel-label "Exit" \
    --menu "Please select:" "$HEIGHT" "$WIDTH" 4 \
    "1" "Display System Information" \
    "2" "Display Disk Space" \
    "3" "Display Home Space Utilization" \
    2>&1 1>&3)
  exit_status=$?
  exec 3>&-
  case "$exit_status" in
    0)
      # A menu entry was chosen; handled below.
      ;;
    "$DIALOG_CANCEL")
      clear
      echo "Program terminated."
      exit
      ;;
    "$DIALOG_ESC")
      clear
      echo "Program aborted." >&2
      exit 1
      ;;
    *)
      # Any other status (e.g. 127 when dialog is not installed) would
      # otherwise send the loop straight back to the top, forever.
      echo "dialog failed with exit status $exit_status" >&2
      exit "$exit_status"
      ;;
  esac
  case "$selection" in
    1)
      result=$(echo "Hostname: $HOSTNAME"; uptime)
      display_result "System Information"
      ;;
    2)
      result=$(df -h)
      display_result "Disk Space"
      ;;
    3)
      # uid 0 summarises every directory under /home, anyone else only
      # their own home directory. du's error messages are discarded.
      if [[ $(id -u) -eq 0 ]]; then
        result=$(du -sh /home/* 2> /dev/null)
        display_result "Home Space Utilization (All Users)"
      else
        # Quoted so a home path containing whitespace stays one argument.
        result=$(du -sh "$HOME" 2> /dev/null)
        display_result "Home Space Utilization ($USER)"
      fi
      ;;
  esac
done

Bash script for getopts

Surabhi Chaudhary — Wed, 29 Sep 2021 04:53:14 -0500

# Parse the command-line options.
# A ':' after an option letter means that option requires an argument
# (e.g. "t:" needs a value after -t, while "h" takes none).
# The option string must be wrapped in plain ASCII quotes; typographic
# quotes are not shell quoting and would become part of the string.
# Sets: TEST (-t), SERVER (-r), PASSWD (-p), VERBOSE (-v).
# Relies on a usage function being defined before this point.
while getopts "ht:r:p:v" OPTION
do
     case "$OPTION" in
         h)
             usage
             exit 1
             ;;
         t)
             TEST=$OPTARG
             ;;
         r)
             SERVER=$OPTARG
             ;;
         p)
             PASSWD=$OPTARG
             ;;
         v)
             VERBOSE=1
             ;;
         \?)
             # getopts reports an unknown option or a missing argument as
             # a literal '?'; treat it as an error like the check below.
             usage
             exit 1
             ;;
     esac
done

# -t, -r and -p are all mandatory.
if [[ -z "$TEST" || -z "$SERVER" || -z "$PASSWD" ]]
then
     usage
     exit 1
fi

Command line to create blast uniref database !

Surabhi Chaudhary — Tue, 28 Sep 2021 05:46:20 -0500

#The NCBI BLAST+ distribution does not include 'blastpgp', it has been replaced by the 'psiblast' program. The 'blastpgp' program is available in the legacy NCBI BLAST package (no longer supported), which is available from the NCBI's FTP site: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.26/.

# Download the compressed UniRef90 FASTA file and decompress it in place
# (this leaves uniref90.fasta, which the next step reads).
wget ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz
gunzip -v uniref90.fasta.gz
# Run pfilt over the sequences and store what it prints as uniref90filt.
# NOTE(review): bin/pfilt is a relative path, presumably inside a PSIPRED
# installation directory -- confirm where this is meant to be run from.
bin/pfilt uniref90.fasta > uniref90filt
# Legacy NCBI BLAST: format uniref90filt as a BLAST database
# (-t sets the title, -i names the input file).
formatdb -t uniref90filt -i uniref90filt

# With NCBI BLAST+ there is no 'formatdb'; use the equivalent 'makeblastdb'
# command below in place of the formatdb line above:

makeblastdb -dbtype prot -in uniref90filt -out uniref90filt

blastpgp arguments !

Surabhi Chaudhary — Tue, 28 Sep 2021 05:30:02 -0500

blastpgp   arguments:

  -d  Database [String]
    default = nr
  -i  Query File [File In]
    default = stdin
  -A  Multiple Hits window size (zero for single hit algorithm) [Integer]
    default = 40
  -f  Threshold for extending hits [Integer]
    default = 0
  -e  Expectation value (E) [Real]
    default = 10.0
  -m  alignment view options:
0 = pairwise,
1 = query-anchored showing identities,
2 = query-anchored no identities,
3 = flat query-anchored, show identities,
4 = flat query-anchored, no identities,
5 = query-anchored no identities and blunt ends,
6 = flat query-anchored, no identities and blunt ends,
7 = XML Blast output,
8 = Tabular output, 
9 = Tabular output with comments [Integer]
    default = 0
  -o  Output File for Alignment [File Out]  Optional
    default = stdout
  -y  Dropoff (X) for blast extensions in bits (default if zero) [Real]
    default = 7.0
  -P  0 for multiple hits 1-pass, 1 for single hit 1-pass, 2 for 2-pass [Integer]
    default = 0
  -F  Filter query sequence with SEG [String]
    default = F
  -G  Cost to open a gap [Integer]
    default = 11
  -E  Cost to extend a gap [Integer]
    default = 1
  -X  X dropoff value for gapped alignment (in bits) [Integer]
    default = 15
  -N  Number of bits to trigger gapping [Real]
    default = 22.0
  -g  Gapped [T/F]
    default = T
  -S  Start of required region in query [Integer]
    default = 1
  -H  End of required region in query (-1 indicates end of query) [Integer]
    default = -1
  -a  Number of processors to use [Integer]
    default = 1
  -I  Show GI's in deflines [T/F]
    default = F
  -h  e-value threshold for inclusion in multipass model [Real]
    default = 0.005
  -c  Constant in pseudocounts for multipass version [Integer]
    default = 9
  -j  Maximum number of passes to use in  multipass version [Integer]
    default = 1
  -J  Believe the query defline [T/F]
    default = F
  -Z  X dropoff value for final gapped alignment (in bits) [Integer]
    default = 25
  -O  SeqAlign file ('Believe the query defline' must be TRUE) [File Out]  Optional
  -M  Matrix [String]
    default = BLOSUM62
  -v  Number of database sequences to show one-line descriptions for (V) [Integer]
    default = 500
  -b  Number of database sequence to show alignments for (B) [Integer]
    default = 250
  -C  Output File for PSI-BLAST Checkpointing [File Out]  Optional
  -R  Input File for PSI-BLAST Restart [File In]  Optional
  -W  Word size, default if zero [Integer]
    default = 0
  -z  Effective length of the database (use zero for the real size) [Real]
    default = 0
  -K  Number of best hits from a region to keep [Integer]
    default = 0
  -s  Compute locally optimal Smith-Waterman alignments [T/F]
    default = F
  -Y  Effective length of the search space (use zero for the real size) [Real]
    default = 0
  -p  program option for PHI-BLAST [String]
    default = blastpgp
  -k  Hit File for PHI-BLAST [File In]
    default = hit_file
  -T  Produce HTML output [T/F]
    default = F
  -Q  Output File for PSI-BLAST Matrix in ASCII [File Out]  Optional
  -B  Input Alignment File for PSI-BLAST Restart [File In]  Optional
  -l  Restrict search of database to list of GI's [String]  Optional
  -U  Use lower case filtering of FASTA sequence [T/F]  Optional
    default = F
  -t  Use composition based statistics [T/F]
    default = T
  -L  Cost to decline alignment (disabled when 0) [Integer]
    default = 0

Perl script for Smith-Waterman Algorithm

Surabhi Chaudhary — Tue, 28 Sep 2021 05:19:18 -0500

# Smith-Waterman  Algorithm
# Local alignment of two sequences given on the command line.

use strict;
use warnings;

# usage statement: exactly two sequences are expected as arguments
die "usage: $0 <sequence 1> <sequence 2>\n" unless @ARGV == 2;

# get sequences from command line
my ($seq1, $seq2) = @ARGV;

# scoring scheme
my $MATCH    =  1; # +1 for letters that match
my $MISMATCH = -1; # -1 for letters that mismatch
my $GAP      = -1; # -1 for any gap

# initialization
# Row 0 has one cell per letter of $seq1, column 0 one cell per letter of
# $seq2; every border cell starts with score 0 and no trace-back pointer.
my @matrix;
$matrix[0][0] = { score => 0, pointer => "none" };
foreach my $col (1 .. length($seq1)) {
    $matrix[0][$col] = { score => 0, pointer => "none" };
}
foreach my $row (1 .. length($seq2)) {
    $matrix[$row][0] = { score => 0, pointer => "none" };
}

# fill
# Walk the matrix row by row ($i indexes $seq2, $j indexes $seq1) and
# remember the cell holding the highest score seen so far.
my $max_i     = 0;
my $max_j     = 0;
my $max_score = 0;

foreach my $i (1 .. length($seq2)) {
    my $letter2 = substr($seq2, $i-1, 1);

    foreach my $j (1 .. length($seq1)) {
        my $letter1 = substr($seq1, $j-1, 1);

        # Start from the diagonal move (match or mismatch) ...
        my $step    = ($letter1 eq $letter2) ? $MATCH : $MISMATCH;
        my $best    = $matrix[$i-1][$j-1]{score} + $step;
        my $pointer = "diagonal";

        # ... and let a gap move replace it only when strictly better, so
        # ties resolve as diagonal first, then up, then left.
        my $up_score = $matrix[$i-1][$j]{score} + $GAP;
        if ($up_score > $best) {
            ($best, $pointer) = ($up_score, "up");
        }
        my $left_score = $matrix[$i][$j-1]{score} + $GAP;
        if ($left_score > $best) {
            ($best, $pointer) = ($left_score, "left");
        }

        # No move gives a positive score: the cell is reset to zero and
        # cannot become the new maximum.
        if ($best <= 0) {
            $matrix[$i][$j] = { score => 0, pointer => "none" };
            next;
        }

        $matrix[$i][$j] = { score => $best, pointer => $pointer };

        # set maximum score
        if ($best > $max_score) {
            ($max_i, $max_j, $max_score) = ($i, $j, $best);
        }
    }
}

# trace-back
# Follow the pointers from the best-scoring cell until a cell with pointer
# "none" is reached. Letters are prepended, so the alignment strings are
# already in reading order when the loop ends.

my $align1 = "";
my $align2 = "";

my $j = $max_j;
my $i = $max_i;

until ($matrix[$i][$j]{pointer} eq "none") {
    my $move = $matrix[$i][$j]{pointer};

    if ($move eq "diagonal") {
        # a letter from each sequence
        $align1 = substr($seq1, $j-1, 1) . $align1;
        $align2 = substr($seq2, $i-1, 1) . $align2;
        $i--; $j--;
    }
    elsif ($move eq "left") {
        # letter of $seq1 against a gap
        $align1 = substr($seq1, $j-1, 1) . $align1;
        $align2 = "-" . $align2;
        $j--;
    }
    elsif ($move eq "up") {
        # gap against a letter of $seq2
        $align1 = "-" . $align1;
        $align2 = substr($seq2, $i-1, 1) . $align2;
        $i--;
    }
}

print "$align1\n";
print "$align2\n";

One-liners to convert lower-case bases in a FASTA file to Ns (N-masked sequence)

Surabhi Chaudhary — Tue, 28 Sep 2021 04:47:05 -0500

# Perl version: on every line that is not a FASTA header (does not start
# with '>'), turn each lower-case letter a-z into N.
perl -pe 'tr/a-z/N/ unless /^>/' genomic.fna > genomic.N-masked.fna

# awk version of the same masking: header lines (starting with '>') are
# printed untouched, every other line has its lower-case letters replaced by N.
awk '!/^>/ { gsub(/[a-z]/, "N") } { print }' genomic.fna > genomic.N-masked.fna

List of string comparison algorithms !

Surabhi Chaudhary — Fri, 27 Aug 2021 07:27:29 -0500

String comparison:

    Levenshtein Distance
    Damerau-Levenshtein Distance
    Jaro Distance
    Jaro-Winkler Distance
    Match Rating Approach Comparison
    Hamming Distance

More at https://jellyfish.readthedocs.io/en/latest/comparison.html

Python script to download covid genome !

Surabhi Chaudhary — Fri, 26 Mar 2021 07:01:29 -0500

#!/usr/bin/env python3

# these are the publicly available "complete" sequences
# https://www.gisaid.org/ has more (1200?), but they require you to sign up

import json
import os

import requests
import yaml
from Bio import Entrez

# Index of the publicly available sequences and the file the result goes to.
NCOV_INDEX_URL = "https://www.ncbi.nlm.nih.gov/core/assets/genbank/files/ncov-sequences.yaml"
OUTPUT_PATH = "data/allseq.json"

# NOTE(review): NCBI asks E-utilities clients to identify themselves; consider
# setting Entrez.email before running this against the live service.

# Fail early on an HTTP error instead of trying to parse an error page as YAML.
response = requests.get(NCOV_INDEX_URL, timeout=60)
response.raise_for_status()

# safe_load: the document comes from the network, so YAML must not be allowed
# to construct arbitrary Python objects. A bare yaml.load() without a Loader
# is also rejected by current PyYAML releases.
seqs = yaml.safe_load(response.text)
seqs = seqs['genbank-sequences']
print("got %d sequences" % len(seqs))

# accession -> nucleotide sequence, for entries marked as complete genomes
allseq = {}
for x in seqs:
  if x.get('gene-region') == "complete":
    nm = x['accession']
    print("downloading", nm)
    handle = Entrez.efetch(db='nucleotide', id=nm, rettype='fasta', retmode='text')
    try:
      fasta = handle.read()
    finally:
      handle.close()
    # Drop the FASTA header line and join the remaining lines into one string.
    allseq[nm] = ''.join(fasta.split("\n")[1:])

# Create the output directory if needed so the downloads are not lost to a
# FileNotFoundError at the very end.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
with open(OUTPUT_PATH, "w") as f:
  json.dump(allseq, f)