BOL: Owner

Install Ragout genome assembler

Neel — Sat, 02 May 2020 06:46:00 -0500

$ conda install -c bioconda ragout
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/anaconda3

  added / updated specs:
    - ragout


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    decorator-4.3.0            |           py37_0          15 KB
    ragout-2.3                 |   py37hc9558a2_0         2.8 MB  bioconda
    sibelia-3.0.7              |       he1b5a44_2        24.8 MB  bioconda
    ------------------------------------------------------------
                                           Total:        27.6 MB

The following NEW packages will be INSTALLED:

  ragout             bioconda/linux-64::ragout-2.3-py37hc9558a2_0
  sibelia            bioconda/linux-64::sibelia-3.0.7-he1b5a44_2

The following packages will be DOWNGRADED:

  decorator                                    4.4.0-py37_1 --> 4.3.0-py37_0


Proceed ([y]/n)? y


Downloading and Extracting Packages
ragout-2.3           | 2.8 MB    | ################################################## | 100% 
sibelia-3.0.7        | 24.8 MB   | ################################################## | 100% 
decorator-4.3.0      | 15 KB     | ################################################## | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Install Samtools, Bcftools and htslib on Ubuntu !

Neel — Wed, 29 Jan 2020 06:47:15 -0600

#Inspired from online search
# Build htslib, samtools and bcftools from the upstream release tarballs and
# put the three build directories on PATH.
#
# Optional environment:
#   HTS_VERSION    release downloaded for all three tools (default: 1.9)
#   HTS_BUILD_DIR  directory the tarballs are unpacked and built in
#                  (default: /usr/bin, the location used originally)
HTS_VERSION=${HTS_VERSION:-1.9}
HTS_BUILD_DIR=${HTS_BUILD_DIR:-/usr/bin}

# -y keeps apt-get from stopping at a confirmation prompt in unattended runs.
sudo apt-get update
sudo apt-get install -y gcc make libbz2-dev zlib1g-dev \
  libncurses5-dev libncursesw5-dev liblzma-dev || exit 1

# wget/tar/make below run as the calling user, so fail early with a hint
# instead of part-way through the first download.
if [[ ! -w "$HTS_BUILD_DIR" ]]; then
  printf 'error: %s is not writable; run as root or set HTS_BUILD_DIR\n' \
    "$HTS_BUILD_DIR" >&2
  exit 1
fi

#######################################
# Download, unpack and build one tool of the samtools family.
# Globals:   HTS_VERSION, HTS_BUILD_DIR (read)
# Arguments: $1 - tool name (htslib, samtools or bcftools)
# Returns:   0 on success, non-zero as soon as any step fails
#######################################
build_tool() {
  local tool=$1
  local archive="${tool}-${HTS_VERSION}.tar.bz2"
  local url="https://github.com/samtools/${tool}/releases/download/${HTS_VERSION}/${archive}"
  # Subshell: the directory changes stay local, and the && chain guarantees
  # make never runs in the wrong directory after a failed cd/download.
  (
    cd "$HTS_BUILD_DIR" &&
      wget -O "$archive" "$url" &&
      tar -vxjf "$archive" &&
      cd "${tool}-${HTS_VERSION}" &&
      make
  )
}

for tool in htslib samtools bcftools; do
  if ! build_tool "$tool"; then
    printf 'error: building %s %s failed\n' "$tool" "$HTS_VERSION" >&2
    exit 1
  fi
done

# Make the tools available now (export) and in future login shells (append to
# ~/.profile once). Exporting and then sourcing an unchanged ~/.profile, as
# done before, never persisted anything.
profile=~/.profile
for tool in bcftools samtools htslib; do
  tool_dir="${HTS_BUILD_DIR}/${tool}-${HTS_VERSION}"
  export PATH="$PATH:${tool_dir}"
  profile_line="export PATH=\"\$PATH:${tool_dir}\""
  # 2>/dev/null: ~/.profile may not exist yet; it is then created by >>.
  grep -qxF -- "$profile_line" "$profile" 2>/dev/null ||
    printf '%s\n' "$profile_line" >> "$profile"
done

echo "All done"

Bash script to align short reads against a reference genome!

Neel — Tue, 28 Jan 2020 04:21:43 -0600

# Map the paired-end run against the reference with bwa mem (read group and
# sample name K12) and stream the SAM output straight into a raw BAM file.
threads=40
reference=E.coli_K12_MG1655.fa
run=SRR1770413

bwa mem -t "$threads" -R '@RG\tID:K12\tSM:K12' \
    "$reference" "${run}_1.fastq.gz" "${run}_2.fastq.gz" \
    | samtools view -b - > "${run}.raw.bam"

# Sort the raw BAM; the next step picks the result up as <run>.raw.sorted.bam.
sambamba sort "${run}.raw.bam"

# Mark duplicate reads and write the final BAM.
sambamba markdup "${run}.raw.sorted.bam" "${run}.bam"


##Breaking it down by line:

#alignment with bwa: bwa mem -t $threads -R '@RG\tID:K12\tSM:K12' --- this says "align using so many threads" and also "give the reads the read group K12 and the sample name K12"
#reference and FASTQs E.coli_K12_MG1655.fa SRR1770413_1.fastq.gz SRR1770413_2.fastq.gz --- this specifies the base name of the reference file (bwa finds its index files using this) and the input read (FASTQ) files. The first file should contain the first mate of each pair, the second file the second mate.
#conversion to BAM: samtools view -b - --- this reads SAM from stdin (the - specifier in place of the file name indicates this) and converts to BAM.
#sorting the BAM file: sambamba sort SRR1770413.raw.bam --- sort the BAM file, writing it to .sorted.bam.
#marking PCR duplicates: sambamba markdup SRR1770413.raw.sorted.bam SRR1770413.bam --- this marks reads which appear to be redundant PCR duplicates based on their read mapping position. It uses the same criteria for marking duplicates as picard.

# Map the paired-end short reads with minimap2 (-ax sr preset) and convert the
# SAM stream to a raw BAM file.
minimap2 -ax sr -t 40 -R '@RG\tID:O104_H4\tSM:O104_H4' \
    E.coli_K12_MG1655.fa SRR341549_1.fastq.gz  SRR341549_2.fastq.gz \
    | samtools view -b - >SRR341549.raw.minimap2.bam

# Name the sorted file explicitly. Without -o, sambamba derives the name from
# the input (SRR341549.raw.minimap2.sorted.bam), which is NOT the file the
# markdup step reads, so the pipeline stopped here with a missing-file error.
sambamba sort -o SRR341549.raw.sorted.minimap2.bam SRR341549.raw.minimap2.bam
sambamba markdup SRR341549.raw.sorted.minimap2.bam SRR341549.minimap2.bam

#The only major change from the bwa mem command is that we tell minimap2 we are working with short-read data by using -ax sr (see the minimap2 command above).

CollectGcBiasMetrics.jar will generate a GC bias plot for each contig

Neel — Fri, 09 Nov 2018 13:37:28 -0600

# Split a sorted BAM by contig and run CollectGcBiasMetrics on every piece,
# producing <prefix>.<contig>.bam, <prefix>.<contig>_GCBias.txt and
# <prefix>.<contig>_GCBias.pdf for each contig named in the BAM header.
bam=aln-pe.mapped.sorted.bam
reference=data/Cdiff078.fa

#######################################
# Print the sequence name of every @SQ line of a SAM header.
# Only lines whose first field is exactly "@SQ" are used (a @PG or @CO line
# that merely mentions "@SQ" is ignored) and the SN: tag is located by name
# rather than assumed to be the second field.
# Inputs:  SAM header text on stdin
# Outputs: one contig name per line on stdout
#######################################
list_contigs() {
  awk -F'\t' '
    $1 == "@SQ" {
      for (i = 2; i <= NF; i++) {
        if ($i ~ /^SN:/) { print substr($i, 4); break }
      }
    }'
}

samtools index "$bam"

if ! gc_bias_jar=$(command -v CollectGcBiasMetrics.jar); then
  printf 'error: CollectGcBiasMetrics.jar not found on PATH\n' >&2
else
  prefix=${bam%.bam}
  # Read names line by line on fd 3: contig names are never word-split or
  # glob-expanded, and commands inside the loop cannot swallow the list.
  while IFS= read -r contig <&3; do
    samtools view -b "$bam" "$contig" > "${prefix}.${contig}.bam"
    java -Xmx2g -jar "$gc_bias_jar" \
      R="$reference" \
      I="${prefix}.${contig}.bam" \
      O="${prefix}.${contig}_GCBias.txt" \
      CHART="${prefix}.${contig}_GCBias.pdf" \
      ASSUME_SORTED=true
  done 3< <(samtools view -H "$bam" | list_contigs)
fi

Script to Plot the Coverage

Neel — Fri, 09 Nov 2018 12:29:07 -0600

#!/bin/bash
# Plot the coverage script
#
# Usage: <script> CHR [START END]
#   CHR    chromosome/contig name to extract (compared as a string)
#   START  optional first position to keep (inclusive)
#   END    optional last position to keep (inclusive)
#
# Writes the per-base depth of deduped_MA605.bam to deduped_MA605.coverage and
# the rows for CHR (optionally limited to START..END) to chr1_MA605.coverage.

#######################################
# Keep the rows of a chrom/position/depth table (as printed by
# `samtools depth`) that belong to one chromosome and optional range.
# Arguments: $1 - chromosome name
#            $2 - first position to keep (optional, empty = no lower bound)
#            $3 - last position to keep (optional, empty = no upper bound)
# Inputs:    table on stdin
# Outputs:   matching rows on stdout
#######################################
filter_region() {
  # Values are handed to awk with -v: inside single quotes the shell never
  # expands $chr, and awk would read it as "field number chr".
  # The "" concatenation forces a string comparison so "1" does not match "01".
  awk -v chr="$1" -v start="${2:-}" -v end="${3:-}" '
    ($1 "") == (chr "") &&
    (start == "" || $2 + 0 >= start + 0) &&
    (end == "" || $2 + 0 <= end + 0)
  '
}

main() {
  chr=${1:-}
  start=${2:-}
  end=${3:-}

  if [[ -z "$chr" ]]; then
    printf 'usage: %s CHR [START END]\n' "${0##*/}" >&2
    return 2
  fi

  samtools depth deduped_MA605.bam > deduped_MA605.coverage || return
  filter_region "$chr" "$start" "$end" \
    < deduped_MA605.coverage > chr1_MA605.coverage
}

main "$@"

#awk -v chr=2 '$1 == chr' deduped_MA605.coverage > chr2_MA605.coverage

#Rscript
# Read a tab-separated coverage table (chromosome, position, depth; no header
# row) from "my.coverage" and plot depth against position.
library(reshape)  # provides rename()
# The stray trailing backtick that used to end this line was a parse error.
my.chr2 <- read.table("my.coverage", header=FALSE, sep="\t", na.strings="NA", dec=".", strip.white=TRUE)
my.chr2 <- rename(my.chr2, c(V1="Chr", V2="locus", V3="depth")) # renames the header

plot(my.chr2$locus, my.chr2$depth)

#Shushi tool .... gawk '/^[0-9]/{print > ($1".coverage")}' ./deduped_MA605.coverage

Collecting arguments with R

Neel — Fri, 09 Nov 2018 12:21:40 -0600

#! /usr/bin/Rscript

## Collect arguments
args <- commandArgs(TRUE)

## Parse arguments (we expect the form --arg=value)
## Strips the leading "--" and splits each argument at its FIRST "=" only, so a
## value may itself contain "=". Returns a list with one character vector per
## argument: c(name, value), or just c(name) for a bare flag such as --help.
parseArgs <- function(x) {
  stripped <- sub("^--", "", x)
  lapply(stripped, function(arg) {
    eq <- regexpr("=", arg, fixed = TRUE)
    if (eq < 0) arg else c(substr(arg, 1, eq - 1), substr(arg, eq + 1, nchar(arg)))
  })
}

## Build a named list name -> value. A bare flag is stored under its own name
## with the name as value, so "--help" on its own no longer fails with a
## names/length mismatch.
parsed <- parseArgs(args)
argsL <- lapply(parsed, function(p) p[length(p)])
names(argsL) <- vapply(parsed, function(p) p[1], character(1))
args <- argsL
rm(argsL, parsed)

## Give some value to options if not provided
## [[ ]] is used for look-ups because $ on a list matches names partially.
if (is.null(args[["opt_arg1"]])) { args$opt_arg1 <- "default_option1" }
## opt_arg2 is numeric; its default is the value advertised in the help text.
if (is.null(args[["opt_arg2"]])) { args$opt_arg2 <- 10 } else { args$opt_arg2 <- as.numeric(args[["opt_arg2"]]) }

## Default setting when not all arguments are passed or help is needed.
## The flag is stored as "help" (the "--" was stripped while parsing), so it is
## looked up by name; || short-circuits on these scalar conditions.
if (!is.null(args[["help"]]) || is.null(args[["arg1"]]) || is.null(args[["arg2"]])) {
  cat("
      The R Script arguments_section.R
      
      Mandatory arguments:
      --arg1=type           - description
      --arg2=type           - description
      --help                - print this text
      
      Optionnal arguments:
      --opt_arg1=String          - example:an absolute path, default:default_option1
      --opt_arg2=Value           - example:a threshold, default:10

      WARNING : here put all the things the user has to know
      
      Example:
      ./arguments_section.R --arg1=~/Documents/ --arg2=10 --opt_arg2=8 \n\n")
  
  q(save="no")
}

cat("first mandatory argument : ", args$arg1,"\n",sep="")
cat("second mandatory argument : ", args$arg2,"\n",sep="")
cat("first optional argument : ", args$opt_arg1,"\n",sep="")
cat("second optional argument : ", args$opt_arg2,"\n",sep="")

Perl script to create a consensus of nucleotide sequences !

Neel — Fri, 12 Oct 2018 10:01:22 -0500

use strict;
use warnings;

# Two sample sets of equal-length DNA motifs.
my @instances  = ( 'AAAAA', 'ATCGA', 'ATAAA' );
my @instances2 = ( 'AAAAA', 'AACGA', 'ATAAA', 'AGAAA', 'AGAAA' );

# consensus() returns one base per column; print concatenates the list.
#   @instances  prints ATAAA
#   @instances2 prints AGAAA (column 2 ties A and G at two votes each and the
#               tie-break in consensus() selects G)
for my $motifs ( \@instances, \@instances2 ) {
    print consensus( @{$motifs} ), "\n";
}
exit;

# consensus(@sequences)
#
# Returns the column-wise consensus of a list of DNA strings as a list of
# single upper-case bases (one element per column, up to the length of the
# longest sequence). An empty argument list yields an empty list.
#
# - Whitespace inside a sequence is removed and letters are upper-cased.
# - Only A, T, C and G are counted; any other character is ignored.
# - Sequences shorter than the longest one simply cast no vote in the columns
#   they do not reach.
# - Ties are resolved by %prefOrder, the HIGHER value winning (G, then C, T,
#   A); a column with no votes at all therefore yields G. This is the
#   behaviour of the original comparator and is kept unchanged.
sub consensus{
    my @seqs = @_;
    chomp(@seqs);
    s/\s//g for @seqs;
    return () unless @seqs;

    # Consensus width = length of the longest sequence.
    my ($w) = sort { $b <=> $a } map { length } @seqs;

    # $count{BASE}[COLUMN] = number of sequences showing BASE in COLUMN.
    my %count = map { $_ => [ (0) x $w ] } qw/A T C G/;

    # One pass over every letter: each sequence votes exactly once per column.
    for my $seq (@seqs) {
        my @letters = split //, uc $seq;
        for my $col ( 0 .. $#letters ) {
            my $base = $letters[$col];
            next unless exists $count{$base};    # skip N, gaps, etc.
            $count{$base}[$col]++;
        }
    }

    my %prefOrder = ( A=>1, T=>2, C=>3, G=>4 );
    my @cons = ();
    for my $col ( 0 .. $w-1 ) {
        # Highest count first; equal counts fall back to %prefOrder.
        my ($best) = sort {
            $count{$b}[$col] <=> $count{$a}[$col]
                || $prefOrder{$b} <=> $prefOrder{$a}
        } qw/A T G C/;
        push @cons, $best;
    }

    return @cons;
}

#reference https://www.perlmonks.org/bare/?node_id=500962

Perl script to reverse complement a DNA sequence !

Neel — Mon, 01 Oct 2018 08:44:56 -0500

#!/usr/bin/perl -w

# Teaching example: reverse-complement a DNA string twice, first with a
# deliberately flawed chain of substitutions and then correctly with tr///.

my $DNA = 'ACGGGAGGACGGGAAAATTACTACGGCATTAGC';

print "Here is the starting DNA:\n\n", "$DNA\n\n";

# Attempt 1 (intentionally wrong): sequential global substitutions. Each later
# substitution also rewrites letters produced by an earlier one.
my $revcom = scalar reverse $DNA;
for my $swap ( [ 'A', 'T' ], [ 'T', 'A' ], [ 'G', 'C' ], [ 'C', 'G' ] ) {
    my ( $from, $to ) = @{$swap};
    $revcom =~ s/$from/$to/g;
}

print "Here is the reverse complement DNA: WRONG:\n\n",
      "$revcom\n",
      "\nThat was a bad algorithm, and the reverse complement was wrong!\n",
      "Try again ... \n\n";

# Attempt 2: start again from the untouched original and translate all bases
# in a single pass with tr///.
$revcom = scalar reverse $DNA;
$revcom =~ tr/ACGTacgt/TGCAtgca/;

print "Here is the reverse complement DNA:\n\n",
      "$revcom\n",
      "\nThis time it worked!\n\n";

exit;

Coverage / Depth of reads !

Neel — Tue, 17 Apr 2018 14:18:44 -0500

#######################################
# Count the pileup positions whose depth (column 4) is at least a minimum.
# Arguments: $1 - minimum depth
# Inputs:    `samtools mpileup` text on stdin
# Outputs:   the number of qualifying positions (0 for empty input)
#######################################
count_covered_bases() {
  # "+ 0" forces a numeric comparison; counting in awk replaces `| wc -l`.
  awk -v min="$1" '$4 + 0 >= min + 0 { n++ } END { print n + 0 }'
}

#######################################
# Sum the third tab-separated column (sequence length) of a
# `bowtie2-inspect -s` summary.
# Inputs:  summary text on stdin
# Outputs: total length as an integer
#######################################
sum_reference_length() {
  # -F sets the separator before the first record is read; assigning FS in a
  # per-record action left the first line split on blanks instead of tabs.
  awk -F'\t' '{ total += $3 } END { printf "%.0f\n", total }'
}

# get total number of bases covered at MIN_COVERAGE_DEPTH or higher
# (defaults to 1 when MIN_COVERAGE_DEPTH is unset, instead of silently
# degrading to a string comparison that accepts every line)
samtools mpileup mapping_result_sorted.bam | count_covered_bases "${MIN_COVERAGE_DEPTH:-1}"
# example output: 32876

# get length of reference genome
bowtie2-inspect -s refgenome | sum_reference_length
# example output: 45678

Genome Covered !

Neel — Tue, 17 Apr 2018 14:13:05 -0500

#######################################
# Sum interval lengths of a bedGraph coverage table by coverage class.
# Inputs:  `bedtools genomecov -bga` text on stdin
#          (columns: chrom, start, end, depth)
# Outputs: "<bp with depth 0> <bp with depth > 0>" on one line; both numbers
#          are printed as 0 when no interval falls in a class
#######################################
sum_coverage_bp() {
  awk '
    { len = $3 - $2 }
    $4 == 0 { zero_bp += len }
    $4 > 0  { covered_bp += len }
    END     { printf "%.0f %.0f\n", zero_bp, covered_bp }
  '
}

# One bedtools pass yields both sums (previously the BAM was scanned twice).
# pipefail is set only inside the command substitution's subshell so that a
# bedtools failure is noticed without changing the caller's shell options.
if counts=$(set -o pipefail
  bedtools genomecov -ibam BAM -g hg38.fasta -bga | sum_coverage_bp); then
  read -r zero nonzero <<< "$counts"
else
  printf 'error: bedtools genomecov failed\n' >&2
  zero=0
  nonzero=0
fi

total=$(( zero + nonzero ))
if (( total == 0 )); then
  # Avoid handing bc a division by zero.
  printf 'error: no coverage intervals found; percent is undefined\n' >&2
  percent=NA
else
  percent=$(bc <<< "scale=6; ($nonzero / $total)*100")
fi

printf '%s\n' "$percent"