BOL: All

Perl script to find the missing files and copy them to the desired folder

Neel — Mon, 19 Mar 2018 12:35:54 -0500

#!/usr/bin/perl -w
# For every ID listed in the first file, look through the list of paths in
# the second file and copy each path that contains the ID into the "2move"
# directory (resolved relative to the current working directory).
#
# Usage: script.pl <id_list> <path_list>
#   id_list   - one ID per line; blank lines are skipped
#   path_list - one file path per line
use strict;
use warnings;

die "Usage: $0 <id_list> <path_list>\n" unless @ARGV == 2;
my ($id_file, $path_file) = @ARGV;

# Read the path list once instead of re-opening it for every ID.
open(my $val, "<", $path_file) or die "Can't open $path_file: $!";
chomp(my @paths = <$val>);
close $val;

open(my $ids, "<", $id_file) or die "Can't open $id_file: $!";
while (my $id = <$ids>) {
    chomp $id;
    next if $id =~ /^\s*$/;

    for my $path (@paths) {
        # \Q...\E makes the ID match literally; IDs containing ".", "+",
        # "(" and the like would otherwise be interpreted as regex syntax.
        next unless $path =~ /\Q$id\E/;

        print "found string $id\n";

        # List form of system() bypasses the shell, so paths with spaces or
        # shell metacharacters are passed to cp unchanged.
        system("cp", "--", $path, "2move") == 0
            or warn "cp of '$path' failed (status $?)\n";
    }
}
close $ids;

Perl script to extract the unique IDs

Neel — Mon, 19 Mar 2018 12:34:19 -0500

#!/usr/bin/perl -w
# Print, one per line, every distinct ID from the first file that does not
# occur as a substring of any line in the second file.
#
# Usage: script.pl <id_list> <search_file>
#   id_list     - one ID per line; blank lines are skipped
#   search_file - text file that is searched for each ID
use strict;
use warnings;
use List::Uniq ':all';

die "Usage: $0 <id_list> <search_file>\n" unless @ARGV == 2;
my ($id_file, $search_file) = @ARGV;

# Load the search file once; the original re-opened it for every ID.
open(my $file, "<", $search_file) or die "Can't open $search_file: $!";
chomp(my @lines = <$file>);
close $file;

open(my $val, "<", $id_file) or die "Can't open $id_file: $!";
my @allMissed;
while (my $string = <$val>) {
    chomp $string;
    next if $string =~ /^\s*$/;

    # index() does a literal substring test, so regex metacharacters in the
    # ID cannot change what is matched; stop at the first hit.
    my $flag = 0;
    for my $line (@lines) {
        if (index($line, $string) >= 0) {
            $flag = 1;
            last;
        }
    }
    push @allMissed, $string if $flag == 0;
}
close $val;

# Report each missing ID once.
my @allRemaining = uniq(@allMissed);
print "$_\n" for @allRemaining;

__END__

Estimate Genome Size with Jellyfish and R

Rahul Nayak — Mon, 12 Mar 2018 10:11:19 -0500

# Count 19-mers in the two read files and write the hash to 19mer_out.
# The "?" is quoted so the shell passes it through literally instead of
# treating it as a glob character (zsh aborts on an unmatched glob).
jellyfish count -t 8 -C -m 19 -s 5G -o 19mer_out --min-qual-char='?' /common/Tutorial/Genome_estimation/sample_read_1.fastq /common/Tutorial/Genome_estimation/sample_read_2.fastq

# Option notes (long names as given in the original note; confirm against `jellyfish count --help`):
#-t    -threads=uint32      Number of threads to be used in the run. eg: 1,2,3,..etc.
#-C    -both-strands        Count both strands
#-m    -mer-len=uint32      Length of the k-mer
#-s    -size=uint32         Hash size / memory allocation
#-o    -output=string       Output file name
#--min-qual-char            Base quality value. Version 2.2.3 of Jellyfish uses the “Phred” score, where "?" = 30

# Turn the k-mer counts into a histogram table (19mer_out.histo).
jellyfish histo -o 19mer_out.histo 19mer_out

#Plot
dataframe19 <- read.table("19mer_out.histo") #load the data into dataframe19
plot(dataframe19[1:200,], type="l") #plots the data points 1 through 200 in the dataframe19 using a line

plot(dataframe19[2:200,], type="l") #same plot without the first row

plot(dataframe19[2:100,], type="l") #plot line graph
points(dataframe19[2:100,]) #plot the data points from 2 through 100

#Total number of k-mers: sum of column 1 * column 2 over rows 2..last.
#The row count is taken from the data instead of the hard-coded 9325, and
#each column is converted to double BEFORE multiplying so that large integer
#products cannot overflow to NA.
last_row <- nrow(dataframe19)
total_kmers <- sum(as.numeric(dataframe19[2:last_row,1]) * as.numeric(dataframe19[2:last_row,2]))
total_kmers

#Inspect rows 10-20 to locate the peak (the original subset `data`, which is
#a base R function, not the data frame)
dataframe19[10:20,]

#Genome size estimate, taking 12 as the peak position
total_kmers/12

#Return around ~ 305 Mb

Plot custom gene density with R

Jit — Thu, 08 Mar 2018 16:30:34 -0600

library(karyoploteR)

# Plot parameters for the two-panel layout (plot.type=2) with custom margins.
pp <- getDefaultPlotParams(plot.type=2)
pp$data1outmargin <- 100
pp$data2outmargin <- 100
pp$topmargin <- 450

gff.file <- "http://plasmodb.org/common/downloads/Current_Release/PvivaxP01/gff/data/PlasmoDB-35_PvivaxP01.gff"
data.points.colors <- c("#FFBD07", "#00A6ED",  "#FF3366", "#8EA604", "#C200FB")

# Read the first 30 lines of the GFF. If the URL cannot be opened this call
# fails and every later step fails with "object ... not found"; the original
# listing included the error output of such a failed run.
header.lines <- readLines(gff.file, n = 30)
#The lines with the standard chromosomes contain "##sequence-region PvP01".
#Select them.
ll <- header.lines[grepl(header.lines, pattern = "##sequence-region PvP01")]
# Stop with a clear message instead of building an empty genome.
if (length(ll) == 0) {
    stop("no '##sequence-region PvP01' lines found in the first 30 lines of ", gff.file)
}
#split them by space, and create a data.frame
gg <- data.frame(do.call(rbind, strsplit(ll, split = " ")))
# Columns 3 and 4 are converted to numbers; columns 2-4 are then passed to
# toGRanges (presumably chromosome name, start and end).
gg[,3] <- as.numeric(as.character(gg[,3]))
gg[,4] <- as.numeric(as.character(gg[,4]))
#and create a GRanges with the information
PvP01.genome <- toGRanges(gg[,c(2,3,4)])
PvP01.genome

# Alternative karyotype calls that were tried:
#kp <- plotKaryotype(genome=PvP01.genome)
#kp <- plotKaryotype(genome=PvP01.genome, ideogram.plotter = NULL, plot.type=2, plot.params = pp)
#kp <- plotKaryotype(genome=PvP01.genome, ideogram.plotter = NULL)
#kpAddCytobandsAsLine(kp)

# Two-panel karyotype: data panel 1 is used for "+" genes, data panel 2 for "-" genes.
kp <- plotKaryotype(genome=PvP01.genome, plot.type=2, plot.params = pp)

# Load every feature from the GFF. import() is provided by rtracklayer, which
# library(karyoploteR) does not attach, so the call is namespace-qualified.
features <- rtracklayer::import(gff.file)

# Overview of the feature types present in the file.
table(features$type)

# Keep only the features of type "gene".
genes <- features[features$type=="gene"]

#kpPlotRegions(kp, data=genes)

kpAddMainTitle(kp, "Plasmodium Vivax - PvP01 with genes", cex=2)


# Genes on each strand are drawn in their own panel and colour.
kpPlotRegions(kp, data=genes[strand(genes)=="+"], avoid.overlapping = FALSE, col="deepskyblue")

kpPlotRegions(kp, data=genes[strand(genes)=="-"], avoid.overlapping = FALSE, col="gold", data.panel=2)

kpAddLabels(kp, "strand +", cex=0.8, col="#888888")

kpAddLabels(kp, "strand -", data.panel=2, cex=0.8, col="#888888")



# Gene density for the whole genome using layout plot.type = 4.
# Start from that layout's defaults, then override two margins.
pp <- getDefaultPlotParams(plot.type = 4)
pp[["data1inmargin"]] <- 0
pp[["bottommargin"]] <- 20

# Ideograms and chromosome labels are suppressed here and added explicitly
# below (cytobands as a line, names rotated by 45 degrees).
kp <- plotKaryotype(
    genome = PvP01.genome,
    plot.type = 4,
    plot.params = pp,
    ideogram.plotter = NULL,
    labels.plotter = NULL,
    main = "Gene Density"
)
kpAddCytobandsAsLine(kp)
kpAddChromosomeNames(kp, srt = 45)

# Density computed with a window size of 1000 (written 10e2 in the original).
kpPlotDensity(kp, genes, window.size = 1e3, col = "#ddaacc")



# Gene density drawn per chromosome, with a horizontal reference line.
# Plot parameters start from the plot.type = 2 defaults.
pp <- getDefaultPlotParams(plot.type = 2)
pp[["data1inmargin"]] <- 0
pp[["bottommargin"]] <- 20

# NOTE(review): plot.type is not passed here, so plotKaryotype uses its
# default layout although pp was built for plot.type = 2 and kpAbline below
# targets data.panel = 2 -- confirm whether plot.type = 2 was intended.
kp <- plotKaryotype(plot.params = pp, genome = PvP01.genome)
#kp <- kpPlotDensity(kp, genes)

kpAddChromosomeNames(kp, srt = 45)
kpPlotDensity(kp, genes, data.panel = 1, col = "#ddaacc")

# Horizontal line at h = 0.4 in data panel 2, coloured with the third palette entry.
kpAbline(kp, h = 0.4, r0 = 0.2, r1 = 0, data.panel = 2, col = data.points.colors[3])

Perl script to convert fastq to fasta file

Poonam Mahapatra — Wed, 07 Mar 2018 04:23:26 -0600

#!/usr/bin/env perl
# Convert a FASTQ file to FASTA by running the EMBOSS program "seqret"
# through BioPerl's Bio::Factory::EMBOSS wrapper.
#
# Usage: perl script.pl in.fq out.fa

use strict;
use warnings;
use Bio::Factory::EMBOSS;

my $usage   = "Usage: perl $0 in.fq out.fa\n";
my $infile  = shift or die $usage;
my $outfile = shift or die $usage;

my $factory = Bio::Factory::EMBOSS->new;

# program() hands back a Bio::Tools::Run::EMBOSSApplication object; guard
# against a false return so a missing seqret gives a readable error instead
# of "Can't call method "run" on an undefined value".
my $seqret = $factory->program('seqret')
    or die "EMBOSS program 'seqret' is not available; is EMBOSS installed and on PATH?\n";

# Read $infile as FASTQ and write it to $outfile as FASTA.
$seqret->run({-sequence => $infile,
              -sformat1 => 'fastq',
              -outseq   => $outfile,
              -osformat => 'fasta'});

Perl script to remove fasta sequences in multifasta file with certain length threshold

Poonam Mahapatra — Wed, 07 Mar 2018 04:16:32 -0600

#!/usr/bin/perl
# Filter a multi-FASTA stream: print only the records whose sequence is at
# least <minlen> characters long.
#
# Usage: script.pl <minlen> [fasta_file ...]   (reads STDIN if no file given)
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Use defined() rather than truthiness so that a threshold of 0 is accepted.
my $minlen = shift;
die "Error: `minlen` parameter not provided\n" unless defined $minlen;
die "Error: `minlen` must be a non-negative number, got '$minlen'\n"
    unless looks_like_number($minlen) && $minlen >= 0;

{
    # Read one FASTA record at a time by using ">" as the input record
    # separator; `local` restores the previous value when the block ends.
    local $/ = ">";
    while (my $record = <>) {
        chomp $record;                  # strips the trailing ">" separator
        next unless $record =~ /\w/;    # skips the empty chunk before the first ">"

        # First line is the header; the remaining lines are the sequence.
        my (undef, @chunk) = split /\n/, $record;
        my $sequence = join "", @chunk;

        # Do not count carriage returns or stray whitespace as residues.
        $sequence =~ s/\s+//g;

        # The record itself is printed unchanged, with its ">" restored.
        print ">$record" if length($sequence) >= $minlen;
    }
}

Plot a dotplot with LAST

Rahul Nayak — Tue, 06 Mar 2018 09:21:18 -0600

# generate dotplot
# Build the LAST database. lastdb expects the database name first and the
# FASTA file(s) after it; the database is named "test/ref.fa" because that is
# the name the lastal calls below look up.
lastdb test/ref.fa test/ref.fa
# Align each contig set against the reference in TAB format and render the
# alignments as a PNG dotplot.
lastal -f TAB test/ref.fa test/contigs.reduced.pacbio.fa | last-dotplot - test/contigs.reduced.pacbio.fa.ref.png
lastal -f TAB test/ref.fa test/contigs.reduced.nanopore.fa | last-dotplot - test/contigs.reduced.nanopore.fa.ref.png

Download genomes in batch from NCBI

Rahul Nayak — Fri, 23 Feb 2018 08:52:03 -0600

# Write one "<assembly directory>/<assembly name>_genomic.fna.gz" URL per
# assembly to the file "genomic_file". This only builds the URL list; the
# files themselves still have to be fetched from it.
#  - awk -F '\t' sets the field separator before the first record is read
#    (assigning FS inside a main rule takes effect one line too late).
#  - Column 20 is the assembly directory; its last path component is reused
#    as the file-name prefix, whatever the URL scheme.
#  - Rows whose column 20 is "na" are skipped.
curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt' | awk -F '\t' '!/^#/ && $20 != "" && $20 != "na" {n = split($20, part, "/"); print $20 "/" part[n] "_genomic.fna.gz"}' > genomic_file

Estimate Genome Size

Rahul Nayak — Thu, 22 Feb 2018 03:28:26 -0600

# Count k-mer occurrence using Jellyfish 2.2.6
# The "?" is quoted so the shell passes it through literally instead of
# treating it as a glob character (zsh aborts on an unmatched glob).
jellyfish count -t 8 -C -m 19 -s 5G -o 19mer_out --min-qual-char='?' sread_1.fastq sread_2.fastq

# points for a histogram
jellyfish histo -o 19mer_out.histo 19mer_out

#Plot results using R
##load the data into dataframe19
dataframe19 <- read.table("19mer_out.histo")
##plots the data points 1 through 200 in the dataframe19 using a line
plot(dataframe19[1:200,], type="l")
##plot the data points from 2 through 100
points(dataframe19[2:100,])

#calculate the total k-mers in the distribution
#Rows 2..last are used; the row count is read from the data instead of
#assuming 9325. Each column is converted to double BEFORE multiplying so
#that large integer products cannot overflow to NA.
last_row <- nrow(dataframe19)
total_kmers <- sum(as.numeric(dataframe19[2:last_row,1]) * as.numeric(dataframe19[2:last_row,2]))
total_kmers

#peak position and genome size
#plotted graph we can get an idea where the peak position lies
#see actual point
#(the original subset `data`, which is a base R function, not the data frame)
dataframe19[10:20,] #If peak more likely to be between 10-20

#If 12 is the peak
total_kmers/12

#Compare the peak shape with Poisson distribution in R
singleC <- sum(as.numeric(dataframe19[2:28,1]) * as.numeric(dataframe19[2:28,2]))/12
poisdtb <- dpois(1:100,12)*singleC
plot(poisdtb, type='l', lty=2, col="green")
#NOTE(review): the original also ran
#  lines(dataframe19[1:100,12] * singleC, type = "l", col=3)
#which indexes column 12, while every other statement uses dataframe19 as a
#two-column table (columns 1 and 2). It is disabled here because selecting a
#column that does not exist raises an error; confirm what it was meant to draw.
lines(dataframe19[1:100,],type= "l") #overlay the observed histogram


#ALTERNATE  WAY
# Estimate the genome size with two scripts from the khmer project instead of
# Jellyfish + R. Both scripts must be present in the current directory.
# Step 1: run normalize-by-median.py on reads.fa, writing a report to report.txt
# (script source below).
#https://github.com/dib-lab/khmer/blob/master/scripts/normalize-by-median.py
python normalize-by-median.py -x 1e8 -k 20 -C 20 -R report.txt reads.fa
# Step 2: run estimate-genome-size.py on reads.fa.keep and report.txt with the
# same -C and -k values (reads.fa.keep is presumably the output of step 1 --
# confirm against the khmer documentation).
#https://github.com/dib-lab/khmer-recipes/blob/master/003-estimate-genome-size/estimate-genome-size.py
python estimate-genome-size.py -C 20 -k 20 reads.fa.keep report.txt

Plot the density of genes in R

Abhimanyu Singh — Fri, 02 Feb 2018 03:19:16 -0600

#column1 = chromosome name and column2 = start position of the gene

# Load ggplot2, installing it first if it is not available.
# requireNamespace() checks for one package directly instead of listing every
# installed package.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
    install.packages("ggplot2")
}
library(ggplot2)

# import a tab-separated text file with gene positions
# the header line must name the columns `chr` and `pos`
# (no end or gene name required)
genes <- read.table("genes.txt", sep = "\t", header = TRUE)

# fail early with a clear message if the expected columns are missing
missing_cols <- setdiff(c("chr", "pos"), names(genes))
if (length(missing_cols) > 0) {
    stop("genes.txt is missing column(s): ", paste(missing_cols, collapse = ", "))
}

# make sure the chromosomes are ordered in the way you want
# them to appear in the plot (chr1..chr22, chrX, chrY); any other
# chromosome name becomes NA
genes$chr <- with(genes, factor(chr, levels=paste("chr",c(1:22,"X","Y"),sep=""), ordered=TRUE))

# make a density plot of genes over the provided chromosomes (or scaffolds ...)
# one histogram per chromosome, 1 Mb bins, two facets per row
plottedGenes <- ggplot(genes) +
    geom_histogram(aes(x=pos), binwidth=1000000) +
    facet_wrap(~chr, ncol=2) +
    ggtitle("RefSeq genes density over human genome 19") +
    xlab("Genomic position (bins 1 Mb)") +
    ylab("Number of genes")

# save it to an image
png("genes.png",width=1000,height=1500)
print(plottedGenes)
dev.off()