# Count k-mer occurrence using Jellyfish 2.2.6
jellyfish count -t 8 -C -m 19 -s 5G -o 19mer_out --min-qual-char=? sread_1.fastq sread_2.fastq
# points for a histogram
jellyfish histo -o 19mer_out.histo 19mer_out
#Plot results using R
##load the data into dataframe19
dataframe19 <- read.table("19mer_out.histo")
##plots the data points 1 through 200 in the dataframe19 using a line
plot(dataframe19[1:200,], type="l")
##plot the data points from 2 through 100
points(dataframe19[2:100,])
#calculate the total k-mers in the distribution
#Assuming the total number of data points is 9325
sum(as.numeric(dataframe19[2:9325,1]*dataframe19[2:9325,2]))
#peak position and genome size
#plotted graph we can get an idea where the peak position lies
#see actual point
data[10:20,] #If peak more likely to be between 10-20
#If 12 is the peak
sum(as.numeric(dataframe19[2:9325,1]*dataframe19[2:9325,2]))/12
#Compare the peak shape with Poisson distribution in R
singleC <- sum(as.numeric(dataframe19[2:28,1]*dataframe19[2:28,2]))/12
poisdtb <- dpois(1:100,12)*singleC
plot(poisdtb, type='l', lty=2, col="green")
lines(dataframe19[1:100,12] * singleC, type = "l", col=3)#, Ity=2)
lines(dataframe19[1:100,],type= "l")
#ALTERNATE WAY
#https://github.com/dib-lab/khmer/blob/master/scripts/normalize-by-median.py
python normalize-by-median.py -x 1e8 -k 20 -C 20 -R report.txt reads.fa
#https://github.com/dib-lab/khmer-recipes/blob/master/003-estimate-genome-size/estimate-genome-size.py
python estimate-genome-size.py -C 20 -k 20 reads.fa.keep report.txt
Comments
# count k-mers (see jellyfish documentation for options)
gzip -dc reads1.fastq.gz reads2.fastq.gz | jellyfish count -m 31 -o fastq.counts -C -s 10000000000 -U 500 -t 30 /dev/fd/0
# generate a histogram
jellyfish histo fastq.counts_0 > fastq.counts_0.histo
# generate a pdf graph of the histogram
jellyplot.pl fastq.counts_0.histo
# look at fastq.counts_0.histo.pdf and identify the approximate peak
# use find_valleys.pl to help pinpoint the actual peak
find_valleys.pl fastq.counts_0.histo
# estimate the size and coverage
estimate_genome_size.pl --kmer=31 --peak=42 --fastq=reads1.fastq.gz reads2.fastq.gz