BOL: Owner

Install R in Ubuntu / Linux !

Abhi — Wed, 02 Feb 2022 02:34:51 -0600

#R is a feature-rich interpreted programming language originally released in 1995. It is heavily used in the bioinformatics community, largely due to the numerous R libraries available on Bioconductor. It takes several minutes to compile, so we’ll use one which has already been set up. If we were to install R, we would first need to download and extract the source code. Next we’d configure the installation with --with-x=no, which tells R to install without X11, a windowing system for displays. We’d also specify --prefix, which is where the R framework will go; this includes the additional R libraries we’ll download later. From there we’d run make and make install to build the software, copy the files to their proper location, and create symlinks for the executables. Finally we’d install the devtools and BiocManager packages from the command line to make installing additional packages easier. We’ve commented out the code below; however, it is exactly what was run to set up the R we will be using, except for the installation location.

## download and extract the R 3.5.1 source tarball from CRAN
cd ~/workspace/bin
wget https://cran.r-project.org/src/base/R-3/R-3.5.1.tar.gz
tar -zxvf R-3.5.1.tar.gz

## configure the installation, build the code
##   --prefix     installation root used by "make install"; $HOME is used
##                instead of a hard-coded /home/ubuntu so the steps work for
##                any user account, like the ~ paths in the other commands
##   --with-x=no  build without X11 support
cd R-3.5.1
./configure --prefix="$HOME/workspace/bin" --with-x=no
make
make install

## make symlinks
## (-f replaces a link left over from an earlier run, so this step can be
##  repeated without "File exists" errors)
ln -sf ~/workspace/bin/R-3.5.1/bin/Rscript ~/workspace/bin/Rscript
ln -sf ~/workspace/bin/R-3.5.1/bin/R ~/workspace/bin/R

## test installation
cd ~/workspace/bin
~/workspace/bin/Rscript --version

## install additional packages
## (--vanilla starts R without saved workspaces or user startup files)
~/workspace/bin/R --vanilla -e 'install.packages(c("devtools", "BiocManager", "dplyr", "tidyr", "ggplot2"), repos="http://cran.us.r-project.org")'

Install Gffcompare on Ubuntu / Linux

Abhi — Wed, 02 Feb 2022 02:34:18 -0600

#Gffcompare is a program that is used to perform operations on general feature format (GFF) and general transfer format (GTF) files. It has a binary distribution compatible with the linux we’re using so we will just download, extract, and make a symlink.

# release being installed and the directory that holds our tools
GFFCOMPARE_RELEASE=gffcompare-0.9.8.Linux_x86_64
TOOLS_DIR=~/workspace/bin

# fetch the release archive and unpack it inside the tools directory
cd "$TOOLS_DIR"
wget "http://ccb.jhu.edu/software/stringtie/dl/${GFFCOMPARE_RELEASE}.tar.gz"
tar -xzvf "${GFFCOMPARE_RELEASE}.tar.gz"

# expose the executable directly under the tools directory
ln -s "$TOOLS_DIR/$GFFCOMPARE_RELEASE/gffcompare" "$TOOLS_DIR/gffcompare"

# run it once to confirm the installation works
"$TOOLS_DIR/gffcompare"

Extract the sequences with IDs !

Abhi — Wed, 05 Jan 2022 07:29:47 -0600

# Optional clean-up of the ID list, left disabled: it would replace every
# underscore in the file with a space, editing the file in place.
#sed -i 's/\_/ /g' Delta_seqID_from_lineage_report.txt
# Extract from genomic.fna the sequences whose names are listed in the ID file.
seqtk subseq genomic.fna Delta_seqID_from_lineage_report.txt > Delta.fasta

# Split Delta.fasta into 11 equally sized sequence subsets.
pyfasta split -n 11 Delta.fasta

Perl script to rename the fasta file !

Abhi — Wed, 29 Dec 2021 07:32:58 -0600

#Script #1
#!/usr/bin/perl -w
use strict;
use warnings;

# Rename the sequence lines of a multi-line ("k-mer") FASTA file.
#
# Each record of the input is a header line followed by one or more sequence
# lines.  Every sequence line is written to STDOUT as its own FASTA record
# named "<first word of the header>.<line number within the record>".
#
#USAGE
#perl extractPattern.pl kmerfasta > uniref100_result_broad

my $infile2 = $ARGV[0];    # Kmer fasta
defined $infile2
    or die "USAGE: perl extractPattern.pl kmerfasta > uniref100_result_broad\n";

open( my $fh2, '<', $infile2 ) or die "Cannot open '$infile2': $!\n";

{
    # Use '>' as the input record separator so each read returns one whole
    # FASTA record; local keeps the change confined to this block.
    local $/ = '>';

    while ( my $record = <$fh2> ) {
        chomp $record;    # with $/ = '>' this strips the trailing '>'

        # Element 0 is the header line, the rest are sequence lines.
        my @allVal = split /\n/, $record;
        next unless @allVal;    # the text before the first '>' is empty

        # Only the first whitespace-delimited word of the header is kept.
        my ($seq_id) = split ' ', $allVal[0];
        next unless defined $seq_id;

        for my $j ( 1 .. $#allVal ) {
            next if $allVal[$j] eq '';    # ignore blank lines inside a record
            print ">$seq_id.$j\n$allVal[$j]\n";
        }
    }
}

close $fh2 or die "Cannot close '$infile2': $!\n";

Update the Linux OS !

Abhi — Mon, 27 Dec 2021 06:35:22 -0600

#To update the linux OS -- run the following
#(the steps are chained with && so that a failing step stops the chain;
# with ';' the upgrades would still run after a failed "apt-get update")

sudo -- sh -c 'apt-get update && apt-get upgrade -y && apt-get dist-upgrade -y && apt-get autoremove -y && apt-get autoclean -y'

#OR
#(shorter variant: refresh the package index, then upgrade with the usual
# confirmation prompt)

sudo apt-get update && sudo apt-get upgrade

Rules to run fastp / Snakemake !

Abhi — Tue, 16 Nov 2021 01:03:22 -0600

# Run fastp on the paired-end reads of one sample.
# Reads <INPUT>/{sample}-read_1.fq and -read_2.fq, writes the two trimmed
# FASTQ files plus fastp's HTML and JSON reports under
# <RESULTS>/fastq_trimmed/, and redirects fastp's stderr to the per-sample log.
rule fastp:
    input:
        fwd=INPUT + "/{sample}-read_1.fq",
        rev=INPUT + "/{sample}-read_2.fq",
    output:
        fwd=RESULTS + "/fastq_trimmed/{sample}.1.trimmed.fastq",
        rev=RESULTS + "/fastq_trimmed/{sample}.2.trimmed.fastq",
        html=RESULTS + "/fastq_trimmed/{sample}.html",
        json=RESULTS + "/fastq_trimmed/{sample}.json",
    log:
        RESULTS + "/logs/fastp/{sample}.preprocess.log",
    threads: 5
    shell:
        # The adjacent string literals are concatenated into one command line.
        "fastp "
        "--in1 {input.fwd} --in2 {input.rev} "
        "--out1 {output.fwd} --out2 {output.rev} "
        "--thread {threads} --cut_tail "
        "--html {output.html} --json {output.json} "
        "2> {log}"

Installing mandatory software for website hosting !

Abhi — Mon, 01 Nov 2021 04:35:23 -0500

#See all the installed packages
apt list --installed
#See every package apt knows about
apt list

#Installing Apache
#To install Apache, install the latest meta-package apache2 by running:

sudo apt update
sudo apt install apache2

#Install the mysql-server, then show the state of its service
sudo apt install mysql-server
sudo systemctl status mysql

#Remove PHP and all its libraries and files
#(the pattern is quoted so the shell hands it to apt-get untouched instead of
# expanding it against file names in the current directory)
sudo apt-get purge 'php7.*'

#Install PHP again
sudo apt-get install php7.2

#Install phpMyAdmin together with the PHP extensions listed alongside it
sudo apt install phpmyadmin php-mbstring php-gettext

Installing mandatory software for website hosting !

Abhi — Mon, 01 Nov 2021 04:35:11 -0500

#See all the installed packages
apt list --installed
#See every package apt knows about
apt list

#Installing Apache
#To install Apache, install the latest meta-package apache2 by running:

sudo apt update
sudo apt install apache2

#Install the mysql-server, then show the state of its service
sudo apt install mysql-server
sudo systemctl status mysql

#Remove PHP and all its libraries and files
#(the pattern is quoted so the shell hands it to apt-get untouched instead of
# expanding it against file names in the current directory)
sudo apt-get purge 'php7.*'

#Install PHP again
sudo apt-get install php7.2

Remove Apache2 from Linux !

Abhi — Mon, 01 Nov 2021 03:53:23 -0500

#Purge it (removes the package together with its configuration files)
sudo apt-get purge apache2
#Auto remove dependencies that are no longer needed
sudo apt-get autoremove
#Remove the configuration directory if anything is left behind
sudo rm -rf /etc/apache2

#Try this
#(the pattern is quoted so the shell does not expand it against file names in
# the current directory before apt sees it)
sudo apt remove 'apache2.*'

Bash script to simulate a genome !

Abhi — Sat, 30 Oct 2021 13:50:47 -0500

# Reference https://github.com/chhylp123/hifiasm/issues/33
# Part 1 of the simulation: download a haploid assembly, upper-case it, derive
# a mutated "diploid" version of it and extract a single haplotype again.

# Use Drosophila melanogaster PacBio assembly
cd /genetics/elbers/test/fly2
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/401/745/GCA_003401745.1_ASM340174v1/GCA_003401745.1_ASM340174v1_genomic.fna.gz




# Use BCFtools 1.10.2 and SAMtools 1.10
# (provided by the conda environment activated here)
conda activate bcftools1.10.2




# Use Seqtk to convert soft-masked bases to upper-case bases, also compress with bgzip
# https://github.com/lh3/seqtk
# (-@75 asks bgzip for 75 compression threads)
/genetics/elbers/bin/seqtk/seqtk seq -U \
GCA_003401745.1_ASM340174v1_genomic.fna.gz | \
bgzip -@75 > GCA_003401745.1_ASM340174v1_genomic.fna_upper.fasta.gz




# Convert to diploid with approximately 2% heterozygosity rate, max indels=20bp
# mutate.sh part of BBTools/BBMap https://sourceforge.net/projects/bbmap/
# subrate + indelrate = 0.0192 + 0.001 = 0.0202, i.e. the ~2% target.
# The mutations are also written to a VCF; stderr goes to the .log.txt file.
/genetics/elbers/bbmap-38.86/mutate.sh \
in=GCA_003401745.1_ASM340174v1_genomic.fna_upper.fasta.gz \
ow=t \
vcf=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.vcf.gz \
out=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
ziplevel=6 \
ploidy=2 \
subrate=0.0192 \
indelrate=0.001 \
maxindel=20 \
nohomopolymers=t \
hetrate=1 2> GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.log.txt

# genome size
# NOTE(review): wc -m counts every character of the non-header lines,
# including the newline that ends each line, so the figure below is somewhat
# larger than the number of bases -- confirm whether that matters here.
bgzip -@75 -cd GCA_003401745.1_ASM340174v1_genomic.fna_upper.fasta.gz | \
grep -v ">"|wc -m
# 140687135

# 2% heterozygosity is how many bases
# (calc is an external command-line calculator; the * is escaped so the shell
#  does not treat it as a glob)
calc 0.02\*140687135
# 0.02*140687135 = 2813742.700000

# number of mutations added
# (counts the non-header records of the VCF written by mutate.sh)
bgzip -@75 -cd GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.vcf.gz| \
grep -v "^#"|wc -l
# 2830139

# ~2% het rate
# (2830139 / 140687135 is about 0.0201)




# get only 1 haplotype from the "diploid" reference
# paste - - joins each header with the line after it into one tab-separated
# row, grep keeps the rows whose header ends in "haplo_0", and tr turns each
# row back into two lines.  This relies on the first seqtk call emitting every
# sequence on a single line.
# NOTE(review): seqtk documents -L as "drop sequences shorter than INT" and
# -l as the line width.  If the intent was single-line output followed by
# re-wrapping at 60 columns, these were presumably meant to be -l0 / -l60;
# as written, -L60 would drop sequences shorter than 60 bp -- confirm.
bgzip -@75 -dc GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz|\
/genetics/elbers/bin/seqtk/seqtk seq -L0|paste - - |grep -P "haplo_0\t"| \
tr '\t' '\n' |\
/genetics/elbers/bin/seqtk/seqtk seq -L60 |\
bgzip -@75 \
> GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.haplotype0.fasta.gz




# make reference for randomreads.sh
# randomreads.sh part of BBTools/BBMap https://sourceforge.net/projects/bbmap/
# Only 100 reads are requested and they are discarded (out=/dev/null); going
# by the heading, this run exists to prepare the build=1 reference that the
# later randomreads.sh calls use.
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
seed=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
pacbio=t pbmin=0.13 pbmax=0.17 \
reads=100 paired=f \
gaussianlength=t \
minlength=1000 midlength=20000 maxlength=100000 \
out=/dev/null




# make 60x haploid coverage for Illumina reads
# NOTE(review): coverage=30 is requested against the diploid reference, which
# holds both haplotypes; presumably that is why this is described as 60x of
# the haploid genome -- confirm.
# Paired-end reads with insert sizes between 450 and 550; stdout and stderr
# both go to random_reads_illumina.log.
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
coverage=30 paired=t maxinsert=550 mininsert=450 \
out1=illumina1.fastq.gz out2=illumina2.fastq.gz > random_reads_illumina.log 2>&1




# interleave the paired-end reads
# reformat.sh part of BBTools/BBMap https://sourceforge.net/projects/bbmap/
# (writes one uncompressed file, illumina.int.fastq; stderr is discarded)
/genetics/elbers/bbmap-38.86/reformat.sh \
in=illumina1.fastq.gz in2=illumina2.fastq.gz out=illumina.int.fastq 2>/dev/null




# use KmerGenie 1.7051 to get an idea of the k-mer size that produces longest N50
# http://kmergenie.bx.psu.edu/
mkdir -p /genetics/elbers/test/fly2/kmergenie-illumina-raw-reads

# run KmerGenie from its own output directory; everything it prints is
# captured in the log, and the interleaved FASTQ is deleted afterwards
cd /genetics/elbers/test/fly2/kmergenie-illumina-raw-reads
/genetics/elbers/kmergenie-1.7051/kmergenie ../illumina.int.fastq \
> kmergenie-illumina-raw-reads.log 2>&1
rm ../illumina.int.fastq

# take the number from the log line that starts with "best k:" and keep it in k
k=`grep "^best k:" \
kmergenie-illumina-raw-reads.log | grep -Po "\d+"` 
echo "best k=${k}"




# make 30x haploid coverage for PacBio CLR reads
# error rate from 13 - 15 % minimum 1000bp midlength 20000bp maximum 30000bp
# NOTE(review): coverage=15 is requested against the diploid reference, which
# holds both haplotypes; presumably that is why this is described as 30x of
# the haploid genome -- confirm.
# Back to the working directory that holds the reference files.
cd /genetics/elbers/test/fly2

# single-end reads (paired=f), existing output overwritten (ow=t), fixed seed;
# stdout and stderr both go to random_reads_pacbio.log
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
ow=t seed=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
pacbio=t pbmin=0.13 pbmax=0.15 \
coverage=15 paired=f \
gaussianlength=t \
minlength=1000 midlength=20000 maxlength=30000 \
out=pacbio.fastq.gz > random_reads_pacbio.log 2>&1



# make 30x haploid coverage for PacBio reads for Hifi reads
# error rate from 1 - 0.1 % minimum 9000bp midlength 10000bp max 12000bp
# (same call as for the CLR reads, with the lower error range
#  pbmin=0.001 / pbmax=0.01 and the narrower 9000-12000 bp length window)
/genetics/elbers/bbmap-38.86/randomreads.sh build=1 \
ow=t seed=1 \
ref=GCA_003401745.1_ASM340174v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t \
pacbio=t pbmin=0.001 pbmax=0.01 \
coverage=15 paired=f \
gaussianlength=t \
minlength=9000 midlength=10000 maxlength=12000 \
out=hifi.fastq.gz > random_reads_pacbio_hifi.log 2>&1