BOL: All

Perl subroutine to create k-mers!

Rahul Nayak — Mon, 20 Jan 2020 04:32:32 -0600

# Split a sequence into all of its overlapping k-mers.
#
# Arguments:
#   $sequence - string to split
#   $k        - k-mer length
#
# Returns the substrings of length $k taken at every start offset, left to
# right (in scalar context: how many there are). The result is empty when
# $sequence or $k is undefined, when $k is smaller than 1, or when $k is
# longer than the sequence.
sub k_mers {
	my ($sequence, $k) = @_;
	my @result;

	return @result unless defined $sequence && defined $k;

	# substr() with a zero or negative length returns empty strings or
	# strings cut short from the end, which are not k-mers.
	return @result if $k < 1;

	# Last offset at which a full-length k-mer still fits; when this is
	# negative the range below is empty.
	my $last_start = length($sequence) - $k;
	@result = map { substr($sequence, $_, $k) } 0 .. $last_start;

	return @result;
}

Samtools commands for bioinformatician !

Shruti Paniwala — Sun, 15 Dec 2019 10:31:22 -0600

## count mapped reads
samtools view -c -F 260 mapping_file.bam


### converting sam file into fasta
samtools fasta reads_mapped.sam > reads.fasta

### converting sam file into bam
# -b : output is bam
# -S : input is sam
# -o : output file name
samtools view -b -S -o sal_sej.bam sal_sej.sam

### viewing bam files (view command)
samtools view sal_sej.bam | less


### filter reads by the specified flag and show them
# -f INT : show only reads that have all of the flag bits in INT set
samtools view -f 4 sal_sej.bam | less

# -F INT : show only reads that have none of the flag bits in INT set
samtools view -F 4 sal_sej.bam | less


### count reads (-c) by flag specified
samtools view -c -f 4 sal_sej.bam


### count reads by quality value specified (-q) (>=)
# -q : minimal quality value
samtools view -q 42 -c sal_sej.bam



### sorting bam file by genome position
samtools sort sal_sej.bam > sal_sej_sorted.bam

### indexing sorted file
# output is file.bai
# always index sorted files
samtools index sal_sej_sorted.bam

### identifying genome variants (mpileup command)
# -g : output is bcf (binary call format) file
# -f : use reference genome given
samtools mpileup -g -f sal_ref_sej.fasta sal_sej_sorted.bam > sal_vars.bcf


### calling snp and indels
# -c : find snp
# -v : output only potential variants
bcftools view -c -v sal_vars.bcf > sal_vars.vcf
### note: new command is "call". type "bcftools" to see current version and commands

### calling snp and indels with no frequency threshold
# omit -v parameter
bcftools call -c sal_vars.bcf > sal_vars.vcf

### normalize (realign) indels
# -f : reference fasta, needed to left align and normalize
bcftools norm -f ./input_data/ref.fasta \
vars.vcf \
-o vars_indels_realigned.vcf

### show alignment
# first arg is sorted bam file
# second arg is reference 
samtools tview sal_sej_sorted.bam sal_ref_sej.fasta

### results into txt file
samtools depth /path/to/sorted_bam.bam > /path/to/coverage_results.txt
### note: returns count of depth at each position. input should be sorted

### count depth at each position and put it into a txt file
# -a : at all positions
samtools depth -a sorted_dupremoved.bam > depth.txt

### one liner to count mean depth
samtools depth -a sorted_dupremoved.bam | awk '{c++;s+=$3}END{print s/c}'

### one liner to count coverage breadth
samtools depth -a sorted_dupremoved.bam | awk '{c++; if($3>0) total+=1}END{print (total/c)*100}'

### flagstat
# output file has two columns: QC-passed reads and QC-failed reads
# and rows: total reads, duplicates, mapped.
samtools flagstat sorted_dupremoved.bam > flagstat.txt

####################################
# try these recipes:
### convert BAM to FASTA:
samtools view filename.bam | awk '{OFS="\t"; print ">"$1"\n"$10}' - > filename.fasta

## converting BAM to SAM:
samtools view -h -o out.sam in.bam
## or, writing to stdout: "samtools view -h in.bam > out.sam"

QV calculation in Bash !

Shruti Paniwala — Fri, 13 Dec 2019 21:14:16 -0600

# Estimate a quality value (QV) from variant calls and read depth:
#   QV = -10 * log10(NUM_SNP / NUM_BP)
#
# $1 = vcf file
# $2 = input bam file
# $3 = output QV file

if [ "$#" -ne 3 ]; then
    echo "usage: $0 <vcf file> <input bam file> <output QV file>" >&2
    exit 1
fi

module load samtools

# Count the lines of "samtools depth" output that have at least three
# tab-separated fields, i.e. the number of positions reported.
NUM_BP=$(samtools depth "$2" | perl -e '$c = 0; while(<>){chomp; @s = split(/\t/); if(scalar(@s) >= 3){$c++;}} print "$c\n";')
echo "num bp: "$NUM_BP

# Sum the affected bases over the VCF records:
#   - drop every line containing "#" and every record whose last column
#     matches "0/1"
#   - keep columns 1-5 and 8, split column 8 on ';' and strip the "AB=" tag
#   - keep records whose 6th field is >= WEIGHT (0)
#   - add length(column 4) when columns 4 and 5 have equal length, otherwise
#     add the absolute length difference between them
NUM_SNP=$(grep -v "#" "$1" | awk -F "\t" '{if (!match($NF, "0/1")) print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$8}' | tr ';' ' ' | sed s/AB=//g | awk -v WEIGHT=0 '{if ($6 >= WEIGHT) print $0}' | awk -v SUM=0 '{if (length($4) == length($5)) { SUM+=length($4); } else if (length($4) < length($5)) { SUM+=length($5)-length($4); } else { SUM+=length($4)-length($5)}} END { print SUM}')
echo "num snp: "$NUM_SNP

# log(0) and division by zero both abort the perl one-liner below, so stop
# with a clear message before the output file is truncated.
if [ "${NUM_BP:-0}" -eq 0 ] || [ "${NUM_SNP:-0}" -eq 0 ]; then
    echo "cannot compute QV: num bp and num snp must both be non-zero" >&2
    exit 1
fi

perl -e 'chomp(@ARGV); $ns = $ARGV[0]; $nb = $ARGV[1]; print (-10 * log($ns/$nb)/log(10)); print "\n";' "$NUM_SNP" "$NUM_BP" > "$3"
cat "$3"

Find and replace in multi-FASTA or FASTA headers with a Perl one-liner

BioStar — Mon, 02 Dec 2019 20:45:42 -0600

You have a fasta file and you want to replace: 
"|"

You are told to replace that by   
"_"

perl -i -p -e "s/\|/_/g"  genome.fasta

 
-i = inplace editing
-p = loop over lines and print each line (after processing)
-e = command line script

Perl script to run in parallel!

LEGE — Sun, 22 Sep 2019 22:08:20 -0500

#!/usr/bin/perl

use strict;
use warnings;
use Parallel::ForkManager;
use Bio::SeqIO;

# Driver: load every sequence from the FASTA file named on the command line,
# then spread the per-sequence runs over a fixed number of forked workers.
my $genome_file = $ARGV[0];
die "usage: $0 <genome.fasta>\n" unless defined $genome_file;

# parse_genome_files() returns a reference to a hash keyed by sequence id.
my ($sequence_data_ref) = parse_genome_files($genome_file);
my %genome = %{$sequence_data_ref};

# Fix the order once in the parent so every worker slices the same list.
my @chr_sets = sort keys %genome;

my $n_processes = 4;
my $pm = Parallel::ForkManager->new( $n_processes );
for my $i ( 1 .. $n_processes ) {
    $pm->start and next;

    # Worker $i takes entries $i-1, $i-1+$n_processes, ... so that each
    # sequence is handled by exactly one worker. (Testing $count % $i
    # instead would hand every sequence to worker 1, every second one to
    # worker 2, and so on.)
    for ( my $idx = $i - 1; $idx < @chr_sets; $idx += $n_processes ) {
        my $chr_set = $chr_sets[$idx];

        # Skip sequences whose output file already exists.
        if ( !output_exists($genome{$chr_set}{name}) ) {
            start_new_XFOIL_instance($genome{$chr_set}{name}, $genome{$chr_set}{nuc_seq});
        }
    }

    $pm->finish;
}
$pm->wait_all_children;

# Tell whether the output file for a sequence is already present.
#
# Takes the base name of the sequence and returns the result of the -f
# (plain file) test on "<base name>.out".
sub output_exists {
    my ($chr_set) = @_;
    my $out_file = "$chr_set.out";
    return -f $out_file;
}

# Run the job for one sequence and log its start and end on standard output.
#
# Arguments (positional):
#   1. base name of the sequence; the output goes to "<base name>.out"
#   2. the sequence itself, handed on to touch() unchanged
#
# The actual work is delegated to touch().
sub start_new_XFOIL_instance {
    my $chr_set = shift;
    my $chr_seq = shift;

    print "starting XFOIL instance with parameters $chr_set!\n";

    touch( "$chr_set.out", $chr_seq );

    print "finished run with parameters $chr_set!\n";
}

# Create the output file for one sequence and run augustus on that sequence.
#
# Arguments:
#   $fn  - path of the output file (passed to augustus as --outfile)
#   $seq - nucleotide sequence to annotate
#
# Dies when the output file or the temporary query file cannot be written.
# A failing augustus run is reported with a warning only, so the calling
# worker can carry on with its remaining sequences. Returns 1.
sub touch {
    my ($fn, $seq) = @_;

    # Create (or truncate) the output file before the long-running job
    # starts, so the file exists as soon as the job has been picked up.
    open my $out_fh, '>', $fn or die "open $fn: $!";
    close $out_fh or die "close $fn: $!";

    # augustus takes the name of a FASTA file as its query, not the sequence
    # text, so spool the sequence to a temporary FASTA file. The file is
    # removed when $query goes out of scope.
    require File::Temp;
    my $query = File::Temp->new( SUFFIX => '.fa' );
    ( my $seq_id = $fn ) =~ s/\.out\z//;    # header: output name minus ".out"
    print {$query} ">$seq_id\n$seq\n" or die "write " . $query->filename . ": $!";
    close $query or die "close " . $query->filename . ": $!";

    # List form keeps file names away from the shell.
    my @cmd = (
        'augustus',
        '--species=caenorhabditis',
        "--outfile=$fn",
        $query->filename,
        '--AUGUSTUS_CONFIG_PATH=/home/urbe/Tools/Alienomics_v1.1/augustus.2.5.5/config',
    );
    system(@cmd) == 0
        or warn "augustus failed for $fn (wait status $?)\n";

    return 1;
}

# Read every record of a FASTA file into a hash and copy the accepted
# records to "genomeRES.fa" in the current directory.
#
# Argument:
#   $file - path of the FASTA file to read
#
# Returns a reference to a hash keyed by the record's display id. Each value
# is a hash with these keys:
#   status       - "OK" (no rejection checks are implemented yet)
#   problem_desc - "-"
#   nuc_seq      - the sequence string
#   len          - the sequence length
#   gc           - percentage of G/C characters (0 for an empty sequence)
#   name         - the display id again
sub parse_genome_files {
    my $file = shift;
    my %sequence_data;

    my $file_content = Bio::SeqIO->new( -format => 'fasta', -file => $file );
    my $out_content  = Bio::SeqIO->newFh( -format => 'fasta', -file => ">genomeRES.fa" );

    while ( my $gene_info = $file_content->next_seq() ) {
        my $sequence = $gene_info->seq();
        $sequence = '' unless defined $sequence;
        my $accession_number = $gene_info->display_id;
        my $len              = $gene_info->length;

        # tr/// takes a plain character list, so a '|' in it would be
        # counted as well; list only the four characters wanted.
        my $GCcount = ( $sequence =~ tr/GCgc// );

        # An empty record must not divide by zero.
        my $GCcontent = $len ? ( $GCcount / $len ) * 100 : 0;

        $sequence_data{$accession_number}{status}       = "OK";    # everybody starts fine
        $sequence_data{$accession_number}{problem_desc} = "-";     # everybody starts fine

        if ( $sequence_data{$accession_number}{status} eq "OK" ) {    # Add check points here <<<<<<
            $sequence_data{$accession_number}{nuc_seq} = $sequence;
            $sequence_data{$accession_number}{len}     = $len;
            $sequence_data{$accession_number}{gc}      = $GCcontent;
            $sequence_data{$accession_number}{name}    = $accession_number;
            print $out_content $gene_info;
        }
    }

    return ( \%sequence_data );
}

Run sspace !

Jit — Sat, 17 Aug 2019 15:06:38 -0500

#!/bin/bash

cd `pwd`

perl ~/apps/SSPACE-1.2_linux-x86_64/SSPACE_v1-2.pl \
-l libraries.txt \
-s Contigs_over200_nocp.fasta \
-k 5 \
-a 0.7 \
-x 1 \
-m 30 \
-o 20 \
-b Rayk31_scaffolds_extension

Map the long reads

Jit — Thu, 15 Aug 2019 01:05:25 -0500

Map them against the reference avaga genome using the following commands:
git clone https://github.com/lh3/bwa.git
cd bwa; make
bwa index ref.fa
bwa mem -x pacbio ref.fa pacbio.fq > aln.sam
bwa mem -x ont2d ref.fa ont-2D.fq > aln.sam

Convert FASTQ to FASTA

Abhimanyu Singh — Thu, 15 Aug 2019 00:33:31 -0500

# Convert FASTQ to FASTA
seqtk seq -a IN.fastq > OUT.fasta

# Convert FASTQ to FASTA and set bases of quality lower than 20 to N
seqtk seq -aQ64 -q20 -n N IN.fastq > OUT.fasta


# Download Seqtk
https://github.com/lh3/seqtk

Resume the MIRA assembler run !

Jit — Mon, 05 Aug 2019 22:55:27 -0500

mira -r manifest_file

Usage:
mira [options] manifest_file [manifest_file ...]

Options:
  -c / --cwd=           directory       Change working directory
  -r / --resume                         Resume an interupted assembly
  -h / --help                           Print short help and exit
  -v / --version                        Print version and exit

Palindrome Simulation commands !

Jit — Sun, 04 Aug 2019 19:24:42 -0500

(base) ➜  palindromeAssemblySim more allCommands 
 3315  mutate.sh in= mutant15CH101.fasta out=mutant153CH101.fasta id=97
 3316  mutate.sh in=mutant15CH101.fasta out=mutant153CH101.fasta id=97
 3317  history > allCommands
 3318  cat ch101_read33155_template_pass_FAH31515.faa mutant3CH101.fasta mutant15CH101.fasta mutant153CH101.fasta > allPalindromeSimulated.fa
 3319  ~/Tools/art_bin_MountRainier/art_illumina -ss MSv3 -sam -i allPalindromeSimulated.fa -p -l 251 -f 100 -m 300 -s 10 -o paired_dat
 3320  ~/Tools/art_bin_MountRainier/art_illumina -ss MSv3 -sam -i allPalindromeSimulated.fa -p -l 250 -f 100 -m 300 -s 10 -o paired_dat
 3321  ~/Tools/seqtk/seqtk mergepe paired_dat1.fq paired_dat2.fq > interleavedPE.fq
 3322  ~/Tools/seqtk/seqtk seq -a interleavedPE.fq > interleavedPE.fasta
 3323  ~/Tools/BWISE/Bwise.py -c 0 -p 4 -K 250 -t 10 -o simPalBwise -x interleavedPE.fasta
 3326  cd masurca; ~/Tools/MaSuRCA-3.2.3/bin/masurca sim.conf
 3327  ./assemble.sh
 3328  cd ..
 3329  ~/Tools/SPAdes-3.10.1-Linux/bin/spades.py --careful -1 paired_dat1.fq -2 paired_dat2.fq -o SpadesAssembly_OUT
 3330  ~/Tools/platanus/platanus
 3331  ~/Tools/platanus/platanus assemble 
 3332  ~/Tools/platanus/platanus assemble -o seePlatanus -f paired_dat[12].fq -t 10