BOL: All

R script for Circos plot !

Abhi — Tue, 11 Jul 2023 01:41:03 -0500

#!/usr/bin/env Rscript
library(RCircos)

# usage: Rscript make_circos.r <sv.file> <sample.name> <gene.label.file> <cnv.file> <out.file>
#
# Draws one RCircos PNG for a sample: chromosome ideogram, structural-variant
# links, gene labels for genes overlapping a breakpoint, and a CNV heatmap.
#
#   sv.file          comma-separated table with a header; the columns
#                    Chromosome, chromStart, chromEnd, Chromosome.1,
#                    chromStart.1 and chromEnd.1 are used
#   sample.name      plot title and prefix of the temporary BED file
#   gene.label.file  file handed to `bedtools intersect -b`; its columns are
#                    named Chromosome, chromStart, chromEnd, gene below, so it
#                    is assumed to have exactly four columns -- TODO confirm
#   cnv.file         CSV whose columns are renamed to Chromosome, chromStart,
#                    chromEnd, cnv; "chr" is prepended to the chromosome
#   out.file         path of the PNG to write
#
# Requires `bedtools` on the PATH.

# parse args
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 5) {
  stop(
    "usage: Rscript make_circos.r <sv.file> <sample.name> ",
    "<gene.label.file> <cnv.file> <out.file>",
    call. = FALSE
  )
}
sv.file <- args[1]
sample.name <- args[2]
gene.label.file <- args[3]
cnv.file <- args[4]
out.file <- args[5]
# TMP <- Sys.getenv("TMP_DIR")
# tmp.bed = paste0(TMP ,"/" , sample.name, "_bkpts.bed")
tmp.bed <- paste0(sample.name, "_bkpts.bed")

# load prereq data
data(UCSC.HG19.Human.CytoBandIdeogram)

# set core parameters
chr.exclude <- NULL
cyto.info <- UCSC.HG19.Human.CytoBandIdeogram
tracks.inside <- 10
tracks.outside <- 5
RCircos.Set.Core.Components(cyto.info, chr.exclude, tracks.inside, tracks.outside)
rcircos.params <- RCircos.Get.Plot.Parameters()
rcircos.params$text.size <- 1
RCircos.Reset.Plot.Parameters(rcircos.params)
# Fetched as in the original script; neither object is used further down.
rcircos.cyto <- RCircos.Get.Plot.Ideogram()
rcircos.position <- RCircos.Get.Plot.Positions()
RCircos.List.Plot.Parameters()

# A missing or unreadable SV file is deliberately treated as "no events" so
# that an ideogram-only plot is still produced.
link.data <- tryCatch(
  read.table(sv.file, sep = ",", stringsAsFactors = FALSE, header = TRUE),
  error = function(e) data.frame()
)
has.links <- nrow(link.data) != 0

# Stays empty unless at least one gene overlaps a breakpoint.
gene.labels <- data.frame()

if (has.links) {
  link.data <- transform(link.data,
                         chromStart = as.numeric(chromStart),
                         chromEnd = as.numeric(chromEnd),
                         chromStart.1 = as.numeric(chromStart.1),
                         chromEnd.1 = as.numeric(chromEnd.1))

  # write a bed file of all breakpoints to intersect with gene label table
  bkpts.1 <- link.data[c("Chromosome", "chromStart", "chromEnd")]
  bkpts.2 <- link.data[c("Chromosome.1", "chromStart.1", "chromEnd.1")]
  colnames(bkpts.2) <- colnames(bkpts.1)
  bkpts <- rbind(bkpts.1, bkpts.2)
  # The coordinates are doubles at this point; write.table would emit round
  # values such as 100000 as "1e+05", which is not a valid BED coordinate.
  bkpts$chromStart <- format(bkpts$chromStart, scientific = FALSE, trim = TRUE)
  bkpts$chromEnd <- format(bkpts$chromEnd, scientific = FALSE, trim = TRUE)
  write.table(bkpts, tmp.bed, sep = "\t", quote = FALSE,
              col.names = FALSE, row.names = FALSE)

  # only keep labels that fall within an event
  # Paths are shell-quoted so names containing spaces or metacharacters work.
  intersect.cmd <- paste("bedtools intersect -wb -a", shQuote(tmp.bed),
                         "-b", shQuote(gene.label.file))
  print(intersect.cmd)
  hits <- system(intersect.cmd, intern = TRUE)

  # remove intermediate file as soon as bedtools is done with it, so it does
  # not linger when a later plotting step fails
  unlink(tmp.bed)

  if (length(hits) > 0) {
    gene.labels <- data.frame(
      do.call("rbind", strsplit(hits, "\t", fixed = TRUE)),
      stringsAsFactors = FALSE
    )
    # -wb output is the 3 breakpoint columns followed by the label-file
    # columns; keep only the latter
    gene.labels <- gene.labels[, 4:ncol(gene.labels)]

    # deduplicate labels
    gene.labels <- gene.labels[!duplicated(gene.labels), ]
    colnames(gene.labels) <- c("Chromosome", "chromStart", "chromEnd", "gene")
    gene.labels <- transform(gene.labels,
                             chromStart = as.numeric(chromStart),
                             chromEnd = as.numeric(chromEnd))
  }
}

# make the plot (ideogram only when there are no events)
png(file = out.file, height = 3000, width = 3000, res = 500)
RCircos.Set.Plot.Area()
RCircos.Chromosome.Ideogram.Plot()
if (has.links) {
  RCircos.Link.Plot(link.data, 2, TRUE)
}
title(sample.name, line = -1)

# label the genes
if (nrow(gene.labels) > 0) {
  name.col <- 4
  side <- "out"
  RCircos.Gene.Connector.Plot(gene.labels, 1, side)
  RCircos.Gene.Name.Plot(gene.labels, name.col, 2, side)
}

# parse cnv data; as with the SV file, a read failure means "no CNV track"
cnv <- tryCatch(
  read.csv(cnv.file, stringsAsFactors = FALSE),
  error = function(e) data.frame()
)

# add CNV heatmap track
if (nrow(cnv) != 0) {
  colnames(cnv) <- c("Chromosome", "chromStart", "chromEnd", "cnv")
  cnv$Chromosome <- paste0("chr", cnv$Chromosome)
  # constant filler so the value column ends up in position 5 (data.col below)
  cnv$GeneName <- "gene"
  cnv <- cnv[, c("Chromosome", "chromStart", "chromEnd", "GeneName", "cnv")]
  RCircos.Heatmap.Plot(cnv, data.col = 5, track.num = 1, side = "in")
}

dev.off()

#-------- DATA FORMAT ------
chr1	11869	14412	DDX11L1
chr1	14363	29806	WASH7P
chr1	29554	31109	MIR1302-10
chr1	34554	36081	FAM138A
chr1	52473	54936	OR4G4P
chr1	62948	63887	OR4G11P
chr1	69091	70008	OR4F5
chr1	131025	134836	CICP27
chr1	134901	139379	AL627309.1
chr1	157784	157887	RNU6-1100P
chr1	227615	267253	AP006222.2
chr1	228292	228775	AP006222.1
chr1	317720	453948	RP4-669L17.10
chr1	326096	328112	RP4-669L17.8
chr1	329431	332236	CICP7
chr1	334126	334305	RP4-669L17.4
chr1	367640	368634	OR4F29
chr1	379105	379467	WBP1LP7

Download lumpy skin disease data !

BioStar — Wed, 22 Mar 2023 05:12:12 -0500

Location

https://www.ncbi.nlm.nih.gov/sra?linkname=bioproject_sra_all&from_uid=880745


The raw genome sequence data from the 2022 lumpy skin disease outbreak in India are available in the SRA under BioProject PRJNA880745.

Perl script for chi-squared test !

Neel — Tue, 21 Mar 2023 03:53:45 -0500

#!/usr/bin/perl
#
# chidi.pl
#
# A script to perform a chi-squared test of the dinucleotide frequencies of two FASTA files
# Last updated by: $Author$
# Last updated on: $Date$
#
# Prints one row per dinucleotide (observed count, expected count and
# chi-squared contribution for each file), then the summed statistic and the
# 5% / 1% critical values.

use strict;
use warnings;
use Getopt::Long;
use FAlite;



# sanity checks
die "Usage: chidi.pl <fasta file 1> <fasta file 2>\n" if (@ARGV < 2);

my @dinucs = qw (AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT);

# Count every overlapping 2-base window in all sequences of one FASTA file.
# Returns a hash of window => count; windows containing other characters
# (e.g. N) are counted too but are never looked up by the caller.
sub count_dinucleotides {
	my ($file) = @_;
	my %count;

	# 3-argument open on a lexical handle: the file name can never be
	# misread as an open mode
	open(my $fh, '<', $file) || die "Can't open $file: $!\n";
	my $fasta = FAlite->new($fh);

	while (my $entry = $fasta->nextEntry) {
		my $seq = uc($entry->seq);
		# the last full window starts at length-2; the range is empty for
		# sequences shorter than two bases
		foreach my $i (0 .. length($seq) - 2) {
			$count{substr($seq, $i, 2)}++;
		}
	}
	close($fh);

	return %count;
}

# Chi-squared contribution of one cell. A cell whose expected count is zero
# necessarily has an observed count of zero and contributes nothing.
sub chi_cell {
	my ($observed, $expected) = @_;
	return 0 if ($expected == 0);
	return (($observed - $expected) ** 2) / $expected;
}

# hashes for observed and expected dinucleotide frequencies of both files

my %file1_ob = count_dinucleotides($ARGV[0]);
my %file2_ob = count_dinucleotides($ARGV[1]);
my %file1_ex;
my %file2_ex;

############################################################
# Perform chi-squared test
############################################################

# need total of all counts in both sequences, plus totals of 'rows' in chi-square table

my $row1 = 0;
my $row2 = 0;

foreach my $di (@dinucs){
	# a dinucleotide that never occurs must count as 0, not undef
	$file1_ob{$di} = 0 unless defined $file1_ob{$di};
	$file2_ob{$di} = 0 unless defined $file2_ob{$di};
	$row1 += $file1_ob{$di};
	$row2 += $file2_ob{$di};
}
my $total = $row1 + $row2;

die "No dinucleotides found in either input file\n" if ($total == 0);


# now calculate expected values, kept at full precision; rounding happens
# only when printing, so small expected counts do not collapse to 0.00

foreach my $di (@dinucs){
	# calculate (column total * row total) / $total
	my $column = $file1_ob{$di} + $file2_ob{$di};
	$file1_ex{$di} = ($column * $row1) / $total;
	$file2_ex{$di} = ($column * $row2) / $total;
}

# now calculate chi-squared values
my $chi_total = 0;
print "\tObs1\tExp1\t\tChi1\tObs2\tExp2\t\tChi2\n";
foreach my $di (@dinucs){
	my $chi1 = chi_cell($file1_ob{$di}, $file1_ex{$di});
	my $chi2 = chi_cell($file2_ob{$di}, $file2_ex{$di});
	printf "%s\t%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\n",
		$di, $file1_ob{$di}, $file1_ex{$di}, $chi1,
		$file2_ob{$di}, $file2_ex{$di}, $chi2;

	$chi_total += ($chi1 + $chi2);
}

printf  "Chi squared value = %6.2f\n", $chi_total;

# critical values for 15 degrees of freedom: (16 dinucleotides - 1) * (2 files - 1)
print "Significance level at 5% = 25.00\n";
print "Significance level at 1% = 30.58\n";


exit(0);

R script to convert and export html page to png

BioStar — Tue, 14 Mar 2023 07:03:42 -0500

# Packages: streamgraph builds the chart, webshot captures a page to a file
library(streamgraph)
library(webshot)

# Toy data set: 27 years x 10 series, one random value per row
toy_data <- data.frame(
  year = rep(1990:2016, each = 10),
  name = rep(head(letters, 10), times = 27),
  value = sample(seq(0, 1, by = 0.0001), size = 270)
)

# Draw a basic stream graph; per the original note it is expected to open in
# a browser. Copy the URL of that html window for the webshot calls below.
streamgraph(toy_data, key = "name", value = "value", date = "year")

# webshot needs PhantomJS to be installed
webshot::install_phantomjs()

# PDF capture: high quality, but the printed zone cannot be chosen
webshot(url = "paste_your_html_here.html", file = "output.pdf", delay = 0.2)

# PNG capture: lower quality, but the captured rectangle can be chosen
webshot(
  url = "paste_your_html_here",
  file = "output.png",
  delay = 0.2,
  cliprect = c(440, 0, 1000, 10)
)

Raku script to find palindrome in genomes !

BioStar — Tue, 07 Mar 2023 14:15:17 -0600

# Returns True when $str reads the same forwards and backwards, ignoring
# letter case and whitespace.
# NOTE(review): this is a literal string-reversal test; it does not take the
# reverse complement, so confirm it matches the kind of DNA palindrome wanted.
sub is-palindrome(Str $str) returns Bool {
    # Parameters are read-only, so normalise into a local copy:
    # upper-case it, then strip every run of whitespace.
    my $normalized = $str.uc.subst(/\s+/, '', :g);
    return $normalized eq $normalized.flip;
}

# Prints every substring of $dna whose length is between $min-length and
# $max-length (inclusive) and for which is-palindrome returns True.
# Positions are 0-based offsets into $dna.
sub find-palindromes(Str $dna, Int $min-length, Int $max-length) {
    for $min-length..$max-length -> $length {
        # The last window starts at chars - length, inclusive. The previous
        # `0..^$dna.chars - $length` excluded that offset because `-` binds
        # tighter than `..^`. The range is empty when $length > $dna.chars.
        for 0 .. $dna.chars - $length -> $pos {
            my $substring = $dna.substr($pos, $length);
            if is-palindrome($substring) {
                say "Palindrome found at position $pos: $substring";
            }
        }
    }
}

# Demo: report palindromes of length 3 through 8 in a short sample sequence
my $sample-sequence = "GGATCCATGGCCTAGG";
find-palindromes($sample-sequence, 3, 8);

Perl script to find edit distance between two sequences !

BioStar — Tue, 07 Mar 2023 14:11:00 -0600

#!/usr/bin/perl

use strict;
use warnings;

# Edit distance between two strings, where an insertion, a deletion and a
# substitution each cost 1. Fills a (length1+1) x (length2+1) table in which
# cell [r][c] holds the distance between the first r characters of the first
# string and the first c characters of the second.
sub edit_distance {
    my ($s1, $s2) = @_;

    my ($rows, $cols) = (length($s1), length($s2));

    # Row 0: reaching the first c characters of $s2 from nothing costs c.
    my @table = ([0 .. $cols]);

    foreach my $r (1 .. $rows) {
        # Column 0: reducing the first r characters of $s1 to nothing costs r.
        $table[$r][0] = $r;

        my $char1 = substr($s1, $r - 1, 1);
        foreach my $c (1 .. $cols) {
            my $mismatch = ($char1 eq substr($s2, $c - 1, 1)) ? 0 : 1;
            $table[$r][$c] = min(
                $table[$r - 1][$c] + 1,
                $table[$r][$c - 1] + 1,
                $table[$r - 1][$c - 1] + $mismatch,
            );
        }
    }

    return $table[$rows][$cols];
}

# Smallest of the numeric arguments; returns undef when called with none.
sub min {
    my ($lowest, @others) = @_;
    for my $candidate (@others) {
        next unless $candidate < $lowest;
        $lowest = $candidate;
    }
    return $lowest;
}

# Demo: compare two short nucleotide strings and report their distance
my ($seq1, $seq2) = ("ACGTAGCTAGCTGACTGAC", "CGTAGCTAGCTGACAGCTA");
my $distance = edit_distance($seq1, $seq2);
print "The edit distance between $seq1 and $seq2 is $distance.\n";

Perl script to find inverted repeats !

BioStar — Tue, 07 Mar 2023 06:25:23 -0600

#!/usr/bin/perl

use strict;
use warnings;

use Bio::SeqIO;
use Bio::Tools::Run::RepeatMasker;

# Input FASTA; hard-coded, so the file must exist under this name in the
# current directory.
my $genome_file = "genome.fasta";

# read genome sequence
# Only the first record of the FASTA file is read (next_seq is called once),
# so every coordinate below is applied to that first sequence.
my $seqio = Bio::SeqIO->new(-file => $genome_file, -format => "fasta");
my $seqobj = $seqio->next_seq();
my $seq = $seqobj->seq();

# run RepeatMasker
# NOTE(review): the run() call here is given the file name, and the loop
# below expects the return value to offer next_result(), with results
# exposing repeat_consensus() and repeat_class(). Confirm these against the
# Bio::Tools::Run::RepeatMasker documentation before relying on this script.
my $rm = Bio::Tools::Run::RepeatMasker->new();
my $rm_report = $rm->run($genome_file);

# parse RepeatMasker output
while (my $rm_result = $rm_report->next_result()) {
    # $rm_match is fetched but not used anywhere below.
    my $rm_match = $rm_result->repeat_consensus();
    my $rm_class = $rm_result->repeat_class();
    my $rm_start = $rm_result->start();
    my $rm_end = $rm_result->end();
    my $rm_strand = $rm_result->strand();
    
    # Only hits whose class string is exactly "Inverted" are reported.
    # NOTE(review): verify that the repeat library in use emits this class name.
    if ($rm_class eq "Inverted") {
        # start/end are treated as 1-based inclusive coordinates:
        # offset start-1, length end-start+1.
        my $rm_seq = substr($seq, $rm_start-1, $rm_end-$rm_start+1);
        # NOTE(review): this compares the strand with the string "-";
        # confirm the strand is not reported numerically (e.g. -1).
        if ($rm_strand eq "-") {
            $rm_seq = reverse_complement($rm_seq);
        }
        print "Inverted repeat found at positions $rm_start-$rm_end: $rm_seq\n";
    }
}

# Reverse-complement a nucleotide string. A, C, G and T are swapped with
# their partners in either case; any other character is only reversed.
sub reverse_complement {
    my ($seq) = @_;
    my $flipped = scalar reverse $seq;
    $flipped =~ tr/ACGTacgt/TGCAtgca/;
    return $flipped;
}

Identify genome-wide synteny with LASTZ alignment

BioStar — Mon, 05 Dec 2022 04:52:48 -0600

#This is a walkthrough of how to identify genome-wide synteny markers based on LASTZ alignment.

Step 1: Mask the repeat sequences for both genomes and chromosomes.

# One RepeatMasker run per chromosome file, each with its own custom library.
# (The two commands were previously fused onto a single line.)
RepeatMasker -pa 40 -nolow -norna -gff -xmall -lib custom.TE.lib_for_rice.fa AAChr1.txt
RepeatMasker -pa 40 -nolow -norna -gff -xmall -lib custom.TE.lib_for_FF.fa FFChr1.txt

Step 2: Alignment using LASTZ and Chain/Net

# The eight commands below were previously fused onto a single line; they are
# split here, one per line, with every argument left unchanged.

# Pairwise alignment, written in axt format
lastz AAChr1.txt FFChr1.txt K=2200 L=6000 Y=3400 E=30 H=0 O=400 T=1 --format=axt --out=chr01.axt

# Chain the alignments, then filter the chains
axtChain -linearGap=medium chr01.axt AAChr1.txt FFChr1.txt chr01.axt.chain
chainPreNet chr01.axt.chain AAChr1.txt.sizes FFChr1.txt.sizes chr01.chain.filter

# Build target/query nets and add synteny information
chainNet chr01.chain.filter -minSpace=1 AAChr1.txt.sizes FFChr1.txt.sizes chr1.chain.filter.tnet chr1.chain.filter.qnet
netSyntenic chr1.chain.filter.tnet chr1.chain.filter.tnet.synnet

# Convert the syntenic net back to axt, sort it, and export MAF
# NOTE(review): chr1.chain.filter.tnet.synnet appears twice in the netToAxt
# arguments -- confirm against the netToAxt usage message.
netToAxt chr1.chain.filter.tnet.synnet chr1.chain.filter.tnet.synnet chr01.chain.filter AAChr1.txt FFChr1.txt chr1.chain.filter.tnet.synnet.axt
axtSort chr1.chain.filter.tnet.synnet.axt chr1.chain.filter.tnet.synnet.Sort.axt
# NOTE(review): axtToMaf reads the unsorted .axt, not the .Sort.axt written
# by axtSort just above -- confirm which file is intended.
axtToMaf chr1.chain.filter.tnet.synnet.axt AAChr1.txt.sizes FFChr1.txt.sizes -tPrefix=target. -qPrefix=query. chr1.chain.filter.tnet.synnet.axt.maf

Step 3: Get syntenic markers

# Both scripts take the MAF file from Step 2 and the prefixed target/query names.
perl Maf2rawsynteny.pl \
    chr1.chain.filter.tnet.synnet.axt.maf \
    target.AAChr1 \
    query.FFChr1
perl Get_synteny.pl \
    -i chr1.chain.filter.tnet.synnet.axt.maf \
    -n 0 \
    -m 3 \
    -t target.AAChr1 \
    -q query.FFChr1 \
    -o syn.final.out

@ https://github.com/yiliao1022/Centromere_synteny_project

Perl script to read the next line of a file !

Abhi — Mon, 17 Oct 2022 23:19:56 -0500

# Skeleton for processing a file line by line while always knowing the line
# that follows. $fileHandler must already be an open read handle; it is not
# opened in this snippet.
#
# On every pass $line is the current line and $nextLine is the one after it
# (undef once the end of the file has been reached).
my $line = <$fileHandler>;
while(1) { # keep looping until I say so
    # read one line ahead of the line being processed
    my $nextLine = <$fileHandler>;

    # true when the current line contains '>' or when it is the last line
    # NOTE(review): on an empty file $line is undef here, so the match will
    # emit an "uninitialized value" warning when warnings are enabled.
    if ($line =~ m/>/ || !defined $nextLine) {
        ### Do the stuff
    }
    ### Do any other stuff;

    # stop after the last line has been processed, otherwise advance
    last unless defined $nextLine;
    $line = $nextLine;
}

Extract the mapped and unmapped reads !

Abhi — Fri, 23 Sep 2022 06:18:33 -0500

# Thread count passed to every samtools call below
PROCESSORS=20

# Run only the section that matches your library layout: both sections write
# to the same output names (mapped.bam / unmapped.bam).

# Single-end layout: split on SAM flag 4 (read unmapped).
#   -F 4 drops reads carrying the flag, -f 4 keeps only reads carrying it.
samtools view --threads "$PROCESSORS" -b -F 4 -o mapped.bam in.bam
samtools view --threads "$PROCESSORS" -b -f 4 -o unmapped.bam in.bam

# Paired-end layout: split on SAM flag 2 (read mapped in a proper pair).
# NOTE(review): -F 2 keeps every read that is not in a proper pair, which can
# include reads that did align -- confirm this is the intended "unmapped" set.
samtools view --threads "$PROCESSORS" -b -f 2 -o mapped.bam in.bam
samtools view --threads "$PROCESSORS" -b -F 2 -o unmapped.bam in.bam