Our Sponsors



Download BioinformaticsOnline(BOL) Apps in your chrome browser.




DotPlot with Perl GD

#!/usr/local/bin/perl -w # NOTE: YOU MUST CHANGE THE LINE ABOVE TO POINT TO # THE FULL PATH OF THE PERL EXECUTABLE ON YOUR SYSTEM. # System requirements: requires perl and perl modules # DB_File and GD.pm, available from CPAN. # For usage information, run this program with the flag # -h. require v5.6.0; use GD; use DB_File; use strict; use vars qw/$VERSION/; BEGIN { $VERSION = '1.0'; if (!defined $GD::VERSION || $GD::VERSION ne 1.32) { print STDERR "\n$0:\n"; print STDERR "WARNING -- GD VERSION 1.32 REQUIRED.\n"; if (defined $GD::VERSION){ print STDERR "You have GD version $GD::VERSION.\n"; print STDERR "If $0\n"; print STDERR "does not work correctly you will\n"; print STDERR "have to install GD version 1.32.\n\n"; } } if (!defined $DB_File::VERSION || $DB_File::VERSION ne 1.814) { print STDERR "\n$0:\n"; print STDERR "WARNING -- DB_File VERSION 1.814 REQUIRED.\n"; if (defined $DB_File::VERSION){ print STDERR "You have DB_File version $DB_File::VERSION.\n"; print STDERR "If $0\n"; print STDERR "does not work correctly you will\n"; print STDERR "have to install DB_File version 1.814.\n\n"; } } } use Getopt::Long; sub usage(); sub main(); sub print_square($$$$$$$$$$); sub print_triangle($$$$$$$$$); sub print_png ($$); sub revcomp($); main(); sub usage() { print qq/ USAGE: $0 -i <in file> [optional args] OPTIONAL ARGUMENTS: -j <2nd file> -o <out file> -d <dot file> -w <word size> -s <step size> -p <pixels> -t <title> -h print this help message Create a PNG ("portable network graphics") file that displays a triangular dot plot of the input sequence against itself, or a square plot against a second sequence with option -j. <in file> is a fasta format file from which the sequence is taken. This is the only required parameter. <2nd file> is a fasta format file from which a second sequence is taken. default: null -- triangular dot plot created <out file> is the PNG file created. default: <in file(s)>_<word_size>_<step_length>_<pixels>.png <dot file> A database file containing the positions of words of <word size> encountered in your sequence. This database is not human-readable, and can get quite large, so make sure that you have plenty of disk space available. Databases size scales linearly with sequence length. With default parameters,databases are are roughly 150k per base-pair of sequence. Complex sequences use more disk space than repetitive ones. default: <infile(s)>_<word_size>_<step_length>.dot <word size> is the word size for a match. A dot is printed if there is a perfect match of length <word size>. default: <word size> = 100 <step size> is the number of bases to move the word for each dot. default: <step size> = 1 <pixels> is the width of the plot in pixels and defines the resolution of the image. default: <pixels> = 600 <title> is a title to place in the output. default: null -- no title appears If for some reason this program is disrupted during a run, restart with the same parameters and it will automatically resume where it left off. Completed runs can be re-plotted from the existing dot file using a different resolution by changing -p -h causes this message to printed. (Version $VERSION) /; } sub main() { #Define the arguments my ($seqfile, $second, $outfile, $dotfile, $word, $step, $pixels, $title, $help); #Perform Checking -- remind the user that TMTOWTDI. if (!GetOptions( 'infile=s' => \$seqfile, 'jfile=s' => \$second, 'wordlen=i' => \$word, 'step=i' => \$step, 'outfile=s' => \$outfile, 'title=s' => \$title, 'pixels=i' => \$pixels, 'dotfile=s' => \$dotfile, 'help' => \$help)) { usage; exit -1; } if ($help) { usage; exit; } if (!defined $seqfile){ usage; exit -1; } if (!defined $second){ print "Creating triangular plot from: $seqfile\n"; print "\t use -j to make a square plot\n"; } if (!defined $word){ $word = 100; print "Using word size 100\n"; print "\tuse -w to set word size\n"; }elsif ($word <= 0){ print STDERR "$0 -w <word size> must be >= 0\n"; exit -1; } if (!defined $step){ $step = 1; print "Using step size 1\n"; print "\tuse -s to set step size\n"; }elsif ($step <= 0) { print STDERR "$0 -s <step length> must be >= 0\n"; exit -1; } if (!defined $pixels){ $pixels = 600; print "Using image size 600 pixels\n"; print "\tuse -p to set pixels\n"; }elsif ($pixels <= 0) { print STDERR "$0 -s <pixels> must be >= 0\n"; exit -1; } if (!defined $dotfile) { $seqfile =~ m/\/?([^\/\.]+)\.?\w*$/; $dotfile = $1; if ($second){ $second =~ m/\/?([^\/\.]+)\.?\w*$/; $dotfile .= "_" . $1; } $dotfile .= "_" . $word . "_" . $step; #$dotfile =~ tr/[\.\/]/_/; $dotfile .= ".dot"; print "Storing matches in: $dotfile\n"; print "\tuse -d to name dot file\n"; } if (!defined $outfile){ $seqfile =~ m/\/?([^\/\.]+)\.?\w*$/; $outfile = $1; if ($second){ $second =~ m/\/?([^\/\.]+)\.?\w*$/; $outfile .= "_" . $1; } $outfile .= "_" . $word . "_" . $step . "_" . $pixels; #$outfile =~ tr/[\.\/]/_/; $outfile .= ".png"; print "Printing picture in: $outfile\n"; print "\tuse -o to name out file\n"; } $title = '' unless defined $title; #Open the database file -- this makes us fast! my %DOT = (); dbmopen(%DOT, "$dotfile", 0666) || die "Cannot open $dotfile: $!"; #Define some variables we need my ($xlen, $ylen) = (0, 0); my ($i, $j, $k, $key) = (0, 0, 0, ""); my $seq = ""; #Start reading the first sequence, avoid a file slurp open(IN, $seqfile) || die "Cannot open $seqfile: $!"; while (<IN>){ if (m/^>/){ next; } chomp; $k += length $_; $seq .= uc($_); while ($seq && (($i + $word) < $k) && ($j + $word) < $k){ #wait until we hit the place where we left off! if ($DOT{xpos} && $i < $DOT{xpos}){ }elsif ($i < $j){ $DOT{xpos} = $i; }elsif ($i == $j){ $DOT{xpos} = $i; $key = substr($seq, 0, $word); #treat Ns as mismatches unless ($key =~ m/N/){ if ($DOT{$key}){ $DOT{$key} .= "x:$i\t"; }elsif($DOT{revcomp($key)}){ $DOT{revcomp($key)} .= "x:$i\t"; }else{ $DOT{$key} .= "x:$i\t"; } } $j += $step; } substr($seq, 0, 1) = ""; $i++; } } close IN; $xlen = $k; #Is there another sequence? if ($second){ #Looks like there is. #Flush out these variables: ($i, $j, $k, $key) = (0, 0, 0, ""); $seq = ""; #Read the second sequence, avoid a file slurp open(IN, $second) || die "Cannot open $second: $!"; while (<IN>){ if (m/^>/){ next; } chomp; $k += length $_; $seq .= uc($_); while ($seq && (($i + $word) < $k) && ($j + $word) < $k){ #wait until we hit the place where we left off! if ($DOT{ypos} && $i < $DOT{ypos}){ }elsif ($i < $j){ $DOT{ypos} = $i; }elsif ($i == $j){ $DOT{ypos} = $i; $key = substr($seq, 0, $word); #treat Ns as mismatches unless ($key =~ m/N/){ if ($DOT{$key}){ $DOT{$key} .= "y:$i\t"; }elsif($DOT{revcomp($key)}){ $DOT{revcomp($key)} .= "y:$i\t"; }else{ $DOT{$key} .= "y:$i\t"; } } $j += $step; } substr($seq, 0, 1) = ""; $i++; } } close IN; $ylen = $k; } # Close that DB dbmclose %DOT; # Create and print the output. my ($img, $white, $black, $gray, $margin, $x0, $y0, $x1, $y1); $margin = 50; $x0 = $y0 = $margin; $x1 = $y1 = $pixels - $margin; $img = new GD::Image($pixels, $pixels); $white = $img->colorAllocate(255,255,255); $black = $img->colorAllocate(0,0,0); $gray = $img->colorAllocate(187,187,187); $img->interlaced('true'); $img->string(gdLargeFont, $x0, $y0 - 35, "$title -w=$word -s=$step", $black); # Square or Triangle? if ($second){ $img->string(gdLargeFont, $x0, $y0 - 15, "$xlen bp x $ylen bp", $black); print_square($img, $x0, $x1, $y0, $y1, $black, $gray, $xlen, $ylen, $dotfile); }else{ $img->string(gdLargeFont, $x0, $y0 - 15, "$xlen bp", $black); print_triangle($img, $x0, $x1, $y0, $y1, $black, $gray, $xlen, $dotfile); } # Print this thing out print_png($img, $outfile); print "Plot Complete!\n"; } sub print_square ($$$$$$$$$$){ my ($img, $x0, $x1, $y0, $y1, $black, $gray, $xlen, $ylen, $dotfile) = @_; my $del = ($x1 - $x0) / $xlen; if ($del > (($y1 - $y0) / $ylen)){ $del = (($y1 - $y0) / $ylen); } my %hash = (); my ($xd, $yd, $key, $value); my %TEMPFILE = (); dbmopen(%TEMPFILE, "$dotfile", 0666) || die "Can't open database $dotfile: $!"; #avoid slurping again, pull values out one by one! while (($key,$value) = each %TEMPFILE){ if ($key =~ m/[ACGT]+/){ while ($value =~ m/([xy]):(\d+)\t(.*)/){ ($hash{$1}{$2}, $value) = (1, $3); } } foreach $xd (keys(%{$hash{x}})){ foreach $yd (keys(%{$hash{y}})){ $img->setPixel(($x0 + ($del * $xd)), ($y0 + ($del * $yd)), $black); } } %hash = (); } dbmclose %TEMPFILE; $img->rectangle($x0, $y0, ($x0 + ($del * $xlen)), ($y0 + ($del * $ylen)), $gray); } sub print_triangle ($$$$$$$$$){ my ($img, $x0, $x1, $y0, $y1, $black, $gray, $xlen, $dotfile) = @_; my $del = ($x1 - $x0) / $xlen; my %hash = (); my ($xd, $yd, $key, $value); my %TEMPFILE = (); dbmopen(%TEMPFILE, "$dotfile", 0666) || die "Can't open database $dotfile: $!"; #avoid slurping again, pull values out one by one! while (($key,$value) = each %TEMPFILE){ if ($key =~ m/[ACGT]+/){ while ($value =~ m/x:(\d+)\t(.*)/){ ($hash{x}{$1}, $hash{y}{$1}, $value) = (1, 1, $2); } } foreach $xd (keys(%{$hash{x}})){ foreach $yd (keys(%{$hash{y}})){ $img->setPixel(($x0 + (($del * ($xd + $yd)) * 0.5)), ($y1 - (($del * sqrt(($yd - $xd) * ($yd - $xd))) * 0.5)), $black); } } %hash = (); } dbmclose %TEMPFILE; $img->line($x0, $y1, $x1, $y1, $gray); $img->line($x0, $y1, (($x0 + $x1) * 0.5), ($y1 - (($x1 - $x0) * 0.5)), $gray); $img->line((($x0 + $x1) * 0.5), ($y1 - (($x1 - $x0) * 0.5)), $x1, $y1, $gray); } sub print_png ($$) { my ($img, $outfile) = @_; if ($outfile) { open(OUT, ">$outfile") || die "Cannot write $outfile: $!\n"; print OUT $img->png; close OUT; } else { print $img->png; } } sub revcomp ($){ my $f = shift(@_); my $r = reverse($f); $r =~ tr/GATC/CTAG/; return $r; }