Our Sponsors



Download BioinformaticsOnline(BOL) Apps in your chrome browser.

Freelancer

AdvertiseWithUs

Create genome scaffolding with Perl

  • Public
By BioStar 15 days ago
#!/usr/bin/perl use warnings; use strict; use English; use Pod::Usage; ## uses pod documentation in usage code use Getopt::Long qw(:config auto_version auto_help pass_through); our $VERSION = "1.00"; =head1 NAME psl_scaffolder.pl - use self-mapped PSL file to scaffold a genome =head1 SYNOPSIS ./psl_scaffolder.pl -query <file> [options] <mapping.psl> =cut sub min { ($a, $b) = @_; return( ($a < $b) ? $a : $b); } sub max { ($a, $b) = @_; return( ($a > $b) ? $a : $b); } sub rc { my ($seq) = @_; $seq =~ tr/ACGTUYRSWMKDVHBXN-/TGCAARYSWKMHBDVXN-/; # work on masked sequences as well $seq =~ tr/acgtuyrswmkdvhbxn/tgcaaryswkmhbdvxn/; return(scalar(reverse($seq))); } sub getConsensus { my ($b1, $b2) = @_; if(($b1 eq $b2) || ($b1 eq " ") || ($b2 eq " ")){ ## equal bases, or absent bases, so consensus is easy return($b1); } # if different, convert to upper case to simplify lookup my $bc = uc(($b1 cmp $b2) ? $b1.$b2 : $b2.$b1); my %consensusLookup = (AC => "M", AM => "A", CM => "C", GT => "K", GK => "G", KT => "T", AG => "R", AR => "A", GR => "G", CT => "Y", CY => "C", TY => "T", AT => "W", AW => "A", TW => "T", ); # if "simple" ambiguity can be found, return that, otherwise return N # (i.e. GT => K, -A => N, YM -> N) return( ($consensusLookup{$bc}) ? $consensusLookup{$bc} : "N"); } sub getMatch { my ($b1, $b2) = @_; return((($b1 eq $b2) || ($b1 eq " ") || ($b2 eq " ") || ($b1 eq "N") || ($b2 eq "N")) ? " " : "*"); } ############### Program starts here # set default options my @pslFiles = (); my $projOpts = { "query" => 0, # contig file for query sequences "prefix" => "psl_scaffold_", # prefix for contig names "pid" => 90, # percent ID threshold "trimlimit" => 50, # max number of overlapping bases outside match region }; GetOptions($projOpts, 'query=s', 'pid=i', 'trimlimit=i', 'prefix=s'); # process remaining command line arguments (hopefully only PSL files) while (@ARGV) { my $argument = shift @ARGV; if(-f $argument){ push (@pslFiles, $argument); } else { pod2usage({-exitVal => 1, -message => "Error: Unknown command-line option or ". "non-existent file, '$argument'\n", -verbose => 0}); } } @ARGV = @pslFiles; if(!$projOpts->{"query"}){ pod2usage({-exitVal => 1, -message => "Error: No query assembly file provided", -verbose => 0}); } if(!(-f $projOpts->{"query"})){ pod2usage({-exitVal => 1, -message => sprintf("Error: query file '%s' doesn't exist", $projOpts->{"query"}), -verbose => 0}); } print(STDERR "Loading query sequences into memory..."); open(my $queryFile, "<", $projOpts->{"query"}); my $seqID = ""; my %querySeqs = (); while(<$queryFile>){ chomp; if(/^>((.+?)( .*?\s*)?)$/){ ## line is sequence header $seqID = $2; $querySeqs{$seqID}{fullName} = $1; $querySeqs{$seqID}{sequence} = ""; } else { if(!$seqID){ pod2usage({-exitVal => 1, -message => sprintf(" Error: query file '%s' doesn't look ". "like a FASTA file (no initial ID header)", $projOpts->{"query"}), -verbose => 0}); } ## line is sequence $querySeqs{$seqID}{"sequence"} .= $_; } } close($queryFile); my %targetSeqs = %querySeqs; my $nextScaffoldID = 1; my %replacementSeqs = (); printf(STDERR " loaded in %d sequences\n", scalar(keys(%querySeqs))); print(STDERR "Processing results..."); while(<>){ chomp; my @fields = split(/\t/); my ($matches, $misMatches, $repMatches, $nCount, $qNumInsert, $qBaseInsert, $tNumInsert, $tBaseInsert, $strand, $qName, $qSize, $qStart, $qEnd, $tName, $tSize, $tStart, $tEnd, $blockCount, $blockSizes, $qStarts, $tStarts, @rest) = @fields; if(!$tStarts){ pod2usage({-exitVal => 1, -message => sprintf(" Error: mapping file doesn't look ". "like a PSL file (expecting". ">=21 tab-separated values, got %d)", scalar(@fields)), -verbose => 0}); } ## calculate percent identity my $qAliSize = $qEnd - $qStart; my $tAliSize = $tEnd - $tStart; my $sizeDif = abs($qAliSize - $tAliSize); my $pid = 100 * ($matches + $repMatches - ($qNumInsert + $tNumInsert + 3*log(1+$sizeDif))) / ($matches + $repMatches + $misMatches); if(($pid >= $projOpts->{"pid"}) && $querySeqs{$qName} && $targetSeqs{$tName}){ my %meta = (); my $shortTarget = ($tSize < $qSize) ? 1 : 0; my $longTarget = (1 - $shortTarget); my $sName = $fields[9 + ($shortTarget * 4)]; my $lName = $fields[9 + ($longTarget * 4)]; my $sLen = $fields[10 + ($shortTarget * 4)]; my $lLen = $fields[10 + ($longTarget * 4)]; my $sStart = $fields[11 + ($shortTarget * 4)]; my $lStart = $fields[11 + ($longTarget * 4)]; my $sEnd = $fields[12 + ($shortTarget * 4)]; my $lEnd = $fields[12 + ($longTarget * 4)]; my @sBlStarts = split(/,/, $fields[19 + $shortTarget]); my @lBlStarts = split(/,/, $fields[19 + $longTarget]); my @blSizes = split(/,/, $fields[18]); my ($sSeq, $lSeq) = ($querySeqs{$qName}{sequence}, $querySeqs{$tName}{sequence}); if($shortTarget){ ($sSeq, $lSeq) = ($lSeq, $sSeq); } my $doRC = ($strand eq "-"); if($doRC){ if ($shortTarget) { # target sequence is assumed to be forward strand $lSeq = rc($lSeq); ($lStart, $lEnd) = ($lLen - $lEnd, $lEnd - $lStart); } else { $sSeq = rc($sSeq); ($sStart, $sEnd) = ($sLen - $sEnd, $sEnd - $sStart); } } my $preTrim = min($sStart, $lStart); my $postTrim = min($sLen - $sEnd, $lLen - $lEnd); ## Only continue on if there's a good likelihood that this will work ## i.e. trimLength * (1-%id) < threshold my $trimTotal = ($preTrim + $postTrim); if($trimTotal <= $projOpts->{"trimlimit"}){ my $sPre = substr($sSeq, 0, $sStart); my $lPre = substr($lSeq, 0, $lStart); my $sMid = substr($sSeq, $sStart, $sEnd-$sStart); my $lMid = substr($lSeq, $lStart, $lEnd-$lStart); my $sPost = substr($sSeq, $sEnd); my $lPost = substr($lSeq, $lEnd); my $sPreTrim = substr($sPre, length($sPre)-$preTrim); my $sPostTrim = substr($sPost, 0, $postTrim); my $lPreTrim = substr($lPre, length($lPre)-$preTrim); my $lPostTrim = substr($lPost, 0, $postTrim); my $preLen = max(length($sPre), length($lPre)); my $postLen = max(length($sPost), length($lPost)); my $lastS = $sBlStarts[0]; my $lastL = $lBlStarts[0]; my $alSeqS = ""; my $alSeqL = ""; for (my $i = 0; $i <= $#blSizes; $i++) { my $gapS = $sBlStarts[$i] - $lastS; my $gapL = $lBlStarts[$i] - $lastL; my $gapLength = max($gapS, $gapL); my $fillS = $gapLength - $gapS; my $fillL = $gapLength - $gapL; $alSeqS .= ("-" x $fillS) . substr($sSeq, $sBlStarts[$i]-$gapS, $gapS); $alSeqL .= ("-" x $fillL) . substr($lSeq, $lBlStarts[$i]-$gapL, $gapL); $alSeqS .= substr($sSeq, $sBlStarts[$i], $blSizes[$i]); $alSeqL .= substr($lSeq, $lBlStarts[$i], $blSizes[$i]); $lastS = $sBlStarts[$i] + $blSizes[$i]; $lastL = $lBlStarts[$i] + $blSizes[$i]; } $alSeqS = $sPreTrim . $alSeqS . $sPostTrim; $alSeqL = $lPreTrim . $alSeqL . $lPostTrim; my $alConsensus = ""; for (my $i = 0; $i < length($alSeqS); $i++) { $alConsensus .= getConsensus(substr($alSeqS,$i,1),substr($alSeqL,$i,1)); } my $consensusLength = length($alConsensus); $alConsensus = substr($sPre, 0, length($sPre) - $preTrim). substr($lPre, 0, length($lPre) - $preTrim). $alConsensus. substr($sPost, $postTrim).substr($lPost, $postTrim); my $newSeqID = sprintf("%s_%d", $projOpts->{"prefix"}, $nextScaffoldID++); if(!exists($replacementSeqs{$sName}{score}) || ($trimTotal < $replacementSeqs{$sName}{score}) || (($trimTotal == $replacementSeqs{$sName}{score}) && ($consensusLength > $replacementSeqs{$sName}{clength}))){ $replacementSeqs{$sName}{score} = $trimTotal; $replacementSeqs{$sName}{clength} = $consensusLength; $replacementSeqs{$sName}{fullName} = sprintf("%s [%s %s]", $newSeqID, $sName, $lName); $replacementSeqs{$sName}{sequence} = $alConsensus; # printf(STDERR "Match: $sName\n"); } if(!exists($replacementSeqs{$lName}{score}) || ($trimTotal < $replacementSeqs{$lName}{score}) || (($trimTotal == $replacementSeqs{$lName}{score}) && ($consensusLength > $replacementSeqs{$lName}{clength}))){ $replacementSeqs{$lName}{score} = $trimTotal; $replacementSeqs{$lName}{clength} = $consensusLength; $replacementSeqs{$lName}{fullName} = sprintf("%s [%s %s]", $newSeqID, $sName, $lName); $replacementSeqs{$lName}{sequence} = $alConsensus; # printf(STDERR "Match: $lName\n"); } } else { # printf(STDERR "Rejecting match '%s' vs '%s': too many bases trimmed (%d [%d,%d] [%d,%d])\n", # $qName, $tName, $trimTotal, $sStart, $lStart, $sLen-$sEnd, $lLen-$lEnd); } } elsif($pid < $projOpts->{"pid"}){ # printf(STDERR "Rejecting match '%s' vs '%s': identity (%f) too low\n", # $qName, $tName, $pid); } } printf(STDERR " done\n"); my %displayed = (); foreach my $seqID (sort(keys(%targetSeqs))){ my $fullName = $targetSeqs{$seqID}{fullName}; my $sequence = $targetSeqs{$seqID}{sequence}; if(exists($replacementSeqs{$seqID})){ print(STDERR "Found match for $seqID\n"); $fullName = $replacementSeqs{$seqID}{fullName}; $sequence = $replacementSeqs{$seqID}{sequence}; } if(!$displayed{$fullName}){ printf(">%s\n%s\n", $fullName, $sequence); $displayed{$fullName} = 1; } }