<?xml version='1.0'?><rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:atom="http://www.w3.org/2005/Atom" >
<channel>
	<title><![CDATA[BOL: Create genome scaffolding with Perl]]></title>
	<link>https://bioinformaticsonline.com/snippets/view/35123/create-genome-scaffolding-with-perl?</link>
	<atom:link href="https://bioinformaticsonline.com/snippets/view/35123/create-genome-scaffolding-with-perl?" rel="self" type="application/rss+xml" />
	<description><![CDATA[]]></description>
	
	<item>
	<guid isPermaLink="true">https://bioinformaticsonline.com/snippets/view/35123/create-genome-scaffolding-with-perl</guid>
	<pubDate>Mon, 08 Jan 2018 23:51:46 -0600</pubDate>
	<link>https://bioinformaticsonline.com/snippets/view/35123/create-genome-scaffolding-with-perl</link>
	<title><![CDATA[Create genome scaffolding with Perl]]></title>
	<description><![CDATA[<code>#!/usr/bin/perl

use warnings;
use strict;
use English;

use Pod::Usage; ## uses pod documentation in usage code
use Getopt::Long qw(:config auto_version auto_help pass_through);

our $VERSION = &quot;1.00&quot;;

=head1 NAME

psl_scaffolder.pl - use self-mapped PSL file to scaffold a genome

=head1 SYNOPSIS

./psl_scaffolder.pl -query &lt;file&gt; [options] &lt;mapping.psl&gt;

=cut

sub min {
  ($a, $b) = @_;
  return( ($a &lt; $b) ? $a : $b);
}

sub max {
  ($a, $b) = @_;
  return( ($a &gt; $b) ? $a : $b);
}

sub rc {
  my ($seq) = @_;
  $seq =~ tr/ACGTUYRSWMKDVHBXN-/TGCAARYSWKMHBDVXN-/;
  # work on masked sequences as well
  $seq =~ tr/acgtuyrswmkdvhbxn/tgcaaryswkmhbdvxn/;
  return(scalar(reverse($seq)));
}

sub getConsensus {
  my ($b1, $b2) = @_;
  if(($b1 eq $b2) || ($b1 eq &quot; &quot;) || ($b2 eq &quot; &quot;)){
    ## equal bases, or absent bases, so consensus is easy
    return($b1);
  }
  # if different, convert to upper case to simplify lookup
  my $bc = uc(($b1 cmp $b2) ? $b1.$b2 : $b2.$b1);
  my %consensusLookup =
    (AC =&gt; &quot;M&quot;, AM =&gt; &quot;A&quot;, CM =&gt; &quot;C&quot;,
     GT =&gt; &quot;K&quot;, GK =&gt; &quot;G&quot;, KT =&gt; &quot;T&quot;,
     AG =&gt; &quot;R&quot;, AR =&gt; &quot;A&quot;, GR =&gt; &quot;G&quot;,
     CT =&gt; &quot;Y&quot;, CY =&gt; &quot;C&quot;, TY =&gt; &quot;T&quot;,
     AT =&gt; &quot;W&quot;, AW =&gt; &quot;A&quot;, TW =&gt; &quot;T&quot;,
    );
  # if &quot;simple&quot; ambiguity can be found, return that, otherwise return N
  # (i.e. GT =&gt; K, -A =&gt; N, YM -&gt; N)
  return( ($consensusLookup{$bc}) ? $consensusLookup{$bc} : &quot;N&quot;);
}

sub getMatch {
  my ($b1, $b2) = @_;
  return((($b1 eq $b2) || ($b1 eq &quot; &quot;) || ($b2 eq &quot; &quot;) ||
         ($b1 eq &quot;N&quot;) || ($b2 eq &quot;N&quot;)) ? &quot; &quot; : &quot;*&quot;);
}

############### Program starts here

# set default options
my @pslFiles = ();
my $projOpts =
  {
   &quot;query&quot; =&gt; 0, # contig file for query sequences
   &quot;prefix&quot; =&gt; &quot;psl_scaffold_&quot;, # prefix for contig names
   &quot;pid&quot; =&gt; 90, # percent ID threshold
   &quot;trimlimit&quot; =&gt; 50, # max number of overlapping bases outside match region
  };

GetOptions($projOpts, &#039;query=s&#039;, &#039;pid=i&#039;, &#039;trimlimit=i&#039;, &#039;prefix=s&#039;);

# process remaining command line arguments (hopefully only PSL files)
while (@ARGV) {
  my $argument = shift @ARGV;
  if(-f $argument){
    push (@pslFiles, $argument);
  } else {
  pod2usage({-exitVal =&gt; 1,
               -message =&gt; &quot;Error: Unknown command-line option or &quot;.
             &quot;non-existent file, &#039;$argument&#039;\n&quot;, -verbose =&gt; 0});
  }
}

@ARGV = @pslFiles;

if(!$projOpts-&gt;{&quot;query&quot;}){
  pod2usage({-exitVal =&gt; 1,
             -message =&gt; &quot;Error: No query assembly file provided&quot;,
             -verbose =&gt; 0});
}

if(!(-f $projOpts-&gt;{&quot;query&quot;})){
  pod2usage({-exitVal =&gt; 1,
             -message =&gt; sprintf(&quot;Error: query file &#039;%s&#039; doesn&#039;t exist&quot;,
                                 $projOpts-&gt;{&quot;query&quot;}),
             -verbose =&gt; 0});
}

print(STDERR &quot;Loading query sequences into memory...&quot;);
open(my $queryFile, &quot;&lt;&quot;, $projOpts-&gt;{&quot;query&quot;});
my $seqID = &quot;&quot;;
my %querySeqs = ();
while(&lt;$queryFile&gt;){
  chomp;
  if(/^&gt;((.+?)( .*?\s*)?)$/){
    ## line is sequence header
    $seqID = $2;
    $querySeqs{$seqID}{fullName} = $1;
    $querySeqs{$seqID}{sequence} = &quot;&quot;;
  } else {
    if(!$seqID){
      pod2usage({-exitVal =&gt; 1,
                 -message =&gt; sprintf(&quot; Error: query file &#039;%s&#039; doesn&#039;t look &quot;.
                                     &quot;like a FASTA file (no initial ID header)&quot;,
                                     $projOpts-&gt;{&quot;query&quot;}),
                 -verbose =&gt; 0});
    }
    ## line is sequence
    $querySeqs{$seqID}{&quot;sequence&quot;} .= $_;
  }
}
close($queryFile);

my %targetSeqs = %querySeqs;
my $nextScaffoldID = 1;

my %replacementSeqs = ();

printf(STDERR &quot; loaded in %d sequences\n&quot;, scalar(keys(%querySeqs)));

print(STDERR &quot;Processing results...&quot;);
while(&lt;&gt;){
  chomp;
  my @fields = split(/\t/);
  my ($matches, $misMatches, $repMatches, $nCount, $qNumInsert,
      $qBaseInsert, $tNumInsert, $tBaseInsert, $strand, $qName,
      $qSize, $qStart, $qEnd, $tName, $tSize,
      $tStart, $tEnd, $blockCount, $blockSizes, $qStarts,
      $tStarts, @rest) = @fields;
  if(!$tStarts){
    pod2usage({-exitVal =&gt; 1,
               -message =&gt; sprintf(&quot; Error: mapping file doesn&#039;t look &quot;.
                                   &quot;like a PSL file (expecting&quot;.
                                   &quot;&gt;=21 tab-separated values, got %d)&quot;,
                                  scalar(@fields)),
               -verbose =&gt; 0});
  }
  ## calculate percent identity
  my $qAliSize = $qEnd - $qStart;
  my $tAliSize = $tEnd - $tStart;
  my $sizeDif = abs($qAliSize - $tAliSize);
  my $pid = 100 * ($matches + $repMatches -
                   ($qNumInsert + $tNumInsert + 3*log(1+$sizeDif))) /
                     ($matches + $repMatches + $misMatches);
  if(($pid &gt;= $projOpts-&gt;{&quot;pid&quot;}) &amp;&amp;
     $querySeqs{$qName} &amp;&amp; $targetSeqs{$tName}){
    my %meta = ();
    my $shortTarget = ($tSize &lt; $qSize) ? 1 : 0;
    my $longTarget = (1 - $shortTarget);
    my $sName = $fields[9 + ($shortTarget * 4)];
    my $lName = $fields[9 + ($longTarget * 4)];
    my $sLen = $fields[10 + ($shortTarget * 4)];
    my $lLen = $fields[10 + ($longTarget * 4)];
    my $sStart = $fields[11 + ($shortTarget * 4)];
    my $lStart = $fields[11 + ($longTarget * 4)];
    my $sEnd = $fields[12 + ($shortTarget * 4)];
    my $lEnd = $fields[12 + ($longTarget * 4)];
    my @sBlStarts = split(/,/, $fields[19 + $shortTarget]);
    my @lBlStarts = split(/,/, $fields[19 + $longTarget]);
    my @blSizes = split(/,/, $fields[18]);
    my ($sSeq, $lSeq) = ($querySeqs{$qName}{sequence},
                         $querySeqs{$tName}{sequence});
    if($shortTarget){
      ($sSeq, $lSeq) = ($lSeq, $sSeq);
    }
    my $doRC = ($strand eq &quot;-&quot;);
    if($doRC){
        if ($shortTarget) { # target sequence is assumed to be forward strand
          $lSeq = rc($lSeq);
          ($lStart, $lEnd) = ($lLen - $lEnd, $lEnd - $lStart);
        } else {
          $sSeq = rc($sSeq);
          ($sStart, $sEnd) = ($sLen - $sEnd, $sEnd - $sStart);
        }
    }
    my $preTrim = min($sStart, $lStart);
    my $postTrim = min($sLen - $sEnd, $lLen - $lEnd);
    ## Only continue on if there&#039;s a good likelihood that this will work
    ## i.e. trimLength * (1-%id) &lt; threshold
    my $trimTotal = ($preTrim + $postTrim);
    if($trimTotal &lt;= $projOpts-&gt;{&quot;trimlimit&quot;}){
      my $sPre = substr($sSeq, 0, $sStart);
      my $lPre = substr($lSeq, 0, $lStart);
      my $sMid = substr($sSeq, $sStart, $sEnd-$sStart);
      my $lMid = substr($lSeq, $lStart, $lEnd-$lStart);
      my $sPost = substr($sSeq, $sEnd);
      my $lPost = substr($lSeq, $lEnd);
      my $sPreTrim = substr($sPre, length($sPre)-$preTrim);
      my $sPostTrim = substr($sPost, 0, $postTrim);
      my $lPreTrim = substr($lPre, length($lPre)-$preTrim);
      my $lPostTrim = substr($lPost, 0, $postTrim);
      my $preLen = max(length($sPre), length($lPre));
      my $postLen = max(length($sPost), length($lPost));
      my $lastS = $sBlStarts[0];
      my $lastL = $lBlStarts[0];
      my $alSeqS = &quot;&quot;;
      my $alSeqL = &quot;&quot;;
      for (my $i = 0; $i &lt;= $#blSizes; $i++) {
        my $gapS = $sBlStarts[$i] - $lastS;
        my $gapL = $lBlStarts[$i] - $lastL;
        my $gapLength = max($gapS, $gapL);
        my $fillS = $gapLength - $gapS;
        my $fillL = $gapLength - $gapL;
        $alSeqS .= (&quot;-&quot; x $fillS) . substr($sSeq, $sBlStarts[$i]-$gapS, $gapS);
        $alSeqL .= (&quot;-&quot; x $fillL) . substr($lSeq, $lBlStarts[$i]-$gapL, $gapL);
        $alSeqS .= substr($sSeq, $sBlStarts[$i], $blSizes[$i]);
        $alSeqL .= substr($lSeq, $lBlStarts[$i], $blSizes[$i]);
        $lastS = $sBlStarts[$i] + $blSizes[$i];
        $lastL = $lBlStarts[$i] + $blSizes[$i];
      }
      $alSeqS = $sPreTrim . $alSeqS . $sPostTrim;
      $alSeqL = $lPreTrim . $alSeqL . $lPostTrim;
      my $alConsensus = &quot;&quot;;
      for (my $i = 0; $i &lt; length($alSeqS); $i++) {
        $alConsensus .= getConsensus(substr($alSeqS,$i,1),substr($alSeqL,$i,1));
      }
      my $consensusLength = length($alConsensus);
      $alConsensus =
        substr($sPre, 0, length($sPre) - $preTrim).
          substr($lPre, 0, length($lPre) - $preTrim).
            $alConsensus.
              substr($sPost, $postTrim).substr($lPost, $postTrim);
      my $newSeqID = sprintf(&quot;%s_%d&quot;, $projOpts-&gt;{&quot;prefix&quot;}, $nextScaffoldID++);
      if(!exists($replacementSeqs{$sName}{score}) ||
         ($trimTotal &lt; $replacementSeqs{$sName}{score}) ||
         (($trimTotal == $replacementSeqs{$sName}{score}) &amp;&amp;
          ($consensusLength &gt; $replacementSeqs{$sName}{clength}))){
        $replacementSeqs{$sName}{score} = $trimTotal;
        $replacementSeqs{$sName}{clength} = $consensusLength;
        $replacementSeqs{$sName}{fullName} =
          sprintf(&quot;%s [%s %s]&quot;, $newSeqID, $sName, $lName);
        $replacementSeqs{$sName}{sequence} = $alConsensus;
        # printf(STDERR &quot;Match: $sName\n&quot;);
      }
      if(!exists($replacementSeqs{$lName}{score}) ||
         ($trimTotal &lt; $replacementSeqs{$lName}{score}) ||
         (($trimTotal == $replacementSeqs{$lName}{score}) &amp;&amp;
          ($consensusLength &gt; $replacementSeqs{$lName}{clength}))){
        $replacementSeqs{$lName}{score} = $trimTotal;
        $replacementSeqs{$lName}{clength} = $consensusLength;
        $replacementSeqs{$lName}{fullName} =
          sprintf(&quot;%s [%s %s]&quot;, $newSeqID, $sName, $lName);
        $replacementSeqs{$lName}{sequence} = $alConsensus;
        # printf(STDERR &quot;Match: $lName\n&quot;);
      }
    } else {
      # printf(STDERR &quot;Rejecting match &#039;%s&#039; vs &#039;%s&#039;: too many bases trimmed (%d [%d,%d] [%d,%d])\n&quot;,
      #        $qName, $tName, $trimTotal, $sStart, $lStart, $sLen-$sEnd, $lLen-$lEnd);
    }
  } elsif($pid &lt; $projOpts-&gt;{&quot;pid&quot;}){
    # printf(STDERR &quot;Rejecting match &#039;%s&#039; vs &#039;%s&#039;: identity (%f) too low\n&quot;,
    #      $qName, $tName, $pid);
  }
}
printf(STDERR &quot; done\n&quot;);

my %displayed = ();

foreach my $seqID (sort(keys(%targetSeqs))){
  my $fullName = $targetSeqs{$seqID}{fullName};
  my $sequence = $targetSeqs{$seqID}{sequence};
  if(exists($replacementSeqs{$seqID})){
    print(STDERR &quot;Found match for $seqID\n&quot;);
    $fullName = $replacementSeqs{$seqID}{fullName};
    $sequence = $replacementSeqs{$seqID}{sequence};
  }
  if(!$displayed{$fullName}){
    printf(&quot;&gt;%s\n%s\n&quot;, $fullName, $sequence);
    $displayed{$fullName} = 1;
  }
}</code>]]></description>
	<dc:creator>BioStar</dc:creator>
</item>

</channel>
</rss>