<?xml version='1.0'?><rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:atom="http://www.w3.org/2005/Atom" >
<channel>
	<title><![CDATA[BOL: Perl script to break the contigs by 'N']]></title>
	<link>https://bioinformaticsonline.com/snippets/view/37682/perl-script-to-break-the-contigs-by-n?</link>
	<atom:link href="https://bioinformaticsonline.com/snippets/view/37682/perl-script-to-break-the-contigs-by-n?" rel="self" type="application/rss+xml" />
	<description><![CDATA[]]></description>
	
	<item>
	<guid isPermaLink="true">https://bioinformaticsonline.com/snippets/view/37682/perl-script-to-break-the-contigs-by-n</guid>
	<pubDate>Tue, 11 Sep 2018 15:31:55 -0500</pubDate>
	<link>https://bioinformaticsonline.com/snippets/view/37682/perl-script-to-break-the-contigs-by-n</link>
	<title><![CDATA[Perl script to break the contigs by 'N']]></title>
	<description><![CDATA[<code>#!/usr/bin/perl -w
use Bio::SeqIO;
use strict;


my $fasta = Bio::SeqIO-&gt;new( -file =&gt; &quot;&lt;$ARGV[0]&quot;, -format =&gt; &#039;fasta&#039; );

my $minNlen = 5;
my $out=Bio::SeqIO-&gt;new(-file=&gt;&quot;&gt;$ARGV[0].parts.fasta&quot;,-format=&gt;&#039;fasta&#039;);
open(SCAFF,&quot;&gt;$ARGV[0].parts.scaff&quot;);
while ( my $seqobj = $fasta-&gt;next_seq() ) {
	#gets contig id
	my $contig = $seqobj-&gt;display_id();
	#gets contig sequence
	my $seq = $seqobj-&gt;seq;
	my $lenseq=length $seq;
	$seq = uc $seq; #now all bases are in uppercase

	#Searches for NNNNN regions (scaffold breaks)
	#Hash nregions stores a hash from the contig id to the NNNNN regions found. 
	#Each region is represented as a pair &quot;$ini $end&quot; indicating that start and 
	#the end of the NNNNN region in the contig
	my @regions=();
	my @bases=split //,$seq;
	push(@bases,&quot;E&quot;); #This extra position marks the end of the contig
	my $n=0;
	#Sorry for this loop, but its less memory expensive than regular expressions
	for (my $i=0;$i&lt;=$#bases;$i++) {
		if ($bases[$i] eq &#039;N&#039;) {
			$n++;
		} else {
			if ($n&gt;=$minNlen) {
				my $end=$i;
				my $ini=$end-$n+1;
				push(@regions,&quot;$ini $end&quot;);
			}
			$n=0;
		}
	}
	#Now, all regions without Ns are in @regions
	my $cont=1;
	my $laststart=1;
	if ($#regions&gt;=0) {
		for (my $i=0;$i&lt;=$#regions;$i++) {
			my ($ini,$end)=split /\s+/,$regions[$i];
			my $cini=$laststart; my $cend=$ini-1;
			my $pseqobj=$seqobj-&gt;trunc($cini,$cend);
			my $new_id=$contig . &quot;_p$cont&quot;;
			$cont++;
			$pseqobj-&gt;display_id(&quot;$new_id&quot;);
			$out-&gt;write_seq($pseqobj);
			print SCAFF &quot;$new_id $cini $cend\n&quot;;
			my $linksize=$end-$ini+1;
			print SCAFF &quot;LINK $ini $end $linksize\n&quot;;	
			$laststart=$end+1;
		}
		my $lastregion=$regions[$#regions];
#		print &quot;$lastregion $#regions\n&quot;; die;
		my ($ini,$end)= split /\s+/,$lastregion;
		if (($end+1)&lt;$lenseq) {
			my $cini=$end+1;
			my $cend=$lenseq;
		        my $pseqobj=$seqobj-&gt;trunc($end+1,$lenseq);
        		my $new_id=$contig . &quot;_p$cont&quot;;
		        $pseqobj-&gt;display_id(&quot;$new_id&quot;);
        		$out-&gt;write_seq($pseqobj);
	        	print SCAFF &quot;$new_id $cini $cend\n&quot;;
		}
	} else {
	     
		$seqobj-&gt;display_id(&quot;$contig.1.1&quot;);
		$out-&gt;write_seq($seqobj);
		print SCAFF &quot;$contig.1.1 1 $lenseq\n&quot;;
	}
}
close(SCAFF);</code>]]></description>
	<dc:creator>Rahul Nayak</dc:creator>
</item>

</channel>
</rss>