<?xml version='1.0'?><rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:atom="http://www.w3.org/2005/Atom" >
<channel>
	<title><![CDATA[BOL: Perl script to convert GFF 2 FASTA !]]></title>
	<link>https://bioinformaticsonline.com/snippets/view/37189/perl-script-to-convert-gff-2-fasta?</link>
	<atom:link href="https://bioinformaticsonline.com/snippets/view/37189/perl-script-to-convert-gff-2-fasta?" rel="self" type="application/rss+xml" />
	<description><![CDATA[]]></description>
	
	<item>
	<guid isPermaLink="true">https://bioinformaticsonline.com/snippets/view/37189/perl-script-to-convert-gff-2-fasta</guid>
	<pubDate>Wed, 27 Jun 2018 09:58:20 -0500</pubDate>
	<link>https://bioinformaticsonline.com/snippets/view/37189/perl-script-to-convert-gff-2-fasta</link>
	<title><![CDATA[Perl script to convert GFF 2 FASTA !]]></title>
	<description><![CDATA[<code>#!/usr/bin/perl
use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Bio::DB::Fasta;

$| = 1;    # Flush output
my $outfile_cds = Bio::SeqIO-&gt;new( -format =&gt; &#039;fasta&#039;, -file =&gt; &quot;&gt;$ARGV[2].cds.fasta&quot; );
my $outfile_pep = Bio::SeqIO-&gt;new( -format =&gt; &#039;fasta&#039;, -file =&gt; &quot;&gt;$ARGV[2].pep.fasta&quot; );
my $outfile_cdna = Bio::SeqIO-&gt;new( -format =&gt; &#039;fasta&#039;, -file =&gt; &quot;&gt;$ARGV[2].cdna.fasta&quot; );
my $outfile_gene = Bio::SeqIO-&gt;new( -format =&gt; &#039;fasta&#039;, -file =&gt; &quot;&gt;$ARGV[2].gene.fasta&quot; );
my $outfile_upstream3000 = Bio::SeqIO-&gt;new( -format =&gt; &#039;fasta&#039;, -file =&gt; &quot;&gt;$ARGV[2].upstream3000.fasta&quot; );

###### Output type description ######
# cds - translated sequence (starting with ATG and ending with a stop codon included)
# cdna - transcribed sequence (devoid of introns, but containing untranslated exons)
# protein - cds translated (includes a * as the stop codon)
# gene - the entire gene sequence (including UTRs and introns)
# upstream3000 - the 3000 upstream region of the gene (likely including the promoter)

### First, index the genome
my $file_fasta = $ARGV[0];
my $db = Bio::DB::Fasta-&gt;new($file_fasta);
print (&quot;Genome fasta parsed\n&quot;);



### Second, parse the GFF3
my %CDS;
my %CDNA;
my $mRNA_name;
my $frame;
open GFF, &quot;&lt;$ARGV[1]&quot; or die $!;
while ( my $line = &lt;GFF&gt; ) {
    chomp $line;
    my @array = split( &quot;\t&quot;, $line );
    my $type = $array[2];

    if ($type eq &#039;gene&#039;) {
        my @attrs = split( &quot;;&quot;, $array[8] );
        $attrs[0] =~ s/ID=//;
        my $gene_name = $attrs[0];
        my $gene_start = $array[3];
        my $gene_end = $array[4];
        my $gene_seq = $db-&gt;seq( $array[0], $gene_start, $gene_end );
        my $output_gene = Bio::Seq-&gt;new(
            -seq        =&gt; $gene_seq,
            -id         =&gt; $gene_name,
            -display_id =&gt; $gene_name,
            -alphabet   =&gt; &#039;dna&#039;,
        );


        # The upstream 3000
        my $upstream_start;
        my $upstream_end;
        if($array[6] eq &#039;+&#039;) {
            $upstream_start=$gene_start-3000;
            $upstream_end=$gene_start-1;
        }
        elsif ($array[6] eq &#039;-&#039;) {
            $upstream_start=$gene_end+1;
            $upstream_end=$gene_end+3000;
        }
        my $upstream_seq = $db-&gt;seq( $array[0], $upstream_start, $upstream_end );
        my $output_upstream3000 = Bio::Seq-&gt;new(
            -seq        =&gt; $upstream_seq,
            -id         =&gt; $gene_name.&quot;_upstream3000&quot;,
            -display_id =&gt; $gene_name.&quot;_upstream3000&quot;,
            -alphabet   =&gt; &#039;dna&#039;,
        );

        # Reverse Complement if the frame is minus
        if($array[6] eq &#039;+&#039;) {
        }
        elsif ($array[6] eq &#039;-&#039;) {
            $output_gene = $output_gene-&gt;revcom();
            $output_upstream3000 = $output_upstream3000-&gt;revcom();
        }
        else {
            die &quot;Unknown frame! At line $. of the GFF\n&quot;;
        }
        $outfile_gene-&gt;write_seq($output_gene);
        $outfile_upstream3000-&gt;write_seq($output_upstream3000);
    }


    if ( ( $type eq &#039;mRNA&#039; ) and ( $. &gt; 2 ) ) {
        # CDS: Collect CDSs and extract sequence of the previous mRNA
        my $mergedCDS_seq;
        # WARNING we must sort by $cds_coord[1]


        foreach my $key (sort {$a &lt;=&gt; $b} keys %CDS) { # Ascending numeric sort of the starting coordinate
            my $coord = $CDS{$key};
            my @cds_coord = split( &quot; &quot;, $coord );
            my $cds_seq = $db-&gt;seq( $cds_coord[0], $cds_coord[1], $cds_coord[2] );
            $mergedCDS_seq .= $cds_seq;
        }

        my $output_cds = Bio::Seq-&gt;new(
            -seq        =&gt; $mergedCDS_seq,
            -id         =&gt; $mRNA_name,
            -display_id =&gt; $mRNA_name,
            -alphabet   =&gt; &#039;dna&#039;,
        );
        if ($frame eq &#039;-&#039;) {
            $output_cds = $output_cds-&gt;revcom();
        }
        my $output_pep = $output_cds-&gt;translate();
        $outfile_cds-&gt;write_seq($output_cds);
        $outfile_pep-&gt;write_seq($output_pep);


        # CDNA: Collect UTRs and CDSs and extract sequence of the previous mRNA
        my $mergedCDNA_seq;
        foreach my $key (sort {$a &lt;=&gt; $b} keys %CDNA) { # Ascending numeric sort of the starting coordinate
            my $coord = $CDNA{$key};
            my @cds_coord = split( &quot; &quot;, $coord );
            my $cds_seq = $db-&gt;seq( $cds_coord[0], $cds_coord[1], $cds_coord[2] );
            $mergedCDNA_seq .= $cds_seq;
        }

        my $output_cdna = Bio::Seq-&gt;new(
            -seq        =&gt; $mergedCDNA_seq,
            -id         =&gt; $mRNA_name,
            -display_id =&gt; $mRNA_name,
            -alphabet   =&gt; &#039;dna&#039;,
        );
        if ($frame eq &#039;-&#039;) {
            $output_cdna = $output_cdna-&gt;revcom();
        }
        $outfile_cdna-&gt;write_seq($output_cdna);



        # Now initialize the next mRNA
        my @attrs = split( &quot;;&quot;, $array[8] );
        $attrs[0] =~ s/ID=//;
        $mRNA_name = $attrs[0];
        $frame=$array[6];
        %CDS = (); %CDNA = (); # Empty the chunk arrays
    }
    elsif ( $type eq &#039;mRNA&#039; ) {    # First mRNA
        my @attrs = split( &quot;;&quot;, $array[8] );
        $attrs[0] =~ s/ID=//;
        $mRNA_name = $attrs[0];
        $frame=$array[6];
    }
    elsif ( $type eq &#039;CDS&#039; ) {
        my $cds_coord = $array[0] . &quot; &quot; . $array[3] . &quot; &quot; . $array[4];
        $CDS{$array[3]}=$cds_coord;
        $CDNA{$array[3]}=$cds_coord;
    }
    elsif ($type eq &#039;UTR&#039; ) {
        my $utr_coord = $array[0] . &quot; &quot; . $array[3] . &quot; &quot; . $array[4];
        $CDNA{$array[3]}=$utr_coord;
    }

}

close GFF;

__END__
jitendra@jitendra-UNLOCK-INSTALL[telomeX&amp;] perl gff2fasta.pl Adineta_vaga_v2.0.scaffolds.fa Adineta_vaga.v2.gff3 Avaga</code>]]></description>
	<dc:creator>Jit</dc:creator>
</item>

</channel>
</rss>