Perl and BioPerl script to extract protein sequences using a GFF file!

LEGE — Thu, 01 Feb 2024 01:51:00 -0600

#!/usr/bin/perl
use strict;
use warnings;
use Bio::DB::Fasta;
use Bio::SeqIO;

# Paths to your GFF file and genome FASTA file
my $gff_file = 'path/to/your/file.gff';
my $genome_fasta = 'path/to/your/genome.fasta';

# Gene ID to extract
my $gene_id_to_extract = 'your_gene_id';

# Step 1: Parse GFF file to get gene locations.
# %gene_locations maps each gene ID to [seqid, start, end, strand], taken from
# GFF columns 1, 4, 5 and 7. The seqid has to be stored here because the
# per-line @fields array is not visible once the loop has finished.
my %gene_locations;
open my $gff_fh, '<', $gff_file or die "Cannot open GFF file: $!";
LINE:
while (my $line = <$gff_fh>) {
    next LINE if $line =~ /^#/;  # Skip comments and directives

    # Strip the line ending so the last attribute value does not capture it.
    $line =~ s/\r?\n\z//;

    my @fields = split /\t/, $line;
    next LINE unless @fields >= 9;          # Blank or malformed line
    next LINE unless $fields[2] eq 'gene';  # Consider only gene features

    # Anchor on the start of the column or a ';' so that keys which merely
    # end in "ID" (e.g. "geneID=") are not mistaken for the ID attribute.
    my ($gene_id) = $fields[8] =~ /(?:^|;)\s*ID=([^;]+)/;
    next LINE unless defined $gene_id;

    $gene_locations{$gene_id} = [ @fields[0, 3, 4, 6] ];
}
close $gff_fh;

# Step 2: Extract DNA sequence from genome
my $location = $gene_locations{$gene_id_to_extract}
    or die "Gene '$gene_id_to_extract' not found among the gene features of $gff_file\n";
my ($seq_id, $start, $end, $strand) = @{$location};

my $db = Bio::DB::Fasta->new($genome_fasta);

# Bio::DB::Fasta returns the reverse complement when the start coordinate is
# greater than the stop coordinate, which is what a minus-strand gene needs.
my $gene_dna_sequence = $strand eq '-'
    ? $db->seq($seq_id, $end, $start)
    : $db->seq($seq_id, $start, $end);
die "Could not fetch $seq_id:$start-$end from $genome_fasta\n"
    unless defined $gene_dna_sequence && length $gene_dna_sequence;

# Step 3: Translate DNA sequence into protein sequence
# NOTE(review): this translates the whole gene span. That only yields the real
# protein when the gene feature is exactly one uninterrupted coding region; for
# genes with introns or UTRs the CDS features would have to be joined instead.
my $gene_protein_sequence = translate_dna_to_protein($gene_dna_sequence);

# Print the protein sequence
print "Protein Sequence:\n$gene_protein_sequence\n";

# Subroutine to translate DNA sequence to protein sequence
# Translate a DNA string into a protein string.
#
# Parameters:
#   $dna_sequence - nucleotide sequence as a plain string.
# Returns:
#   The translated sequence as a plain string, produced by calling
#   Bio::Seq's translate method with no arguments (BioPerl's defaults).
sub translate_dna_to_protein {
    my ($dna_sequence) = @_;

    # Load Bio::Seq explicitly: the file only pulls in Bio::DB::Fasta and
    # Bio::SeqIO, and relying on one of them to load Bio::Seq as a side
    # effect is not guaranteed.
    require Bio::Seq;

    my $seq_obj = Bio::Seq->new(-seq => $dna_sequence, -alphabet => 'dna');
    my $protein_sequence = $seq_obj->translate->seq;
    return $protein_sequence;
}

BOL: Perl and BioPerl script to extract protein sequences using a GFF file!

Perl and BioPerl script to extract protein sequences using a GFF file!