Perl script to calculate the basic stats of the assembled genome !

LEGE — Thu, 01 Feb 2024 02:19:05 -0600
#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;

# Input file containing the genome assembly in FASTA format
my $input_file = 'genome_assembly.fasta';

# Create Bio::SeqIO object to read the FASTA file
my $seqio = Bio::SeqIO->new(-file => $input_file, -format => 'fasta');

# Variables for computing statistics
my $total_length = 0;
my $num_contigs = 0;
my @contig_lengths;

# Iterate through each sequence in the assembly
while (my $seq = $seqio->next_seq) {
    my $length = $seq->length;
    $total_length += $length;
    $num_contigs++;
    push @contig_lengths, $length;
}

# Sort contig lengths in descending order
@contig_lengths = sort { $b <=> $a } @contig_lengths;

# Calculate additional statistics
my $min_contig_length = $contig_lengths[-1];
my $max_contig_length = $contig_lengths[0];
my $avg_contig_length = $total_length / $num_contigs;
my $median_contig_length = calculate_median(\@contig_lengths);

# Calculate N50
my $n50 = calculate_n50(\@contig_lengths);

# Calculate GC content
my $gc_content = calculate_gc_content($input_file);

# Print the computed statistics and information
print "Genome Assembly Statistics:\n";
print "---------------------------\n";
print "Total Length: $total_length\n";
print "Number of Contigs: $num_contigs\n";
print "Minimum Contig Length: $min_contig_length\n";
print "Maximum Contig Length: $max_contig_length\n";
print "Average Contig Length: $avg_contig_length\n";
print "Median Contig Length: $median_contig_length\n";
print "N50: $n50\n";
print "GC Content: $gc_content%\n";
print "\nContig Length Distribution:\n";
print "---------------------------\n";

# Print contig length distribution
foreach my $length (@contig_lengths) {
    print "$length\n";
}

# Subroutine to calculate N50
sub calculate_n50 {
    my ($lengths_ref) = @_;
    my $total_size = 0;
    foreach my $length (@$lengths_ref) {
        $total_size += $length;
    }
    my $half_size = $total_size / 2;
    my $cumulative_size = 0;
    for my $length (@$lengths_ref) {
        $cumulative_size += $length;
        if ($cumulative_size >= $half_size) {
            return $length;
        }
    }
    return 0; # Should not reach here
}

# Subroutine to calculate GC content
sub calculate_gc_content {
    my ($file) = @_;
    my $gc_count = 0;
    my $total_bases = 0;

    open my $fh, '<', $file or die "Cannot open file: $!";
    while (<$fh>) {
        next if /^>/; # Skip header lines
        chomp;
        $gc_count += tr/GCgc//;
        $total_bases += length($_);
    }
    close $fh;

    my $gc_content_percentage = ($gc_count / $total_bases) * 100;
    return sprintf("%.2f", $gc_content_percentage);
}

# Subroutine to calculate median
sub calculate_median {
    my ($array_ref) = @_;
    my $count = scalar @$array_ref;
    return $array_ref->[$count / 2];
}
BOL: Perl script to calculate the basic stats of the assembled genome !

Perl script to calculate the basic stats of the assembled genome !