<?xml version='1.0'?><rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:atom="http://www.w3.org/2005/Atom" >
<channel>
	<title><![CDATA[BOL: Perl script to calculate the basic stats of the assembled genome !]]></title>
	<link>https://bioinformaticsonline.com/snippets/view/44459/perl-script-to-calculate-the-basic-stats-of-the-assembled-genome?</link>
	<atom:link href="https://bioinformaticsonline.com/snippets/view/44459/perl-script-to-calculate-the-basic-stats-of-the-assembled-genome?" rel="self" type="application/rss+xml" />
	<description><![CDATA[]]></description>
	
	<item>
	<guid isPermaLink="true">https://bioinformaticsonline.com/snippets/view/44459/perl-script-to-calculate-the-basic-stats-of-the-assembled-genome</guid>
	<pubDate>Thu, 01 Feb 2024 02:19:05 -0600</pubDate>
	<link>https://bioinformaticsonline.com/snippets/view/44459/perl-script-to-calculate-the-basic-stats-of-the-assembled-genome</link>
	<title><![CDATA[Perl script to calculate the basic stats of the assembled genome !]]></title>
	<description><![CDATA[<code>#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;

# Input file containing the genome assembly in FASTA format.
# The first command-line argument is used when given; otherwise the
# script falls back to genome_assembly.fasta in the current directory.
my $input_file = @ARGV ? $ARGV[0] : &#039;genome_assembly.fasta&#039;;

# Create Bio::SeqIO object to read the FASTA file
my $seqio = Bio::SeqIO-&gt;new(-file =&gt; $input_file, -format =&gt; &#039;fasta&#039;);

# Variables for computing statistics
my $total_length = 0;
my $num_contigs = 0;
my @contig_lengths;

# Iterate through each sequence in the assembly
while (my $seq = $seqio-&gt;next_seq) {
    my $length = $seq-&gt;length;
    $total_length += $length;
    $num_contigs++;
    push @contig_lengths, $length;
}

# Stop early on an empty assembly: the average below divides by the
# contig count, and min/max index into the length list
die &quot;No sequences found in $input_file\n&quot; if $num_contigs == 0;

# Sort contig lengths in descending order
@contig_lengths = sort { $b &lt;=&gt; $a } @contig_lengths;

# Calculate additional statistics (list is descending, so the last
# element is the shortest contig and the first is the longest)
my $min_contig_length = $contig_lengths[-1];
my $max_contig_length = $contig_lengths[0];
my $avg_contig_length = $total_length / $num_contigs;
my $median_contig_length = calculate_median(\@contig_lengths);

# Calculate N50
my $n50 = calculate_n50(\@contig_lengths);

# Calculate GC content (re-reads the FASTA file as plain text)
my $gc_content = calculate_gc_content($input_file);

# Print the computed statistics and information
print &quot;Genome Assembly Statistics:\n&quot;;
print &quot;---------------------------\n&quot;;
print &quot;Total Length: $total_length\n&quot;;
print &quot;Number of Contigs: $num_contigs\n&quot;;
print &quot;Minimum Contig Length: $min_contig_length\n&quot;;
print &quot;Maximum Contig Length: $max_contig_length\n&quot;;
print &quot;Average Contig Length: $avg_contig_length\n&quot;;
print &quot;Median Contig Length: $median_contig_length\n&quot;;
print &quot;N50: $n50\n&quot;;
print &quot;GC Content: $gc_content%\n&quot;;
print &quot;\nContig Length Distribution:\n&quot;;
print &quot;---------------------------\n&quot;;

# Print contig length distribution, one length per line, longest first
foreach my $length (@contig_lengths) {
    print &quot;$length\n&quot;;
}

# Subroutine to calculate N50
# Takes a reference to a list of contig lengths, walks it in the given
# order and returns the first length at which the running sum reaches
# half of the overall total. The caller in this script passes the list
# sorted in descending order.
sub calculate_n50 {
    my ($lengths_ref) = @_;

    # Overall assembly size
    my $grand_total = 0;
    $grand_total += $_ for @$lengths_ref;

    my $threshold = $grand_total / 2;
    my $running = 0;
    for my $idx (0 .. $#{$lengths_ref}) {
        my $current = $lengths_ref-&gt;[$idx];
        $running += $current;
        return $current if $running &gt;= $threshold;
    }

    return 0; # Reached for an empty list
}

# Subroutine to calculate GC content
# Reads the FASTA file at the given path and returns the percentage of
# G/C characters (either case) among all sequence characters, formatted
# with two decimal places. Header lines (starting with &gt;) are skipped.
# Returns 0.00 when the file holds no sequence characters.
# Dies if the file cannot be opened.
sub calculate_gc_content {
    my ($file) = @_;
    my $gc_count = 0;
    my $total_bases = 0;

    open my $fh, &#039;&lt;&#039;, $file or die &quot;Cannot open file: $!&quot;;
    # Lexical loop variable so the global $_ of the caller is left alone
    while (my $line = &lt;$fh&gt;) {
        next if $line =~ /^&gt;/; # Skip header lines
        # Remove all whitespace rather than only the trailing newline, so
        # carriage returns from CRLF files are not counted as bases
        $line =~ s/\s+//g;
        $gc_count += ($line =~ tr/GCgc//);
        $total_bases += length($line);
    }
    close $fh;

    # Nothing to measure: avoid dividing by zero
    return sprintf(&quot;%.2f&quot;, 0) if $total_bases == 0;

    my $gc_content_percentage = ($gc_count / $total_bases) * 100;
    return sprintf(&quot;%.2f&quot;, $gc_content_percentage);
}

# Subroutine to calculate median
# Takes a reference to a numerically sorted list (ascending or descending)
# and returns its middle value, or the mean of the two middle values when
# the list has an even number of elements. Returns 0 for an empty list.
sub calculate_median {
    my ($array_ref) = @_;
    my $count = scalar @$array_ref;
    return 0 if $count == 0;

    my $mid = int($count / 2);

    # Odd count: a single middle element exists
    return $array_ref-&gt;[$mid] if $count % 2;

    # Even count: average the two elements around the midpoint
    return ($array_ref-&gt;[$mid - 1] + $array_ref-&gt;[$mid]) / 2;
}</code>]]></description>
	<dc:creator>LEGE</dc:creator>
</item>

</channel>
</rss>