Our Sponsors



Download BioinformaticsOnline(BOL) Apps in your chrome browser.




  • BioScripts
  • LEGE
  • Perl script to calculate the basic stats of the assembled genome !

Perl script to calculate the basic stats of the assembled genome !

  • Public
By LEGE 86 days ago
#!/usr/bin/perl use strict; use warnings; use Bio::SeqIO; # Input file containing the genome assembly in FASTA format my $input_file = 'genome_assembly.fasta'; # Create Bio::SeqIO object to read the FASTA file my $seqio = Bio::SeqIO->new(-file => $input_file, -format => 'fasta'); # Variables for computing statistics my $total_length = 0; my $num_contigs = 0; my @contig_lengths; # Iterate through each sequence in the assembly while (my $seq = $seqio->next_seq) { my $length = $seq->length; $total_length += $length; $num_contigs++; push @contig_lengths, $length; } # Sort contig lengths in descending order @contig_lengths = sort { $b <=> $a } @contig_lengths; # Calculate additional statistics my $min_contig_length = $contig_lengths[-1]; my $max_contig_length = $contig_lengths[0]; my $avg_contig_length = $total_length / $num_contigs; my $median_contig_length = calculate_median(\@contig_lengths); # Calculate N50 my $n50 = calculate_n50(\@contig_lengths); # Calculate GC content my $gc_content = calculate_gc_content($input_file); # Print the computed statistics and information print "Genome Assembly Statistics:\n"; print "---------------------------\n"; print "Total Length: $total_length\n"; print "Number of Contigs: $num_contigs\n"; print "Minimum Contig Length: $min_contig_length\n"; print "Maximum Contig Length: $max_contig_length\n"; print "Average Contig Length: $avg_contig_length\n"; print "Median Contig Length: $median_contig_length\n"; print "N50: $n50\n"; print "GC Content: $gc_content%\n"; print "\nContig Length Distribution:\n"; print "---------------------------\n"; # Print contig length distribution foreach my $length (@contig_lengths) { print "$length\n"; } # Subroutine to calculate N50 sub calculate_n50 { my ($lengths_ref) = @_; my $total_size = 0; foreach my $length (@$lengths_ref) { $total_size += $length; } my $half_size = $total_size / 2; my $cumulative_size = 0; for my $length (@$lengths_ref) { $cumulative_size += $length; if ($cumulative_size >= $half_size) { return $length; } } return 0; # Should not reach here } # Subroutine to calculate GC content sub calculate_gc_content { my ($file) = @_; my $gc_count = 0; my $total_bases = 0; open my $fh, '<', $file or die "Cannot open file: $!"; while (<$fh>) { next if /^>/; # Skip header lines chomp; $gc_count += tr/GCgc//; $total_bases += length($_); } close $fh; my $gc_content_percentage = ($gc_count / $total_bases) * 100; return sprintf("%.2f", $gc_content_percentage); } # Subroutine to calculate median sub calculate_median { my ($array_ref) = @_; my $count = scalar @$array_ref; return $array_ref->[$count / 2]; }