Our Sponsors



Download BioinformaticsOnline(BOL) Apps in your chrome browser.




Perl script to calculate N50

  • Public
By Poonam Mahapatra 2695 days ago
#! /usr/local/bin/perl5.8.3 -w ### ### Script Name : calcN50.pl ### Description : This script calculates the N50 of a collection of sequences in a fasta file ### Input : A fasta sequence file and the minimum length of sequence to consider it for the N50 calculation ### Output : None. The N50 value as well as the number of sequences >= this value is sent to STDOUT ### use strict; use Getopt::Long; use Bio::SeqIO; my $fasta_file; my $filter_length; GetOptions("fasta_file=s" => \$fasta_file, "length_filter=i" => \$filter_length); die "Usage: $0 [options] \t\t-f /path/to/fasta/file [Required] \t\t-l filter seq less than length (0 if no filter is to be done)\n\n" if !$fasta_file || !defined($filter_length) || $filter_length !~ /\d+/; ### ### Get the length of each sequence: ### my $in = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta'); my @seq_lengths = (); while (my $seq_obj = $in->next_seq){ next if length($seq_obj->seq) < $filter_length; push @seq_lengths, length($seq_obj->seq); } ### ### Determine the N50: ### my @vals; my $sum = 0; for my $val (@seq_lengths){ chomp $val; $sum += $val; push(@vals, $val); } my @sorted = sort { $a <=> $b } @vals; my $runningSum = 0; my $N50 = 0; foreach my $v (@sorted) { $runningSum += $v; if($runningSum >= ($sum / 2)) { $N50 = $v; last; } } ### ### Determine the number of sequences >= than the N50 in length: ### my $count = 0; foreach my $v (@sorted) { if($v >= $N50) { $count++; } } printf("N50: $N50 Count >= N50: $count\n");