#!/usr/bin/perl
use strict; 
use warnings;

##Usage
##perl FastaCleaner.pl <FASTAinfile> <FASTAoutfile>

# Main script: copy a FASTA file, replacing every ambiguous base "N" with a
# randomly chosen nucleotide, and print a per-record report on STDOUT with the
# number of replaced bases, the number of other characters, and the
# percentage of the record that was "N".

# Require both file names up front so a missing argument yields a usage
# message rather than an "uninitialized value" warning from open().
@ARGV >= 2
    or die "Usage: perl $0 <FASTAinfile> <FASTAoutfile>\n";
my ($inPath, $outPath) = @ARGV;

open my $inFh, '<', $inPath
    or die "$0: could not open '$inPath' for read: $!\n";
open my $outFh, '>', $outPath
    or die "$0: could not open '$outPath' for write: $!\n";

my $name;             # header of the record being read; undef until the first header
my $Ncount    = 0;    # "N" characters replaced in the current record
my $realCount = 0;    # all other characters in the current record

while (my $line = <$inFh>) {
    chomp $line;
    $line =~ s/\r\z//;    # CRLF input: a stray "\r" would be counted as a base

    # NOTE(review): the whole line is upper-cased, header lines included, so
    # headers are written in upper case. Kept to preserve the existing output.
    my $seq = uc $line;

    if ($seq =~ /^>/) {
        # A new header closes the previous record (if there was one).
        reportSequence($name, $Ncount, $realCount);
        print {$outFh} "$seq\n";
        ($name, $Ncount, $realCount) = ($seq, 0, 0);
        next;
    }

    next if $seq =~ /^\s*$/;    # blank lines are dropped from the output

    # Count first, then substitute: tr/// with an empty replacement list only
    # counts, and s///ge calls randChar() once per "N", left to right.
    my $nInLine = ($seq =~ tr/N//);
    $Ncount    += $nInLine;
    $realCount += length($seq) - $nInLine;
    $seq =~ s/N/randChar()/ge;

    print {$outFh} "$seq\n";
}

# The last record has no following header to trigger its report.
reportSequence($name, $Ncount, $realCount);

close $inFh;
# Buffered write errors only surface at close, so check it on the output.
close $outFh
    or die "$0: could not close '$outPath': $!\n";

# Print the statistics line for one record, plus a caution when more than
# half of the record was "N".
#   $label      - record header (may be undef if data preceded any header)
#   $nTotal     - number of "N" characters in the record
#   $otherTotal - number of non-"N" characters in the record
# Prints nothing when no header and no sequence data have been seen yet.
sub reportSequence {
    my ($label, $nTotal, $otherTotal) = @_;

    my $length = $nTotal + $otherTotal;
    return if !defined $label && $length == 0;
    $label = '' unless defined $label;

    # An empty record would otherwise divide by zero; report it as 0%.
    my $score = $length ? ($nTotal * 100) / $length : 0;

    print "Name:$label|N'scount:$nTotal|RealCount:$otherTotal|Percentage:$score%\n";
    print "I think you should not rely on this sequence $label\n" if $score > 50;
    return;
}

# Return one nucleotide letter (A, T, G or C) chosen with Perl's rand().
# Takes no arguments.
sub randChar {
    my @bases = qw/A T G C/;

    # Scalar element access: the one-element slice form (@bases[...]) used
    # previously triggers a "Scalar value ... better written as" warning.
    return $bases[ int( rand( scalar @bases ) ) ];
}

# End of script: terminate with a success status.
exit 0;
