##!/usr/bin/perl -w
#use strict;

## Run this script to find <Pattern> in a fasta file .. I use this code to detect centromere pattern in my study .. hence cetro.pl
## It print the pattern and its location with the information of match and mismatches.
## perl centro.pl <FastaFile> <OutFileName><mismatchNumber>
## Change the patter at line:27

$infile =   $ARGV[0]; #hetero hap file
$outfile = $ARGV[1];
$miss=$ARGV[2];
open INFILE,  $infile;
open OUTFILE, ">$outfile";
$/ = "\n";

$z=10;
while (<INFILE>) {
      chomp($_);            
      if ($_=~ m/^>/){ $name=$_; $sequence=""; next;}
      s/\r//;                
      $_ = uc $_;                
      $sequence = $sequence . $_; # Append the line to the sequence
     
	$l=length($sequence);
	push (@len, $l);

	my $my_pattern = "GTA.{8}TAC.{20,24}TA.{3}T";    # Pattern to search for
	#my $my_pattern = "ACCTCCCTAAGACCAA";
	#my $my_pattern  = "CCTAGGCGAA";

	my $exact_pattern = fuzzy_pattern($my_pattern, 0);
	my @exact_matches = match_positions($exact_pattern, $sequence);
	if (@exact_matches) {
  		print OUTFILE "\n\n********************probable CENTROMERE  ************************\n\n";
   		print OUTFILE "Exact matches:\n";
   		print_matches(\@exact_matches, 0); ## For zero mismatch

	}
	my $one_mismatch = fuzzy_pattern($my_pattern, $miss); ## You can change the minimum number mismatch $ARGV[2]
	my @approximate_matches = match_positions($one_mismatch, $sequence);
	if (@approximate_matches) {
  		print OUTFILE "\n\n********************probable CENTROMERE  ************************\n\n";
   		print OUTFILE "With $miss mismatches:\n";
    		print_matches(\@approximate_matches, $miss);  ## For one mismatch

		my $sequence = ""; 
		my $name= "";

	}
}




sub print_matches {
   my ($matches_ref, $mismatch) = @_;
   my @matches=@$matches_ref;
   	foreach my $match (@matches) {
      		print OUTFILE $match, "\n";
	}

	foreach $m (@matches){  
		@a=split( /\.\./, $m);
		@a1=split(/\[/, $a[0]);
		@a2=split(/\)/, $a[1]);
		$nn=$a2[0]-$a1[1];
		$matching_string=substr ($sequence, $a1[1],$nn );
		print OUTFILE "$matching_string\n";
	}
my $matches=$nn-$mismatch; 
print OUTFILE "Name of the scafolld:: $name\n";
print OUTFILE "Total number of matches:: $matches\n";
#print OUTFILE "\n \n  ******************************************************** \n \n";
}


use re qw(eval);
use vars qw($matchStart);

sub match_positions {
   my $pattern;
   local $_;
   ($pattern, $_) = @_;
   my @results;
   local $matchStart;
   my $instrumentedPattern = qr/(?{ $matchStart = pos() })$pattern/;
   while (/$instrumentedPattern/g) {
      my $nextStart = pos();
      push @results, "[$matchStart..$nextStart)";
      pos() = $matchStart+1;
   }
   return @results;
}


sub fuzzy_pattern {
   my ($original_pattern, $mismatches_allowed) = @_;
   $mismatches_allowed >= 0
      or die "Number of mismatches must be greater than or equal to zero\n";
   my $new_pattern = make_approximate($original_pattern, $mismatches_allowed);
   return qr/$new_pattern/;
}


sub make_approximate {
   my ($pattern, $mismatches_allowed) = @_;
   if ($mismatches_allowed == 0) { return $pattern }
   elsif (length($pattern) <= $mismatches_allowed) { 
	$pattern =~ tr/ACTG/./; return $pattern }
   else {
      	my ($first, $rest) = $pattern =~ /^(.)(.*)/;
      	my $after_match = make_approximate($rest, $mismatches_allowed);
      	if ($first =~ /[ACGT]/) {
         	my $after_miss = make_approximate($rest, $mismatches_allowed-1);
         	return "(?:$first$after_match|.$after_miss)";
      	}
      	else { return "$first$after_match" }
   }
}
