X BOL wishing you a very and Happy New year

Alternative content

Our Sponsors



Download BioinformaticsOnline(BOL) Apps in your chrome browser.




Genetic Algorithms demonstration with word DNA in Perl

#!/usr/bin/perl -w # GA demonstration with word DNA (512 bits) use strict; use Data::Dumper; # individuals in the population my $popsize = 1024; # a good starting point my $dna_length = 512; # 4 "letters" in the DNA my $dna_byte_length = $dna_length / 8; # the DNA byte length my $mut_rate = 0.01; # the mutation rate my $min_fitness = 0.1; # the minimum fitness for survival my $generation_count = 100000; # run for this many generations my $generation = 0; # generation counter my $pop_ref = []; # a reference to a population array sub evaluate_fitness { my $population = shift @_; my $fitness_function = shift @_; foreach my $individual (@$population) { # set the fitness to the result of invoking the fitness function # on the individual's DNA $individual->{fitness} = $fitness_function->($individual->{dna}); } } sub survive{ my $population = shift @_; my $min_fitness = shift @_; my $survived = 0; foreach my $individual (@$population) { # set the fitness to 0 for unfit individuals (so they won't procreate) $individual->{survived} = $individual->{fitness} >= $min_fitness; if ($individual->{survived}) { $survived++; } else { $individual->{fitness} = 0 } } if (0 == $survived) { die "No individuals survived, dying peacefully"; } } sub select_parents { my $population = shift @_; my $pop_size = scalar @$population; # population size # create the weights array: select only survivors from the population, # then use map to have only the fitness come through my @weights = map { $_->{fitness} } grep { $_->{survived} } @$population; # if we have less than 2 survivors, we're in trouble die "Population size $pop_size is too small" if $pop_size < 2; # we need to fill $pop_size parenting slots, to preserve the population size foreach my $slot (1..$pop_size) { my $index = sample(\@weights); # we pass a reference to the weights array here # do sanity checking on $index die "Undefined index returned by sample(), probably all individuals have died" unless defined $index; die "Invalid index $index returned by sample()" unless $index >= 0 && $index < $pop_size; # increase the parenting slots for this population member $population->[$index]->{parent}++; } } sub recombine { my $population = shift @_; my $pop_size = scalar @$population; # population size my @parent_population; my @new_population; my $total_parent_slots = 1; while ($total_parent_slots) { # find out how many parent slots are left $total_parent_slots = 0; $total_parent_slots += $_->{parent} foreach @$population; last unless $total_parent_slots; # if we are here, we're sure we have at least one individual with # parent > 0 my $individual = undef; # start with an undefined individual do { # select a random individual $individual = $population->[int(rand($pop_size))]; # individual is acceptable only if he can be a parent undef($individual) unless $individual->{parent}; } while (not defined $individual); push @parent_population, $individual; # insert the individual in the parent population $individual->{parent}--; # decrease the parenting slots of the individual by 1 } foreach my $parent (@parent_population) { # select a random individual from the parent population (parent #2) my $parent2 = @parent_population[int(rand($pop_size))]; my $child = { survived => 1, parent => 0, fitness => 0, dna => 0 }; # this is breeding! my $dna1 = $parent->{dna}; my $dna2 = $parent2->{dna}; # note we do operations on BYTES, not BITS. This is because bytes # are the unit of information (and preserving them is the faster # breeding method) foreach my $byte (1 .. $dna_byte_length) { # get one random byte from the parents and add it to the child vec($child->{dna}, $byte-1, 8) = vec(((rand() < 0.5) ? $dna1 : $dna2), $byte-1, 8); } push @new_population, $child; # the child is now a part of the new generation } return \@new_population; } sub mutate { my $population = shift @_; my $mut_rate = shift @_; foreach my $individual (@$population) { # only mutate individuals if rand() returns more than mut_rate next if rand > $mut_rate; # mutate the DNA by and-ing and then or-ing it with two random # integers between 0 and 2^$dna_length my $old_dna = $individual->{dna}; my $new_dna = 0; foreach my $byte (1 .. $dna_byte_length) { vec($new_dna, $byte-1, 8) &= int(rand(256)); vec($new_dna, $byte-1, 8) |= int(rand(256)); } $individual->{dna} = $new_dna; # print "Mutated $old_dna to ", $individual->{dna}, "\n"; } } # this is a closure block! { # private static variable @dictionary in closure for fitness() only my @dictionary; my %freqs; # calculate the fitness of the DNA sub fitness { my $dna = shift @_; my $words = dna_to_words($dna); my $fitness = 0; # start with 0 fitness my $max_entry_length = 20; # longest word we accept # you can use any word list at the end of the program # do the @dictionary initialization just once unless (@dictionary) { @dictionary = ''; foreach (@dictionary) { chomp; } # eliminate words over $max_entry_length letters, and uppercase them @dictionary = grep { length($_) > 1 && length($_) < $max_entry_length } map { uc } @dictionary; # build the letter frequencies hash (remember, all letters are uppercase) $freqs{$_}++ foreach split '', join '', @dictionary; } # there is no easy way to avoid this exhaustive check of the dictionary # without complicating this example too much foreach my $entry (@dictionary, 'A'..'Z') { # do nothing if the entry is not matched in the DNA, or vice versa next unless $words =~ /$entry/; # we have a match! (it may be a substring, that's OK) # increment the fitness depending on how long the match was; $fitness += 2**length($entry); $fitness+= $freqs{$entry} if exists $freqs{$entry}; } return $fitness; } # end of fitness() } # Function to sample from an array of weighted elements # originally written by Abigail <abigail@foad.org> # Documentation for the algorithm is at # http://theoryx5.uwinnipeg.ca/CPAN/data/Sample/Sample.html # (the CPAN Sample module) sub sample { # get the reference to the weights array my $weights = shift @_ or return undef; # internal counting variables my ($count, $sample, @count); for (my $i = 0; $i < scalar @$weights; $i ++) { $count += $weights->[$i]; $sample = $i if rand $count [$i]; } # return an index into the weights array return $sample; } # ASCII-centric byte to letter conversion sub byte_to_letter { my $dna = shift @_; my $byte = shift @_; # print "Got byte $byte\n"; my $letter = vec $dna, $byte, 8; # is the byte in the letter ranges? if so, return it. return chr($letter) if ($letter >= 65 && $letter <= 90); # if not, return a space. the use of ord() every time could be cached. return ' '; } # print the DNA out to a scalar sub dna_to_words { my $dna = shift @_; my @words; foreach my $byte (1.. $dna_byte_length) { # print the letter equivalent of the current byte push @words, byte_to_letter($dna, $byte-1); } # return the printable words return join '', @words; } init_population($pop_ref, $popsize); do { evaluate_fitness($pop_ref, \&fitness); # print out a generation summary line my @sorted_population = sort { $a->{fitness} cmp $b->{fitness} } @$pop_ref; printf "generation %d: size %dnleast fit DNA [%s]/%d\n most fit DNA [%s]/%d\n", $generation, scalar @sorted_population, dna_to_words($sorted_population[0]->{dna}), $sorted_population[0]->{fitness}, dna_to_words($sorted_population[-1]->{dna}), $sorted_population[-1]->{fitness}; survive($pop_ref, $min_fitness); # select survivors from the population select_parents($pop_ref); $pop_ref = recombine($pop_ref); # recombine() returns a whole new population array reference # from this point on, we are working with a new generation in $pop_ref mutate($pop_ref, $mut_rate); # apply mutation to the individuals } while ($generation++ < $generation_count); # run until we are out of generations sub init_population { my $population = shift @_; my $pop_size = shift @_; # for each individual foreach my $id (1 .. $pop_size) { # insert an anonymous hash reference in the population array # with the individual's data # the DNA is a random number my $random_dna = 0; foreach my $byte (1 .. $dna_byte_length) { vec($random_dna, $byte-1, 8) = int(rand(256)); # printf "Byte $byte; Random DNA is now [%64s]\n", dna_to_words($random_dna); } push @$population, { dna => $random_dna, survived => 1, parent => 0, fitness => 0 }; } } __DATA__ about algorithm and biology by century come computer electronics evolution field fitting genetic in intriguing is it most of one only progress reach rivaled sciences speed that the to was