#!/usr/bin/perl -w
use strict;
# if the number of input arguments is lower than 2
# return a message showing the error
if (scalar(@ARGV) < 2) {
print "dnaloglkh.pl codontable DNAsequence\n";
exit(1);
}
# create two vars contaning the filenames
my $filecodontable = $ARGV[0];
my $filesequence = $ARGV[1];
# open the first file: table of codon usage frequencies
if (!open(PROPCODONS,"< $filecodontable")) {
print "dnaloglkh.pl: the file $filecodontable can not be opened\n";
exit(1);
}
# load the frequencies of codon usage into the hash table %pcodons
# this hash is indexed by a triplet of nucleotides or codon
my %pcodons;
my ($codon,$freq);
my $line;
# for each line in the file (tableofcodons) do
while ($line = ) {
# extract the two values or columns with a regular expression
# A group of letters and a decimal number
($codon,$freq) = ($line =~ m/(\w+) (\d+\.\d+)/);
# load the hash table
$pcodons{$codon}=$freq;
}
close(PROPCODONS);
##########################################################
# open the DNA sequence (second file)
if (!open(SEQUENCE,"< $filesequence")) {
print "dnaloglkh.pl: the file $filesequence can not be opened\n";
exit(1);
}
# FASTA format:
# >header containing a description of the sequence
# AGACTGACTTAC
# AGACTGACTTAC
# AGACTGAC
my $iden = ; # reading the header of the sequence
my @seql = ; # load the complete sequence into an array
my $seq = ""; # use a var to save the whole sequence inside
close(SEQUENCE);
# concat every segment (line) in the array into the var $seq
my $i=0;
while ($i < scalar(@seql)) {
chomp($seql[$i]);
$seq = $seq . $seql[$i];
$i = $i + 1;
}
$seq = "\U$seq";
# we are going to use the array @codons to store all of the codons in
# the input DNA sequence. To split the sequence in segments of length
# three, a regular expression will be used searching for all of the
# possible groups of three symbols (option g)
my @codons = ($seq =~ m/.../g);
# the likelihood ratio is for a current triplet, the ratio between
# the probability computed from the table of codon frequencies divided
# by the random probability (uniform), that is, 1/64. After this,
# the log must be applied to obtain the log-likelihood ratio.
# It is recommended the use of logs to manage calculations involving
# small numbers between 0 and 1. Log of the product is equal to the
# the sum of logs and the log of the division is the substraction of logs
my $r = 0.0;
$i=0;
while ($i < scalar(@codons)) {
$r = $r + log($pcodons{$codons[$i]}) - log(1/64);
$i = $i + 1;
}
printf "%.2f\n",$r;
__END__
To test the program, save this two files in your current working directory: exon.fa and intron.fa. Both files contain the sequence of a real exon and intron, respectively. Extremely different values of coding potential should be obtained with the two sequences: high values (positive) for coding protein sequences and low values (zero or even negative) for intronic regions.
exon.fa
>HUMHBB.2ex
GCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACT
CCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCT
TTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCT
GCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGG
intron.fa.
>HUMHBB.2pin
ATAACAATTGTTTTCTTTTGTTTAATTCTTGCTTTCTTTTTTTTTCTTCTCCGCAATTTTT
ACTATTATACTTAATGCCTTAACATTGTGTATAACAAAAGGAAATATCTCTGAGATACATT
AAGTAACTTAAAAAAAAACTTTACACAGTCTGCCTAGTACATTACTATTTGGAATATATGT
GTGCTTATTTGCATATTCATAATCTCCCTACTTTATTTTC