BOL: Owner

Check OS version in Linux

Radha Agarkar — Fri, 20 Mar 2020 06:28:14 -0500

The procedure to find the OS name and version on Linux:

#Open the terminal application (bash shell)
#For a remote server, log in using ssh: ssh user@server-name
#Type any one of the following commands to find the OS name and version in Linux:
    cat /etc/os-release
    lsb_release -a
    hostnamectl
#Type the following command to find the Linux kernel version:
    uname -r

Extract ids from file with perl

Radha Agarkar — Wed, 15 Mar 2017 05:21:59 -0500

#!/usr/bin/perl
use strict;
use warnings;

# Collect the ids found in the first tab-separated column of fin.txt.
#   %idHash maps each id to the line number on which it was last seen.
#   @allIds keeps the ids in the order they appear in the file.
my $fh = read_fh("fin.txt");
my %idHash;
my @allIds;
while (my $line = <$fh>) {
    chomp $line;
    my @cells = split /\t/, $line;
    next unless defined $cells[0];    # empty line: no id to record
    $idHash{$cells[0]} = $.;
    push @allIds, $cells[0];
}
close $fh;

# Delete the last id for safety -- the run that produced it might not have
# finished all steps. Guarded so an empty input file does not index an
# empty array.
delete $idHash{$allIds[-1]} if @allIds;

# NOTE(review): the original fragment ended with a bare
# "next if exists $hash{$look_for};" outside any loop and never defined
# $look_for. The loop below is one plausible completion: every id given on
# the command line that is NOT already recorded is printed. Confirm intent.
foreach my $look_for (@ARGV) {
    next if exists $idHash{$look_for};
    print "$look_for\n";
}


############################################################
#Open and Read a file
# read_fh($filename) -> filehandle opened for reading.
#   Names ending in "gz" are decompressed on the fly through gunzip; any
#   other name is opened directly. Dies with the file name and the system
#   error if the open fails.
sub read_fh {
    my $filename = shift @_;
    my $filehandle;
    if ($filename =~ /gz$/) {
        # List-form pipe open: the name reaches gunzip as a single argument
        # and is never interpreted by a shell. "--" keeps a name starting
        # with "-" from being taken as an option.
        open($filehandle, '-|', 'gunzip', '-dc', '--', $filename)
            or die "Cannot run gunzip on '$filename': $!";
    }
    else {
        # Three-argument open: the name is used verbatim, so leading mode
        # characters or surrounding whitespace in it are not reinterpreted.
        open($filehandle, '<', $filename)
            or die "Cannot open '$filename': $!";
    }
    return $filehandle;
}

Count GC Content in nucleotide sequence with Perl

Radha Agarkar — Sat, 21 May 2016 22:56:18 -0500

#!/usr/bin/perl -w

### Usage: get_gc_content.pl <fasta_file>                                            ###
### Writes one tab-separated line of base counts per FASTA record to gc_out.txt.     ###

#---------------------------------------------------------------------------------------------------------------------------
#Deal with passed parameters
#---------------------------------------------------------------------------------------------------------------------------
if (@ARGV == 0) {
    usage();
    exit;
}
$fasta_file = $ARGV[0];
$out_file = "gc_out.txt";
# Three-argument open so the file name is never parsed for mode characters.
# IN and OUT stay bareword handles because process_it() prints to OUT.
unless ( open(IN, '<', $fasta_file) ) {
    print "Got a bad fasta file: $fasta_file\n\n";
    exit 1;
}
unless ( open(OUT, '>', $out_file) ) {
    print "Couldn't create $out_file\n";
    exit 1;
}
print "Parameters:\nfasta file = $fasta_file\noutput file = $out_file\n\n";
#---------------------------------------------------------------------------------------------------------------------------
#The main event
#---------------------------------------------------------------------------------------------------------------------------
print OUT "ID\t% GCContent\tTotal Count\tG Count\tC Count\tA Count\tT Count\n";
$seq = "";            # sequence of the record being accumulated; reset by process_it()
$have_record = 0;     # true once an id column has been written and still needs its counts
while (<IN>) {
    chomp;
    if (/^>/) {
        #finish up previous record (also when its sequence was empty, so
        #every id line gets its counts and its newline).
        if ($have_record || length($seq) > 0) {
            process_it();
        }
        #start new record: the id is the first whitespace-delimited word
        #after '>', whether or not a description follows.
        $id = $_;
        $id =~ s/^>\s*(\S*).*$/$1/;
        print OUT "$id\t";
        $have_record = 1;
    }
    else {
        $seq .= $_;
    }
}

#finish up last record; skipped when the file held no data at all.
if ($have_record || length($seq) > 0) {
    process_it();
}

close(IN);
close(OUT) or die "Error while closing $out_file: $!";

# Print a one-line usage message to STDERR. $0 is the name the script was
# invoked with; the single argument is the FASTA file to analyse.
sub usage {
    print STDERR "Usage: $0 <fasta_file>\n";
}

# Write the statistics for the sequence currently held in the global $seq
# to the OUT filehandle, then clear $seq for the next record.
# Output columns: %GC, number of letters, G, C, A and T counts
# (case-insensitive). The percentage is 0 when the sequence has no letters.
sub process_it {
    # tr/// with an empty replacement list only counts matching characters.
    $totalcount = ($seq =~ tr/a-zA-Z//);
    $gcount     = ($seq =~ tr/gG//);
    $ccount     = ($seq =~ tr/cC//);
    $acount     = ($seq =~ tr/aA//);
    $tcount     = ($seq =~ tr/tT//);
    $gccount    = $gcount + $ccount;

    $gccontent = $totalcount > 0 ? (100 * $gccount) / $totalcount : 0;

    print OUT join("\t", $gccontent, $totalcount, $gcount, $ccount, $acount, $tcount) . "\n";
    $seq = "";
}

Needleman-Wunsch Algorithm in Perl

Radha Agarkar — Sat, 21 May 2016 22:07:06 -0500

#!/usr/bin/perl

# USAGE:   perl nw.pl HEAGAWGHEE PAWHEAE BLOSUM50.txt -8

# See:     "Biological sequence analysis" Durbin et al. ed. CUP 1998, Pg. 19
# Needleman-Wunsch global alignment algo (Gotoh 1982 mod)

# usage statement
die "usage: $0 <seq1> <seq2> <substitution_matrix_file> <gap_cost>\n" unless @ARGV == 4;

# get sequences, matrix and gapcost from command line
my ($seq1, $seq2, $smfile, $gapcost) = @ARGV;

# scoring scheme (instead of using fixed MATCH and MISMATCH scores we will use values read from BLOSUM50)
my $MATCH    =  1; # +1 for letters that match    (kept for reference; not used below)
my $MISMATCH = -1; # -1 for letters that mismatch (kept for reference; not used below)
my $GAP      = $gapcost; # for any gap
my %BLOSUM50 = ();   # $BLOSUM50{row letter}{column letter} = substitution score
my @aalist = ();     # column letters, taken from line 2 of the matrix file

# read substitution matrix
# Layout relied on by the parsing below: line 1 is skipped, line 2 holds the
# column letters, and every later line is "<row letter> <score> <score> ...".
# NOTE(review): column k of a row is matched with $aalist[k], so line 2 is
# presumably expected to start with whitespace (giving an empty first
# field) -- confirm against the matrix file actually used.
open(my $sm_fh, '<', $smfile)
    or die "Cannot open substitution matrix file '$smfile': $!\n";
while (my $line = <$sm_fh>) {
    next if $. < 2;
    # Strip only the line ending; chop would eat a real character on a
    # last line that has no trailing newline.
    $line =~ s/\r?\n\z//;
    # read columns names (aa)
    if ($. < 3) {
        @aalist = split(/\s+/, $line);
        next;
    }
    my @vals = split(/\s+/, $line);
    next unless @vals;    # blank line: nothing to store
    my $row_aa = $vals[0];
    for my $col (1 .. $#vals) {
        $BLOSUM50{$row_aa}{ $aalist[$col] } = $vals[$col];
    }
}
close $sm_fh;


# initialization
# Each cell is a hash with a {score} and a {pointer} naming the neighbouring
# cell the score came from. Row 0 and column 0 can only be reached through
# a run of gaps, so their scores are multiples of $GAP.
my @matrix;
$matrix[0][0] = { score => 0, pointer => "none" };
$matrix[0][$_] = { score => $GAP * $_, pointer => "left" } for 1 .. length($seq1);
$matrix[$_][0] = { score => $GAP * $_, pointer => "up" }   for 1 .. length($seq2);

# fill
# Rows follow $seq2, columns follow $seq1.
foreach my $row (1 .. length($seq2)) {
    my $residue2 = substr($seq2, $row - 1, 1);

    foreach my $col (1 .. length($seq1)) {
        my $residue1 = substr($seq1, $col - 1, 1);

        # substitution score comes from the matrix for matches and
        # mismatches alike
        my $diagonal_score = $matrix[$row - 1][$col - 1]{score} + $BLOSUM50{$residue1}{$residue2};

        # gap scores
        my $up_score   = $matrix[$row - 1][$col]{score} + $GAP;
        my $left_score = $matrix[$row][$col - 1]{score} + $GAP;

        # choose best score: diagonal wins a tie against up, and the
        # winner of that comparison wins a tie against left
        my ($best, $origin) = $diagonal_score >= $up_score
                            ? ($diagonal_score, "diagonal")
                            : ($up_score,       "up");
        ($best, $origin) = ($left_score, "left") unless $best >= $left_score;

        $matrix[$row][$col]{score}   = $best;
        $matrix[$row][$col]{pointer} = $origin;
    }
}

# trace-back
# Walk the pointers from the bottom-right cell back to the origin. Each
# step prepends one column to the alignment, so the strings are already in
# left-to-right order when the walk ends.

my ($align1, $align2, $descrstr) = ("", "", "");

# start at last cell of matrix
my ($row, $col) = (length($seq2), length($seq1));

# the walk ends at the first cell of the matrix, whose pointer is "none"
until ($matrix[$row][$col]{pointer} eq "none") {
    my $step = $matrix[$row][$col]{pointer};

    if ($step eq "diagonal") {
        # both sequences contribute a residue; mark identity with "|"
        my $res1 = substr($seq1, $col - 1, 1);
        my $res2 = substr($seq2, $row - 1, 1);
        $align1   = $res1 . $align1;
        $align2   = $res2 . $align2;
        $descrstr = ($res1 eq $res2 ? "|" : ".") . $descrstr;
        $row--;
        $col--;
    }
    elsif ($step eq "left") {
        # residue from $seq1 aligned against a gap
        $align1   = substr($seq1, $col - 1, 1) . $align1;
        $align2   = "-" . $align2;
        $descrstr = " " . $descrstr;
        $col--;
    }
    elsif ($step eq "up") {
        # residue from $seq2 aligned against a gap
        $align1   = "-" . $align1;
        $align2   = substr($seq2, $row - 1, 1) . $align2;
        $descrstr = " " . $descrstr;
        $row--;
    }
}

# print matrices:
# one line per row of the score matrix, every score followed by a tab
print "\n\n";

foreach my $r (0 .. length($seq2)) {
    print map { sprintf("%2.1f", $matrix[$r][$_]{score}) . "\t" } 0 .. length($seq1);
    print "\n";
}
print "\n\n";

# print the alignment: sequence 1, the match/mismatch markers, sequence 2
print "$align1\n", "$descrstr\n", "$align2\n";

__END__
# Entries for the BLOSUM50 matrix at a scale of ln(2)/3.0.
Find matrix at http://bioinformaticsonline.com/file/view/27455/blosum50-matrix

Parse a Fasta file with Perl

Radha Agarkar — Fri, 13 May 2016 05:00:18 -0500

#!/usr/bin/env perl

# Usage:  fastaRead.pl data.fa
#
# Prints every line of the FASTA file, labelled as header or sequence data,
# and concatenates all sequence lines into $sequence.

use strict;
use warnings;

# Fail with a usage message instead of trying to open an undefined name.
die "Usage: $0 data.fa\n" unless @ARGV;

my $filename = $ARGV[0];
my $sequence = '';
open my $fileH, "<", $filename or die "could not open $filename: $!\n";
while (my $line = <$fileH>) {
    chomp $line;
    if ($line =~ /^>/) {
        print "this line is a header: $line\n";
    }
    else {
        print "this line contains sequence data: $line\n";
        # Concatenate everything from the file into a single var
        $sequence .= $line;
    }
}
close $fileH;

Perl program to implement sliding window !

Radha Agarkar — Fri, 13 May 2016 04:28:24 -0500

#!/usr/bin/perl -w

# Load column 4 of data.txt keyed by column 2, then scan a second
# tab-separated file of "<id> <value>" lines and pick out the values that
# lie strictly within +/- margin of the value stored for the same id.

my $filename = 'data.txt';

# NOTE(review): the original never assigned $file2 or $margin. They are
# read from the command line here -- confirm this matches the intended use.
die "Usage: $0 <second_file> <margin>\n" unless @ARGV == 2;
my ($file2, $margin) = @ARGV;

open(my $TR, '<:encoding(UTF-8)', $filename)
  or die "Could not open file '$filename' $!";

my %hash;    # id (column 2 of data.txt) => value (column 4 of data.txt)
while (my $line1 = <$TR>)
{
    chomp($line1);
    my @ar = split(/\t/, $line1);
    # skip lines that do not have both the id and the value column
    next unless defined $ar[1] && defined $ar[3];
    $hash{$ar[1]} = $ar[3];
}
close $TR;

open my $SC, "<", $file2 or die "Could not open file '$file2' $!";
while (my $line2 = <$SC>)
{
    chomp($line2);
    my ($id, $val) = split /\t/, $line2;
    # ignore malformed lines and ids that never appeared in the first file
    next unless defined $id && defined $val && exists $hash{$id};
    my $val_file1 = $hash{$id};
    if ( $val > $val_file1 - $margin and $val < $val_file1 + $margin) {
        # print out something
    }
}
close $SC;

Find and replace ambiguous characters in fasta file with Perl and Bioperl

Radha Agarkar — Fri, 13 May 2016 03:20:09 -0500

#!/usr/bin/perl -w

# Report sequences that contain characters other than A, T, G, C and '-',
# and optionally (-m) rewrite those characters and print the sequences.

my $usage="\nUsage: $0 [-h] [-m char] [fastaFileName1 ...]\n".
    "  -h: help\n".
    "  -m: missing character\n".
    "Print out the name of sequences with characters other than ATGC-.\n".
    "If -m is specified, the ambiguous characters are repleced with the\n".
    "specified character.  e.g. -m '?' will place ? to the ambigous characters.\n" .
    "If multiple files are given, sequences in all files are marged.  If no \n".
    "argument is given, it will take STDIN as the input\n";

our($opt_h, $opt_m);

use Bio::SeqIO;

use Getopt::Std;
getopts('hm:') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

my $format = "fasta";
my @seqArr = ();

# No file names given: read from STDIN, represented here by '-'.
@ARGV = ('-') unless @ARGV;
# Iterate over the list rather than testing the truth of shift, so a file
# named "0" is not mistaken for the end of the arguments.
foreach my $file (@ARGV) {
    # STDIN is handed over as a filehandle instead of relying on how
    # -file => '-' is interpreted.
    my $seqio_obj = $file eq '-'
        ? Bio::SeqIO->new(-fh   => \*STDIN, -format => $format)
        : Bio::SeqIO->new(-file => $file,   -format => $format);
    while (my $seq = $seqio_obj->next_seq()) {
	push(@seqArr, $seq);
    }
}

#@seqArr = sort { $a->id() cmp $b->id() } @seqArr;

# Report every sequence with ambiguous characters on STDERR as
# "<id>\t<ambiguous characters>", and rewrite it when -m was given.
foreach my $s (@seqArr) {
    my $thisSeq = $s->seq();
    my $ambig = AmbiguousChar($thisSeq);
    if ($ambig ne "") {
	print STDERR $s->id(), "\t$ambig\n";
	if (defined($opt_m)) {
	    $thisSeq = ReplaceAmbiguousChar($thisSeq, $opt_m);
	    $s->seq($thisSeq);
	}
    }
}

# With -m, write all sequences (modified or not) to STDOUT.
if (defined($opt_m)) {
    # -fh is the option that attaches an existing filehandle; the original
    # "-fs" is not a Bio::SeqIO option.
    my $seqOut = Bio::SeqIO->new(-fh => \*STDOUT, -format => $format);
    foreach my $s (@seqArr) {
	$seqOut->write_seq($s);
    }
}
exit;


# AmbiguousChar($string) -> the characters of $string that are neither one
# of A, T, G, C, '-' (upper case only) nor whitespace, in their original
# order. Returns the empty string when nothing else is present.
sub AmbiguousChar {
    my ($leftover) = @_;
    for ($leftover) {
        tr/ATGC-//d;    # drop the unambiguous symbols
        s/\s+//g;       # drop any whitespace
    }
    return $leftover;
}

# ReplaceAmbiguousChar($string, $char) -> copy of $string in which every
# character other than A, T, G, C or '-' (upper case only) has been
# replaced by $char.
sub ReplaceAmbiguousChar {
    my ($residues, $substitute) = @_;
    # any single character, newline included, that is not in the allowed set
    my $not_plain = qr/(?![ATGC-])./s;
    $residues =~ s/$not_plain/$substitute/g;
    return $residues;
}

Blast result parser with Perl and Bioperl

Radha Agarkar — Fri, 13 May 2016 03:15:06 -0500

#!/usr/local/bin/perl

#
#	Dr. Xiaodong Bai
#	It may be freely distributed under GNU General Public License.
#	This script will parse a NCBI blastx output file and output the top N hits of each blast search result.
#	For each hit, the following results are reported:
#	accession number, length, description, E value, bit score, query frame, query start, query end, hit start, hit end, positives, and identical
# 	The results are tab-delimited and ready for import into a spreadsheet program for browsing and further analysis.
#

use strict;
use warnings;
use Bio::SearchIO;

# Usage information
die "Usage: $0 <blast_output_file> <number_of_hits> <output_file>\n" if (@ARGV != 3);

my ($infile,$numHits,$outfile) = @ARGV;
print "Parsing the BLAST result ...";
my $in = Bio::SearchIO->new(-format => 'blast', -file => $infile);
# Lexical handle and three-argument open: the output name is used verbatim.
open(my $out, '>', $outfile) or die "Cannot open $outfile: $!";

# print the header info for tab-delimited columns
print {$out} "query_name\tquery_length\taccession_number\tlength\tdescription\tE value\tbit score\tframe\tquery_start\t";
print {$out} "query_end\thit_start\thit_end\tpositives\tidentical\n";

# extract the information for each result in turn
while ( my $result = $in->next_result ) {
    # the name of the query sequence
    print {$out} $result->query_name . "\t";

    # the length of the query sequence
    print {$out} $result->query_length;

    # output "no hits found" if there are no hits
    if ( $result->num_hits == 0 ) {
        print {$out} "\tNo hits found\n";
        next;
    }

    my $count = 0;

    # process each hit in turn
    while ( my $hit = $result->next_hit ) {
        # second and later hits: leave the query name column empty (the
        # tab below then leaves the query length column empty as well)
        print {$out} "\t" if ($count > 0);
        # get the accession number of the hit
        print {$out} "\t" . $hit->accession . "\t";
        # get the length of the hit sequence
        print {$out} $hit->length . "\t";
        # get the description of the hit sequence
        print {$out} $hit->description . "\t";
        # get the E value of the hit
        print {$out} $hit->significance . "\t";
        # get the bit score of the hit
        print {$out} $hit->bits . "\t";

        my $hspcount = 0;

        # process every HSP of this hit; each one ends the current row
        while ( my $hsp = $hit->next_hsp ) {
            # second and later HSPs: leave the seven query/hit columns empty
            print {$out} "\t\t\t\t\t\t\t" if ($hspcount > 0);
            # get the frame of the query sequence
            print {$out} $hsp->query->frame . "\t";
            # get the start and the end of the query sequence in the alignment
            print {$out} $hsp->start('query') . "\t" . $hsp->end('query') . "\t";
            # get the start and the end of the hit sequence in the alignment
            print {$out} $hsp->start('hit') . "\t" . $hsp->end('hit') . "\t";
            # get the similarity value
            printf {$out} "%.1f", ($hsp->frac_conserved * 100);
            print {$out} "%\t";
            # get the identity value
            printf {$out} "%.1f", ($hsp->frac_identical * 100);
            print {$out} "%\n";
            $hspcount++;
        }
        # a hit without any HSP would otherwise leave its row unterminated
        print {$out} "\n" if $hspcount == 0;
        $count++;

        # flow control for the number of hits needed
        last if ($count == $numHits);
    }
    # terminate the row if the iterator produced no hit after all
    print {$out} "\n" if $count == 0;
}
# buffered write errors surface at close time
close($out) or die "Error while closing $outfile: $!";
print " DONE!!!\n";