BOL: All

Extract the values between two user-defined strings with Perl

Jit — Wed, 24 Jan 2018 16:18:10 -0600

#!/usr/bin/perl -w
use strict;

# Scan the embedded DATA section line by line. A line that begins (after
# optional whitespace) with START opens a record, which process_record()
# then consumes and prints.
while (<DATA>)
{
    process_record() if /^\s*START/;
}

# Print the body of the current record: every line read from DATA up to, but
# not including, the next line that begins (after optional whitespace) with
# END, or up to end of input if no END follows. The START/END marker lines
# themselves are never printed.
sub process_record
{
   while (defined (my $line = <DATA>))
   {
      last if $line =~ /^\s*END/;
      print $line;
   }
   print "\n";  # blank spacer line between records
}

__DATA__

XXXX
YYYY
START
These are the first
set of lines
which are to be extracted
END
   START
New line
And new
Will be extracted?
END
XXX
ZZZ
YYY
START
These are the second
set of lines
which are to be extracted
END
aasds
tteret
tertetr

Remove duplicate lines that appear next to each other with Perl

Jit — Wed, 24 Jan 2018 15:35:15 -0600

#!/usr/bin/perl
use strict;
use warnings;

{
  # Prime the comparison with the first line of the embedded DATA section.
  # $current always holds the most recently read line that has not been
  # printed yet.
  my $current = <DATA>;

  while( defined(my $next_line = <DATA>) )
  {
    # A line is printed only when the line after it differs, so a run of
    # identical neighbouring lines collapses to a single copy.
    print $current if $current ne $next_line;
    $current = $next_line;
  }

  # The final line has no successor to be compared with, so it is emitted
  # here. The defined() guard avoids an "uninitialized value" warning when
  # the DATA section is empty.
  print $current if defined $current;
}
__DATA__
apple
apple
plum
vinegar
apple
banana
banana
banana
apple

Remove duplicate lines with perl

Jit — Wed, 24 Jan 2018 15:12:18 -0600

#! perl -sw
use strict;
# Number of times each line has been seen so far, keyed by the full line text
# (including its line ending).
my %lines;

# To de-duplicate a file instead of the embedded data, open it with a lexical
# handle, e.g.  open(my $fh, '<', $ARGV[0]) or die "Couldn't open $ARGV[0]: $!\n";
# and read from <$fh> below.
while (my $line = <DATA>) {
    # The post-increment returns the previous count, so only the first
    # occurrence of any line is printed.
    print $line if not $lines{$line}++;
}

__DATA__
apple
apple
plum
vinegar
apple
banana
banana
banana
apple

Plot the clock using the LASTZ general-format output file

Jit — Thu, 18 Jan 2018 11:05:25 -0600

use strict;
use warnings;
use Statistics::R ;
use List::Util qw(sum);

#Usage  perl clockPlot.pl Palindrome.palfc 1500
# Validate the command line and open the input before launching R, so that a
# usage error or an unreadable file does not leave an R process behind.
die "Usage: perl clockPlot.pl <alignment table> [min size]\n" unless @ARGV;
my $fileN=$ARGV[0];
my $mSize=$ARGV[1]; # only needed by the (disabled) length filter below

# Three-argument open: the file name can never be interpreted as a mode or
# a pipe command.
open (my $fh2, '<', $fileN) or die "Could not open file $fileN $!";

my $R = Statistics::R->new() ;
$R->startR ;

# @allKeys/@allFreq are filled later from %pHash; %pHash counts the rows seen
# for each "<label>_<column 4>" key.
my (@allKeys, @allFreq); my %pHash;
while(<$fh2>) {
	chomp;
	next if /^$/; #next if empty
	my @arr = split("\t", $_);
	# A row with fewer than 8 columns has no column 8 to test.
	next if @arr < 8;
	# Only rows whose 8th column is "-" are counted.
	# NOTE(review): presumably the second strand column of the LASTZ
	# general format -- confirm against the file actually used.
	if ($arr[7] eq "-") {
		# Disabled length filter (column 6 minus column 5 against $mSize):
		#my $len=$arr[5]-$arr[4];
		#next if $len < $mSize;
		# Label = text of column 2 before its first underscore, with
		# all lowercase letters removed.
		my @chr = split '\_', $arr[1];
		$chr[0] =~ s/[a-z]//g;
		# Column 4 is carried inside the key so that it can be used
		# as the divisor when the counts are turned into frequencies.
		my $newChr="$chr[0]"."_"."$arr[3]";
		$pHash{$newChr}++;
	}
}
close($fh2);

# Split every "<label>_<divisor>" key of %pHash back into its two parts and
# record the label together with count/divisor as that label's frequency.
# (An upper limit on the raw count could be applied here if needed.)
for my $key (keys %pHash) {
	my ($label, $divisor) = split '\_', $key;
	push @allKeys, $label;
	push @allFreq, $pHash{$key} / $divisor;
}

# Values interpolated into the R script: the number of points, that number
# with an "L" suffix, and the two comma-separated value lists.
my $hLen=@allKeys;
my $ll=$hLen.'L';
my $allKeys=join(',', @allKeys);
my $allFreq=join(',', @allFreq);

# Send the plotting script to R. The Perl values $allKeys, $allFreq, $ll and
# $hLen are interpolated into the text before R sees it: the script builds a
# data frame "d" with the columns Chromosome and Frequency, prints its
# structure with str(), and draws a ggplot2 bar chart in polar coordinates.
# The labels in $allKeys are interpolated without quotes, so R parses them as
# bare tokens.
# NOTE(review): the script contains no explicit device/ggsave call, so where
# the plot ends up depends on R's defaults -- confirm.
$R->run(qq`
d <- structure(list(Chromosome = c($allKeys), Frequency = c($allFreq)), .Names = c("Chromosome", "Frequency"
), row.names = c(NA, $ll), class = "data.frame")
str(d)

library(ggplot2)
ggplot(d , aes(x =  Chromosome, y =  Frequency, fill=Frequency)) +
  coord_polar(theta = "x", start = -pi/5) +
  geom_bar(stat = "identity") + 
  scale_x_continuous(breaks = seq(0, $hLen, 50))`);

# Shut down the R session opened for this script.
$R->stopR() ;

Create genome scaffolding with Perl

BioStar — Mon, 08 Jan 2018 23:51:46 -0600

#!/usr/bin/perl

use warnings;
use strict;
use English;

use Pod::Usage; ## uses pod documentation in usage code
use Getopt::Long qw(:config auto_version auto_help pass_through);

our $VERSION = "1.00";

=head1 NAME

psl_scaffolder.pl - use self-mapped PSL file to scaffold a genome

=head1 SYNOPSIS

./psl_scaffolder.pl -query <query.fasta> [options] <mapping.psl> [<mapping.psl> ...]

=cut

# Return the numerically smaller of two values.
# The arguments are copied into lexicals: assigning them to $a/$b would
# overwrite the package variables that sort() uses for its comparisons.
sub min {
  my ($x, $y) = @_;
  return( ($x < $y) ? $x : $y);
}

# Return the numerically larger of two values.
# The arguments are copied into lexicals: assigning them to $a/$b would
# overwrite the package variables that sort() uses for its comparisons.
sub max {
  my ($x, $y) = @_;
  return( ($x > $y) ? $x : $y);
}

# Reverse-complement a nucleotide string.
# Every base (including the IUPAC ambiguity codes) is swapped for its
# complement, U is complemented to A, lower-case (masked) letters stay
# lower-case, and "-" as well as any unlisted character is left unchanged.
# The complemented string is then returned in reverse order.
sub rc {
  my ($seq) = @_;
  # upper- and lower-case alphabets are translated in a single pass
  (my $complement = $seq) =~
    tr/ACGTUYRSWMKDVHBXNacgtuyrswmkdvhbxn-/TGCAARYSWKMHBDVXNtgcaaryswkmhbdvxn-/;
  my $reversed = reverse($complement);
  return($reversed);
}

# Return the consensus symbol for one alignment column.
#   $b1, $b2 - the two single-character bases of the column; " " marks an
#              absent base.
# Returns:
#   - the other base when one of the two is absent,
#   - the base itself when both are identical,
#   - the upper-case base when both differ only in case (masked vs. unmasked),
#   - the result of the two-symbol lookup table below when the upper-cased
#     pair is listed there,
#   - "N" for every other combination (e.g. a gap against a base).
sub getConsensus {
  my ($b1, $b2) = @_;
  ## an absent base contributes nothing, so the consensus is the other base
  return($b2) if($b1 eq " ");
  return($b1) if(($b2 eq " ") || ($b1 eq $b2));
  # if different, convert to upper case to simplify lookup
  my ($u1, $u2) = (uc($b1), uc($b2));
  # same base, differing only in (soft-masking) case
  return($u1) if($u1 eq $u2);
  # The table keys are in alphabetical order, so the pair has to be sorted
  # before the lookup. "cmp" cannot be used as the condition here: it is
  # true for both "less than" and "greater than".
  my $bc = ($u1 lt $u2) ? $u1.$u2 : $u2.$u1;
  my %consensusLookup =
    (AC => "M", AM => "A", CM => "C",
     GT => "K", GK => "G", KT => "T",
     AG => "R", AR => "A", GR => "G",
     CT => "Y", CY => "C", TY => "T",
     AT => "W", AW => "A", TW => "T",
    );
  # if "simple" ambiguity can be found, return that, otherwise return N
  # (i.e. GT => K, -A => N, YM -> N)
  return( exists($consensusLookup{$bc}) ? $consensusLookup{$bc} : "N");
}

# Return the marker character for one alignment column: " " when the two
# bases are considered compatible, "*" when they are not.
# A column is compatible when either base is absent (" ") or "N", or when
# both bases are the same string.
sub getMatch {
  my ($b1, $b2) = @_;
  foreach my $base ($b1, $b2) {
    # an absent or unknown base never counts as a mismatch
    return(" ") if(($base eq " ") || ($base eq "N"));
  }
  return(($b1 eq $b2) ? " " : "*");
}

############### Program starts here

# Default value of every option; GetOptions replaces the entries that are
# given on the command line.
my $projOpts = {
  "query"     => 0,               # contig file for query sequences
  "prefix"    => "psl_scaffold_", # prefix for contig names
  "pid"       => 90,              # percent ID threshold
  "trimlimit" => 50,              # max number of overlapping bases outside
                                  # match region
};

GetOptions($projOpts, 'query=s', 'pid=i', 'trimlimit=i', 'prefix=s');

# Whatever GetOptions left in @ARGV must name existing files (hopefully PSL
# files). The first argument that does not stops the program with a usage
# message.
my @pslFiles = ();
foreach my $argument (@ARGV) {
  unless(-f $argument){
    pod2usage({-exitVal => 1,
               -message => "Error: Unknown command-line option or ".
               "non-existent file, '$argument'\n", -verbose => 0});
  }
  push(@pslFiles, $argument);
}

# leave only the accepted files in @ARGV, so a later <> reads exactly those
@ARGV = @pslFiles;

# The query assembly is mandatory and has to be an existing file.
if(!$projOpts->{"query"}){
  pod2usage({-exitVal => 1,
             -message => "Error: No query assembly file provided",
             -verbose => 0});
} elsif(!(-f $projOpts->{"query"})){
  pod2usage({-exitVal => 1,
             -message => sprintf("Error: query file '%s' doesn't exist",
                                 $projOpts->{"query"}),
             -verbose => 0});
}

print(STDERR "Loading query sequences into memory...");
## An existing file can still be unreadable, so the open itself is checked
## instead of silently reading nothing.
open(my $queryFile, "<", $projOpts->{"query"})
  or die(sprintf(" Error: cannot open query file '%s': %s\n",
                 $projOpts->{"query"}, $!));
## ID of the record currently being read ("" until the first header is seen)
my $seqID = "";
## sequence ID => { fullName => complete header text, sequence => residues }
my %querySeqs = ();
while(<$queryFile>){
  chomp;
  if(/^>((.+?)( .*?\s*)?)$/){
    ## line is sequence header: $1 is the whole header text after ">",
    ## $2 the part of it before the first space
    $seqID = $2;
    $querySeqs{$seqID}{fullName} = $1;
    $querySeqs{$seqID}{sequence} = "";
  } else {
    ## Compare with "" rather than testing for truth, so that a sequence
    ## whose ID is "0" is not mistaken for a missing header.
    if($seqID eq ""){
      pod2usage({-exitVal => 1,
                 -message => sprintf(" Error: query file '%s' doesn't look ".
                                     "like a FASTA file (no initial ID header)",
                                     $projOpts->{"query"}),
                 -verbose => 0});
    }
    ## line is sequence
    $querySeqs{$seqID}{"sequence"} .= $_;
  }
}
close($queryFile);

## Targets are the same set of sequences as the queries. This is a shallow
## copy: both hashes refer to the same per-sequence records.
my %targetSeqs = %querySeqs;
## running number used to name newly created scaffolds
my $nextScaffoldID = 1;

## sequence ID => best merged replacement found for that sequence
my %replacementSeqs = ();

printf(STDERR " loaded in %d sequences\n", scalar(keys(%querySeqs)));

print(STDERR "Processing results...");
## Every input line is treated as one PSL alignment record. A record that
## passes the identity and trim thresholds is turned into a merged consensus
## sequence, which is remembered in %replacementSeqs for both sequences
## involved whenever it beats the replacement already stored for them.
while(<>){
  chomp;
  my @fields = split(/\t/);
  my ($matches, $misMatches, $repMatches, $nCount, $qNumInsert,
      $qBaseInsert, $tNumInsert, $tBaseInsert, $strand, $qName,
      $qSize, $qStart, $qEnd, $tName, $tSize,
      $tStart, $tEnd, $blockCount, $blockSizes, $qStarts,
      $tStarts, @rest) = @fields;
  if(!$tStarts){
    pod2usage({-exitVal => 1,
               -message => sprintf(" Error: mapping file doesn't look ".
                                   "like a PSL file (expecting".
                                   ">=21 tab-separated values, got %d)",
                                  scalar(@fields)),
               -verbose => 0});
  }
  ## calculate percent identity
  my $qAliSize = $qEnd - $qStart;
  my $tAliSize = $tEnd - $tStart;
  my $sizeDif = abs($qAliSize - $tAliSize);
  my $alignedBases = $matches + $repMatches + $misMatches;
  ## a record without aligned bases has no defined identity (and would
  ## otherwise abort the run with a division by zero), so it is skipped
  next if($alignedBases <= 0);
  my $pid = 100 * ($matches + $repMatches -
                   ($qNumInsert + $tNumInsert + 3*log(1+$sizeDif))) /
                     $alignedBases;
  if(($pid >= $projOpts->{"pid"}) &&
     $querySeqs{$qName} && $targetSeqs{$tName}){
    ## "s" = the shorter and "l" = the longer of the two sequences. The
    ## target columns sit 4 fields after the matching query columns (and the
    ## target block starts 1 field after the query block starts), which is
    ## what the index arithmetic below relies on.
    my $shortTarget = ($tSize < $qSize) ? 1 : 0;
    my $longTarget = (1 - $shortTarget);
    my $sName = $fields[9 + ($shortTarget * 4)];
    my $lName = $fields[9 + ($longTarget * 4)];
    my $sLen = $fields[10 + ($shortTarget * 4)];
    my $lLen = $fields[10 + ($longTarget * 4)];
    my $sStart = $fields[11 + ($shortTarget * 4)];
    my $lStart = $fields[11 + ($longTarget * 4)];
    my $sEnd = $fields[12 + ($shortTarget * 4)];
    my $lEnd = $fields[12 + ($longTarget * 4)];
    my @sBlStarts = split(/,/, $fields[19 + $shortTarget]);
    my @lBlStarts = split(/,/, $fields[19 + $longTarget]);
    my @blSizes = split(/,/, $fields[18]);
    my ($sSeq, $lSeq) = ($querySeqs{$qName}{sequence},
                         $querySeqs{$tName}{sequence});
    if($shortTarget){
      ($sSeq, $lSeq) = ($lSeq, $sSeq);
    }
    my $doRC = ($strand eq "-");
    if($doRC){
        ## The query is the sequence that gets reverse-complemented. Its
        ## start/end are converted to coordinates on the reversed sequence:
        ## an interval [start, end) of a sequence of length len becomes
        ## [len - end, len - start).
        if ($shortTarget) { # target sequence is assumed to be forward strand
          $lSeq = rc($lSeq);
          ($lStart, $lEnd) = ($lLen - $lEnd, $lLen - $lStart);
        } else {
          $sSeq = rc($sSeq);
          ($sStart, $sEnd) = ($sLen - $sEnd, $sLen - $sStart);
        }
    }
    ## number of bases that both sequences have before / after the aligned
    ## region, i.e. unaligned sequence that overlaps between the two
    my $preTrim = min($sStart, $lStart);
    my $postTrim = min($sLen - $sEnd, $lLen - $lEnd);
    ## Only continue if the overlapping but unaligned part is small enough
    ## (at most "trimlimit" bases in total)
    my $trimTotal = ($preTrim + $postTrim);
    if($trimTotal <= $projOpts->{"trimlimit"}){
      my $sPre = substr($sSeq, 0, $sStart);
      my $lPre = substr($lSeq, 0, $lStart);
      my $sPost = substr($sSeq, $sEnd);
      my $lPost = substr($lSeq, $lEnd);
      my $sPreTrim = substr($sPre, length($sPre)-$preTrim);
      my $sPostTrim = substr($sPost, 0, $postTrim);
      my $lPreTrim = substr($lPre, length($lPre)-$preTrim);
      my $lPostTrim = substr($lPost, 0, $postTrim);
      ## Rebuild both aligned sequences block by block; the unaligned bases
      ## between two blocks are copied and the shorter side is padded with
      ## "-" so that both strings keep the same length.
      my $lastS = $sBlStarts[0];
      my $lastL = $lBlStarts[0];
      my $alSeqS = "";
      my $alSeqL = "";
      for (my $i = 0; $i <= $#blSizes; $i++) {
        my $gapS = $sBlStarts[$i] - $lastS;
        my $gapL = $lBlStarts[$i] - $lastL;
        my $gapLength = max($gapS, $gapL);
        my $fillS = $gapLength - $gapS;
        my $fillL = $gapLength - $gapL;
        $alSeqS .= ("-" x $fillS) . substr($sSeq, $sBlStarts[$i]-$gapS, $gapS);
        $alSeqL .= ("-" x $fillL) . substr($lSeq, $lBlStarts[$i]-$gapL, $gapL);
        $alSeqS .= substr($sSeq, $sBlStarts[$i], $blSizes[$i]);
        $alSeqL .= substr($lSeq, $lBlStarts[$i], $blSizes[$i]);
        $lastS = $sBlStarts[$i] + $blSizes[$i];
        $lastL = $lBlStarts[$i] + $blSizes[$i];
      }
      $alSeqS = $sPreTrim . $alSeqS . $sPostTrim;
      $alSeqL = $lPreTrim . $alSeqL . $lPostTrim;
      ## column-by-column consensus of the two padded strings
      my $alConsensus = "";
      for (my $i = 0; $i < length($alSeqS); $i++) {
        $alConsensus .= getConsensus(substr($alSeqS,$i,1),substr($alSeqL,$i,1));
      }
      my $consensusLength = length($alConsensus);
      ## merged sequence = non-overlapping leading parts + consensus +
      ## non-overlapping trailing parts
      $alConsensus =
        substr($sPre, 0, length($sPre) - $preTrim).
          substr($lPre, 0, length($lPre) - $preTrim).
            $alConsensus.
              substr($sPost, $postTrim).substr($lPost, $postTrim);
      my $newSeqID = sprintf("%s_%d", $projOpts->{"prefix"}, $nextScaffoldID++);
      ## A candidate replaces the stored one when it needed fewer trimmed
      ## bases, or the same number with a longer consensus region.
      if(!exists($replacementSeqs{$sName}{score}) ||
         ($trimTotal < $replacementSeqs{$sName}{score}) ||
         (($trimTotal == $replacementSeqs{$sName}{score}) &&
          ($consensusLength > $replacementSeqs{$sName}{clength}))){
        $replacementSeqs{$sName}{score} = $trimTotal;
        $replacementSeqs{$sName}{clength} = $consensusLength;
        $replacementSeqs{$sName}{fullName} =
          sprintf("%s [%s %s]", $newSeqID, $sName, $lName);
        $replacementSeqs{$sName}{sequence} = $alConsensus;
        # printf(STDERR "Match: $sName\n");
      }
      if(!exists($replacementSeqs{$lName}{score}) ||
         ($trimTotal < $replacementSeqs{$lName}{score}) ||
         (($trimTotal == $replacementSeqs{$lName}{score}) &&
          ($consensusLength > $replacementSeqs{$lName}{clength}))){
        $replacementSeqs{$lName}{score} = $trimTotal;
        $replacementSeqs{$lName}{clength} = $consensusLength;
        $replacementSeqs{$lName}{fullName} =
          sprintf("%s [%s %s]", $newSeqID, $sName, $lName);
        $replacementSeqs{$lName}{sequence} = $alConsensus;
        # printf(STDERR "Match: $lName\n");
      }
    } else {
      # printf(STDERR "Rejecting match '%s' vs '%s': too many bases trimmed (%d [%d,%d] [%d,%d])\n",
      #        $qName, $tName, $trimTotal, $sStart, $lStart, $sLen-$sEnd, $lLen-$lEnd);
    }
  } elsif($pid < $projOpts->{"pid"}){
    # printf(STDERR "Rejecting match '%s' vs '%s': identity (%f) too low\n",
    #      $qName, $tName, $pid);
  }
}
printf(STDERR " done\n");

## header lines that have already been written to standard output
my %displayed = ();

## Write all sequences in FASTA format, ordered by sequence ID. A sequence
## with an entry in %replacementSeqs is written with the replacement's
## header and residues instead of its own, and every distinct header is
## written only once.
foreach my $seqID (sort(keys(%targetSeqs))){
  my $record = $targetSeqs{$seqID};
  if(exists($replacementSeqs{$seqID})){
    print(STDERR "Found match for $seqID\n");
    $record = $replacementSeqs{$seqID};
  }
  my ($fullName, $sequence) = @{$record}{"fullName", "sequence"};
  next if($displayed{$fullName});
  printf(">%s\n%s\n", $fullName, $sequence);
  $displayed{$fullName} = 1;
}

Insert a sequence at the desired location in a multi-FASTA file with Perl

Jit — Wed, 03 Jan 2018 10:05:30 -0600

#!/usr/bin/perl

use warnings;
use strict;
use Bio::SeqIO;
use Bio::Seq;
use File::Copy;

# Usage: perl <this script> <insertions.tsv> <genome.fasta>
#
# <insertions.tsv> holds one insertion per line, tab-separated:
#   chromosome name, insertion position, sequence to insert, length of that
#   sequence
# Keep the lines sorted by chromosome name and position. Insertions must not
# overlap: on the same chromosome the next position has to be larger than the
# previous position plus the length of the previously inserted sequence.
#
# The genome file is rewritten after every insertion (tmp.fa is moved over
# it), so each line is applied to the result of all lines before it.

die "Usage: $0 <insertions.tsv> <genome.fasta>\n" unless @ARGV >= 2;

open(my $fh, '<:encoding(UTF-8)', $ARGV[0])
    or die "Could not open file $ARGV[0] $!";
my $genome = $ARGV[1]; # input fasta file (genome file)
my $out = 'tmp.fa'; # output fasta file

while (my $line = <$fh>) {
    chomp $line;
    next if $line =~ /^\s*$/; # nothing to insert for a blank line

    # chromosome, position, inserted sequence and its length
    my ($chr, $pos, $seqI, $alienLen) = split '\t', $line;

    my $seq_in  = Bio::SeqIO->new( -format => 'fasta',-file => $genome);
    my $seq_out = Bio::SeqIO->new( -format => 'fasta',-file => ">".$out);
    while( my $seq = $seq_in->next_seq() ) {
        if($seq->primary_id eq $chr){
            # split the chromosome at $pos and put the new sequence between
            # the two halves
            my $residues = $seq->seq;
            my $upstream = substr($residues, 0, $pos);
            my $downstream = substr($residues, $pos);
            my $seq_obj = Bio::Seq->new(-seq => $upstream.$seqI.$downstream,-display_id => $seq->primary_id,-alphabet => "dna" );
            $seq_out->write_seq($seq_obj);
        }
        else{
            # every other record is copied unchanged
            $seq_out->write_seq($seq);
        }
    }
    # Close both streams before the temporary file replaces the genome, so
    # that everything has been written out and no handle is still open on
    # either file.
    $seq_in->close();
    $seq_out->close();

    # report the input line followed by the start and end of the insertion
    my $newLoc = $pos+$alienLen;
    print "$line\t$pos\t$newLoc\n";
    move($out, $genome)
        or die "Could not replace $genome with $out: $!";
}
close($fh);

Fill in the form and run BLAST with Perl

BioStar — Sat, 23 Dec 2017 03:48:52 -0600

use WWW::Mechanize;
use strict;
use warnings;
# Nucleotide sequence that is pasted into the query box of the form.
my $sequence = 'GCCCGCGGTCTCAGAGATCTCGATATATTATA';

# Values for the fields of the form named "myForm".
my %blast_fields = (
  'Algorithm'      => 'blastx',
  'BlastTargetSet' => 'ATH1_pep',
  'QueryText'      => $sequence,
);

# Load the page, fill in and submit the form, then print whatever the server
# sends back.
my $mech = WWW::Mechanize->new;
$mech->get('http://www.arabidopsis.org/Blast/');
$mech->submit_form(form_name => 'myForm', fields => \%blast_fields);

print $mech->content;

Convert fastq to fasta in Perl

Jit — Sun, 17 Dec 2017 17:54:15 -0600

use Bio::SeqIO;
use strict;
use warnings;

#convert .fastq.gz to .fasta
# List-form pipe open: zcat is started directly, without a shell in between.
open(my $zcat, '-|', 'zcat', 'seq.fastq.gz')
    or die "Cannot run zcat: $!";
my $in=Bio::SeqIO->new(-fh=>$zcat,
                         -format=>'fastq');
my $out=Bio::SeqIO->new(-file=>'>seq.fasta',
                          -format=>'fasta');
# copy every record; the FASTA writer only emits ID, description and sequence
while (my $seq=$in->next_seq) {
      $out->write_seq($seq);
}
$out->close();
# Closing a pipe waits for the child process; a false result means that
# zcat failed (its exit status is then in $?), so the output may be
# incomplete.
close($zcat)
    or die "zcat failed: " . ($! ? $! : "exit status $?") . "\n";

Loop over all files in a directory in bash

Jit — Sat, 16 Dec 2017 20:23:51 -0600

#!/bin/bash
# Run SatsumaSynteny once for every .fna file in the genome directory, always
# against the same reference. The results of each run go into a directory
# (created in the current working directory) named after the target file
# without its extension. The reference itself is also a .fna file in that
# directory, so it is compared with itself as well.
genome_dir=/media/ComparativeGenomics/ncbi-genomes-2017-11-13
ref="$genome_dir/GCA_000196735.1_ASM19673v1_genomic.fna"
path=/home/urbe/Tools/SATSUMA/satsuma-code-0
for f in "$genome_dir"/*
do
  # only files whose name ends in .fna are processed
  [[ "${f: -4}" == ".fna" ]] || continue
  echo "Processing $f file..."
  # file name without directory and without its last extension
  ff=$(basename "${f%.*}")
  echo "$ff"
  # take action on each file. $f store current file name
  # -p: do not fail when the directory is left over from an earlier run
  mkdir -p "$ff"
  "$path/SatsumaSynteny" -q "$ref" -t "$f" -o "$ff"
  #cat "$f"
done

Clump Finding Problem Solved with Perl

Jit — Thu, 14 Dec 2017 09:47:41 -0600

#Find patterns forming clumps in a string.
#Given: A string Genome, and integers k, L, and t.
#Return: All distinct k-mers forming (L, t)-clumps in Genome.

use strict;
use warnings;

# k-mer => number of times it occurs in $string
my %myHash;
my $string="CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC";
my $clump=4; # occurrence count a k-mer must have to be reported
my $kmer=5;  # k-mer length

# Slide a window of $kmer characters over the string and count every
# distinct k-mer once; a k-mer that was already counted is not counted again.
for (my $aa=0; $aa<=(length($string)-$kmer); $aa++) {
    my $myStr=substr  $string, $aa,$kmer;
    next if exists $myHash{$myStr};
    $myHash{$myStr}=kmerMatch ($string, $myStr, $kmer);
}

#Print all key which have matching values
# Note: this reports the k-mers that occur exactly $clump times in the whole
# string; no window length L is applied.
foreach my $name (keys %myHash){
    print "$name " if $myHash{$name} == $clump;
}

# Count the exact occurrences (overlapping ones included) of a k-mer.
#   $string - text to search
#   $myStr  - k-mer to look for
#   $kmer   - window length, i.e. the length of $myStr
# Returns the number of window positions at which the window equals $myStr;
# 0 when $string is shorter than $kmer.
sub kmerMatch { #Check the exact matching kmers with sliding window
    my ($string, $myStr, $kmer)=@_;
    my $count=0;
    # the last full window starts at length - $kmer (not at a fixed offset)
    for my $start (0 .. length($string)-$kmer) {
        $count++ if substr($string, $start, $kmer) eq $myStr;
    }
    return $count;
}