BOL: Owner

Bash script to convert Multi-line Fasta to Single-line Fasta !

BioStar — Wed, 31 Jan 2024 00:39:21 -0600

#!/bin/bash

input_filename="multi_line.fasta"
output_filename="single_line.fasta"

# Join the sequence lines of every FASTA record into a single line.
#   * the first rule strips a trailing CR so CRLF (Windows) files convert cleanly
#   * a newline is written before a header only when sequence text is pending,
#     so the output no longer starts with a blank line and a header without
#     any sequence is not followed by an empty line
#   * empty lines inside a record are skipped
# The success message is printed only when awk itself succeeded.
if awk '
    { sub(/\r$/, "") }
    /^>/ { if (pending) printf("\n"); printf("%s\n", $0); pending = 0; next }
    length($0) { printf("%s", $0); pending = 1 }
    END { if (pending) printf("\n") }
' "$input_filename" > "$output_filename"; then
    echo "Successfully converted $input_filename to $output_filename in single-line FASTA format."
else
    echo "Error: could not convert $input_filename to $output_filename." >&2
    exit 1
fi

Perl script to convert Multi-line Fasta to Single-line Fasta !

BioStar — Wed, 31 Jan 2024 00:38:21 -0600

#!/usr/bin/perl

use strict;
use warnings;

# Convert a multi-line FASTA file into single-line FASTA.
#
# Every header line (starting with '>') is copied as-is; all sequence lines
# that follow it are joined into one line.
#
# Arguments:
#   $input_filename  - path of the FASTA file to read
#   $output_filename - path of the file to write (overwritten if it exists)
# Returns:
#   1 on success.
# Dies:
#   if either file cannot be opened, or the output file cannot be closed
#   (buffered write errors only surface at close time).
sub multi_to_single_line_fasta {
    my ($input_filename, $output_filename) = @_;

    open my $input_file, '<', $input_filename
        or die "Error: Could not open file '$input_filename': $!";
    open my $output_file, '>', $output_filename
        or die "Error: Could not open file '$output_filename': $!";

    my $current_sequence = "";

    while (my $line = <$input_file>) {
        # Strip LF as well as CR so CRLF files do not leave "\r" characters
        # embedded in the joined sequence.
        $line =~ s/[\r\n]+\z//;

        if ($line =~ /^>/) {
            # Header: flush the previous record's sequence, then the header.
            # length() is used instead of truthiness so that a sequence
            # consisting of the single character "0" is not silently dropped.
            print {$output_file} $current_sequence . "\n"
                if length $current_sequence;
            print {$output_file} $line . "\n";
            $current_sequence = "";
        }
        else {
            # Sequence line: append to the record being assembled.
            $current_sequence .= $line;
        }
    }

    # Flush the sequence of the last record.
    print {$output_file} $current_sequence . "\n" if length $current_sequence;

    close $input_file;
    close $output_file
        or die "Error: Could not close file '$output_filename': $!";

    print "Successfully converted $input_filename to $output_filename in single-line FASTA format.\n";
    return 1;
}

# Example usage:
# multi_to_single_line_fasta('multi_line.fasta', 'single_line.fasta');

Python script to convert Multi-line Fasta to Single-line Fasta

BioStar — Wed, 31 Jan 2024 00:37:15 -0600

def multi_to_single_line_fasta(input_filename, output_filename):
    """Rewrite a multi-line FASTA file as single-line FASTA.

    Header lines (those starting with '>') are written stripped of
    surrounding whitespace; the stripped sequence lines that follow a header
    are joined and written as one line. A record with no sequence text
    produces only its header line.

    Args:
        input_filename: Path of the FASTA file to read.
        output_filename: Path of the file to write (overwritten).

    Returns:
        None. A success message is printed; if a FileNotFoundError is raised
        while opening the files, an error message naming the input file is
        printed instead.
    """
    try:
        with open(input_filename, 'r') as source, open(output_filename, 'w') as sink:
            pieces = []

            def flush_record():
                # Emit the buffered sequence (if any) and reset the buffer.
                joined = "".join(pieces)
                if joined:
                    sink.write(joined + '\n')
                pieces.clear()

            for raw in source:
                if not raw.startswith('>'):
                    # Sequence line: buffer it without surrounding whitespace.
                    pieces.append(raw.strip())
                    continue
                # Header line: finish the previous record first.
                flush_record()
                sink.write(raw.strip() + '\n')

            # The final record has no following header to trigger a flush.
            flush_record()
    except FileNotFoundError:
        print(f"Error: File '{input_filename}' not found.")
    else:
        print(f"Successfully converted {input_filename} to {output_filename} in single-line FASTA format.")

# Example usage:
# multi_to_single_line_fasta('multi_line.fasta', 'single_line.fasta')

Perl script to calculate GC content !

BioStar — Tue, 30 Jan 2024 05:20:10 -0600

#!/usr/bin/perl

# Compute the GC content of a nucleotide sequence as a percentage.
#
# Arguments:
#   $sequence - the sequence string; upper- and lower-case letters are both
#               counted
# Returns:
#   The percentage (0-100) of characters that are G or C. An empty or
#   undefined sequence yields 0 instead of dying with a division by zero.
sub calculate_gc_content {
    my ($sequence) = @_;

    # Nothing to measure: avoid dividing by a zero length below.
    return 0 unless defined $sequence && length $sequence;

    # tr/// with an empty replacement list counts matching characters without
    # modifying the string, so no upper-cased copy is needed.
    my $gc_count    = ($sequence =~ tr/GCgc//);
    my $total_bases = length($sequence);

    return ($gc_count / $total_bases) * 100;
}

# Example usage: report the GC percentage of a sample sequence with two
# decimal places.
my $dna_sequence = "ATGCGCTAAAGCGAGCGAAGCGCTAGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
printf "GC content: %.2f%%\n", calculate_gc_content($dna_sequence);

Download lumpy skin disease data !

BioStar — Wed, 22 Mar 2023 05:12:12 -0500

Location

https://www.ncbi.nlm.nih.gov/sra?linkname=bioproject_sra_all&from_uid=880745


The raw genome sequence data from the 2022 outbreak in India is available in the SRA Project PRJNA880745

R script to convert and export html page to png

BioStar — Tue, 14 Mar 2023 07:03:42 -0500

# Load the plotting package
library(streamgraph)

# Build a demo table: 27 years (1990-2016) x 10 named series,
# with one randomly drawn value per row (270 rows in total)
data <- data.frame(
  year  = rep(seq(from = 1990, to = 2016), each = 10),
  name  = rep(letters[seq_len(10)], times = 27),
  value = sample(x = seq(from = 0, to = 1, by = 0.0001), size = 270)
)

# Draw a classic stream graph; it is expected to open in a browser.
# Copy the URL of the html window that appears.
streamgraph(data, key = "name", value = "value", date = "year")

# Load the screenshot package
library(webshot)

# Install PhantomJS, which webshot uses to render pages
webshot::install_phantomjs()

# PDF capture: high quality, but the printed zone cannot be chosen
webshot("paste_your_html_here.html", "output.pdf", delay = 0.2)

# PNG capture: lower quality, but the captured rectangle can be chosen
# NOTE(review): this placeholder has no ".html" suffix, unlike the one above - confirm
webshot("paste_your_html_here", "output.png", delay = 0.2, cliprect = c(440, 0, 1000, 10))

Raku script to find palindrome in genomes !

BioStar — Tue, 07 Mar 2023 14:15:17 -0600

# Return True when $str reads the same forwards and backwards, ignoring
# letter case and any whitespace.
#
# Parameters are read-only in Raku, so the normalised text is built in a
# local variable instead of being assigned back to $str.
# NOTE(review): this is a plain text palindrome test; it does not compare the
# sequence with its reverse complement - confirm that this is what is wanted.
sub is-palindrome(Str $str --> Bool) {
    my $normalized = $str.uc.subst(/\s+/, '', :g); # upper-case, drop whitespace
    return $normalized eq $normalized.flip;
}

# Print every substring of $dna whose length lies between $min-length and
# $max-length (both inclusive) and that is-palindrome() accepts, together
# with its 0-based start position.
sub find-palindromes(Str $dna, Int $min-length, Int $max-length) {
    for $min-length..$max-length -> $length {
        # The last valid start position is chars - length, so the range must
        # include its end point; the previous exclusive range (..^) skipped
        # the final window of every length. The range is empty when $length
        # exceeds the sequence length.
        for 0 .. $dna.chars - $length -> $pos {
            my $substring = $dna.substr($pos, $length);
            if is-palindrome($substring) {
                say "Palindrome found at position $pos: $substring";
            }
        }
    }
}

# Example usage: scan a sample DNA sequence for palindromes of 3 to 8 characters
my $dna = "GGATCCATGGCCTAGG";
my ($shortest, $longest) = 3, 8;
find-palindromes($dna, $shortest, $longest);

Perl script to find edit distance between two sequences !

BioStar — Tue, 07 Mar 2023 14:11:00 -0600

#!/usr/bin/perl

use strict;
use warnings;

# Compute the edit distance between two strings by dynamic programming.
#
# Arguments:
#   $s1, $s2 - the strings to compare
# Returns:
#   The minimum number of single-character insertions, deletions and
#   substitutions that turn $s1 into $s2.
sub edit_distance {
    my ($s1, $s2) = @_;

    my $rows = length($s1);
    my $cols = length($s2);

    # $table[$r][$c] is the distance between the first $r characters of $s1
    # and the first $c characters of $s2. Only the borders need seeding:
    # every interior cell is assigned by the loops below.
    my @table;
    $table[$_][0] = $_ for 0 .. $rows;
    $table[0][$_] = $_ for 0 .. $cols;

    for my $r (1 .. $rows) {
        my $left_char = substr($s1, $r - 1, 1);
        for my $c (1 .. $cols) {
            # A substitution is free when the two characters already agree.
            my $substitution = $left_char eq substr($s2, $c - 1, 1) ? 0 : 1;
            $table[$r][$c] = min(
                $table[$r - 1][$c] + 1,
                $table[$r][$c - 1] + 1,
                $table[$r - 1][$c - 1] + $substitution,
            );
        }
    }

    return $table[$rows][$cols];
}

# Return the numerically smallest of the given values
# (undef when called without arguments).
sub min {
    my ($smallest, @others) = @_;
    for my $candidate (@others) {
        if ($candidate < $smallest) {
            $smallest = $candidate;
        }
    }
    return $smallest;
}

# Example usage: compare two sample sequences and report their edit distance
my ($seq1, $seq2) = ("ACGTAGCTAGCTGACTGAC", "CGTAGCTAGCTGACAGCTA");
my $distance = edit_distance($seq1, $seq2);
print "The edit distance between $seq1 and $seq2 is $distance.\n";

Perl script to find inverted repeats !

BioStar — Tue, 07 Mar 2023 06:25:23 -0600

#!/usr/bin/perl

use strict;
use warnings;

use Bio::SeqIO;
use Bio::Tools::Run::RepeatMasker;

# Input FASTA; the same path is read below and handed to RepeatMasker.
my $genome_file = "genome.fasta";

# Pull the first record out of the FASTA file and keep its residues as a
# plain string (further records in the file are not read here).
my $seqio  = Bio::SeqIO->new(-file => $genome_file, -format => "fasta");
my $seqobj = $seqio->next_seq();
my $seq    = $seqobj->seq();

# Hand the genome file to the RepeatMasker wrapper.
# NOTE(review): this relies on run() accepting a file path and returning an
# object that offers next_result() - confirm against the installed BioPerl-Run.
my $rm        = Bio::Tools::Run::RepeatMasker->new();
my $rm_report = $rm->run($genome_file);

# Walk the reported hits and print those whose class is "Inverted".
# NOTE(review): the accessor names and the "Inverted" class label are taken
# as written; verify them against real RepeatMasker output.
while (my $hit = $rm_report->next_result()) {
    # Every accessor is called for every hit, in this order.
    my $rm_match  = $hit->repeat_consensus();
    my $rm_class  = $hit->repeat_class();
    my $rm_start  = $hit->start();
    my $rm_end    = $hit->end();
    my $rm_strand = $hit->strand();

    next unless $rm_class eq "Inverted";

    # start/end are used as 1-based, inclusive coordinates into $seq.
    my $rm_seq = substr($seq, $rm_start - 1, $rm_end - $rm_start + 1);

    # Hits on the minus strand are shown as their reverse complement.
    $rm_seq = reverse_complement($rm_seq) if $rm_strand eq "-";

    print "Inverted repeat found at positions $rm_start-$rm_end: $rm_seq\n";
}

# Return the reverse complement of a DNA string.
#
# A/C/G/T are complemented in both upper and lower case; every other
# character is kept unchanged (only its position is reversed).
sub reverse_complement {
    my ($seq) = @_;

    # Assigning to a scalar puts reverse() in scalar context, which reverses
    # the characters of the string.
    my $flipped = reverse $seq;

    (my $complemented = $flipped) =~ tr/ACGTacgt/TGCAtgca/;
    return $complemented;
}

Identify genome-wide synteny with LASTZ alignment

BioStar — Mon, 05 Dec 2022 04:52:48 -0600

# This is a walkthrough of how to identify genome-wide synteny markers based on LASTZ alignment.

Step 1: Mask the repeat sequences for both genomes and chromosomes.

# One RepeatMasker run per genome, each on its own line (the two commands were
# previously fused into a single line, which made the second one a list of
# arguments to the first).
RepeatMasker -pa 40 -nolow -norna -gff -xmall -lib custom.TE.lib_for_rice.fa AAChr1.txt
RepeatMasker -pa 40 -nolow -norna -gff -xmall -lib custom.TE.lib_for_FF.fa FFChr1.txt

Step2: Alignment using LASTZ and Chain/Net

# Pairwise alignment of the two chromosomes, written in AXT format
lastz AAChr1.txt FFChr1.txt K=2200 L=6000 Y=3400 E=30 H=0 O=400 T=1 --format=axt --out=chr01.axt

# Chain the alignments, then filter the chains
axtChain -linearGap=medium chr01.axt AAChr1.txt FFChr1.txt chr01.axt.chain
chainPreNet chr01.axt.chain AAChr1.txt.sizes FFChr1.txt.sizes chr01.chain.filter

# Build the target/query nets and add synteny information
chainNet chr01.chain.filter -minSpace=1 AAChr1.txt.sizes FFChr1.txt.sizes chr1.chain.filter.tnet chr1.chain.filter.qnet
netSyntenic chr1.chain.filter.tnet chr1.chain.filter.tnet.synnet

# Convert the syntenic net back to AXT, sort it, and export MAF
# NOTE(review): the .synnet file is listed twice in the netToAxt call below,
# exactly as in the original text - check against the netToAxt usage message.
netToAxt chr1.chain.filter.tnet.synnet chr1.chain.filter.tnet.synnet chr01.chain.filter AAChr1.txt FFChr1.txt chr1.chain.filter.tnet.synnet.axt
axtSort chr1.chain.filter.tnet.synnet.axt chr1.chain.filter.tnet.synnet.Sort.axt
axtToMaf chr1.chain.filter.tnet.synnet.axt AAChr1.txt.sizes FFChr1.txt.sizes -tPrefix=target. -qPrefix=query. chr1.chain.filter.tnet.synnet.axt.maf

Step 3: Get syntenic markers

perl Maf2rawsynteny.pl chr1.chain.filter.tnet.synnet.axt.maf target.AAChr1 query.FFChr1 
perl Get_synteny.pl -i chr1.chain.filter.tnet.synnet.axt.maf -n 0 -m 3 -t target.AAChr1 -q query.FFChr1 -o syn.final.out

@ https://github.com/yiliao1022/Centromere_synteny_project