BOL: All

Perl script for six-frame translation!

LEGE — Thu, 01 Feb 2024 01:52:50 -0600

#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;

# Path to the input nucleotide sequence file in FASTA format: the first
# command-line argument when one is given, otherwise the placeholder below.
my $input_fasta = @ARGV ? $ARGV[0] : 'path/to/your/input.fasta';

# Step 1: Read the first record of the input FASTA file
my $seqio = Bio::SeqIO->new(-file => $input_fasta, -format => 'fasta');
my $sequence = $seqio->next_seq;

# next_seq returns undef when the file holds no record; stop with a clear
# message instead of failing later on a method call on undef.
die "No sequence found in '$input_fasta'\n" unless defined $sequence;

# Step 2: Perform six-frame translation (three forward, three reverse frames)
my @frames = (1, 2, 3, -1, -2, -3);
foreach my $frame (@frames) {
    my $translated_seq = translate_frame($sequence, $frame);
    my $frame_type = $frame > 0 ? "Forward" : "Reverse";
    print "Frame $frame_type $frame Translation:\n$translated_seq\n";
}

# Translate a nucleotide sequence in one of the six reading frames.
#
# Arguments:
#   $sequence - sequence object providing translate() and revcom()
#               (for example the object returned by Bio::SeqIO->next_seq)
#   $frame    - reading frame: 1, 2, 3 for the given strand,
#               -1, -2, -3 for the reverse-complement strand
#
# Returns the translated sequence as a string.
# Dies if $frame is not one of the six valid values.
sub translate_frame {
    my ($sequence, $frame) = @_;

    my $frame_label = defined $frame ? $frame : 'undef';
    die "Invalid reading frame '$frame_label': expected 1, 2, 3, -1, -2 or -3\n"
        unless defined $frame && $frame =~ /\A-?[123]\z/;

    # BioPerl's translate() takes -frame as a 0-based offset (0, 1 or 2),
    # so reading frame N starts N - 1 bases into the strand being read.
    my $offset = abs($frame) - 1;

    # Negative frames are read from the reverse-complement strand.
    my $template = $frame > 0 ? $sequence : $sequence->revcom;

    return $template->translate(-frame => $offset)->seq;
}

Perl and BioPerl script to extract a protein sequence using a GFF file!

LEGE — Thu, 01 Feb 2024 01:51:00 -0600

#!/usr/bin/perl
use strict;
use warnings;
use Bio::DB::Fasta;
use Bio::SeqIO;

# Paths to your GFF file and genome FASTA file
my $gff_file = 'path/to/your/file.gff';
my $genome_fasta = 'path/to/your/genome.fasta';

# Gene ID to extract
my $gene_id_to_extract = 'your_gene_id';

# Step 1: Parse GFF file to get gene locations.
# Each entry maps a gene ID to [seqid, start, end, strand] so that the
# sequence name is still available once the parsing loop has finished.
my %gene_locations;
open my $gff_fh, '<', $gff_file or die "Cannot open GFF file: $!";
while (my $line = <$gff_fh>) {
    next if $line =~ /^#/;             # Skip comments and directives
    chomp $line;
    my @fields = split /\t/, $line;
    next unless @fields >= 9;          # Skip blank or malformed lines
    next unless $fields[2] eq 'gene';  # Consider only gene features
    my ($gene_id) = $fields[8] =~ /(?:^|;)\s*ID=([^;]+)/;
    next unless defined $gene_id;      # Gene feature without an ID attribute
    $gene_locations{$gene_id} = [ @fields[0, 3, 4, 6] ];
}
close $gff_fh;

# Step 2: Extract DNA sequence from genome
my $location = $gene_locations{$gene_id_to_extract}
    or die "Gene '$gene_id_to_extract' not found in $gff_file\n";
my ($seq_id, $start, $end, $strand) = @{$location};

my $db = Bio::DB::Fasta->new($genome_fasta);

# For a minus-strand gene, request the coordinates in reverse order:
# Bio::DB::Fasta returns the reverse complement when start > stop.
my $gene_dna_sequence = $strand eq '-'
    ? $db->seq($seq_id, $end, $start)
    : $db->seq($seq_id, $start, $end);
die "Sequence '$seq_id' not found in $genome_fasta\n"
    unless defined $gene_dna_sequence;

# Step 3: Translate DNA sequence into protein sequence.
# NOTE: the whole gene span is translated as one piece, which only matches
# the real protein for genes without introns or untranslated regions.
my $gene_protein_sequence = translate_dna_to_protein($gene_dna_sequence);

# Print the protein sequence
print "Protein Sequence:\n$gene_protein_sequence\n";

# Translate a DNA string into its protein sequence.
#
# Arguments:
#   $dna_sequence - nucleotide sequence as a plain string
#
# Returns the translation as a string, as produced by Bio::Seq's translate()
# called with its default options.
sub translate_dna_to_protein {
    my ($dna_sequence) = @_;

    # Bio::Seq is constructed directly below but is never loaded explicitly
    # by this script, so load it here before calling its constructor.
    require Bio::Seq;

    my $seq_obj = Bio::Seq->new(-seq => $dna_sequence, -alphabet => 'dna');
    return $seq_obj->translate->seq;
}

Python script to extract a protein sequence from a genome using a General Feature Format (GFF) file !

LEGE — Thu, 01 Feb 2024 01:48:49 -0600

# You typically need the corresponding genome sequence file in FASTA format. The GFF file contains information about the features (such as genes) in the genome, including their locations and annotations.

# Outline of the steps:

# 1. Parse the GFF file to extract the gene locations.
# 2. Use the gene locations to extract the corresponding DNA sequences from the genome FASTA file.
# 3. Translate the DNA sequences into protein sequences.

#Simple example using Python and Biopython


from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

def extract_protein_sequence(gff_file, genome_fasta, gene_id):
    """Return the translated sequence of one gene annotated in a GFF file.

    Args:
        gff_file: Path to a tab-separated GFF file.
        genome_fasta: Path to the genome FASTA file (one or more records).
        gene_id: Value of the ``ID`` attribute of the ``gene`` feature to
            extract.

    Returns:
        The result of calling ``translate()`` on the gene's DNA sequence.
        The whole gene span is translated as one piece, so the result only
        matches the real protein for genes without introns.

    Raises:
        KeyError: If no ``gene`` feature carries the requested ID, or the
            annotated sequence name is absent from a multi-record FASTA.
        ValueError: If the FASTA file contains no records.
    """
    # Step 1: Parse the GFF file, keeping only the requested gene.
    # The requested ID is compared, never reassigned, so the caller's
    # ``gene_id`` is the one that is actually looked up.
    location = None
    with open(gff_file, 'r') as gff:
        for line in gff:
            if line.startswith('#'):
                continue
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 9 or fields[2] != 'gene':
                continue
            # Attributes are "key=value" pairs separated by ';'; ID does not
            # have to be the first one.
            attributes = {}
            for item in fields[8].split(';'):
                key, sep, value = item.partition('=')
                if sep:
                    attributes[key.strip()] = value.strip()
            if attributes.get('ID') == gene_id:
                location = (fields[0], int(fields[3]), int(fields[4]), fields[6])
    if location is None:
        raise KeyError(gene_id)
    seq_id, gene_start, gene_end, strand = location

    # Step 2: Extract DNA sequence from the genome record named in column 1.
    records = list(SeqIO.parse(genome_fasta, 'fasta'))
    if not records:
        raise ValueError(f"No records found in {genome_fasta}")
    genome_record = next((rec for rec in records if rec.id == seq_id), None)
    if genome_record is None:
        # A single-record genome is used as-is even when its name differs
        # from the GFF sequence name.
        if len(records) != 1:
            raise KeyError(f"Sequence '{seq_id}' not found in {genome_fasta}")
        genome_record = records[0]

    # GFF coordinates are 1-based and inclusive.
    gene_dna_sequence = genome_record.seq[gene_start - 1:gene_end]
    if strand == '-':
        gene_dna_sequence = gene_dna_sequence.reverse_complement()

    # Step 3: Translate DNA sequence into protein sequence
    return gene_dna_sequence.translate()

# Example usage: runs only when the file is executed as a script, so that
# importing it does not trigger file access on the placeholder paths.
if __name__ == '__main__':
    gff_file = 'path/to/your/file.gff'
    genome_fasta = 'path/to/your/genome.fasta'
    gene_id_to_extract = 'your_gene_id'

    protein_sequence = extract_protein_sequence(gff_file, genome_fasta, gene_id_to_extract)
    print(protein_sequence)

Bash script to calculate the difference between two columns!

LEGE — Thu, 01 Feb 2024 01:28:44 -0600

Space Separated

# Store (column 2 - column 4) in column 5 of every line, then print the line.
awk '{ $5 = $2 - $4; print }' inputput.txt > outdiff.txt

Or with tab separation:

# Set both the input (FS) and output (OFS) separators to a tab so that empty
# fields and fields containing spaces keep their column positions.
awk 'BEGIN { FS = OFS = "\t" } { $5 = $2 - $4 } 1' inputput.txt > outdiff.txt

Bash script to convert Multi-line Fasta to Single-line Fasta !

BioStar — Wed, 31 Jan 2024 00:39:21 -0600

#!/bin/bash

input_filename="multi_line.fasta"
output_filename="single_line.fasta"

# Join the sequence lines of each record onto one line.
# "pending" is set while sequence text has been written without a closing
# newline; the newline is added just before the next header and at the end,
# so the output neither starts with a blank line nor contains empty ones.
if awk '
    /^>/ { if (pending) printf("\n"); print; pending = 0; next }
         { if (length($0)) { printf("%s", $0); pending = 1 } }
    END  { if (pending) printf("\n") }
' "$input_filename" > "$output_filename"; then
    echo "Successfully converted $input_filename to $output_filename in single-line FASTA format."
else
    # Report the failure instead of announcing success unconditionally.
    echo "Error: could not convert $input_filename to $output_filename." >&2
    exit 1
fi

Perl script to convert Multi-line Fasta to Single-line Fasta !

BioStar — Wed, 31 Jan 2024 00:38:21 -0600

#!/usr/bin/perl

use strict;
use warnings;

# Convert a multi-line FASTA file into single-line FASTA: every header line is
# copied as-is and the sequence lines that follow it are joined into one line.
#
# Arguments:
#   $input_filename  - path of the FASTA file to read
#   $output_filename - path of the file to write (overwritten if it exists)
#
# Dies if either file cannot be opened or the output cannot be fully written.
# Prints a confirmation message to STDOUT on success.
sub multi_to_single_line_fasta {
    my ($input_filename, $output_filename) = @_;

    open my $input_file, '<', $input_filename
        or die "Error: Could not open file '$input_filename': $!";
    open my $output_file, '>', $output_filename
        or die "Error: Could not open file '$output_filename': $!";

    my $current_sequence = "";

    while (my $line = <$input_file>) {
        # Strip LF and CRLF line endings alike so that no carriage return
        # ends up embedded in the joined sequence.
        $line =~ s/[\r\n]+\z//;

        if ($line =~ /^>/) {
            # Header line: flush the previous sequence, then write the header.
            # length() is tested rather than truthiness so a sequence equal
            # to "0" is not dropped.
            print {$output_file} "$current_sequence\n" if length $current_sequence;
            print {$output_file} "$line\n";
            $current_sequence = "";
        } else {
            # Sequence line: append to the sequence being collected.
            $current_sequence .= $line;
        }
    }

    # Write the last sequence
    print {$output_file} "$current_sequence\n" if length $current_sequence;

    close $input_file;

    # Buffered write errors only surface when the handle is closed.
    close $output_file
        or die "Error: Could not write file '$output_filename': $!";

    print "Successfully converted $input_filename to $output_filename in single-line FASTA format.\n";
}

# Example usage:
# multi_to_single_line_fasta('multi_line.fasta', 'single_line.fasta');

Python script to convert Multi-line Fasta to Single-line Fasta

BioStar — Wed, 31 Jan 2024 00:37:15 -0600

def multi_to_single_line_fasta(input_filename, output_filename):
    """Rewrite a multi-line FASTA file with each sequence on a single line.

    Header lines (starting with '>') are written stripped of surrounding
    whitespace; the stripped sequence lines that follow are concatenated and
    written as one line. Prints a confirmation on success, or an error message
    if a FileNotFoundError is raised while opening the files.
    """
    try:
        with open(input_filename, 'r') as source, open(output_filename, 'w') as target:
            pieces = []

            def flush_sequence():
                # Emit the collected sequence, if any, and start a new one.
                joined = ''.join(pieces)
                if joined:
                    target.write(joined + '\n')
                pieces.clear()

            for raw_line in source:
                text = raw_line.strip()
                if raw_line.startswith('>'):
                    # New record: finish the previous sequence first.
                    flush_sequence()
                    target.write(text + '\n')
                else:
                    pieces.append(text)

            # The final record has no following header to trigger a flush.
            flush_sequence()
    except FileNotFoundError:
        print(f"Error: File '{input_filename}' not found.")
    else:
        print(f"Successfully converted {input_filename} to {output_filename} in single-line FASTA format.")

# Example usage:
# multi_to_single_line_fasta('multi_line.fasta', 'single_line.fasta')

Perl script to calculate GC content !

BioStar — Tue, 30 Jan 2024 05:20:10 -0600

#!/usr/bin/perl

# Compute the GC content of a nucleotide sequence as a percentage.
#
# Arguments:
#   $sequence - nucleotide sequence string; upper and lower case are both
#               counted
#
# Returns the percentage (0-100) of characters that are G or C, relative to
# the full length of the string. Returns 0 for an empty or undefined
# sequence instead of dying with a division by zero.
sub calculate_gc_content {
    my ($sequence) = @_;

    return 0 unless defined $sequence && length $sequence;

    # tr/// with an empty replacement list counts matching characters
    # without modifying the string.
    my $gc_count = ($sequence =~ tr/GCgc//);

    return ($gc_count / length($sequence)) * 100;
}

# Demonstration: report the GC percentage of a sample sequence,
# formatted to two decimal places.
my $dna_sequence = "ATGCGCTAAAGCGAGCGAAGCGCTAGATCGATCGATCGATCGATCGATCGATCGATCGATCG";
my $gc_content   = calculate_gc_content($dna_sequence);
print sprintf("GC content: %.2f%%\n", $gc_content);

Raku script to find SSRs in a FASTQ file!

LEGE — Sun, 14 Jan 2024 12:05:24 -0600

# Find every substring of $sequence that is immediately followed by an
# identical copy of itself (a tandem repeat unit).
#
# Returns a list of hashes, one per hit, with these keys:
#   start    - 1-based position of the first character of the unit
#   end      - 1-based position of the last character of the unit
#   length   - number of characters in the unit
#   sequence - the unit itself
# Hits are ordered by ascending start, then descending end.
sub find-ssrs(Str $sequence) {
    my @ssrs;
    my $total = $sequence.chars;

    for 1..$total -> $start {
        # A unit can only be followed by a full copy of itself if twice its
        # length still fits between $start and the end of the sequence.
        my $last-end = $start - 1 + ($total - $start + 1) div 2;

        for ($start..$last-end).reverse -> $end {
            my $unit-length = $end - $start + 1;
            my $unit = $sequence.substr($start - 1, $unit-length);

            # Compare the text right after the unit directly; a "not found"
            # result from index() would compare equal to 0 and produce
            # false hits.
            if $sequence.substr($end, $unit-length) eq $unit {
                push @ssrs, {
                    start    => $start,
                    end      => $end,
                    length   => $unit-length,
                    sequence => $unit
                };
            }
        }
    }

    return @ssrs;
}

# Read a FASTQ file record by record (four lines each) and print the hits
# that find-ssrs reports for every sequence line.
#
# $filename is the path of the FASTQ file. Nothing is returned; results are
# written to standard output. A truncated final record is ignored.
sub process-fastq-file(Str $filename) {
    my $fh = open $filename, :r;

    my $records-read = 0;

    # get() returns the next line without its line ending and an undefined
    # value at end of file; testing definedness keeps an empty line from
    # ending the loop early.
    while (my $header = $fh.get).defined {
        my $sequence = $fh.get;
        last unless $sequence.defined;   # file ended in the middle of a record

        # Skipping the next two lines (separator and quality lines)
        $fh.get;
        $fh.get;

        # The sequence is the second line of each four-line record.
        my $line-number = $records-read * 4 + 2;
        $records-read++;

        my @ssrs = find-ssrs($sequence);

        if @ssrs {
            say "SSRs found in sequence at line $line-number:";
            for @ssrs -> $ssr {
                say "  Start: $ssr<start>, End: $ssr<end>, Length: $ssr<length>, Sequence: $ssr<sequence>";
            }
        }
    }

    $fh.close;
}

# Entry point: set $fastq-path to the location of your FASTQ file.
my $fastq-path = 'your_fastq_file.fastq';
process-fastq-file($fastq-path);

Raku script to calculate GC content !

LEGE — Sun, 14 Jan 2024 11:56:51 -0600

# Compute the GC content of a nucleotide sequence as a percentage.
#
# Counts G and C in either case and divides by the total number of
# characters in $sequence. Returns 0 for an empty string, which would
# otherwise produce a zero-denominator rational.
sub calculate-gc-content(Str $sequence) {
    my $total-bases = $sequence.chars;
    return 0 if $total-bases == 0;

    my $gc-count = $sequence.comb(/<[GCgc]>/).elems;

    return $gc-count / $total-bases * 100;
}

# Demonstration: print a sample sequence followed by its GC percentage.
my $dna_sequence = "ATGCGCTAAAGCGCGCGCCTTACGCGCGCGCGC";
my $gc_content = calculate-gc-content($dna_sequence);

.say for "DNA Sequence: $dna_sequence", "GC Content: $gc_content%";