BOL: Owner

Raku script to find microsatellites in DNA fragments !

LEGE — Thu, 01 Feb 2024 02:00:27 -0600

# Find microsatellite motifs in a DNA sequence.
#
# A motif is reported when it occurs at least $min-repeat-count times back to
# back somewhere in $sequence.
#
# Parameters:
#   $sequence           - the sequence to scan
#   $min-repeat-length  - shortest motif length to try (default 2)
#   $max-repeat-length  - longest motif length to try (default 6)
#   $min-repeat-count   - minimum number of consecutive copies (default 3)
#
# Returns the distinct motifs in the order they were first found.
sub find-microsatellites($sequence, $min-repeat-length = 2, $max-repeat-length = 6, $min-repeat-count = 3) {
    my @microsatellites;

    # A zero-length motif or a zero repeat count would match everywhere.
    return @microsatellites if $min-repeat-length < 1 || $min-repeat-count < 1;

    # Raku binds loop variables with a pointy block; "for my $x (...)" is Perl.
    for $min-repeat-length..$max-repeat-length -> $repeat-length {
        # Only start positions that leave room for a full tandem run are tried.
        for ^($sequence.chars - $repeat-length * $min-repeat-count + 1) -> $i {
            my $motif = $sequence.substr($i, $repeat-length);

            if $sequence.contains($motif x $min-repeat-count) {
                @microsatellites.push($motif);
            }
        }
    }

    return @microsatellites.unique;
}

# Demo: scan a short sequence and print the motifs that were found
my $genome-sequence = "ATCGATCGATCGATCGATCG";
my @result = find-microsatellites $genome-sequence;

say "Microsatellites found: ", @result;

Python script to find repeats in the DNA sequence !

LEGE — Thu, 01 Feb 2024 01:57:25 -0600

def find_repeats(sequence, min_repeat_length=3):
    """Return every substring of a fixed length that occurs more than once.

    Occurrences are counted with a sliding window, so overlapping copies
    count too (e.g. "AAA" occurs twice in "AAAA").

    Args:
        sequence: The sequence to scan.
        min_repeat_length: Length of the substrings to compare (default 3).

    Returns:
        A list of the repeated substrings, ordered by first occurrence.

    Raises:
        ValueError: If ``min_repeat_length`` is smaller than 1.
    """
    if min_repeat_length < 1:
        raise ValueError("min_repeat_length must be at least 1")

    # One pass over all windows; dicts keep insertion order, so the result
    # stays ordered by first occurrence.
    occurrences = {}
    for i in range(len(sequence) - min_repeat_length + 1):
        substring = sequence[i:i + min_repeat_length]
        occurrences[substring] = occurrences.get(substring, 0) + 1

    return [substring for substring, count in occurrences.items() if count > 1]

# Demo: report the repeated 3-mers of a short sequence
genome_sequence = "ATCGATCGATCGATCG"

print("Repeats found:", find_repeats(genome_sequence))

Raku script to find repeats in sequences !

LEGE — Thu, 01 Feb 2024 01:56:36 -0600

# Find every substring of length $min-repeat-length that occurs more than
# once in $sequence.
#
# Occurrences are counted with a sliding window, so overlapping copies count.
# Returns the repeated substrings ordered by first occurrence.
sub find-repeats($sequence, $min-repeat-length = 3) {
    my @repeats;

    # A zero-length window would match at every position.
    return @repeats if $min-repeat-length < 1;

    # .contains only answers yes/no, so occurrences are counted explicitly.
    my %occurrences;
    my @first-seen;
    for ^($sequence.chars - $min-repeat-length + 1) -> $i {
        my $substring = $sequence.substr($i, $min-repeat-length);
        @first-seen.push($substring) unless %occurrences{$substring}++;
    }

    @repeats = @first-seen.grep({ %occurrences{$_} > 1 });
    return @repeats;
}

# Demo: report the repeated 3-mers of a short sequence
my $genome-sequence = "ATCGATCGATCGATCG";
my @result = find-repeats $genome-sequence;

say "Repeats found: ", @result;

Python script for six frame translation of sequences !

LEGE — Thu, 01 Feb 2024 01:54:14 -0600

from Bio import SeqIO
from Bio.Seq import Seq

def translate_frame(sequence, frame):
    """Translate a nucleotide sequence in one reading frame.

    Args:
        sequence: A sequence object supporting slicing, ``len()``,
            ``reverse_complement()`` and ``translate()``.
        frame: Non-zero reading frame. Positive values read the given strand
            starting at base ``frame``; negative values read the reverse
            complement starting at base ``abs(frame)``.

    Returns:
        Whatever ``translate()`` returns for the in-frame part of the
        sequence.

    Raises:
        ValueError: If ``frame`` is 0, which is not a reading frame.
    """
    if frame == 0:
        raise ValueError("frame must be non-zero (1..3 forward, -1..-3 reverse)")

    strand = sequence if frame > 0 else sequence.reverse_complement()
    in_frame = strand[abs(frame) - 1:]

    # Drop a trailing partial codon so only whole codons are translated.
    usable_length = len(in_frame) - len(in_frame) % 3
    return in_frame[:usable_length].translate()

def six_frame_translation(fasta_file):
    """Print the six-frame translation of every record in a FASTA file.

    For each record the sequence ID is printed, followed by the translation
    of forward frames 1-3 and reverse-complement frames 1-3.

    Args:
        fasta_file: Path (or handle) of the FASTA input passed to
            ``SeqIO.parse``.
    """
    # Negative numbers select the reverse strand in translate_frame; a plain
    # range(1, 7) would never reach it.
    frames = (1, 2, 3, -1, -2, -3)

    # Records are streamed, so the whole file is never held in memory.
    for record in SeqIO.parse(fasta_file, "fasta"):
        print(f"Sequence ID: {record.id}")
        for frame in frames:
            protein_sequence = translate_frame(record.seq, frame)
            frame_type = "Forward" if frame > 0 else "Reverse"
            print(f"Frame {frame_type} {abs(frame)} Translation:\n{protein_sequence}\n")

# Point this at your own nucleotide FASTA file before running
# (replace 'path/to/your/input.fasta' with the real path).
input_fasta = 'path/to/your/input.fasta'

six_frame_translation(fasta_file=input_fasta)

Perl script for six frame translation !

LEGE — Thu, 01 Feb 2024 01:52:50 -0600

#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;

# Path to your input nucleotide sequence file in FASTA format
my $input_fasta = 'path/to/your/input.fasta';

# Step 1: Open the input FASTA file
my $seqio = Bio::SeqIO->new(-file => $input_fasta, -format => 'fasta');

# Step 2: Perform six-frame translation for every record in the file.
# Positive frames read the forward strand, negative frames the reverse
# complement.
my @frames = (1, 2, 3, -1, -2, -3);
my $records_seen = 0;
while (my $sequence = $seqio->next_seq) {
    $records_seen++;
    print "Sequence ID: ", $sequence->display_id, "\n";

    foreach my $frame (@frames) {
        my $translated_seq = translate_frame($sequence, $frame);
        my $frame_type = $frame > 0 ? "Forward" : "Reverse";
        print "Frame $frame_type $frame Translation:\n$translated_seq\n";
    }
}

# next_seq yields nothing for an empty file; say so instead of printing nothing.
die "No sequences found in $input_fasta\n" unless $records_seen;

# Translate a sequence object in one of the six reading frames.
#
# Arguments:
#   $sequence - object providing revcom() and translate(-frame => N)
#   $frame    - 1, 2, 3 for the forward strand; -1, -2, -3 for the reverse
#               complement
#
# Returns the translated sequence as a string (the ->seq of the translation).
# Dies if $frame is not one of the six values above.
sub translate_frame {
    my ($sequence, $frame) = @_;

    die "Frame must be one of 1, 2, 3, -1, -2, -3\n"
        unless defined $frame && $frame =~ /\A-?[123]\z/;

    # Negative frames are read on the reverse complement.
    my $strand_seq = $frame > 0 ? $sequence : $sequence->revcom;

    # BioPerl's -frame is a 0-based offset (0, 1, 2), so reading frame N
    # starts at offset N - 1.
    my $translated_seq = $strand_seq->translate(-frame => abs($frame) - 1)->seq;

    return $translated_seq;
}

Perl and BioPerl script to extract protein sequences using GFF file !

LEGE — Thu, 01 Feb 2024 01:51:00 -0600

#!/usr/bin/perl
use strict;
use warnings;
use Bio::DB::Fasta;
use Bio::SeqIO;

# Paths to your GFF file and genome FASTA file
my $gff_file = 'path/to/your/file.gff';
my $genome_fasta = 'path/to/your/genome.fasta';

# Gene ID to extract
my $gene_id_to_extract = 'your_gene_id';

# Step 1: Parse GFF file to get gene locations.
# Each gene ID maps to [seqid, start, end, strand] so the lookup below does
# not depend on variables that only exist inside the parsing loop.
my %gene_locations;
open my $gff_fh, '<', $gff_file or die "Cannot open GFF file: $!";
LINE:
while (my $line = <$gff_fh>) {
    # Strip the line ending; otherwise an ID that is the last attribute
    # would keep its trailing newline.
    $line =~ s/\r?\n\z//;
    next LINE if $line =~ /^#/;  # Skip comments

    my @fields = split /\t/, $line;
    next LINE unless @fields >= 9;             # Skip blank or malformed lines
    next LINE unless $fields[2] eq 'gene';     # Consider only gene features

    my ($gene_id) = $fields[8] =~ /(?:\A|;)\s*ID=([^;]+)/;
    next LINE unless defined $gene_id;

    $gene_locations{$gene_id} = [ @fields[0, 3, 4, 6] ];
}
close $gff_fh;

# Step 2: Extract DNA sequence from genome
my $location = $gene_locations{$gene_id_to_extract};
die "Gene '$gene_id_to_extract' not found in $gff_file\n"
    unless defined $location;
my ($seq_id, $start, $end, $strand) = @{$location};

my $db = Bio::DB::Fasta->new($genome_fasta);

# Bio::DB::Fasta returns the reverse complement when start > stop, which is
# what a gene annotated on the minus strand needs.
($start, $end) = ($end, $start) if $strand eq '-';
my $gene_dna_sequence = $db->seq($seq_id, $start, $end);
die "Sequence '$seq_id' not found in $genome_fasta\n"
    unless defined $gene_dna_sequence;

# Step 3: Translate DNA sequence into protein sequence
my $gene_protein_sequence = translate_dna_to_protein($gene_dna_sequence);

# Print the protein sequence
print "Protein Sequence:\n$gene_protein_sequence\n";

# Translate a DNA string into a protein string.
#
# Arguments:
#   $dna_sequence - nucleotide sequence as a plain string
#
# Returns the translated sequence as a plain string.
sub translate_dna_to_protein {
    my ($dna_sequence) = @_;

    # The script never loads Bio::Seq itself, so load it here rather than
    # rely on another module pulling it in.
    require Bio::Seq;

    my $seq_obj = Bio::Seq->new(-seq => $dna_sequence, -alphabet => 'dna');
    return $seq_obj->translate->seq;
}

Python script to extract a protein sequence from a genome using a General Feature Format (GFF) file !

LEGE — Thu, 01 Feb 2024 01:48:49 -0600

# You typically need the corresponding genome sequence file in FASTA format. The GFF file contains information about the features (such as genes) in the genome, including their locations and annotations.

#The outline of the steps :

#Parse the GFF file to extract information about the gene locations.
#Use the gene locations to extract the corresponding DNA sequences from the genome in FASTA format.
#Translate the DNA sequences into protein sequences.

#Simple example using Python and Biopython


from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

def extract_protein_sequence(gff_file, genome_fasta, gene_id):
    """Translate the genomic span of one gene described in a GFF file.

    Args:
        gff_file: Path to the GFF annotation file.
        genome_fasta: Path to the genome in FASTA format.
        gene_id: Value of the ``ID`` attribute of the wanted ``gene`` feature.

    Returns:
        The translation of the gene's genomic span (reverse-complemented
        first when the feature is on the ``-`` strand).

    Raises:
        KeyError: If no ``gene`` feature with this ID is in the GFF file.
        ValueError: If the sequence the gene lies on cannot be identified in
            the FASTA file.
    """
    # Step 1: Parse the GFF file.
    # The loop uses its own variable for parsed IDs so the requested gene_id
    # is not overwritten.
    gene_locations = {}
    with open(gff_file, 'r') as gff:
        for line in gff:
            if line.startswith('#'):
                continue
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 9 or fields[2] != 'gene':
                continue
            attributes = dict(
                item.strip().split('=', 1)
                for item in fields[8].split(';')
                if '=' in item
            )
            feature_id = attributes.get('ID')
            if feature_id is None:
                continue
            gene_locations[feature_id] = (
                fields[0], int(fields[3]), int(fields[4]), fields[6]
            )

    if gene_id not in gene_locations:
        raise KeyError(f"Gene ID {gene_id!r} not found in {gff_file}")
    seqid, gene_start, gene_end, strand = gene_locations[gene_id]

    # Step 2: Extract DNA sequence from the genome.
    # Prefer the record named in the GFF seqid column; a FASTA file with a
    # single record is still accepted when the names differ.
    genome_record = None
    last_record = None
    record_count = 0
    for record in SeqIO.parse(genome_fasta, 'fasta'):
        record_count += 1
        last_record = record
        if record.id == seqid:
            genome_record = record
            break
    if genome_record is None:
        if record_count != 1:
            raise ValueError(
                f"Sequence {seqid!r} not found in {genome_fasta}"
            )
        genome_record = last_record

    # GFF coordinates are 1-based and inclusive.
    gene_dna_sequence = genome_record.seq[gene_start - 1:gene_end]
    if strand == '-':
        gene_dna_sequence = gene_dna_sequence.reverse_complement()

    # Step 3: Translate DNA sequence into protein sequence
    gene_protein_sequence = gene_dna_sequence.translate()

    return gene_protein_sequence

# Demo: fill in your own paths and gene ID before running
gff_file, genome_fasta, gene_id_to_extract = (
    'path/to/your/file.gff',
    'path/to/your/genome.fasta',
    'your_gene_id',
)

protein_sequence = extract_protein_sequence(gff_file, genome_fasta, gene_id_to_extract)
print(protein_sequence)

Bash script to calculate the difference between two columns!

LEGE — Thu, 01 Feb 2024 01:28:44 -0600

Space Separated

# Store column 2 minus column 4 in column 5 and print every line
awk '{ $5 = $2 - $4; print }' inputput.txt > outdiff.txt

Or with tab separation:

# Store column 2 minus column 4 in column 5, reading and writing tab-separated
# fields. FS is set as well as OFS so fields containing spaces, and empty
# fields, stay in their own columns.
awk 'BEGIN { FS = OFS = "\t" } { $5 = $2 - $4 } 1' inputput.txt > outdiff.txt

Raku script to find SSRs in fastq file !

LEGE — Sun, 14 Jan 2024 12:05:24 -0600

# Find tandem duplications in a sequence: every stretch that is immediately
# followed by an identical copy of itself.
#
# Returns an array of hashes with these keys:
#   start    - 1-based position of the first base of the unit
#   end      - 1-based position of the last base of the unit
#   length   - unit length
#   sequence - the unit itself
# Results are ordered by start position, longest unit first.
sub find-ssrs(Str $sequence) {
    my @ssrs;
    my $total = $sequence.chars;

    # Start at the first base; starting at 2 would never test a repeat that
    # begins the sequence.
    for 1..$total -> $start {
        # A unit can only be followed by a full copy if it takes up at most
        # half of what is left from $start onwards.
        my $longest-unit = ($total - $start + 1) div 2;

        for (1..$longest-unit).reverse -> $repeat-length {
            my $end    = $start + $repeat-length - 1;
            my $repeat = $sequence.substr($start - 1, $repeat-length);

            # Compare the following stretch directly; testing .index(...) == 0
            # also succeeds when nothing is found, because Nil counts as 0.
            if $sequence.substr($end, $repeat-length) eq $repeat {
                push @ssrs, {
                    start    => $start,
                    end      => $end,
                    length   => $repeat-length,
                    sequence => $repeat
                };
            }
        }
    }

    return @ssrs;
}

# Scan every read of a FASTQ file for SSRs and print what was found.
#
# The file is read as four-line records (header, sequence, comment,
# quality); only the sequence line is analysed. The line number printed is
# the line of the sequence within the file.
sub process-fastq-file(Str $filename) {
    my $record-index = 0;

    # IO.lines closes the file once all lines are consumed; rotor groups the
    # lines into records and :partial keeps a truncated last record.
    for $filename.IO.lines.rotor(4, :partial) -> @record {
        # The sequence is the second line of each record.
        my $line-number = $record-index * 4 + 2;
        $record-index++;

        # A truncated record may have no sequence line.
        next if @record.elems < 2;

        my @ssrs = find-ssrs(@record[1]);
        next unless @ssrs;

        say "SSRs found in sequence at line $line-number:";
        for @ssrs -> $ssr {
            # Print the individual fields, not the whole hash.
            say "  Start: {$ssr<start>}, End: {$ssr<end>}, Length: {$ssr<length>}, Sequence: {$ssr<sequence>}";
        }
    }
}

# Point this at your own FASTQ file (replace 'your_fastq_file.fastq')
my $fastq-path = 'your_fastq_file.fastq';
process-fastq-file($fastq-path);

Raku script to calculate GC content !

LEGE — Sun, 14 Jan 2024 11:56:51 -0600

# Compute the GC content of a sequence as a percentage.
#
# G and C are counted case-insensitively and divided by the total number of
# characters. An empty sequence yields 0 rather than a division by zero.
sub calculate-gc-content(Str $sequence) {
    my $total-bases = $sequence.chars;
    return 0 if $total-bases == 0;

    my $gc-count = $sequence.comb(/<[GCgc]>/).elems;

    return $gc-count / $total-bases * 100;
}

# Demo: print a sequence followed by its GC percentage
my $dna_sequence = "ATGCGCTAAAGCGCGCGCCTTACGCGCGCGCGC";
say "DNA Sequence: $dna_sequence";

my $gc_content = calculate-gc-content $dna_sequence;
say "GC Content: $gc_content%";