BOL: All

Python script to compute basic statistics of an assembled genome!

LEGE — Thu, 01 Feb 2024 02:20:54 -0600

from Bio import SeqIO
import statistics

# Path to the genome assembly in FASTA format; edit to point at your file.
input_file = 'genome_assembly.fasta'

# One length per FASTA record, sorted largest first, which is also the
# order in which the distribution listing at the end of the script is printed.
contig_lengths = sorted(
    (len(record.seq) for record in SeqIO.parse(input_file, 'fasta')),
    reverse=True,
)
total_length = sum(contig_lengths)
num_contigs = len(contig_lengths)

# A file without any FASTA record yields an empty list; min(), max() and
# statistics.mean() all raise on empty input, so stop here with a readable
# message instead of a traceback.
if not contig_lengths:
    raise SystemExit(f"No sequences found in {input_file}")

# The list is sorted in descending order, so the extremes sit at the ends.
min_contig_length = contig_lengths[-1]
max_contig_length = contig_lengths[0]
avg_contig_length = statistics.mean(contig_lengths)
median_contig_length = statistics.median(contig_lengths)

# Calculate N50
def calculate_n50(lengths):
    """Return the N50 of a collection of contig lengths.

    The N50 is the length of the contig at which the running total of
    lengths, taken from the longest contig downwards, first reaches half of
    the summed length of all contigs.

    Args:
        lengths: iterable of contig lengths (numbers), in any order.

    Returns:
        The N50 length, or 0 when ``lengths`` is empty.
    """
    # Sort locally so the result does not depend on the caller's ordering.
    ordered = sorted(lengths, reverse=True)
    half_size = sum(ordered) / 2
    cumulative_size = 0
    for length in ordered:
        cumulative_size += length
        if cumulative_size >= half_size:
            return length
    # Only reached for empty input.
    return 0

# Calculate GC content
def calculate_gc_content(file):
    """Return the GC content of a FASTA file as a percentage.

    Header lines (starting with '>') are skipped. G and C are counted
    case-insensitively, so soft-masked (lower-case) sequence is included.
    Every other non-whitespace character on a sequence line counts towards
    the total number of bases.

    Args:
        file: path of the FASTA file to read.

    Returns:
        The GC percentage rounded to two decimals, or 0.0 when the file
        contains no sequence characters.
    """
    gc_count = 0
    total_bases = 0

    with open(file, 'r') as fh:
        for line in fh:
            if line.startswith('>'):
                continue  # Skip header lines
            bases = line.strip().upper()
            gc_count += bases.count('G') + bases.count('C')
            total_bases += len(bases)

    # Avoid ZeroDivisionError for empty or header-only files.
    if total_bases == 0:
        return 0.0

    return round((gc_count / total_bases) * 100, 2)

# Report: summary figures first, then one contig length per line.
print("Genome Assembly Statistics:")
print("---------------------------")
print("Total Length:", total_length)
print("Number of Contigs:", num_contigs)
print("Minimum Contig Length:", min_contig_length)
print("Maximum Contig Length:", max_contig_length)
print("Average Contig Length:", avg_contig_length)
print("Median Contig Length:", median_contig_length)
print("N50:", calculate_n50(contig_lengths))
print("GC Content: " + str(calculate_gc_content(input_file)) + "%")
print("\nContig Length Distribution:")
print("---------------------------")

# Lengths are listed in the order held by contig_lengths.
for contig_length in contig_lengths:
    print(contig_length)

Perl script to calculate basic statistics of an assembled genome!

LEGE — Thu, 01 Feb 2024 02:19:05 -0600

#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;

# Input file containing the genome assembly in FASTA format
my $input_file = 'genome_assembly.fasta';

# Create Bio::SeqIO object to read the FASTA file
my $seqio = Bio::SeqIO->new(-file => $input_file, -format => 'fasta');

# Collect the length of every sequence in the assembly
my $total_length = 0;
my @contig_lengths;
while (my $seq = $seqio->next_seq) {
    my $length = $seq->length;
    $total_length += $length;
    push @contig_lengths, $length;
}
my $num_contigs = scalar @contig_lengths;

# Without any sequence the average below would divide by zero and the
# min/max lookups would be undefined, so stop with a clear message.
die "No sequences found in $input_file\n" unless $num_contigs;

# Sort contig lengths in descending order
@contig_lengths = sort { $b <=> $a } @contig_lengths;

# With a descending list the maximum is first and the minimum is last
my $min_contig_length = $contig_lengths[-1];
my $max_contig_length = $contig_lengths[0];
my $avg_contig_length = $total_length / $num_contigs;
my $median_contig_length = calculate_median(\@contig_lengths);

# Calculate N50
my $n50 = calculate_n50(\@contig_lengths);

# Calculate GC content
my $gc_content = calculate_gc_content($input_file);

# Print the computed statistics and information
print "Genome Assembly Statistics:\n";
print "---------------------------\n";
print "Total Length: $total_length\n";
print "Number of Contigs: $num_contigs\n";
print "Minimum Contig Length: $min_contig_length\n";
print "Maximum Contig Length: $max_contig_length\n";
print "Average Contig Length: $avg_contig_length\n";
print "Median Contig Length: $median_contig_length\n";
print "N50: $n50\n";
print "GC Content: $gc_content%\n";
print "\nContig Length Distribution:\n";
print "---------------------------\n";

# Print contig length distribution, longest contig first
foreach my $length (@contig_lengths) {
    print "$length\n";
}

# N50: walk the lengths in the order given and return the first length at
# which the running total reaches half of the overall total.
# Takes an array reference; the caller in this script passes the lengths
# already sorted in descending order. Returns 0 if the threshold is never
# reached, which happens for an empty list.
sub calculate_n50 {
    my ($lengths_ref) = @_;

    my $grand_total = 0;
    $grand_total += $_ for @{$lengths_ref};

    my $threshold = $grand_total / 2;
    my $running   = 0;
    foreach my $idx (0 .. $#{$lengths_ref}) {
        $running += $lengths_ref->[$idx];
        return $lengths_ref->[$idx] if $running >= $threshold;
    }

    return 0;
}

# Subroutine to calculate GC content
#
# Reads the FASTA file at $file, skips header lines (starting with '>')
# and returns the share of G/C characters (either case) among all
# non-whitespace sequence characters, as a percentage formatted with two
# decimals. Returns "0.00" when the file holds no sequence characters.
# Dies if the file cannot be opened.
sub calculate_gc_content {
    my ($file) = @_;
    my $gc_count    = 0;
    my $total_bases = 0;

    open my $fh, '<', $file or die "Cannot open file: $!";
    while (my $line = <$fh>) {
        next if $line =~ /^>/;    # Skip header lines

        # Remove all whitespace, not just the trailing newline, so a "\r"
        # from Windows line endings is not counted as a base.
        $line =~ s/\s+//g;

        $gc_count    += ($line =~ tr/GCgc//);
        $total_bases += length $line;
    }
    close $fh;

    # Guard against division by zero for empty or header-only files.
    return sprintf("%.2f", 0) unless $total_bases;

    my $gc_content_percentage = ($gc_count / $total_bases) * 100;
    return sprintf("%.2f", $gc_content_percentage);
}

# Subroutine to calculate median
#
# Takes a reference to an array of numbers (in any order) and returns the
# median: the middle value for an odd count, the mean of the two middle
# values for an even count. Returns 0 for an empty array.
sub calculate_median {
    my ($array_ref) = @_;

    my $count = scalar @{$array_ref};
    return 0 unless $count;

    # Sort a copy so the result does not depend on the caller's ordering
    # and the caller's array is left untouched.
    my @sorted = sort { $a <=> $b } @{$array_ref};
    my $mid    = int($count / 2);

    return $sorted[$mid] if $count % 2;
    return ($sorted[$mid - 1] + $sorted[$mid]) / 2;
}

Perl script to parse BLAST results and print basic stats!

LEGE — Thu, 01 Feb 2024 02:11:23 -0600

#!/usr/bin/perl

use strict;
use warnings;
use List::Util qw(sum);

# Usage: ./parse_blast.pl blast_result.txt
#
# Driver: parse the BLAST report named on the command line, print every
# hit, then print summary statistics computed over all hits.

if (!@ARGV) {
    die "Usage: ./parse_blast.pl blast_result.txt\n";
}

my $blast_file    = shift @ARGV;
my @blast_entries = parse_blast($blast_file);

my $entry_total = @blast_entries;
print "Total entries: $entry_total\n";
print "---------------------------\n";

# Label / hash-key pairs, in the order they are printed for each hit
my @hit_report_layout = (
    [ "Query:             ", 'QUERY' ],
    [ "Subject:           ", 'SUBJECT' ],
    [ "Percent Identity:  ", 'PERCENT_IDENTITY' ],
    [ "Alignment Length:  ", 'ALIGNMENT_LENGTH' ],
    [ "E-value:           ", 'EVALUE' ],
    [ "Bit Score:         ", 'BITSCORE' ],
);

for my $hit (@blast_entries) {
    for my $row (@hit_report_layout) {
        my ($label, $key) = @{$row};
        print $label, $hit->{$key}, "\n";
    }
    print "---------------------------\n";
}

# Aggregate figures over all hits
my $avg_percent_identity = calculate_average(\@blast_entries, 'PERCENT_IDENTITY');
my $avg_alignment_length = calculate_average(\@blast_entries, 'ALIGNMENT_LENGTH');
my ($min_evalue, $max_evalue, $avg_evalue) = calculate_summary_stats(\@blast_entries, 'EVALUE');
my ($min_bitscore, $max_bitscore, $avg_bitscore) = calculate_summary_stats(\@blast_entries, 'BITSCORE');

print
    "Average Percent Identity:  $avg_percent_identity\n",
    "Average Alignment Length:  $avg_alignment_length\n",
    "E-value Range:             $min_evalue - $max_evalue\n",
    "Average E-value:           $avg_evalue\n",
    "Bit Score Range:           $min_bitscore - $max_bitscore\n",
    "Average Bit Score:         $avg_bitscore\n";

# Parse a tab-separated BLAST report into a list of hash references.
#
# Each returned hash has the keys QUERY, SUBJECT, PERCENT_IDENTITY,
# ALIGNMENT_LENGTH, EVALUE and BITSCORE. Two layouts are accepted:
#   * 12 or more columns: treated as the standard BLAST tabular layout
#     (qseqid sseqid pident length mismatch gapopen qstart qend sstart
#     send evalue bitscore), so EVALUE and BITSCORE come from columns 11
#     and 12.
#   * fewer columns: the fields are mapped to the six keys in order; keys
#     without a matching field are left undefined.
# Blank lines and '#' comment lines are skipped.
# Dies if the file cannot be opened.
sub parse_blast {
    my ($file) = @_;

    open my $fh, '<', $file or die "Cannot open file $file: $!\n";

    my @entries;

    LINE:
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;              # tolerate Windows line endings
        next LINE if $line =~ /^\s*$/;     # skip empty lines
        next LINE if $line =~ /^\s*#/;     # skip comment/header lines

        my @fields = split /\t/, $line;

        # In the standard 12-column layout columns 5-10 hold mismatch,
        # gap and coordinate data; taking the first six fields would store
        # mismatch/gapopen as EVALUE/BITSCORE.
        my @wanted = @fields >= 12 ? @fields[0 .. 3, 10, 11] : @fields;

        my %entry;
        @entry{qw/QUERY SUBJECT PERCENT_IDENTITY ALIGNMENT_LENGTH EVALUE BITSCORE/} = @wanted;

        push @entries, \%entry;
    }

    close $fh;

    return @entries;
}

# Arithmetic mean of one field across a list of entries.
# $entries is a reference to an array of hash references, $field the hash
# key to average. Returns 0 when there are no entries.
sub calculate_average {
    my ($entries, $field) = @_;

    my $entry_count = scalar @{$entries};
    return 0 if $entry_count == 0;

    my $total = sum(map { $_->{$field} } @{$entries});
    return $total / $entry_count;
}

# Minimum, maximum and mean of one field across a list of entries.
# $entries is a reference to an array of hash references, $field the hash
# key to summarise. Values are compared numerically; the minimum and
# maximum are returned exactly as stored (e.g. "1e-50" stays "1e-50").
# Returns (0, 0, 0) when there are no entries.
sub calculate_summary_stats {
    my ($entries, $field) = @_;
    my @values = map { $_->{$field} } @$entries;

    return (0, 0, 0) unless @values;

    # One pass over the values instead of sorting the list twice.
    my ($min, $max) = ($values[0], $values[0]);
    for my $value (@values[1 .. $#values]) {
        $min = $value if $value < $min;
        $max = $value if $value > $max;
    }

    my $avg = sum(@values) / @values;

    return ($min, $max, $avg);
}

Perl script to parse VCF file !

LEGE — Thu, 01 Feb 2024 02:08:11 -0600

#!/usr/bin/perl

use strict;
use warnings;

# Usage: ./parse_vcf.pl input.vcf
#
# Driver: parse the VCF file named on the command line, then print the
# number of records and a per-chromosome record count.

@ARGV or die "Usage: ./parse_vcf.pl input.vcf\n";

my $vcf_file    = shift @ARGV;
my @vcf_entries = parse_vcf($vcf_file);

my $entry_total = @vcf_entries;
print "Total entries: $entry_total\n";
print "---------------------------\n";

# Tally records by the value of their CHROM field
my %chromosome_counts;
$chromosome_counts{ $_->{CHROM} }++ for @vcf_entries;

print "Chromosome counts:\n";
foreach my $name (sort keys %chromosome_counts) {
    print "  $name: $chromosome_counts{$name}\n";
}

# Parse a VCF file into a list of hash references, one per data line.
#
# Each hash has the keys CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
# SAMPLES, filled from the tab-separated columns in that order; keys
# without a matching column are left undefined. SAMPLES holds everything
# after the FORMAT column, so for multi-sample files it contains all
# sample columns joined by tabs rather than only the first one.
# Blank lines and lines starting with '#' are skipped.
# Dies if the file cannot be opened.
sub parse_vcf {
    my ($file) = @_;

    open my $fh, '<', $file or die "Cannot open file $file: $!\n";

    my @entries;

    while (my $line = <$fh>) {
        next if $line =~ /^\s*$/;    # skip empty lines
        next if $line =~ /^\s*#/;    # skip meta-information and header lines

        $line =~ s/\r?\n\z//;        # tolerate Windows line endings

        # Limit the split to 10 pieces so the tenth piece keeps every
        # remaining sample column instead of discarding them.
        my @fields = split /\t/, $line, 10;

        my %entry;
        @entry{qw/CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLES/} = @fields;

        push @entries, \%entry;
    }

    close $fh;

    return @entries;
}

Perl script to find overlaps between two bed files !

LEGE — Thu, 01 Feb 2024 02:04:04 -0600

#!/usr/bin/perl

use strict;
use warnings;

# Report every pair of overlapping intervals between two BED files.
# For each interval of file 1 (in file order), the overlapping intervals of
# file 2 on the same chromosome are printed in file-2 order.

# Check if the correct number of arguments are provided
if (@ARGV != 2) {
    die "Usage: $0 file1.bed file2.bed\n";
}

my $file1 = shift @ARGV;
my $file2 = shift @ARGV;

open my $fh1, '<', $file1 or die "Error opening $file1: $!";
open my $fh2, '<', $file2 or die "Error opening $file2: $!";

# Lines that carry no interval: '#' comments and the 'track' / 'browser'
# header lines a BED file may start with.
my $header_line = qr/^(?:#|track\b|browser\b)/;

# Read file 2 once and index its intervals by chromosome, so it does not
# have to be re-read from disk for every line of file 1.
my %intervals_for;
while (my $line2 = <$fh2>) {
    $line2 =~ s/\r?\n\z//;
    next if $line2 =~ /^\s*\z/ || $line2 =~ $header_line;

    my ($chr2, $start2, $end2) = split /\t/, $line2;
    next unless defined $end2;    # fewer than three columns

    push @{ $intervals_for{$chr2} }, [ $start2, $end2, $line2 ];
}
close $fh2;

# Iterate over each interval in the first BED file
while (my $line1 = <$fh1>) {
    $line1 =~ s/\r?\n\z//;
    next if $line1 =~ /^\s*\z/ || $line1 =~ $header_line;

    my ($chr1, $start1, $end1) = split /\t/, $line1;
    next unless defined $end1;    # fewer than three columns

    # Only intervals on the same chromosome can overlap
    for my $interval (@{ $intervals_for{$chr1} || [] }) {
        my ($start2, $end2, $line2) = @{$interval};

        # Two intervals overlap when each starts before the other ends
        if ($start1 < $end2 && $end1 > $start2) {
            print "Overlap found:\n";
            print "File 1: $line1\n";
            print "File 2: $line2\n\n";
        }
    }
}
close $fh1;

print "Comparison completed.\n";

Raku script to find overlaps between two bed files !

LEGE — Thu, 01 Feb 2024 02:02:46 -0600

#!/usr/bin/env raku

# Report every pair of overlapping intervals between two BED files.

# Check if the correct number of arguments are provided
if @*ARGS.elems != 2 {
    say "Usage: ./compare_bed_files.raku file1.bed file2.bed";
    exit 1;
}

# Read the contents of the two BED files
my @bed1 = slurp(@*ARGS[0]).lines;
my @bed2 = slurp(@*ARGS[1]).lines;

# Iterate over each interval in the first BED file.
# Raku binds the loop variable with a pointy block (for @list -> $item);
# the Perl 5 form "for my $item (@list)" does not compile.
for @bed1 -> $line1 {
    my @fields1 = $line1.split("\t");

    # Skip comment lines and lines without chrom/start/end columns
    next if $line1.starts-with('#') || @fields1.elems < 3;

    my ($chr1, $start1, $end1) = @fields1[0, 1, 2];

    # Check for overlaps with intervals in the second BED file
    for @bed2 -> $line2 {
        my @fields2 = $line2.split("\t");
        next if $line2.starts-with('#') || @fields2.elems < 3;

        my ($chr2, $start2, $end2) = @fields2[0, 1, 2];

        # Same chromosome, and each interval starts before the other ends
        if $chr1 eq $chr2 && $start1 < $end2 && $end1 > $start2 {
            say "Overlap found:";
            say "File 1: $line1";
            say "File 2: $line2";
            say "";
        }
    }
}

say "Comparison completed.";

Raku script to find microsatellites in DNA fragments !

LEGE — Thu, 01 Feb 2024 02:00:27 -0600

# Find repeat units that occur as tandem repeats in a sequence.
#
# For every unit length from $min-repeat-length to $max-repeat-length,
# each substring of that length is taken as a candidate unit and kept when
# the sequence contains that unit repeated $min-repeat-count times in a
# row. Returns the distinct units in the order they were first found.
sub find-microsatellites($sequence, $min-repeat-length = 2, $max-repeat-length = 6, $min-repeat-count = 3) {
    my @microsatellites;

    # Raku binds loop variables with a pointy block; the Perl 5 form
    # "for my $x (...)" does not compile.
    for $min-repeat-length .. $max-repeat-length -> $repeat-length {
        # Start positions that leave room for a full tandem repeat; the
        # range is empty when the sequence is too short.
        for ^($sequence.chars - $repeat-length * $min-repeat-count + 1) -> $i {
            my $substring = $sequence.substr($i, $repeat-length);

            if $sequence.contains($substring x $min-repeat-count) {
                @microsatellites.push($substring);
            }
        }
    }

    return @microsatellites.unique;
}

# Demo run on a short sequence made of a repeated four-base unit
my $genome-sequence = 'ATCGATCGATCGATCGATCG';
my @result = find-microsatellites $genome-sequence;

say "Microsatellites found: " ~ @result.gist;

Python script to find repeats in the DNA sequence !

LEGE — Thu, 01 Feb 2024 01:57:25 -0600

def find_repeats(sequence, min_repeat_length=3):
    """Return the substrings of a fixed length that occur more than once.

    Occurrences may overlap, so "AAA" is reported for "AAAA" (it starts at
    positions 0 and 1). ``str.count`` is not used because it only counts
    non-overlapping matches.

    Args:
        sequence: the string to scan.
        min_repeat_length: length of the substrings to examine.

    Returns:
        A list of the repeated substrings, ordered by first occurrence.
    """
    occurrences = {}

    # Count every window once instead of rescanning the sequence per window.
    for i in range(len(sequence) - min_repeat_length + 1):
        substring = sequence[i:i + min_repeat_length]
        occurrences[substring] = occurrences.get(substring, 0) + 1

    # Dicts keep insertion order, so this preserves first-occurrence order.
    return [substring for substring, count in occurrences.items() if count > 1]

# Demo run on a short sequence made of a repeated four-base unit
genome_sequence = 'ATCGATCGATCGATCG'
result = find_repeats(sequence=genome_sequence)

print(f"Repeats found: {result}")

Raku script to find repeats in sequences !

LEGE — Thu, 01 Feb 2024 01:56:36 -0600

# Return the substrings of length $min-repeat-length that occur more than
# once in $sequence (overlapping occurrences count), ordered by first
# occurrence.
sub find-repeats($sequence, $min-repeat-length = 3) {
    my @repeats;
    my %seen;

    for ^($sequence.chars - $min-repeat-length + 1) -> $i {
        my $substring = $sequence.substr($i, $min-repeat-length);

        # Examine each distinct substring only once
        next if %seen{$substring}++;

        # .contains only answers True/False, so it cannot be compared with
        # 1; count the actual start positions instead.
        if $sequence.indices($substring, :overlap).elems > 1 {
            @repeats.push($substring);
        }
    }

    return @repeats;
}

# Demo run on a short sequence made of a repeated four-base unit
my $genome-sequence = 'ATCGATCGATCGATCG';
my @result = find-repeats $genome-sequence;

say "Repeats found: " ~ @result.gist;

Python script for six-frame translation of sequences!

LEGE — Thu, 01 Feb 2024 01:54:14 -0600

from Bio import SeqIO
from Bio.Seq import Seq

def translate_frame(sequence, frame):
    """Translate one reading frame of a nucleotide sequence.

    Args:
        sequence: object supporting slicing, ``len()``,
            ``reverse_complement()`` and ``translate()``; the caller in
            this script passes ``record.seq``.
        frame: 1, 2 or 3 for the forward strand; -1, -2 or -3 for the
            reverse-complement strand. 4, 5 and 6 are accepted as aliases
            for -1, -2 and -3.

    Returns:
        The result of ``translate()`` on the selected frame.

    Raises:
        ValueError: if ``frame`` is not one of the values listed above.
    """
    if frame in (1, 2, 3):
        template = sequence
        offset = frame - 1
    elif frame in (-1, -2, -3):
        template = sequence.reverse_complement()
        offset = -frame - 1
    elif frame in (4, 5, 6):
        # 1..6 numbering: frames 4-6 are reverse-strand frames 1-3.
        template = sequence.reverse_complement()
        offset = frame - 4
    else:
        raise ValueError(f"frame must be in 1..6 or -1..-3, got {frame!r}")

    reading_frame = template[offset:]
    # Drop a trailing partial codon so only whole codons are translated.
    usable_length = len(reading_frame) - len(reading_frame) % 3
    return reading_frame[:usable_length].translate()

def six_frame_translation(fasta_file):
    """Print the six-frame translation of every record in a FASTA file.

    For each record the sequence ID is printed, followed by the translation
    of forward frames 1-3 and reverse-complement frames 1-3.

    Args:
        fasta_file: path of the nucleotide FASTA file to read.
    """
    # Iterate lazily so the whole file is never held in memory at once.
    for record in SeqIO.parse(fasta_file, "fasta"):
        print(f"Sequence ID: {record.id}")
        # Negative numbers select the reverse strand in translate_frame();
        # a plain range(1, 7) would never produce a reverse frame.
        for frame in (1, 2, 3, -1, -2, -3):
            protein_sequence = translate_frame(record.seq, frame)
            frame_type = "Forward" if frame > 0 else "Reverse"
            print(f"Frame {frame_type} {abs(frame)} Translation:\n{protein_sequence}\n")

# Point this at your nucleotide FASTA file before running the script.
input_fasta = "path/to/your/input.fasta"
six_frame_translation(fasta_file=input_fasta)