BOL: Python script for basic stats of the assembled genome !

BioScripts
LEGE
Python script for basic stats of the assembled genome !

Python script for basic stats of the assembled genome !

By LEGE 765 days ago

from Bio import SeqIO
import statistics

# Input file containing the genome assembly in FASTA format
input_file = 'genome_assembly.fasta'

# Variables for computing statistics
total_length = 0
num_contigs = 0
contig_lengths = []

# Iterate through each sequence in the assembly
for record in SeqIO.parse(input_file, 'fasta'):
    length = len(record.seq)
    total_length += length
    num_contigs += 1
    contig_lengths.append(length)

# Sort contig lengths in descending order
contig_lengths.sort(reverse=True)

# Calculate additional statistics
min_contig_length = min(contig_lengths)
max_contig_length = max(contig_lengths)
avg_contig_length = statistics.mean(contig_lengths)
median_contig_length = statistics.median(contig_lengths)

# Calculate N50
def calculate_n50(lengths):
    total_size = sum(lengths)
    half_size = total_size / 2
    cumulative_size = 0
    for length in lengths:
        cumulative_size += length
        if cumulative_size >= half_size:
            return length

# Calculate GC content
def calculate_gc_content(file):
    gc_count = 0
    total_bases = 0

    with open(file, 'r') as fh:
        for line in fh:
            if line.startswith('>'):
                continue  # Skip header lines
            line = line.strip()
            gc_count += line.count('G') + line.count('C')
            total_bases += len(line)

    gc_content_percentage = (gc_count / total_bases) * 100
    return round(gc_content_percentage, 2)

# Print the computed statistics and information
print("Genome Assembly Statistics:")
print("---------------------------")
print(f"Total Length: {total_length}")
print(f"Number of Contigs: {num_contigs}")
print(f"Minimum Contig Length: {min_contig_length}")
print(f"Maximum Contig Length: {max_contig_length}")
print(f"Average Contig Length: {avg_contig_length}")
print(f"Median Contig Length: {median_contig_length}")
print(f"N50: {calculate_n50(contig_lengths)}")
print(f"GC Content: {calculate_gc_content(input_file)}%")
print("\nContig Length Distribution:")
print("---------------------------")

# Print contig length distribution
for length in contig_lengths:
    print(length)

BOL

LEGE

Our Sponsors

Python script for basic stats of the assembled genome !