from Bio import SeqIO
import statistics
# Input file containing the genome assembly in FASTA format
input_file = 'genome_assembly.fasta'
# Variables for computing statistics
total_length = 0
num_contigs = 0
contig_lengths = []
# Iterate through each sequence in the assembly
for record in SeqIO.parse(input_file, 'fasta'):
length = len(record.seq)
total_length += length
num_contigs += 1
contig_lengths.append(length)
# Sort contig lengths in descending order
contig_lengths.sort(reverse=True)
# Calculate additional statistics
min_contig_length = min(contig_lengths)
max_contig_length = max(contig_lengths)
avg_contig_length = statistics.mean(contig_lengths)
median_contig_length = statistics.median(contig_lengths)
# Calculate N50
def calculate_n50(lengths):
total_size = sum(lengths)
half_size = total_size / 2
cumulative_size = 0
for length in lengths:
cumulative_size += length
if cumulative_size >= half_size:
return length
# Calculate GC content
def calculate_gc_content(file):
gc_count = 0
total_bases = 0
with open(file, 'r') as fh:
for line in fh:
if line.startswith('>'):
continue # Skip header lines
line = line.strip()
gc_count += line.count('G') + line.count('C')
total_bases += len(line)
gc_content_percentage = (gc_count / total_bases) * 100
return round(gc_content_percentage, 2)
# Print the computed statistics and information
print("Genome Assembly Statistics:")
print("---------------------------")
print(f"Total Length: {total_length}")
print(f"Number of Contigs: {num_contigs}")
print(f"Minimum Contig Length: {min_contig_length}")
print(f"Maximum Contig Length: {max_contig_length}")
print(f"Average Contig Length: {avg_contig_length}")
print(f"Median Contig Length: {median_contig_length}")
print(f"N50: {calculate_n50(contig_lengths)}")
print(f"GC Content: {calculate_gc_content(input_file)}%")
print("\nContig Length Distribution:")
print("---------------------------")
# Print contig length distribution
for length in contig_lengths:
print(length)