<?xml version='1.0'?><rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:atom="http://www.w3.org/2005/Atom" >
<channel>
	<title><![CDATA[BOL: Python script for basic stats of the assembled genome !]]></title>
	<link>https://bioinformaticsonline.com/snippets/view/44460/python-script-for-basic-stats-of-the-assembled-genome?</link>
	<atom:link href="https://bioinformaticsonline.com/snippets/view/44460/python-script-for-basic-stats-of-the-assembled-genome?" rel="self" type="application/rss+xml" />
	<description><![CDATA[]]></description>
	
	<item>
	<guid isPermaLink="true">https://bioinformaticsonline.com/snippets/view/44460/python-script-for-basic-stats-of-the-assembled-genome</guid>
	<pubDate>Thu, 01 Feb 2024 02:20:54 -0600</pubDate>
	<link>https://bioinformaticsonline.com/snippets/view/44460/python-script-for-basic-stats-of-the-assembled-genome</link>
	<title><![CDATA[Python script for basic stats of the assembled genome !]]></title>
	<description><![CDATA[<code>from Bio import SeqIO
import statistics

# Input file containing the genome assembly in FASTA format
input_file = &#039;genome_assembly.fasta&#039;

# Variables for computing statistics
total_length = 0
num_contigs = 0
contig_lengths = []

# Iterate through each sequence in the assembly
for record in SeqIO.parse(input_file, &#039;fasta&#039;):
    length = len(record.seq)
    total_length += length
    num_contigs += 1
    contig_lengths.append(length)

# Sort contig lengths in descending order
contig_lengths.sort(reverse=True)

# Calculate additional statistics
min_contig_length = min(contig_lengths)
max_contig_length = max(contig_lengths)
avg_contig_length = statistics.mean(contig_lengths)
median_contig_length = statistics.median(contig_lengths)

# Calculate N50
def calculate_n50(lengths):
    # Return the N50 of a collection of contig lengths: the length of the
    # contig at which the running total, taken from the longest contig
    # downwards, first reaches half of the combined length.
    #
    # lengths: iterable of non-negative contig lengths, in any order.
    # Returns None when no lengths are given.
    from bisect import bisect_left
    from itertools import accumulate

    # Sort here instead of trusting the caller; on input that is not in
    # descending order a plain front-to-back scan reports the wrong contig.
    ordered = sorted(lengths, reverse=True)
    if not ordered:
        return None

    running_totals = list(accumulate(ordered))
    half_size = running_totals[-1] / 2

    # running_totals never decreases, so bisect_left finds the first
    # position whose running total is at least half_size.
    return ordered[bisect_left(running_totals, half_size)]

# Calculate GC content
def calculate_gc_content(file):
    gc_count = 0
    total_bases = 0

    with open(file, &#039;r&#039;) as fh:
        for line in fh:
            if line.startswith(&#039;&gt;&#039;):
                continue  # Skip header lines
            line = line.strip()
            gc_count += line.count(&#039;G&#039;) + line.count(&#039;C&#039;)
            total_bases += len(line)

    gc_content_percentage = (gc_count / total_bases) * 100
    return round(gc_content_percentage, 2)

# Print the computed statistics and information
# Summary section: one labelled line per value computed above.
print(&quot;Genome Assembly Statistics:&quot;)
print(&quot;---------------------------&quot;)
print(f&quot;Total Length: {total_length}&quot;)
print(f&quot;Number of Contigs: {num_contigs}&quot;)
print(f&quot;Minimum Contig Length: {min_contig_length}&quot;)
print(f&quot;Maximum Contig Length: {max_contig_length}&quot;)
print(f&quot;Average Contig Length: {avg_contig_length}&quot;)
print(f&quot;Median Contig Length: {median_contig_length}&quot;)
# N50 and GC content are computed here, at print time; the GC helper
# opens and reads input_file again.
print(f&quot;N50: {calculate_n50(contig_lengths)}&quot;)
print(f&quot;GC Content: {calculate_gc_content(input_file)}%&quot;)
# The leading \n leaves a blank line between the two sections.
print(&quot;\nContig Length Distribution:&quot;)
print(&quot;---------------------------&quot;)

# Print contig length distribution
# One length per line, in the current order of contig_lengths
# (sorted in descending order earlier in the script).
for length in contig_lengths:
    print(length)</code>]]></description>
	<dc:creator>LEGE</dc:creator>
</item>

</channel>
</rss>