BOL: All

One-liner to convert minimap2 PAF output to a synPlotter TSV

LEGE — Wed, 03 Jun 2026 07:03:26 -0500

# Reshape PAF records into a tab-separated table: target name/start/end
# (columns 6, 8, 9), query name/start/end (columns 1, 3, 4), strand (column 5),
# followed by the reference and query species labels.
awk -v ref_species="species1" -v query_species="species2" '
  BEGIN { OFS = "\t" }
  { print $6, $8, $9, $1, $3, $4, $5, ref_species, query_species }
' mypaf.paf > synPlotter.tsv

One-liner to convert minimap2 PAF output to a synPlotter TSV

Jitendra Narayan — Wed, 03 Jun 2026 07:02:19 -0500

# Pick the target (6, 8, 9), query (1, 3, 4) and strand (5) columns out of each
# PAF line and append the two species labels; output is tab-separated.
awk -v ref_species="species1" -v query_species="species2" '
  BEGIN { OFS = "\t" }
  { print $6, $8, $9, $1, $3, $4, $5, ref_species, query_species }
' mypaf.paf > synPlotter.tsv

Python script to split a DNA sequence into words of varying lengths

LEGE — Thu, 02 Jan 2025 11:31:22 -0600

# Script to split a DNA sequence into words of varying lengths
def split_dna_into_words(dna_sequence, min_length, max_length):
    """
    Enumerate every overlapping word of each length in [min_length, max_length].

    Parameters:
        dna_sequence (str): The DNA sequence to split (e.g., "ATGCGTAC").
        min_length (int): The shortest word length to produce.
        max_length (int): The longest word length to produce.

    Returns:
        dict: Maps each word length to the list of words of that length, taken
        at every start position from left to right. A length larger than the
        sequence maps to an empty list.

    Raises:
        ValueError: If the sequence is empty, a length is not positive,
        min_length exceeds max_length, or a character is not A, T, C or G
        (checked case-insensitively).
    """
    if not dna_sequence:
        raise ValueError("The DNA sequence cannot be empty.")
    if min_length <= 0 or max_length <= 0:
        raise ValueError("Word lengths must be positive integers.")
    if min_length > max_length:
        raise ValueError("Minimum length cannot be greater than maximum length.")

    # Report the first character that is not a valid nucleotide, if any
    offending = next(
        (base for base in dna_sequence if base.upper() not in "ATCG"),
        None,
    )
    if offending is not None:
        raise ValueError(f"Invalid character '{offending}' found in DNA sequence.")

    total = len(dna_sequence)
    return {
        size: [dna_sequence[start:start + size] for start in range(total - size + 1)]
        for size in range(min_length, max_length + 1)
    }

# Example usage
def main():
    """Split a sample sequence into words of length 3 to 99 and print each list."""
    sample = "ATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTA"
    shortest, longest = 3, 99

    try:
        grouped = split_dna_into_words(sample, shortest, longest)
        for size in grouped:
            print(f"Words of length {size}:", grouped[size])
    except ValueError as err:
        # Invalid input is reported rather than raised to the caller
        print("Error:", err)


if __name__ == "__main__":
    main()

Python script to find all possible repeats in a DNA string !

LEGE — Mon, 16 Dec 2024 07:54:38 -0600

from collections import defaultdict

def find_repeats_in_genome(genome, min_length=2, max_length=None):
    """
    Finds all repeating sequences in a genome within a specified length range.

    A sequence counts as a repeat when it starts at two or more positions;
    overlapping occurrences are counted.

    Parameters:
        genome (str): The genome sequence.
        min_length (int): Minimum length of repeats to scan for (default: 2).
        max_length (int): Maximum length of repeats to scan for (default: None,
            meaning entire genome). Values beyond the genome length are capped.

    Returns:
        dict: A dictionary (a defaultdict of lists) where keys are repeating
        sequences and values are lists of 0-based starting positions in
        ascending order. Keys are ordered by length, then by first occurrence.

    Raises:
        ValueError: If min_length is smaller than 1.
    """
    # A zero or negative length would report empty or meaningless slices as repeats
    if min_length < 1:
        raise ValueError("min_length must be a positive integer.")

    # No substring can be longer than the genome, so never scan past it
    if max_length is None or max_length > len(genome):
        max_length = len(genome)

    repeats = defaultdict(list)

    # Iterate over all possible lengths of substrings
    for length in range(min_length, max_length + 1):
        seen = defaultdict(list)  # Tracks occurrences of substrings of the current length

        # Sliding window approach
        for i in range(len(genome) - length + 1):
            seen[genome[i:i + length]].append(i)

        # Keep substrings that appear more than once
        found_any = False
        for substring, positions in seen.items():
            if len(positions) > 1:
                repeats[substring] = positions
                found_any = True

        # Every repeat of length L + 1 starts with a repeat of length L, so once
        # a length yields nothing, no longer length can yield anything either
        if not found_any:
            break

    return repeats

# Example usage
def main():
    """Scan a short sample genome for repeats of length 2 to 5 and print them."""
    sample_genome = "ATCGATCGAATTCGATCG"  # Example genome sequence
    shortest, longest = 2, 5

    found = find_repeats_in_genome(sample_genome, shortest, longest)

    print("Repeating sequences:")
    for repeated_seq, starts in found.items():
        print(f"Sequence: {repeated_seq}, Positions: {starts}")


if __name__ == "__main__":
    main()

Python script for treemap using Python's Plotly library

LEGE — Sat, 14 Dec 2024 12:45:15 -0600

import plotly.express as px
import pandas as pd

# Sample dataset: one (category, subcategory, count) row per biological pathway
pathway_rows = [
    ("Metabolism", "Carbohydrate metabolism", 150),
    ("Metabolism", "Lipid metabolism", 120),
    ("Metabolism", "Amino acid metabolism", 90),
    ("Cellular Processes", "Signal transduction", 100),
    ("Cellular Processes", "Cell cycle", 85),
    ("Cellular Processes", "Transport", 70),
    ("Information Storage", "DNA replication", 110),
    ("Information Storage", "RNA processing", 95),
]

# Pivot the rows into the column-oriented mapping the DataFrame is built from
categories, subcategories, counts = zip(*pathway_rows)
data = {
    "Category": list(categories),
    "Subcategory": list(subcategories),
    "Count": list(counts),
}

# Tabular form of the dataset
df = pd.DataFrame(data)

# Treemap settings: two hierarchy levels, with block size and color both
# driven by the Count column on the Viridis scale
treemap_options = {
    "path": ["Category", "Subcategory"],
    "values": "Count",
    "color": "Count",
    "color_continuous_scale": "Viridis",
}
fig = px.treemap(df, **treemap_options)

# Title is applied after construction through the layout
fig.update_layout(title="Treemap: Hierarchical Data Representation in Bioinformatics")

# Render the figure
fig.show()

Bacterial Comparative Genomics Pipeline Bash Script

LEGE — Sat, 14 Dec 2024 12:34:57 -0600

#!/bin/bash

# Bacterial Comparative Genomics Pipeline Script
# This script automates key steps in bacterial comparative genomics using popular bioinformatics tools.

# Ensure the script stops on error
set -e

# Define paths
WORKDIR="./bacterial_genomics_pipeline"
INPUT_FASTA_DIR="./input_genomes"
OUTPUT_DIR="./output"
CORE_PAN_DIR="$OUTPUT_DIR/core_pan_analysis"
PHYLOGENY_DIR="$OUTPUT_DIR/phylogeny"
ALIGNMENT_DIR="$OUTPUT_DIR/genome_alignment"
RESISTANCE_DIR="$OUTPUT_DIR/antibiotic_resistance"
SYNTENY_DIR="$OUTPUT_DIR/synteny_analysis"

# Create directories if they do not exist.
# Quoted so the paths survive being edited to contain spaces or glob characters.
mkdir -p "$WORKDIR" "$OUTPUT_DIR" "$CORE_PAN_DIR" "$PHYLOGENY_DIR" \
         "$ALIGNMENT_DIR" "$RESISTANCE_DIR" "$SYNTENY_DIR"

# Tools required (command names resolved through PATH)
PROKKA="prokka"
ROARY="roary"
MAUVE="progressiveMauve"
IQTREE="iqtree"
ABRICATE="abricate"
# Command names are case-sensitive; this matches the MCScanX command the
# synteny step invokes.
MCSCANX="MCScanX"

# Step 1: Genome Annotation using Prokka
# Runs Prokka once per *.fasta file in $INPUT_FASTA_DIR, writing each result to
# $OUTPUT_DIR/annotation_<name>, where <name> is the file name without .fasta.
annotate_genomes() {
  # printf is used because bash's builtin echo prints "\n" literally without -e
  printf '\n=== Annotating Genomes with Prokka ===\n'
  local fasta sample output_path
  for fasta in "$INPUT_FASTA_DIR"/*.fasta; do
    # An unmatched glob is left as the literal pattern; skip it rather than
    # handing a non-existent file to Prokka
    [ -e "$fasta" ] || continue
    sample=$(basename "$fasta" .fasta)
    output_path="$OUTPUT_DIR/annotation_$sample"
    echo "Annotating $sample..."
    "$PROKKA" --outdir "$output_path" --prefix "$sample" "$fasta"
  done
}

# Step 2: Core and Pan-genome Analysis using Roary
# Collects every *.gff file below $OUTPUT_DIR and passes them to Roary, with
# results directed to $CORE_PAN_DIR/pan_genome_analysis.
core_pan_analysis() {
  # printf is used because bash's builtin echo prints "\n" literally without -e
  printf '\n=== Performing Core and Pan-genome Analysis with Roary ===\n'
  local roary_output="$CORE_PAN_DIR/pan_genome_analysis"
  local gff_files=() gff

  # NUL-delimited read keeps file names containing spaces intact
  while IFS= read -r -d '' gff; do
    gff_files+=("$gff")
  done < <(find "$OUTPUT_DIR" -name "*.gff" -print0)

  if [ "${#gff_files[@]}" -eq 0 ]; then
    echo "No .gff files found under $OUTPUT_DIR; run the annotation step first." >&2
    return 1
  fi

  # Roary's output directory is selected with -f; -o only names the clusters
  # file, so passing the directory to -o does not place the results there.
  # The directory is not pre-created here.
  # NOTE(review): Roary is reported to switch to a timestamp-suffixed directory
  # when the -f target already exists - confirm with your Roary version.
  "$ROARY" -e -n -v -p 8 -f "$roary_output" "${gff_files[@]}"
}

# Step 3: Whole Genome Alignment using Mauve
# Aligns every *.fasta file found below $INPUT_FASTA_DIR in one Mauve run and
# writes the alignment to $ALIGNMENT_DIR/aligned_genomes.xmfa.
align_genomes() {
  # printf is used because bash's builtin echo prints "\n" literally without -e
  printf '\n=== Aligning Genomes with Mauve ===\n'
  local alignment_output="$ALIGNMENT_DIR/aligned_genomes.xmfa"
  local genomes=() genome

  # NUL-delimited read keeps file names containing spaces intact
  while IFS= read -r -d '' genome; do
    genomes+=("$genome")
  done < <(find "$INPUT_FASTA_DIR" -name "*.fasta" -print0)

  if [ "${#genomes[@]}" -eq 0 ]; then
    echo "No .fasta files found in $INPUT_FASTA_DIR; nothing to align." >&2
    return 1
  fi

  echo "Running Mauve on input genomes..."
  "$MAUVE" --output="$alignment_output" "${genomes[@]}"
  echo "Alignment saved to $alignment_output"
}

# Step 4: Phylogenetic Tree Construction using IQ-TREE
# Runs IQ-TREE (GTR+G model, automatic thread count) on the alignment written
# by the Mauve step, using $PHYLOGENY_DIR/phylogeny_tree as the output prefix.
construct_phylogeny() {
  # printf is used because bash's builtin echo prints "\n" literally without -e
  printf '\n=== Constructing Phylogenetic Tree with IQ-TREE ===\n'
  local alignment="$ALIGNMENT_DIR/aligned_genomes.xmfa"
  local phylo_output="$PHYLOGENY_DIR/phylogeny_tree"
  local iqtree_output="$phylo_output.treefile"

  # NOTE(review): the Mauve XMFA file is handed to IQ-TREE unchanged - confirm
  # that IQ-TREE accepts this format, or convert the alignment first.
  echo "Running IQ-TREE on aligned genomes..."
  "$IQTREE" -s "$alignment" -m GTR+G -nt AUTO -pre "$phylo_output"
  echo "Phylogenetic tree saved to $iqtree_output"
}

# Step 5: Antibiotic Resistance Gene Identification using ABRicate
# Screens each *.fasta file in $INPUT_FASTA_DIR and saves the report as
# $RESISTANCE_DIR/<name>_resistance.txt.
identify_resistance_genes() {
  # printf is used because bash's builtin echo prints "\n" literally without -e
  printf '\n=== Identifying Antibiotic Resistance Genes with ABRicate ===\n'
  local fasta sample output_path
  for fasta in "$INPUT_FASTA_DIR"/*.fasta; do
    # An unmatched glob is left as the literal pattern; skip it
    [ -e "$fasta" ] || continue
    sample=$(basename "$fasta" .fasta)
    output_path="$RESISTANCE_DIR/${sample}_resistance.txt"
    echo "Analyzing $sample for resistance genes..."
    # Use the configured tool name, like the other steps, instead of a
    # hard-coded command
    "$ABRICATE" "$fasta" > "$output_path"
  done
}

# Step 6: Synteny Analysis using MCScanX
# Runs MCScanX with $OUTPUT_DIR as its argument and captures its standard
# output in $SYNTENY_DIR/synteny_results/results.txt.
synteny_analysis() {
  # printf is used because bash's builtin echo prints "\n" literally without -e
  printf '\n=== Performing Synteny Analysis with MCScanX ===\n'
  local synteny_output="$SYNTENY_DIR/synteny_results"
  mkdir -p "$synteny_output"
  echo "Running MCScanX on annotated genomes..."
  # NOTE(review): MCScanX is normally given an input prefix that has matching
  # .blast and .gff files; no earlier step here produces those - confirm the
  # intended input before relying on this result.
  MCScanX "$OUTPUT_DIR" > "$synteny_output/results.txt"
  echo "Synteny analysis results saved to $synteny_output"
}

# Main workflow: run the six steps in order; with `set -e` a failing step stops the run
annotate_genomes
core_pan_analysis
align_genomes
construct_phylogeny
identify_resistance_genes
synteny_analysis

# printf is used because bash's builtin echo prints "\n" literally without -e
printf '\n=== Bacterial Comparative Genomics Pipeline Complete ===\n'
echo "Results saved in $OUTPUT_DIR"

Bash script to discover piRNA in transcriptome data !

Abhi — Fri, 13 Dec 2024 11:47:00 -0600

#!/bin/bash

# piRNA discovery pipeline: quality control, adapter trimming, genome mapping
# and size selection of small-RNA reads.

# Stop at the first failing command, unset variable or failed pipeline stage,
# so later steps (including the cleanup) never run on partial results.
set -euo pipefail

# Variables (modify these as per your setup)
INPUT_FASTQ="input_reads.fastq"
ADAPTER_SEQ="TGGAATTCTCGGGTGCCAAGG"
REFERENCE_GENOME="reference_genome.fa"
BOWTIE_INDEX="reference_index"
OUTPUT_DIR="piRNA_analysis"
THREADS=4

# Create output directory
mkdir -p "$OUTPUT_DIR"

# Step 1: Quality Control
echo "Running FastQC for quality control..."
fastqc "$INPUT_FASTQ" -o "$OUTPUT_DIR"

# Step 2: Adapter Trimming
echo "Trimming adapters with Cutadapt..."
cutadapt -a "$ADAPTER_SEQ" -o "$OUTPUT_DIR/trimmed_reads.fastq" "$INPUT_FASTQ"

# Step 3: Mapping Reads to Reference Genome
# NOTE: the index named by BOWTIE_INDEX must already exist; this script never
# builds it from REFERENCE_GENOME.
echo "Mapping reads to reference genome using Bowtie..."
bowtie -v 1 -k 1 --best -p "$THREADS" "$BOWTIE_INDEX" "$OUTPUT_DIR/trimmed_reads.fastq" -S "$OUTPUT_DIR/aligned_reads.sam"

# Step 4: Convert SAM to BAM and Sort
echo "Converting SAM to BAM and sorting..."
samtools view -Sb "$OUTPUT_DIR/aligned_reads.sam" | samtools sort -o "$OUTPUT_DIR/sorted_reads.bam"

# Step 5: Extract Reads of piRNA Size (24-32 nt)
echo "Filtering reads of size 24-32 nt..."
# Keep only aligned records (-F 4 drops SAM flag 4, "unmapped") so reads that
# did not map to the genome cannot end up in the piRNA candidate set.
samtools view -b -F 4 -o "$OUTPUT_DIR/mapped_reads.bam" "$OUTPUT_DIR/sorted_reads.bam"
bedtools bamtofastq -i "$OUTPUT_DIR/mapped_reads.bam" -fq "$OUTPUT_DIR/all_reads.fastq"
seqkit seq -m 24 -M 32 "$OUTPUT_DIR/all_reads.fastq" > "$OUTPUT_DIR/piRNA_size_reads.fastq"

# Step 6: Detect Sequence Bias (Optional)
# Writes the size-selected sequences as FASTA records named seq1, seq2, ...
echo "Checking sequence bias using WebLogo-compatible data..."
seqkit fx2tab "$OUTPUT_DIR/piRNA_size_reads.fastq" | cut -f2 | awk '{print ">seq"NR"\n"$0}' > "$OUTPUT_DIR/piRNA_sequences.fa"

# Step 7: Identify piRNA Clusters
# This step requires a tool like proTRAC or PIRANHA. Example placeholder:
echo "Identifying piRNA clusters (requires proTRAC or PIRANHA)..."
# Example with proTRAC:
# proTRAC.pl -s $OUTPUT_DIR/sorted_reads.bam -g $REFERENCE_GENOME -o $OUTPUT_DIR/clusters

# Step 8: Annotate Clusters
# Annotation depends on your genome's annotation file
# bedtools intersect example placeholder:
# bedtools intersect -a clusters.bed -b genome_annotation.gtf > annotated_clusters.bed

# Step 9: Clean up intermediate files (optional)
echo "Cleaning up intermediate files..."
# -f so an already-removed file does not abort the script at the very end
rm -f "$OUTPUT_DIR/aligned_reads.sam" "$OUTPUT_DIR/mapped_reads.bam" "$OUTPUT_DIR/all_reads.fastq"

# Done
echo "piRNA discovery pipeline completed! Results are in $OUTPUT_DIR."

Python script to split a genome sequence into overlapping windows of 100 base pairs

Neel — Wed, 11 Dec 2024 23:32:55 -0600

def split_genome(sequence, window_size=100, step=1):
    """
    Splits a genome sequence into overlapping windows.

    Only full-length windows are produced: a trailing stretch shorter than
    window_size is dropped, and a sequence shorter than window_size yields an
    empty list.

    Args:
        sequence (str): The genome sequence.
        window_size (int): Size of each window (default: 100).
        step (int): Distance between consecutive window starts (default: 1).

    Returns:
        list: A list of genome windows, in order of start position.

    Raises:
        ValueError: If window_size or step is not a positive integer.
    """
    # Non-positive values would otherwise produce empty or negative-index
    # slices, or an opaque error from range()
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer.")
    if step <= 0:
        raise ValueError("step must be a positive integer.")

    last_start = len(sequence) - window_size
    return [sequence[i:i + window_size] for i in range(0, last_start + 1, step)]

# Example usage:
if __name__ == "__main__":
    genome_sequence = "ATGCGTACGTTAGCTACGATCGTACGATCGTACGATCGATCGTAGCATCGATCGTACG"
    window_size = 100
    step_size = 1

    # Get overlapping windows
    genome_windows = split_genome(genome_sequence, window_size, step_size)

    # Print results
    for idx, window in enumerate(genome_windows):
        print(f"Window {idx + 1}: {window}")

    # A sequence shorter than the window yields no windows at all; say so
    # instead of finishing without any output
    if not genome_windows:
        print(
            f"No windows produced: sequence length {len(genome_sequence)} "
            f"is shorter than window size {window_size}."
        )

Method to upgrade Ubuntu

LEGE — Fri, 06 Dec 2024 23:36:11 -0600

# Install ubuntu-release-upgrader-core if it is not already installed:

sudo apt-get install ubuntu-release-upgrader-core
# Edit /etc/update-manager/release-upgrades and set Prompt=normal

# Launch the upgrade tool (it needs root privileges, like the install above):

sudo do-release-upgrade
# Follow the on-screen instructions.

Install Edirect !

LEGE — Thu, 03 Oct 2024 01:52:15 -0500

# Download the EDirect install script from the NCBI FTP site with curl and run
# its text through sh. The script is passed as a command string rather than
# piped in, so the installer's standard input stays attached to the terminal.
# curl flags: -f fail on HTTP errors, -s silent, -S still print errors, -L follow redirects.
sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"