BOL: Owner

One-liner to convert minimap2 PAF output to a synPlotter TSV

LEGE — Wed, 03 Jun 2026 07:03:26 -0500

# Reorder PAF columns into a tab-separated table: fields 6, 8, 9, then 1, 3, 4, 5,
# followed by the reference and query species labels.
# NOTE(review): per the PAF layout these look like target name/start/end and
# query name/start/end/strand - confirm against the synPlotter input spec.
awk -v ref_species="species1" -v query_species="species2" 'BEGIN { OFS = "\t" } { print $6, $8, $9, $1, $3, $4, $5, ref_species, query_species }' mypaf.paf > synPlotter.tsv

Python script to split a DNA sequence into words of varying lengths

LEGE — Thu, 02 Jan 2025 11:31:22 -0600

# Script to split a DNA sequence into words of varying lengths
def split_dna_into_words(dna_sequence, min_length, max_length):
    """
    Enumerate every overlapping window ("word") of a DNA sequence, grouped by size.

    Parameters:
        dna_sequence (str): Sequence to scan (e.g., "ATGCGTAC"); A/T/C/G in either case.
        min_length (int): Smallest word size to produce (must be positive).
        max_length (int): Largest word size to produce (must be >= min_length).

    Returns:
        dict: Maps each word size to the list of words of that size, in
        left-to-right order. Sizes longer than the sequence map to an empty list.

    Raises:
        ValueError: If the sequence is empty, a length is not positive,
        min_length exceeds max_length, or a non-ATCG character is present.
    """
    # Guard clauses: reject bad input before doing any work.
    if not dna_sequence:
        raise ValueError("The DNA sequence cannot be empty.")
    if min_length <= 0 or max_length <= 0:
        raise ValueError("Word lengths must be positive integers.")
    if min_length > max_length:
        raise ValueError("Minimum length cannot be greater than maximum length.")

    # Report the first character that is not a valid nucleotide, if any.
    offending = next(
        (base for base in dna_sequence if base.upper() not in "ATCG"),
        None,
    )
    if offending is not None:
        raise ValueError(f"Invalid character '{offending}' found in DNA sequence.")

    # Slide a window of each requested size across the sequence.
    total = len(dna_sequence)
    grouped = {}
    for size in range(min_length, max_length + 1):
        windows = []
        for start in range(total - size + 1):
            windows.append(dna_sequence[start:start + size])
        grouped[size] = windows

    return grouped

# Example usage
def main():
    """Demo: print every word of length 3 through 99 from a sample sequence."""
    sample_sequence = "ATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTAATGCGTACGCTA"
    shortest, longest = 3, 99

    try:
        grouped = split_dna_into_words(sample_sequence, shortest, longest)
        for size in grouped:
            print(f"Words of length {size}:", grouped[size])
    except ValueError as err:
        # Validation failures are reported rather than crashing the demo.
        print("Error:", err)


if __name__ == "__main__":
    main()

Python script to find all possible repeats in a DNA string

LEGE — Mon, 16 Dec 2024 07:54:38 -0600

from collections import defaultdict

def find_repeats_in_genome(genome, min_length=2, max_length=None):
    """
    Finds all repeating sequences in a genome within a specified length range.

    Parameters:
        genome (str): The genome sequence.
        min_length (int): Minimum length of repeats to scan for (default: 2).
        max_length (int): Maximum length of repeats to scan for (default: None, meaning entire genome).

    Returns:
        dict: A defaultdict(list) where keys are repeating sequences and values
        are lists of their 0-based starting positions, in ascending order.
        Overlapping occurrences are counted.

    Raises:
        ValueError: If min_length is less than 1. A zero or negative window
        would otherwise report the empty string (or wrapped slices) as "repeats".
    """
    if min_length < 1:
        raise ValueError("Minimum repeat length must be a positive integer.")

    genome_length = len(genome)
    if max_length is None:
        max_length = genome_length

    repeats = defaultdict(list)

    # Iterate over all possible lengths of substrings
    for length in range(min_length, max_length + 1):
        seen = defaultdict(list)  # Tracks occurrences of substrings of the current length

        # Sliding window approach
        for start in range(genome_length - length + 1):
            seen[genome[start:start + length]].append(start)

        # Keep only substrings that appear more than once
        found_repeat = False
        for substring, positions in seen.items():
            if len(positions) > 1:
                repeats[substring] = positions
                found_repeat = True

        # Any repeat of length k+1 has a length-k prefix repeated at the same
        # positions, so once a length yields nothing, longer ones cannot either.
        if not found_repeat:
            break

    return repeats

# Example usage
def main():
    """Demo: report every repeat of length 2 through 5 in a short sample genome."""
    sample_genome = "ATCGATCGAATTCGATCG"  # Example genome sequence
    shortest, longest = 2, 5

    found = find_repeats_in_genome(sample_genome, shortest, longest)

    print("Repeating sequences:")
    for seq in found:
        print(f"Sequence: {seq}, Positions: {found[seq]}")


if __name__ == "__main__":
    main()

Python script for treemap using Python's Plotly library

LEGE — Sat, 14 Dec 2024 12:45:15 -0600

import plotly.express as px
import pandas as pd

# Sample dataset: one (category, subcategory, count) row per biological pathway
pathway_rows = [
    ("Metabolism", "Carbohydrate metabolism", 150),
    ("Metabolism", "Lipid metabolism", 120),
    ("Metabolism", "Amino acid metabolism", 90),
    ("Cellular Processes", "Signal transduction", 100),
    ("Cellular Processes", "Cell cycle", 85),
    ("Cellular Processes", "Transport", 70),
    ("Information Storage", "DNA replication", 110),
    ("Information Storage", "RNA processing", 95),
]

# Transpose the rows into the column-oriented mapping the DataFrame is built from
categories, subcategories, counts = (list(column) for column in zip(*pathway_rows))
data = {
    "Category": categories,
    "Subcategory": subcategories,
    "Count": counts,
}
df = pd.DataFrame(data)

# Treemap settings: two-level hierarchy, block size and colour both driven by Count
treemap_options = {
    "path": ["Category", "Subcategory"],
    "values": "Count",
    "color": "Count",
    "color_continuous_scale": "Viridis",
}
fig = px.treemap(df, **treemap_options)

# Title the figure, then display it
fig.update_layout(title="Treemap: Hierarchical Data Representation in Bioinformatics")
fig.show()

Bacterial Comparative Genomics Pipeline Bash Script

LEGE — Sat, 14 Dec 2024 12:34:57 -0600

#!/bin/bash

# Bacterial Comparative Genomics Pipeline Script
# This script automates key steps in bacterial comparative genomics using popular bioinformatics tools.

# Ensure the script stops on error
set -e

# Define paths
# NOTE(review): WORKDIR is created below but not otherwise used in the visible script.
WORKDIR="./bacterial_genomics_pipeline"
INPUT_FASTA_DIR="./input_genomes"
OUTPUT_DIR="./output"
CORE_PAN_DIR="$OUTPUT_DIR/core_pan_analysis"
PHYLOGENY_DIR="$OUTPUT_DIR/phylogeny"
ALIGNMENT_DIR="$OUTPUT_DIR/genome_alignment"
RESISTANCE_DIR="$OUTPUT_DIR/antibiotic_resistance"
SYNTENY_DIR="$OUTPUT_DIR/synteny_analysis"

# Create directories if they do not exist.
# Each path is quoted so spaces or glob characters cannot split or expand it.
mkdir -p "$WORKDIR" "$OUTPUT_DIR" "$CORE_PAN_DIR" "$PHYLOGENY_DIR" "$ALIGNMENT_DIR" "$RESISTANCE_DIR" "$SYNTENY_DIR"

# Tools required (command names, resolved through PATH)
PROKKA="prokka"
ROARY="roary"
MAUVE="progressiveMauve"
IQTREE="iqtree"
ABRICATE="abricate"
# Command names are case-sensitive; this matches the MCScanX command the synteny step runs.
MCSCANX="MCScanX"

# Step 1: Genome Annotation using Prokka
annotate_genomes() {
  # Run Prokka on every *.fasta file in $INPUT_FASTA_DIR, writing each genome's
  # annotation to $OUTPUT_DIR/annotation_<name> with <name> as the file prefix.
  local fasta basename output_path

  # printf, because bash's builtin echo prints "\n" literally unless -e is given.
  printf '\n=== Annotating Genomes with Prokka ===\n'
  for fasta in "$INPUT_FASTA_DIR"/*.fasta; do
    # An unmatched glob stays literal; skip it rather than hand Prokka a bogus path.
    [ -e "$fasta" ] || continue
    basename=$(basename "$fasta" .fasta)
    output_path="$OUTPUT_DIR/annotation_$basename"
    echo "Annotating $basename..."
    "$PROKKA" --outdir "$output_path" --prefix "$basename" "$fasta"
  done
}

# Step 2: Core and Pan-genome Analysis using Roary
core_pan_analysis() {
  # Collect every *.gff file under $OUTPUT_DIR and pass them to Roary.
  # Returns 1 without calling Roary when no GFF files are found.
  local roary_output="$CORE_PAN_DIR/pan_genome_analysis"
  local gff
  local gff_files=()

  # printf, because bash's builtin echo prints "\n" literally unless -e is given.
  printf '\n=== Performing Core and Pan-genome Analysis with Roary ===\n'

  # NUL-delimited read keeps paths containing spaces or newlines intact.
  while IFS= read -r -d '' gff; do
    gff_files+=("$gff")
  done < <(find "$OUTPUT_DIR" -name "*.gff" -print0)

  if [ "${#gff_files[@]}" -eq 0 ]; then
    echo "No .gff files found under $OUTPUT_DIR; run the annotation step first." >&2
    return 1
  fi

  mkdir -p "$roary_output"
  # NOTE(review): in Roary's help, -o names the clusters output file and -f sets
  # the output directory - confirm that -o with a directory path is intended.
  "$ROARY" -e -n -v -p 8 -o "$roary_output" "${gff_files[@]}"
}

# Step 3: Whole Genome Alignment using Mauve
align_genomes() {
  # Align every *.fasta file found under $INPUT_FASTA_DIR with progressiveMauve,
  # writing the result to $ALIGNMENT_DIR/aligned_genomes.xmfa.
  # Returns 1 without calling Mauve when no FASTA files are found.
  local alignment_output="$ALIGNMENT_DIR/aligned_genomes.xmfa"
  local genome
  local genomes=()

  # printf, because bash's builtin echo prints "\n" literally unless -e is given.
  printf '\n=== Aligning Genomes with Mauve ===\n'

  # NUL-delimited read keeps paths containing spaces or newlines intact.
  while IFS= read -r -d '' genome; do
    genomes+=("$genome")
  done < <(find "$INPUT_FASTA_DIR" -name "*.fasta" -print0)

  if [ "${#genomes[@]}" -eq 0 ]; then
    echo "No .fasta files found under $INPUT_FASTA_DIR." >&2
    return 1
  fi

  echo "Running Mauve on input genomes..."
  "$MAUVE" --output="$alignment_output" "${genomes[@]}"
  echo "Alignment saved to $alignment_output"
}

# Step 4: Phylogenetic Tree Construction using IQ-TREE
construct_phylogeny() {
  # Run IQ-TREE (GTR+G model, automatic thread count) on the alignment written
  # by the alignment step, using $PHYLOGENY_DIR/phylogeny_tree as output prefix.
  local alignment="$ALIGNMENT_DIR/aligned_genomes.xmfa"
  local phylo_output="$PHYLOGENY_DIR/phylogeny_tree"
  local iqtree_output="$phylo_output.treefile"

  # printf, because bash's builtin echo prints "\n" literally unless -e is given.
  printf '\n=== Constructing Phylogenetic Tree with IQ-TREE ===\n'

  echo "Running IQ-TREE on aligned genomes..."
  # NOTE(review): the input is an XMFA file; confirm IQ-TREE accepts it directly
  # or whether it must first be converted (e.g. to FASTA/PHYLIP).
  "$IQTREE" -s "$alignment" -m GTR+G -nt AUTO -pre "$phylo_output"
  echo "Phylogenetic tree saved to $iqtree_output"
}

# Step 5: Antibiotic Resistance Gene Identification using ABRicate
identify_resistance_genes() {
  # Run ABRicate on every *.fasta file in $INPUT_FASTA_DIR and save each report
  # to $RESISTANCE_DIR/<name>_resistance.txt.
  local fasta basename output_path

  # printf, because bash's builtin echo prints "\n" literally unless -e is given.
  printf '\n=== Identifying Antibiotic Resistance Genes with ABRicate ===\n'
  for fasta in "$INPUT_FASTA_DIR"/*.fasta; do
    # An unmatched glob stays literal; skip it rather than scan a non-existent file.
    [ -e "$fasta" ] || continue
    basename=$(basename "$fasta" .fasta)
    output_path="$RESISTANCE_DIR/${basename}_resistance.txt"
    echo "Analyzing $basename for resistance genes..."
    # Use the configured tool variable, as the other steps do, not a hard-coded name.
    "$ABRICATE" "$fasta" > "$output_path"
  done
}

# Step 6: Synteny Analysis using MCScanX
synteny_analysis() {
  # Run MCScanX against $OUTPUT_DIR and capture its standard output in
  # $SYNTENY_DIR/synteny_results/results.txt.
  local synteny_output="$SYNTENY_DIR/synteny_results"

  # printf, because bash's builtin echo prints "\n" literally unless -e is given.
  printf '\n=== Performing Synteny Analysis with MCScanX ===\n'
  mkdir -p "$synteny_output"
  echo "Running MCScanX on annotated genomes..."
  # NOTE(review): MCScanX appears to expect a path prefix shared by a .blast and
  # a .gff file, not a directory - confirm the intended input.
  MCScanX "$OUTPUT_DIR" > "$synteny_output/results.txt"
  echo "Synteny analysis results saved to $synteny_output"
}

# Main workflow
# Run the pipeline steps in order; with `set -e` a failing step aborts the run.
annotate_genomes
core_pan_analysis
align_genomes
construct_phylogeny
identify_resistance_genes
synteny_analysis

# printf, because bash's builtin echo prints "\n" literally unless -e is given.
printf '\n=== Bacterial Comparative Genomics Pipeline Complete ===\n'
echo "Results saved in $OUTPUT_DIR"

How to upgrade Ubuntu to a new release

LEGE — Fri, 06 Dec 2024 23:36:11 -0600

# Step 1: install ubuntu-release-upgrader-core if it is not already installed:

sudo apt-get install ubuntu-release-upgrader-core
# Step 2 (manual edit, not run by this snippet): open
# /etc/update-manager/release-upgrades and set Prompt=normal

# Step 3: launch the upgrade tool:

do-release-upgrade
# Step 4: follow the on-screen instructions.

Install EDirect (NCBI Entrez Direct)

LEGE — Thu, 03 Oct 2024 01:52:15 -0500

# Download install-edirect.sh from the NCBI FTP site with curl and run it with sh.
# curl flags: -f fail on HTTP errors, -s silent, -S still show errors, -L follow redirects.
sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"

Extract the fasta sequence using ids

LEGE — Thu, 03 Oct 2024 01:27:32 -0500

# Extract from input.fasta the sequences whose names are listed in name.list
# (one sequence name per line) and write them to output.fasta:
seqtk subseq input.fasta name.list > output.fasta

R script to add p-values to plots

LEGE — Tue, 17 Sep 2024 20:23:02 -0500

# Load the plotting packages; ggboxplot() and stat_compare_means() come from ggpubr.
library(ggplot2)
library(tidyverse)
library(ggpubr)
# Pairs of dose levels to test against each other.
my_comp <- list( c("0.5", "1"), c("1", "2"), c("0.5", "2") )
# Box plot of len by dose from ToothGrowth, filled per dose with the Dark2 palette.
ggboxplot(ToothGrowth,
 x = "dose", 
 y = "len",
 fill = "dose", 
 palette = "Dark2")+
 # Add a t-test comparison for each pair in my_comp, labelled with the formatted p-value.
 stat_compare_means(label = "p.format",
 comparisons = my_comp,
 method = "t.test",
 # NOTE(review): cutpoints define two intervals but only one symbol is given;
 # confirm how ggpubr labels p-values between 0.001 and 1 with this setting.
 symnum.args = list(cutpoints = c(0, 0.001, 1), 
 symbols = "p < 0.001"))

Commands to create conda env

LEGE — Mon, 13 May 2024 06:38:11 -0500

(base) [lege@hn1 testVisanu]$ conda create -n pythonENV python=3.10 scipy=1.13.0 astroid babel
Channels:
 - conda-forge
 - bioconda
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
    current version: 24.3.0
    latest version: 24.4.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /home/lege/miniforge3/envs/pythonENV

  added / updated specs:
    - astroid
    - babel
    - python=3.10
    - scipy=1.13.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    astroid-3.2.0              |  py310hff52083_0         389 KB  conda-forge
    babel-2.14.0               |     pyhd8ed1ab_0         7.3 MB  conda-forge
    libblas-3.9.0              |22_linux64_openblas          14 KB  conda-forge
    libcblas-3.9.0             |22_linux64_openblas          14 KB  conda-forge
    libgfortran-ng-13.2.0      |       h69a702a_7          24 KB  conda-forge
    libgfortran5-13.2.0        |       hca663fb_7         1.4 MB  conda-forge
    liblapack-3.9.0            |22_linux64_openblas          14 KB  conda-forge
    libopenblas-0.3.27         |pthreads_h413a1c8_0         5.3 MB  conda-forge
    numpy-1.26.4               |  py310hb13e2d6_0         6.7 MB  conda-forge
    pytz-2024.1                |     pyhd8ed1ab_0         184 KB  conda-forge
    scipy-1.13.0               |  py310h93e2701_1        15.8 MB  conda-forge
    typing-extensions-4.11.0   |       hd8ed1ab_0          10 KB  conda-forge
    typing_extensions-4.11.0   |     pyha770c72_0          37 KB  conda-forge
    ------------------------------------------------------------
                                           Total:        37.1 MB

The following NEW packages will be INSTALLED:

  _libgcc_mutex      conda-forge/linux-64::_libgcc_mutex-0.1-conda_forge 
  _openmp_mutex      conda-forge/linux-64::_openmp_mutex-4.5-2_gnu 
  astroid            conda-forge/linux-64::astroid-3.2.0-py310hff52083_0 
  babel              conda-forge/noarch::babel-2.14.0-pyhd8ed1ab_0 
  bzip2              conda-forge/linux-64::bzip2-1.0.8-hd590300_5 
  ca-certificates    conda-forge/linux-64::ca-certificates-2024.2.2-hbcca054_0 
  ld_impl_linux-64   conda-forge/linux-64::ld_impl_linux-64-2.40-h55db66e_0 
  libblas            conda-forge/linux-64::libblas-3.9.0-22_linux64_openblas 
  libcblas           conda-forge/linux-64::libcblas-3.9.0-22_linux64_openblas 
  libffi             conda-forge/linux-64::libffi-3.4.2-h7f98852_5 
  libgcc-ng          conda-forge/linux-64::libgcc-ng-13.2.0-h77fa898_7 
  libgfortran-ng     conda-forge/linux-64::libgfortran-ng-13.2.0-h69a702a_7 
  libgfortran5       conda-forge/linux-64::libgfortran5-13.2.0-hca663fb_7 
  libgomp            conda-forge/linux-64::libgomp-13.2.0-h77fa898_7 
  liblapack          conda-forge/linux-64::liblapack-3.9.0-22_linux64_openblas 
  libnsl             conda-forge/linux-64::libnsl-2.0.1-hd590300_0 
  libopenblas        conda-forge/linux-64::libopenblas-0.3.27-pthreads_h413a1c8_0 
  libsqlite          conda-forge/linux-64::libsqlite-3.45.3-h2797004_0 
  libstdcxx-ng       conda-forge/linux-64::libstdcxx-ng-13.2.0-hc0a3c3a_7 
  libuuid            conda-forge/linux-64::libuuid-2.38.1-h0b41bf4_0 
  libxcrypt          conda-forge/linux-64::libxcrypt-4.4.36-hd590300_1 
  libzlib            conda-forge/linux-64::libzlib-1.2.13-hd590300_5 
  ncurses            conda-forge/linux-64::ncurses-6.5-h59595ed_0 
  numpy              conda-forge/linux-64::numpy-1.26.4-py310hb13e2d6_0 
  openssl            conda-forge/linux-64::openssl-3.3.0-hd590300_0 
  pip                conda-forge/noarch::pip-24.0-pyhd8ed1ab_0 
  python             conda-forge/linux-64::python-3.10.14-hd12c33a_0_cpython 
  python_abi         conda-forge/linux-64::python_abi-3.10-4_cp310 
  pytz               conda-forge/noarch::pytz-2024.1-pyhd8ed1ab_0 
  readline           conda-forge/linux-64::readline-8.2-h8228510_1 
  scipy              conda-forge/linux-64::scipy-1.13.0-py310h93e2701_1 
  setuptools         conda-forge/noarch::setuptools-69.5.1-pyhd8ed1ab_0 
  tk                 conda-forge/linux-64::tk-8.6.13-noxft_h4845f30_101 
  typing-extensions  conda-forge/noarch::typing-extensions-4.11.0-hd8ed1ab_0 
  typing_extensions  conda-forge/noarch::typing_extensions-4.11.0-pyha770c72_0 
  tzdata             conda-forge/noarch::tzdata-2024a-h0c530f3_0 
  wheel              conda-forge/noarch::wheel-0.43.0-pyhd8ed1ab_1 
  xz                 conda-forge/linux-64::xz-5.2.6-h166bdaf_0 


Proceed ([y]/n)? y


Downloading and Extracting Packages:
                                                                                                    
Preparing transaction: done                                                                         
Verifying transaction: done                                                                         
Executing transaction: done                                                                         
#                                                                                                   
# To activate this environment, use                                                                 
#                                                                                                   
#     $ conda activate pythonENV                                                                    
#                                                                                                   
# To deactivate an active environment, use                                                          
#                                                                                                   
#     $ conda deactivate