BOL: Owner

Bash script to discover piRNA in transcriptome data !

Abhi — Fri, 13 Dec 2024 11:47:00 -0600

#!/bin/bash
#
# piRNA discovery pipeline for single-end small-RNA reads:
#   FastQC -> Cutadapt -> Bowtie -> samtools (BAM + sort) -> size filter
#   (24-32 nt) -> FASTA export for sequence-logo tools.
# Needs on PATH: fastqc, cutadapt, bowtie, samtools, bedtools, seqkit.

# Stop at the first failing step (including any stage of a pipeline) and on
# unset variables, so later steps never run on missing or truncated files.
set -euo pipefail

# Variables (modify these as per your setup)
INPUT_FASTQ="input_reads.fastq"
ADAPTER_SEQ="TGGAATTCTCGGGTGCCAAGG"
# Only referenced by the commented-out proTRAC example in Step 7.
REFERENCE_GENOME="reference_genome.fa"
BOWTIE_INDEX="reference_index"
OUTPUT_DIR="piRNA_analysis"
THREADS=4

# Create output directory
mkdir -p -- "$OUTPUT_DIR"

# Step 1: Quality Control
echo "Running FastQC for quality control..."
fastqc "$INPUT_FASTQ" -o "$OUTPUT_DIR"

# Step 2: Adapter Trimming
echo "Trimming adapters with Cutadapt..."
cutadapt -a "$ADAPTER_SEQ" -o "$OUTPUT_DIR/trimmed_reads.fastq" "$INPUT_FASTQ"

# Step 3: Mapping Reads to Reference Genome
echo "Mapping reads to reference genome using Bowtie..."
bowtie -v 1 -k 1 --best -p "$THREADS" "$BOWTIE_INDEX" \
  "$OUTPUT_DIR/trimmed_reads.fastq" -S "$OUTPUT_DIR/aligned_reads.sam"

# Step 4: Convert SAM to BAM and Sort
# With pipefail a failure in `samtools view` is no longer masked by `sort`.
echo "Converting SAM to BAM and sorting..."
samtools view -Sb "$OUTPUT_DIR/aligned_reads.sam" \
  | samtools sort -o "$OUTPUT_DIR/sorted_reads.bam"

# Step 5: Extract Reads of piRNA Size (24-32 nt)
echo "Filtering reads of size 24-32 nt..."
bedtools bamtofastq -i "$OUTPUT_DIR/sorted_reads.bam" -fq "$OUTPUT_DIR/all_reads.fastq"
seqkit seq -m 24 -M 32 "$OUTPUT_DIR/all_reads.fastq" > "$OUTPUT_DIR/piRNA_size_reads.fastq"

# Step 6: Detect Sequence Bias (Optional)
# Writes the size-selected reads as FASTA (headers seq1, seq2, ...) for logo tools.
echo "Checking sequence bias using WebLogo-compatible data..."
seqkit fx2tab "$OUTPUT_DIR/piRNA_size_reads.fastq" \
  | cut -f2 \
  | awk '{print ">seq"NR"\n"$0}' > "$OUTPUT_DIR/piRNA_sequences.fa"

# Step 7: Identify piRNA Clusters
# This step requires a tool like proTRAC or PIRANHA. Example placeholder:
echo "Identifying piRNA clusters (requires proTRAC or PIRANHA)..."
# Example with proTRAC:
# proTRAC.pl -s "$OUTPUT_DIR/sorted_reads.bam" -g "$REFERENCE_GENOME" -o "$OUTPUT_DIR/clusters"

# Step 8: Annotate Clusters
# Annotation depends on your genome's annotation file
# bedtools intersect example placeholder:
# bedtools intersect -a clusters.bed -b genome_annotation.gtf > annotated_clusters.bed

# Step 9: Clean up intermediate files (optional)
echo "Cleaning up intermediate files..."
rm -f -- "$OUTPUT_DIR/aligned_reads.sam" "$OUTPUT_DIR/all_reads.fastq"

# Done
echo "piRNA discovery pipeline completed! Results are in $OUTPUT_DIR."

Python script to parse a FASTQ file !

Abhi — Mon, 10 Jun 2024 11:20:27 -0500

#Python script to parse a FASTQ file and extract basic information such as the sequence identifier, sequence, and quality scores
#pip install biopython

from Bio import SeqIO

def parse_fastq(fastq_file):
    """Parse a FASTQ file into a list of plain dictionaries.

    Args:
        fastq_file: Path (or handle) passed straight to ``SeqIO.parse``.

    Returns:
        One dict per record, in file order, with the keys ``"id"``
        (record identifier), ``"sequence"`` (sequence as ``str``) and
        ``"quality"`` (the record's ``phred_quality`` annotation).
    """
    return [
        {
            "id": entry.id,
            "sequence": str(entry.seq),
            "quality": entry.letter_annotations["phred_quality"],
        }
        for entry in SeqIO.parse(fastq_file, "fastq")
    ]

# Example usage: point this at a real FASTQ file before running
fastq_file = "path/to/your/sequences.fastq"
parsed_sequences = parse_fastq(fastq_file)

# Dump every record; the trailing empty string produces the blank separator line
for seq in parsed_sequences:
    print(
        f"ID: {seq['id']}",
        f"Sequence: {seq['sequence']}",
        f"Quality: {seq['quality']}",
        "",
        sep="\n",
    )

Python script to calculate basic genome stats !

Abhi — Mon, 10 Jun 2024 11:18:32 -0500

from Bio import SeqIO

def calculate_genome_stats(fasta_file):
    """Print total length, per-base counts and GC content of a FASTA file.

    Bases are counted case-insensitively, so soft-masked (lower-case)
    sequence contributes to the A/T/C/G counts and to the GC percentage.

    Args:
        fasta_file: Path (or handle) passed straight to ``SeqIO.parse``.

    Returns:
        None. The statistics are written to stdout.
    """
    # Running totals across every record in the file
    genome_length = 0
    a_count = 0
    t_count = 0
    c_count = 0
    g_count = 0

    # Read the genome sequence from the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Upper-case once so 'a'/'c'/'g'/'t' are not silently skipped
        sequence = record.seq.upper()
        genome_length += len(sequence)
        a_count += sequence.count('A')
        t_count += sequence.count('T')
        c_count += sequence.count('C')
        g_count += sequence.count('G')

    # GC total is derived from the per-base counts instead of recounting
    gc_count = g_count + c_count

    # Percentage of the full length (any non-ACGT symbols stay in the
    # denominator); an empty input reports 0 rather than dividing by zero
    gc_content = (gc_count / genome_length) * 100 if genome_length > 0 else 0

    # Print genome statistics
    print(f"Genome Length: {genome_length} bp")
    print(f"A Count: {a_count}")
    print(f"T Count: {t_count}")
    print(f"C Count: {c_count}")
    print(f"G Count: {g_count}")
    print(f"GC Content: {gc_content:.2f}%")

# Example usage: replace the placeholder path with a real FASTA file before running
fasta_file = "path/to/your/genome.fasta"
# Prints the length, per-base counts and GC content to stdout
calculate_genome_stats(fasta_file)

Python script to create fastq file with random sequences

Abhi — Mon, 10 Jun 2024 08:21:32 -0500

import random

def generate_random_sequence(length):
    """Return a random DNA string of ``length`` characters.

    Each position is drawn independently with ``random.choice`` from the
    four bases A, C, G and T.
    """
    alphabet = "ACGT"
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)

def generate_random_quality(length):
    """Return a random quality string of ``length`` characters.

    Every character has a code point drawn with ``random.randint(33, 73)``,
    i.e. between '!' and 'I' inclusive.
    """
    symbols = []
    for _ in range(length):
        code_point = random.randint(33, 73)
        symbols.append(chr(code_point))
    return "".join(symbols)

def generate_fastq_entry(sequence_length, seq_id="SEQ_ID"):
    """Build one four-line FASTQ record with random sequence and qualities.

    Args:
        sequence_length: Number of bases (and quality characters) to generate.
        seq_id: Read name written after the leading '@'. Defaults to
            "SEQ_ID", which reproduces the header used when no name is given.

    Returns:
        The record as a single string ending in a newline:
        ``@<seq_id>``, sequence, ``+``, quality.
    """
    sequence = generate_random_sequence(sequence_length)
    quality = generate_random_quality(sequence_length)
    return f"@{seq_id}\n{sequence}\n+\n{quality}\n"

def generate_fastq_file(num_entries, sequence_length, file_path):
    """Write ``num_entries`` random FASTQ records to ``file_path``.

    Records are named SEQ_ID_1 .. SEQ_ID_<num_entries> so that every read
    in the file has a distinct identifier. An existing file is overwritten.

    Args:
        num_entries: Number of records to write.
        sequence_length: Length of every generated read.
        file_path: Destination path, opened in text write mode.
    """
    with open(file_path, 'w') as f:
        for index in range(1, num_entries + 1):
            f.write(generate_fastq_entry(sequence_length, seq_id=f"SEQ_ID_{index}"))

# Generate a FASTQ file with 100 entries, each with a sequence length of 50 bases
generate_fastq_file(100, 50, 'random.fastq')

R script for Circos plot !

Abhi — Tue, 11 Jul 2023 01:41:03 -0500

#!/usr/bin/env Rscript
library(RCircos)

# usage: Rscript make_circos.r <sv.file> <sample.name> <gene.label.file> <cnv.file> <out.file>

# positional arguments, read in the order given on the command line
args <- commandArgs(trailingOnly = TRUE)
sv.file <- args[1]
sample.name <- args[2]
gene.label.file <- args[3]
cnv.file <- args[4]
out.file <- args[5]

# scratch BED file for breakpoints, named after the sample (relative path)
# alternative kept for reference: place it under the TMP_DIR environment variable
# TMP <- Sys.getenv("TMP_DIR")
# tmp.bed = paste0(TMP ,"/" , sample.name, "_bkpts.bed")
tmp.bed <- paste0(sample.name, "_bkpts.bed")

# load the hg19 cytoband ideogram data set
data(UCSC.HG19.Human.CytoBandIdeogram)

# core layout: no chromosomes excluded, 10 inside tracks and 5 outside tracks
chr.exclude <- NULL
cyto.info <- UCSC.HG19.Human.CytoBandIdeogram
tracks.inside <- 10
tracks.outside <- 5
RCircos.Set.Core.Components(cyto.info, chr.exclude, tracks.inside, tracks.outside)

# fetch the plot parameters, override the text size, and store them back
rcircos.params <- RCircos.Get.Plot.Parameters()
rcircos.params$text.size <- 1
RCircos.Reset.Plot.Parameters(rcircos.params)

rcircos.cyto <- RCircos.Get.Plot.Ideogram()
rcircos.position <- RCircos.Get.Plot.Positions()
RCircos.List.Plot.Parameters()

# Load the structural-variant link table (comma-separated, with a header row).
# If it cannot be read, fall back to an empty data frame so that the
# ideogram-only branch below is taken instead of aborting.
link.data <- tryCatch(
  read.table(sv.file, sep = ',', stringsAsFactors = FALSE, header = TRUE),
  error = function(e) data.frame()
)

if (nrow(link.data) != 0) {

  # force the four coordinate columns to numeric
  link.data <- transform(link.data,
                         chromStart = as.numeric(chromStart),
                         chromEnd = as.numeric(chromEnd),
                         chromStart.1 = as.numeric(chromStart.1),
                         chromEnd.1 = as.numeric(chromEnd.1))

  # write a bed file of all breakpoints (both ends of every link) to
  # intersect with gene label table
  bkpts.1 <- link.data[c("Chromosome", "chromStart", "chromEnd")]
  bkpts.2 <- link.data[c("Chromosome.1", "chromStart.1", "chromEnd.1")]
  colnames(bkpts.2) <- colnames(bkpts.1)
  write.table(rbind(bkpts.1, bkpts.2), tmp.bed, sep = '\t', quote = FALSE,
              col.names = FALSE, row.names = FALSE)

  # only keep labels that fall within an event
  # shQuote() keeps paths with spaces or shell metacharacters as single arguments
  intersect.cmd <- paste("bedtools intersect -wb -a", shQuote(tmp.bed),
                         "-b", shQuote(gene.label.file))
  print(intersect.cmd)
  gene.labels <- system(intersect.cmd, intern = TRUE)
  gene.labels <- data.frame(do.call('rbind', strsplit(gene.labels, '\t', fixed = TRUE)),
                            stringsAsFactors = FALSE)
  if (nrow(gene.labels) > 0) {
    # drop the first three (breakpoint) columns; keep the label-file columns
    gene.labels <- gene.labels[, 4:ncol(gene.labels)]

    # deduplicate labels
    gene.labels <- gene.labels[!duplicated(gene.labels), ]
    colnames(gene.labels) <- c("Chromosome", "chromStart", "chromEnd", "gene")
    gene.labels <- transform(gene.labels,
                             chromStart = as.numeric(chromStart),
                             chromEnd = as.numeric(chromEnd))
  }

  # make the plot
  png(file = out.file, height = 3000, width = 3000, res = 500)
  RCircos.Set.Plot.Area()
  RCircos.Chromosome.Ideogram.Plot()
  track.num <- 2
  RCircos.Link.Plot(link.data, track.num, TRUE)
  title(sample.name, line = -1)

  # label the genes
  if (nrow(gene.labels) > 0) {
    name.col <- 4
    side <- "out"
    track.num <- 1
    RCircos.Gene.Connector.Plot(gene.labels, track.num, side)
    track.num <- 2
    RCircos.Gene.Name.Plot(gene.labels, name.col, track.num, side)
  }

  # remove intermediate file (no shell involved, so the name needs no quoting)
  unlink(tmp.bed)

} else {
  # make empty plot
  png(file = out.file, height = 3000, width = 3000, res = 500)
  RCircos.Set.Plot.Area()
  RCircos.Chromosome.Ideogram.Plot()
  title(sample.name, line = -1)
}
                    
# parse cnv data; a file that cannot be read yields an empty data frame
cnv <- tryCatch(read.csv(cnv.file, stringsAsFactors = FALSE),
                error = function(e) data.frame())

if (nrow(cnv) > 0) {
  # rename the columns, prefix chromosome names with "chr", and insert a
  # constant GeneName column so the CNV value ends up in column 5
  colnames(cnv) <- c("Chromosome", "chromStart", "chromEnd", "cnv")
  cnv$Chromosome <- paste0('chr', cnv$Chromosome)
  cnv$GeneName <- "gene"
  cnv <- cnv[, c("Chromosome", "chromStart", "chromEnd", "GeneName", "cnv")]

  # add CNV heatmap track
  RCircos.Heatmap.Plot(cnv, data.col = 5, track.num = 1, side = "in")
}

# close the PNG device opened for the plot
dev.off()

#-------- DATA FORMAT ------
chr1	11869	14412	DDX11L1
chr1	14363	29806	WASH7P
chr1	29554	31109	MIR1302-10
chr1	34554	36081	FAM138A
chr1	52473	54936	OR4G4P
chr1	62948	63887	OR4G11P
chr1	69091	70008	OR4F5
chr1	131025	134836	CICP27
chr1	134901	139379	AL627309.1
chr1	157784	157887	RNU6-1100P
chr1	227615	267253	AP006222.2
chr1	228292	228775	AP006222.1
chr1	317720	453948	RP4-669L17.10
chr1	326096	328112	RP4-669L17.8
chr1	329431	332236	CICP7
chr1	334126	334305	RP4-669L17.4
chr1	367640	368634	OR4F29
chr1	379105	379467	WBP1LP7

Perl script to read the next line of a file !

Abhi — Mon, 17 Oct 2022 23:19:56 -0500

# Iterate over a file while always holding the current line AND the line
# after it (one-line look-ahead). $fileHandler is not defined in this
# snippet; it must already be an open, readable filehandle.

# Prime the loop with the first line.
my $line = <$fileHandler>;
while(1) { # keep looping until I say so
    # Look ahead: undef here means $line is the last line of the file.
    my $nextLine = <$fileHandler>;

    # True when the current line contains '>' anywhere, or when there is
    # no further line to read (presumably: FASTA header or end of input).
    if ($line =~ m/>/ || !defined $nextLine) {
        ### Do the stuff
    }
    ### Do any other stuff;

    # Leave after processing the final line; otherwise advance by one line.
    last unless defined $nextLine;
    $line = $nextLine;
}

Extract the mapped and unmapped reads !

Abhi — Fri, 23 Sep 2022 06:18:33 -0500

# Split in.bam into mapped.bam and unmapped.bam.
# The two layouts below are ALTERNATIVES: both write the same output file
# names, so run only the pair that matches your library layout.
PROCESSORS=20

# Single-end layout: -F 4 drops reads with the "unmapped" flag (0x4) set,
# -f 4 keeps only those reads.
samtools view --threads "$PROCESSORS" -b -F 4 in.bam > mapped.bam
samtools view --threads "$PROCESSORS" -b -f 4 in.bam > unmapped.bam

# Paired-end layout: -f 2 keeps reads flagged "properly paired" (0x2),
# -F 2 keeps everything else. Note that the -F 2 output can therefore also
# contain reads that did align but not as a proper pair.
samtools view --threads "$PROCESSORS" -b -f 2 in.bam > mapped.bam
samtools view --threads "$PROCESSORS" -b -F 2 in.bam > unmapped.bam

Genome Scaffolding and gap filling !

Abhi — Wed, 24 Aug 2022 05:41:32 -0500

Scaffolding was performed with ARCS v1.0.3 (-c 3, -l 4, -a 0.9, -z 500, -m 50-20000, -e 30000, -s 90): https://github.com/bcgsc/arcs

Next, automated gap filling was performed using Sealer v2.0.1 (-L 150, -P 10, -k 75-115 [step = 10]): https://github.com/bcgsc/abyss/tree/sealer-release

Install Varscan on Ubuntu / Linux !

Abhi — Wed, 02 Feb 2022 02:38:25 -0600

#Varscan is a java program designed to call variants in sequencing data. It was developed at the Genome Institute at Washington University and is hosted on github. To use Varscan we simply need to download the distributed jar file into our ~/workspace/bin. As with the other java programs which have already been installed in this section we can invoke Varscan via java -jar.

# Install Varscan
# Stop if the target directory is missing, so the jar is not dropped elsewhere.
cd ~/workspace/bin || exit 1
# -f: treat HTTP errors as failures instead of saving an error page as the jar.
# -L: follow the GitHub release redirect.
# TLS certificate verification is left enabled (no -k): this is an executable download.
curl -fL -o VarScan.v2.4.2.jar https://github.com/dkoboldt/varscan/releases/download/2.4.2/VarScan.v2.4.2.jar
# Test installation by invoking the jar without arguments
java -jar ~/workspace/bin/VarScan.v2.4.2.jar

Install StringTie on ubuntu / Linux !

Abhi — Wed, 02 Feb 2022 02:36:02 -0600

#StringTie is a software program to perform transcript assembly and quantification of RNAseq data. The binary distributions are available so to install we can just download this distribution and extract it. Like with our other programs we also make a symlink to make it easier to find.

# download and extract
# Stop if the target directory is missing, so nothing is unpacked elsewhere.
cd ~/workspace/bin || exit 1
wget http://ccb.jhu.edu/software/stringtie/dl/stringtie-1.3.0.Linux_x86_64.tar.gz
tar -xzvf stringtie-1.3.0.Linux_x86_64.tar.gz

# make symlink (-f replaces an existing link, so re-running the install works)
ln -sf ~/workspace/bin/stringtie-1.3.0.Linux_x86_64/stringtie ~/workspace/bin/stringtie

# test installation
~/workspace/bin/stringtie -h