#!/bin/bash
# Variables (modify these as per your setup)
INPUT_FASTQ="input_reads.fastq"
ADAPTER_SEQ="TGGAATTCTCGGGTGCCAAGG"
REFERENCE_GENOME="reference_genome.fa"
BOWTIE_INDEX="reference_index"
OUTPUT_DIR="piRNA_analysis"
THREADS=4
# Create output directory
mkdir -p $OUTPUT_DIR
# Step 1: Quality Control
echo "Running FastQC for quality control..."
fastqc $INPUT_FASTQ -o $OUTPUT_DIR
# Step 2: Adapter Trimming
echo "Trimming adapters with Cutadapt..."
cutadapt -a $ADAPTER_SEQ -o $OUTPUT_DIR/trimmed_reads.fastq $INPUT_FASTQ
# Step 3: Mapping Reads to Reference Genome
echo "Mapping reads to reference genome using Bowtie..."
bowtie -v 1 -k 1 --best -p $THREADS $BOWTIE_INDEX $OUTPUT_DIR/trimmed_reads.fastq -S $OUTPUT_DIR/aligned_reads.sam
# Step 4: Convert SAM to BAM and Sort
echo "Converting SAM to BAM and sorting..."
samtools view -Sb $OUTPUT_DIR/aligned_reads.sam | samtools sort -o $OUTPUT_DIR/sorted_reads.bam
# Step 5: Extract Reads of piRNA Size (24-32 nt)
echo "Filtering reads of size 24-32 nt..."
bedtools bamtofastq -i $OUTPUT_DIR/sorted_reads.bam -fq $OUTPUT_DIR/all_reads.fastq
seqkit seq -m 24 -M 32 $OUTPUT_DIR/all_reads.fastq > $OUTPUT_DIR/piRNA_size_reads.fastq
# Step 6: Detect Sequence Bias (Optional)
echo "Checking sequence bias using WebLogo-compatible data..."
seqkit fx2tab $OUTPUT_DIR/piRNA_size_reads.fastq | cut -f2 | awk '{print ">seq"NR"\n"$0}' > $OUTPUT_DIR/piRNA_sequences.fa
# Step 7: Identify piRNA Clusters
# This step requires a tool like proTRAC or PIRANHA. Example placeholder:
echo "Identifying piRNA clusters (requires proTRAC or PIRANHA)..."
# Example with proTRAC:
# proTRAC.pl -s $OUTPUT_DIR/sorted_reads.bam -g $REFERENCE_GENOME -o $OUTPUT_DIR/clusters
# Step 8: Annotate Clusters
# Annotation depends on your genome's annotation file
# bedtools intersect example placeholder:
# bedtools intersect -a clusters.bed -b genome_annotation.gtf > annotated_clusters.bed
# Step 9: Clean up intermediate files (optional)
echo "Cleaning up intermediate files..."
rm $OUTPUT_DIR/aligned_reads.sam $OUTPUT_DIR/all_reads.fastq
# Done
echo "piRNA discovery pipeline completed! Results are in $OUTPUT_DIR."