# Abort on the first failing command (-e), treat unset variables as errors (-u)
# and print every command before it is executed (-x).
set -uex

# BioProject accession whose sequencing runs will be used.
PROJECT=PRJNA257197

# How many runs to subselect from the project.
N=5

# Fetch the run information table for the project.
esearch -db sra -query "$PROJECT" | efetch -format runinfo > runinfo.txt

# Take the first comma-separated column, keep only the lines that contain
# an SRR run number, and retain the first N of them.
cut -d , -f 1 runinfo.txt | grep SRR | head -n "$N" > selected.txt

# Downloaded reads are stored in the reads folder.
mkdir -p reads

# Download every selected run (one run ID per input line), limited to
# 1000 spots per run (-X) and written as split read files (--split-files).
parallel fastq-dump -O reads -X 1000 --split-files {} < selected.txt
# Create a directory for the per-run BAM files.
mkdir -p bam

# Generate a separate unaligned BAM file for each run listed in selected.txt.
# F1 takes the first-mate file and F2 the second-mate file of the pair
# (reads/<run>_1.fastq and reads/<run>_2.fastq); passing the _1 file twice
# would pair every read with itself. Each BAM is tagged with a read group
# (RG), library (LB) and sample (SM) name derived from the run ID.
# The stderr redirect sits inside the quoted command, so each job appends
# its own messages to log.txt.
parallel "picard FastqToSam F1=reads/{}_1.fastq F2=reads/{}_2.fastq O=bam/{}.bam RG=GROUP-{} LB=LIB-{} SM=SAMPLE_{} QUIET=true 2>> log.txt" < selected.txt

# Merge all the per-run BAM files into one; -f overwrites an existing all.bam.
samtools merge -f all.bam bam/*.bam
# Read group used in the examples below; kept in one place so the message,
# the count and the extraction always refer to the same group.
READGROUP=GROUP-SRR1972919

# Investigate the read groups in the header.
echo ""
echo "SAM file header:"
samtools view -H all.bam

# Count the records that belong to the chosen read group.
echo ""
echo "Number of alignments with read group: ${READGROUP}"
samtools view -c -r "$READGROUP" all.bam

# Reverting the process: extract the reads into paired FASTQ files,
# copying the read group tags into the FASTQ headers (-t).
samtools fastq -t -1 all1.fq -2 all2.fq all.bam

# To convert just one specific read group, filter on it first and feed the
# result to samtools fastq via stdin (-). The second-mate file is named
# all_SRR1972919_2.fq to match the first-mate file all_SRR1972919_1.fq.
samtools view -r "$READGROUP" all.bam | samtools fastq -t -1 all_SRR1972919_1.fq -2 all_SRR1972919_2.fq -