BOL: Owner

Python script to split a genome sequence into overlapping windows of 100 base pairs

Neel — Wed, 11 Dec 2024 23:32:55 -0600

def split_genome(sequence, window_size=100, step=1):
    """
    Splits a genome sequence into overlapping, fixed-size windows.

    Windows start at offsets 0, step, 2*step, ... and only full-length
    windows are produced; a trailing remainder shorter than ``window_size``
    is not returned.

    Args:
        sequence (str): The genome sequence.
        window_size (int): Size of each window (default: 100).
        step (int): Distance between consecutive window starts (default: 1).

    Returns:
        list: A list of genome windows. Empty when the sequence is shorter
        than ``window_size``.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not a positive integer.
    """
    # A zero or negative window would yield empty or meaningless slices
    # rather than an error, so reject it up front.
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")
    if step < 1:
        raise ValueError("step must be a positive integer")

    # Last offset at which a full window still fits; negative when the
    # sequence is shorter than the window, which makes the range empty.
    last_start = len(sequence) - window_size
    return [sequence[i:i + window_size] for i in range(0, last_start + 1, step)]

# Example usage: split a demo sequence and print each window.
if __name__ == "__main__":
    genome_sequence = "ATGCGTACGTTAGCTACGATCGTACGATCGTACGATCGATCGTAGCATCGATCGTACG"
    window_size = 100
    step_size = 1

    # Get overlapping windows
    genome_windows = split_genome(genome_sequence, window_size, step_size)

    # A sequence shorter than the window produces no windows at all. The demo
    # sequence above is only 58 bp, so report that instead of exiting silently.
    if not genome_windows:
        print(
            f"No windows produced: sequence length {len(genome_sequence)} "
            f"is shorter than window size {window_size}."
        )

    # Print results, numbering windows from 1
    for idx, window in enumerate(genome_windows, start=1):
        print(f"Window {idx}: {window}")

Perl script for chi-squared test !

Neel — Tue, 21 Mar 2023 03:53:45 -0500

#!/usr/bin/perl
#
# chidi.pl
#
# A script to perform a chi-squared test of the dinucleotide frequencies of two FASTA files
#
# Usage: chidi.pl <fasta_file_1> <fasta_file_2>
#
# Prints one row per dinucleotide (observed count, expected count and
# chi-squared contribution for each file), followed by the summed chi-squared
# value and the 5% / 1% critical values for comparison.
#
# Last updated by: $Author$
# Last updated on: $Date$

use strict;
use warnings;
use Getopt::Long;
use FAlite;

# sanity checks: both FASTA file names are required
die "Usage: chidi.pl <fasta_file_1> <fasta_file_2>\n" if (@ARGV < 2);

# the 16 dinucleotides that are tested; anything else (e.g. containing N) is ignored
my @dinucs = qw (AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT);

############################################################
# Helpers
############################################################

# Count the dinucleotides listed in @dinucs over every sequence of one FASTA file.
# Takes the file path; returns a hash (dinucleotide => count) in which every
# dinucleotide is present, with 0 for those that never occur.
sub count_dinucleotides {
    my ($path) = @_;

    # start every counter at 0 so that absent dinucleotides are defined
    my %count = map { $_ => 0 } @dinucs;

    open(my $fh, '<', $path) || die "Can't open $path\n";
    my $fasta = FAlite->new($fh);

    # loop through each sequence in the file
    while (my $entry = $fasta->nextEntry) {
        my $seq = uc($entry->seq);

        # slide a 2 bp window along the sequence; the last window starts at
        # length - 2, so sequences shorter than 2 bp contribute nothing
        foreach my $i (0 .. length($seq) - 2) {
            my $tmp = substr($seq, $i, 2);
            $count{$tmp}++ if exists $count{$tmp};
        }
    }
    close($fh);

    return %count;
}

# One cell's contribution to the chi-squared statistic: (obs - exp)^2 / exp.
# An expected value of 0 implies an observed value of 0 as well, so that cell
# contributes nothing (and division by zero is avoided).
sub chi_term {
    my ($observed, $expected) = @_;
    return 0 if ($expected == 0);
    return (($observed - $expected)**2) / $expected;
}

############################################################
# Read both sequence files
############################################################

# hashes for observed and expected dinucleotide frequencies of both files
my %file1_ob = count_dinucleotides($ARGV[0]);
my %file2_ob = count_dinucleotides($ARGV[1]);
my %file1_ex;
my %file2_ex;

############################################################
# Perform chi-squared test
############################################################

# need total of all counts in both sequences, plus totals of 'rows' in chi-square table
my $row1 = 0;
my $row2 = 0;

foreach my $di (@dinucs) {
    $row1 += $file1_ob{$di};
    $row2 += $file2_ob{$di};
}
my $total = $row1 + $row2;

die "No dinucleotides found in either input file\n" if ($total == 0);

# now calculate expected values: (column total * row total) / grand total
# (kept at full precision; rounding happens only when printing)
foreach my $di (@dinucs) {
    my $column = $file1_ob{$di} + $file2_ob{$di};
    $file1_ex{$di} = ($column * $row1) / $total;
    $file2_ex{$di} = ($column * $row2) / $total;
}

# now calculate chi-squared values
my $chi_total = 0;

# NOTE(review): the doubled tabs in the header are kept from the original,
# presumably for visual alignment; data rows use single tabs.
print "\tObs1\tExp1\t\tChi1\tObs2\tExp2\t\tChi2\n";
foreach my $di (@dinucs) {
    my $chi1 = chi_term($file1_ob{$di}, $file1_ex{$di});
    my $chi2 = chi_term($file2_ob{$di}, $file2_ex{$di});

    printf "%s\t%d\t%.2f\t%.2f\t%d\t%.2f\t%.2f\n",
        $di,
        $file1_ob{$di}, $file1_ex{$di}, $chi1,
        $file2_ob{$di}, $file2_ex{$di}, $chi2;

    $chi_total += ($chi1 + $chi2);
}

printf "Chi squared value = %6.2f\n", $chi_total;

# critical values for 15 degrees of freedom: (16 dinucleotides - 1) * (2 files - 1)
print "Significance level at 5% = 25.00\n";
print "Significance level at 1% = 30.58\n";


exit(0);

Script for rapid genome clustering based on pairwise ANI

Neel — Tue, 09 Aug 2022 04:34:44 -0500

First, create a blast+ database: makeblastdb -in <my_seqs.fna> -dbtype nucl -out <my_db>

Next, use megablast from blast+ package to perform all-vs-all blastn of sequences: blastn -query <my_seqs.fna> -db <my_db> -outfmt '6 std qlen slen' -max_target_seqs 10000 -out <my_blast.tsv> -num_threads 32

Note: using the -perc_identity flag will speed up the search at the cost of sensitivity: blastn -query <my_seqs.fna> -db <my_db> -outfmt '6 std qlen slen' -max_target_seqs 10000 -perc_identity 90 -out <my_blast.tsv> -num_threads 32

Next, calculate pairwise ANI by combining local alignments between sequence pairs: anicalc.py -i <my_blast.tsv> -o <my_ani.tsv>

Finally, perform UCLUST-like clustering using the MIUVIG recommended parameters (95% ANI + 85% AF): aniclust.py --fna <my_seqs.fna> --ani <my_ani.tsv> --out <my_clusters.tsv> --min_ani 95 --min_tcov 85 --min_qcov 0

Conda command to install checkV

Neel — Mon, 08 Aug 2022 23:57:25 -0500

(mENV) [jnarayan@hn1 Monkey_Pox]$ conda install -c conda-forge -c bioconda checkv
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/man/anaconda3/envs/jitENV

  added / updated specs:
    - checkv


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    boost-1.68.0               |py36h8619c78_1001         325 KB  conda-forge
    boost-cpp-1.68.0           |    h11c811c_1000        20.5 MB  conda-forge
    ca-certificates-2022.6.15  |       ha878542_0         149 KB  conda-forge
    checkv-1.0.1               |     pyhdfd78af_0          33 KB  bioconda
    diamond-0.9.24             |       ha888412_1         758 KB  bioconda
    importlib-metadata-4.2.0   |   py36h5fab9bb_0          30 KB  conda-forge
    kcounter-0.1.1             |   py36h91eb985_1         871 KB  bioconda
    libzlib-1.2.12             |       h166bdaf_2          63 KB  conda-forge
    more-itertools-8.13.0      |     pyhd8ed1ab_0          44 KB  conda-forge
    openssl-1.1.1q             |       h166bdaf_0         2.1 MB  conda-forge
    prodigal-gv-2.9.0          |       h7132678_0         916 KB  bioconda
    zipp-0.6.0                 |             py_0           7 KB  conda-forge
    zlib-1.2.12                |       h166bdaf_2          91 KB  conda-forge
    ------------------------------------------------------------
                                           Total:        25.9 MB

The following NEW packages will be INSTALLED:

  biopython          conda-forge/linux-64::biopython-1.79-py36h8f6f2f9_0
  boost              conda-forge/linux-64::boost-1.68.0-py36h8619c78_1001
  boost-cpp          conda-forge/linux-64::boost-cpp-1.68.0-h11c811c_1000
  checkv             bioconda/noarch::checkv-1.0.1-pyhdfd78af_0
  diamond            bioconda/linux-64::diamond-0.9.24-ha888412_1
  importlib-metadata conda-forge/linux-64::importlib-metadata-4.2.0-py36h5fab9bb_0
  kcounter           bioconda/linux-64::kcounter-0.1.1-py36h91eb985_1
  more-itertools     conda-forge/noarch::more-itertools-8.13.0-pyhd8ed1ab_0
  prodigal-gv        bioconda/linux-64::prodigal-gv-2.9.0-h7132678_0
  typing_extensions  conda-forge/noarch::typing_extensions-3.10.0.2-pyha770c72_0
  zipp               conda-forge/noarch::zipp-0.6.0-py_0

The following packages will be UPDATED:

  libgcc-ng                              11.2.0-h1d223b6_11 --> 12.1.0-h8d9b700_16
  libzlib                              1.2.11-h36c2ea0_1013 --> 1.2.12-h166bdaf_2
  zlib                                 1.2.11-h36c2ea0_1013 --> 1.2.12-h166bdaf_2

The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    pkgs/main::ca-certificates-2022.07.19~ --> conda-forge::ca-certificates-2022.6.15-ha878542_0
  openssl              pkgs/main::openssl-1.1.1q-h7f8727e_0 --> conda-forge::openssl-1.1.1q-h166bdaf_0


Proceed ([y]/n)? y


Downloading and Extracting Packages
ca-certificates-2022 | 149 KB    | ############################################################################################################################################################### | 100% 
openssl-1.1.1q       | 2.1 MB    | ############################################################################################################################################################### | 100% 
kcounter-0.1.1       | 871 KB    | ############################################################################################################################################################### | 100% 
libzlib-1.2.12       | 63 KB     | ############################################################################################################################################################### | 100% 
zlib-1.2.12          | 91 KB     | ############################################################################################################################################################### | 100% 
more-itertools-8.13. | 44 KB     | ############################################################################################################################################################### | 100% 
checkv-1.0.1         | 33 KB     | ############################################################################################################################################################### | 100% 
zipp-0.6.0           | 7 KB      | ############################################################################################################################################################### | 100% 
prodigal-gv-2.9.0    | 916 KB    | ############################################################################################################################################################### | 100% 
importlib-metadata-4 | 30 KB     | ############################################################################################################################################################### | 100% 
boost-cpp-1.68.0     | 20.5 MB   | ############################################################################################################################################################### | 100% 
diamond-0.9.24       | 758 KB    | ############################################################################################################################################################### | 100% 
boost-1.68.0         | 325 KB    | ############################################################################################################################################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: \ 
     CheckV v1.0.1 an external database which needs to be downloaded
     and unarchived. The link for the database can be found in CheckV's
     instructions at https://bitbucket.org/berkeleylab/checkv/



done

Install Read Simulator

Neel — Sun, 10 Jul 2022 21:10:16 -0500

$ sudo apt install art-nextgen-simulation-tools
[sudo] password for neelam: 
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libfwupdplugin1
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  libgsl23 libgslcblas0
Suggested packages:
  art-nextgen-simulation-tools-profiles gsl-ref-psdoc | gsl-doc-pdf
  | gsl-doc-info | gsl-ref-html
The following NEW packages will be installed:
  art-nextgen-simulation-tools libgsl23 libgslcblas0
0 upgraded, 3 newly installed, 0 to remove and 1 not upgraded.
Need to get 2,616 kB of archives.
After this operation, 12.3 MB of additional disk space will be used.
Do you want to continue? [Y/n] Y
Get:1 http://in.archive.ubuntu.com/ubuntu focal/universe amd64 libgslcblas0 amd64 2.5+dfsg-6build1 [84.6 kB]
Get:2 http://in.archive.ubuntu.com/ubuntu focal/universe amd64 libgsl23 amd64 2.5+dfsg-6build1 [850 kB]
Get:3 http://in.archive.ubuntu.com/ubuntu focal/universe amd64 art-nextgen-simulation-tools amd64 20160605+dfsg-4 [1,681 kB]
Fetched 2,616 kB in 1s (4,578 kB/s)                     
Selecting previously unselected package libgslcblas0:amd64.
(Reading database ... 178334 files and directories currently installed.)
Preparing to unpack .../libgslcblas0_2.5+dfsg-6build1_amd64.deb ...
Unpacking libgslcblas0:amd64 (2.5+dfsg-6build1) ...
Selecting previously unselected package libgsl23:amd64.
Preparing to unpack .../libgsl23_2.5+dfsg-6build1_amd64.deb ...
Unpacking libgsl23:amd64 (2.5+dfsg-6build1) ...
Selecting previously unselected package art-nextgen-simulation-tools.
Preparing to unpack .../art-nextgen-simulation-tools_20160605+dfsg-4_amd64.deb .
..
Unpacking art-nextgen-simulation-tools (20160605+dfsg-4) ...
Setting up libgslcblas0:amd64 (2.5+dfsg-6build1) ...
Setting up libgsl23:amd64 (2.5+dfsg-6build1) ...
Setting up art-nextgen-simulation-tools (20160605+dfsg-4) ...
Processing triggers for libc-bin (2.31-0ubuntu9.9) ...
Processing triggers for man-db (2.9.1-1) ...

Bash command to remove a trailing \r (carriage return)

Neel — Thu, 19 May 2022 05:53:20 -0500

# Example value ending in a carriage return, as left behind by CRLF line endings.
VARNAME=$'a_bad_filename\r'
# Drop a single trailing control character, if one is present.
VARNAME=${VARNAME%[[:cntrl:]]}

Command line to move all files from a directory !

Neel — Wed, 18 May 2022 04:51:45 -0500

# Move every regular file under ~/Downloads (including subdirectories) into ~/Videos.
# -exec ... {} + batches the file names like xargs does, but runs mv only when
# at least one file was found, so an empty ~/Downloads no longer makes mv fail
# with "missing file operand".
find ~/Downloads/ -type f -exec mv -t ~/Videos {} +

Bash script to convert multiline fasta to single line fasta !

Neel — Thu, 31 Mar 2022 03:50:05 -0500

# file.fa is a multi-line FASTA; the single-line version is written to stdout.
# Header lines (starting with '>') are printed on their own line; all sequence
# lines that follow are joined until the next header.
# The newline that terminates the previous record is skipped on the very first
# line so the output does not begin with a blank line, and the final newline is
# printed only when the input was not empty.

awk '/^>/ { if (NR > 1) printf("\n"); printf("%s\n", $0); next } { printf("%s", $0) } END { if (NR > 0) printf("\n") }' < file.fa

Bash command to explore assembly summary genbank !

Neel — Tue, 29 Mar 2022 02:35:30 -0500

# Download the GenBank assembly summary table from the NCBI FTP site.
wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt

# Install csvkit, which provides the csvcut command used below.
pip3 install csvkit

# Tally the values found in the 'excluded_from_refseq' column:
#   csvcut -t -K 1 -c ...      read tab-delimited input, skip the first line, keep only that column
#   tail -n +2                 drop the first output line (the column header)
#   tr ";" "\n"                put each ';'-separated value on its own line
#   sed ...                    trim one leading and one trailing space
#   grep -v '""'               discard lines containing "" (presumably empty fields -- verify)
#   sort | uniq -c | sort -nr  count identical values and list the most frequent first
csvcut -t -K 1 -c 'excluded_from_refseq' assembly_summary_genbank.txt \
  | tail -n +2 | tr ";" "\n" \
    | sed -e 's/^ //' -e 's/ $//' | grep -v '""' \
      | sort | uniq -c | sort -nr

Bash script to transfer files to server !

Neel — Mon, 21 Feb 2022 23:21:36 -0600

# rsync options source destination
# -a archive mode, -z compress in transit, -v verbose, -h human-readable sizes

rsync -azvh --progress PacBio_clean.fa xxx@xxx.xxx.res.in:/home/

# scp source_file_name username@destination_host:destination_folder
# -r copy directories recursively, -p preserve times and modes, -v verbose
# (the options must start with a plain ASCII hyphen; a typographic dash
# makes scp treat them as a file name)

scp -rpv /datafile xxx@192.168.1.100:/home/me