BOL: Owner

Sequence Ids conversion files !

Surabhi Chaudhary — Fri, 03 Jul 2020 05:20:28 -0500

ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/

Name	Size	Date Modified
ARCHIVE/		02/01/2020, 05:30:00
ASN_BINARY/		03/07/2020, 07:49:00
GENE_INFO/		03/07/2020, 07:48:00
0 B	10/02/2012, 05:30:00
15.1 kB	30/06/2020, 23:01:00
expression/		06/03/2017, 05:30:00
2.0 GB	03/07/2020, 07:44:00
61.8 MB	03/07/2020, 07:44:00
21.4 MB	03/07/2020, 07:44:00
45.1 MB	03/07/2020, 07:44:00
864 MB	03/07/2020, 07:45:00
279 kB	03/07/2020, 07:45:00
83.4 MB	03/07/2020, 07:45:00
572 MB	03/07/2020, 07:46:00
715 MB	03/07/2020, 07:47:00
30.2 MB	03/07/2020, 07:47:00
232 MB	03/07/2020, 14:38:00
1.2 kB	06/09/2011, 05:30:00
11.6 kB	16/05/2020, 01:32:00
770 kB	03/07/2020, 14:38:00
special_requests/		18/04/2020, 00:15:00
737 B	09/06/2011, 05:30:00

ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2pubmed.gz
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_group.gz
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_history.gz
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_neighbors.gz

Extract the sequence by IDs !

Surabhi Chaudhary — Fri, 03 Jul 2020 04:58:39 -0500

# This method works directly on FASTA or FASTQ files, whether compressed or uncompressed. Seqtk is a fast and lightweight tool for processing biological data (FASTA/FASTQ). If you have a list of identifiers that you would like to extract from a file, you can run the command as follows:

# Extract the sequences whose names are listed in name.list (one sequence name per line):
seqtk subseq input.fasta name.list > output.fasta

Download with Snakemake !

Surabhi Chaudhary — Wed, 11 Mar 2020 07:16:44 -0500

# Map each sample name to its download URL.
sample_links = {
    "ERR458493": "https://osf.io/5daup/download",
    "ERR458494": "https://osf.io/8rvh5/download",
    "ERR458495": "https://osf.io/2wvn3/download",
    "ERR458500": "https://osf.io/xju4a/download",
    "ERR458501": "https://osf.io/nmqe6/download",
    "ERR458502": "https://osf.io/qfsze/download",
}

# The sample names are the dictionary keys of sample_links. Materialise them as
# a real list (not a live dict view) so they can be indexed, sliced and reused.
SAMPLES = list(sample_links)

# Download the yeast RNA-seq data from the Schurch et al., 2016 study.
# Aggregate rule: its inputs are the raw-read files of every sample in SAMPLES,
# generated by expanding the {sample} placeholder in the path pattern.
rule download_all:
    input:
        expand("rnaseq/raw_data/{sample}.fq.gz", sample=SAMPLES)

# Rule to download each individual file specified in sample_links; the {sample}
# wildcard of the output path selects which URL is fetched.
#
# curl is run with --fail so that an HTTP error makes it exit non-zero instead
# of saving the server's error page as the output file with exit status 0.
# The URL and the output path are quoted to protect them from the shell.
rule download_reads:
    output: "rnaseq/raw_data/{sample}.fq.gz"
    params:
        # dynamically generate the download link directly from the dictionary
        download_link = lambda wildcards: sample_links[wildcards.sample]
    shell: """
        curl --fail -L "{params.download_link}" -o "{output}"
        """

Loop over a list of IDs with Perl

Surabhi Chaudhary — Fri, 04 Aug 2017 11:49:44 -0500

use strict;
use warnings;

# Scaffold IDs to process, in order. Some IDs are listed more than once
# (e.g. scaffold_4); each occurrence is processed again.
my @ids = qw(scaffold_4
scaffold_4
scaffold_15
scaffold_40
scaffold_44
scaffold_51
scaffold_54
scaffold_129
scaffold_138
scaffold_138
scaffold_180
scaffold_182
scaffold_184
scaffold_219
scaffold_219
scaffold_267
scaffold_273
scaffold_282
scaffold_282
scaffold_458
scaffold_470
scaffold_480
scaffold_521
scaffold_644);

# For every ID: create a directory named after it, run ./actor.sh with the ID
# as its argument, then copy every file matching *.* from the current
# directory into that directory. Failures are reported but do not stop the run.
foreach my $i (@ids) {
    print "Working on $i\n";

    # Because of the repeated IDs the directory may already exist; only a
    # genuine mkdir failure is reported.
    if (!-d $i && !mkdir($i)) {
        warn "Cannot create directory '$i': $!\n";
    }

    # List form: the ID reaches actor.sh as one argument, with no shell parsing.
    system('./actor.sh', $i) == 0
        or warn "./actor.sh failed for $i (status $?)\n";

    # Without the target directory, cp could create a plain file named after
    # the ID instead, so the copy is skipped in that case.
    if (-d $i) {
        # The single-string (shell) form is kept so that *.* is expanded.
        system("cp *.* $i") == 0
            or warn "Copying files into $i failed (status $?)\n";
    }
    else {
        warn "Skipping copy: directory '$i' does not exist\n";
    }
}

Perl script to extract fasta sequence by matching name/ids !!

Surabhi Chaudhary — Tue, 21 Jun 2016 09:28:19 -0500

#!/usr/bin/perl

use strict;
use warnings;
use Text::Trim qw(trim);

# Usage: perl extractSeqbyID.pl ids.txt seq.fasta Result.fasta
#
# Writes to Result.fasta every FASTA record of seq.fasta whose complete header
# line (without '>') equals an ID from ids.txt. An ID is the first
# tab-separated field of a non-blank line of ids.txt; a leading '>' is allowed.

# Count the arguments instead of testing $ARGV[2] for truth, so that an output
# file literally named "0" is accepted.
@ARGV >= 3 or die "use extractSeqbyID.pl LIST FASTA OUT\n";

my $list  = shift @ARGV;
my $fasta = shift @ARGV;
my $out   = shift @ARGV;
my %select;

# Collect the wanted IDs.
open my $list_fh, '<', $list or die "Cannot open ID list '$list': $!\n";
while (my $line = <$list_fh>) {
    chomp $line;
    next if $line =~ /^\s*$/;
    $line =~ tr/>\r//d;             # drop FASTA '>' markers and any CR of a CRLF ending
    my ($id) = split /\t/, $line;
    next unless defined $id && length $id;
    $select{$id} = 1;
}
close $list_fh;
my $size = keys %select;
print "Total Ids $size\n";

{
    # Read one FASTA record per iteration. The separator is changed only
    # inside this block so no other reads are affected.
    local $/ = "\n>";

    # Open the input first so a missing FASTA file does not truncate the output.
    open my $fasta_fh, '<', $fasta or die "Cannot open FASTA file '$fasta': $!\n";
    open my $out_fh,   '>', $out   or die "Cannot open output file '$out': $!\n";

    while (my $record = <$fasta_fh>) {
        chomp $record;              # removes the trailing "\n>" separator
        trim($record);              # Text::Trim, in place: surrounding whitespace
        $record =~ s/\A>//;         # only the first record still carries its '>'
        next unless length $record;

        my ($header) = split /\n/, $record;
        # The lookup key is the whole header line; splitting it on whitespace
        # was left disabled by the original author:
        #my @i=split (/\s/, $id); # To avoid >flattened_line_10751 circular cases

        # Normalise the key exactly like the list IDs above. The record itself
        # is written unmodified, so a '>' inside a description is preserved.
        (my $id = $header) =~ tr/>\r//d;

        # Every record, including the last one, ends with a newline.
        print {$out_fh} ">$record\n" if defined $select{$id};
    }

    close $fasta_fh;
    close $out_fh or die "Cannot close output file '$out': $!\n";
}

Perl script to extract lines with matching ids !!

Surabhi Chaudhary — Tue, 21 Jun 2016 09:24:46 -0500

#!/usr/bin/perl
use strict;
use warnings;
my %patterns;

# USAGE: perl extactByIds.pl Idsfile1 file2 > Result
#
# Prints every line of file2 whose first tab-separated field is one of the IDs
# listed, one per line, in Idsfile1.

@ARGV >= 2 or die "USAGE: perl $0 Idsfile1 file2 > Result\n";

# Open the ID file and collect the IDs to search for.
open(my $fh2, "<", $ARGV[0])
    or die "ERROR: Could not open ID file '$ARGV[0]': $!\n";
while (my $id = <$fh2>) {
    # chomp, not chop: chop would also remove the last character of an ID
    # when the final line has no trailing newline.
    chomp $id;
    $id =~ s/\r\z//;                # tolerate Windows (CRLF) line endings
    next unless length $id;         # a blank line is not an ID
    $patterns{$id} = 1;
}
close $fh2;

# Now read the data file.
open(my $fh1, "<", $ARGV[1])
    or die "ERROR: Could not open data file '$ARGV[1]': $!\n";
while (my $line = <$fh1>) {
    # You might need to adjust this place according to your file type
    #(undef,$srch,undef)=split;

    # Compare on a copy without the line ending, so a line consisting of the
    # ID alone (no tab) also matches. The original line is printed untouched.
    (my $content = $line) =~ s/\r?\n\z//;
    my ($first) = split /\t/, $content;
    next unless defined $first;     # empty line: nothing to compare
    print $line if defined $patterns{$first};
}
close $fh1;