shengxinbaixiaosheng
/
RNA-Seq-Workshop-Library-Preparation-and-Introduction-to-Data-Analysis-2
Public
forked from ucdavis-bioinformatics-training/RNA-Seq-Workshop-Library-Preparation-and-Introduction-to-Data-Analysis-2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPrepare_Download_Study.R
73 lines (59 loc) · 4.51 KB
/
Prepare_Download_Study.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
### download data from the article
### http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4059230/
### and SRA project
### http://www.ncbi.nlm.nih.gov/sra/?term=SRP006291
### subselect a random 5M reads (average size 27M too large), and edit read id to be CASAVA 1.8 and output as gzipped
## scripts requires the sratoolkit to be available on the path, specifically the fastq-dump command [https://github.com/ncbi/sra-tools/wiki/Downloads]
library(parallel)
library(ShortRead)
library(tools)
"SRP006291"
file2get <- "http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&rettype=runinfo&db=sra&WebEnv=NCID_1_16454837_130.14.18.48_5555_1435842911_675372944_0MetA0_S_HStore&query_key=1"
download.file(file2get,"SraRunInfo.csv")
samples = c("SRR358569", "SRR358570", "SRR358573", "SRR358574", "SRR358575", "SRR358576")
sample_names = c("NIH8656_pH2.5_3", "NIH8656_pH6_3", "NIH8656_pH6_1", "NIH8656_pH2.5_1", "NIH8656_pH6_2", "NIH8656_pH2.5_2")
treatment = c("pH2.5", "pH6", "pH6", "pH2.5", "pH6", "pH2.5")
cores <- min(detectCores(),length(samples))
write.table(data.frame("SEQUENCE_ID"= samples, "SAMPLE_ID" = sample_names, "treatment" = treatment),"samples.txt",sep="\t",row.names=F,col.names=T,quote=F)
"getSRR" = function(x) paste("ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra",substring(x,1,3),substring(x,1,6),x,paste0(x,".sra"),sep="/")
dir.create("00-RawData")
output <- mclapply(samples,
function(sample) {
dir.create(file.path("00-RawData",sample))
file2get = getSRR(sample)
download.file(file2get,file.path("00-RawData",sample,paste0(sample,".sra")))
cmd <- paste("fastq-dump --split-3 -F --gzip -O", file.path("00-RawData",sample), file.path("00-RawData", sample, paste0(sample,".sra")),">", file.path("00-RawData", sample, paste0(sample,".sra_extract.txt")))
system(cmd)
## read in and subsample to 5M per sample
r1 <- grep ('_1.fastq',dir(file.path("00-RawData",sample),full.names = T),value=T)
r2 <- grep ('_2.fastq',dir(file.path("00-RawData",sample),full.names = T),value=T)
fq1 <- readFastq(r1)
fq2 <- readFastq(r2)
samp <- sort(sample(1:length(fq1),5e6))
fq1_r <- fq1[samp]
fq2_r <- fq2[samp]
fq1_r@id <- BStringSet(sapply(strsplit(as.character(id(fq1_r)),split=":"),function(x) paste(paste(x[1],"136","FC706VJ",paste(x[2:5],collapse=":"),sep=":"),"1:N:0:",sep=" "))) ## made up run and flowcell id
fq2_r@id <- BStringSet(sapply(strsplit(as.character(id(fq2_r)),split=":"),function(x) paste(paste(x[1],"136","FC706VJ",paste(x[2:5],collapse=":"),sep=":"),"2:N:0:",sep=" "))) ## made up run and flowcell id
out_r1 <- sub("_1.fastq","_5M_R1.fastq",r1) ## convert to R1,R2 format for compatability with expHTS
out_r2 <- sub("_2.fastq","_5M_R2.fastq",r2)
if (file_ext(out_r1) != "gz") out_r1 <- paste0(out_r1,".gz")
if (file_ext(out_r2) != "gz") out_r2 <- paste0(out_r2,".gz")
if (file.exists(out_r1)) unlink(out_r1)
if (file.exists(out_r2)) unlink(out_r2)
writeFastq(fq1_r,out_r1)
writeFastq(fq2_r,out_r2)
}, mc.cores = cores)
dir.create("Reference")
## Get Reference Genome from Ensembl:
file2get = "ftp://ftp.ensemblgenomes.org/pub/fungi/release-27/fasta/exophiala_dermatitidis_nih_ut8656/dna/Exophiala_dermatitidis_nih_ut8656.GCA_000230625.1.27.dna.toplevel.fa.gz"
download.file(file2get,file.path("Reference",basename(file2get)))
system(paste("gunzip",file.path("Reference",basename(file2get))))
## Get Reference GTF transcript file:
file2get = "ftp://ftp.ensemblgenomes.org/pub/fungi/release-27/gtf/exophiala_dermatitidis_nih_ut8656/Exophiala_dermatitidis_nih_ut8656.GCA_000230625.1.27.gtf.gz"
download.file(file2get,file.path("Reference",basename(file2get)))
system(paste("gunzip",file.path("Reference",basename(file2get))))
## Get Pfam Annotation:
file2get = "http://www.broadinstitute.org/annotation/genome/Black_Yeasts/download/?sp=EAProteinFamilytoGenes&sp=SExop_derm_V1&sp=S.zip"
download.file(file2get,file.path("Reference",basename(file2get)))
unzip(file.path("Reference",basename(file2get)),exdir = "Reference")
file.remove(file.path("Reference",basename(file2get)))