-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGet_GENE_from_IDq.R
139 lines (128 loc) · 6.59 KB
/
Get_GENE_from_IDq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#######################################
# Created on Wed Dec 5 11:22:15 2018 #
# author: Andrea Laguillo #
# CNIC Proteomics #
#######################################
#install.packages("data.table")
#install.packages("rlist")
#install.packages("tidyr")
library(data.table)
library(rlist)
library(tidyr)
# PARAMETERS
filepath <- ("input") # Set path to ID-q file
output <- ("output") # Set path to output file
get_species_redundancies <- 1 # 0 for NO redundancies only GET GENE, 1 for YES redundancies and GET GENE
preference_species <- "Sus scrofa" # Set species (as it appears on ID-q file) to check for redundancies,
# ignored if get_species_redundancies set to 0
###################################################################################################
# Get data from FASTA file
IDq <- fread(file=filepath, sep="\t", header=TRUE)
# Define function to get Gene name
getGeneName <- function(x, preference_species=NA) {
if (is.na(x["Preference_Species"])) { # Not looking at redundancies
gene <- substring(regmatches(x["FASTAProteinDescription"],
regexpr("GN=[a-zA-Z0-9]*",
x["FASTAProteinDescription"])), 4)
if (length(gene) == 0) {
gene <- NA
}
gene <- toString(gene)
return(gene)
}
else {
if (x["Preference_Species"] != "No") { # Redundacy found for that species
gene <- substring(regmatches(x["Preference_Species"],
regexpr("GN=[a-zA-Z0-9]*",
x["Preference_Species"])), 4)
procedence <- preference_species
gene <- paste(gene, procedence, sep=",")
if (length(gene) == 0) { # Gene not found
# Get from original column instead
gene <- substring(regmatches(x["FASTAProteinDescription"],
regexpr("GN=[a-zA-Z0-9]*",
x["FASTAProteinDescription"])), 4)
if (length(gene) == 0) {
gene <- NA
return(gene)
}
procedence <- "Homo sapiens"
gene <- paste(gene, procedence, sep=",")
}
if (substring(gene, 1, 3) == "LOC") { # Gene is LOC*
# Get from original column instead
gene <- substring(regmatches(x["FASTAProteinDescription"],
regexpr("GN=[a-zA-Z0-9]*",
x["FASTAProteinDescription"])), 4)
if (length(gene) == 0) {
gene <- NA
return(gene)
}
procedence <- "Homo sapiens"
gene <- paste(gene, procedence, sep=",")
}
#gene <- toString(gene)
return(gene)
}
else { # Redundacy not found for that species
gene <- substring(regmatches(x["FASTAProteinDescription"],
regexpr("GN=[a-zA-Z0-9]*",
x["FASTAProteinDescription"])), 4)
if (length(gene) == 0) {
gene <- NA
return(gene)
}
#gene <- toString(gene)
procedence <- substring(regmatches(x["FASTAProteinDescription"],
regexpr("OS=[a-zA-Z0-9]* [a-zA-Z0-9]*",
x["FASTAProteinDescription"])), 4)
gene <- paste(gene, procedence, sep=",")
return(gene)
}
}
}
# Define function to find Species redundancies
getSpeciesRedundancies <- function(x, preference_species) {
header <- x["FASTAProteinDescription"]
h_species <- substring(regmatches(x["FASTAProteinDescription"],
regexpr("OS=[a-zA-Z0-9]* [a-zA-Z0-9]*",
x["FASTAProteinDescription"])), 4)
if (h_species == "Homo sapiens"){ # Look for redundancies
redundancies <- x["Redundances"]
redundancies <- strsplit(redundancies, " -- ")
for (i in redundancies[[1]]){
r_species <- substring(regmatches(i, regexpr("OS=[a-zA-Z0-9]* [a-zA-Z0-9]*",
i)), 4)
if (r_species == preference_species){ # Redundancy found
gene <- substring(regmatches(i, regexpr("GN=[a-zA-Z0-9]*", i)), 4)
if (length(gene) == 0) { # Redundancy does not contain Gene name
return("No")
break
}
return(as.character(i))
break
}
}
}
return("No")
}
# Create new columns
IDq[, c("Preference_Species", "Gene")] <- NA
IDq$Preference_Species <- as.character(IDq$Preference_Species)
IDq$Gene <- as.character(IDq$Gene)
if (get_species_redundancies == 0) {
IDq$Gene <- apply(IDq, 1, getGeneName)
IDq$Preference_Species <- NULL
}
if (get_species_redundancies == 1) {
IDq$Preference_Species <- apply(IDq, 1, getSpeciesRedundancies, preference_species)
IDq$Gene <- apply(IDq, 1, getGeneName, preference_species)
IDq <- IDq%>% separate(Gene, c("Gene", "Procedence"), sep=",")
}
# Remove rows for which there is no Gene name
IDq <- IDq[complete.cases(IDq[,"Gene"]),]
column_name <- gsub(" ", "_", preference_species)
column_name <- paste("Redundancy_", column_name, sep="")
names(IDq)[names(IDq) == "Preference_Species"] <- column_name
# Write to file
fwrite(IDq, output, sep="\t")