-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcode_ML.R
141 lines (121 loc) · 5.99 KB
/
code_ML.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
##########################################################################################################
# #
# Method for generating a master list / Index of DNaseI hypersensitivity sites. #
# All code, implementation details and design by Wouter Meuleman and Eric Rynes. #
# #
# Version: WM20180313 #
# #
##########################################################################################################
merge_overlap <- function(DHSs, peaks, type="summit", force=FALSE) {
num <- 0;
#core_starts <- pmax(DHSs$start, DHSs$summit+DHSs$conf95_low, na.rm=T)
#core_ends <- pmin(DHSs$end, DHSs$summit+DHSs$conf95_high, na.rm=T)
core_starts <- pmax(DHSs$start, DHSs$disp_low, na.rm=T)
core_ends <- pmin(DHSs$end, DHSs$disp_high, na.rm=T)
score_ord <- order(DHSs$score);
for (i in score_ord) {
idx <- c()
if (type=="summit") {
idx <- which(DHSs$start <= DHSs$summit[i] & DHSs$end >= DHSs$summit[i])
} else if (type=="core") {
idx <- which(core_starts <= core_ends[i] & core_ends >= core_starts[i])
} else if (type=="any") {
idx <- which(DHSs$start <= DHSs$end[i] & DHSs$end >= DHSs$start[i])
}
idx <- idx[which.max(DHSs$score[idx])]
if (length(idx) == 0) {
print(paste("Summit misalignment for DHS on line", i, "-- skipping merge"));
next
}
# If merging these DHSs results in lots of multi-peak contributions from samples... don't merge.
tab <- tabulate(table(peaks$sampleID[which(peaks$ID %in% DHSs$ID[c(i,idx)])]))
if (which.max(tab * 1:length(tab)) != 1 & sum(tab) > 10 & !force) {
print(paste("Not merging DHSs", DHSs$ID[i], "and", DHSs$ID[idx],
"because of too many multi-peak sample contributions!"))
next;
}
if (idx != i) {
peaks$ID[which(peaks$ID==DHSs$ID[i])] <- DHSs$ID[idx]
DHSs[idx,] <- get_FWHM(peaks[which(peaks$ID==DHSs$ID[idx]),])
DHSs[i,] <- rep(NA, ncol(DHSs))
num <- num+1;
}
}
DHSs <- DHSs[which(!is.na(DHSs$seqname)),]
DHSs <- DHSs[order(DHSs$seqname, DHSs$start, DHSs$end),]
list(DHSs=DHSs, peaks=peaks, num=num);
}
get_min_max <- function(x, fun="max", pos="center") {
idx <- which(x == eval(parse(text=fun))(x, na.rm=T));
if (pos=="center") {
return(idx[ceiling(length(idx)/2)])
} else if (pos=="left") {
return(idx[1])
} else if (pos=="right") {
return(idx[length(idx)])
}
}
get_FWHM <- function(res_peaks) {
# Obtain range of values and tabulate to create pile-up
xlim <- range(c(res_peaks$start, res_peaks$end))
tab <- tabulate(unlist(apply(res_peaks, 1, function(x) x[2]:x[3]))-xlim[1]+1)
tab_max <- get_min_max(tab, fun="max") # Identify maximal point in pile-up
# Determine upstream and downstream boundaries
us <- c(1:tab_max)[get_min_max(abs(tab[1:tab_max] - (max(tab)/2)), fun="min", pos="left")]
ds <- c(tab_max:length(tab))[get_min_max(abs(tab[tab_max:length(tab)] - (max(tab)/2)), fun="min", pos="right")]
start <- us+xlim[1]-1
end <- ds+xlim[1]-1
score <- sum(tapply(res_peaks$score, res_peaks$sampleID, max))
numsamples <- length(unique(res_peaks$sampleID))
numpeaks <- length(res_peaks$sampleID)
summit <- round(median(res_peaks$wavelet_summit));
#dispersion <- mad(res_peaks$wavelet_summit) # constant = 1.4826
#dispersion <- mad(res_peaks$wavelet_summit, constant=1) # constant = 1
#conf95 <- sort(res_peaks$wavelet_summit-summit)[pmax(1, qbinom(c(.025,.975), length(res_peaks$wavelet_summit), 0.5))]
#conf95_low <- conf95[1];
#conf95_high <- conf95[2];
#disp <- range(res_peaks$wavelet_summit);
disp <- quantile(res_peaks$wavelet_summit, prob=c(0.025, 0.975));
disp_low <- max(min(res_peaks$wavelet_summit), round(disp[1]));
disp_high <- min(max(res_peaks$wavelet_summit), round(disp[2]));
# If the (wavelet) summit is outside the delineated region, we use the median center instead.
if (summit < start | summit > end) {
summit <- round(median((start+end)/2))
#dispersion <- NA
#conf95_low <- conf95_high <- NA
disp_low <- disp_high <- NA
}
# Return resulting region with summary statistics
data.frame(seqname=res_peaks$seqname[1], start=start, end=end, ID=res_peaks$ID[1], score=score, numsamples=numsamples,
numpeaks=numpeaks, width=end-start, summit=summit, disp_low=disp_low, disp_high=disp_high, stringsAsFactors=FALSE)
}
get_cut_points <- function(summit_loc) {
xlim <- range(summit_loc);
n <- diff(xlim)+1
if (n < 20) return(c(0, n))
# turn ~20bp resolution data into 20bp-window smoothed versions
cont_dat <- tabulate(summit_loc)
smooth_dat <- runmean(cont_dat, 20, endrule="constant")
# Determine potential points to chop up peak clump
#cutlevel <- max(smooth_dat)/2 # Threshold for minimal signal to consider for cutting
#cutlevel <- min(1, median(smooth_dat)) # WM20180303
cutlevel <- median(smooth_dat) # WM20180302b -- works well!
idx <- which(diff(smooth_dat < cutlevel) != 0)
idx <- c(0, idx, n)
cuts <- c(0)
if (length(idx) > 2 & max(smooth_dat) >= 1) { # Only determine cut-sites if the maximum signal observed is at least 1
cuts <- c(cuts, sapply(1:(length(idx)-1), function(i) {
min_pos <- get_min_max(smooth_dat[idx[i]:idx[i+1]], fun="min", pos="center")
ifelse(min_pos == 1, NA, idx[i] + min_pos -1)
}))
}
cuts <- cuts[!is.na(cuts)]
cuts <- c(cuts, n);
# Clean up near the ends
if (sum(smooth_dat[1:cuts[2]]) < sum(smooth_dat)*0.01)
cuts <- cuts[-2];
if (sum(smooth_dat[cuts[length(cuts)-1]:cuts[length(cuts)]]) < sum(smooth_dat)*0.01)
cuts <- cuts[-(length(cuts)-1)];
if (is.unsorted(cuts)) stop("Unordered cut-point list -- this should not happen!")
return(cuts)
}