|
| 1 | +import scipy as sp |
| 2 | + |
| 3 | +from ..utils import * |
| 4 | + |
| 5 | +import pdb |
| 6 | + |
| 7 | +class Splicegraph: |
| 8 | + |
| 9 | + def __init__(self, gene = None): |
| 10 | + |
| 11 | + self.vertices = sp.zeros((2, 0), dtype='int') |
| 12 | + self.edges = sp.zeros((0, 0), dtype='int') |
| 13 | + self.terminals = sp.zeros((2, 0), dtype='int') |
| 14 | + |
| 15 | + if gene: |
| 16 | + self.from_gene(gene) |
| 17 | + |
| 18 | + def get_len(self): |
| 19 | + |
| 20 | + return self.vertices.shape[1] |
| 21 | + |
| 22 | + def new_edge(self): |
| 23 | + |
| 24 | + self.edges = sp.c_[self.edges, sp.zeros((self.edges.shape[0], 1), dtype='int')] |
| 25 | + self.edges = sp.r_[self.edges, sp.zeros((1, self.edges.shape[1]), dtype='int')] |
| 26 | + |
| 27 | + def subset(self, keep_idx): |
| 28 | + |
| 29 | + self.vertices = self.vertices[:, keep_idx] |
| 30 | + self.edges = self.edges[keep_idx, :][:, keep_idx] |
| 31 | + self.terminals = self.terminals[:, keep_idx] |
| 32 | + |
| 33 | + def update_terminals(self): |
| 34 | + |
| 35 | + self.terminals = sp.zeros(self.vertices.shape, dtype='int') |
| 36 | + self.terminals[0, sp.where(sp.sum(sp.tril(self.edges), axis=1) == 0)[0]] = 1 |
| 37 | + self.terminals[1, sp.where(sp.sum(sp.triu(self.edges), axis=1) == 0)[0]] = 1 |
| 38 | + |
| 39 | + def reorder(self, idx): |
| 40 | + |
| 41 | + self.vertices = self.vertices[:, idx] |
| 42 | + self.edges = self.edges[idx, :][:, idx] |
| 43 | + self.terminals = self.terminals[:, idx] |
| 44 | + |
| 45 | + def sort(self): |
| 46 | + |
| 47 | + s_idx = sp.lexsort([self.vertices[1, :], self.vertices[0, :]]) |
| 48 | + self.reorder(s_idx) |
| 49 | + |
| 50 | + def from_gene(self, gene): |
| 51 | + |
| 52 | + for transcript_idx in range(len(gene.transcripts)): |
| 53 | + exon_start_end = gene.exons[transcript_idx] |
| 54 | + |
| 55 | + ### only one exon in the transcript |
| 56 | + if exon_start_end.shape[0] == 1: |
| 57 | + exon1_start = exon_start_end[0, 0] |
| 58 | + exon1_end = exon_start_end[0, 1] |
| 59 | + |
| 60 | + if self.vertices.shape[1] == 0: |
| 61 | + self.vertices = sp.array([[exon1_start], [exon1_end]], dtype='int') |
| 62 | + self.edges = sp.array([[0]], dtype='int') |
| 63 | + else: |
| 64 | + self.vertices = sp.c_[self.vertices, [exon1_start, exon1_end]] |
| 65 | + self.new_edge() |
| 66 | + ### more than one exon in the transcript |
| 67 | + else: |
| 68 | + for exon_idx in xrange(exon_start_end.shape[0] - 1): |
| 69 | + exon1_start = exon_start_end[exon_idx , 0] |
| 70 | + exon1_end = exon_start_end[exon_idx, 1] |
| 71 | + exon2_start = exon_start_end[exon_idx + 1, 0] |
| 72 | + exon2_end = exon_start_end[exon_idx + 1, 1] |
| 73 | + |
| 74 | + if self.vertices.shape[1] == 0: |
| 75 | + self.vertices = sp.array([[exon1_start, exon2_start], [exon1_end, exon2_end]], dtype='int') |
| 76 | + self.edges = sp.array([[0, 1], [1, 0]], dtype='int') |
| 77 | + else: |
| 78 | + exon1_idx = -1 |
| 79 | + exon2_idx = -1 |
| 80 | + ### check if current exon already occurred |
| 81 | + for idx in range(self.vertices.shape[1]): |
| 82 | + if ((self.vertices[0, idx] == exon1_start) and (self.vertices[1, idx] == exon1_end)): |
| 83 | + exon1_idx = idx |
| 84 | + if ((self.vertices[0, idx] == exon2_start) and (self.vertices[1, idx] == exon2_end)): |
| 85 | + exon2_idx = idx |
| 86 | + |
| 87 | + ### both exons already occured -> only add an edge |
| 88 | + if (exon1_idx != -1) and (exon2_idx != -1): |
| 89 | + self.edges[exon1_idx, exon2_idx] = 1 |
| 90 | + self.edges[exon2_idx, exon1_idx] = 1 |
| 91 | + else: |
| 92 | + ### 2nd exon occured |
| 93 | + if ((exon1_idx == -1) and (exon2_idx != -1)): |
| 94 | + self.vertices = sp.c_[self.vertices, [exon1_start, exon1_end]] |
| 95 | + self.new_edge() |
| 96 | + self.edges[exon2_idx, -1] = 1 |
| 97 | + self.edges[-1, exon2_idx] = 1 |
| 98 | + ### 1st exon occured |
| 99 | + elif ((exon2_idx == -1) and (exon1_idx != -1)): |
| 100 | + self.vertices = sp.c_[self.vertices, [exon2_start, exon2_end]] |
| 101 | + self.new_edge() |
| 102 | + self.edges[exon1_idx, -1] = 1 |
| 103 | + self.edges[-1, exon1_idx] = 1 |
| 104 | + ### no exon occured |
| 105 | + else: |
| 106 | + assert((exon1_idx == -1) and (exon2_idx == -1)) |
| 107 | + self.vertices = sp.c_[self.vertices, [exon1_start, exon1_end]] |
| 108 | + self.vertices = sp.c_[self.vertices, [exon2_start, exon2_end]] |
| 109 | + self.new_edge() |
| 110 | + self.new_edge() |
| 111 | + self.edges[-2, -1] = 1 |
| 112 | + self.edges[-1, -2] = 1 |
| 113 | + |
| 114 | + ### take care of the sorting by exon start |
| 115 | + s_idx = sp.argsort(self.vertices[0, :]) |
| 116 | + self.vertices = self.vertices[:, s_idx] |
| 117 | + self.edges = self.edges[s_idx, :][:, s_idx] |
| 118 | + self.terminals = sp.zeros(self.vertices.shape, dtype='int') |
| 119 | + self.terminals[0, sp.where(sp.tril(self.edges).sum(axis=1) == 0)[0]] = 1 |
| 120 | + self.terminals[1, sp.where(sp.triu(self.edges).sum(axis=1) == 0)[0]] = 1 |
| 121 | + |
| 122 | + |
| 123 | + def from_matfile(self, mat_struct): |
| 124 | + """generates a splicing graph structure from a matfile structure""" |
| 125 | + |
| 126 | + self.vertices = mat_struct['splicegraph'][0, 0].astype('int') |
| 127 | + self.edges = mat_struct['splicegraph'][0, 1].astype('int') |
| 128 | + self.terminals = mat_struct['splicegraph'][0, 2].astype('int') |
| 129 | + |
| 130 | + |
| 131 | + def add_intron(self, idx1, flag1, idx2, flag2): |
| 132 | + """adds new introns into splicegraph between idx1 and idx2""" |
| 133 | + |
| 134 | + ### if flag1, all end terminal exons in idx1 are preserved |
| 135 | + ### if flag2, all start terminal exons in idx2 are preserved |
| 136 | + |
| 137 | + if idx2.shape[0] > 0: |
| 138 | + adj_mat = sp.triu(self.edges) |
| 139 | + |
| 140 | + if flag1: |
| 141 | + for i1 in idx1: |
| 142 | + |
| 143 | + ### if exon is end-terminal |
| 144 | + if sp.all(adj_mat[i1, :] == 0): |
| 145 | + |
| 146 | + self.vertices = sp.c_[self.vertices, self.vertices[:, i1]] |
| 147 | + |
| 148 | + self.new_edge() |
| 149 | + self.edges[:, -1] = self.edges[:, i1] |
| 150 | + self.edges[-1, :] = self.edges[i1, :] |
| 151 | + |
| 152 | + self.terminals = sp.c_[self.terminals, self.terminals[:, i1]] |
| 153 | + if flag2: |
| 154 | + for i2 in idx2: |
| 155 | + ### if exon is start-terminal |
| 156 | + if sp.all(adj_mat[:, i2] == 0): |
| 157 | + self.vertices = sp.c_[self.vertices, self.vertices[:, i2]] |
| 158 | + |
| 159 | + self.new_edge() |
| 160 | + self.edges[:, -1] = self.edges[:, i2] |
| 161 | + self.edges[-1, :] = self.edges[i2, :] |
| 162 | + |
| 163 | + self.terminals = sp.c_[self.terminals, self.terminals[:, i2]] |
| 164 | + |
| 165 | + for i1 in idx1: |
| 166 | + for i2 in idx2: |
| 167 | + self.edges[i1, i2] = 1 |
| 168 | + self.edges[i2, i1] = 1 |
| 169 | + |
| 170 | + self.uniquify() |
| 171 | + |
| 172 | + def add_cassette_exon(self, new_exon, exons_pre, exons_aft): |
| 173 | + ### exon_pre contains the indices of preceding exons |
| 174 | + ### exon_aft contains the indices of successing exons |
| 175 | + |
| 176 | + self.vertices = sp.c_[self.vertices, new_exon] |
| 177 | + |
| 178 | + self.new_edge() |
| 179 | + |
| 180 | + self.edges[exons_pre, -1] = 1 |
| 181 | + self.edges[exons_aft, -1] = 1 |
| 182 | + self.edges[-1, :] = self.edges[:, -1].T |
| 183 | + |
| 184 | + self.terminals = sp.c_[self.terminals, sp.zeros((2,), dtype='int')] |
| 185 | + |
| 186 | + |
| 187 | + def add_intron_retention(self, idx1, idx2): |
| 188 | + |
| 189 | + adj_mat = sp.triu(self.edges) |
| 190 | + |
| 191 | + self.vertices = sp.c_[self.vertices, sp.array([self.vertices[0, idx1], self.vertices[1, idx2]], dtype='int')] |
| 192 | + |
| 193 | + self.new_edge() |
| 194 | + |
| 195 | + adj_mat = sp.r_[adj_mat, sp.zeros((1, adj_mat.shape[1]), dtype='int')] |
| 196 | + adj_mat = sp.c_[adj_mat, sp.zeros((adj_mat.shape[0], 1), dtype='int')] |
| 197 | + |
| 198 | + ### check if adjacency matrix is symmetric |
| 199 | + ### otherwise or is not justyfied |
| 200 | + assert(sp.all(sp.all(adj_mat - (self.edges - adj_mat).T == 0))) |
| 201 | + |
| 202 | + ### AK: under the assumption that our splice graph representation is symmetric |
| 203 | + ### I preserve symmetry by using OR over the adj_mat column and row |
| 204 | + |
| 205 | + self.edges[:, -1] = adj_mat[:, idx1] | adj_mat[idx2, :].T |
| 206 | + self.edges[-1, :] = adj_mat[:, idx1].T | adj_mat[idx2, :] |
| 207 | + |
| 208 | + self.terminals = sp.c_[self.terminals, sp.array([self.terminals[0, idx1], self.terminals[1, idx2]], dtype='int')] |
| 209 | + |
| 210 | + def uniquify(self): |
| 211 | + # OUTPUT: splice graph that has been made unique on exons for each gene |
| 212 | + |
| 213 | + self.sort() |
| 214 | + (s_tmp, s_idx) = sort_rows(self.vertices.T, index=True) |
| 215 | + self.vertices = s_tmp.T |
| 216 | + self.edges = self.edges[s_idx, :][:, s_idx] |
| 217 | + self.terminals = self.terminals[:, s_idx] |
| 218 | + |
| 219 | + rm_idx = [] |
| 220 | + for j in range(1, self.vertices.shape[1]): |
| 221 | + if sp.all(self.vertices[:, j-1] == self.vertices[:, j]): |
| 222 | + self.edges[:, j] = self.edges[:, j-1] | self.edges[:, j] |
| 223 | + self.edges[j, :] = self.edges[j-1, :] | self.edges[j, :] |
| 224 | + rm_idx.append(j - 1) |
| 225 | + |
| 226 | + keep_idx = sp.where(~sp.in1d(sp.array(range(self.vertices.shape[1])), rm_idx))[0] |
| 227 | + self.vertices = self.vertices[:, keep_idx] |
| 228 | + self.edges = self.edges[keep_idx, :][:, keep_idx] |
| 229 | + self.terminals = self.terminals[:, keep_idx] |
| 230 | + |
0 commit comments