1
- # Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
1
+ # Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
2
2
import collections
3
+ import os
3
4
import struct
4
5
import sys
5
6
9
10
__all__ = ['Tabix' ]
10
11
11
12
class Header (object ):
12
- __slots__ = ('magic' , 'n_ref' , 'format' ,
13
- 'col_seq' , 'col_beg' , 'col_end' , 'meta' , 'skip' , 'l_nm' )
13
+ __slots__ = ('format' , 'col_seq' , 'col_beg' , 'col_end' , 'meta' , 'skip' )
14
14
def __init__ (self , * args ):
15
15
for k ,v in zip (self .__slots__ , args ):
16
16
setattr (self , k , v )
@@ -19,13 +19,12 @@ def __iter__(self):
19
19
yield getattr (self , k )
20
20
21
21
class Tabix (object ):
22
- SHIFTS = (14 , 17 , 20 , 23 , 26 , 29 )
23
- MAXBIN = ((1 << SHIFTS [- 1 ]- SHIFTS [0 ]+ 3 ) - 1 ) // 7 + 1
24
- MAGIC = 0x01494254
22
+ TBI_MAGIC = 0x01494254
23
+ CSI_MAGIC = 0x01495343
25
24
FMT_GENERIC , FMT_SAM , FMT_VCF , FMT_ZERO_BASED = 0 , 1 , 2 , 0x10000
26
25
27
- def __init__ (self , idxf , mode = 'r' ):
28
- self .path = idxf
26
+ def __init__ (self , path , mode = 'r' ):
27
+ self .path = path
29
28
self .mode = mode
30
29
if mode [0 :1 ] == 'r' :
31
30
self .load ()
@@ -37,72 +36,116 @@ def __init__(self, idxf, mode='r'):
37
36
def load(self):
    """Read the index that accompanies ``self.path`` into memory.

    ``<path>.csi`` is used when it exists, otherwise ``<path>.tbi``.
    On success the following attributes are populated:

    * ``magic``, ``min_shift``, ``depth``, ``max_shift`` -- binning scheme
      (fixed at 14/5 for TBI, read from the file for CSI).
    * ``header`` -- a ``Header`` built from the six 32-bit aux fields.
    * ``indices`` -- ordered mapping of sequence name to ``(bins, intvs)``
      where ``bins`` maps bin number to ``(loffset, chunks)`` and
      ``chunks`` is a list of ``(begin, end)`` 64-bit offsets.  ``intvs``
      is the linear index for TBI and an empty list for CSI.

    Raises:
        RuntimeError: if the magic number does not match the file type,
            the CSI aux block is too small for the tabix header and name
            table, or the name table disagrees with ``n_ref``.
    """
    self.indices = collections.OrderedDict()

    if os.path.exists(self.path + '.csi'):
        idxf = self.path + '.csi'
        magic = self.CSI_MAGIC
    else:
        idxf = self.path + '.tbi'
        magic = self.TBI_MAGIC

    # The whole index is slurped at once, so the handle can be released
    # before any parsing happens.
    with bgzf.open(idxf, 'rb') as fp:
        data = fp.read()

    s4 = struct.Struct('<L')
    s8 = struct.Struct('<Q')
    sh = struct.Struct('<6L')
    off = 0

    self.magic, = s4.unpack_from(data, off); off += s4.size
    if self.magic != magic:
        raise RuntimeError('Not a tabix file')
    is_tbi = self.magic == self.TBI_MAGIC

    if is_tbi:
        self.min_shift, self.depth = 14, 5
        n_ref, = s4.unpack_from(data, off); off += s4.size
        aux = sh.unpack_from(data, off); off += sh.size
        l_nm, = s4.unpack_from(data, off); off += s4.size
        names = data[off:off + l_nm].split(b'\0'); off += l_nm
    else:
        self.min_shift, = s4.unpack_from(data, off); off += s4.size
        self.depth, = s4.unpack_from(data, off); off += s4.size
        l_aux, = s4.unpack_from(data, off); off += s4.size
        if l_aux < sh.size + s4.size:
            raise RuntimeError('Invalid header')
        aux = sh.unpack_from(data, off); off += sh.size
        l_nm, = s4.unpack_from(data, off); off += s4.size
        # The name table must fit inside the aux block; otherwise the
        # padding skip below would move the cursor backwards.
        if l_aux < sh.size + s4.size + l_nm:
            raise RuntimeError('Invalid header')
        names = data[off:off + l_nm].split(b'\0'); off += l_nm
        # Skip whatever aux bytes follow the name table.
        off += l_aux - (sh.size + s4.size + l_nm)
        n_ref, = s4.unpack_from(data, off); off += s4.size

    # Every name is NUL-terminated, so splitting yields one trailing
    # empty element in addition to the n_ref names.
    if len(names) != n_ref + 1 or len(names[-1]) != 0:
        raise RuntimeError('Header sequence name length mismatch')
    self.header = Header(*aux)

    for i in xrange(n_ref):
        bins = {}
        n_bin, = s4.unpack_from(data, off); off += s4.size
        for _ in xrange(n_bin):
            bin_id, = s4.unpack_from(data, off); off += s4.size
            if is_tbi:
                # TBI bins carry no per-bin loffset.
                loffset = 0
            else:
                loffset, = s8.unpack_from(data, off); off += s8.size
            chunks = []
            n_chunk, = s4.unpack_from(data, off); off += s4.size
            for _ in xrange(n_chunk):
                s, = s8.unpack_from(data, off); off += s8.size
                e, = s8.unpack_from(data, off); off += s8.size
                chunks.append((s, e))
            bins[bin_id] = (loffset, chunks)
        intvs = []
        if is_tbi:
            # Only TBI stores a linear index after the bins.
            n_intv, = s4.unpack_from(data, off); off += s4.size
            for _ in xrange(n_intv):
                o, = s8.unpack_from(data, off); off += s8.size
                intvs.append(o)
            if n_intv == 0:
                intvs.append(0)
        self.indices[names[i].decode()] = (bins, intvs)

    self.max_shift = self.min_shift + self.depth * 3
79
100
80
101
def save(self):
    """Write the in-memory index next to ``self.path``.

    Does nothing when ``self.header`` is ``None``.  Otherwise a final
    ``add(None, 0, 0, self.end)`` call is made, any existing ``.tbi`` and
    ``.csi`` files for this path are removed, and the index is written as
    ``<path>.tbi`` when ``self.magic`` is the TBI magic or ``<path>.csi``
    otherwise.  ``self.header`` is reset to ``None`` afterwards.
    """
    if self.header is None:
        return
    self.add(None, 0, 0, self.end)

    # Drop both possible index files so a stale one of the other flavour
    # cannot be left behind.
    for suffix in ('.tbi', '.csi'):
        stale = self.path + suffix
        if os.path.exists(stale):
            os.remove(stale)

    is_tbi = self.magic == self.TBI_MAGIC
    idxf = self.path + ('.tbi' if is_tbi else '.csi')

    with bgzf.open(idxf, 'wb') as fp:
        u32 = struct.Struct('<L')
        u64 = struct.Struct('<Q')
        aux = struct.Struct('<6L')
        names = b''.join(c.encode()+b'\0' for c,_ in iteritems(self.indices))

        fp.write(u32.pack(self.magic))
        if is_tbi:
            # TBI places n_ref directly after the magic number.
            fp.write(u32.pack(len(self.indices)))
        else:
            fp.write(u32.pack(self.min_shift))
            fp.write(u32.pack(self.depth))
            # l_aux: six header words + l_nm word + the name table.
            fp.write(u32.pack(aux.size + u32.size + len(names)))
        fp.write(aux.pack(*self.header))
        fp.write(u32.pack(len(names)))
        fp.write(names)
        if not is_tbi:
            # CSI places n_ref after the aux block.
            fp.write(u32.pack(len(self.indices)))

        for _, (bins, intvs) in iteritems(self.indices):
            fp.write(u32.pack(len(bins)))
            for bin_id in sorted(bins):
                loffset, chunks = bins[bin_id]
                fp.write(u32.pack(bin_id))
                if not is_tbi:
                    fp.write(u64.pack(loffset))
                fp.write(u32.pack(len(chunks)))
                for begin, end in chunks:
                    fp.write(u64.pack(begin))
                    fp.write(u64.pack(end))
            if is_tbi:
                # Only TBI carries the linear index.
                fp.write(u32.pack(len(intvs)))
                for offset in intvs:
                    fp.write(u64.pack(offset))
    self.header = None
107
150
108
151
def query (self , c , s , e ):
@@ -111,19 +154,24 @@ def query(self, c, s, e):
111
154
if ci is None :
112
155
return ranges
113
156
s = max (s , 0 )
114
- i = s >> self .SHIFTS [ 0 ]
115
- minoff = ci [1 ][i ] if i < len (ci [1 ]) else ci [1 ][ - 1 ]
116
- for shift in reversed (self .SHIFTS ):
117
- bo = ((1 << 29 - shift ) - 1 ) // 7
157
+ i = s >> self .min_shift
158
+ minoff = ci [1 ][min ( i , len (ci [1 ])- 1 )] if ci [1 ] else 0
159
+ for shift in range (self .max_shift , self . min_shift - 3 , - 3 ):
160
+ bo = ((1 << self . max_shift - shift ) - 1 ) // 7
118
161
bs = bo + (s >> shift )
119
162
be = bo + (e - 1 >> shift )
120
- be = min (be , self .MAXBIN - 1 )
163
+ if not ci [1 ]:
164
+ for bi in xrange (bs , bo - 1 , - 1 ):
165
+ b = ci [0 ].get (bi )
166
+ if b is not None :
167
+ minoff = max (minoff , b [0 ])
168
+ break
121
169
for bi in xrange (bs , be + 1 ):
122
- if bi not in ci [0 ]:
123
- continue
124
- for chunk in ci [ 0 ][ bi ]:
125
- if chunk [ 1 ] > minoff :
126
- ranges . append ( chunk )
170
+ b = ci [0 ]. get ( bi )
171
+ if b is not None :
172
+ ranges . extend ( b [ 1 ])
173
+ if minoff > 0 :
174
+ ranges = [( max ( s , minoff ), e ) for s , e in ranges if e > minoff ]
127
175
return self .merge (ranges , 16 )
128
176
129
177
@staticmethod
@@ -141,14 +189,37 @@ def merge(ranges, shift):
141
189
yield p
142
190
143
191
def init(self):
    """Reset this object to an empty index ready for ``add`` calls.

    The index flavour defaults to TBI (``min_shift`` 14, ``depth`` 5).
    The ``VCF_INDEX_TYPE`` environment variable, formatted as
    ``type[:min_shift[:depth]]`` with integer fields, can override it:
    a ``type`` of 2 selects CSI and lets the optional ``min_shift`` and
    ``depth`` fields replace the defaults.  An unset or empty variable is
    treated as ``'1'``.

    Raises:
        ValueError: if a field of ``VCF_INDEX_TYPE`` is not an integer.
    """
    self.magic = self.TBI_MAGIC
    self.min_shift = 14
    self.depth = 5

    # ``or '1'`` also covers a variable that is set but empty, which
    # would otherwise fail inside int('').
    spec = os.getenv('VCF_INDEX_TYPE') or '1'
    fields = [int(field) for field in spec.split(':')]
    if fields[0] == 2:
        self.magic = self.CSI_MAGIC
        # The shift/depth overrides only apply to CSI; TBI keeps 14/5.
        if len(fields) > 1:
            self.min_shift = fields[1]
        if len(fields) > 2:
            self.depth = fields[2]
    self.max_shift = self.min_shift + self.depth * 3

    self.header = Header(self.FMT_VCF, 1, 2, 2, ord('#'), 0)
    self.indices = collections.OrderedDict()
    self.ci = None
    self.pos = 0
    self.end = 0
150
208
151
209
def add (self , c , s , e , off ):
210
+ if c is None and s > 0 :
211
+ # s is the max contig length
212
+ shift = self .min_shift
213
+ limit = 1 << shift
214
+ while s > limit :
215
+ limit <<= 1
216
+ shift += 1
217
+ if shift >= 32 :
218
+ raise RuntimeError ('Some contigs are too long' )
219
+ if shift > self .min_shift + self .depth * 3 :
220
+ self .magic = self .CSI_MAGIC ;
221
+ self .depth = (shift - self .min_shift + 2 ) // 3 ;
222
+ self .max_shift = self .min_shift + self .depth * 3
152
223
if self .ci and self .ci [0 ] != c :
153
224
self .optimize (self .ci )
154
225
self .ci = None
@@ -159,17 +230,19 @@ def add(self, c, s, e, off):
159
230
if self .ci :
160
231
chrom , bins , intvs = self .ci
161
232
assert chrom == c and s >= self .pos
162
- be = e - 1 >> self .SHIFTS [ 0 ]
233
+ be = e - 1 >> self .min_shift
163
234
if be >= len (intvs ):
164
235
intvs += [self .end ] * (be + 1 - len (intvs ))
165
236
bin = 0
166
- for shift in self .SHIFTS :
237
+ for shift in range ( self .min_shift , self . max_shift + 3 , 3 ) :
167
238
bs , be = s >> shift , e - 1 >> shift
168
239
if bs == be :
169
- bo = ((1 << 29 - shift ) - 1 ) // 7
240
+ bo = ((1 << self . max_shift - shift ) - 1 ) // 7
170
241
bin = bo + bs
171
242
break
172
- chunks = bins .setdefault (bin ,[])
243
+ b = bins .setdefault (bin ,[])
244
+ if not b : b .extend ((0 , []))
245
+ chunks = b [1 ]
173
246
if chunks and chunks [- 1 ][1 ] == self .end :
174
247
chunks [- 1 ] = (chunks [- 1 ][0 ], off )
175
248
else :
@@ -179,25 +252,31 @@ def add(self, c, s, e, off):
179
252
180
253
def optimize (self , ci ):
181
254
bins = ci [1 ]
182
- for shift in self .SHIFTS [: - 1 ] :
183
- bo = ((1 << 29 - shift ) - 1 ) // 7
255
+ for shift in range ( self .min_shift , self . max_shift + 3 , 3 ) :
256
+ bo = ((1 << self . max_shift - shift ) - 1 ) // 7
184
257
for bin in sorted (bins .keys ()):
185
258
if bin < bo :
186
259
continue
187
260
if bin > bo << 3 :
188
261
break
189
- chunks = bins .get (bin )
190
- if chunks is None :
262
+ b = bins .get (bin )
263
+ if b is None :
191
264
continue
265
+ chunks = b [1 ]
192
266
if len (chunks ) == 0 :
193
267
del bins [bin ]
194
268
continue
195
269
bs = chunks [0 ][0 ] >> 16
196
270
be = chunks [- 1 ][1 ] >> 16
197
- if be - bs < 65536 :
271
+ if be - bs < 65536 and bo > 0 :
198
272
del bins [bin ]
199
273
bin = bin - 1 >> 3
200
- chunks += bins .get (bin ,[])
201
- bins [bin ] = list (self .merge (chunks , 16 ))
274
+ b = bins .setdefault (bin ,[])
275
+ if not b : b .extend ((0 , []))
276
+ b [1 ] = list (self .merge (chunks + b [1 ], 16 ))
277
+ elif ci [2 ]:
278
+ intv = (bin - bo ) << (shift - self .min_shift )
279
+ intv = min (intv , len (ci [2 ])- 1 )
280
+ b [0 ] = ci [2 ][intv ]
202
281
203
282
# vim: ts=4 sw=4 expandtab
0 commit comments