Commit 631e669

Merge pull request #2 from Sentieon/dev
Add support for the csi index
2 parents: 39c476e + 10b8cb5

File tree

11 files changed: +203 -87 lines

.github/workflows/main.yml (+33)
@@ -0,0 +1,33 @@
+name: CI
+on:
+  push:
+  pull_request:
+
+jobs:
+  ci:
+    strategy:
+      fail-fast: true
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        os: [ubuntu-22.04] #, macos-latest, windows-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Test
+        run: |
+          PYTHONPATH=$(pwd) python example/filter_dp.py \
+            --input_vcf tests/hc_subset.vcf.gz \
+            --output_vcf tests/hc_subset_dp.vcf.gz
+          if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi
+          if [ ! -f tests/hc_subset_dp.vcf.gz.tbi ]; then exit 1; fi
+      - name: Test csi
+        run: |
+          rm tests/hc_subset.vcf.gz.tbi
+          PYTHONPATH=$(pwd) VCF_INDEX_TYPE=2 python example/filter_dp.py \
+            --input_vcf tests/hc_subset.vcf.gz \
+            --output_vcf tests/hc_subset_dp.vcf.gz
+          if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi
+          if [ ! -f tests/hc_subset_dp.vcf.gz.csi ]; then exit 1; fi
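
The new "Test csi" step drives the existing example through the VCF_INDEX_TYPE environment variable, which the Tabix.init() change further down reads to choose between a .tbi and a .csi output index ("1" keeps tabix, "2" selects CSI, optionally extended as "2:min_shift:depth"). Below is a minimal local-reproduction sketch in Python; the script path, test files, and environment variable come from this commit, while the subprocess wrapper itself is only illustrative.

import os
import subprocess

# Request a CSI index for the example's output VCF. "2" selects CSI;
# a form like "2:14:5" would additionally override min_shift and depth
# (14 and 5 are the defaults used by Tabix.init()).
env = dict(os.environ, PYTHONPATH=os.getcwd(), VCF_INDEX_TYPE="2")
subprocess.run(
    [
        "python", "example/filter_dp.py",
        "--input_vcf", "tests/hc_subset.vcf.gz",
        "--output_vcf", "tests/hc_subset_dp.vcf.gz",
    ],
    env=env,
    check=True,
)

# The output should now carry a .csi companion index instead of a .tbi.
assert os.path.exists("tests/hc_subset_dp.vcf.gz.csi")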

.gitignore (+2)
@@ -1,3 +1,5 @@
+tmp/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

tests/hc_subset.vcf.gz (10 KB, binary file not shown)

tests/hc_subset.vcf.gz.csi (105 Bytes, binary file not shown)

tests/hc_subset.vcf.gz.tbi (241 Bytes, binary file not shown)

vcflib/bgzf.py (+1, -1)
@@ -1,4 +1,4 @@
-# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
+# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
 import io
 import struct
 import zlib

vcflib/compat.py (+1, -1)
@@ -1,4 +1,4 @@
-# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
+# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
 import sys
 
 if sys.version_info[0] == 2:

vcflib/sharder.py (+1, -1)
@@ -1,4 +1,4 @@
-# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
+# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
 from abc import ABCMeta, abstractmethod
 import copy
 import heapq

vcflib/tabix.py (+146, -67)
@@ -1,5 +1,6 @@
-# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
+# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
 import collections
+import os
 import struct
 import sys
 
@@ -9,8 +10,7 @@
 __all__ = ['Tabix']
 
 class Header(object):
-    __slots__ = ('magic', 'n_ref', 'format',
-                 'col_seq', 'col_beg', 'col_end', 'meta', 'skip', 'l_nm')
+    __slots__ = ('format', 'col_seq', 'col_beg', 'col_end', 'meta', 'skip')
     def __init__(self, *args):
         for k,v in zip(self.__slots__, args):
             setattr(self, k, v)
@@ -19,13 +19,12 @@ def __iter__(self):
             yield getattr(self, k)
 
 class Tabix(object):
-    SHIFTS = (14, 17, 20, 23, 26, 29)
-    MAXBIN = ((1 << SHIFTS[-1]-SHIFTS[0]+3) - 1) // 7 + 1
-    MAGIC = 0x01494254
+    TBI_MAGIC = 0x01494254
+    CSI_MAGIC = 0x01495343
     FMT_GENERIC, FMT_SAM, FMT_VCF, FMT_ZERO_BASED = 0, 1, 2, 0x10000
 
-    def __init__(self, idxf, mode='r'):
-        self.path = idxf
+    def __init__(self, path, mode='r'):
+        self.path = path
         self.mode = mode
         if mode[0:1] == 'r':
             self.load()
@@ -37,72 +36,116 @@ def __init__(self, idxf, mode='r'):
     def load(self):
         self.indices = collections.OrderedDict()
 
-        with bgzf.open(self.path, 'rb') as fp:
+        if os.path.exists(self.path + '.csi'):
+            idxf = self.path + '.csi'
+            magic = self.CSI_MAGIC
+        else:
+            idxf = self.path + '.tbi'
+            magic = self.TBI_MAGIC
+
+        with bgzf.open(idxf, 'rb') as fp:
             s4 = struct.Struct('<L')
             s8 = struct.Struct('<Q')
-            sh = struct.Struct('<9L')
-            data = fp.read()
-            off = 0
-            h = Header(*sh.unpack_from(data, off)); off += sh.size
-            if h.magic != self.MAGIC:
+            sh = struct.Struct('<6L')
+            data = fp.read(); off = 0
+            self.magic, = s4.unpack_from(data, off); off += s4.size
+            if self.magic != magic:
                 raise RuntimeError('Not a tabix file')
-            self.header = h
-            names, l_nm = [], 0
-            for i in xrange(h.n_ref):
-                eos = data.find(b'\0', off)
-                if eos < 0: break
-                names.append(data[off:eos].decode())
-                l_nm += eos + 1 - off
-                off = eos + 1
-            if h.l_nm != l_nm:
+            if self.magic == self.TBI_MAGIC:
+                self.min_shift, self.depth = 14, 5
+                n_ref, = s4.unpack_from(data, off); off += s4.size
+                aux = sh.unpack_from(data, off); off += sh.size
+                l_nm, = s4.unpack_from(data, off); off += s4.size
+                names = data[off:off+l_nm].split(b'\0'); off += l_nm
+            else:
+                self.min_shift, = s4.unpack_from(data, off); off += s4.size
+                self.depth, = s4.unpack_from(data, off); off += s4.size
+                l_aux, = s4.unpack_from(data, off); off += s4.size
+                if l_aux < sh.size + s4.size:
+                    raise RuntimeError('Invalid header')
+                aux = sh.unpack_from(data, off); off += sh.size
+                l_nm, = s4.unpack_from(data, off); off += s4.size
+                names = data[off:off+l_nm].split(b'\0'); off += l_nm
+                off += l_aux - (sh.size + s4.size + l_nm)
+                n_ref, = s4.unpack_from(data, off); off += s4.size
+            if len(names) != n_ref+1 or len(names[-1]) != 0:
                 raise RuntimeError('Header sequence name length mismatch')
-            for i in xrange(h.n_ref):
+            self.header = Header(*aux)
+            for i in xrange(n_ref):
                 bins = {}
                 n_bin, = s4.unpack_from(data, off); off += s4.size
                 for _ in xrange(n_bin):
                     bin, = s4.unpack_from(data, off); off += s4.size
+                    if self.magic == self.TBI_MAGIC:
+                        loffset = 0
+                    else:
+                        loffset, = s8.unpack_from(data, off); off += s8.size
                     chunks = []
                     n_chunk, = s4.unpack_from(data, off); off += s4.size
                     for _ in xrange(n_chunk):
                         s, = s8.unpack_from(data, off); off += s8.size
                         e, = s8.unpack_from(data, off); off += s8.size
                         chunks.append((s, e))
-                    bins[bin] = chunks
+                    bins[bin] = (loffset, chunks)
                 intvs = []
-                n_intv, = s4.unpack_from(data, off); off += s4.size
-                for _ in xrange(n_intv):
-                    o, = s8.unpack_from(data, off); off += s8.size
-                    intvs.append(o)
-                if n_intv == 0:
-                    intvs.append(0)
-                self.indices[names[i]] = (bins, intvs)
+                if self.magic == self.TBI_MAGIC:
+                    n_intv, = s4.unpack_from(data, off); off += s4.size
+                    for _ in xrange(n_intv):
+                        o, = s8.unpack_from(data, off); off += s8.size
+                        intvs.append(o)
+                    if n_intv == 0:
+                        intvs.append(0)
+                self.indices[names[i].decode()] = (bins, intvs)
+        self.max_shift = self.min_shift + self.depth * 3
 
     def save(self):
         if self.header is None:
             return
         self.add(None, 0, 0, self.end)
-        h = self.header
-        h.n_ref = len(self.indices)
-        nms = b''.join(c.encode()+b'\0' for c,_ in iteritems(self.indices))
-        h.l_nm = len(nms)
-        with bgzf.open(self.path, 'wb') as fp:
+
+        for ext in ('.tbi', '.csi'):
+            f = self.path + ext
+            if os.path.exists(f): os.remove(f)
+
+        if self.magic == self.TBI_MAGIC:
+            idxf = self.path + '.tbi'
+        else:
+            idxf = self.path + '.csi'
+
+        with bgzf.open(idxf, 'wb') as fp:
             s4 = struct.Struct('<L')
             s8 = struct.Struct('<Q')
-            sh = struct.Struct('<9L')
-            fp.write(sh.pack(*h))
-            fp.write(nms)
+            sh = struct.Struct('<6L')
+            nms = b''.join(c.encode()+b'\0' for c,_ in iteritems(self.indices))
+            fp.write(s4.pack(self.magic))
+            if self.magic == self.TBI_MAGIC:
+                fp.write(s4.pack(len(self.indices)))
+                fp.write(sh.pack(*self.header))
+                fp.write(s4.pack(len(nms)))
+                fp.write(nms)
+            else:
+                fp.write(s4.pack(self.min_shift))
+                fp.write(s4.pack(self.depth))
+                fp.write(s4.pack(sh.size + s4.size + len(nms)))
+                fp.write(sh.pack(*self.header))
+                fp.write(s4.pack(len(nms)))
+                fp.write(nms)
+                fp.write(s4.pack(len(self.indices)))
             for c, (bins, intvs) in iteritems(self.indices):
                 fp.write(s4.pack(len(bins)))
                 for bin in sorted(bins.keys()):
-                    chunks = bins[bin]
+                    loffset, chunks = bins[bin]
                     fp.write(s4.pack(bin))
+                    if self.magic != self.TBI_MAGIC:
+                        fp.write(s8.pack(loffset))
                     fp.write(s4.pack(len(chunks)))
                    for s,e in chunks:
                        fp.write(s8.pack(s))
                        fp.write(s8.pack(e))
-                fp.write(s4.pack(len(intvs)))
-                for o in intvs:
-                    fp.write(s8.pack(o))
+                if self.magic == self.TBI_MAGIC:
+                    fp.write(s4.pack(len(intvs)))
+                    for o in intvs:
+                        fp.write(s8.pack(o))
         self.header = None
 
     def query(self, c, s, e):
@@ -111,19 +154,24 @@ def query(self, c, s, e):
         if ci is None:
             return ranges
         s = max(s, 0)
-        i = s >> self.SHIFTS[0]
-        minoff = ci[1][i] if i < len(ci[1]) else ci[1][-1]
-        for shift in reversed(self.SHIFTS):
-            bo = ((1 << 29-shift) - 1) // 7
+        i = s >> self.min_shift
+        minoff = ci[1][min(i,len(ci[1])-1)] if ci[1] else 0
+        for shift in range(self.max_shift, self.min_shift-3, -3):
+            bo = ((1 << self.max_shift - shift) - 1) // 7
             bs = bo + (s >> shift)
             be = bo + (e-1 >> shift)
-            be = min(be, self.MAXBIN-1)
+            if not ci[1]:
+                for bi in xrange(bs, bo-1, -1):
+                    b = ci[0].get(bi)
+                    if b is not None:
+                        minoff = max(minoff, b[0])
+                        break
             for bi in xrange(bs, be+1):
-                if bi not in ci[0]:
-                    continue
-                for chunk in ci[0][bi]:
-                    if chunk[1] > minoff:
-                        ranges.append(chunk)
+                b = ci[0].get(bi)
+                if b is not None:
+                    ranges.extend(b[1])
+        if minoff > 0:
+            ranges = [(max(s,minoff), e) for s,e in ranges if e > minoff]
         return self.merge(ranges, 16)
 
     @staticmethod
@@ -141,14 +189,37 @@ def merge(ranges, shift):
             yield p
 
     def init(self):
-        h = Header(self.MAGIC, 0, self.FMT_VCF, 1, 2, 2, ord('#'), 0, 0)
-        self.header = h
+        self.magic = self.TBI_MAGIC
+        self.min_shift = 14
+        self.depth = 5
+        type = list(map(int, os.getenv('VCF_INDEX_TYPE', '1').split(':')))
+        if len(type) > 0 and type[0] == 2:
+            self.magic = self.CSI_MAGIC
+            if len(type) > 1:
+                self.min_shift = type[1]
+            if len(type) > 2:
+                self.depth = type[2]
+        self.max_shift = self.min_shift + self.depth * 3
+        self.header = Header(self.FMT_VCF, 1, 2, 2, ord('#'), 0)
         self.indices = collections.OrderedDict()
         self.ci = None
         self.pos = 0
         self.end = 0
 
     def add(self, c, s, e, off):
+        if c is None and s > 0:
+            # s is the max contig length
+            shift = self.min_shift
+            limit = 1 << shift
+            while s > limit:
+                limit <<= 1
+                shift += 1
+                if shift >= 32:
+                    raise RuntimeError('Some contigs are too long')
+            if shift > self.min_shift + self.depth * 3:
+                self.magic = self.CSI_MAGIC;
+                self.depth = (shift - self.min_shift + 2) // 3;
+                self.max_shift = self.min_shift + self.depth * 3
         if self.ci and self.ci[0] != c:
             self.optimize(self.ci)
             self.ci = None
@@ -159,17 +230,19 @@ def add(self, c, s, e, off):
         if self.ci:
             chrom, bins, intvs = self.ci
             assert chrom == c and s >= self.pos
-            be = e-1 >> self.SHIFTS[0]
+            be = e-1 >> self.min_shift
            if be >= len(intvs):
                intvs += [self.end] * (be+1 - len(intvs))
            bin = 0
-            for shift in self.SHIFTS:
+            for shift in range(self.min_shift, self.max_shift+3, 3):
                bs, be = s >> shift, e-1 >> shift
                if bs == be:
-                    bo = ((1 << 29-shift) - 1) // 7
+                    bo = ((1 << self.max_shift - shift) - 1) // 7
                    bin = bo + bs
                    break
-            chunks = bins.setdefault(bin,[])
+            b = bins.setdefault(bin,[])
+            if not b: b.extend((0, []))
+            chunks = b[1]
            if chunks and chunks[-1][1] == self.end:
                chunks[-1] = (chunks[-1][0], off)
            else:
@@ -179,25 +252,31 @@
 
     def optimize(self, ci):
         bins = ci[1]
-        for shift in self.SHIFTS[:-1]:
-            bo = ((1 << 29-shift) - 1) // 7
+        for shift in range(self.min_shift, self.max_shift+3, 3):
+            bo = ((1 << self.max_shift - shift) - 1) // 7
             for bin in sorted(bins.keys()):
                 if bin < bo:
                     continue
                 if bin > bo << 3:
                     break
-                chunks = bins.get(bin)
-                if chunks is None:
+                b = bins.get(bin)
+                if b is None:
                     continue
+                chunks = b[1]
                 if len(chunks) == 0:
                     del bins[bin]
                     continue
                 bs = chunks[0][0] >> 16
                 be = chunks[-1][1] >> 16
-                if be - bs < 65536:
+                if be - bs < 65536 and bo > 0:
                     del bins[bin]
                     bin = bin-1 >> 3
-                    chunks += bins.get(bin,[])
-                    bins[bin] = list(self.merge(chunks, 16))
+                    b = bins.setdefault(bin,[])
+                    if not b: b.extend((0, []))
+                    b[1] = list(self.merge(chunks + b[1], 16))
+                elif ci[2]:
+                    intv = (bin - bo) << (shift - self.min_shift)
+                    intv = min(intv, len(ci[2])-1)
+                    b[0] = ci[2][intv]
 
 # vim: ts=4 sw=4 expandtab
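
For readers tracing the refactor from the fixed SHIFTS/MAXBIN constants to the configurable min_shift/depth pair: the bin arithmetic shared by add(), query() and optimize() reduces to the sketch below. This helper is not part of vcflib and is shown for illustration only; with the TBI defaults (min_shift=14, depth=5) it reproduces the classic six-level tabix binning, while CSI simply permits other parameters.

def reg2bin(s, e, min_shift=14, depth=5):
    """Smallest bin fully containing the zero-based half-open interval [s, e)."""
    max_shift = min_shift + depth * 3
    for shift in range(min_shift, max_shift + 3, 3):
        bs, be = s >> shift, (e - 1) >> shift
        if bs == be:
            # Bins of all coarser levels precede this level's bins,
            # hence the (8**level - 1) / 7 offset.
            bo = ((1 << (max_shift - shift)) - 1) // 7
            return bo + bs
    return 0

print(reg2bin(0, 1))         # 4681: first bin of the finest (16 kb) level
print(reg2bin(0, 1 << 29))   # 0: the root bin covering the whole contig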
