Skip to content

Commit

Permalink
Merge pull request #215 from raphael-group/clustering-gaps
Browse files Browse the repository at this point in the history
Add flag for cluster-bins to allow multiple gaps per chromosome
  • Loading branch information
mmyers1 authored Apr 26, 2024
2 parents 2033b6f + 1fddf30 commit ea01fab
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 55 deletions.
1 change: 1 addition & 0 deletions src/hatchet/hatchet.ini
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ covar = "diag"
tau = 0.000001
restarts = 10
subset=
allow_gaps=False

[plot_bins]
command =
Expand Down
10 changes: 9 additions & 1 deletion src/hatchet/utils/ArgParsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,13 @@ def parse_cluster_bins_args(args=None):
'List of sample names to use as a subset of those included in binning' '(default: none, run on all samples)'
),
)
parser.add_argument(
'--allow_gaps',
action='store_true',
required=False,
default=config.cluster_bins.allow_gaps,
help='Set this flag to allow gaps in chromosomes (each contiguous region is considered a separate "arm").',
)
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {__version__}')
args = parser.parse_args(args)

Expand Down Expand Up @@ -410,7 +417,7 @@ def parse_cluster_bins_args(args=None):
f'Invalid -s/--selection argument: {args.selection}. Valid values are {valid_selections}.',
)

valid_transmats = ('fixed', 'diag', 'tied')
valid_transmats = ('fixed', 'diag', 'free')
ensure(
args.transmat in valid_transmats,
f'Invalid -t/--transmat argument: {args.transmat}. Valid values are {valid_transmats}.',
Expand Down Expand Up @@ -449,6 +456,7 @@ def parse_cluster_bins_args(args=None):
'outbins': args.outbins,
'restarts': args.restarts,
'subset': args.subset,
'allow_gaps': args.allow_gaps,
}


Expand Down
135 changes: 81 additions & 54 deletions src/hatchet/utils/cluster_bins.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def main(args=None):
sp.logArgs(args, 80)

sp.log(msg='# Reading the combined BB file\n', level='STEP')
tracks, bb, sample_labels, chr_labels = read_bb(args['bbfile'], subset=args['subset'])
tracks, bb, sample_labels, chr_labels = read_bb(
args['bbfile'], subset=args['subset'], allow_gaps=args['allow_gaps']
)

if args['exactK'] > 0:
minK = args['exactK']
Expand Down Expand Up @@ -61,31 +63,33 @@ def main(args=None):

sp.log(msg='# Checking consistency of results\n', level='STEP')
pivot_check = bb.pivot(index=['#CHR', 'START', 'END'], columns='SAMPLE', values='CLUSTER')
# Verify that the array lengths and order match the bins in the BB file
chr_idx = 0
bin_indices = pivot_check.index.to_numpy()
i = 0
while chr_idx < len(tracks):
my_chr = chr_labels[chr_idx][:-2]

start_row = bin_indices[i]
assert str(start_row[0]) == my_chr, (start_row[0], my_chr)

prev_end = start_row[-1]

start_idx = i
i += 1
while i < len(bin_indices) and bin_indices[i][0] == start_row[0] and bin_indices[i][1] == prev_end:
prev_end = bin_indices[i][2]
i += 1

# check the array lengths
assert tracks[chr_idx].shape[1] == i - start_idx, (
tracks[chr_idx].shape[1],
i - start_idx,
)
if not args['allow_gaps']:
# Verify that the array lengths and order match the bins in the BB file
chr_idx = 0
bin_indices = pivot_check.index.to_numpy()
i = 0
while chr_idx < len(tracks):
my_chr = chr_labels[chr_idx][:-2]

start_row = bin_indices[i]
assert str(start_row[0]) == my_chr, (start_row[0], my_chr)

prev_end = start_row[-1]

start_idx = i
i += 1
while i < len(bin_indices) and bin_indices[i][0] == start_row[0] and bin_indices[i][1] == prev_end:
prev_end = bin_indices[i][2]
i += 1

# check the array lengths
assert tracks[chr_idx].shape[1] == i - start_idx, (
tracks[chr_idx].shape[1],
i - start_idx,
)

chr_idx += 1
chr_idx += 1

# Verify that cluster labels were applied correctly
cl_check = pivot_check.to_numpy().T
Expand Down Expand Up @@ -115,7 +119,7 @@ def main(args=None):
sp.log(msg='# Done\n', level='STEP')


def read_bb(bbfile, subset=None):
def read_bb(bbfile, subset=None, allow_gaps=False):
"""
Constructs arrays to represent the bin in each chromosome or arm.
If bbfile was binned around chromosome arm, then uses chromosome arms.
Expand Down Expand Up @@ -145,44 +149,67 @@ def read_bb(bbfile, subset=None):
for ch, df0 in bb.groupby('#CHR'):
df0 = df0.sort_values('START')

p_arrs = []
q_arrs = []
if allow_gaps:
pt = df0.pivot(index=['START', 'END'], columns='SAMPLE', values=['RD', 'BAF']).reset_index()
sample_labels = list(pt['RD'].columns)
populated_labels = True
gaps = np.where(pt.START.to_numpy()[1:] - pt.END.to_numpy()[:-1] > 0)[0] + 1

for sample, df in df0.groupby('SAMPLE'):
if not populated_labels:
sample_labels.append(sample)
chunks = []
prev_end = 0
for gap in gaps:
chunks.append(pt.iloc[prev_end:gap])
prev_end = gap

gaps = np.where(df.START.to_numpy()[1:] - df.END.to_numpy()[:-1] > 0)[0]
# print(ch, gaps)
if prev_end < len(pt):
chunks.append(pt.iloc[prev_end:])

if len(gaps) > 0:
assert len(gaps) == 1, 'Found a chromosome with >1 gaps between bins'
gap = gaps[0] + 1
for i, chunk in enumerate(chunks):
track = np.empty((len(sample_labels) * 2, len(chunk)))
track[0::2] = chunk.BAF.values.T
track[1::2] = chunk.RD.values.T
tracks.append(track)
chr_labels.append(f'{ch}_section{i}')

df_p = df.iloc[:gap]
df_q = df.iloc[gap:]
else:
p_arrs = []
q_arrs = []

p_arrs.append(df_p.BAF.to_numpy())
p_arrs.append(df_p.RD.to_numpy())
for sample, df in df0.groupby('SAMPLE'):
if not populated_labels:
sample_labels.append(sample)

q_arrs.append(df_q.BAF.to_numpy())
q_arrs.append(df_q.RD.to_numpy())
else:
df_p = df
p_arrs.append(df_p.BAF.to_numpy())
p_arrs.append(df_p.RD.to_numpy())
gaps = np.where(df.START.to_numpy()[1:] - df.END.to_numpy()[:-1] > 0)[0]
# print(ch, gaps)

if len(q_arrs) > 0:
tracks.append(np.array(p_arrs))
chr_labels.append(str(ch) + '_p')
if len(gaps) > 0:
assert len(gaps) == 1, 'Found a chromosome with >1 gaps between bins'
gap = gaps[0] + 1

tracks.append(np.array(q_arrs))
chr_labels.append(str(ch) + '_q')
else:
tracks.append(np.array(p_arrs))
chr_labels.append(str(ch) + '_p')
df_p = df.iloc[:gap]
df_q = df.iloc[gap:]

p_arrs.append(df_p.BAF.to_numpy())
p_arrs.append(df_p.RD.to_numpy())

q_arrs.append(df_q.BAF.to_numpy())
q_arrs.append(df_q.RD.to_numpy())
else:
df_p = df
p_arrs.append(df_p.BAF.to_numpy())
p_arrs.append(df_p.RD.to_numpy())

if len(q_arrs) > 0:
tracks.append(np.array(p_arrs))
chr_labels.append(str(ch) + '_p')

tracks.append(np.array(q_arrs))
chr_labels.append(str(ch) + '_q')
else:
tracks.append(np.array(p_arrs))
chr_labels.append(str(ch) + '_p')

populated_labels = True
populated_labels = True

return (
tracks,
Expand Down

0 comments on commit ea01fab

Please sign in to comment.