@@ -16,6 +16,34 @@ kernelspec:
16
16
17
17
(sec_counting_topologies)=
18
18
19
+ ``` {code-cell} ipython3
20
+ :tags: [remove-cell]
21
+ import msprime
22
+ import stdpopsim
23
+
24
+ def topologies_sim_speciestree():
25
+ newick_species_tree = "((A:100.0,B:100.0):100.0,C:200.0)"
26
+ demography = msprime.Demography.from_species_tree(newick_species_tree, initial_size=100)
27
+ ts = msprime.sim_ancestry({0: 2, 1: 2, 2: 2}, demography=demography, random_seed=321)
28
+ ts.dump("data/topologies_sim_speciestree.trees")
29
+
30
+ def topologies_sim_stdpopsim():
31
+ species = stdpopsim.get_species("HomSap")
32
+ model = species.get_demographic_model("OutOfAfrica_3G09")
33
+ contig = species.get_contig("chr1", length_multiplier=0.0002, mutation_rate=model.mutation_rate)
34
+ samples = {"YRI": 1000, "CEU": 1000, "CHB": 1000}
35
+ engine = stdpopsim.get_engine("msprime")
36
+ ts = engine.simulate(model, contig, samples, seed=321)
37
+ ts.dump("data/topologies_sim_stdpopsim.trees")
38
+
39
+
40
+ def create_notebook_data():
41
+ topologies_sim_speciestree()
42
+ topologies_sim_stdpopsim()
43
+
44
+ # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
45
+ ```
46
+
19
47
# Counting topologies
20
48
21
49
** Yan Wong**
@@ -225,21 +253,22 @@ one tip from each group using the {meth}`Tree.count_topologies` method.
225
253
226
254
``` {code-cell}
227
255
:tags: [hide-input]
228
-
229
- newick_species_tree = "((A:100.0,B:100.0):100.0,C:200.0)"
230
- demography = msprime.Demography.from_species_tree(newick_species_tree, initial_size=100)
231
- ts = msprime.sim_ancestry({0: 2, 1: 2, 2: 2}, demography=demography, random_seed=321)
232
- big_tree = ts.first()
256
+ big_tree = tskit.load("data/topologies_sim_speciestree.trees").first()
257
+ # Check all observed topologies have the same counts
258
+ assert list(big_tree.count_topologies()[0, 1, 2].values()) == [32, 32]
233
259
styles = [
234
- f".node.sample.p{p.id} > .sym " + "{" + f"fill: {colour}" + "}"
235
- for colour, p in zip(['red', 'green', 'blue'], big_tree.tree_sequence.populations())
260
+ f".node.sample.p{p.id} > .sym " + "{" + f"fill: {colour}" + "}"
261
+ for colour, p in zip(['red', 'green', 'blue'], big_tree.tree_sequence.populations())
236
262
]
237
263
big_tree.draw_svg(style="".join(styles), node_labels={}, time_scale="rank", x_label="big_tree")
238
264
```
239
265
240
- In this tree, it is fairly clear that the red and green tips cluster together (although not exclusively),
241
- so that if we took one red, one blue, and one green tip at random, we nearly always see the
242
- same embedded topology. The {meth}` Tree.count_topologies ` method does this exhaustively:
266
+ In this tree, it is clear that the green and blue tips never cluster together.
267
+ The {meth}` Tree.count_topologies ` method exhaustively looks at all
268
+ combinations of one red, one blue, and one green tip, and confirms that we never see
269
+ the topology grouping green and blue. However, as might be expected from
270
+ examination of the plot above, a red tip is equally likely to be a sister to a
271
+ green tip as to a blue tip:
243
272
244
273
``` {code-cell}
245
274
# By default `count_topologies` chooses one tip from each population, like setting
@@ -268,27 +297,24 @@ do this [incrementally](sec_incremental), without having to re-count the topolog
268
297
independently in each tree.
269
298
270
299
``` {code-cell}
271
- import stdpopsim
272
- species = stdpopsim.get_species("HomSap")
273
- model = species.get_demographic_model("OutOfAfrica_3G09")
274
- contig = species.get_contig("chr1", length_multiplier=0.0002, mutation_rate=model.mutation_rate)
275
- samples = {"YRI": 1000, "CEU": 1000, "CHB": 1000}
276
- engine = stdpopsim.get_engine("msprime")
277
- ts = engine.simulate(model, contig, samples)
278
- print("Simulated", ts.num_trees, "African+European+Chinese trees, each with", ts.num_samples, "tips")
300
+ :tags: [hide-input]
301
+ from myst_nb import glue
302
+ ts = tskit.load("data/topologies_sim_stdpopsim.trees")
303
+ print(f"Loaded a stdpopsim of {ts.num_trees} African+European+Chinese trees, each with {ts.num_samples} tips")
304
+ glue("seq_len", int(ts.sequence_length/1000), display=False)
279
305
```
280
306
281
307
Although the trees in this tree sequence are very large, counting the embedded topologies is
282
- quite doable (for speed we are only simulating 0.02% of chromosome 1 , but calculating the
283
- average over an entire chromosome simply takes a little longer)
308
+ quite doable (for speed in this demo we are only simulating {glue:} ` seq_len ` kilobases , but
309
+ calculating the average over an entire chromosome simply takes a little longer)
284
310
285
311
``` {code-cell}
286
312
from datetime import datetime
287
313
names = {"YRI": "African", "CEU": "European", "CHB": "Chinese"}
288
314
colours = {"YRI": "yellow", "CEU": "green", "CHB": "blue"}
289
315
290
316
population_map = {p.metadata["id"]: p.id for p in ts.populations()}
291
- sample_populations = [population_map[name] for name in samples.keys()]
317
+ sample_populations = list(sorted({ts.node(u).population for u in ts.samples()}))
292
318
topology_span = {tree.rank(): 0 for tree in tskit.all_trees(len(sample_populations))}
293
319
294
320
start = datetime.now()
@@ -320,5 +346,9 @@ for rank, weight in topology_span.items():
320
346
display(embedded_tree.draw_svg(size=(160, 150), style="".join(styles), node_labels=node_labels, x_label=label))
321
347
```
322
348
349
+ Perhaps unsurprisingly, the most common topology is the one that groups the non-African
350
+ populations together (although there are many trees of the other two topologies,
351
+ mostly reflecting genetic divergence prior to the emergence of humans out of Africa).
352
+
323
353
For an example with real data, see {ref}` sec_popgen_topological `
324
354
in the {ref}` sec_intro_popgen ` tutorial.
0 commit comments