Stop using odgi's Python bindings in calyx_depth.py #116

Merged
merged 38 commits into main from no-odgi-python
Nov 17, 2023

Commits (38)
aa6ce72
Make flake8 happy with parse_data.py
anshumanmohan Jul 10, 2023
b8aefac
No more odgi bindings in parse_data
anshumanmohan Jul 10, 2023
dc3a6b8
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 13, 2023
d279b81
Nix invalid files at make clean
anshumanmohan Jul 13, 2023
c8639e4
Nix invalid graphs at make clean:
anshumanmohan Jul 13, 2023
3f0dab7
revert
anshumanmohan Jul 13, 2023
d1c069a
Whitespace in string concat
anshumanmohan Jul 13, 2023
a06466c
Error, not Exception
anshumanmohan Jul 13, 2023
53f2d3a
Trying to make exine take a gfa
anshumanmohan Jul 14, 2023
2eb4182
Force-add gfas
anshumanmohan Jul 14, 2023
b3d20f3
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 14, 2023
f9ad384
More whitespace in error messages
anshumanmohan Jul 14, 2023
f5917a6
add make target for all gfa files, replace .og symlinks with .gfa sym…
susan-garry Jul 17, 2023
1926f65
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 18, 2023
2b4d596
Open graph correctly
anshumanmohan Jul 18, 2023
c687c71
Revert all my changes to parse_data.py
anshumanmohan Jul 20, 2023
07dcfc5
Revert changes to mygfa
anshumanmohan Jul 20, 2023
6bd8e31
Lift get_dimensions to calyx_depth
anshumanmohan Jul 20, 2023
8cb722e
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 20, 2023
1be0f85
Steps towards using GFA files all around
anshumanmohan Jul 20, 2023
78a7e95
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 20, 2023
bbafd0b
misc debugging
susan-garry Aug 28, 2023
b9f559d
Impersonating Susan to commit
susan-garry Oct 2, 2023
284dab2
update some changes from main
susan-garry Oct 11, 2023
7a5e5a9
add more test files
susan-garry Oct 11, 2023
19cc105
merge main
susan-garry Oct 11, 2023
6b4e657
typos
susan-garry Oct 11, 2023
489d736
debugging new pollen_data_gen
susan-garry Oct 11, 2023
2697c74
don't keep track of path indexes directly in mygfa
susan-garry Oct 16, 2023
1cc1089
minor bug fixes
susan-garry Oct 16, 2023
1d1635e
add no-test-flip4.gfa, inadvertently deleted and readded as no-test-f…
susan-garry Oct 16, 2023
ba290e0
mygfa uses OrderedDict for paths, simple gfa parser can accept subset_p…
susan-garry Oct 30, 2023
b2a587b
simple.py pollen data gen accepts subset_paths
susan-garry Oct 30, 2023
31b49db
restore handmade tests to their state immediately before the parser…
susan-garry Oct 30, 2023
f8a36dc
add newline to end of gfa files
susan-garry Oct 30, 2023
7de74ec
fix typo so test-depth works
susan-garry Oct 31, 2023
c253dd3
black formatter pass
susan-garry Oct 31, 2023
57a359e
fix type-checking stuff i hope
susan-garry Nov 2, 2023
8 changes: 4 additions & 4 deletions Makefile
@@ -10,14 +10,14 @@ fetch: $(TEST_FILES:%=tests/%.gfa)

og: $(OG_FILES)

test: og test-depth
test: fetch test-depth

test-depth: og
test-depth: fetch og
-turnt --save --env baseline tests/depth/subset-paths/*.txt
turnt tests/depth/subset-paths/*.txt
turnt --env calyx-depth tests/depth/subset-paths/*.txt

-turnt --save --env baseline $(DEPTH_OG_FILES)
turnt $(DEPTH_OG_FILES)
turnt --env calyx $(DEPTH_OG_FILES)


test-data-gen: og
8 changes: 5 additions & 3 deletions mygfa/mygfa/mygfa.py
@@ -1,8 +1,10 @@
import re
import sys

from collections import OrderedDict
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict, TextIO, Iterator
from enum import Enum
import re
from typing import List, Tuple, Optional, Dict, TextIO, Iterator


def parse_orientation(ori: str) -> bool:
@@ -274,7 +276,7 @@ class Graph:
@classmethod
def parse(cls, infile: TextIO) -> "Graph":
"""Parse a GFA file."""
graph = Graph([], {}, [], {})
graph = Graph([], {}, [], OrderedDict())

for line in nonblanks(infile):
fields = line.split()
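Storing paths in an OrderedDict makes the iteration order of graph.paths an explicit part of the contract, which matters because depth.py derives 1-based numeric path IDs by enumerating it. A minimal sketch of that dependency (the tiny GFA string and path names are made up for illustration):

import io
from collections import OrderedDict

from mygfa import mygfa

# A toy graph: two segments and two paths, x and y.
GFA = "S\t1\tAAAA\nS\t2\tTG\nP\tx\t1+,2+\t*\nP\ty\t1+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))
assert isinstance(graph.paths, OrderedDict)

# depth.py assigns IDs in insertion order, starting from 1.
path2id = {path: id for id, path in enumerate(graph.paths, start=1)}
print(path2id)  # {'x': 1, 'y': 2}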
6 changes: 5 additions & 1 deletion mygfa/mygfa/preprocess.py
@@ -62,7 +62,11 @@ def pathseq(graph: mygfa.Graph) -> Dict[str, str]:


def get_maxes(graph: mygfa.Graph) -> Tuple[int, int, int]:
"""Return the maximum number of nodes, steps, and paths in the graph."""
"""Given a graph, returns:
- the number of nodes
- the maximum number of steps in a path
- the number of paths in the graph.
"""
max_nodes = len(graph.segments)
max_steps = max([len(steps) for steps in node_steps(graph).values()])
max_paths = len(graph.paths)
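To make the expanded docstring concrete, here is a worked example on a made-up graph: three segments, where paths x and y both cross segments 1 and 3, so segments 1 and 3 each carry two steps.

import io

from mygfa import mygfa, preprocess

# Hypothetical three-segment, two-path graph.
GFA = "S\t1\tAA\nS\t2\tTG\nS\t3\tC\nP\tx\t1+,2+,3+\t*\nP\ty\t1+,3+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))

# 3 segments; at most 2 steps through any one segment; 2 paths.
print(preprocess.get_maxes(graph))  # (3, 2, 2)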
54 changes: 31 additions & 23 deletions pollen_data_gen/pollen_data_gen/__main__.py
@@ -1,6 +1,7 @@
import sys
import argparse
from mygfa import mygfa
from typing import List

from . import depth, simple

@@ -16,26 +17,23 @@ def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]:
simple_parser = subparsers.add_parser(
"simple", help="Produces a simple JSON serialization of the graph."
)
# Optional arguments - argparse automatically infers flags beginning with '-' as optional
simple_parser.add_argument(
"-n",
nargs="?",
const="d",
help="The max number of nodes.",
required=False,
)
simple_parser.add_argument(
"-e",
nargs="?",
const="d",
help="The max number of steps per node.",
required=False,
)
simple_parser.add_argument(
"-p",
nargs="?",
const="d",
help="The max number of paths.",
required=False,
)
simple_parser.add_argument(
"-s",
"--subset-paths",
help="A file where each line is a path of the graph to consider when calculating node depth",
)

_ = subparsers.add_parser(
@@ -48,46 +46,56 @@ def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]:
)
depth_parser.add_argument(
"-n",
nargs="?",
const="d",
help="The max number of nodes.",
required=False,
)
depth_parser.add_argument(
"-e",
nargs="?",
const="d",
help="The max number of steps per node.",
required=False,
)
depth_parser.add_argument(
"-p",
nargs="?",
const="d",
help="The max number of paths.",
required=False,
)
depth_parser.add_argument(
"-s",
"--subset-paths",
help="A file where each line is a path of the graph to consider when calculating node depth",
)

# Add the graph argument to all subparsers.
# Doing it this way means that the graph argument is sought _after_ the
# command name.
for subparser in subparsers.choices.values():
subparser.add_argument(
"graph", nargs="?", help="Input GFA file", metavar="GRAPH"
)
subparser.add_argument("graph", help="Input GFA file", metavar="GRAPH")

args = parser.parse_args()

return parser, args


def parse_subset_paths(filename: str) -> List[str]:
"""
Return a list of the names of paths in [filename]
"""

if filename is None: # Return the default value
return []

with open(filename, "r", encoding="utf-8") as paths_file:
text = paths_file.read()
return text.splitlines()


def dispatch(args: argparse.Namespace) -> None:
"""Parse the graph from filename,
then dispatch to the appropriate pollen_data_gen command.
"""
subset_paths = parse_subset_paths(args.subset_paths)
name_to_func = {
"depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p),
"simple": lambda g: simple.dump(g, sys.stdout, args.n, args.e, args.p),
"depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p, subset_paths),
"simple": lambda g: simple.dump(
g, sys.stdout, args.n, args.e, args.p, subset_paths
),
"roundtrip": simple.roundtrip_test,
}
graph = mygfa.Graph.parse(open(args.graph, "r", encoding="utf-8"))
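The new -s/--subset-paths flag reads one path name per line and threads the resulting list into both subcommands via dispatch. A sketch of the equivalent programmatic call, reusing the toy graph from above (the file names in the comment are hypothetical; passing None for the maxima lets depth_json fall back to the graph's tight bounds, just as when the -n/-e/-p flags are omitted):

import io

from mygfa import mygfa
from pollen_data_gen import depth

# CLI equivalent (hypothetical files):
#   python -m pollen_data_gen depth --subset-paths paths.txt graph.gfa
# where paths.txt contains the single line "x".
GFA = "S\t1\tAAAA\nS\t2\tTG\nP\tx\t1+,2+\t*\nP\ty\t1+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))

# Only path "x" is considered when computing node depth.
depth.depth_stdout(graph, None, None, None, ["x"])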
49 changes: 40 additions & 9 deletions pollen_data_gen/pollen_data_gen/depth.py
@@ -1,5 +1,5 @@
import sys
from typing import Any, Collection, Dict, Union, Optional
from typing import Any, Collection, Dict, OrderedDict, Union, Optional, List
import json
from json import JSONEncoder
from mygfa import mygfa, preprocess
@@ -23,6 +23,7 @@ def paths_viewed_from_nodes(
path2id = {path: id for id, path in enumerate(graph.paths, start=1)}
output = {}
json_format = format_gen(max_p.bit_length())
# segment name, (path name, index on path, direction) list
for seg, crossings in preprocess.node_steps(graph).items():
data = list(path2id[c[0]] for c in crossings)
data = data + [0] * (max_e - len(data))
@@ -33,7 +34,9 @@
return output


def paths_to_consider(max_n: int, max_p: int) -> OutputType:
def paths_to_consider(
subset_paths_idx: List[int], max_n: int, max_p: int
) -> OutputType:
"""Currently just a stub; later we will populate this with a
bitvector of length MAX_PATHS, where the i'th index will be 1 if
the i'th path is to be considered during depth calculation.
@@ -42,8 +45,15 @@ def paths_to_consider(max_n: int, max_p: int) -> OutputType:
are nodes in the graph.
"""
output = {}
data = []
if subset_paths_idx:
data = [0] * (max_p + 1)
for path_idx in subset_paths_idx:
data[path_idx] = 1
else:
data = [0] + ([1] * max_p)

for i in range(1, max_n + 1):
data = [0] + [1] * (max_p)
output[f"paths_to_consider{i}"] = {"data": data, "format": format_gen(1)}
return output

@@ -54,11 +64,25 @@ class NodeDepthEncoder(JSONEncoder):
The exine command `depth` is the oracle for this encoding.
"""

def __init__(self, max_n: int, max_e: int, max_p: int, **kwargs: Any) -> None:
def __init__(
self,
max_n: int,
max_e: int,
max_p: int,
subset_paths: Optional[List[str]],
**kwargs: Any,
) -> None:
super(NodeDepthEncoder, self).__init__(**kwargs)
self.max_n = max_n
self.max_e = max_e
self.max_p = max_p
self.subset_paths = subset_paths

def paths_to_idxs(self, o: mygfa.Graph) -> List[int]:
if not self.subset_paths:
return []
path2id = {path: id for id, path in enumerate(o.paths, start=1)}
return list(map(lambda p: path2id[p], self.subset_paths))

def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
answer_field = {
@@ -73,15 +97,20 @@ def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
"format": format_gen(self.max_p.bit_length()),
}
}
subset_paths_idx = self.paths_to_idxs(o)
paths = paths_viewed_from_nodes(
o, self.max_n, self.max_e, self.max_p
) | paths_to_consider(self.max_n, self.max_p)
) | paths_to_consider(subset_paths_idx, self.max_n, self.max_p)

return answer_field | paths | answer_field_uniq


def depth_json(
graph: mygfa.Graph, max_n: Optional[int], max_e: Optional[int], max_p: Optional[int]
graph: mygfa.Graph,
max_n: Optional[int],
max_e: Optional[int],
max_p: Optional[int],
subset_paths: Optional[List[str]],
) -> str:
"""Returns a JSON representation of `graph`
that is specific to the exine command `depth`.
@@ -97,13 +126,15 @@ def depth_json(
max_p = p_tight

return NodeDepthEncoder(
max_n=int(max_n), max_e=int(max_e), max_p=int(max_p)
max_n=int(max_n), max_e=int(max_e), max_p=int(max_p), subset_paths=subset_paths
).encode(graph)


def depth_stdout(graph: mygfa.Graph, max_n: int, max_e: int, max_p: int) -> None:
def depth_stdout(
graph: mygfa.Graph, max_n: int, max_e: int, max_p: int, subset_paths: List[str]
) -> None:
"""Prints a JSON representation of `graph` to stdout."""
encoding = depth_json(graph, max_n, max_e, max_p)
encoding = depth_json(graph, max_n, max_e, max_p, subset_paths)

json.dump(
json.loads(encoding),
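The core of the change is in paths_to_consider: instead of always emitting [0] + [1] * max_p (consider every path), it now sets a 1 only at each subsetted path's index; index 0 is padding because path IDs start at 1. A small illustration of both branches, with made-up values:

max_p = 4

# No subset: all paths (IDs 1..4) are considered.
data = [0] + [1] * max_p
assert data == [0, 1, 1, 1, 1]

# Subset resolving to path ID 2: only that bit is set.
subset_paths_idx = [2]
data = [0] * (max_p + 1)
for path_idx in subset_paths_idx:
    data[path_idx] = 1
assert data == [0, 0, 1, 0, 0]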
3 changes: 2 additions & 1 deletion pollen_data_gen/pollen_data_gen/simple.py
@@ -128,6 +128,7 @@ def dump(
max_n: Optional[int],
max_e: Optional[int],
max_p: Optional[int],
subset_paths: Optional[List[str]] = None,
) -> None:
"""Outputs the graph as a JSON, along with precomputed data for the
calculation of node depth.
@@ -140,7 +141,7 @@
| {f"path_details_{k}": v for k, v in graph.paths.items()}
)

depth_encoding = depth.depth_json(graph, max_n, max_e, max_p)
depth_encoding = depth.depth_json(graph, max_n, max_e, max_p, subset_paths)

json.dump(
{
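Giving subset_paths a None default keeps pre-existing callers of simple.dump working unchanged; downstream, NodeDepthEncoder.paths_to_idxs maps a missing subset to the empty list, which paths_to_consider treats as "consider every path". A minimal sketch, reusing the toy graph from the earlier examples:

import io
import sys

from mygfa import mygfa
from pollen_data_gen import simple

GFA = "S\t1\tAAAA\nS\t2\tTG\nP\tx\t1+,2+\t*\nP\ty\t1+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))

# Old-style call: no subset_paths argument, so every path is considered.
simple.dump(graph, sys.stdout, None, None, None)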
34 changes: 33 additions & 1 deletion pollen_py/pollen/depth/calyx_depth.py
@@ -4,6 +4,14 @@
from calyx.py_ast import *
from . import parse_data

# from mygfa import mygfa, preprocess

# Defaults for the maximum possible number of nodes,
# steps per node, and paths to consider
MAX_NODES = 16
MAX_STEPS = 15
MAX_PATHS = 15


def node_depth(max_nodes, max_steps, max_paths):
stdlib = Stdlib()
@@ -545,9 +553,33 @@ def config_parser(parser):
)


# def get_maxes(filename):
# print("In `get_maxes`. Filename: ", filename)
# """Returns the maximum number of nodes, steps per node, and paths."""
# with open(filename, "r", encoding="utf-8") as infile:
# graph = mygfa.Graph.parse(infile)
# return preprocess.get_maxes(graph)


# def get_dimensions(args):
# """
# Compute the node depth accelerator's dimensions from commandline input.
# """
# if args.auto_size:
# filename = args.filename if args.auto_size == "d" else args.auto_size
# max_nodes, max_steps, max_paths = get_maxes(filename)
# else:
# max_nodes, max_steps, max_paths = MAX_NODES, MAX_STEPS, MAX_PATHS

# max_nodes = args.max_nodes if args.max_nodes else max_nodes
# max_steps = args.max_steps if args.max_steps else max_steps
# max_paths = args.max_paths if args.max_paths else max_paths

# return max_nodes, max_steps, max_paths


def run(args):
max_nodes, max_steps, max_paths = parse_data.get_dimensions(args)

program = node_depth(max_nodes, max_steps, max_paths)
output = program.doc()

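With the dimensions resolved (by parse_data.get_dimensions, or by the new MAX_NODES/MAX_STEPS/MAX_PATHS defaults when nothing else is specified), run reduces to generating and printing the Calyx program. A minimal sketch, assuming the imports at the top of calyx_depth.py:

# Emit a node-depth accelerator sized by the module defaults:
# 16 nodes, 15 steps per node, 15 paths to consider.
program = node_depth(MAX_NODES, MAX_STEPS, MAX_PATHS)
print(program.doc())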