Stop using odgi's Python bindings in calyx_depth.py #116

Merged
merged 38 commits into main from no-odgi-python
Nov 17, 2023

Commits (38)
aa6ce72
Make flake8 happy with parse_data.py
anshumanmohan Jul 10, 2023
b8aefac
No more odgi bindings in parse_data
anshumanmohan Jul 10, 2023
dc3a6b8
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 13, 2023
d279b81
Nix invalid files at make clean
anshumanmohan Jul 13, 2023
c8639e4
Nix invalid graphs at make clean:
anshumanmohan Jul 13, 2023
3f0dab7
revert
anshumanmohan Jul 13, 2023
d1c069a
Whitespace in string concat
anshumanmohan Jul 13, 2023
a06466c
Error, not Exception
anshumanmohan Jul 13, 2023
53f2d3a
Trying to make exine take a gfa
anshumanmohan Jul 14, 2023
2eb4182
Force-add gfas
anshumanmohan Jul 14, 2023
b3d20f3
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 14, 2023
f9ad384
More whitespace in error messages
anshumanmohan Jul 14, 2023
f5917a6
add make target for all gfa files, replace .og symlinks with .gfa sym…
susan-garry Jul 17, 2023
1926f65
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 18, 2023
2b4d596
Open graph correctly
anshumanmohan Jul 18, 2023
c687c71
Revert all my changes to parse_data.py
anshumanmohan Jul 20, 2023
07dcfc5
Revert changes to mygfa
anshumanmohan Jul 20, 2023
6bd8e31
Lift get_dimensions to calyx_depth
anshumanmohan Jul 20, 2023
8cb722e
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 20, 2023
1be0f85
Steps towards using GFA files all around
anshumanmohan Jul 20, 2023
78a7e95
Merge branch 'main' into no-odgi-python
anshumanmohan Jul 20, 2023
bbafd0b
misc debugging
susan-garry Aug 28, 2023
b9f559d
Impersonating Susan to commit
susan-garry Oct 2, 2023
284dab2
update some changes from main
susan-garry Oct 11, 2023
7a5e5a9
add more test files
susan-garry Oct 11, 2023
19cc105
merge main
susan-garry Oct 11, 2023
6b4e657
typos
susan-garry Oct 11, 2023
489d736
debugging new pollen_data_gen
susan-garry Oct 11, 2023
2697c74
don't keep track of path indexes directly in mygfa
susan-garry Oct 16, 2023
1cc1089
minor bug fixes
susan-garry Oct 16, 2023
1d1635e
add no-test-flip4.gfa, inadvertently deleted and readded as no-test-f…
susan-garry Oct 16, 2023
ba290e0
mygfa uses OrderedDict for paths, simple gfa parser can accept subset_p…
susan-garry Oct 30, 2023
b2a587b
simple.py pollen data gen accepts subset_paths
susan-garry Oct 30, 2023
31b49db
restore handmade tests to their state immediately before the parser…
susan-garry Oct 30, 2023
f8a36dc
add newline to end of gfa files
susan-garry Oct 30, 2023
7de74ec
fix typo so test-depth works
susan-garry Oct 31, 2023
c253dd3
black formatter pass
susan-garry Oct 31, 2023
57a359e
fix type-checking stuff i hope
susan-garry Nov 2, 2023
8 changes: 4 additions & 4 deletions Makefile
@@ -10,14 +10,14 @@ fetch: $(TEST_FILES:%=tests/%.gfa)

og: $(OG_FILES)

test: og test-depth
test: fetch test-depth

test-depth: og
test-depth: fetch og
-turnt --save --env baseline tests/depth/subset-paths/*.txt
turnt tests/depth/subset-paths/*.txt
turnt --env calyx-depth tests/depth/subset-paths/*.txt

-turnt --save --env baseline $(DEPTH_OG_FILES)
turnt $(DEPTH_OG_FILES)
turnt --env calyx $(DEPTH_OG_FILES)


test-data-gen: og
8 changes: 5 additions & 3 deletions mygfa/mygfa/mygfa.py
@@ -1,8 +1,10 @@
import re
import sys

from collections import OrderedDict
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict, TextIO, Iterator
from enum import Enum
import re
from typing import List, Tuple, Optional, Dict, TextIO, Iterator


def parse_orientation(ori: str) -> bool:
@@ -274,7 +276,7 @@ class Graph:
@classmethod
def parse(cls, infile: TextIO) -> "Graph":
"""Parse a GFA file."""
graph = Graph([], {}, [], {})
graph = Graph([], {}, [], OrderedDict())

for line in nonblanks(infile):
fields = line.split()
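Storing paths in an OrderedDict makes the iteration order of graph.paths an explicit part of the contract, which matters because depth.py derives 1-based numeric path IDs by enumerating it. A minimal sketch of that dependency (the tiny GFA string and path names are made up for illustration):

import io
from collections import OrderedDict

from mygfa import mygfa

# A toy graph: two segments and two paths, x and y.
GFA = "S\t1\tAAAA\nS\t2\tTG\nP\tx\t1+,2+\t*\nP\ty\t1+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))
assert isinstance(graph.paths, OrderedDict)

# depth.py assigns IDs in insertion order, starting from 1.
path2id = {path: id for id, path in enumerate(graph.paths, start=1)}
print(path2id)  # {'x': 1, 'y': 2}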
6 changes: 5 additions & 1 deletion mygfa/mygfa/preprocess.py
@@ -62,7 +62,11 @@ def pathseq(graph: mygfa.Graph) -> Dict[str, str]:


def get_maxes(graph: mygfa.Graph) -> Tuple[int, int, int]:
"""Return the maximum number of nodes, steps, and paths in the graph."""
"""Given a graph, returns:
- the number of nodes
- the maximum number of steps in a path
- the number of paths in the graph.
"""
max_nodes = len(graph.segments)
max_steps = max([len(steps) for steps in node_steps(graph).values()])
max_paths = len(graph.paths)
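To make the expanded docstring concrete, here is a worked example on a made-up graph: three segments, where paths x and y both cross segments 1 and 3, so segments 1 and 3 each carry two steps.

import io

from mygfa import mygfa, preprocess

# Hypothetical three-segment, two-path graph.
GFA = "S\t1\tAA\nS\t2\tTG\nS\t3\tC\nP\tx\t1+,2+,3+\t*\nP\ty\t1+,3+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))

# 3 segments; at most 2 steps through any one segment; 2 paths.
print(preprocess.get_maxes(graph))  # (3, 2, 2)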
54 changes: 31 additions & 23 deletions pollen_data_gen/pollen_data_gen/__main__.py
@@ -1,6 +1,7 @@
import sys
import argparse
from mygfa import mygfa
from typing import List

from . import depth, simple

@@ -16,26 +17,23 @@ def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]:
simple_parser = subparsers.add_parser(
"simple", help="Produces a simple JSON serialization of the graph."
)
# Optional arguments - argparse automatically infers flags beginning with '-' as optional
simple_parser.add_argument(
"-n",
nargs="?",
const="d",
help="The max number of nodes.",
required=False,
)
simple_parser.add_argument(
"-e",
nargs="?",
const="d",
help="The max number of steps per node.",
required=False,
)
simple_parser.add_argument(
"-p",
nargs="?",
const="d",
help="The max number of paths.",
required=False,
)
simple_parser.add_argument(
"-s",
"--subset-paths",
help="A file where each line is a path of the graph to consider when calculating node depth",
)

_ = subparsers.add_parser(
@@ -48,46 +46,56 @@ def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]:
)
depth_parser.add_argument(
"-n",
nargs="?",
const="d",
help="The max number of nodes.",
required=False,
)
depth_parser.add_argument(
"-e",
nargs="?",
const="d",
help="The max number of steps per node.",
required=False,
)
depth_parser.add_argument(
"-p",
nargs="?",
const="d",
help="The max number of paths.",
required=False,
)
depth_parser.add_argument(
"-s",
"--subset-paths",
help="A file where each line is a path of the graph to consider when calculating node depth",
)

# Add the graph argument to all subparsers.
# Doing it this way means that the graph argument is sought _after_ the
# command name.
for subparser in subparsers.choices.values():
subparser.add_argument(
"graph", nargs="?", help="Input GFA file", metavar="GRAPH"
)
subparser.add_argument("graph", help="Input GFA file", metavar="GRAPH")

args = parser.parse_args()

return parser, args


def parse_subset_paths(filename: str) -> List[str]:
"""
Return a list of the names of paths in [filename]
"""

if filename is None: # Return the default value
return []

with open(filename, "r", encoding="utf-8") as paths_file:
text = paths_file.read()
return text.splitlines()


def dispatch(args: argparse.Namespace) -> None:
"""Parse the graph from filename,
then dispatch to the appropriate pollen_data_gen command.
"""
subset_paths = parse_subset_paths(args.subset_paths)
name_to_func = {
"depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p),
"simple": lambda g: simple.dump(g, sys.stdout, args.n, args.e, args.p),
"depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p, subset_paths),
"simple": lambda g: simple.dump(
g, sys.stdout, args.n, args.e, args.p, subset_paths
),
"roundtrip": simple.roundtrip_test,
}
graph = mygfa.Graph.parse(open(args.graph, "r", encoding="utf-8"))
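The new -s/--subset-paths flag reads one path name per line and threads the resulting list into both subcommands via dispatch. A sketch of the equivalent programmatic call, reusing the toy graph from above (the file names in the comment are hypothetical; passing None for the maxima lets depth_json fall back to the graph's tight bounds, just as when the -n/-e/-p flags are omitted):

import io

from mygfa import mygfa
from pollen_data_gen import depth

# CLI equivalent (hypothetical files):
#   python -m pollen_data_gen depth --subset-paths paths.txt graph.gfa
# where paths.txt contains the single line "x".
GFA = "S\t1\tAAAA\nS\t2\tTG\nP\tx\t1+,2+\t*\nP\ty\t1+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))

# Only path "x" is considered when computing node depth.
depth.depth_stdout(graph, None, None, None, ["x"])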
49 changes: 40 additions & 9 deletions pollen_data_gen/pollen_data_gen/depth.py
@@ -1,5 +1,5 @@
import sys
from typing import Any, Collection, Dict, Union, Optional
from typing import Any, Collection, Dict, OrderedDict, Union, Optional, List
import json
from json import JSONEncoder
from mygfa import mygfa, preprocess
@@ -23,6 +23,7 @@ def paths_viewed_from_nodes(
path2id = {path: id for id, path in enumerate(graph.paths, start=1)}
output = {}
json_format = format_gen(max_p.bit_length())
# segment name, (path name, index on path, direction) list
for seg, crossings in preprocess.node_steps(graph).items():
data = list(path2id[c[0]] for c in crossings)
data = data + [0] * (max_e - len(data))
@@ -33,7 +34,9 @@
return output


def paths_to_consider(max_n: int, max_p: int) -> OutputType:
def paths_to_consider(
subset_paths_idx: List[int], max_n: int, max_p: int
) -> OutputType:
"""Currently just a stub; later we will populate this with a
bitvector of length MAX_PATHS, where the i'th index will be 1 if
the i'th path is to be considered during depth calculation.
@@ -42,8 +45,15 @@ def paths_to_consider(max_n: int, max_p: int) -> OutputType:
are nodes in the graph.
"""
output = {}
data = []
if subset_paths_idx:
data = [0] * (max_p + 1)
for path_idx in subset_paths_idx:
data[path_idx] = 1
else:
data = [0] + ([1] * max_p)

for i in range(1, max_n + 1):
data = [0] + [1] * (max_p)
output[f"paths_to_consider{i}"] = {"data": data, "format": format_gen(1)}
return output

@@ -54,11 +64,25 @@ class NodeDepthEncoder(JSONEncoder):
The exine command `depth` is the oracle for this encoding.
"""

def __init__(self, max_n: int, max_e: int, max_p: int, **kwargs: Any) -> None:
def __init__(
self,
max_n: int,
max_e: int,
max_p: int,
subset_paths: Optional[List[str]],
**kwargs: Any,
) -> None:
super(NodeDepthEncoder, self).__init__(**kwargs)
self.max_n = max_n
self.max_e = max_e
self.max_p = max_p
self.subset_paths = subset_paths

def paths_to_idxs(self, o: mygfa.Graph) -> List[int]:
if not self.subset_paths:
return []
path2id = {path: id for id, path in enumerate(o.paths, start=1)}
return list(map(lambda p: path2id[p], self.subset_paths))

def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
answer_field = {
@@ -73,15 +97,20 @@ def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
"format": format_gen(self.max_p.bit_length()),
}
}
subset_paths_idx = self.paths_to_idxs(o)
paths = paths_viewed_from_nodes(
o, self.max_n, self.max_e, self.max_p
) | paths_to_consider(self.max_n, self.max_p)
) | paths_to_consider(subset_paths_idx, self.max_n, self.max_p)

return answer_field | paths | answer_field_uniq


def depth_json(
graph: mygfa.Graph, max_n: Optional[int], max_e: Optional[int], max_p: Optional[int]
graph: mygfa.Graph,
max_n: Optional[int],
max_e: Optional[int],
max_p: Optional[int],
subset_paths: Optional[List[str]],
) -> str:
"""Returns a JSON representation of `graph`
that is specific to the exine command `depth`.
@@ -97,13 +126,15 @@ def depth_json(
max_p = p_tight

return NodeDepthEncoder(
max_n=int(max_n), max_e=int(max_e), max_p=int(max_p)
max_n=int(max_n), max_e=int(max_e), max_p=int(max_p), subset_paths=subset_paths
).encode(graph)


def depth_stdout(graph: mygfa.Graph, max_n: int, max_e: int, max_p: int) -> None:
def depth_stdout(
graph: mygfa.Graph, max_n: int, max_e: int, max_p: int, subset_paths: List[str]
) -> None:
"""Prints a JSON representation of `graph` to stdout."""
encoding = depth_json(graph, max_n, max_e, max_p)
encoding = depth_json(graph, max_n, max_e, max_p, subset_paths)

json.dump(
json.loads(encoding),
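The core of the change is in paths_to_consider: instead of always emitting [0] + [1] * max_p (consider every path), it now sets a 1 only at each subsetted path's index; index 0 is padding because path IDs start at 1. A small illustration of both branches, with made-up values:

max_p = 4

# No subset: all paths (IDs 1..4) are considered.
data = [0] + [1] * max_p
assert data == [0, 1, 1, 1, 1]

# Subset resolving to path ID 2: only that bit is set.
subset_paths_idx = [2]
data = [0] * (max_p + 1)
for path_idx in subset_paths_idx:
    data[path_idx] = 1
assert data == [0, 0, 1, 0, 0]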
3 changes: 2 additions & 1 deletion pollen_data_gen/pollen_data_gen/simple.py
@@ -128,6 +128,7 @@ def dump(
max_n: Optional[int],
max_e: Optional[int],
max_p: Optional[int],
subset_paths: Optional[List[str]] = None,
) -> None:
"""Outputs the graph as a JSON, along with precomputed data for the
calculation of node depth.
@@ -140,7 +141,7 @@
| {f"path_details_{k}": v for k, v in graph.paths.items()}
)

depth_encoding = depth.depth_json(graph, max_n, max_e, max_p)
depth_encoding = depth.depth_json(graph, max_n, max_e, max_p, subset_paths)

json.dump(
{
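Giving subset_paths a None default keeps pre-existing callers of simple.dump working unchanged; downstream, NodeDepthEncoder.paths_to_idxs maps a missing subset to the empty list, which paths_to_consider treats as "consider every path". A minimal sketch, reusing the toy graph from the earlier examples:

import io
import sys

from mygfa import mygfa
from pollen_data_gen import simple

GFA = "S\t1\tAAAA\nS\t2\tTG\nP\tx\t1+,2+\t*\nP\ty\t1+\t*\n"
graph = mygfa.Graph.parse(io.StringIO(GFA))

# Old-style call: no subset_paths argument, so every path is considered.
simple.dump(graph, sys.stdout, None, None, None)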
34 changes: 33 additions & 1 deletion pollen_py/pollen/depth/calyx_depth.py
@@ -4,6 +4,14 @@
from calyx.py_ast import *
from . import parse_data

# from mygfa import mygfa, preprocess

# Defaults for the maximum possible number of nodes,
# steps per node, and paths to consider
MAX_NODES = 16
MAX_STEPS = 15
MAX_PATHS = 15


def node_depth(max_nodes, max_steps, max_paths):
stdlib = Stdlib()
@@ -545,9 +553,33 @@ def config_parser(parser):
)


# def get_maxes(filename):
# print("In `get_maxes`. Filename: ", filename)
# """Returns the maximum number of nodes, steps per node, and paths."""
# with open(filename, "r", encoding="utf-8") as infile:
# graph = mygfa.Graph.parse(infile)
# return preprocess.get_maxes(graph)


# def get_dimensions(args):
# """
# Compute the node depth accelerator's dimensions from commandline input.
# """
# if args.auto_size:
# filename = args.filename if args.auto_size == "d" else args.auto_size
# max_nodes, max_steps, max_paths = get_maxes(filename)
# else:
# max_nodes, max_steps, max_paths = MAX_NODES, MAX_STEPS, MAX_PATHS

# max_nodes = args.max_nodes if args.max_nodes else max_nodes
# max_steps = args.max_steps if args.max_steps else max_steps
# max_paths = args.max_paths if args.max_paths else max_paths

# return max_nodes, max_steps, max_paths


def run(args):
max_nodes, max_steps, max_paths = parse_data.get_dimensions(args)

program = node_depth(max_nodes, max_steps, max_paths)
output = program.doc()

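With the dimensions resolved (by parse_data.get_dimensions, or by the new MAX_NODES/MAX_STEPS/MAX_PATHS defaults when nothing else is specified), run reduces to generating and printing the Calyx program. A minimal sketch, assuming the imports at the top of calyx_depth.py:

# Emit a node-depth accelerator sized by the module defaults:
# 16 nodes, 15 steps per node, 15 paths to consider.
program = node_depth(MAX_NODES, MAX_STEPS, MAX_PATHS)
print(program.doc())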