From 89c276a561e08501dcb2212c6230a1247d0f1368 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 19 Jul 2024 16:25:50 -0700 Subject: [PATCH 01/36] Defer file virtualization to task boundaries and consolidate decl evaluation. Also gets rid of monkeypatching in favor of a manual function call --- src/toil/wdl/wdltoil.py | 321 ++++++++++++++-------------------------- 1 file changed, 113 insertions(+), 208 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 24405b1f75..197f555d64 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -71,7 +71,7 @@ unwrap_all) from toil.jobStores.abstractJobStore import (AbstractJobStore, UnimplementedURLException, InvalidImportExportUrlException, LocatorException) -from toil.lib.accelerators import count_nvidia_gpus, get_individual_local_accelerators +from toil.lib.accelerators import get_individual_local_accelerators from toil.lib.conversions import convert_units, human2bytes, strtobool from toil.lib.io import mkdtemp from toil.lib.memoize import memoize @@ -607,24 +607,26 @@ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str, str]: return file_id, parent_id, file_basename -def evaluate_output_decls(output_decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: WDL.StdLib.Base) -> WDL.Env.Bindings[WDL.Value.Base]: +def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: WDL.StdLib.Base, + include_previous: bool = False) -> WDL.Env.Bindings[WDL.Value.Base]: """ - Evaluate output decls with a given bindings environment and standard library. + Evaluate decls with a given bindings environment and standard library. Creates a new bindings object that only contains the bindings from the given decls. - Guarantees that each decl in `output_decls` can access the variables defined by the previous ones. + Guarantees that each decl in `decls` can access the variables defined by the previous ones. :param all_bindings: Environment to use when evaluating decls - :param output_decls: Decls to evaluate + :param decls: Decls to evaluate :param standard_library: Standard library - :return: New bindings object with only the output_decls - """ - # all_bindings contains output + previous bindings so that the output can reference its own declarations - # output_bindings only contains the output bindings themselves so that bindings from sections such as the input aren't included - output_bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings() - for output_decl in output_decls: - output_value = evaluate_decl(output_decl, all_bindings, standard_library) - all_bindings = all_bindings.bind(output_decl.name, output_value) - output_bindings = output_bindings.bind(output_decl.name, output_value) - return output_bindings + :param include_previous: Whether to include the existing environment in the new returned environment. This will be false for outputs where only defined decls should be included + :return: New bindings object with only the decls + """ + # all_bindings contains current bindings + previous all_bindings + # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included + bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings() + for each_decl in decls: + output_value = evaluate_defaultable_decl(each_decl, all_bindings, standard_library) + all_bindings = all_bindings.bind(each_decl.name, output_value) + bindings = bindings.bind(each_decl.name, output_value) + return all_bindings if include_previous else bindings class NonDownloadingSize(WDL.StdLib._Size): """ @@ -780,7 +782,7 @@ def _devirtualize_filename(self, filename: str) -> str: 'devirtualize' filename passed to a read_* function: return a filename that can be open()ed on the local host. """ - + result = self.devirtualize_to(filename, self._file_store.localTempDir, self._file_store, self._execution_dir, self._devirtualized_to_virtualized, self._virtualized_to_devirtualized) return result @@ -1316,95 +1318,6 @@ def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None: task_container.input_path_map[host_path] = container_path task_container.input_path_map_rev[container_path] = host_path -def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]] = None, skip_remote: bool = False) -> WDLBindings: - """ - Make sure all File values embedded in the given bindings are imported, - using the given Toil object. - - :param path: If set, try resolving input location relative to the URLs or - directories in this list. - - :param skip_remote: If set, don't try to import files from remote - locations. Leave them as URIs. - """ - path_to_id: Dict[str, uuid.UUID] = {} - @memoize - def import_file_from_uri(uri: str) -> str: - """ - Import a file from a URI and return a virtualized filename for it. - """ - - tried = [] - for candidate_uri in potential_absolute_uris(uri, path if path is not None else []): - # Try each place it could be according to WDL finding logic. - tried.append(candidate_uri) - try: - if skip_remote and is_url(candidate_uri): - # Use remote URIs in place. But we need to find the one that exists. - if not AbstractJobStore.url_exists(candidate_uri): - # Wasn't found there - continue - # Now we know this exists, so pass it through - return candidate_uri - else: - # Actually import - # Try to import the file. Don't raise if we can't find it, just - # return None! - imported = toil.import_file(candidate_uri, check_existence=False) - if imported is None: - # Wasn't found there - continue - logger.info('Imported %s', candidate_uri) - - except UnimplementedURLException as e: - # We can't find anything that can even support this URL scheme. - # Report to the user, they are probably missing an extra. - logger.critical('Error: ' + str(e)) - sys.exit(1) - except Exception: - # Something went wrong besides the file not being found. Maybe - # we have no auth. - logger.error("Something went wrong importing %s", candidate_uri) - raise - - if imported is None: - # Wasn't found there - continue - logger.info('Imported %s', candidate_uri) - - # Work out what the basename for the file was - file_basename = os.path.basename(urlsplit(candidate_uri).path) - - if file_basename == "": - # We can't have files with no basename because we need to - # download them at that basename later. - raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") - - # Was actually found - if is_url(candidate_uri): - # Might be a file URI or other URI. - # We need to make sure file URIs and local paths that point to - # the same place are treated the same. - parsed = urlsplit(candidate_uri) - if parsed.scheme == "file:": - # This is a local file URI. Convert to a path for source directory tracking. - parent_dir = os.path.dirname(unquote(parsed.path)) - else: - # This is some other URL. Get the URL to the parent directory and use that. - parent_dir = urljoin(candidate_uri, ".") - else: - # Must be a local path - parent_dir = os.path.dirname(candidate_uri) - - # Pack a UUID of the parent directory - dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4()) - - return pack_toil_uri(imported, dir_id, file_basename) - - # If we get here we tried all the candidates - raise RuntimeError(f"Could not find {uri} at any of: {tried}") - - return map_over_files_in_bindings(environment, import_file_from_uri) def drop_missing_files(environment: WDLBindings, current_directory_override: Optional[str] = None) -> WDLBindings: """ @@ -1691,7 +1604,7 @@ def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDL :param task_path: Like the namespace, but including subscript numbers for scatters. """ - super().__init__(unitName=task_path + ".inputs", displayName=namespace + ".inputs", local=True, **kwargs) + super().__init__(unitName=task_path + ".inputs", displayName=namespace + ".inputs", **kwargs) logger.info("Preparing to run task code for %s as %s", task.name, namespace) @@ -1714,18 +1627,17 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library # UUID to use for virtualizing files - standard_library = ToilWDLStdLibBase(file_store) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options.get("execution_dir")) if self._task.inputs: logger.debug("Evaluating task code") - for input_decl in self._task.inputs: - # Evaluate all the inputs that aren't pre-set - bindings = bindings.bind(input_decl.name, evaluate_defaultable_decl(input_decl, bindings, standard_library)) - for postinput_decl in self._task.postinputs: + # Evaluate all the inputs that aren't pre-set + bindings = evaluate_bindings_from_decls(self._task.inputs, bindings, standard_library, include_previous=True) + if self._task.postinputs: # Evaluate all the postinput decls. # We need these in order to evaluate the runtime. # TODO: What if they wanted resources from the runtime? - bindings = bindings.bind(postinput_decl.name, evaluate_defaultable_decl(postinput_decl, bindings, standard_library)) + bindings = evaluate_bindings_from_decls(self._task.postinputs, bindings, standard_library, include_previous=True) # Evaluate the runtime section runtime_bindings = evaluate_call_inputs(self._task, self._task.runtime, bindings, standard_library) @@ -1869,7 +1781,7 @@ def add_injections(self, command_string: str, task_container: TaskContainer) -> Currently doesn't implement the MiniWDL plugin system, but does add resource usage monitoring to Docker containers. """ - + parts = [] if isinstance(task_container, SwarmContainer): @@ -2307,7 +2219,7 @@ def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]: outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, current_directory_override=workdir_in_container) # Make sure files downloaded as inputs get re-used if we re-upload them. outputs_library.share_files(standard_library) - output_bindings = evaluate_output_decls(self._task.outputs, bindings, outputs_library) + output_bindings = evaluate_bindings_from_decls(self._task.outputs, bindings, outputs_library) # Now we know if the standard output and error were sent somewhere by # the workflow. If not, we should report them to the leader. @@ -2377,63 +2289,63 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: incoming_bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir")) - with monkeypatch_coerce(standard_library): - if isinstance(self._node, WDL.Tree.Decl): - # This is a variable assignment - logger.info('Setting %s to %s', self._node.name, self._node.expr) - value = evaluate_decl(self._node, incoming_bindings, standard_library) - return self.postprocess(incoming_bindings.bind(self._node.name, value)) - elif isinstance(self._node, WDL.Tree.Call): - # This is a call of a task or workflow - - # Fetch all the inputs we are passing and bind them. - # The call is only allowed to use these. - logger.debug("Evaluating step inputs") - if self._node.callee is None: - # This should never be None, but mypy gets unhappy and this is better than an assert - inputs_mapping = None - else: - inputs_mapping = {e.name: e.type for e in self._node.callee.inputs or []} - input_bindings = evaluate_call_inputs(self._node, self._node.inputs, incoming_bindings, standard_library, inputs_mapping) - - # Bindings may also be added in from the enclosing workflow inputs - # TODO: this is letting us also inject them from the workflow body. - # TODO: Can this result in picking up non-namespaced values that - # aren't meant to be inputs, by not changing their names? - passed_down_bindings = incoming_bindings.enter_namespace(self._node.name) - - if isinstance(self._node.callee, WDL.Tree.Workflow): - # This is a call of a workflow - subjob: WDLBaseJob = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', f'{self._task_path}.{self._node.name}', wdl_options=self._wdl_options) - self.addChild(subjob) - elif isinstance(self._node.callee, WDL.Tree.Task): - # This is a call of a task - subjob = WDLTaskWrapperJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', f'{self._task_path}.{self._node.name}', wdl_options=self._wdl_options) - self.addChild(subjob) - else: - raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee))) - - # We need to agregate outputs namespaced with our node name, and existing bindings - subjob.then_namespace(self._node.name) - subjob.then_overlay(incoming_bindings) - self.defer_postprocessing(subjob) - return subjob.rv() - elif isinstance(self._node, WDL.Tree.Scatter): - subjob = WDLScatterJob(self._node, [incoming_bindings], self._namespace, self._task_path, wdl_options=self._wdl_options) + if isinstance(self._node, WDL.Tree.Decl): + # This is a variable assignment + logger.info('Setting %s to %s', self._node.name, self._node.expr) + value = evaluate_decl(self._node, incoming_bindings, standard_library) + bindings = incoming_bindings.bind(self._node.name, value) + return self.postprocess(bindings) + elif isinstance(self._node, WDL.Tree.Call): + # This is a call of a task or workflow + + # Fetch all the inputs we are passing and bind them. + # The call is only allowed to use these. + logger.debug("Evaluating step inputs") + if self._node.callee is None: + # This should never be None, but mypy gets unhappy and this is better than an assert + inputs_mapping = None + else: + inputs_mapping = {e.name: e.type for e in self._node.callee.inputs or []} + input_bindings = evaluate_call_inputs(self._node, self._node.inputs, incoming_bindings, standard_library, inputs_mapping) + + # Bindings may also be added in from the enclosing workflow inputs + # TODO: this is letting us also inject them from the workflow body. + # TODO: Can this result in picking up non-namespaced values that + # aren't meant to be inputs, by not changing their names? + passed_down_bindings = incoming_bindings.enter_namespace(self._node.name) + + if isinstance(self._node.callee, WDL.Tree.Workflow): + # This is a call of a workflow + subjob: WDLBaseJob = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', f'{self._task_path}.{self._node.name}', wdl_options=self._wdl_options, local=True) self.addChild(subjob) - # Scatters don't really make a namespace, just kind of a scope? - # TODO: Let stuff leave scope! - self.defer_postprocessing(subjob) - return subjob.rv() - elif isinstance(self._node, WDL.Tree.Conditional): - subjob = WDLConditionalJob(self._node, [incoming_bindings], self._namespace, self._task_path, wdl_options=self._wdl_options) + elif isinstance(self._node.callee, WDL.Tree.Task): + # This is a call of a task + subjob = WDLTaskWrapperJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', f'{self._task_path}.{self._node.name}', wdl_options=self._wdl_options, local=True) self.addChild(subjob) - # Conditionals don't really make a namespace, just kind of a scope? - # TODO: Let stuff leave scope! - self.defer_postprocessing(subjob) - return subjob.rv() else: - raise WDL.Error.InvalidType(self._node, "Unimplemented WorkflowNode: " + str(type(self._node))) + raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee))) + + # We need to agregate outputs namespaced with our node name, and existing bindings + subjob.then_namespace(self._node.name) + subjob.then_overlay(incoming_bindings) + self.defer_postprocessing(subjob) + return subjob.rv() + elif isinstance(self._node, WDL.Tree.Scatter): + subjob = WDLScatterJob(self._node, [incoming_bindings], self._namespace, self._task_path, wdl_options=self._wdl_options, local=True) + self.addChild(subjob) + # Scatters don't really make a namespace, just kind of a scope? + # TODO: Let stuff leave scope! + self.defer_postprocessing(subjob) + return subjob.rv() + elif isinstance(self._node, WDL.Tree.Conditional): + subjob = WDLConditionalJob(self._node, [incoming_bindings], self._namespace, self._task_path, wdl_options=self._wdl_options, local=True) + self.addChild(subjob) + # Conditionals don't really make a namespace, just kind of a scope? + # TODO: Let stuff leave scope! + self.defer_postprocessing(subjob) + return subjob.rv() + else: + raise WDL.Error.InvalidType(self._node, "Unimplemented WorkflowNode: " + str(type(self._node))) class WDLWorkflowNodeListJob(WDLBaseJob): """ @@ -2468,15 +2380,14 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir")) - with monkeypatch_coerce(standard_library): - for node in self._nodes: - if isinstance(node, WDL.Tree.Decl): - # This is a variable assignment - logger.info('Setting %s to %s', node.name, node.expr) - value = evaluate_decl(node, current_bindings, standard_library) - current_bindings = current_bindings.bind(node.name, value) - else: - raise WDL.Error.InvalidType(node, "Unimplemented WorkflowNode: " + str(type(node))) + for node in self._nodes: + if isinstance(node, WDL.Tree.Decl): + # This is a variable assignment + logger.info('Setting %s to %s', node.name, node.expr) + value = evaluate_decl(node, current_bindings, standard_library) + current_bindings = current_bindings.bind(node.name, value) + else: + raise WDL.Error.InvalidType(node, "Unimplemented WorkflowNode: " + str(type(node))) return self.postprocess(current_bindings) @@ -2806,10 +2717,10 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: if len(node_ids) == 1: # Make a one-node job - job: WDLBaseJob = WDLWorkflowNodeJob(section_graph.get(node_ids[0]), rvs, self._namespace, task_path, wdl_options=self._wdl_options) + job: WDLBaseJob = WDLWorkflowNodeJob(section_graph.get(node_ids[0]), rvs, self._namespace, task_path, wdl_options=self._wdl_options, local=True) else: # Make a multi-node job - job = WDLWorkflowNodeListJob([section_graph.get(node_id) for node_id in node_ids], rvs, self._namespace, wdl_options=self._wdl_options) + job = WDLWorkflowNodeListJob([section_graph.get(node_id) for node_id in node_ids], rvs, self._namespace, wdl_options=self._wdl_options, local=True) for prev_job in prev_jobs: # Connect up the happens-after relationships to make sure the # return values are available. @@ -2839,7 +2750,7 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: leaf_rvs.append(environment) # And to fill in bindings from code not executed in this instantiation # with Null, and filter out stuff that should leave scope. - sink = WDLCombineBindingsJob(leaf_rvs, wdl_options=self._wdl_options) + sink = WDLCombineBindingsJob(leaf_rvs, wdl_options=self._wdl_options, local=True) # It runs inside us self.addChild(sink) for leaf_job in toil_leaves.values(): @@ -2937,15 +2848,14 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # For a task we only see the inside-the-task namespace. bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library - standard_library = ToilWDLStdLibBase(file_store) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options.get("execution_dir")) # Get what to scatter over - with monkeypatch_coerce(standard_library): - try: - scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library) - finally: - # Report all files are downloaded now that all expressions are evaluated. - self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + try: + scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library) + finally: + # Report all files are downloaded now that all expressions are evaluated. + self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) if not isinstance(scatter_value, WDL.Value.Array): raise RuntimeError("The returned value from a scatter is not an Array type.") @@ -3076,15 +2986,14 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # For a task we only see the insode-the-task namespace. bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library - standard_library = ToilWDLStdLibBase(file_store) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options.get("execution_dir")) # Get the expression value. Fake a name. - with monkeypatch_coerce(standard_library): - try: - expr_value = evaluate_named_expression(self._conditional, "", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library) - finally: - # Report all files are downloaded now that all expressions are evaluated. - self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + try: + expr_value = evaluate_named_expression(self._conditional, "", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library) + finally: + # Report all files are downloaded now that all expressions are evaluated. + self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) if expr_value.value: # Evaluated to true! @@ -3146,14 +3055,11 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir")) if self._workflow.inputs: - with monkeypatch_coerce(standard_library): - try: - for input_decl in self._workflow.inputs: - # Evaluate all the inputs that aren't pre-set - bindings = bindings.bind(input_decl.name, evaluate_defaultable_decl(input_decl, bindings, standard_library)) - finally: - # Report all files are downloaded now that all expressions are evaluated. - self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + try: + bindings = evaluate_bindings_from_decls(self._workflow.inputs, bindings, standard_library, include_previous=True) + finally: + # Report all files are downloaded now that all expressions are evaluated. + self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) # Make jobs to run all the parts of the workflow sink = self.create_subgraph(self._workflow.body, [], bindings) @@ -3161,7 +3067,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if self._workflow.outputs != []: # Compare against empty list as None means there should be outputs # Either the output section is declared and nonempty or it is not declared # Add evaluating the outputs after the sink - outputs_job = WDLOutputsJob(self._workflow, sink.rv(), wdl_options=self._wdl_options) + outputs_job = WDLOutputsJob(self._workflow, sink.rv(), wdl_options=self._wdl_options, local=True) sink.addFollowOn(outputs_job) # Caller is responsible for making sure namespaces are applied self.defer_postprocessing(outputs_job) @@ -3215,7 +3121,7 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # Output section is declared and is nonempty, so evaluate normally # Combine the bindings from the previous job - output_bindings = evaluate_output_decls(self._workflow.outputs, unwrap(self._bindings), standard_library) + output_bindings = evaluate_bindings_from_decls(self._workflow.outputs, unwrap(self._bindings), standard_library) finally: # We don't actually know when all our files are downloaded since # anything we evaluate might devirtualize inside any expression. @@ -3255,10 +3161,10 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if isinstance(self._target, WDL.Tree.Workflow): # Create a workflow job. We rely in this to handle entering the input # namespace if needed, or handling free-floating inputs. - job: WDLBaseJob = WDLWorkflowJob(self._target, [self._inputs], [self._target.name], self._namespace, self._task_path, wdl_options=self._wdl_options) + job: WDLBaseJob = WDLWorkflowJob(self._target, [self._inputs], [self._target.name], self._namespace, self._task_path, wdl_options=self._wdl_options, local=True) else: # There is no workflow. Create a task job. - job = WDLTaskWrapperJob(self._target, [self._inputs], [self._target.name], self._namespace, self._task_path, wdl_options=self._wdl_options) + job = WDLTaskWrapperJob(self._target, [self._inputs], [self._target.name], self._namespace, self._task_path, wdl_options=self._wdl_options, local=True) # Run the task or workflow job.then_namespace(self._namespace) self.addChild(job) @@ -3390,7 +3296,6 @@ def main() -> None: inputs_search_path.append(match.group(0)) # Import any files in the bindings - input_bindings = import_files(input_bindings, toil, inputs_search_path, skip_remote=options.reference_inputs) # TODO: Automatically set a good MINIWDL__SINGULARITY__IMAGE_CACHE ? @@ -3404,7 +3309,7 @@ def main() -> None: assert wdl_options.get("container") is not None # Run the workflow and get its outputs namespaced with the workflow name. - root_job = WDLRootJob(target, input_bindings, wdl_options=wdl_options) + root_job = WDLRootJob(target, input_bindings, wdl_options=wdl_options, local=True) output_bindings = toil.start(root_job) if not isinstance(output_bindings, WDL.Env.Bindings): raise RuntimeError("The output of the WDL job is not a binding.") From c85f173b4e72faa0d6adbc8c078306a8600ab31f Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 23 Jul 2024 18:43:45 -0700 Subject: [PATCH 02/36] Implement support for importing relative URL paths; import files at startup and carry through mappings --- src/toil/wdl/wdltoil.py | 317 ++++++++++++++++++++++++---------------- 1 file changed, 195 insertions(+), 122 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 197f555d64..382478b219 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -55,6 +55,7 @@ from WDL.runtime.backend.docker_swarm import SwarmContainer from WDL.runtime.backend.singularity import SingularityContainer from WDL.runtime.task_container import TaskContainer +from WDL.Value import rewrite_paths from toil.batchSystems.abstractBatchSystem import InsufficientSystemResources from toil.common import Toil, addOptions @@ -691,6 +692,88 @@ def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', return True return False +def import_url_files(environment: WDLBindings, file_source: Union[AbstractFileStore, Toil], path: Optional[List[str]] = None) -> Optional[Dict[str, str]]: + """ + Iterate through the environment and import all files from the possible URLs. + Returns a mapping of the original file names to virtualized URIs + :param environment: Bindings to evaluate on + :param file_source: Context to upload/virtualize files with + :param path: List of paths to search through + :return: + """ + parent_to_id: Dict[str, uuid.UUID] = {} + filename_to_uri: Dict[str, str] = {} + def import_file_from_url(filename: str) -> str: + """ + Import a file if from a URL and return a virtualized filename for it. + """ + # Search through any input search paths passed in and download it if found + tried = [] + for candidate_uri in potential_absolute_uris(filename, path if path is not None else []): + tried.append(candidate_uri) + try: + # Try to import the file. Don't raise if we can't find it just return None + imported = None + jobstore = None + if isinstance(file_source, AbstractJobStore): + imported = file_source.import_file(candidate_uri) + elif isinstance(file_source, Toil): + imported = file_source.import_file(candidate_uri, check_existence=False) + if imported is None: + # Wasn't found there + continue + except UnimplementedURLException as e: + # We can't find anything that can even support this URL scheme. + # Report to the user, they are probably missing an extra. + logger.critical('Error: ' + str(e)) + raise + except Exception: + # Something went wrong besides the file not being found. Maybe + # we have no auth. + logger.error("Something went wrong importing %s", candidate_uri) + raise + + if imported is None: + # Wasn't found there + # Mostly to satisfy mypy + continue + logger.info('Imported %s', candidate_uri) + + # Work out what the basename for the file was + file_basename = os.path.basename(urlsplit(candidate_uri).path) + + if file_basename == "": + # We can't have files with no basename because we need to + # download them at that basename later. + raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") + + # Was actually found + if is_url(candidate_uri): + # Might be a file URI or other URI. + # We need to make sure file URIs and local paths that point to + # the same place are treated the same. + parsed = urlsplit(candidate_uri) + if parsed.scheme == "file:": + # This is a local file URI. Convert to a path for source directory tracking. + parent_dir = os.path.dirname(unquote(parsed.path)) + else: + # This is some other URL. Get the URL to the parent directory and use that. + parent_dir = urljoin(candidate_uri, ".") + else: + # Must be a local path + return filename + + # Pack a UUID of the parent directory + dir_id = parent_to_id.setdefault(parent_dir, uuid.uuid4()) + filename_to_uri[filename] = pack_toil_uri(imported, dir_id, file_basename) + return filename + + # If we get here we tried all the candidates + raise RuntimeError(f"Could not find {filename} at any of: {tried}") + map_over_files_in_bindings(environment, import_file_from_url) + return filename_to_uri if len(filename_to_uri) > 0 else None + + # Both the WDL code itself **and** the commands that it runs will deal in # "virtualized" filenames. @@ -720,11 +803,11 @@ class ToilWDLStdLibBase(WDL.StdLib.Base): """ Standard library implementation for WDL as run on Toil. """ - def __init__(self, file_store: AbstractFileStore, execution_dir: Optional[str] = None): + def __init__(self, file_store: AbstractFileStore, wdl_options: Optional[Dict[str, Any]] = None): """ Set up the standard library. - :param execution_dir: Directory to use as the working directory for workflow code. + :param wdl_options: Extra options to pass into the standard library to use, ex directory to use as the working directory for workflow code. """ # TODO: Just always be the 1.2 standard library. wdl_version = "1.2" @@ -749,7 +832,7 @@ def __init__(self, file_store: AbstractFileStore, execution_dir: Optional[str] = # paths, to save re-uploads. self._devirtualized_to_virtualized: Dict[str, str] = {} - self._execution_dir = execution_dir + self._wdl_options = wdl_options def get_local_paths(self) -> List[str]: """ @@ -783,12 +866,68 @@ def _devirtualize_filename(self, filename: str) -> str: on the local host. """ - result = self.devirtualize_to(filename, self._file_store.localTempDir, self._file_store, self._execution_dir, + result = self.devirtualize_to(filename, self._file_store.localTempDir, self._file_store, self._wdl_options, self._devirtualized_to_virtualized, self._virtualized_to_devirtualized) return result @staticmethod - def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], execution_dir: Optional[str], + def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil]) -> str: + if filename.startswith(TOIL_URI_SCHEME): + # This is a reference to the Toil filestore. + # Deserialize the FileID + file_id, parent_id, file_basename = unpack_toil_uri(filename) + + # Decide where it should be put. + # This is a URI with the "parent" UUID attached to the filename. + # Use UUID as folder name rather than a new temp folder to reduce internal clutter. + # Put the UUID in the destination path in order for tasks to + # see where to put files depending on their parents. + dir_path = os.path.join(dest_dir, parent_id) + + else: + # Parse the URL and extract the basename + file_basename = os.path.basename(urlsplit(filename).path) + # Get the URL to the directory this thing came from. Remember + # URLs are interpreted relative to the directory the thing is + # in, not relative to the thing. + parent_url = urljoin(filename, ".") + # Turn it into a string we can make a directory for + dir_path = os.path.join(dest_dir, quote(parent_url, safe='')) + + if not os.path.exists(dir_path): + # Make sure the chosen directory exists + os.mkdir(dir_path) + # And decide the file goes in it. + dest_path = os.path.join(dir_path, file_basename) + + if filename.startswith(TOIL_URI_SCHEME): + # Get a local path to the file + if isinstance(file_source, AbstractFileStore): + # Read from the file store. + # File is not allowed to be modified by the task. See + # . + # We try to get away with symlinks and hope the task + # container can mount the destination file. + result = file_source.readGlobalFile(file_id, dest_path, mutable=False, symlink=True) + elif isinstance(file_source, Toil): + # Read from the Toil context + file_source.export_file(file_id, dest_path) + result = dest_path + else: + # Download to a local file with the right name and execute bit. + # Open it exclusively + with open(dest_path, 'xb') as dest_file: + # And save to it + size, executable = AbstractJobStore.read_from_url(filename, dest_file) + if executable: + # Set the execute bit in the file's permissions + os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR) + + result = dest_path + return result + + @staticmethod + def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], wdl_options: Optional[Dict[str, Any]]=None, devirtualized_to_virtualized: Optional[Dict[str, str]] = None, virtualized_to_devirtualized: Optional[Dict[str, str]] = None) -> str: """ Download or export a WDL virtualized filename/URL to the given directory. @@ -806,6 +945,8 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil should not be added to the cache """ + if wdl_options is None: + wdl_options = {} if not os.path.isdir(dest_dir): # os.mkdir fails saying the directory *being made* caused a # FileNotFoundError. So check the dest_dir before trying to make @@ -820,58 +961,7 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil result = virtualized_to_devirtualized[filename] logger.debug("Found virtualized %s in cache with devirtualized path %s", filename, result) return result - if filename.startswith(TOIL_URI_SCHEME): - # This is a reference to the Toil filestore. - # Deserialize the FileID - file_id, parent_id, file_basename = unpack_toil_uri(filename) - - # Decide where it should be put. - # This is a URI with the "parent" UUID attached to the filename. - # Use UUID as folder name rather than a new temp folder to reduce internal clutter. - # Put the UUID in the destination path in order for tasks to - # see where to put files depending on their parents. - dir_path = os.path.join(dest_dir, parent_id) - - else: - # Parse the URL and extract the basename - file_basename = os.path.basename(urlsplit(filename).path) - # Get the URL to the directory this thing came from. Remember - # URLs are interpreted relative to the directory the thing is - # in, not relative to the thing. - parent_url = urljoin(filename, ".") - # Turn it into a string we can make a directory for - dir_path = os.path.join(dest_dir, quote(parent_url, safe='')) - - if not os.path.exists(dir_path): - # Make sure the chosen directory exists - os.mkdir(dir_path) - # And decide the file goes in it. - dest_path = os.path.join(dir_path, file_basename) - - if filename.startswith(TOIL_URI_SCHEME): - # Get a local path to the file - if isinstance(file_source, AbstractFileStore): - # Read from the file store. - # File is not allowed to be modified by the task. See - # . - # We try to get away with symlinks and hope the task - # container can mount the destination file. - result = file_source.readGlobalFile(file_id, dest_path, mutable=False, symlink=True) - elif isinstance(file_source, Toil): - # Read from the Toil context - file_source.export_file(file_id, dest_path) - result = dest_path - else: - # Download to a local file with the right name and execute bit. - # Open it exclusively - with open(dest_path, 'xb') as dest_file: - # And save to it - size, executable = AbstractJobStore.read_from_url(filename, dest_file) - if executable: - # Set the execute bit in the file's permissions - os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR) - - result = dest_path + result = ToilWDLStdLibBase._devirtualize_uri(filename, dest_dir, file_source) if devirtualized_to_virtualized is not None: # Store the back mapping devirtualized_to_virtualized[result] = filename @@ -880,14 +970,24 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil virtualized_to_devirtualized[filename] = result logger.debug('Devirtualized %s as openable file %s', filename, result) else: - # This is a local file - # To support relative paths, join the execution dir and filename - # if filename is already an abs path, join() will do nothing - if execution_dir is not None: - result = os.path.join(execution_dir, filename) - else: - result = filename - logger.debug("Virtualized file %s is already a local path", filename) + result = None + if wdl_options.get("filename_to_uri") is not None: + # We support importing files relative to a URL when the input JSON itself is downloaded + # See import_url_files() + filename_uri = wdl_options["filename_to_uri"].get(filename) + if filename_uri is not None: + # This was previously virtualized, so use the virtualized result + result = ToilWDLStdLibBase._devirtualize_uri(filename_uri, dest_dir, file_source) + if result is None: + # This is a local file + # To support relative paths, join the execution dir and filename + # if filename is already an abs path, join() will do nothing + execution_dir = wdl_options.get("execution_dir") + if execution_dir is not None: + result = os.path.join(execution_dir, filename) + else: + result = filename + logger.debug("Virtualized file %s is already a local path", filename) if not os.path.exists(result): raise RuntimeError(f"Virtualized file {filename} looks like a local file but isn't!") @@ -909,10 +1009,16 @@ def _virtualize_filename(self, filename: str) -> str: # Otherwise this is a local file and we want to fake it as a Toil file store file # Make it an absolute path - if self._execution_dir is not None: + wdl_options: Dict[str, Any] = self._wdl_options or {} + execution_dir = wdl_options.get("execution_dir") + if wdl_options.get("filename_to_uri") is not None: + result: str = wdl_options["filename_to_uri"][filename] + if result is not None: + return result + if execution_dir is not None: # To support relative paths from execution directory, join the execution dir and filename # If filename is already an abs path, join() will not do anything - abs_filename = os.path.join(self._execution_dir, filename) + abs_filename = os.path.join(execution_dir, filename) else: abs_filename = os.path.abspath(filename) @@ -1460,7 +1566,7 @@ class WDLBaseJob(Job): Also responsible for remembering the Toil WDL configuration keys and values. """ - def __init__(self, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Make a WDL-related job. @@ -1627,7 +1733,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library # UUID to use for virtualizing files - standard_library = ToilWDLStdLibBase(file_store, self._wdl_options.get("execution_dir")) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) if self._task.inputs: logger.debug("Evaluating task code") @@ -2263,7 +2369,7 @@ class WDLWorkflowNodeJob(WDLBaseJob): Job that evaluates a WDL workflow node. """ - def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Make a new job to run a workflow node to completion. """ @@ -2288,7 +2394,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Combine the bindings we get from previous jobs incoming_bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library - standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir")) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) if isinstance(self._node, WDL.Tree.Decl): # This is a variable assignment logger.info('Setting %s to %s', self._node.name, self._node.expr) @@ -2354,7 +2460,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob): workflows or tasks or sections. """ - def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Make a new job to run a list of workflow nodes to completion. """ @@ -2378,7 +2484,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Combine the bindings we get from previous jobs current_bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library - standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir")) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) for node in self._nodes: if isinstance(node, WDL.Tree.Decl): @@ -2557,7 +2663,7 @@ class WDLSectionJob(WDLBaseJob): Job that can create more graph for a section of the wrokflow. """ - def __init__(self, namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Make a WDLSectionJob where the interior runs in the given namespace, starting with the root workflow. @@ -2817,7 +2923,7 @@ class WDLScatterJob(WDLSectionJob): instance of the body. If an instance of the body doesn't create a binding, it gets a null value in the corresponding array. """ - def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace. """ @@ -2848,7 +2954,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # For a task we only see the inside-the-task namespace. bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library - standard_library = ToilWDLStdLibBase(file_store, self._wdl_options.get("execution_dir")) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) # Get what to scatter over try: @@ -2959,7 +3065,7 @@ class WDLConditionalJob(WDLSectionJob): """ Job that evaluates a conditional in a WDL workflow. """ - def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace. """ @@ -2986,7 +3092,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # For a task we only see the insode-the-task namespace. bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library - standard_library = ToilWDLStdLibBase(file_store, self._wdl_options.get("execution_dir")) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) # Get the expression value. Fake a name. try: @@ -3014,7 +3120,7 @@ class WDLWorkflowJob(WDLSectionJob): Job that evaluates an entire WDL workflow. """ - def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Create a subtree that will run a WDL workflow. The job returns the return value of the workflow. @@ -3033,7 +3139,6 @@ def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Prom logger.debug("Preparing to run workflow %s", workflow.name) - self._workflow = workflow self._prev_node_results = prev_node_results self._workflow_id = workflow_id @@ -3052,7 +3157,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # For a task we only see the insode-the-task namespace. bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library - standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir")) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) if self._workflow.inputs: try: @@ -3082,7 +3187,7 @@ class WDLOutputsJob(WDLBaseJob): Returns an environment with just the outputs bound, in no namespace. """ - def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any): + def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any): """ Make a new WDLWorkflowOutputsJob for the given workflow, with the given set of bindings after its body runs. """ @@ -3099,7 +3204,7 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: super().run(file_store) # Evaluate all output expressions in the normal, non-task-outputs library context - standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir")) + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) try: if self._workflow.outputs is None: @@ -3142,7 +3247,7 @@ class WDLRootJob(WDLSectionJob): the workflow name; both forms are accepted. """ - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None: + def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: """ Create a subtree to run the workflow and namespace the outputs. """ @@ -3172,40 +3277,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: return job.rv() -@contextmanager -def monkeypatch_coerce(standard_library: ToilWDLStdLibBase) -> Generator[None, None, None]: - """ - Monkeypatch miniwdl's WDL.Value.Base.coerce() function to virtualize files when they are represented as Strings. - Calls _virtualize_filename from a given standard library object. - :param standard_library: a standard library object - :return - """ - # We're doing this because while miniwdl recognizes when a string needs to be converted into a file, it's method of - # conversion is to just store the local filepath. Toil needs to virtualize the file into the jobstore so until - # there is an internal entrypoint, monkeypatch it. - def base_coerce(self: WDL.Value.Base, desired_type: Optional[WDL.Type.Base] = None) -> WDL.Value.Base: - if isinstance(desired_type, WDL.Type.File): - self.value = standard_library._virtualize_filename(self.value) - return self - return old_base_coerce(self, desired_type) # old_coerce will recurse back into this monkey patched coerce - def string_coerce(self: WDL.Value.String, desired_type: Optional[WDL.Type.Base] = None) -> WDL.Value.Base: - # Sometimes string coerce is called instead, so monkeypatch this one as well - if isinstance(desired_type, WDL.Type.File) and not isinstance(self, WDL.Type.File): - return WDL.Value.File(standard_library._virtualize_filename(self.value), self.expr) - return old_str_coerce(self, desired_type) - - old_base_coerce = WDL.Value.Base.coerce - old_str_coerce = WDL.Value.String.coerce - try: - # Mypy does not like monkeypatching: - # https://github.com/python/mypy/issues/2427#issuecomment-1419206807 - WDL.Value.Base.coerce = base_coerce # type: ignore[method-assign] - WDL.Value.String.coerce = string_coerce # type: ignore[method-assign] - yield - finally: - WDL.Value.Base.coerce = old_base_coerce # type: ignore[method-assign] - WDL.Value.String.coerce = old_str_coerce # type: ignore[method-assign] - @report_wdl_errors("run workflow", exit=True) def main() -> None: """ @@ -3228,8 +3299,6 @@ def main() -> None: # If we don't have a directory assigned, make one in the current directory. output_directory: str = options.output_directory if options.output_directory else mkdtemp(prefix='wdl-out-', dir=os.getcwd()) - # Get the execution directory - execution_dir = os.getcwd() try: with Toil(options) as toil: if options.restart: @@ -3302,10 +3371,14 @@ def main() -> None: # Get the execution directory execution_dir = os.getcwd() + filename_to_uri = import_url_files(input_bindings, toil, inputs_search_path) + # Configure workflow interpreter options - wdl_options: Dict[str, str] = {} + wdl_options: Dict[str, Any] = {} wdl_options["execution_dir"] = execution_dir wdl_options["container"] = options.container + if filename_to_uri is not None: + wdl_options["filename_to_uri"] = filename_to_uri assert wdl_options.get("container") is not None # Run the workflow and get its outputs namespaced with the workflow name. @@ -3326,7 +3399,7 @@ def devirtualize_output(filename: str) -> str: # Make sure the output directory exists if we have output files # that might need to use it. os.makedirs(output_directory, exist_ok=True) - return ToilWDLStdLibBase.devirtualize_to(filename, output_directory, toil, execution_dir, devirtualized_to_virtualized, virtualized_to_devirtualized) + return ToilWDLStdLibBase.devirtualize_to(filename, output_directory, toil, wdl_options, devirtualized_to_virtualized, virtualized_to_devirtualized) # Make all the files local files output_bindings = map_over_files_in_bindings(output_bindings, devirtualize_output) From 230efa046865c8978ce619144acb05b6591df174 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 24 Jul 2024 11:25:09 -0700 Subject: [PATCH 03/36] Fix possible invalid lookup and don't import raw URLs --- src/toil/wdl/wdltoil.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 382478b219..f86db45cc8 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -383,7 +383,6 @@ def potential_absolute_uris(uri: str, path: List[str], importer: Optional[WDL.Tr for candidate_base in full_path_list: # Try fetching based off each base URI candidate_uri = urljoin(candidate_base, uri) - if candidate_uri in failures: # Already tried this one, maybe we have an absolute uri input. continue @@ -692,9 +691,9 @@ def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', return True return False -def import_url_files(environment: WDLBindings, file_source: Union[AbstractFileStore, Toil], path: Optional[List[str]] = None) -> Optional[Dict[str, str]]: +def import_url_files(environment: WDLBindings, file_source: Union[AbstractFileStore, Toil], path: Optional[List[str]] = None, include_self: bool = False) -> Optional[Dict[str, str]]: """ - Iterate through the environment and import all files from the possible URLs. + Iterate through the environment and import all files from the possible URLs. This will not import if the file is detected as local. Returns a mapping of the original file names to virtualized URIs :param environment: Bindings to evaluate on :param file_source: Context to upload/virtualize files with @@ -710,11 +709,17 @@ def import_file_from_url(filename: str) -> str: # Search through any input search paths passed in and download it if found tried = [] for candidate_uri in potential_absolute_uris(filename, path if path is not None else []): + if not include_self and candidate_uri == filename: + # If the candidate uri is unchanged, as in is equal to the original uri + # we can discard it if unwanted; for example, miniwdl's backend can handle URLs fine, + # and carrying through the url through the workflow won't actually hit the imported file + # without more machinery + return filename + tried.append(candidate_uri) try: # Try to import the file. Don't raise if we can't find it just return None imported = None - jobstore = None if isinstance(file_source, AbstractJobStore): imported = file_source.import_file(candidate_uri) elif isinstance(file_source, Toil): @@ -1012,7 +1017,7 @@ def _virtualize_filename(self, filename: str) -> str: wdl_options: Dict[str, Any] = self._wdl_options or {} execution_dir = wdl_options.get("execution_dir") if wdl_options.get("filename_to_uri") is not None: - result: str = wdl_options["filename_to_uri"][filename] + result: str = wdl_options["filename_to_uri"].get(filename) if result is not None: return result if execution_dir is not None: @@ -3371,7 +3376,15 @@ def main() -> None: # Get the execution directory execution_dir = os.getcwd() - filename_to_uri = import_url_files(input_bindings, toil, inputs_search_path) + # There are two instances where paths relative to the JSON URL may be necessary: + # ToilStdLibBase._devirtualize_to and ToilStdLibBase._virtualize_filename + # devirtualize can happen when a binding needs a function call that reads the file, which miniwdl will call internally + # as the file (which is represented as a string/path) can be relative to the JSON URL, see if the mapping exists + # and use it if so + # virtualize can happen at task boundaries; files must be converted to their virtualized instances. + # Similarly, see if the mapping exists to test if the file (string/path representation) relative to the JSON URL exists, + # and use it if so + filename_to_uri = import_url_files(input_bindings, toil, inputs_search_path, include_self=False) # Configure workflow interpreter options wdl_options: Dict[str, Any] = {} From 066f780fa884155ef281037bc2abad8dadbc765c Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 24 Jul 2024 12:50:28 -0700 Subject: [PATCH 04/36] Get rid of sentinel value implementation + drop files before virtualization in TaskWrapper for MiniWDL parity --- src/toil/wdl/wdltoil.py | 41 +++++++++-------------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 22b93407fa..a195e0bef4 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -577,19 +577,6 @@ def recursive_dependencies(root: WDL.Tree.WorkflowNode) -> Set[str]: TOIL_URI_SCHEME = 'toilfile:' -# We always virtualize any file into a URI. However, when coercing from string to file, -# it is not necessary that the file needs to exist. See https://github.com/openwdl/wdl/issues/667 -# So use a sentinel to indicate nonexistent files instead of immediately raising an error -# This is done instead of not virtualizing, using the string as a filepath, and coercing to None/null at use. -# This is because the File must represent some location on its corresponding machine. -# If a task runs on a node where a file does not exist, and passes that file as an input into another task, -# we need to remember that the file does not exist from the original node -# ex: -# Task T1 runs on node N1 with file F at path P, but P does not exist on node N1 -# Task T1 passes file F to task T2 to run on node N2 -# Task T2 runs on node N2, P exists on node N2, but file F cannot exist -# We also want to store the filename even if it does not exist, so use a sentinel URI scheme (can be useful in error messages) -TOIL_NONEXISTENT_URI_SCHEME = 'nonexistent:' def pack_toil_uri(file_id: FileID, dir_id: uuid.UUID, file_basename: str) -> str: """ @@ -698,7 +685,7 @@ def _call_eager(self, expr: "WDL.Expr.Apply", arguments: List[WDL.Value.Base]) - # Return the result as a WDL float value return WDL.Value.Float(total_size) -def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME, TOIL_NONEXISTENT_URI_SCHEME]) -> bool: +def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool: """ Decide if a filename is a known kind of URL """ @@ -830,8 +817,6 @@ def __init__(self, file_store: AbstractFileStore, wdl_options: Optional[Dict[str :param wdl_options: Extra options to pass into the standard library to use, ex: execution_dir: directory to use as the working directory for workflow code, - enforce_existence: If a file is detected as nonexistent, raise an error, else let it through - """ # TODO: Just always be the 1.2 standard library. wdl_version = "1.2" @@ -901,7 +886,7 @@ def _devirtualize_filename(self, filename: str) -> str: return result @staticmethod - def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], enforce_existence: Optional[bool] = True) -> str: + def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil]) -> str: if filename.startswith(TOIL_URI_SCHEME): # This is a reference to the Toil filestore. # Deserialize the FileID @@ -913,11 +898,6 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF # Put the UUID in the destination path in order for tasks to # see where to put files depending on their parents. dir_path = os.path.join(dest_dir, parent_id) - elif filename.startswith(TOIL_NONEXISTENT_URI_SCHEME): - if enforce_existence: - raise FileNotFoundError(f"File {filename[len(TOIL_NONEXISTENT_URI_SCHEME):]} was not available when virtualized!") - else: - return filename else: # Parse the URL and extract the basename file_basename = os.path.basename(urlsplit(filename).path) @@ -995,8 +975,7 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil result = virtualized_to_devirtualized[filename] logger.debug("Found virtualized %s in cache with devirtualized path %s", filename, result) return result - enforce_existence: Optional[bool] = wdl_options.get("enforce_existence") - result = ToilWDLStdLibBase._devirtualize_uri(filename, dest_dir, file_source, enforce_existence=enforce_existence) + result = ToilWDLStdLibBase._devirtualize_uri(filename, dest_dir, file_source) if devirtualized_to_virtualized is not None: # Store the back mapping devirtualized_to_virtualized[result] = filename @@ -1469,8 +1448,7 @@ def drop_if_missing(value_type: WDL.Type.Base, filename: str, work_dir: str) -> logger.debug("Consider file %s", filename) if is_url(filename): - if (not filename.startswith(TOIL_NONEXISTENT_URI_SCHEME) - and (filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(filename))): + if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(filename): # We assume anything in the filestore actually exists. return filename else: @@ -1514,9 +1492,8 @@ def get_file_paths_in_bindings(environment: WDLBindings) -> List[str]: def append_to_paths(path: str) -> Optional[str]: # Append element and return the element. This is to avoid a logger warning inside map_over_typed_files_in_value() # But don't process nonexistent files - if not path.startswith(TOIL_NONEXISTENT_URI_SCHEME): - paths.append(path) - return path + paths.append(path) + return path map_over_files_in_bindings(environment, append_to_paths) return paths @@ -1868,6 +1845,8 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: accelerator_requirement = parse_accelerator(accelerator_spec) runtime_accelerators = [accelerator_requirement] + # Drop files that don't actually exist so we don't try to virtualize a nonexistent file + bindings = drop_missing_files(bindings, self._wdl_options.get("execution_dir")) # Schedule to get resources. Pass along the bindings from evaluating all the inputs and decls, and the runtime, with files virtualized. run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library), virtualize_files(runtime_bindings, standard_library), self._task_id, self._namespace, self._task_path, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=self._wdl_options) # Run that as a child @@ -2105,7 +2084,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library # UUID to use for virtualizing files # We process nonexistent files in WDLTaskWrapperJob as those must be run locally, so don't try to devirtualize them - standard_library = ToilWDLStdLibBase(file_store, wdl_options={"enforce_existence": False}) + standard_library = ToilWDLStdLibBase(file_store) # Get the bindings from after the input section bindings = unwrap(self._task_internal_bindings) @@ -2283,8 +2262,6 @@ def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]: # Replace everything with in-container paths for the command. # TODO: MiniWDL deals with directory paths specially here. def get_path_in_container(path: str) -> Optional[str]: - if path.startswith(TOIL_NONEXISTENT_URI_SCHEME): - return None return task_container.input_path_map[path] contained_bindings = map_over_files_in_bindings(bindings, get_path_in_container) From d5365a251ff24512d204d209644077babc22b705 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 24 Jul 2024 18:08:32 -0700 Subject: [PATCH 05/36] Drop missing files during decl eval for outputs + Add a check for invalid coerced-to-null files and raise if exception found --- src/toil/wdl/wdltoil.py | 53 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index a195e0bef4..14a1aec693 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -611,7 +611,7 @@ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str, str]: return file_id, parent_id, file_basename def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: WDL.StdLib.Base, - include_previous: bool = False) -> WDL.Env.Bindings[WDL.Value.Base]: + include_previous: bool = False, drop_missing_files: bool = False) -> WDL.Env.Bindings[WDL.Value.Base]: """ Evaluate decls with a given bindings environment and standard library. Creates a new bindings object that only contains the bindings from the given decls. @@ -620,13 +620,23 @@ def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.E :param decls: Decls to evaluate :param standard_library: Standard library :param include_previous: Whether to include the existing environment in the new returned environment. This will be false for outputs where only defined decls should be included + :param drop_missing_files: Whether to coerce nonexistent files to null. The coerced elements will be checked that the transformation is valid. + Currently should only be enabled in output sections, see https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116 :return: New bindings object with only the decls """ # all_bindings contains current bindings + previous all_bindings # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings() + drop_if_missing_with_workdir = partial(drop_if_missing, work_dir=standard_library.execution_dir) for each_decl in decls: output_value = evaluate_defaultable_decl(each_decl, all_bindings, standard_library) + if drop_missing_files: + dropped_output_value = map_over_typed_files_in_value(output_value, drop_if_missing_with_workdir) + # Typecheck that the new binding value with dropped files is valid for the declaration's type + # If a dropped file exists where the type is not optional File?, raise FileNotFoundError + # Ideally, map_over_typed_files_in_value should do this check, but that will require retooling the map functions + # to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through + map_over_files_in_value_check_null_type(dropped_output_value, output_value, each_decl.type) all_bindings = all_bindings.bind(each_decl.name, output_value) bindings = bindings.bind(each_decl.name, output_value) return all_bindings if include_previous else bindings @@ -1576,6 +1586,43 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD # All other kinds of value can be passed through unmodified. return value +def map_over_files_in_value_check_null_type(value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base) -> None: + """ + Run through all nested values embedded in the given value and check that the null values are valid. + + If a null value is found that does not have a valid corresponding expected_type, raise an error + + (This is currently only used to check that coerced File types to null are valid, as in null elements have an equivalent File? type, + if this is to be used elsewhere, the error message should be changed to be implementation specific) + + If one of the nested values is null but the equivalent nested expected_type is not optional, this is not allowed + :param value: WDL base value to check. This is the WDL value that has been transformed and has the null elements + :param original_value: The original WDL base value prior to the transformation. Only used for error messages + :param expected_type: The WDL type of the value + :return: + """ + if isinstance(value, WDL.Value.File): + pass + elif isinstance(value, WDL.Value.Array): + for elem, orig_elem in zip(value.value, original_value.value): + map_over_files_in_value_check_null_type(elem, orig_elem, expected_type.item_type) + elif isinstance(value, WDL.Value.Map): + for pair, orig_pair in zip(value.value, original_value.value): + # The key of the map cannot be optional or else it is not serializable, so we only need to check the value + map_over_files_in_value_check_null_type(pair[1], orig_pair[1], expected_type.item_type[1]) + elif isinstance(value, WDL.Value.Pair): + map_over_files_in_value_check_null_type(value.value[0], original_value.value[0], expected_type.left_type) + map_over_files_in_value_check_null_type(value.value[1], original_value.value[1], expected_type.right_type) + elif isinstance(value, WDL.Value.Struct): + for (k, v), (_, orig_v) in zip(value.value.items(), original_value.value.items()): + map_over_files_in_value_check_null_type(v, orig_v, expected_type.members[k]) + elif isinstance(value, WDL.Value.Null): + if not expected_type.optional: + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), original_value.value) + else: + # Don't check other (unsupported?) types + return + class WDLBaseJob(Job): """ Base job class for all WDL-related jobs. @@ -2354,7 +2401,7 @@ def get_path_in_container(path: str) -> Optional[str]: outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, wdl_options=output_wdl_options) # Make sure files downloaded as inputs get re-used if we re-upload them. outputs_library.share_files(standard_library) - output_bindings = evaluate_bindings_from_decls(self._task.outputs, bindings, outputs_library) + output_bindings = evaluate_bindings_from_decls(self._task.outputs, bindings, outputs_library, drop_missing_files=True) # Now we know if the standard output and error were sent somewhere by # the workflow. If not, we should report them to the leader. @@ -3256,7 +3303,7 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # Output section is declared and is nonempty, so evaluate normally # Combine the bindings from the previous job - output_bindings = evaluate_bindings_from_decls(self._workflow.outputs, unwrap(self._bindings), standard_library) + output_bindings = evaluate_bindings_from_decls(self._workflow.outputs, unwrap(self._bindings), standard_library, drop_missing_files=True) finally: # We don't actually know when all our files are downloaded since # anything we evaluate might devirtualize inside any expression. From bbb098d9808cb7efb598ee441c9d789ef2723d30 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 24 Jul 2024 19:28:14 -0700 Subject: [PATCH 06/36] Deal with mypy --- src/toil/wdl/wdltoil.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 14a1aec693..d8a9ab2bc2 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -610,7 +610,7 @@ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str, str]: return file_id, parent_id, file_basename -def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: WDL.StdLib.Base, +def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: ToilWDLStdLibBase, include_previous: bool = False, drop_missing_files: bool = False) -> WDL.Env.Bindings[WDL.Value.Base]: """ Evaluate decls with a given bindings environment and standard library. @@ -1603,19 +1603,22 @@ def map_over_files_in_value_check_null_type(value: WDL.Value.Base, original_valu """ if isinstance(value, WDL.Value.File): pass - elif isinstance(value, WDL.Value.Array): + elif isinstance(value, WDL.Value.Array) and isinstance(expected_type, WDL.Type.Array): for elem, orig_elem in zip(value.value, original_value.value): map_over_files_in_value_check_null_type(elem, orig_elem, expected_type.item_type) - elif isinstance(value, WDL.Value.Map): + elif isinstance(value, WDL.Value.Map) and isinstance(expected_type, WDL.Type.Map): for pair, orig_pair in zip(value.value, original_value.value): # The key of the map cannot be optional or else it is not serializable, so we only need to check the value map_over_files_in_value_check_null_type(pair[1], orig_pair[1], expected_type.item_type[1]) - elif isinstance(value, WDL.Value.Pair): + elif isinstance(value, WDL.Value.Pair) and isinstance(expected_type, WDL.Type.Pair): map_over_files_in_value_check_null_type(value.value[0], original_value.value[0], expected_type.left_type) map_over_files_in_value_check_null_type(value.value[1], original_value.value[1], expected_type.right_type) - elif isinstance(value, WDL.Value.Struct): + elif isinstance(value, WDL.Value.Struct) and isinstance(expected_type, WDL.Type.StructInstance): for (k, v), (_, orig_v) in zip(value.value.items(), original_value.value.items()): - map_over_files_in_value_check_null_type(v, orig_v, expected_type.members[k]) + # The parameters method for WDL.Type.StructInstance returns the values rather than the dictionary + # While dictionaries are ordered, this should be more robust; the else branch should never be hit + if expected_type.members is not None: + map_over_files_in_value_check_null_type(v, orig_v, expected_type.members[k]) elif isinstance(value, WDL.Value.Null): if not expected_type.optional: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), original_value.value) From 1b6bd023a54f98bb633718c67686d078f4ab3d53 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 25 Jul 2024 11:29:06 -0700 Subject: [PATCH 07/36] Don't drop unnecesssarily --- src/toil/wdl/wdltoil.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index d8a9ab2bc2..c209680e00 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -637,6 +637,7 @@ def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.E # Ideally, map_over_typed_files_in_value should do this check, but that will require retooling the map functions # to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through map_over_files_in_value_check_null_type(dropped_output_value, output_value, each_decl.type) + output_value = dropped_output_value all_bindings = all_bindings.bind(each_decl.name, output_value) bindings = bindings.bind(each_decl.name, output_value) return all_bindings if include_previous else bindings @@ -1895,8 +1896,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: accelerator_requirement = parse_accelerator(accelerator_spec) runtime_accelerators = [accelerator_requirement] - # Drop files that don't actually exist so we don't try to virtualize a nonexistent file - bindings = drop_missing_files(bindings, self._wdl_options.get("execution_dir")) # Schedule to get resources. Pass along the bindings from evaluating all the inputs and decls, and the runtime, with files virtualized. run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library), virtualize_files(runtime_bindings, standard_library), self._task_id, self._namespace, self._task_path, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=self._wdl_options) # Run that as a child From f65d59cfbf80265708168421b7fbcee2f24298d5 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Mon, 19 Aug 2024 19:45:24 -0700 Subject: [PATCH 08/36] Switch to setattr implementation --- src/toil/common.py | 3 + src/toil/jobStores/googleJobStore.py | 2 +- src/toil/wdl/wdltoil.py | 316 +++++++++++++-------------- 3 files changed, 161 insertions(+), 160 deletions(-) diff --git a/src/toil/common.py b/src/toil/common.py index 26f74e567e..69d5fe532f 100644 --- a/src/toil/common.py +++ b/src/toil/common.py @@ -1114,6 +1114,9 @@ def _setupAutoDeployment( logger.debug('Injecting user script %s into batch system.', userScriptResource) self._batchSystem.setUserScript(userScriptResource) + def url_exists(self, src_uri: str) -> bool: + return self._jobStore.url_exists(self.normalize_uri(src_uri)) + # Importing a file with a shared file name returns None, but without one it # returns a file ID. Explain this to MyPy. diff --git a/src/toil/jobStores/googleJobStore.py b/src/toil/jobStores/googleJobStore.py index 381b78fa55..9eaf9ff07e 100644 --- a/src/toil/jobStores/googleJobStore.py +++ b/src/toil/jobStores/googleJobStore.py @@ -383,7 +383,7 @@ def _get_blob_from_url(cls, url, exists=False): if exists: if not blob.exists(): - raise NoSuchFileException + raise NoSuchFileException(fileName) # sync with cloud so info like size is available blob.reload() return blob diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index c209680e00..0f1476d6cf 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -46,7 +46,13 @@ Type, TypeVar, Union, - cast) + cast, + TypedDict) +if sys.version_info < (3, 11): + from typing_extensions import NotRequired +else: + # NotRequired is recommended for TypedDicts over Optional but was introduced in Python 3.11 + from typing import NotRequired from urllib.parse import quote, unquote, urljoin, urlsplit from functools import partial @@ -86,6 +92,8 @@ logger = logging.getLogger(__name__) +WDL_OPTIONS = TypedDict('WDL_OPTIONS', {"execution_dir": NotRequired[str], "container": NotRequired[str]}) + @contextmanager def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Generator[None, None, None]: @@ -610,8 +618,8 @@ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str, str]: return file_id, parent_id, file_basename -def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: ToilWDLStdLibBase, - include_previous: bool = False, drop_missing_files: bool = False) -> WDL.Env.Bindings[WDL.Value.Base]: +def evaluate_decls_to_bindings(decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: ToilWDLStdLibBase, + include_previous: bool = False, drop_missing_files: bool = False) -> WDL.Env.Bindings[WDL.Value.Base]: """ Evaluate decls with a given bindings environment and standard library. Creates a new bindings object that only contains the bindings from the given decls. @@ -622,12 +630,12 @@ def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.E :param include_previous: Whether to include the existing environment in the new returned environment. This will be false for outputs where only defined decls should be included :param drop_missing_files: Whether to coerce nonexistent files to null. The coerced elements will be checked that the transformation is valid. Currently should only be enabled in output sections, see https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116 - :return: New bindings object with only the decls + :return: New bindings object """ # all_bindings contains current bindings + previous all_bindings # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings() - drop_if_missing_with_workdir = partial(drop_if_missing, work_dir=standard_library.execution_dir) + drop_if_missing_with_workdir = partial(drop_if_missing, standard_library=standard_library) for each_decl in decls: output_value = evaluate_defaultable_decl(each_decl, all_bindings, standard_library) if drop_missing_files: @@ -636,7 +644,7 @@ def evaluate_bindings_from_decls(decls: List[WDL.Tree.Decl], all_bindings: WDL.E # If a dropped file exists where the type is not optional File?, raise FileNotFoundError # Ideally, map_over_typed_files_in_value should do this check, but that will require retooling the map functions # to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through - map_over_files_in_value_check_null_type(dropped_output_value, output_value, each_decl.type) + ensure_null_files_are_nullable(dropped_output_value, output_value, each_decl.type) output_value = dropped_output_value all_bindings = all_bindings.bind(each_decl.name, output_value) bindings = bindings.bind(each_decl.name, output_value) @@ -659,11 +667,12 @@ def _call_eager(self, expr: "WDL.Expr.Apply", arguments: List[WDL.Value.Base]) - """ # Get all the URIs of files that actually are set. - file_uris: List[str] = [f.value for f in arguments[0].coerce(WDL.Type.Array(WDL.Type.File(optional=True))).value if not isinstance(f, WDL.Value.Null)] + file_objects: List[WDL.Value.File] = [f for f in arguments[0].coerce(WDL.Type.Array(WDL.Type.File(optional=True))).value if not isinstance(f, WDL.Value.Null)] total_size = 0.0 - for uri in file_uris: + for file in file_objects: # Sum up the sizes of all the files, if any. + uri = getattr(file, "virtualized_value", None) or file.value if is_url(uri): if uri.startswith(TOIL_URI_SCHEME): # This is a Toil File ID we encoded; we have the size @@ -705,40 +714,31 @@ def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', return True return False -def import_url_files(environment: WDLBindings, file_source: Union[AbstractFileStore, Toil], path: Optional[List[str]] = None, include_self: bool = False) -> Optional[Dict[str, str]]: +def convert_remote_files(environment: WDLBindings, file_source: Union[AbstractFileStore, Toil], search_paths: Optional[List[str]] = None) -> None: """ - Iterate through the environment and import all files from the possible URLs. This will not import if the file is detected as local. - Returns a mapping of the original file names to virtualized URIs + Iterate through the environment and convert all files that reference a remote file from the possible URIs to that URI. :param environment: Bindings to evaluate on - :param file_source: Context to upload/virtualize files with - :param path: List of paths to search through + :param file_source: Context to search for remote files with + :param search_paths: List of paths to search through :return: """ - parent_to_id: Dict[str, uuid.UUID] = {} - filename_to_uri: Dict[str, str] = {} - def import_file_from_url(filename: str) -> str: + def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: """ - Import a file if from a URL and return a virtualized filename for it. + Detect if any potential URI exist and convert a file's value to a URI """ # Search through any input search paths passed in and download it if found + filename = file.value tried = [] - for candidate_uri in potential_absolute_uris(filename, path if path is not None else []): - if not include_self and candidate_uri == filename: - # If the candidate uri is unchanged, as in is equal to the original uri - # we can discard it if unwanted; for example, miniwdl's backend can handle URLs fine, - # and carrying through the url through the workflow won't actually hit the imported file - # without more machinery - return filename - + for candidate_uri in potential_absolute_uris(filename, search_paths if search_paths is not None else []): tried.append(candidate_uri) try: - # Try to import the file. Don't raise if we can't find it just return None - imported = None + # Detect if the + found = False if isinstance(file_source, AbstractJobStore): - imported = file_source.import_file(candidate_uri) + found = file_source.url_exists(candidate_uri) elif isinstance(file_source, Toil): - imported = file_source.import_file(candidate_uri, check_existence=False) - if imported is None: + found = file_source.url_exists(candidate_uri) + if found is False: # Wasn't found there continue except UnimplementedURLException as e: @@ -749,14 +749,14 @@ def import_file_from_url(filename: str) -> str: except Exception: # Something went wrong besides the file not being found. Maybe # we have no auth. - logger.error("Something went wrong importing %s", candidate_uri) + logger.error("Something went wrong when testing for existence of %s", candidate_uri) raise - if imported is None: + if found is False: # Wasn't found there # Mostly to satisfy mypy continue - logger.info('Imported %s', candidate_uri) + logger.info('Converting input file path %s to %s', filename, candidate_uri) # Work out what the basename for the file was file_basename = os.path.basename(urlsplit(candidate_uri).path) @@ -767,30 +767,12 @@ def import_file_from_url(filename: str) -> str: raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") # Was actually found - if is_url(candidate_uri): - # Might be a file URI or other URI. - # We need to make sure file URIs and local paths that point to - # the same place are treated the same. - parsed = urlsplit(candidate_uri) - if parsed.scheme == "file:": - # This is a local file URI. Convert to a path for source directory tracking. - parent_dir = os.path.dirname(unquote(parsed.path)) - else: - # This is some other URL. Get the URL to the parent directory and use that. - parent_dir = urljoin(candidate_uri, ".") - else: - # Must be a local path - return filename - - # Pack a UUID of the parent directory - dir_id = parent_to_id.setdefault(parent_dir, uuid.uuid4()) - filename_to_uri[filename] = pack_toil_uri(imported, dir_id, file_basename) - return filename + file.value = candidate_uri + return file # If we get here we tried all the candidates raise RuntimeError(f"Could not find {filename} at any of: {tried}") - map_over_files_in_bindings(environment, import_file_from_url) - return filename_to_uri if len(filename_to_uri) > 0 else None + map_over_typed_files_in_bindings(environment, convert_file_to_url) # Both the WDL code itself **and** the commands that it runs will deal in @@ -822,7 +804,7 @@ class ToilWDLStdLibBase(WDL.StdLib.Base): """ Standard library implementation for WDL as run on Toil. """ - def __init__(self, file_store: AbstractFileStore, wdl_options: Optional[Dict[str, Any]] = None): + def __init__(self, file_store: AbstractFileStore, wdl_options: Optional[WDL_OPTIONS] = None): """ Set up the standard library. @@ -852,7 +834,7 @@ def __init__(self, file_store: AbstractFileStore, wdl_options: Optional[Dict[str # paths, to save re-uploads. self._devirtualized_to_virtualized: Dict[str, str] = {} - self._wdl_options = wdl_options + self._wdl_options: WDL_OPTIONS = wdl_options or {} @property def execution_dir(self) -> Optional[str]: @@ -885,13 +867,46 @@ def share_files(self, other: "ToilWDLStdLibBase") -> None: self._devirtualized_to_virtualized.update(other._devirtualized_to_virtualized) other._devirtualized_to_virtualized = self._devirtualized_to_virtualized + def _read(self, parse: Callable[[str], WDL.Value.Base]) -> Callable[[WDL.Value.File], WDL.Value.Base]: + # To only virtualize on task/function boundaries, rely on the _read function + # as this is called before every WDL function that takes a file input + # We want to virtualize before any function call so we can control the caching + # and to support all Toil supported formats (ex Google buckets) + # Since we also want to preserve the URL/path *and* store the virtualized URI, use setattr + # I can't think of another way to do this. I still need to remember the original URL/path, + # but I need to virtualize as well, so I can't remove one or the other. + def f(file: WDL.Value.File) -> WDL.Value.Base: + if getattr(file, "virtualized_value", None) is None: + setattr(file, "virtualized_value", self._virtualize_filename(file.value)) + with open(self._devirtualize_filename(getattr(file, "virtualized_value")), "r") as infile: + return parse(infile.read()) + + return f + def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: + if getattr(file, "nonexistent", False): + return file + virtualized_filename = getattr(file, "virtualized_value", file.value) + file.value = self._devirtualize_filename(virtualized_filename) + return file + + def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) -> WDL.Value.File: + if enforce_existence is False: + # We only want to error on a nonexistent file in the output section + # Since we need to virtualize on task boundaries, don't enforce existence if on a boundary + abs_filepath = os.path.join(self.execution_dir, file.value) if self.execution_dir is not None else os.path.abspath(file.value) + if not os.path.exists(abs_filepath): + setattr(file, "nonexistent", True) + return file + virtualized = self._virtualize_filename(getattr(file, "virtualized_value", None) or file.value) + setattr(file, "virtualized_value", virtualized) + return file + @memoize def _devirtualize_filename(self, filename: str) -> str: """ 'devirtualize' filename passed to a read_* function: return a filename that can be open()ed on the local host. """ - result = self.devirtualize_to(filename, self._file_store.localTempDir, self._file_store, self._wdl_options, self._devirtualized_to_virtualized, self._virtualized_to_devirtualized) return result @@ -952,7 +967,7 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF return result @staticmethod - def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], wdl_options: Optional[Dict[str, Any]] = None, + def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], wdl_options: Optional[WDL_OPTIONS] = None, devirtualized_to_virtualized: Optional[Dict[str, str]] = None, virtualized_to_devirtualized: Optional[Dict[str, str]] = None) -> str: """ Download or export a WDL virtualized filename/URL to the given directory. @@ -995,24 +1010,15 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil virtualized_to_devirtualized[filename] = result logger.debug('Devirtualized %s as openable file %s', filename, result) else: - result = None - if wdl_options.get("filename_to_uri") is not None: - # We support importing files relative to a URL when the input JSON itself is downloaded - # See import_url_files() - filename_uri = wdl_options["filename_to_uri"].get(filename) - if filename_uri is not None: - # This was previously virtualized, so use the virtualized result - result = ToilWDLStdLibBase._devirtualize_uri(filename_uri, dest_dir, file_source) - if result is None: - # This is a local file - # To support relative paths, join the execution dir and filename - # if filename is already an abs path, join() will do nothing - execution_dir = wdl_options.get("execution_dir") - if execution_dir is not None: - result = os.path.join(execution_dir, filename) - else: - result = filename - logger.debug("Virtualized file %s is already a local path", filename) + # This is a local file + # To support relative paths, join the execution dir and filename + # if filename is already an abs path, join() will do nothing + execution_dir = wdl_options.get("execution_dir") + if execution_dir is not None: + result = os.path.join(execution_dir, filename) + else: + result = filename + logger.debug("Virtualized file %s is already a local path", filename) if not os.path.exists(result): # Catch if something made it through without going through the proper virtualization/devirtualization steps @@ -1035,16 +1041,10 @@ def _virtualize_filename(self, filename: str) -> str: # Otherwise this is a local file and we want to fake it as a Toil file store file # Make it an absolute path - wdl_options: Dict[str, Any] = self._wdl_options or {} - execution_dir = wdl_options.get("execution_dir") - if wdl_options.get("filename_to_uri") is not None: - result: str = wdl_options["filename_to_uri"].get(filename) - if result is not None: - return result - if execution_dir is not None: + if self.execution_dir is not None: # To support relative paths from execution directory, join the execution dir and filename # If filename is already an abs path, join() will not do anything - abs_filename = os.path.join(execution_dir, filename) + abs_filename = os.path.join(self.execution_dir, filename) else: abs_filename = os.path.abspath(filename) @@ -1076,7 +1076,7 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase): are host-side paths. """ - def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_options: Optional[Dict[str, Any]] = None): + def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_options: Optional[WDL_OPTIONS] = None): """ Set up the standard library for the task command section. """ @@ -1131,7 +1131,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs): functions only allowed in task output sections. """ - def __init__(self, file_store: AbstractFileStore, stdout_path: str, stderr_path: str, file_to_mountpoint: Dict[str, str], wdl_options: Optional[Dict[str, Any]] = None): + def __init__(self, file_store: AbstractFileStore, stdout_path: str, stderr_path: str, file_to_mountpoint: Dict[str, str], wdl_options: Optional[WDL_OPTIONS] = None): """ Set up the standard library for a task output section. Needs to know where standard output and error from the task have been stored, and @@ -1398,21 +1398,21 @@ def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, std raise # TODO: make these stdlib methods??? -def devirtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBindings: +def devirtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase) -> WDLBindings: """ Make sure all the File values embedded in the given bindings point to files that are actually available to command line commands. The same virtual file always maps to the same devirtualized filename even with duplicates """ - return map_over_files_in_bindings(environment, stdlib._devirtualize_filename) + return map_over_typed_files_in_bindings(environment, stdlib._devirtualize_file) -def virtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBindings: +def virtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True) -> WDLBindings: """ Make sure all the File values embedded in the given bindings point to files that are usable from other machines. """ - - return map_over_files_in_bindings(environment, stdlib._virtualize_filename) + virtualize_func = partial(stdlib._virtualize_file, enforce_existence=enforce_existence) + return map_over_typed_files_in_bindings(environment, virtualize_func) def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None: """ @@ -1449,33 +1449,39 @@ def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None: -def drop_if_missing(value_type: WDL.Type.Base, filename: str, work_dir: str) -> Optional[str]: +def drop_if_missing(file: WDL.Value.File, standard_library: ToilWDLStdLibBase) -> Optional[WDL.Value.File]: """ Return None if a file doesn't exist, or its path if it does. filename represents a URI or file name belonging to a WDL value of type value_type. work_dir represents the current working directory of the job and is where all relative paths will be interpreted from """ + work_dir = standard_library.execution_dir + filename = file.value + virtualized_filename = getattr(file, "virtualized_value", None) + value_type = file.type logger.debug("Consider file %s", filename) - if is_url(filename): - if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(filename): + if virtualized_filename is not None and is_url(virtualized_filename): + if virtualized_filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(virtualized_filename): # We assume anything in the filestore actually exists. - return filename + devirtualized_filename = standard_library._devirtualize_filename(virtualized_filename) + file.value = devirtualized_filename + return file else: logger.warning('File %s with type %s does not actually exist at its URI', filename, value_type) return None else: # Get the absolute path, not resolving symlinks - effective_path = os.path.abspath(os.path.join(work_dir, filename)) + effective_path = os.path.abspath(os.path.join(work_dir or os.getcwd(), filename)) if os.path.islink(effective_path) or os.path.exists(effective_path): # This is a broken symlink or a working symlink or a file. - return filename + return file else: logger.warning('File %s with type %s does not actually exist at %s', filename, value_type, effective_path) return None -def drop_missing_files(environment: WDLBindings, current_directory_override: Optional[str] = None) -> WDLBindings: +def drop_missing_files(environment: WDLBindings, standard_library: ToilWDLStdLibBase) -> WDLBindings: """ Make sure all the File values embedded in the given bindings point to files that exist, or are null. @@ -1484,9 +1490,7 @@ def drop_missing_files(environment: WDLBindings, current_directory_override: Opt """ # Determine where to evaluate relative paths relative to - work_dir = '.' if not current_directory_override else current_directory_override - - drop_if_missing_with_workdir = partial(drop_if_missing, work_dir=work_dir) + drop_if_missing_with_workdir = partial(drop_if_missing, standard_library=standard_library) return map_over_typed_files_in_bindings(environment, drop_if_missing_with_workdir) def get_file_paths_in_bindings(environment: WDLBindings) -> List[str]: @@ -1500,15 +1504,17 @@ def get_file_paths_in_bindings(environment: WDLBindings) -> List[str]: paths = [] - def append_to_paths(path: str) -> Optional[str]: + def append_to_paths(file: WDL.Value.File) -> Optional[WDL.Value.File]: # Append element and return the element. This is to avoid a logger warning inside map_over_typed_files_in_value() # But don't process nonexistent files - paths.append(path) - return path - map_over_files_in_bindings(environment, append_to_paths) + if getattr(file, "nonexistent", False) is False: + path = file.value + paths.append(path) + return file + map_over_typed_files_in_bindings(environment, append_to_paths) return paths -def map_over_typed_files_in_bindings(environment: WDLBindings, transform: Callable[[WDL.Type.Base, str], Optional[str]]) -> WDLBindings: +def map_over_typed_files_in_bindings(environment: WDLBindings, transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDLBindings: """ Run all File values embedded in the given bindings through the given transformation function. @@ -1518,18 +1524,8 @@ def map_over_typed_files_in_bindings(environment: WDLBindings, transform: Callab return environment.map(lambda b: map_over_typed_files_in_binding(b, transform)) -def map_over_files_in_bindings(bindings: WDLBindings, transform: Callable[[str], Optional[str]]) -> WDLBindings: - """ - Run all File values' types and values embedded in the given bindings - through the given transformation function. - - TODO: Replace with WDL.Value.rewrite_env_paths or WDL.Value.rewrite_files - """ - - return map_over_typed_files_in_bindings(bindings, lambda _, x: transform(x)) - -def map_over_typed_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], transform: Callable[[WDL.Type.Base, str], Optional[str]]) -> WDL.Env.Binding[WDL.Value.Base]: +def map_over_typed_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDL.Env.Binding[WDL.Value.Base]: """ Run all File values' types and values embedded in the given binding's value through the given transformation function. @@ -1544,7 +1540,7 @@ def map_over_typed_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], tr # # For now we assume that any types extending the WDL value types will implement # compatible constructors. -def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WDL.Type.Base, str], Optional[str]]) -> WDL.Value.Base: +def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDL.Value.Base: """ Run all File values embedded in the given value through the given transformation function. @@ -1560,8 +1556,8 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD """ if isinstance(value, WDL.Value.File): # This is a file so we need to process it - new_path = transform(value.type, value.value) - if new_path is None: + new_file = transform(value) + if new_file is None: # Assume the transform checked types if we actually care about the # result. logger.warning("File %s became Null", value) @@ -1569,7 +1565,7 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD else: # Make whatever the value is around the new path. # TODO: why does this need casting? - return WDL.Value.File(new_path, value.expr) + return new_file elif isinstance(value, WDL.Value.Array): # This is an array, so recurse on the items return WDL.Value.Array(value.type.item_type, [map_over_typed_files_in_value(v, transform) for v in value.value], value.expr) @@ -1587,16 +1583,17 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD # All other kinds of value can be passed through unmodified. return value -def map_over_files_in_value_check_null_type(value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base) -> None: +def ensure_null_files_are_nullable(value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base) -> None: """ Run through all nested values embedded in the given value and check that the null values are valid. If a null value is found that does not have a valid corresponding expected_type, raise an error - (This is currently only used to check that coerced File types to null are valid, as in null elements have an equivalent File? type, - if this is to be used elsewhere, the error message should be changed to be implementation specific) + (This is currently only used to check that null values arising from File coercion are in locations with a nullable File? type. + If this is to be used elsewhere, the error message should be changed to describe the appropriate types and not just talk about files.) - If one of the nested values is null but the equivalent nested expected_type is not optional, this is not allowed + For example: + If one of the nested values is null but the equivalent nested expected_type is not optional, a FileNotFoundError will be raised :param value: WDL base value to check. This is the WDL value that has been transformed and has the null elements :param original_value: The original WDL base value prior to the transformation. Only used for error messages :param expected_type: The WDL type of the value @@ -1606,20 +1603,20 @@ def map_over_files_in_value_check_null_type(value: WDL.Value.Base, original_valu pass elif isinstance(value, WDL.Value.Array) and isinstance(expected_type, WDL.Type.Array): for elem, orig_elem in zip(value.value, original_value.value): - map_over_files_in_value_check_null_type(elem, orig_elem, expected_type.item_type) + ensure_null_files_are_nullable(elem, orig_elem, expected_type.item_type) elif isinstance(value, WDL.Value.Map) and isinstance(expected_type, WDL.Type.Map): for pair, orig_pair in zip(value.value, original_value.value): # The key of the map cannot be optional or else it is not serializable, so we only need to check the value - map_over_files_in_value_check_null_type(pair[1], orig_pair[1], expected_type.item_type[1]) + ensure_null_files_are_nullable(pair[1], orig_pair[1], expected_type.item_type[1]) elif isinstance(value, WDL.Value.Pair) and isinstance(expected_type, WDL.Type.Pair): - map_over_files_in_value_check_null_type(value.value[0], original_value.value[0], expected_type.left_type) - map_over_files_in_value_check_null_type(value.value[1], original_value.value[1], expected_type.right_type) + ensure_null_files_are_nullable(value.value[0], original_value.value[0], expected_type.left_type) + ensure_null_files_are_nullable(value.value[1], original_value.value[1], expected_type.right_type) elif isinstance(value, WDL.Value.Struct) and isinstance(expected_type, WDL.Type.StructInstance): for (k, v), (_, orig_v) in zip(value.value.items(), original_value.value.items()): # The parameters method for WDL.Type.StructInstance returns the values rather than the dictionary # While dictionaries are ordered, this should be more robust; the else branch should never be hit if expected_type.members is not None: - map_over_files_in_value_check_null_type(v, orig_v, expected_type.members[k]) + ensure_null_files_are_nullable(v, orig_v, expected_type.members[k]) elif isinstance(value, WDL.Value.Null): if not expected_type.optional: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), original_value.value) @@ -1639,7 +1636,7 @@ class WDLBaseJob(Job): Also responsible for remembering the Toil WDL configuration keys and values. """ - def __init__(self, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Make a WDL-related job. @@ -1811,12 +1808,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if self._task.inputs: logger.debug("Evaluating task code") # Evaluate all the inputs that aren't pre-set - bindings = evaluate_bindings_from_decls(self._task.inputs, bindings, standard_library, include_previous=True) + bindings = evaluate_decls_to_bindings(self._task.inputs, bindings, standard_library, include_previous=True) if self._task.postinputs: # Evaluate all the postinput decls. # We need these in order to evaluate the runtime. # TODO: What if they wanted resources from the runtime? - bindings = evaluate_bindings_from_decls(self._task.postinputs, bindings, standard_library, include_previous=True) + bindings = evaluate_decls_to_bindings(self._task.postinputs, bindings, standard_library, include_previous=True) # Evaluate the runtime section runtime_bindings = evaluate_call_inputs(self._task, self._task.runtime, bindings, standard_library) @@ -1897,7 +1894,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: runtime_accelerators = [accelerator_requirement] # Schedule to get resources. Pass along the bindings from evaluating all the inputs and decls, and the runtime, with files virtualized. - run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library), virtualize_files(runtime_bindings, standard_library), self._task_id, self._namespace, self._task_path, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=self._wdl_options) + run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library, enforce_existence=False), virtualize_files(runtime_bindings, standard_library, enforce_existence=False), self._task_id, self._namespace, self._task_path, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=self._wdl_options) # Run that as a child self.addChild(run_job) @@ -2310,12 +2307,13 @@ def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]: # Replace everything with in-container paths for the command. # TODO: MiniWDL deals with directory paths specially here. - def get_path_in_container(path: str) -> Optional[str]: - return task_container.input_path_map[path] - contained_bindings = map_over_files_in_bindings(bindings, get_path_in_container) + def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: + if getattr(file, "nonexistent", False) is False: + return WDL.Value.File(task_container.input_path_map[file.value]) + contained_bindings = map_over_typed_files_in_bindings(bindings, get_path_in_container) # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths. - command_wdl_options: Optional[Dict[str, Any]] = {"execution_dir": workdir_in_container} if workdir_in_container is not None else None + command_wdl_options: Optional[WDL_OPTIONS] = {"execution_dir": workdir_in_container} if workdir_in_container is not None else None command_library = ToilWDLStdLibTaskCommand(file_store, task_container, wdl_options=command_wdl_options) # Work out the command string, and unwrap it @@ -2399,11 +2397,11 @@ def get_path_in_container(path: str) -> Optional[str]: # container-determined strings that are absolute paths to WDL File # objects, and like MiniWDL we can say we only support # working-directory-based relative paths for globs. - output_wdl_options: Optional[Dict[str, Any]] = {"execution_dir": workdir_in_container} if workdir_in_container is not None else None + output_wdl_options: Optional[WDL_OPTIONS] = {"execution_dir": workdir_in_container} if workdir_in_container is not None else None outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, wdl_options=output_wdl_options) # Make sure files downloaded as inputs get re-used if we re-upload them. outputs_library.share_files(standard_library) - output_bindings = evaluate_bindings_from_decls(self._task.outputs, bindings, outputs_library, drop_missing_files=True) + output_bindings = evaluate_decls_to_bindings(self._task.outputs, bindings, outputs_library, drop_missing_files=True) # Now we know if the standard output and error were sent somewhere by # the workflow. If not, we should report them to the leader. @@ -2426,7 +2424,7 @@ def get_path_in_container(path: str) -> Optional[str]: self.handle_injection_messages(outputs_library) # Drop any files from the output which don't actually exist - output_bindings = drop_missing_files(output_bindings, current_directory_override=workdir_in_container) + output_bindings = drop_missing_files(output_bindings, standard_library=outputs_library) for decl in self._task.outputs: if not decl.type.optional and output_bindings[decl.name].value is None: # todo: make recursive @@ -2448,7 +2446,7 @@ class WDLWorkflowNodeJob(WDLBaseJob): Job that evaluates a WDL workflow node. """ - def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Make a new job to run a workflow node to completion. """ @@ -2539,7 +2537,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob): workflows or tasks or sections. """ - def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Make a new job to run a list of workflow nodes to completion. """ @@ -2742,7 +2740,7 @@ class WDLSectionJob(WDLBaseJob): Job that can create more graph for a section of the wrokflow. """ - def __init__(self, namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, namespace: str, task_path: str, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Make a WDLSectionJob where the interior runs in the given namespace, starting with the root workflow. @@ -3002,7 +3000,7 @@ class WDLScatterJob(WDLSectionJob): instance of the body. If an instance of the body doesn't create a binding, it gets a null value in the corresponding array. """ - def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace. """ @@ -3144,7 +3142,7 @@ class WDLConditionalJob(WDLSectionJob): """ Job that evaluates a conditional in a WDL workflow. """ - def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace. """ @@ -3199,7 +3197,7 @@ class WDLWorkflowJob(WDLSectionJob): Job that evaluates an entire WDL workflow. """ - def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, task_path: str, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, task_path: str, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Create a subtree that will run a WDL workflow. The job returns the return value of the workflow. @@ -3240,7 +3238,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if self._workflow.inputs: try: - bindings = evaluate_bindings_from_decls(self._workflow.inputs, bindings, standard_library, include_previous=True) + bindings = evaluate_decls_to_bindings(self._workflow.inputs, bindings, standard_library, include_previous=True) finally: # Report all files are downloaded now that all expressions are evaluated. self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) @@ -3266,7 +3264,7 @@ class WDLOutputsJob(WDLBaseJob): Returns an environment with just the outputs bound, in no namespace. """ - def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any): + def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any): """ Make a new WDLWorkflowOutputsJob for the given workflow, with the given set of bindings after its body runs. """ @@ -3305,7 +3303,7 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # Output section is declared and is nonempty, so evaluate normally # Combine the bindings from the previous job - output_bindings = evaluate_bindings_from_decls(self._workflow.outputs, unwrap(self._bindings), standard_library, drop_missing_files=True) + output_bindings = evaluate_decls_to_bindings(self._workflow.outputs, unwrap(self._bindings), standard_library, drop_missing_files=True) finally: # We don't actually know when all our files are downloaded since # anything we evaluate might devirtualize inside any expression. @@ -3318,7 +3316,7 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) # Null nonexistent optional values and error on the rest - output_bindings = drop_missing_files(output_bindings, self._wdl_options.get("execution_dir")) + output_bindings = drop_missing_files(output_bindings, standard_library=standard_library) return self.postprocess(output_bindings) @@ -3329,7 +3327,7 @@ class WDLRootJob(WDLSectionJob): the workflow name; both forms are accepted. """ - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None: + def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: Optional[WDL_OPTIONS] = None, **kwargs: Any) -> None: """ Create a subtree to run the workflow and namespace the outputs. """ @@ -3461,14 +3459,12 @@ def main() -> None: # virtualize can happen at task boundaries; files must be converted to their virtualized instances. # Similarly, see if the mapping exists to test if the file (string/path representation) relative to the JSON URL exists, # and use it if so - filename_to_uri = import_url_files(input_bindings, toil, inputs_search_path, include_self=False) + convert_remote_files(input_bindings, toil, inputs_search_path) # Configure workflow interpreter options - wdl_options: Dict[str, Any] = {} + wdl_options: WDL_OPTIONS = {} wdl_options["execution_dir"] = execution_dir wdl_options["container"] = options.container - if filename_to_uri is not None: - wdl_options["filename_to_uri"] = filename_to_uri assert wdl_options.get("container") is not None # Run the workflow and get its outputs namespaced with the workflow name. @@ -3481,18 +3477,20 @@ def main() -> None: virtualized_to_devirtualized: Dict[str, str] = dict() # Fetch all the output files - def devirtualize_output(filename: str) -> str: + def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File: """ 'devirtualize' a file using the "toil" object instead of a filestore. Returns its local path. """ # Make sure the output directory exists if we have output files # that might need to use it. + filename = getattr(file, "virtualized_value", None) or file.value os.makedirs(output_directory, exist_ok=True) - return ToilWDLStdLibBase.devirtualize_to(filename, output_directory, toil, wdl_options, devirtualized_to_virtualized, virtualized_to_devirtualized) + file.value = ToilWDLStdLibBase.devirtualize_to(filename, output_directory, toil, wdl_options, devirtualized_to_virtualized, virtualized_to_devirtualized) + return file # Make all the files local files - output_bindings = map_over_files_in_bindings(output_bindings, devirtualize_output) + output_bindings = map_over_typed_files_in_bindings(output_bindings, devirtualize_output) # Report the result in the right format outputs = WDL.values_to_json(output_bindings) From be0203e1c66133d16b315977f11d1bf98c09c563 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 20 Aug 2024 16:21:07 -0700 Subject: [PATCH 09/36] Fix overwriting files --- src/toil/wdl/wdltoil.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 0f1476d6cf..e517519ace 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -756,6 +756,11 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: # Wasn't found there # Mostly to satisfy mypy continue + + if candidate_uri.startswith("file:") or filename == candidate_uri: + # Don't replace if the original file was already found + # or if the replacement value is the same as the original + return file logger.info('Converting input file path %s to %s', filename, candidate_uri) # Work out what the basename for the file was From f60d47518a283b7be4a7b337ddd4dd58efe86f0b Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 22 Aug 2024 01:06:16 -0700 Subject: [PATCH 10/36] Fix prototype implementation --- src/toil/wdl/wdltoil.py | 53 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index e517519ace..d375233fc0 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -484,7 +484,20 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: # This is a duplicate existing_value = merged[binding.name] if existing_value != binding.value: - raise RuntimeError('Conflicting bindings for %s with values %s and %s', binding.name, existing_value, binding.value) + # This can happen if a binding has been virtualized and then devirtualized as devirtualization will replace the value + # Drop the unvirtualized binding in favor of the virtualized to ensure caching + # todo: figure out a better way to do this + existing_virtualized_value = getattr(existing_value, "virtualized_value", None) + current_virtualized_value = getattr(binding.value, "virtualized_value", None) + both_none = existing_virtualized_value is None and current_virtualized_value is None + both_virtualized = existing_virtualized_value is not None and current_virtualized_value is not None + virtualized_equal = existing_virtualized_value == current_virtualized_value + if both_none or (both_virtualized and not virtualized_equal): + raise RuntimeError('Conflicting bindings for %s with values %s and %s', binding.name, existing_value, binding.value) + elif existing_virtualized_value is not None: + continue + else: + merged = merged.bind(binding.name, binding.value, binding.info) else: logger.debug('Drop duplicate binding for %s', binding.name) else: @@ -887,6 +900,7 @@ def f(file: WDL.Value.File) -> WDL.Value.Base: return parse(infile.read()) return f + def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: if getattr(file, "nonexistent", False): return file @@ -1090,6 +1104,14 @@ def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_ super().__init__(file_store, wdl_options) self.container = container + def _read(self, parse: Callable[[str], WDL.Value.Base]) -> Callable[[WDL.Value.File], WDL.Value.Base]: + # todo: figure out better way than reoverriding overridden function + def f(file: WDL.Value.File) -> WDL.Value.Base: + with open(self._devirtualize_filename(file.value), "r") as infile: + return parse(infile.read()) + + return f + @memoize def _devirtualize_filename(self, filename: str) -> str: """ @@ -1462,16 +1484,16 @@ def drop_if_missing(file: WDL.Value.File, standard_library: ToilWDLStdLibBase) - the current working directory of the job and is where all relative paths will be interpreted from """ work_dir = standard_library.execution_dir - filename = file.value - virtualized_filename = getattr(file, "virtualized_value", None) + filename = getattr(file, "virtualized_value", None) or file.value value_type = file.type logger.debug("Consider file %s", filename) - if virtualized_filename is not None and is_url(virtualized_filename): - if virtualized_filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(virtualized_filename): + if filename is not None and is_url(filename): + if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(filename): # We assume anything in the filestore actually exists. - devirtualized_filename = standard_library._devirtualize_filename(virtualized_filename) + devirtualized_filename = standard_library._devirtualize_filename(filename) file.value = devirtualized_filename + setattr(file, "virtualized_value", filename) return file else: logger.warning('File %s with type %s does not actually exist at its URI', filename, value_type) @@ -1810,6 +1832,8 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # UUID to use for virtualizing files standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) + devirtualize_files(bindings, standard_library) + if self._task.inputs: logger.debug("Evaluating task code") # Evaluate all the inputs that aren't pre-set @@ -2477,6 +2501,9 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: incoming_bindings = combine_bindings(unwrap_all(self._prev_node_results)) # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) + + devirtualize_files(incoming_bindings, standard_library) + if isinstance(self._node, WDL.Tree.Decl): # This is a variable assignment logger.info('Setting %s to %s', self._node.name, self._node.expr) @@ -2568,6 +2595,8 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) + devirtualize_files(current_bindings, standard_library) + for node in self._nodes: if isinstance(node, WDL.Tree.Decl): # This is a variable assignment @@ -2605,6 +2634,12 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: """ super().run(file_store) combined = combine_bindings(unwrap_all(self._prev_node_results)) + + # Set up the WDL standard library + standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) + + devirtualize_files(combined, standard_library) + # Make sure to run the universal postprocessing steps return self.postprocess(combined) @@ -3038,6 +3073,8 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) + devirtualize_files(bindings, standard_library) + # Get what to scatter over try: scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library) @@ -3176,6 +3213,8 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) + devirtualize_files(bindings, standard_library) + # Get the expression value. Fake a name. try: expr_value = evaluate_named_expression(self._conditional, "", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library) @@ -3241,6 +3280,8 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) + devirtualize_files(bindings, standard_library) + if self._workflow.inputs: try: bindings = evaluate_decls_to_bindings(self._workflow.inputs, bindings, standard_library, include_previous=True) From 4bbe63d39fe90f5cdfddd4cb43e1e46f5b95ccda Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 23 Aug 2024 13:19:36 -0700 Subject: [PATCH 11/36] Fix documentation --- src/toil/test/wdl/wdltoil_test.py | 2 +- src/toil/wdl/wdltoil.py | 39 +++++++------------------------ 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/src/toil/test/wdl/wdltoil_test.py b/src/toil/test/wdl/wdltoil_test.py index b0e749d3c0..460ae678f4 100644 --- a/src/toil/test/wdl/wdltoil_test.py +++ b/src/toil/test/wdl/wdltoil_test.py @@ -6,7 +6,7 @@ import subprocess import unittest from uuid import uuid4 -from typing import Optional +from typing import Optional, Union from unittest.mock import patch from typing import Any, Dict, List, Set diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index ab9532d9f5..332f1a4f29 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -93,7 +93,7 @@ logger = logging.getLogger(__name__) -WDL_OPTIONS = TypedDict('WDL_OPTIONS', {"execution_dir": NotRequired[str], "container": NotRequired[str], "enforce_existence": NotRequired[bool], +WDL_OPTIONS = TypedDict('WDL_OPTIONS', {"execution_dir": NotRequired[str], "container": NotRequired[str], "share_files_with": NotRequired["ToilWDLStdLibBase"], "task_path": str}) @@ -888,12 +888,11 @@ class ToilWDLStdLibBase(WDL.StdLib.Base): def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_OPTIONS): """ Set up the standard library. - - :param task_path: Dotted WDL name of the part of the workflow this library is working for. - :param share_files_with: If set to an existing standard library - instance, use the same file upload and download paths as it. - :param wdl_options: Extra options to pass into the standard library to use, ex: - execution_dir: directory to use as the working directory for workflow code, + :param wdl_options: Options to pass into the standard library to use. + Ex: + execution_dir: directory to use as the working directory for workflow code. + task_path: Dotted WDL name of the part of the workflow this library is working for. + share_files_with: If set to an existing standard library instance, use the same file upload and download paths as it. """ # TODO: Just always be the 1.2 standard library. wdl_version = "1.2" @@ -936,30 +935,13 @@ def execution_dir(self) -> Optional[str]: execution_dir: Optional[str] = self._wdl_options.get("execution_dir") return execution_dir - @property - def enforce_existence(self) -> bool: - """ - If enforce_existence is not defined in self._wdl_options, default to true - :return: - """ - enforce_existence: bool = self._wdl_options.get("enforce_existence") or True - return enforce_existence - @property def share_files_with(self) -> Optional["ToilWDLStdLibBase"]: - """ - If enforce_existence is not defined in self._wdl_options, default to true - :return: - """ share_files_with: Optional["ToilWDLStdLibBase"] = self._wdl_options.get("share_files_with") return share_files_with @property def task_path(self) -> str: - """ - If enforce_existence is not defined in self._wdl_options, default to true - :return: - """ task_path: str = self._wdl_options["task_path"] return task_path @@ -1931,10 +1913,8 @@ def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDL :param namespace: The namespace that the task's *contents* exist in. The caller has alredy added the task's own name. - - :param task_path: Like the namespace, but including subscript numbers - for scatters. """ + # task_path in wdl_options is like the namespace, but including subscript numbers for scatters super().__init__(unitName=wdl_options["task_path"] + ".inputs", displayName=namespace + ".inputs", wdl_options=wdl_options, **kwargs) logger.info("Preparing to run task code for %s as %s", task.name, namespace) @@ -2090,14 +2070,13 @@ def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBind :param namespace: The namespace that the task's *contents* exist in. The caller has alredy added the task's own name. - - :param task_path: Like the namespace, but including subscript numbers - for scatters. """ # This job should not be local because it represents a real workflow task. # TODO: Instead of re-scheduling with more resources, add a local # "wrapper" job like CWL uses to determine the actual requirements. + + # task_path in wdl_options is like the namespace, but including subscript numbers for scatters super().__init__(unitName=wdl_options["task_path"] + ".command", displayName=namespace + ".command", local=False, wdl_options=wdl_options, **kwargs) logger.info("Preparing to run task %s as %s", task.name, namespace) From a6b18c7c1e9c1d76417d829b76a4adfe692ca5a5 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 28 Aug 2024 19:23:45 -0700 Subject: [PATCH 12/36] Resolve nested virtualize files issue by converting back to original and permanent devirtualization of URLs --- src/toil/wdl/wdltoil.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 332f1a4f29..968e116f18 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -472,6 +472,11 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: # # So we do the merge manually. + new_all_bindings = [] + for bindings in all_bindings: + new_bindings = map_over_typed_files_in_bindings(bindings, revert_file_to_original) + new_all_bindings.append(new_bindings) + if len(all_bindings) == 0: # Combine nothing return WDL.Env.Bindings() @@ -486,20 +491,7 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: # This is a duplicate existing_value = merged[binding.name] if existing_value != binding.value: - # This can happen if a binding has been virtualized and then devirtualized as devirtualization will replace the value - # Drop the unvirtualized binding in favor of the virtualized to ensure caching - # todo: figure out a better way to do this - existing_virtualized_value = getattr(existing_value, "virtualized_value", None) - current_virtualized_value = getattr(binding.value, "virtualized_value", None) - both_none = existing_virtualized_value is None and current_virtualized_value is None - both_virtualized = existing_virtualized_value is not None and current_virtualized_value is not None - virtualized_equal = existing_virtualized_value == current_virtualized_value - if both_none or (both_virtualized and not virtualized_equal): - raise RuntimeError('Conflicting bindings for %s with values %s and %s', binding.name, existing_value, binding.value) - elif existing_virtualized_value is not None: - continue - else: - merged = merged.bind(binding.name, binding.value, binding.info) + raise RuntimeError('Conflicting bindings for %s with values %s and %s', binding.name, existing_value, binding.value) else: logger.debug('Drop duplicate binding for %s', binding.name) else: @@ -507,6 +499,12 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: return merged +def revert_file_to_original(file: WDL.Value.File) -> Optional[WDL.Value.File]: + original_value = getattr(file, "original_value", None) + if original_value is not None: + file.value = original_value + return file + # TODO: Develop a Protocol that can match the logging function type more closely def log_bindings(log_function: Callable[..., None], message: str, all_bindings: Sequence[Promised[WDLBindings]]) -> None: """ @@ -989,8 +987,9 @@ def f(file: WDL.Value.File) -> WDL.Value.Base: def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: if getattr(file, "nonexistent", False): return file - virtualized_filename = getattr(file, "virtualized_value", file.value) - file.value = self._devirtualize_filename(virtualized_filename) + virtualized_filename = getattr(file, "virtualized_value", None) + if virtualized_filename is not None: + file.value = self._devirtualize_filename(virtualized_filename) return file def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) -> WDL.Value.File: @@ -1001,7 +1000,10 @@ def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) if not os.path.exists(abs_filepath): setattr(file, "nonexistent", True) return file - virtualized = self._virtualize_filename(getattr(file, "virtualized_value", None) or file.value) + current_virtualized_value = getattr(file, "virtualized_value", None) + virtualized = self._virtualize_filename(current_virtualized_value or file.value) + if current_virtualized_value is None: + setattr(file, "original_value", file.value) setattr(file, "virtualized_value", virtualized) return file @@ -2310,7 +2312,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: os.makedirs(os.environ['MINIWDL__SINGULARITY__IMAGE_CACHE'], exist_ok=True) # Run containers with Singularity - TaskContainerImplementation: Type[TaskContainer] = SingularityContainer + TaskContainerImplementation: Type[TaskContainer] = SingularityContainer elif self._wdl_options.get("container") in ["docker", "auto"]: # Run containers with Docker # TODO: Poll if it is available and don't just try and fail. From 94da9af63a82665cb3308d0235350414734cc339 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 28 Aug 2024 22:20:08 -0700 Subject: [PATCH 13/36] Fix virtualization of URLs that aren't toil URIs --- src/toil/wdl/wdltoil.py | 114 ++++++++++++++++++++++++++++------------ 1 file changed, 81 insertions(+), 33 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 968e116f18..5312c1141a 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -48,6 +48,8 @@ Union, cast, TypedDict) +from urllib.error import HTTPError + if sys.version_info < (3, 11): from typing_extensions import NotRequired else: @@ -65,6 +67,7 @@ from WDL.runtime.backend.docker_swarm import SwarmContainer from WDL.runtime.backend.singularity import SingularityContainer from WDL.runtime.task_container import TaskContainer +from WDL.runtime.error import DownloadFailed from toil.batchSystems.abstractBatchSystem import InsufficientSystemResources from toil.common import Toil, addOptions @@ -111,6 +114,7 @@ def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] WDL.Error.ImportError, WDL.Error.ValidationError, WDL.Error.MultipleValidationErrors, + DownloadFailed, FileNotFoundError, InsufficientSystemResources, LocatorException, @@ -779,7 +783,15 @@ def _call_eager(self, expr: "WDL.Expr.Apply", arguments: List[WDL.Value.Base]) - # Return the result as a WDL float value return WDL.Value.Float(total_size) -def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool: +def is_toil_url(filename: str) -> bool: + return is_url(filename, schemes=[TOIL_URI_SCHEME]) + + +def is_normal_url(filename: str) -> bool: + return is_url(filename, ['http:', 'https:', 's3:', 'gs:']) + + +def is_url(filename: str, schemes: List[str]=['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool: """ Decide if a filename is a known kind of URL """ @@ -989,22 +1001,31 @@ def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: return file virtualized_filename = getattr(file, "virtualized_value", None) if virtualized_filename is not None: - file.value = self._devirtualize_filename(virtualized_filename) + devirtualized = self._devirtualize_filename(virtualized_filename) + if file.value != devirtualized: + # Remember the original value for when we merge bindings from finished jobs to future jobs as the devirtualized name + # is job specific, but the merge requires the file names to be the same + setattr(file, "original_value", file.value) + file.value = devirtualized return file def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) -> WDL.Value.File: if enforce_existence is False: # We only want to error on a nonexistent file in the output section # Since we need to virtualize on task boundaries, don't enforce existence if on a boundary - abs_filepath = os.path.join(self.execution_dir, file.value) if self.execution_dir is not None else os.path.abspath(file.value) - if not os.path.exists(abs_filepath): + if is_normal_url(file.value): + file_uri = Toil.normalize_uri(file.value) + else: + abs_filepath = os.path.join(self.execution_dir, file.value) if self.execution_dir is not None else os.path.abspath(file.value) + file_uri = Toil.normalize_uri(abs_filepath) + + if not AbstractJobStore.url_exists(file_uri): setattr(file, "nonexistent", True) return file current_virtualized_value = getattr(file, "virtualized_value", None) - virtualized = self._virtualize_filename(current_virtualized_value or file.value) if current_virtualized_value is None: - setattr(file, "original_value", file.value) - setattr(file, "virtualized_value", virtualized) + virtualized = self._virtualize_filename(file.value) + setattr(file, "virtualized_value", virtualized) return file @memoize @@ -1142,39 +1163,66 @@ def _virtualize_filename(self, filename: str) -> str: File value """ - if is_url(filename): + if is_toil_url(filename): # Already virtual logger.debug('Already virtual: %s', filename) return filename - - # Otherwise this is a local file and we want to fake it as a Toil file store file - - # Make it an absolute path - if self.execution_dir is not None: - # To support relative paths from execution directory, join the execution dir and filename - # If filename is already an abs path, join() will not do anything - abs_filename = os.path.join(self.execution_dir, filename) + elif is_normal_url(filename): + # This is a URL (http, s3, etc) that we want to virtualize + # First check the cache + if filename in self._devirtualized_to_virtualized: + # Note: this is a little duplicative with the local file path branch, but the keys are different + result = self._devirtualized_to_virtualized[filename] + logger.debug("Re-using virtualized WDL file %s for %s", result, filename) + return result + try: + imported = self._file_store.import_file(filename) + except FileNotFoundError: + # todo: raising exceptions inside of this function will be captured in StdLib.StaticFunction._call_eage which always raises an EvalError. + # Ideally, the error raised here should escape to be captured in the wdl runner main loop, but I can't figure out how + raise WDL.runtime.DownloadFailed("File at URL %s does not exist or is inaccessible." % filename) + except HTTPError: + # Something went wrong with the connection, raise and retry later + raise + file_basename = os.path.basename(urlsplit(filename).path) + # Get the URL to the parent directory and use that. + parent_dir = urljoin(filename, ".") + # Pack a UUID of the parent directory + dir_id = self._parent_dir_to_ids.setdefault(parent_dir, uuid.uuid4()) + result = pack_toil_uri(imported, self.task_path, dir_id, file_basename) + logger.debug('Virtualized %s as WDL file %s', filename, result) + # We can't the Toil URI in the cache because it would point to the URL instead of a file + # which we don't have as we haven't written the contents of the URL into a file yet + # The next devirtualize_filename call will generate a file and store it in the cache for us + return result else: - abs_filename = os.path.abspath(filename) + # Otherwise this is a local file and we want to fake it as a Toil file store file + # Make it an absolute path + if self.execution_dir is not None: + # To support relative paths from execution directory, join the execution dir and filename + # If filename is already an abs path, join() will not do anything + abs_filename = os.path.join(self.execution_dir, filename) + else: + abs_filename = os.path.abspath(filename) - if abs_filename in self._devirtualized_to_virtualized: - # This is a previously devirtualized thing so we can just use the - # virtual version we remembered instead of reuploading it. - result = self._devirtualized_to_virtualized[abs_filename] - logger.debug("Re-using virtualized WDL file %s for %s", result, filename) - return result + if abs_filename in self._devirtualized_to_virtualized: + # This is a previously devirtualized thing so we can just use the + # virtual version we remembered instead of reuploading it. + result = self._devirtualized_to_virtualized[abs_filename] + logger.debug("Re-using virtualized WDL file %s for %s", result, filename) + return result - file_id = self._file_store.writeGlobalFile(abs_filename) + file_id = self._file_store.writeGlobalFile(abs_filename) - file_dir = os.path.dirname(abs_filename) - parent_id = self._parent_dir_to_ids.setdefault(file_dir, uuid.uuid4()) - result = pack_toil_uri(file_id, self.task_path, parent_id, os.path.basename(abs_filename)) - logger.debug('Virtualized %s as WDL file %s', filename, result) - # Remember the upload in case we share a cache - self._devirtualized_to_virtualized[abs_filename] = result - # And remember the local path in case we want a redownload - self._virtualized_to_devirtualized[result] = abs_filename - return result + file_dir = os.path.dirname(abs_filename) + parent_id = self._parent_dir_to_ids.setdefault(file_dir, uuid.uuid4()) + result = pack_toil_uri(file_id, self.task_path, parent_id, os.path.basename(abs_filename)) + logger.debug('Virtualized %s as WDL file %s', filename, result) + # Remember the upload in case we share a cache + self._devirtualized_to_virtualized[abs_filename] = result + # And remember the local path in case we want a redownload + self._virtualized_to_devirtualized[result] = abs_filename + return result class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase): """ From 644fd511b4c356d9ee79744458bc86d691c1c917 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 28 Aug 2024 22:24:25 -0700 Subject: [PATCH 14/36] Mypy --- src/toil/wdl/wdltoil.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 5312c1141a..f74a262505 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -1180,10 +1180,13 @@ def _virtualize_filename(self, filename: str) -> str: except FileNotFoundError: # todo: raising exceptions inside of this function will be captured in StdLib.StaticFunction._call_eage which always raises an EvalError. # Ideally, the error raised here should escape to be captured in the wdl runner main loop, but I can't figure out how - raise WDL.runtime.DownloadFailed("File at URL %s does not exist or is inaccessible." % filename) + raise DownloadFailed("File at URL %s does not exist or is inaccessible." % filename) except HTTPError: # Something went wrong with the connection, raise and retry later raise + if imported is None: + # Satisfy mypy, this should never happen though as we don't pass a shared file name (which is the only way import_file returns None) + raise RuntimeError("Failed to import URL %s into jobstore." % filename) file_basename = os.path.basename(urlsplit(filename).path) # Get the URL to the parent directory and use that. parent_dir = urljoin(filename, ".") From f8c0ef193f1322b86ce73cb6ce0da4bd68e75fd9 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 28 Aug 2024 22:41:47 -0700 Subject: [PATCH 15/36] Remove documentation that no longer applies --- src/toil/wdl/wdltoil.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 685cb45b1d..aff3ec3099 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -3687,14 +3687,6 @@ def main() -> None: # Get the execution directory execution_dir = os.getcwd() - # There are two instances where paths relative to the JSON URL may be necessary: - # ToilStdLibBase._devirtualize_to and ToilStdLibBase._virtualize_filename - # devirtualize can happen when a binding needs a function call that reads the file, which miniwdl will call internally - # as the file (which is represented as a string/path) can be relative to the JSON URL, see if the mapping exists - # and use it if so - # virtualize can happen at task boundaries; files must be converted to their virtualized instances. - # Similarly, see if the mapping exists to test if the file (string/path representation) relative to the JSON URL exists, - # and use it if so convert_remote_files(input_bindings, toil, inputs_search_path) # Configure workflow interpreter options From 6365b348bb7a3753b61d2265e24e1bc0c40acd8b Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 28 Aug 2024 22:45:51 -0700 Subject: [PATCH 16/36] Fix merge conflict issue --- src/toil/wdl/wdltoil.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index aff3ec3099..ed96ffd570 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -807,7 +807,6 @@ def convert_remote_files(environment: WDLBindings, file_source: Union[AbstractFi :param environment: Bindings to evaluate on :param file_source: Context to search for remote files with :param search_paths: List of paths to search through - :return: """ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: """ @@ -899,21 +898,11 @@ class ToilWDLStdLibBase(WDL.StdLib.Base): def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_OPTIONS): """ Set up the standard library. -<<<<<<< HEAD :param wdl_options: Options to pass into the standard library to use. - Ex: - execution_dir: directory to use as the working directory for workflow code. - task_path: Dotted WDL name of the part of the workflow this library is working for. + Ex: task_path: Dotted WDL name of the part of the workflow this library is working for. + execution_dir: Directory to use as the working directory for workflow code. + enforce_existence: If true, then if a file is detected as nonexistent, raise an error. Else, let it pass through share_files_with: If set to an existing standard library instance, use the same file upload and download paths as it. -======= - - :param task_path: Dotted WDL name of the part of the workflow this library is working for. - :param execution_dir: Directory to use as the working directory for workflow code. - :param enforce_existence: If true, then if a file is detected as - nonexistent, raise an error. Else, let it pass through - :param share_files_with: If set to an existing standard library - instance, use the same file upload and download paths as it. ->>>>>>> f23b1d3f8cf74cb4382d94d6dca4180e423d79df """ # TODO: Just always be the 1.2 standard library. wdl_version = "1.2" From 0a35c53583b8881adf91bf3ace01700343e87c35 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 28 Aug 2024 22:48:45 -0700 Subject: [PATCH 17/36] Enable symlink conformance test --- src/toil/test/wdl/wdltoil_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/toil/test/wdl/wdltoil_test.py b/src/toil/test/wdl/wdltoil_test.py index 8fa5e7248a..3518319b89 100644 --- a/src/toil/test/wdl/wdltoil_test.py +++ b/src/toil/test/wdl/wdltoil_test.py @@ -52,7 +52,6 @@ def tearDown(self) -> None: 16, # Basic object test (deprecated and removed in 1.1); MiniWDL and toil-wdl-runner do not support Objects, so this will fail if ran by them 21, # Parser: expression placeholders in strings in conditional expressions in 1.0, Cromwell style; Fails with MiniWDL and toil-wdl-runner 64, # Legacy test for as_map_as_input; It looks like MiniWDL does not have the function as_map() - 72, # Symlink passthrough; see 77, # Test that array cannot coerce to a string. WDL 1.1 does not allow compound types to coerce into a string. This should return a TypeError. ] From ce210516561bb0feadbc868812ded182da9c231b Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 28 Aug 2024 22:48:54 -0700 Subject: [PATCH 18/36] whitespace --- src/toil/test/wdl/wdltoil_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/toil/test/wdl/wdltoil_test.py b/src/toil/test/wdl/wdltoil_test.py index 3518319b89..256d518970 100644 --- a/src/toil/test/wdl/wdltoil_test.py +++ b/src/toil/test/wdl/wdltoil_test.py @@ -48,7 +48,7 @@ def tearDown(self) -> None: WDL_CONFORMANCE_TEST_COMMIT = "01401a46bc0e60240fb2b69af4b978d0a5bd8fc8" # These tests are known to require things not implemented by # Toil and will not be run in CI. -WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL= [ +WDL_CONFORMANCE_TESTS_UNSUPPORTED_BY_TOIL = [ 16, # Basic object test (deprecated and removed in 1.1); MiniWDL and toil-wdl-runner do not support Objects, so this will fail if ran by them 21, # Parser: expression placeholders in strings in conditional expressions in 1.0, Cromwell style; Fails with MiniWDL and toil-wdl-runner 64, # Legacy test for as_map_as_input; It looks like MiniWDL does not have the function as_map() From 343d6d8cae37e0bbad43c711b36f25b73e6be92f Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 30 Aug 2024 23:50:13 -0700 Subject: [PATCH 19/36] Fix virtualize/devirtualize_filename to be called only during command stdlibs and combine_bindings issue with new virtualization representation --- src/toil/wdl/wdltoil.py | 78 +++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index ed96ffd570..d8514d67ce 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -27,6 +27,7 @@ import stat import subprocess import sys +import tempfile import textwrap import uuid from contextlib import ExitStack, contextmanager @@ -47,7 +48,7 @@ TypeVar, Union, cast, - TypedDict) + TypedDict, IO) if sys.version_info < (3, 11): from typing_extensions import NotRequired @@ -454,7 +455,11 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), uri) - +def virtualized_equal(file1: WDL.Value.Base, file2: WDL.Value.Base) -> bool: + def f(file: WDL.Value.File) -> WDL.Value.File: + file.value = getattr(file, "virtualized_value", file.value) + return file + return map_over_typed_files_in_value(file1, f) == map_over_typed_files_in_value(file2, f) # Bindings have a long type name WDLBindings = WDL.Env.Bindings[WDL.Value.Base] @@ -477,11 +482,6 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: # # So we do the merge manually. - new_all_bindings = [] - for bindings in all_bindings: - new_bindings = map_over_typed_files_in_bindings(bindings, revert_file_to_original) - new_all_bindings.append(new_bindings) - if len(all_bindings) == 0: # Combine nothing return WDL.Env.Bindings() @@ -495,7 +495,7 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: if binding.name in merged: # This is a duplicate existing_value = merged[binding.name] - if existing_value != binding.value: + if not virtualized_equal(existing_value, binding.value): raise RuntimeError('Conflicting bindings for %s with values %s and %s', binding.name, existing_value, binding.value) else: logger.debug('Drop duplicate binding for %s', binding.name) @@ -996,6 +996,24 @@ def f(file: WDL.Value.File) -> WDL.Value.Base: return f + def _write( + self, serialize: Callable[[WDL.Value.Base, IO[bytes]], None] + ) -> Callable[[WDL.Value.Base], WDL.Value.File]: + "generate write_* function implementation based on serialize" + + def _f( + v: WDL.Value.Base, + ) -> WDL.Value.File: + os.makedirs(self._write_dir, exist_ok=True) + with tempfile.NamedTemporaryFile(dir=self._write_dir, delete=False) as outfile: + serialize(v, outfile) + filename = outfile.name + from WDL._util import chmod_R_plus + chmod_R_plus(filename, file_bits=0o660) + return WDL.Value.File(filename) + + return _f + def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: if getattr(file, "nonexistent", False): return file @@ -1010,6 +1028,9 @@ def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: return file def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) -> WDL.Value.File: + if getattr(file, "virtualized_value", None) is not None: + return file + if enforce_existence is False: # We only want to error on a nonexistent file in the output section # Since we need to virtualize on task boundaries, don't enforce existence if on a boundary @@ -1022,10 +1043,8 @@ def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) if not AbstractJobStore.url_exists(file_uri): setattr(file, "nonexistent", True) return file - current_virtualized_value = getattr(file, "virtualized_value", None) - if current_virtualized_value is None: - virtualized = self._virtualize_filename(file.value) - setattr(file, "virtualized_value", virtualized) + virtualized = self._virtualize_filename(file.value) + setattr(file, "virtualized_value", virtualized) return file @memoize @@ -1198,6 +1217,8 @@ def _virtualize_filename(self, filename: str) -> str: # We can't the Toil URI in the cache because it would point to the URL instead of a file # which we don't have as we haven't written the contents of the URL into a file yet # The next devirtualize_filename call will generate a file and store it in the cache for us + # But store the forward mapping in case another virtualization is called + self._devirtualized_to_virtualized[filename] = result return result else: # Otherwise this is a local file and we want to fake it as a Toil file store file @@ -1254,6 +1275,23 @@ def f(file: WDL.Value.File) -> WDL.Value.Base: return f + def _write( + self, serialize: Callable[[WDL.Value.Base, IO[bytes]], None] + ) -> Callable[[WDL.Value.Base], WDL.Value.File]: + def _f( + v: WDL.Value.Base, + ) -> WDL.Value.File: + os.makedirs(self._write_dir, exist_ok=True) + with tempfile.NamedTemporaryFile(dir=self._write_dir, delete=False) as outfile: + serialize(v, outfile) + filename = outfile.name + from WDL._util import chmod_R_plus + chmod_R_plus(filename, file_bits=0o660) + vfn = self._virtualize_filename(filename) + return WDL.Value.File(vfn) + + return _f + @memoize def _devirtualize_filename(self, filename: str) -> str: """ @@ -1998,8 +2036,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # UUID to use for virtualizing files standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) - devirtualize_files(bindings, standard_library) - if self._task.inputs: logger.debug("Evaluating task code") # Evaluate all the inputs that aren't pre-set @@ -2098,7 +2134,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: accelerator_requirement = parse_accelerator(accelerator_spec) runtime_accelerators = [accelerator_requirement] - # Schedule to get resources. Pass along the bindings from evaluating all the inputs and decls, and the runtime, with files virtualized. run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library, enforce_existence=False), virtualize_files(runtime_bindings, standard_library, enforce_existence=False), self._task_id, self._namespace, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=self._wdl_options) # Run that as a child @@ -2681,8 +2716,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) - devirtualize_files(incoming_bindings, standard_library) - if isinstance(self._node, WDL.Tree.Decl): # This is a variable assignment logger.info('Setting %s to %s', self._node.name, self._node.expr) @@ -2777,8 +2810,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) - devirtualize_files(current_bindings, standard_library) - for node in self._nodes: if isinstance(node, WDL.Tree.Decl): # This is a variable assignment @@ -2820,8 +2851,6 @@ def run(self, file_store: AbstractFileStore) -> WDLBindings: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) - devirtualize_files(combined, standard_library) - # Make sure to run the universal postprocessing steps return self.postprocess(combined) @@ -3254,8 +3283,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) - devirtualize_files(bindings, standard_library) - # Get what to scatter over try: scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library) @@ -3394,8 +3421,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) - devirtualize_files(bindings, standard_library) - # Get the expression value. Fake a name. try: expr_value = evaluate_named_expression(self._conditional, "", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library) @@ -3461,8 +3486,6 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library standard_library = ToilWDLStdLibBase(file_store, self._wdl_options) - devirtualize_files(bindings, standard_library) - if self._workflow.inputs: try: bindings = evaluate_decls_to_bindings(self._workflow.inputs, bindings, standard_library, include_previous=True) @@ -3470,6 +3493,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Report all files are downloaded now that all expressions are evaluated. self.files_downloaded_hook([(p, p) for p in standard_library.get_local_paths()]) + bindings = virtualize_files(bindings, standard_library, enforce_existence=False) # Make jobs to run all the parts of the workflow sink = self.create_subgraph(self._workflow.body, [], bindings) From 37efd55ec749950e90c118f1cb0de2c25d5ce064 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 5 Sep 2024 18:53:09 -0700 Subject: [PATCH 20/36] Add documentation and make convert_files function import greedily --- src/toil/wdl/wdltoil.py | 120 +++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 38 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index d8514d67ce..340543da0f 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -98,6 +98,11 @@ logger = logging.getLogger(__name__) +# WDL options to pass into the WDL jobs and standard libraries +# task_path: Dotted WDL name of the part of the workflow this library is working for. +# execution_dir: Directory to use as the working directory for workflow code. +# enforce_existence: If true, then if a file is detected as nonexistent, raise an error. Else, let it pass through +# share_files_with: If set to an existing standard library instance, use the same file upload and download paths as it. WDL_OPTIONS = TypedDict('WDL_OPTIONS', {"execution_dir": NotRequired[str], "container": NotRequired[str], "share_files_with": NotRequired["ToilWDLStdLibBase"], "task_path": str}) @@ -456,6 +461,12 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre def virtualized_equal(file1: WDL.Value.Base, file2: WDL.Value.Base) -> bool: + """ + Check if two WDL values are equal when taking account file virtualization. + :param file1: WDL value + :param file2: WDL value + :return: Whether the two file values are true + """ def f(file: WDL.Value.File) -> WDL.Value.File: file.value = getattr(file, "virtualized_value", file.value) return file @@ -504,11 +515,6 @@ def combine_bindings(all_bindings: Sequence[WDLBindings]) -> WDLBindings: return merged -def revert_file_to_original(file: WDL.Value.File) -> Optional[WDL.Value.File]: - original_value = getattr(file, "original_value", None) - if original_value is not None: - file.value = original_value - return file # TODO: Develop a Protocol that can match the logging function type more closely def log_bindings(log_function: Callable[..., None], message: str, all_bindings: Sequence[Promised[WDLBindings]]) -> None: @@ -801,16 +807,24 @@ def is_url(filename: str, schemes: List[str]=['http:', 'https:', 's3:', 'gs:', T return True return False -def convert_remote_files(environment: WDLBindings, file_source: Union[AbstractFileStore, Toil], search_paths: Optional[List[str]] = None) -> None: +def convert_remote_files(environment: WDLBindings, file_source: Toil, task_path: str, search_paths: Optional[List[str]] = None, skip_remote: bool = False) -> None: """ Iterate through the environment and convert all files that reference a remote file from the possible URIs to that URI. + If the file references a local file, leave it unchanged. :param environment: Bindings to evaluate on :param file_source: Context to search for remote files with - :param search_paths: List of paths to search through - """ + :param task_path: Dotted WDL name of the user-level code doing the + importing (probably the workflow name). + :param search_paths: If set, try resolving input location relative to the URLs or + directories in this list. + :param skip_remote: If set, don't try to import files from remote + locations. Leave them as URIs. + """ + path_to_id: Dict[str, uuid.UUID] = {} + @memoize def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: """ - Detect if any potential URI exist and convert a file's value to a URI + Detect if any potential URI exists. Will convert a file's value to a URI and import it. """ # Search through any input search paths passed in and download it if found filename = file.value @@ -818,37 +832,44 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: for candidate_uri in potential_absolute_uris(filename, search_paths if search_paths is not None else []): tried.append(candidate_uri) try: - # Detect if the - found = False - if isinstance(file_source, AbstractJobStore): - found = file_source.url_exists(candidate_uri) - elif isinstance(file_source, Toil): - found = file_source.url_exists(candidate_uri) - if found is False: - # Wasn't found there - continue + if skip_remote and is_url(candidate_uri): + # Use remote URIs in place. But we need to find the one that exists. + if not file_source.url_exists(candidate_uri): + # Wasn't found there + continue + + # Now we know this exists, so pass it through + file.value = candidate_uri + return file + else: + # Actually import + # Try to import the file. Don't raise if we can't find it, just + # return None! + imported = file_source.import_file(candidate_uri, check_existence=False) + if imported is None: + # Wasn't found there + continue except UnimplementedURLException as e: # We can't find anything that can even support this URL scheme. # Report to the user, they are probably missing an extra. logger.critical('Error: ' + str(e)) raise + except HTTPError as e: + # Something went wrong looking for it there. + logger.warning("Checked URL %s but got HTTP status %s", candidate_uri, e.code) + # Try the next location. + continue except Exception: # Something went wrong besides the file not being found. Maybe # we have no auth. logger.error("Something went wrong when testing for existence of %s", candidate_uri) raise - if found is False: + if imported is None: # Wasn't found there # Mostly to satisfy mypy continue - if candidate_uri.startswith("file:") or filename == candidate_uri: - # Don't replace if the original file was already found - # or if the replacement value is the same as the original - return file - logger.info('Converting input file path %s to %s', filename, candidate_uri) - # Work out what the basename for the file was file_basename = os.path.basename(urlsplit(candidate_uri).path) @@ -857,12 +878,42 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: # download them at that basename later. raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") + if candidate_uri.startswith("file:") or filename == candidate_uri: + # Don't replace if the original file was already found + # or if the replacement value is the same as the original + return file + + # Was actually found + if is_url(candidate_uri): + # Might be a file URI or other URI. + # We need to make sure file URIs and local paths that point to + # the same place are treated the same. + parsed = urlsplit(candidate_uri) + if parsed.scheme == "file:": + # This is a local file URI. Convert to a path for source directory tracking. + parent_dir = os.path.dirname(unquote(parsed.path)) + else: + # This is some other URL. Get the URL to the parent directory and use that. + parent_dir = urljoin(candidate_uri, ".") + else: + # Must be a local path + parent_dir = os.path.dirname(candidate_uri) + + # Pack a UUID of the parent directory + dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4()) + + toil_uri = pack_toil_uri(imported, task_path, dir_id, file_basename) + + logger.info('Converting input file path %s to %s', filename, candidate_uri) + # Was actually found file.value = candidate_uri + setattr(file, "virtualized_value", toil_uri) return file # If we get here we tried all the candidates raise RuntimeError(f"Could not find {filename} at any of: {tried}") + map_over_typed_files_in_bindings(environment, convert_file_to_url) @@ -899,10 +950,6 @@ def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_OPTIONS): """ Set up the standard library. :param wdl_options: Options to pass into the standard library to use. - Ex: task_path: Dotted WDL name of the part of the workflow this library is working for. - execution_dir: Directory to use as the working directory for workflow code. - enforce_existence: If true, then if a file is detected as nonexistent, raise an error. Else, let it pass through - share_files_with: If set to an existing standard library instance, use the same file upload and download paths as it. """ # TODO: Just always be the 1.2 standard library. wdl_version = "1.2" @@ -1019,12 +1066,7 @@ def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: return file virtualized_filename = getattr(file, "virtualized_value", None) if virtualized_filename is not None: - devirtualized = self._devirtualize_filename(virtualized_filename) - if file.value != devirtualized: - # Remember the original value for when we merge bindings from finished jobs to future jobs as the devirtualized name - # is job specific, but the merge requires the file names to be the same - setattr(file, "original_value", file.value) - file.value = devirtualized + file.value = self._devirtualize_filename(virtualized_filename) return file def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) -> WDL.Value.File: @@ -1066,6 +1108,9 @@ def _devirtualize_filename(self, filename: str) -> str: @staticmethod def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], state: DirectoryNamingStateDict) -> str: + """ + Given a filename, either return the devirtualized path or the filename itself if not a virtualized URI. + """ if filename.startswith(TOIL_URI_SCHEME): # This is a reference to the Toil filestore. # Deserialize the FileID @@ -1831,7 +1876,6 @@ def ensure_null_files_are_nullable(value: WDL.Value.Base, original_value: WDL.Va :param value: WDL base value to check. This is the WDL value that has been transformed and has the null elements :param original_value: The original WDL base value prior to the transformation. Only used for error messages :param expected_type: The WDL type of the value - :return: """ if isinstance(value, WDL.Value.File): pass @@ -2369,7 +2413,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library # UUID to use for virtualizing files # We process nonexistent files in WDLTaskWrapperJob as those must be run locally, so don't try to devirtualize them - standard_library = ToilWDLStdLibBase(file_store, wdl_options = {"task_path": self._wdl_options["task_path"]}) + standard_library = ToilWDLStdLibBase(file_store, wdl_options={"task_path": self._wdl_options["task_path"]}) # Get the bindings from after the input section bindings = unwrap(self._task_internal_bindings) @@ -3700,7 +3744,7 @@ def main() -> None: # Get the execution directory execution_dir = os.getcwd() - convert_remote_files(input_bindings, toil, inputs_search_path) + convert_remote_files(input_bindings, toil, task_path=target.name, search_paths=inputs_search_path, skip_remote=options.reference_inputs) # Configure workflow interpreter options wdl_options: WDL_OPTIONS = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name} From 491ba1bc9c83393c5690a1d9203fb6d2f9b07d36 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Mon, 9 Sep 2024 18:32:50 -0700 Subject: [PATCH 21/36] Get rid of memoization as File is not hashable (I believe inside LRU) --- src/toil/wdl/wdltoil.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 340543da0f..23c60d954d 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -821,7 +821,6 @@ def convert_remote_files(environment: WDLBindings, file_source: Toil, task_path: locations. Leave them as URIs. """ path_to_id: Dict[str, uuid.UUID] = {} - @memoize def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: """ Detect if any potential URI exists. Will convert a file's value to a URI and import it. From a7292be227ac48208070c6aeaaa68f4235736600 Mon Sep 17 00:00:00 2001 From: stxue1 <122345910+stxue1@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:42:56 -0700 Subject: [PATCH 22/36] Apply suggestions from code review Co-authored-by: Adam Novak --- src/toil/wdl/wdltoil.py | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 1045decf93..467826d895 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -463,9 +463,11 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre def virtualized_equal(file1: WDL.Value.Base, file2: WDL.Value.Base) -> bool: """ Check if two WDL values are equal when taking account file virtualization. + + Treats virtualized and non-virtualized `File`s referring to the same underlying file as equal. :param file1: WDL value :param file2: WDL value - :return: Whether the two file values are true + :return: Whether the two values are equal with file virtualization accounted for """ def f(file: WDL.Value.File) -> WDL.Value.File: file.value = getattr(file, "virtualized_value", file.value) @@ -809,16 +811,15 @@ def is_url(filename: str, schemes: List[str]=['http:', 'https:', 's3:', 'gs:', T def convert_remote_files(environment: WDLBindings, file_source: Toil, task_path: str, search_paths: Optional[List[str]] = None, skip_remote: bool = False) -> None: """ - Iterate through the environment and convert all files that reference a remote file from the possible URIs to that URI. - If the file references a local file, leave it unchanged. + Resolve relative-URI files in the given environment and import all files. :param environment: Bindings to evaluate on - :param file_source: Context to search for remote files with + :param file_source: Context to search for files with :param task_path: Dotted WDL name of the user-level code doing the importing (probably the workflow name). :param search_paths: If set, try resolving input location relative to the URLs or directories in this list. - :param skip_remote: If set, don't try to import files from remote - locations. Leave them as URIs. + :param skip_remote: If set, don't actually import files from remote + locations. Leave them as URI references. """ path_to_id: Dict[str, uuid.UUID] = {} def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: @@ -1008,23 +1009,6 @@ def get_local_paths(self) -> List[str]: return list(self._virtualized_to_devirtualized.values()) - def share_files(self, other: "ToilWDLStdLibBase") -> None: - """ - Share caches for devirtualizing and virtualizing files with another instance. - - Files devirtualized by one instance can be re-virtualized back to their - original virtualized filenames by the other. - """ - - if id(self._virtualized_to_devirtualized) != id(other._virtualized_to_devirtualized): - # Merge the virtualized to devirtualized mappings - self._virtualized_to_devirtualized.update(other._virtualized_to_devirtualized) - other._virtualized_to_devirtualized = self._virtualized_to_devirtualized - - if id(self._devirtualized_to_virtualized) != id(other._devirtualized_to_virtualized): - # Merge the devirtualized to virtualized mappings - self._devirtualized_to_virtualized.update(other._devirtualized_to_virtualized) - other._devirtualized_to_virtualized = self._devirtualized_to_virtualized def _read(self, parse: Callable[[str], WDL.Value.Base]) -> Callable[[WDL.Value.File], WDL.Value.Base]: # To only virtualize on task/function boundaries, rely on the _read function @@ -1241,7 +1225,7 @@ def _virtualize_filename(self, filename: str) -> str: try: imported = self._file_store.import_file(filename) except FileNotFoundError: - # todo: raising exceptions inside of this function will be captured in StdLib.StaticFunction._call_eage which always raises an EvalError. + # todo: raising exceptions inside of this function will be captured in StdLib.StaticFunction._call_eager which always raises an EvalError. # Ideally, the error raised here should escape to be captured in the wdl runner main loop, but I can't figure out how raise DownloadFailed("File at URL %s does not exist or is inaccessible." % filename) except HTTPError as e: From e6718cdd3c0a6bf65f5803ccf4e623427513f1b4 Mon Sep 17 00:00:00 2001 From: stxue1 <122345910+stxue1@users.noreply.github.com> Date: Tue, 17 Sep 2024 19:30:25 -0700 Subject: [PATCH 23/36] Update src/toil/wdl/wdltoil.py Co-authored-by: Adam Novak --- src/toil/wdl/wdltoil.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 467826d895..786d0e20e0 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -2671,8 +2671,6 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: if workdir_in_container is not None: output_wdl_options["execution_dir"] = workdir_in_container outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, wdl_options=output_wdl_options) - # Make sure files downloaded as inputs get re-used if we re-upload them. - outputs_library.share_files(standard_library) output_bindings = evaluate_decls_to_bindings(self._task.outputs, bindings, outputs_library, drop_missing_files=True) # Now we know if the standard output and error were sent somewhere by From 2ad130fd6413864fd6f3df3f31927f63d7585d5d Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 17 Sep 2024 19:32:43 -0700 Subject: [PATCH 24/36] Rename, add comments, remove unused code/comments --- src/toil/wdl/wdltoil.py | 53 ++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 467826d895..c636db1665 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -63,7 +63,7 @@ import WDL.Error import WDL.runtime.config from configargparse import ArgParser -from WDL._util import byte_size_units +from WDL._util import byte_size_units, chmod_R_plus from WDL.Tree import ReadSourceResult from WDL.CLI import print_error from WDL.runtime.backend.docker_swarm import SwarmContainer @@ -101,8 +101,8 @@ # WDL options to pass into the WDL jobs and standard libraries # task_path: Dotted WDL name of the part of the workflow this library is working for. # execution_dir: Directory to use as the working directory for workflow code. -# enforce_existence: If true, then if a file is detected as nonexistent, raise an error. Else, let it pass through # share_files_with: If set to an existing standard library instance, use the same file upload and download paths as it. +# container: The type of container to use when executing a WDL task. Carries through the value of the commandline --container option WDL_OPTIONS = TypedDict('WDL_OPTIONS', {"execution_dir": NotRequired[str], "container": NotRequired[str], "share_files_with": NotRequired["ToilWDLStdLibBase"], "task_path": str}) @@ -796,7 +796,7 @@ def is_toil_url(filename: str) -> bool: return is_url(filename, schemes=[TOIL_URI_SCHEME]) -def is_normal_url(filename: str) -> bool: +def is_standard_url(filename: str) -> bool: return is_url(filename, ['http:', 'https:', 's3:', 'gs:']) @@ -809,17 +809,16 @@ def is_url(filename: str, schemes: List[str]=['http:', 'https:', 's3:', 'gs:', T return True return False -def convert_remote_files(environment: WDLBindings, file_source: Toil, task_path: str, search_paths: Optional[List[str]] = None, skip_remote: bool = False) -> None: +def convert_remote_files(environment: WDLBindings, file_source: Toil, task_path: str, search_paths: Optional[List[str]] = None, import_remote_files: bool = True) -> None: """ - Resolve relative-URI files in the given environment and import all files. + Resolve relative-URI files in the given environment and import all files. Will set the value of the File to the relative-URI :param environment: Bindings to evaluate on :param file_source: Context to search for files with :param task_path: Dotted WDL name of the user-level code doing the importing (probably the workflow name). :param search_paths: If set, try resolving input location relative to the URLs or directories in this list. - :param skip_remote: If set, don't actually import files from remote - locations. Leave them as URI references. + :param import_remote_files: If set, import files from remote locations. Else leave them as URI references. """ path_to_id: Dict[str, uuid.UUID] = {} def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: @@ -832,7 +831,7 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: for candidate_uri in potential_absolute_uris(filename, search_paths if search_paths is not None else []): tried.append(candidate_uri) try: - if skip_remote and is_url(candidate_uri): + if not import_remote_files and is_url(candidate_uri): # Use remote URIs in place. But we need to find the one that exists. if not file_source.url_exists(candidate_uri): # Wasn't found there @@ -878,11 +877,6 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: # download them at that basename later. raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File") - if candidate_uri.startswith("file:") or filename == candidate_uri: - # Don't replace if the original file was already found - # or if the replacement value is the same as the original - return file - # Was actually found if is_url(candidate_uri): # Might be a file URI or other URI. @@ -967,7 +961,7 @@ def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_OPTIONS): self._wdl_options: WDL_OPTIONS = wdl_options - share_files_with = self.share_files_with + share_files_with = wdl_options.get("share_files_with") if share_files_with is None: # We get fresh file download/upload state @@ -992,11 +986,6 @@ def execution_dir(self) -> Optional[str]: execution_dir: Optional[str] = self._wdl_options.get("execution_dir") return execution_dir - @property - def share_files_with(self) -> Optional["ToilWDLStdLibBase"]: - share_files_with: Optional["ToilWDLStdLibBase"] = self._wdl_options.get("share_files_with") - return share_files_with - @property def task_path(self) -> str: task_path: str = self._wdl_options["task_path"] @@ -1038,13 +1027,13 @@ def _f( with tempfile.NamedTemporaryFile(dir=self._write_dir, delete=False) as outfile: serialize(v, outfile) filename = outfile.name - from WDL._util import chmod_R_plus chmod_R_plus(filename, file_bits=0o660) return WDL.Value.File(filename) return _f def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: + # We track whether files do not exist with the nonexistent flag in order to coerce to Null/error on use if getattr(file, "nonexistent", False): return file virtualized_filename = getattr(file, "virtualized_value", None) @@ -1053,13 +1042,14 @@ def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File: return file def _virtualize_file(self, file: WDL.Value.File, enforce_existence: bool = True) -> WDL.Value.File: + # If enforce_existence is true, then if a file is detected as nonexistent, raise an error. Else, let it pass through if getattr(file, "virtualized_value", None) is not None: return file if enforce_existence is False: # We only want to error on a nonexistent file in the output section # Since we need to virtualize on task boundaries, don't enforce existence if on a boundary - if is_normal_url(file.value): + if is_standard_url(file.value): file_uri = Toil.normalize_uri(file.value) else: abs_filepath = os.path.join(self.execution_dir, file.value) if self.execution_dir is not None else os.path.abspath(file.value) @@ -1206,15 +1196,16 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil @memoize def _virtualize_filename(self, filename: str) -> str: """ - from a local path in write_dir, 'virtualize' into the filename as it should present in a - File value + from a local path in write_dir, 'virtualize' into the filename as it should present in a File value + + :param filename: Can be a local file path, URL (http, https, s3, gs), or toilfile """ if is_toil_url(filename): # Already virtual logger.debug('Already virtual: %s', filename) return filename - elif is_normal_url(filename): + elif is_standard_url(filename): # This is a URL (http, s3, etc) that we want to virtualize # First check the cache if filename in self._devirtualized_to_virtualized: @@ -1225,9 +1216,8 @@ def _virtualize_filename(self, filename: str) -> str: try: imported = self._file_store.import_file(filename) except FileNotFoundError: - # todo: raising exceptions inside of this function will be captured in StdLib.StaticFunction._call_eager which always raises an EvalError. - # Ideally, the error raised here should escape to be captured in the wdl runner main loop, but I can't figure out how - raise DownloadFailed("File at URL %s does not exist or is inaccessible." % filename) + logger.error("File at URL %s does not exist or is inaccessible." % filename) + raise except HTTPError as e: # Something went wrong with the connection logger.error("File %s could not be downloaded due to HTTP error %d", filename, e.code) @@ -1242,10 +1232,8 @@ def _virtualize_filename(self, filename: str) -> str: dir_id = self._parent_dir_to_ids.setdefault(parent_dir, uuid.uuid4()) result = pack_toil_uri(imported, self.task_path, dir_id, file_basename) logger.debug('Virtualized %s as WDL file %s', filename, result) - # We can't the Toil URI in the cache because it would point to the URL instead of a file - # which we don't have as we haven't written the contents of the URL into a file yet - # The next devirtualize_filename call will generate a file and store it in the cache for us - # But store the forward mapping in case another virtualization is called + # We can't put the Toil URI in the virtualized_to_devirtualized cache because it would point to the URL instead of a + # local file on the machine, so only store the forward mapping self._devirtualized_to_virtualized[filename] = result return result else: @@ -1313,7 +1301,6 @@ def _f( with tempfile.NamedTemporaryFile(dir=self._write_dir, delete=False) as outfile: serialize(v, outfile) filename = outfile.name - from WDL._util import chmod_R_plus chmod_R_plus(filename, file_bits=0o660) vfn = self._virtualize_filename(filename) return WDL.Value.File(vfn) @@ -3728,7 +3715,7 @@ def main() -> None: # Get the execution directory execution_dir = os.getcwd() - convert_remote_files(input_bindings, toil, task_path=target.name, search_paths=inputs_search_path, skip_remote=options.reference_inputs) + convert_remote_files(input_bindings, toil, task_path=target.name, search_paths=inputs_search_path, import_remote_files=options.reference_inputs) # Configure workflow interpreter options wdl_options: WDL_OPTIONS = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name} From b0db027084cdfcb991af3b6e8d0d903b66cb4c33 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 18 Sep 2024 12:58:42 -0700 Subject: [PATCH 25/36] Add comments and adjust wdl context usage --- src/toil/wdl/wdltoil.py | 98 ++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 6e43137bb5..5a4c917072 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -101,10 +101,9 @@ # WDL options to pass into the WDL jobs and standard libraries # task_path: Dotted WDL name of the part of the workflow this library is working for. # execution_dir: Directory to use as the working directory for workflow code. -# share_files_with: If set to an existing standard library instance, use the same file upload and download paths as it. # container: The type of container to use when executing a WDL task. Carries through the value of the commandline --container option -WDL_OPTIONS = TypedDict('WDL_OPTIONS', {"execution_dir": NotRequired[str], "container": NotRequired[str], - "share_files_with": NotRequired["ToilWDLStdLibBase"], "task_path": str}) +WDL_Context = TypedDict('WDL_Context', {"execution_dir": NotRequired[str], "container": NotRequired[str], + "task_path": str}) @contextmanager @@ -908,7 +907,7 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: # If we get here we tried all the candidates raise RuntimeError(f"Could not find {filename} at any of: {tried}") - map_over_typed_files_in_bindings(environment, convert_file_to_url) + map_over_files_in_bindings(environment, convert_file_to_url) # Both the WDL code itself **and** the commands that it runs will deal in @@ -940,7 +939,7 @@ class ToilWDLStdLibBase(WDL.StdLib.Base): """ Standard library implementation for WDL as run on Toil. """ - def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_OPTIONS): + def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_Context, share_files_with: Optional["ToilWDLStdLibBase"] = None): """ Set up the standard library. :param wdl_options: Options to pass into the standard library to use. @@ -959,9 +958,8 @@ def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_OPTIONS): # Keep the file store around so we can access files. self._file_store = file_store - self._wdl_options: WDL_OPTIONS = wdl_options + self._wdl_options: WDL_Context = wdl_options - share_files_with = wdl_options.get("share_files_with") if share_files_with is None: # We get fresh file download/upload state @@ -1007,13 +1005,13 @@ def _read(self, parse: Callable[[str], WDL.Value.Base]) -> Callable[[WDL.Value.F # Since we also want to preserve the URL/path *and* store the virtualized URI, use setattr # I can't think of another way to do this. I still need to remember the original URL/path, # but I need to virtualize as well, so I can't remove one or the other. - def f(file: WDL.Value.File) -> WDL.Value.Base: + def _f(file: WDL.Value.File) -> WDL.Value.Base: if getattr(file, "virtualized_value", None) is None: setattr(file, "virtualized_value", self._virtualize_filename(file.value)) with open(self._devirtualize_filename(getattr(file, "virtualized_value")), "r") as infile: return parse(infile.read()) - return f + return _f def _write( self, serialize: Callable[[WDL.Value.Base, IO[bytes]], None] @@ -1134,7 +1132,7 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF return result @staticmethod - def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], state: DirectoryNamingStateDict, wdl_options: WDL_OPTIONS, + def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], state: DirectoryNamingStateDict, wdl_options: WDL_Context, devirtualized_to_virtualized: Optional[Dict[str, str]] = None, virtualized_to_devirtualized: Optional[Dict[str, str]] = None) -> str: """ Download or export a WDL virtualized filename/URL to the given directory. @@ -1274,7 +1272,7 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase): are host-side paths. """ - def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_options: WDL_OPTIONS): + def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_options: WDL_Context): """ Set up the standard library for the task command section. """ @@ -1283,13 +1281,22 @@ def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_ super().__init__(file_store, wdl_options=wdl_options) self.container = container + # Revert the _read and _write functions to the parent WDL.StdLib.Base implementation + # This is because the task command standard library is used in MiniWDL's internals when executing a task + # which we don't have much control over (miniwdl will create it's own file objects that represent files within the container) + # and MiniWDL seems to treat the task standard library and the base standard library different (mainly in how it creates File objects; + # the file values are valid paths in the base standard library but are container paths in the task library) + # In _read, we typically always ensure a file is virtualized before use. Here, we can't virtualize a within-container file because + # MiniWDL created a file representing the in-container path, which does not exist on the host machine + # In _write, we need virtualize to an in-container path from a host machine path because we mount the file through. The ideal spot for this virtualization + # to happen is here before the path injection def _read(self, parse: Callable[[str], WDL.Value.Base]) -> Callable[[WDL.Value.File], WDL.Value.Base]: # todo: figure out better way than reoverriding overridden function - def f(file: WDL.Value.File) -> WDL.Value.Base: + def _f(file: WDL.Value.File) -> WDL.Value.Base: with open(self._devirtualize_filename(file.value), "r") as infile: return parse(infile.read()) - return f + return _f def _write( self, serialize: Callable[[WDL.Value.Base, IO[bytes]], None] @@ -1359,7 +1366,8 @@ def __init__( stdout_path: str, stderr_path: str, file_to_mountpoint: Dict[str, str], - wdl_options: WDL_OPTIONS, + wdl_options: WDL_Context, + share_files_with: Optional["ToilWDLStdLibBase"] = None ): """ Set up the standard library for a task output section. Needs to know @@ -1649,7 +1657,7 @@ def devirtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase) -> W that are actually available to command line commands. The same virtual file always maps to the same devirtualized filename even with duplicates """ - return map_over_typed_files_in_bindings(environment, stdlib._devirtualize_file) + return map_over_files_in_bindings(environment, stdlib._devirtualize_file) def virtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True) -> WDLBindings: """ @@ -1657,7 +1665,7 @@ def virtualize_files(environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforc that are usable from other machines. """ virtualize_func = partial(stdlib._virtualize_file, enforce_existence=enforce_existence) - return map_over_typed_files_in_bindings(environment, virtualize_func) + return map_over_files_in_bindings(environment, virtualize_func) def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None: """ @@ -1740,7 +1748,7 @@ def drop_missing_files(environment: WDLBindings, standard_library: ToilWDLStdLib # Determine where to evaluate relative paths relative to drop_if_missing_with_workdir = partial(drop_if_missing, standard_library=standard_library) - return map_over_typed_files_in_bindings(environment, drop_if_missing_with_workdir) + return map_over_files_in_bindings(environment, drop_if_missing_with_workdir) def get_file_paths_in_bindings(environment: WDLBindings) -> List[str]: """ @@ -1760,10 +1768,10 @@ def append_to_paths(file: WDL.Value.File) -> Optional[WDL.Value.File]: path = file.value paths.append(path) return file - map_over_typed_files_in_bindings(environment, append_to_paths) + map_over_files_in_bindings(environment, append_to_paths) return paths -def map_over_typed_files_in_bindings(environment: WDLBindings, transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDLBindings: +def map_over_files_in_bindings(environment: WDLBindings, transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDLBindings: """ Run all File values embedded in the given bindings through the given transformation function. @@ -1771,10 +1779,10 @@ def map_over_typed_files_in_bindings(environment: WDLBindings, transform: Callab TODO: Replace with WDL.Value.rewrite_env_paths or WDL.Value.rewrite_files """ - return environment.map(lambda b: map_over_typed_files_in_binding(b, transform)) + return environment.map(lambda b: map_over_files_in_binding(b, transform)) -def map_over_typed_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDL.Env.Binding[WDL.Value.Base]: +def map_over_files_in_binding(binding: WDL.Env.Binding[WDL.Value.Base], transform: Callable[[WDL.Value.File], Optional[WDL.Value.File]]) -> WDL.Env.Binding[WDL.Value.Base]: """ Run all File values' types and values embedded in the given binding's value through the given transformation function. @@ -1884,7 +1892,7 @@ class WDLBaseJob(Job): Also responsible for remembering the Toil WDL configuration keys and values. """ - def __init__(self, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a WDL-related job. @@ -2018,7 +2026,7 @@ class WDLTaskWrapperJob(WDLBaseJob): All bindings are in terms of task-internal names. """ - def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to determine resources and run a task. @@ -2148,8 +2156,16 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: accelerator_requirement = parse_accelerator(accelerator_spec) runtime_accelerators = [accelerator_requirement] + + task_wdl_options = self._wdl_options.copy() + # A task is not guaranteed to have access to the current execution directory, so get rid of it. The execution directory also is not needed as all files will be virtualized + task_wdl_options.pop("execution_dir") # Schedule to get resources. Pass along the bindings from evaluating all the inputs and decls, and the runtime, with files virtualized. - run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library, enforce_existence=False), virtualize_files(runtime_bindings, standard_library, enforce_existence=False), self._task_id, self._namespace, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=self._wdl_options) + run_job = WDLTaskJob(self._task, + virtualize_files(bindings, standard_library, enforce_existence=False), + virtualize_files(runtime_bindings, standard_library, enforce_existence=False), + self._task_id, self._namespace, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, + accelerators=runtime_accelerators or self.accelerators, wdl_options=task_wdl_options) # Run that as a child self.addChild(run_job) @@ -2172,7 +2188,7 @@ class WDLTaskJob(WDLBaseJob): All bindings are in terms of task-internal names. """ - def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], task_id: List[str], namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], task_id: List[str], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to run a task. @@ -2384,7 +2400,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # Set up the WDL standard library # UUID to use for virtualizing files # We process nonexistent files in WDLTaskWrapperJob as those must be run locally, so don't try to devirtualize them - standard_library = ToilWDLStdLibBase(file_store, wdl_options={"task_path": self._wdl_options["task_path"]}) + standard_library = ToilWDLStdLibBase(file_store, wdl_options=self._wdl_options) # Get the bindings from after the input section bindings = unwrap(self._task_internal_bindings) @@ -2565,10 +2581,10 @@ def patched_run_invocation(*args: Any, **kwargs: Any) -> List[str]: def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: if getattr(file, "nonexistent", False) is False: return WDL.Value.File(task_container.input_path_map[file.value]) - contained_bindings = map_over_typed_files_in_bindings(bindings, get_path_in_container) + contained_bindings = map_over_files_in_bindings(bindings, get_path_in_container) # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths. - command_wdl_options: WDL_OPTIONS = {"task_path": task_path} + command_wdl_options: WDL_Context = self._wdl_options.copy() if workdir_in_container is not None: command_wdl_options["execution_dir"] = workdir_in_container command_library = ToilWDLStdLibTaskCommand(file_store, task_container, wdl_options=command_wdl_options) @@ -2654,10 +2670,10 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: # container-determined strings that are absolute paths to WDL File # objects, and like MiniWDL we can say we only support # working-directory-based relative paths for globs. - output_wdl_options: WDL_OPTIONS = {"task_path": task_path, "share_files_with": standard_library} + output_wdl_options: WDL_Context = self._wdl_options.copy() if workdir_in_container is not None: output_wdl_options["execution_dir"] = workdir_in_container - outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, wdl_options=output_wdl_options) + outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, wdl_options=output_wdl_options, share_files_with=standard_library) output_bindings = evaluate_decls_to_bindings(self._task.outputs, bindings, outputs_library, drop_missing_files=True) # Now we know if the standard output and error were sent somewhere by @@ -2703,7 +2719,7 @@ class WDLWorkflowNodeJob(WDLBaseJob): Job that evaluates a WDL workflow node. """ - def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to run a workflow node to completion. """ @@ -2754,7 +2770,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # aren't meant to be inputs, by not changing their names? passed_down_bindings = incoming_bindings.enter_namespace(self._node.name) task_path = self._wdl_options.get("task_path") - wdl_options = self._wdl_options + wdl_options = self._wdl_options.copy() wdl_options["task_path"] = f'{task_path}.{self._node.name}' if isinstance(self._node.callee, WDL.Tree.Workflow): @@ -2797,7 +2813,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob): workflows or tasks or sections. """ - def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to run a list of workflow nodes to completion. """ @@ -3004,7 +3020,7 @@ class WDLSectionJob(WDLBaseJob): Job that can create more graph for a section of the workflow. """ - def __init__(self, namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a WDLSectionJob where the interior runs in the given namespace, starting with the root workflow. @@ -3263,7 +3279,7 @@ class WDLScatterJob(WDLSectionJob): instance of the body. If an instance of the body doesn't create a binding, it gets a null value in the corresponding array. """ - def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace. """ @@ -3405,7 +3421,7 @@ class WDLConditionalJob(WDLSectionJob): """ Job that evaluates a conditional in a WDL workflow. """ - def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace. """ @@ -3460,7 +3476,7 @@ class WDLWorkflowJob(WDLSectionJob): Job that evaluates an entire WDL workflow. """ - def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Create a subtree that will run a WDL workflow. The job returns the return value of the workflow. @@ -3528,7 +3544,7 @@ class WDLOutputsJob(WDLBaseJob): Returns an environment with just the outputs bound, in no namespace. """ - def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: WDL_OPTIONS, **kwargs: Any): + def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: WDL_Context, **kwargs: Any): """ Make a new WDLWorkflowOutputsJob for the given workflow, with the given set of bindings after its body runs. """ @@ -3591,7 +3607,7 @@ class WDLRootJob(WDLSectionJob): the workflow name; both forms are accepted. """ - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDL_OPTIONS, **kwargs: Any) -> None: + def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Create a subtree to run the workflow and namespace the outputs. """ @@ -3716,7 +3732,7 @@ def main() -> None: convert_remote_files(input_bindings, toil, task_path=target.name, search_paths=inputs_search_path, import_remote_files=options.reference_inputs) # Configure workflow interpreter options - wdl_options: WDL_OPTIONS = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name} + wdl_options: WDL_Context = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name} assert wdl_options.get("container") is not None # Run the workflow and get its outputs namespaced with the workflow name. @@ -3743,7 +3759,7 @@ def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File: return file # Make all the files local files - output_bindings = map_over_typed_files_in_bindings(output_bindings, devirtualize_output) + output_bindings = map_over_files_in_bindings(output_bindings, devirtualize_output) # Report the result in the right format outputs = WDL.values_to_json(output_bindings) From 8c24d8faddc1d7e4e1976508b5ce258c0144b8e3 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 18 Sep 2024 20:11:40 -0700 Subject: [PATCH 26/36] add namespace --- src/toil/wdl/wdltoil.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 5a4c917072..2b739f79c9 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -103,7 +103,7 @@ # execution_dir: Directory to use as the working directory for workflow code. # container: The type of container to use when executing a WDL task. Carries through the value of the commandline --container option WDL_Context = TypedDict('WDL_Context', {"execution_dir": NotRequired[str], "container": NotRequired[str], - "task_path": str}) + "task_path": str, "namespace": str}) @contextmanager @@ -1382,7 +1382,7 @@ def __init__( # Just set up as ToilWDLStdLibBase, but it will call into # WDL.StdLib.TaskOutputs next. - super().__init__(file_store, wdl_options) + super().__init__(file_store, wdl_options, share_files_with) # Remember task output files self._stdout_path = stdout_path @@ -2770,8 +2770,10 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: # aren't meant to be inputs, by not changing their names? passed_down_bindings = incoming_bindings.enter_namespace(self._node.name) task_path = self._wdl_options.get("task_path") + namespace = self._wdl_options.get("namespace") wdl_options = self._wdl_options.copy() wdl_options["task_path"] = f'{task_path}.{self._node.name}' + wdl_options["namespace"] = f'{namespace}.{self._node.name}' if isinstance(self._node.callee, WDL.Tree.Workflow): # This is a call of a workflow @@ -3732,7 +3734,7 @@ def main() -> None: convert_remote_files(input_bindings, toil, task_path=target.name, search_paths=inputs_search_path, import_remote_files=options.reference_inputs) # Configure workflow interpreter options - wdl_options: WDL_Context = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name} + wdl_options: WDL_Context = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name, "namespace": target.name} assert wdl_options.get("container") is not None # Run the workflow and get its outputs namespaced with the workflow name. From 666aef5aff7339a2096642c6abdd0ece291ce2ba Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 18 Sep 2024 20:33:43 -0700 Subject: [PATCH 27/36] integrate namespace into wdl_context --- src/toil/wdl/wdltoil.py | 64 +++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 2b739f79c9..fe0a3cfe8d 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -2026,7 +2026,7 @@ class WDLTaskWrapperJob(WDLBaseJob): All bindings are in terms of task-internal names. """ - def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to determine resources and run a task. @@ -2034,14 +2034,13 @@ def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDL The caller has alredy added the task's own name. """ # task_path in wdl_options is like the namespace, but including subscript numbers for scatters - super().__init__(unitName=wdl_options["task_path"] + ".inputs", displayName=namespace + ".inputs", wdl_options=wdl_options, **kwargs) + super().__init__(unitName=wdl_options["task_path"] + ".inputs", displayName=wdl_options["namespace"] + ".inputs", wdl_options=wdl_options, **kwargs) - logger.info("Preparing to run task code for %s as %s", task.name, namespace) + logger.info("Preparing to run task code for %s as %s", task.name, wdl_options["namespace"]) self._task = task self._prev_node_results = prev_node_results self._task_id = task_id - self._namespace = namespace @report_wdl_errors("evaluate task code", exit=True) def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: @@ -2049,7 +2048,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: Evaluate inputs and runtime and schedule the task. """ super().run(file_store) - logger.info("Evaluating inputs and runtime for task %s (%s) called as %s", self._task.name, self._task_id, self._namespace) + logger.info("Evaluating inputs and runtime for task %s (%s) called as %s", self._task.name, self._task_id, self._wdl_options["namespace"]) # Combine the bindings we get from previous jobs. # For a task we are only passed the inside-the-task namespace. @@ -2164,7 +2163,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library, enforce_existence=False), virtualize_files(runtime_bindings, standard_library, enforce_existence=False), - self._task_id, self._namespace, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, + self._task_id, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=task_wdl_options) # Run that as a child self.addChild(run_job) @@ -2188,7 +2187,7 @@ class WDLTaskJob(WDLBaseJob): All bindings are in terms of task-internal names. """ - def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], task_id: List[str], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], task_id: List[str], wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to run a task. @@ -2201,15 +2200,14 @@ def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBind # "wrapper" job like CWL uses to determine the actual requirements. # task_path in wdl_options is like the namespace, but including subscript numbers for scatters - super().__init__(unitName=wdl_options["task_path"] + ".command", displayName=namespace + ".command", local=False, wdl_options=wdl_options, **kwargs) + super().__init__(unitName=wdl_options["task_path"] + ".command", displayName=wdl_options["namespace"] + ".command", local=False, wdl_options=wdl_options, **kwargs) - logger.info("Preparing to run task %s as %s", task.name, namespace) + logger.info("Preparing to run task %s as %s", task.name, wdl_options["namespace"]) self._task = task self._task_internal_bindings = task_internal_bindings self._runtime_bindings = runtime_bindings self._task_id = task_id - self._namespace = namespace ### # Runtime code injection system @@ -2395,7 +2393,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: Actually run the task. """ super().run(file_store) - logger.info("Running task command for %s (%s) called as %s", self._task.name, self._task_id, self._namespace) + logger.info("Running task command for %s (%s) called as %s", self._task.name, self._task_id, self._wdl_options["namespace"]) # Set up the WDL standard library # UUID to use for virtualizing files @@ -2719,7 +2717,7 @@ class WDLWorkflowNodeJob(WDLBaseJob): Job that evaluates a WDL workflow node. """ - def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to run a workflow node to completion. """ @@ -2727,7 +2725,6 @@ def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Prom self._node = node self._prev_node_results = prev_node_results - self._namespace = namespace if isinstance(self._node, WDL.Tree.Call): logger.debug("Preparing job for call node %s", self._node.workflow_node_id) @@ -2777,11 +2774,11 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if isinstance(self._node.callee, WDL.Tree.Workflow): # This is a call of a workflow - subjob: WDLBaseJob = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', wdl_options=wdl_options, local=True) + subjob: WDLBaseJob = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, wdl_options=wdl_options, local=True) self.addChild(subjob) elif isinstance(self._node.callee, WDL.Tree.Task): # This is a call of a task - subjob = WDLTaskWrapperJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', wdl_options=wdl_options, local=True) + subjob = WDLTaskWrapperJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, wdl_options=wdl_options, local=True) self.addChild(subjob) else: raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee))) @@ -2792,14 +2789,14 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: self.defer_postprocessing(subjob) return subjob.rv() elif isinstance(self._node, WDL.Tree.Scatter): - subjob = WDLScatterJob(self._node, [incoming_bindings], self._namespace, wdl_options=self._wdl_options, local=True) + subjob = WDLScatterJob(self._node, [incoming_bindings], wdl_options=self._wdl_options, local=True) self.addChild(subjob) # Scatters don't really make a namespace, just kind of a scope? # TODO: Let stuff leave scope! self.defer_postprocessing(subjob) return subjob.rv() elif isinstance(self._node, WDL.Tree.Conditional): - subjob = WDLConditionalJob(self._node, [incoming_bindings], self._namespace, wdl_options=self._wdl_options, local=True) + subjob = WDLConditionalJob(self._node, [incoming_bindings], wdl_options=self._wdl_options, local=True) self.addChild(subjob) # Conditionals don't really make a namespace, just kind of a scope? # TODO: Let stuff leave scope! @@ -2815,7 +2812,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob): workflows or tasks or sections. """ - def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a new job to run a list of workflow nodes to completion. """ @@ -2823,7 +2820,6 @@ def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequen self._nodes = nodes self._prev_node_results = prev_node_results - self._namespace = namespace for n in self._nodes: if isinstance(n, (WDL.Tree.Call, WDL.Tree.Scatter, WDL.Tree.Conditional)): @@ -3022,13 +3018,12 @@ class WDLSectionJob(WDLBaseJob): Job that can create more graph for a section of the workflow. """ - def __init__(self, namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, wdl_options: WDL_Context, **kwargs: Any) -> None: """ Make a WDLSectionJob where the interior runs in the given namespace, starting with the root workflow. """ super().__init__(wdl_options=wdl_options, **kwargs) - self._namespace = namespace @staticmethod def coalesce_nodes(order: List[str], section_graph: WDLWorkflowGraph) -> List[List[str]]: @@ -3181,10 +3176,10 @@ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]: if len(node_ids) == 1: # Make a one-node job - job: WDLBaseJob = WDLWorkflowNodeJob(section_graph.get(node_ids[0]), rvs, self._namespace, wdl_options=self._wdl_options, local=True) + job: WDLBaseJob = WDLWorkflowNodeJob(section_graph.get(node_ids[0]), rvs, wdl_options=self._wdl_options, local=True) else: # Make a multi-node job - job = WDLWorkflowNodeListJob([section_graph.get(node_id) for node_id in node_ids], rvs, self._namespace, wdl_options=self._wdl_options, local=True) + job = WDLWorkflowNodeListJob([section_graph.get(node_id) for node_id in node_ids], rvs, wdl_options=self._wdl_options, local=True) for prev_job in prev_jobs: # Connect up the happens-after relationships to make sure the # return values are available. @@ -3281,11 +3276,11 @@ class WDLScatterJob(WDLSectionJob): instance of the body. If an instance of the body doesn't create a binding, it gets a null value in the corresponding array. """ - def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: """ Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace. """ - super().__init__(namespace, **kwargs, unitName=scatter.workflow_node_id, displayName=scatter.workflow_node_id, wdl_options=wdl_options) + super().__init__(**kwargs, unitName=scatter.workflow_node_id, displayName=scatter.workflow_node_id, wdl_options=wdl_options) # Because we need to return the return value of the workflow, we need # to return a Toil promise for the last/sink job in the workflow's @@ -3423,11 +3418,11 @@ class WDLConditionalJob(WDLSectionJob): """ Job that evaluates a conditional in a WDL workflow. """ - def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: """ Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace. """ - super().__init__(namespace, **kwargs, unitName=conditional.workflow_node_id, displayName=conditional.workflow_node_id, wdl_options=wdl_options) + super().__init__(**kwargs, unitName=conditional.workflow_node_id, displayName=conditional.workflow_node_id, wdl_options=wdl_options) # Once again we need to ship the whole body template to be instantiated # into Toil jobs only if it will actually run. @@ -3478,7 +3473,7 @@ class WDLWorkflowJob(WDLSectionJob): Job that evaluates an entire WDL workflow. """ - def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], wdl_options: WDL_Context, **kwargs: Any) -> None: """ Create a subtree that will run a WDL workflow. The job returns the return value of the workflow. @@ -3486,7 +3481,7 @@ def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Prom :param namespace: the namespace that the workflow's *contents* will be in. Caller has already added the workflow's own name. """ - super().__init__(namespace, wdl_options=wdl_options, **kwargs) + super().__init__(wdl_options=wdl_options, **kwargs) # Because we need to return the return value of the workflow, we need # to return a Toil promise for the last/sink job in the workflow's @@ -3500,7 +3495,6 @@ def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Prom self._workflow = workflow self._prev_node_results = prev_node_results self._workflow_id = workflow_id - self._namespace = namespace @report_wdl_errors("run workflow") def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: @@ -3509,7 +3503,7 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: """ super().run(file_store) - logger.info("Running workflow %s (%s) called as %s", self._workflow.name, self._workflow_id, self._namespace) + logger.info("Running workflow %s (%s) called as %s", self._workflow.name, self._workflow_id, self._wdl_options["namespace"]) # Combine the bindings we get from previous jobs. # For a task we only see the insode-the-task namespace. @@ -3615,7 +3609,7 @@ def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLB """ # The root workflow names the root namespace and task path. - super().__init__(target.name, wdl_options=wdl_options, **kwargs) + super().__init__(wdl_options=wdl_options, **kwargs) self._target = target self._inputs = inputs @@ -3628,12 +3622,12 @@ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]: if isinstance(self._target, WDL.Tree.Workflow): # Create a workflow job. We rely in this to handle entering the input # namespace if needed, or handling free-floating inputs. - job: WDLBaseJob = WDLWorkflowJob(self._target, [self._inputs], [self._target.name], self._namespace, wdl_options=self._wdl_options, local=True) + job: WDLBaseJob = WDLWorkflowJob(self._target, [self._inputs], [self._target.name], wdl_options=self._wdl_options, local=True) else: # There is no workflow. Create a task job. - job = WDLTaskWrapperJob(self._target, [self._inputs], [self._target.name], self._namespace, wdl_options=self._wdl_options, local=True) + job = WDLTaskWrapperJob(self._target, [self._inputs], [self._target.name], wdl_options=self._wdl_options, local=True) # Run the task or workflow - job.then_namespace(self._namespace) + job.then_namespace(self._wdl_options["namespace"]) self.addChild(job) self.defer_postprocessing(job) return job.rv() From 0b6fb827070048a5c2642fca861d89bc9670a9dc Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 18 Sep 2024 20:37:00 -0700 Subject: [PATCH 28/36] properly name wdl value bases --- src/toil/wdl/wdltoil.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index fe0a3cfe8d..12ac5e4412 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -459,19 +459,19 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), uri) -def virtualized_equal(file1: WDL.Value.Base, file2: WDL.Value.Base) -> bool: +def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool: """ Check if two WDL values are equal when taking account file virtualization. Treats virtualized and non-virtualized `File`s referring to the same underlying file as equal. - :param file1: WDL value - :param file2: WDL value + :param value1: WDL value + :param value2: WDL value :return: Whether the two values are equal with file virtualization accounted for """ def f(file: WDL.Value.File) -> WDL.Value.File: file.value = getattr(file, "virtualized_value", file.value) return file - return map_over_typed_files_in_value(file1, f) == map_over_typed_files_in_value(file2, f) + return map_over_typed_files_in_value(value1, f) == map_over_typed_files_in_value(value2, f) # Bindings have a long type name WDLBindings = WDL.Env.Bindings[WDL.Value.Base] From 4d5ecae66783ec926d846282019d54e83b88cb19 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Mon, 23 Sep 2024 20:53:24 -0700 Subject: [PATCH 29/36] Remove irrelevant comment --- src/toil/wdl/wdltoil.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 12ac5e4412..8fa427d66a 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -1150,7 +1150,7 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil should not be added to the cache :param state: State dict which must be shared among successive calls into a dest_dir. - :param wdl_options: WDL options to carry through. If enforce_nonexistent is set to true inside, will raise an error if the file is nonexistent. Else, let it pass through. + :param wdl_options: WDL options to carry through. """ if not os.path.isdir(dest_dir): # os.mkdir fails saying the directory *being made* caused a From b43d25600908133c27f7a01c84b0619bae907369 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 24 Sep 2024 13:33:11 -0400 Subject: [PATCH 30/36] Adjust docstring formatting to remove RST warnings --- src/toil/wdl/wdltoil.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 8fa427d66a..956a772455 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -462,8 +462,9 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool: """ Check if two WDL values are equal when taking account file virtualization. - + Treats virtualized and non-virtualized `File`s referring to the same underlying file as equal. + :param value1: WDL value :param value2: WDL value :return: Whether the two values are equal with file virtualization accounted for @@ -810,13 +811,16 @@ def is_url(filename: str, schemes: List[str]=['http:', 'https:', 's3:', 'gs:', T def convert_remote_files(environment: WDLBindings, file_source: Toil, task_path: str, search_paths: Optional[List[str]] = None, import_remote_files: bool = True) -> None: """ - Resolve relative-URI files in the given environment and import all files. Will set the value of the File to the relative-URI + Resolve relative-URI files in the given environment and import all files. + + Will set the value of the File to the relative-URI. + :param environment: Bindings to evaluate on :param file_source: Context to search for files with :param task_path: Dotted WDL name of the user-level code doing the - importing (probably the workflow name). + importing (probably the workflow name). :param search_paths: If set, try resolving input location relative to the URLs or - directories in this list. + directories in this list. :param import_remote_files: If set, import files from remote locations. Else leave them as URI references. """ path_to_id: Dict[str, uuid.UUID] = {} @@ -1195,7 +1199,7 @@ def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFil def _virtualize_filename(self, filename: str) -> str: """ from a local path in write_dir, 'virtualize' into the filename as it should present in a File value - + :param filename: Can be a local file path, URL (http, https, s3, gs), or toilfile """ From 7a36b591ac0bee9447cdbf2c4ffbe1bcd5a653e4 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 24 Sep 2024 13:34:05 -0400 Subject: [PATCH 31/36] Adjust comment grammar --- src/toil/wdl/wdltoil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 956a772455..aed7697807 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -461,7 +461,7 @@ async def toil_read_source(uri: str, path: List[str], importer: Optional[WDL.Tre def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool: """ - Check if two WDL values are equal when taking account file virtualization. + Check if two WDL values are equal when taking into account file virtualization. Treats virtualized and non-virtualized `File`s referring to the same underlying file as equal. @@ -1287,7 +1287,7 @@ def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_ # Revert the _read and _write functions to the parent WDL.StdLib.Base implementation # This is because the task command standard library is used in MiniWDL's internals when executing a task - # which we don't have much control over (miniwdl will create it's own file objects that represent files within the container) + # which we don't have much control over (miniwdl will create its own file objects that represent files within the container) # and MiniWDL seems to treat the task standard library and the base standard library different (mainly in how it creates File objects; # the file values are valid paths in the base standard library but are container paths in the task library) # In _read, we typically always ensure a file is virtualized before use. Here, we can't virtualize a within-container file because From 8c42888434af0e9c34870d6dd9a219cac13dcf1d Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 24 Sep 2024 14:19:09 -0400 Subject: [PATCH 32/36] Remove disallowed backticks --- src/toil/wdl/wdltoil.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index aed7697807..da16b11028 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -463,7 +463,7 @@ def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool: """ Check if two WDL values are equal when taking into account file virtualization. - Treats virtualized and non-virtualized `File`s referring to the same underlying file as equal. + Treats virtualized and non-virtualized Files referring to the same underlying file as equal. :param value1: WDL value :param value2: WDL value From a3182083dc775b2057e047cd2d9f2affe229a568 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 27 Sep 2024 12:39:49 -0700 Subject: [PATCH 33/36] Extract out the import function to add back memoization and resolve issue of multiple imports on the same file --- src/toil/wdl/wdltoil.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 855725a36a..3f211b98a8 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -814,12 +814,16 @@ def convert_remote_files(environment: WDLBindings, file_source: Toil, task_path: :param import_remote_files: If set, import files from remote locations. Else leave them as URI references. """ path_to_id: Dict[str, uuid.UUID] = {} - def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: + @memoize + def import_filename(filename: str) -> Tuple[Optional[str], Optional[str]]: """ Detect if any potential URI exists. Will convert a file's value to a URI and import it. + + Separated out from convert_file_to_url in order to properly memoize and avoid importing the same file twice + :param filename: Filename to import + :return: Tuple of the uri the file was found at and the virtualized import """ # Search through any input search paths passed in and download it if found - filename = file.value tried = [] for candidate_uri in potential_absolute_uris(filename, search_paths if search_paths is not None else []): tried.append(candidate_uri) @@ -831,8 +835,7 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: continue # Now we know this exists, so pass it through - file.value = candidate_uri - return file + return candidate_uri, None else: # Actually import # Try to import the file. Don't raise if we can't find it, just @@ -893,15 +896,28 @@ def convert_file_to_url(file: WDL.Value.File) -> WDL.Value.File: logger.info('Converting input file path %s to %s', filename, candidate_uri) - # Was actually found + return candidate_uri, toil_uri + # Not found, return None + return None, None + + def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File: + """ + Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file. + """ + candidate_uri, toil_uri = import_filename(file.value) + if candidate_uri is None and toil_uri is None: + # If we get here we tried all the candidates + raise RuntimeError(f"Could not find {file.value} at any of: {list(potential_absolute_uris(file.value, search_paths if search_paths is not None else []))}") + elif candidate_uri is not None and toil_uri is None: + # A candidate exists but importing is disabled because import_remote_files is false + file.value = candidate_uri + else: + # Was actually found and imported file.value = candidate_uri setattr(file, "virtualized_value", toil_uri) - return file - - # If we get here we tried all the candidates - raise RuntimeError(f"Could not find {filename} at any of: {tried}") + return file - map_over_files_in_bindings(environment, convert_file_to_url) + map_over_files_in_bindings(environment, convert_file_to_uri) # Both the WDL code itself **and** the commands that it runs will deal in From f9db076bb0c41756a18a27d4628fc3c695ffcc84 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 27 Sep 2024 13:14:59 -0700 Subject: [PATCH 34/36] change wdl_context to wdlcontext --- src/toil/wdl/wdltoil.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 5ede863bfc..fe753ee020 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -102,7 +102,7 @@ # task_path: Dotted WDL name of the part of the workflow this library is working for. # execution_dir: Directory to use as the working directory for workflow code. # container: The type of container to use when executing a WDL task. Carries through the value of the commandline --container option -WDL_Context = TypedDict('WDL_Context', {"execution_dir": NotRequired[str], "container": NotRequired[str], +WDLContext = TypedDict('WDL_Context', {"execution_dir": NotRequired[str], "container": NotRequired[str], "task_path": str, "namespace": str, "all_call_outputs": bool}) @@ -949,7 +949,7 @@ class ToilWDLStdLibBase(WDL.StdLib.Base): """ Standard library implementation for WDL as run on Toil. """ - def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_Context, share_files_with: Optional["ToilWDLStdLibBase"] = None): + def __init__(self, file_store: AbstractFileStore, wdl_options: WDLContext, share_files_with: Optional["ToilWDLStdLibBase"] = None): """ Set up the standard library. :param wdl_options: Options to pass into the standard library to use. @@ -968,7 +968,7 @@ def __init__(self, file_store: AbstractFileStore, wdl_options: WDL_Context, shar # Keep the file store around so we can access files. self._file_store = file_store - self._wdl_options: WDL_Context = wdl_options + self._wdl_options: WDLContext = wdl_options if share_files_with is None: # We get fresh file download/upload state @@ -1142,7 +1142,7 @@ def _devirtualize_uri(filename: str, dest_dir: str, file_source: Union[AbstractF return result @staticmethod - def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], state: DirectoryNamingStateDict, wdl_options: WDL_Context, + def devirtualize_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], state: DirectoryNamingStateDict, wdl_options: WDLContext, devirtualized_to_virtualized: Optional[Dict[str, str]] = None, virtualized_to_devirtualized: Optional[Dict[str, str]] = None) -> str: """ Download or export a WDL virtualized filename/URL to the given directory. @@ -1282,7 +1282,7 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase): are host-side paths. """ - def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_options: WDL_Context): + def __init__(self, file_store: AbstractFileStore, container: TaskContainer, wdl_options: WDLContext): """ Set up the standard library for the task command section. """ @@ -1376,7 +1376,7 @@ def __init__( stdout_path: str, stderr_path: str, file_to_mountpoint: Dict[str, str], - wdl_options: WDL_Context, + wdl_options: WDLContext, share_files_with: Optional["ToilWDLStdLibBase"] = None ): """ @@ -1902,7 +1902,7 @@ class WDLBaseJob(Job): Also responsible for remembering the Toil WDL configuration keys and values. """ - def __init__(self, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, wdl_options: WDLContext, **kwargs: Any) -> None: """ Make a WDL-related job. @@ -2036,7 +2036,7 @@ class WDLTaskWrapperJob(WDLBaseJob): All bindings are in terms of task-internal names. """ - def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], wdl_options: WDLContext, **kwargs: Any) -> None: """ Make a new job to determine resources and run a task. @@ -2197,7 +2197,7 @@ class WDLTaskJob(WDLBaseJob): All bindings are in terms of task-internal names. """ - def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], task_id: List[str], wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], task_id: List[str], wdl_options: WDLContext, **kwargs: Any) -> None: """ Make a new job to run a task. @@ -2592,7 +2592,7 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: contained_bindings = map_over_files_in_bindings(bindings, get_path_in_container) # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths. - command_wdl_options: WDL_Context = self._wdl_options.copy() + command_wdl_options: WDLContext = self._wdl_options.copy() if workdir_in_container is not None: command_wdl_options["execution_dir"] = workdir_in_container command_library = ToilWDLStdLibTaskCommand(file_store, task_container, wdl_options=command_wdl_options) @@ -2678,7 +2678,7 @@ def get_path_in_container(file: WDL.Value.File) -> Optional[WDL.Value.File]: # container-determined strings that are absolute paths to WDL File # objects, and like MiniWDL we can say we only support # working-directory-based relative paths for globs. - output_wdl_options: WDL_Context = self._wdl_options.copy() + output_wdl_options: WDLContext = self._wdl_options.copy() if workdir_in_container is not None: output_wdl_options["execution_dir"] = workdir_in_container outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, task_container.input_path_map, wdl_options=output_wdl_options, share_files_with=standard_library) @@ -2727,7 +2727,7 @@ class WDLWorkflowNodeJob(WDLBaseJob): Job that evaluates a WDL workflow node. """ - def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: """ Make a new job to run a workflow node to completion. """ @@ -2822,7 +2822,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob): workflows or tasks or sections. """ - def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: """ Make a new job to run a list of workflow nodes to completion. """ @@ -3028,7 +3028,7 @@ class WDLSectionJob(WDLBaseJob): Job that can create more graph for a section of the workflow. """ - def __init__(self, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, wdl_options: WDLContext, **kwargs: Any) -> None: """ Make a WDLSectionJob where the interior runs in the given namespace, starting with the root workflow. @@ -3286,7 +3286,7 @@ class WDLScatterJob(WDLSectionJob): instance of the body. If an instance of the body doesn't create a binding, it gets a null value in the corresponding array. """ - def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: """ Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace. """ @@ -3428,7 +3428,7 @@ class WDLConditionalJob(WDLSectionJob): """ Job that evaluates a conditional in a WDL workflow. """ - def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], wdl_options: WDLContext, **kwargs: Any) -> None: """ Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace. """ @@ -3483,7 +3483,7 @@ class WDLWorkflowJob(WDLSectionJob): Job that evaluates an entire WDL workflow. """ - def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], wdl_options: WDLContext, **kwargs: Any) -> None: """ Create a subtree that will run a WDL workflow. The job returns the return value of the workflow. @@ -3550,7 +3550,7 @@ class WDLOutputsJob(WDLBaseJob): Returns an environment with just the outputs bound, in no namespace. """ - def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: WDL_Context, **kwargs: Any): + def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: WDLContext, **kwargs: Any): """ Make a new WDLWorkflowOutputsJob for the given workflow, with the given set of bindings after its body runs. """ @@ -3635,7 +3635,7 @@ class WDLRootJob(WDLSectionJob): the workflow name; both forms are accepted. """ - def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDL_Context, **kwargs: Any) -> None: + def __init__(self, target: Union[WDL.Tree.Workflow, WDL.Tree.Task], inputs: WDLBindings, wdl_options: WDLContext, **kwargs: Any) -> None: """ Create a subtree to run the workflow and namespace the outputs. """ @@ -3780,7 +3780,7 @@ def main() -> None: convert_remote_files(input_bindings, toil, task_path=target.name, search_paths=inputs_search_path, import_remote_files=options.reference_inputs) # Configure workflow interpreter options - wdl_options: WDL_Context = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name, + wdl_options: WDLContext = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name, "namespace": target.name, "all_call_outputs": options.all_call_outputs} assert wdl_options.get("container") is not None From c16290fcf9d8157e7af3b9cba45365da99e5cdc5 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 27 Sep 2024 13:15:42 -0700 Subject: [PATCH 35/36] also change typeddict --- src/toil/wdl/wdltoil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index fe753ee020..e23b5a63f0 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -102,8 +102,8 @@ # task_path: Dotted WDL name of the part of the workflow this library is working for. # execution_dir: Directory to use as the working directory for workflow code. # container: The type of container to use when executing a WDL task. Carries through the value of the commandline --container option -WDLContext = TypedDict('WDL_Context', {"execution_dir": NotRequired[str], "container": NotRequired[str], - "task_path": str, "namespace": str, "all_call_outputs": bool}) +WDLContext = TypedDict('WDLContext', {"execution_dir": NotRequired[str], "container": NotRequired[str], + "task_path": str, "namespace": str, "all_call_outputs": bool}) @contextmanager From cb0f9860d19405d622ee21bf20af431410798a5c Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 27 Sep 2024 13:19:23 -0700 Subject: [PATCH 36/36] Add some documentation for the WDLContext object --- src/toil/wdl/wdltoil.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index e23b5a63f0..9d04da92a0 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -100,8 +100,10 @@ # WDL options to pass into the WDL jobs and standard libraries # task_path: Dotted WDL name of the part of the workflow this library is working for. +# namespace: namespace of the WDL that the current job is in # execution_dir: Directory to use as the working directory for workflow code. # container: The type of container to use when executing a WDL task. Carries through the value of the commandline --container option +# all_call_outputs: whether a job should include all calls outputs WDLContext = TypedDict('WDLContext', {"execution_dir": NotRequired[str], "container": NotRequired[str], "task_path": str, "namespace": str, "all_call_outputs": bool}) @@ -3781,7 +3783,7 @@ def main() -> None: # Configure workflow interpreter options wdl_options: WDLContext = {"execution_dir": execution_dir, "container": options.container, "task_path": target.name, - "namespace": target.name, "all_call_outputs": options.all_call_outputs} + "namespace": target.name, "all_call_outputs": options.all_call_outputs} assert wdl_options.get("container") is not None # Run the workflow and get its outputs namespaced with the workflow name.