From 88690fc85940047ad6177c9d03822a0f7c334fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 18 Nov 2024 15:45:07 +0100 Subject: [PATCH 01/54] status from PR #33 --- nextflow.config | 20 +- .../main/nextflow/prov/WrrocRenderer.groovy | 710 ++++++++++++++++++ 2 files changed, 729 insertions(+), 1 deletion(-) create mode 100644 plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy diff --git a/nextflow.config b/nextflow.config index 6219b1b..b4c7573 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,5 +20,23 @@ prov { file = "${params.outdir}/manifest.json" overwrite = true } + wrroc { + file = "${params.outdir}/ro-crate-metadata.json" + overwrite = true + agent { + name = "John Doe" + orcid = "https://orcid.org/0000-0000-0000-0000" + } + organization { + name = "University of XYZ" + ror = "https://ror.org/000000000" + isPublisher = true + } + publisher { + id = "https://ror.org/000000000" + } + license = "https://spdx.org/licenses/Apache-2.0" + profile = "provenance_run_crate" + } } -} +} \ No newline at end of file diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy new file mode 100644 index 0000000..87a15a4 --- /dev/null +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -0,0 +1,710 @@ +/* + * Copyright 2023, Seqera Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package nextflow.prov + +import nextflow.config.ConfigMap +import nextflow.file.FileHolder +import nextflow.script.params.FileInParam +import nextflow.script.params.FileOutParam + +import java.nio.file.* +import java.nio.file.attribute.BasicFileAttributes +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter + +import groovy.json.JsonOutput +import groovy.transform.CompileStatic +import nextflow.Session +import nextflow.processor.TaskRun + +/** + * Renderer for the Provenance Run RO Crate format. + * + * @author Ben Sherman + * @author Felix Bartusch + */ +@CompileStatic +class WrrocRenderer implements Renderer { + + private Path path + private Path crateRootDir + private Path workdir + private Path projectDir + + private LinkedHashMap agent + private LinkedHashMap organization + private String publisherID + + private boolean overwrite + + @Delegate + private PathNormalizer normalizer + + WrrocRenderer(Map opts) { + path = opts.file as Path + overwrite = opts.overwrite as Boolean + + ProvHelper.checkFileOverwrite(path, overwrite) + } + + @Override + void render(Session session, Set tasks, Map workflowOutputs) { + + final params = session.getBinding().getParams() as Map + final configMap = new ConfigMap(session.getConfig()) + + // Set RO-Crate Root and workdir + this.crateRootDir = Path.of(params['outdir'].toString()).toAbsolutePath() + this.workdir = session.getWorkDir() + this.projectDir = session.getWorkflowMetadata().getProjectDir() + + // get workflow inputs + final taskLookup = ProvHelper.getTaskLookup(tasks) + final workflowInputs = ProvHelper.getWorkflowInputs(tasks, taskLookup) + + // Add intermediate input files (produced by workflow tasks and consumed by other tasks) + workflowInputs.addAll(getIntermediateInputFiles(tasks, workflowInputs)); + final Map workflowInputMapping = getWorkflowInputMapping(workflowInputs) + + // Add intermediate output files (produced by workflow tasks and consumed by other tasks) + 
workflowOutputs.putAll(getIntermediateOutputFiles(tasks, workflowOutputs)); + + // Copy workflow input files into RO-Crate + workflowInputMapping.each {source, dest -> + + if (Files.isDirectory(source)) { + // Recursively copy directory and its contents + Files.walkFileTree(source, new SimpleFileVisitor() { + @Override + FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + Path targetDir = dest.resolve(source.relativize(dir)) + Files.createDirectories(targetDir) + return FileVisitResult.CONTINUE + } + + @Override + FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + Path targetFile = dest.resolve(source.relativize(file)) + if (!Files.exists(targetFile)) + Files.copy(file, targetFile) + return FileVisitResult.CONTINUE + } + }) + } else { + try { + Files.createDirectories(dest.getParent()) + if (!Files.exists(dest)) + Files.copy(source, dest) + } catch (Exception e) { + println "Failed to copy $source to $dest: ${e.message}" + } + } + } + + // Copy workflow output files into RO-Crate + workflowOutputs.each {source, dest -> + + if (Files.isDirectory(source)) { + // Recursively copy directory and its contents + Files.walkFileTree(source, new SimpleFileVisitor() { + @Override + FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + Path targetDir = dest.resolve(source.relativize(dir)) + Files.createDirectories(targetDir) + return FileVisitResult.CONTINUE + } + + @Override + FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + Path targetFile = dest.resolve(source.relativize(file)) + if (!Files.exists(targetFile)) + Files.copy(file, targetFile) + return FileVisitResult.CONTINUE + } + }) + } else { + try { + Files.createDirectories(dest.getParent()) + Files.copy(source, dest, StandardCopyOption.REPLACE_EXISTING) + } catch (Exception e) { + println "Failed to copy $source to $dest: ${e.message}" + } + } + } + + // get workflow 
config and store it in crate + Path configFilePath = crateRootDir.resolve("nextflow.config") + FileWriter configFileWriter = new FileWriter(configFilePath.toString()) + configMap.toConfigObject().writeTo(configFileWriter) + + // get workflow metadata + final metadata = session.workflowMetadata + this.normalizer = new PathNormalizer(metadata) + + final manifest = metadata.manifest + final nextflowMeta = metadata.nextflow + final scriptFile = metadata.getScriptFile() + + final formatter = DateTimeFormatter.ISO_OFFSET_DATE_TIME + final dateStarted = formatter.format(metadata.start) + final dateCompleted = formatter.format(metadata.complete) + final nextflowVersion = nextflowMeta.version.toString() + final wrrocParams = session.config.prov["formats"]["wrroc"] as Map + + // Copy workflow into crate directory + Files.copy(scriptFile, crateRootDir.resolve(scriptFile.getFileName()), StandardCopyOption.REPLACE_EXISTING) + + // Copy nextflow_schema_json into crate if it exists + final schemaFile = scriptFile.getParent().resolve("nextflow_schema.json") + // TODO Add to crate metadata + if (Files.exists(schemaFile)) + Files.copy(schemaFile, crateRootDir.resolve(schemaFile.getFileName()), StandardCopyOption.REPLACE_EXISTING) + + + // create manifest + final softwareApplicationId = UUID.randomUUID() + final organizeActionId = UUID.randomUUID() + + // Process wrroc configuration options + agent = parseAgentInfo(wrrocParams) + organization = parseOrganizationInfo(wrrocParams) + publisherID = getPublisherID(wrrocParams, agent, organization) + if(organization) + agent.put("affiliation", ["@id": organization.get("@id")]) + //license = parseLicenseInfo(wrrocParams) + + + // license information + boolean licenseURLvalid = false + String licenseString = null; + URI licenseURL = null + Map license = null + if (wrrocParams.containsKey("license")) { + licenseString = wrrocParams.get("license") + try { + licenseURL = new URL(licenseString).toURI(); + licenseURLvalid = true + + // Entity for 
license URL + license = [ + "@id" : licenseURL.toString(), + "@type": "CreativeWork" + ] + } catch (Exception e) { + licenseURLvalid = false + } + } + + final formalParameters = params + .collect { name, value -> + [ + "@id" : "#${name}", + "@type" : "FormalParameter", + // TODO: infer type from value at runtime + "additionalType": "String", + // "defaultValue": "", + "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], + "description" : "", + // TODO: apply only if type is Path + // "encodingFormat": "text/plain", + // TODO: match to output if type is Path + // "workExample": ["@id": outputId], + "name" : name, + // "valueRequired": "True" + ] + } + + final inputFiles = workflowInputMapping + .collect { source, target -> + [ + "@id" : crateRootDir.relativize(target).toString(), + "@type" : "File", + "name" : target.name, + "description" : "", + "encodingFormat": Files.probeContentType(source) ?: "", + "fileType": "whatever", + // TODO: apply if matching param is found + // "exampleOfWork": ["@id": paramId] + ] + } + + final outputFiles = workflowOutputs + .collect { source, target -> + [ + "@id" : crateRootDir.relativize(target).toString(), + "@type" : "File", + "name" : target.name, + "description" : "", + "encodingFormat": Files.probeContentType(target) ?: "", + // TODO: create FormalParameter for each output file? + // "exampleOfWork": {"@id": "#reversed"} + ] + } + + // Combine both, inputFiles and outputFiles into one list. Remove duplicates that occur when an intermediate + // file is output of a task and input of another task. 
+ Map> combinedInputOutputMap = [:] + + inputFiles.each { entry -> + combinedInputOutputMap[entry['@id']] = entry + } + // Overwriting if 'id' already exists + outputFiles.each { entry -> + combinedInputOutputMap[entry['@id']] = entry + } + List> uniqueInputOutputFiles = combinedInputOutputMap.values().toList() + + final propertyValues = params + .collect { name, value -> + [ + "@id" : "#${name}-pv", + "@type" : "PropertyValue", + "exampleOfWork": ["@id": "#${name}"], + "name" : name, + "value" : isNested(value) ? JsonOutput.toJson(value) : value + ] + } + + // Maps used for finding tasks/CreateActions corresponding to a Nextflow process + Map processToTasks = [:].withDefault { [] } + + def createActions = tasks + .collect { task -> + + List resultFileIDs = [] + for (taskOutputParam in task.getOutputsByType(FileOutParam)) { + for (taskOutputFile in taskOutputParam.getValue()) { + // Path to file in workdir + Path taskOutputFilePath = Path.of(taskOutputFile.toString()) + + if (workflowOutputs.containsKey(taskOutputFilePath)) { + resultFileIDs.add(crateRootDir.relativize(workflowOutputs.get(taskOutputFilePath)).toString()) + } else { + System.out.println("taskOutput not contained in workflowOutputs list: " + taskOutputFilePath) + } + } + } + + List objectFileIDs = [] + for (taskInputParam in task.getInputsByType(FileInParam)) { + for (taskInputFileHolder in taskInputParam.getValue()) { + FileHolder holder = (FileHolder) taskInputFileHolder + Path taskInputFilePath = holder.getStorePath() + + if (workflowInputs.contains(taskInputFilePath)) { + // The mapping of input files to their path in the RO-Crate is only available for files we + // expect (e.g. files in workdir and pipeline assets). Have to handle unexpected files ... 
+ try { + objectFileIDs.add(crateRootDir.relativize(workflowInputMapping.get(taskInputFilePath)).toString()) + } catch(Exception e) { + System.out.println("Unexpected input file: " + taskInputFilePath.toString()) + } + } else { + System.out.println("taskInput not contained in workflowInputs list: " + taskInputFilePath) + } + } + } + + def createAction = [ + "@id" : "#" + task.getHash().toString(), + "@type" : "CreateAction", + "name" : task.getName(), + // TODO: There is no description for Nextflow processes? + //"description" : "", + // TODO: task doesn't contain startTime information. TaskHandler does, but is not available to WrrocRenderer + //"startTime": "". + // TODO: Same as for startTime + //"endTime": "", + "instrument" : ["@id": "#" + task.getProcessor().ownerScript.toString()], + "agent" : ["@id": agent.get("@id").toString()], + "object" : objectFileIDs.collect(file -> ["@id": file]), + "result" : resultFileIDs.collect(file -> ["@id": file]), + "actionStatus": task.getExitStatus() == 0 ? 
"CompletedActionStatus" : "FailedActionStatus" + ] + + // Add error message if there is one + if (task.getExitStatus() != 0) { + createAction.put("error", task.getStderr()) + } + + return createAction + } + + final nextflowProcesses = tasks + .collect { task -> + processToTasks[task.getProcessor().getId().toString()].add("#${task.getHash().toString()}") + return task.getProcessor() + }.unique() + + final wfSofwareApplications = nextflowProcesses + .collect() { process -> + [ + "@id" : "#" + process.ownerScript.toString(), + "@type": "SoftwareApplication", + "name" : process.getName() + ] + } + + final howToSteps = nextflowProcesses + .collect() { process -> + [ + "@id" : metadata.projectName + "#main/" + process.getName(), + "@type" : "HowToStep", + "workExample": ["@id": "#" + process.ownerScript.toString()], + "position" : process.getId().toString() + ] + } + + final controlActions = nextflowProcesses + .collect() { process -> + [ + "@id" : "#" + UUID.randomUUID(), + "@type" : "ControlAction", + "instrument": ["@id": "${metadata.projectName}#main/${process.getName()}"], + "name" : "orchestrate " + "${metadata.projectName}#main/${process.getName()}", + "object" : processToTasks[process.getId().toString()].collect({ taskID -> + ["@id": taskID] + }) + ] + } + + final configFile = + [ + "@id" : "nextflow.config", + "@type" : "File", + "name" : "Effective Nextflow configuration", + "description": "This is the effective configuration during runtime compiled from all configuration sources. " + ] + + final wrroc = [ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph" : [ + [ + "@id" : path.name, + "@type" : "CreativeWork", + "about" : ["@id": "./"], + "conformsTo": [ + ["@id": "https://w3id.org/ro/crate/1.1"], + ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] + ] + ], + [ + "@id" : "./", + "@type" : "Dataset", + "author" : ["@id": agent.get("@id").toString()], + "publisher" : publisherID ? 
["@id": publisherID] : null, + "datePublished": getDatePublished(), + "conformsTo" : [ + ["@id": "https://w3id.org/ro/wfrun/process/0.1"], + ["@id": "https://w3id.org/ro/wfrun/workflow/0.1"], + ["@id": "https://w3id.org/ro/wfrun/provenance/0.1"], + ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] + ], + "name" : "Workflow run of ${metadata.projectName}", + "description": manifest.description ?: "", + "hasPart" : [ + ["@id": metadata.projectName], + ["@id": "nextflow.config"], + *uniqueInputOutputFiles.collect(file -> ["@id": file["@id"]]) + ], + "mainEntity" : ["@id": metadata.projectName], + "mentions" : [ + ["@id": "#${session.uniqueId}"], + *createActions.collect(createAction -> ["@id": createAction["@id"]]) + ], + "license" : licenseURLvalid ? ["@id": licenseURL.toString()] : licenseString + ].findAll { it.value != null }, + [ + "@id" : "https://w3id.org/ro/wfrun/process/0.1", + "@type" : "CreativeWork", + "name" : "Process Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/ro/wfrun/workflow/0.1", + "@type" : "CreativeWork", + "name" : "Workflow Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/ro/wfrun/provenance/0.1", + "@type" : "CreativeWork", + "name" : "Provenance Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/workflowhub/workflow-ro-crate/1.0", + "@type" : "CreativeWork", + "name" : "Workflow RO-Crate", + "version": "1.0" + ], + [ + "@id" : metadata.projectName, + "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], + "name" : metadata.projectName, + "programmingLanguage": ["@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"], + "hasPart" : wfSofwareApplications.collect(sa -> + ["@id": sa["@id"]] + ), + "input" : formalParameters.collect(fp -> + ["@id": fp["@id"]] + ), + "output" : [ + // TODO: id of FormalParameter for each output file + ], + "step" : howToSteps.collect(step -> + ["@id": step["@id"]] + ), + ], + [ + "@id" : 
"https://w3id.org/workflowhub/workflow-ro-crate#nextflow", + "@type" : "ComputerLanguage", + "name" : "Nextflow", + "identifier": "https://www.nextflow.io/", + "url" : "https://www.nextflow.io/", + "version" : nextflowVersion + ], + *wfSofwareApplications, + *formalParameters, + [ + "@id" : "#${softwareApplicationId}", + "@type": "SoftwareApplication", + "name" : "Nextflow ${nextflowVersion}" + ], + + *howToSteps, + [ + "@id" : "#${organizeActionId}", + "@type" : "OrganizeAction", + "agent" : ["@id": agent.get("@id").toString()], + "instrument": ["@id": "#${softwareApplicationId}"], + "name" : "Run of Nextflow ${nextflowVersion}", + "object" : [ + *controlActions.collect(action -> ["@id": action["@id"]]) + ], + "result" : ["@id": "#${session.uniqueId}"], + "startTime" : dateStarted, + "endTime" : dateCompleted + ], + [ + "@id" : "#${session.uniqueId}", + "@type" : "CreateAction", + "agent" : ["@id": agent.get("@id").toString()], + "name" : "Nextflow workflow run ${session.uniqueId}", + "startTime" : dateStarted, + "endTime" : dateCompleted, + "instrument": ["@id": metadata.projectName], + "object" : [ + *inputFiles.collect(file -> ["@id": file["@id"]]), + *propertyValues.collect(pv -> ["@id": pv["@id"]]) + ], + "result" : outputFiles.collect(file -> + ["@id": file["@id"]] + ) + ], + *[agent], + *[organization], + *controlActions, + *createActions, + configFile, + *uniqueInputOutputFiles, + *propertyValues, + license + ].findAll { it != null } + ] + + // render manifest to JSON file + path.text = JsonOutput.prettyPrint(JsonOutput.toJson(wrroc)) + } + + static Set getIntermediateInputFiles(Set tasks, Set workflowInputs) { + Set intermediateInputFiles = [] + + tasks.collect { task -> + for (taskInputParam in task.getInputsByType(FileInParam)) { + for (taskInputFileHolder in taskInputParam.getValue()) { + FileHolder holder = (FileHolder) taskInputFileHolder + Path taskInputFilePath = holder.getStorePath() + + if (!workflowInputs.contains(taskInputFilePath)) { + 
intermediateInputFiles.add(taskInputFilePath) + } + } + } + } + + return intermediateInputFiles + } + + def Map getIntermediateOutputFiles(Set tasks, Map workflowOutputs) { + Map intermediateInputFiles = [:] + + tasks.collect { task -> + for (taskOutputParam in task.getOutputsByType(FileOutParam)) { + for (taskOutputFile in taskOutputParam.getValue()) { + // Path to file in workdir + Path taskOutputFilePath = Path.of(taskOutputFile.toString()) + + if (! workflowOutputs.containsKey(taskOutputFilePath)) { + + // Find the relative path from workdir + Path relativePath = workdir.relativize(taskOutputFilePath) + + // Build the new path by combining crateRootDir and the relative part + Path outputFileInCrate = crateRootDir.resolve(workdir.fileName).resolve(relativePath) + + intermediateInputFiles.put(taskOutputFilePath, outputFileInCrate) + } + } + } + } + + return intermediateInputFiles + } + + /** + * Map input files from Nextflow workdir into the RO-Crate. + * + * @param paths Input file paths on the file system + * @return Map of input file paths into the RO-Crate + */ + def Map getWorkflowInputMapping(Set paths) { + + // The resulting mapping + Map workflowInputMapping = [:] + + // Nextflow asset directory + Path assetDir = projectDir.resolve("assets") + + // pipeline_info directory. Although located in the result directory, it is used as input for MultiQC + Path pipelineInfoDir = crateRootDir.resolve("pipeline_info") + + paths.collect { inputPath -> + + // Depending on where the input file is stored, use different Paths for the parent directory. + // We assume that an input file is either stored in the workdir or in the pipeline's asset directory. 
+ Path parentDir = null + if (inputPath.startsWith(workdir)) + parentDir = workdir + else if (inputPath.startsWith(assetDir)) + parentDir = assetDir + else if (inputPath.startsWith(pipelineInfoDir)) + parentDir = pipelineInfoDir + else { + System.out.println("Unknown parentDir: " + inputPath.toString()) + } + + // Ignore file with unkown (e.g. null) parentDir + if(parentDir) { + Path relativePath = parentDir.relativize(inputPath) + Path outputFileInCrate = crateRootDir.resolve(parentDir.fileName).resolve(relativePath) + workflowInputMapping.put(inputPath, outputFileInCrate) + } + } + + return workflowInputMapping + } + + static String getDatePublished() { + return LocalDateTime.now().format(DateTimeFormatter.ISO_DATE) + } + + /** + * Parse information about agent running the workflow from parameters + * + * @param params Nextflow parameters + * @return Map describing agent via '@id'. 'orcid' and 'name' + */ + static def LinkedHashMap parseAgentInfo(Map params) { + final LinkedHashMap agent = new LinkedHashMap() + + if (! params.containsKey("agent")) + return null + + Map agentMap = params["agent"] as Map + agent.put("@id", agentMap.containsKey("orcid") ? agentMap.get("orcid") : "agent-1") + agent.put("@type", "Person") + if(agentMap.containsKey("name")) + agent.put("name", agentMap.get("name")) + + return agent + } + + + /** + * Parse information about organization agent running the workflow belongs to. + * + * @param params Nextflow parameters + * @return Map describing organization via '@id'. 'orcid' and 'name' + */ + static def LinkedHashMap parseOrganizationInfo(Map params) { + final LinkedHashMap org = new LinkedHashMap() + + if (! params.containsKey("organization")) + return null + + Map orgMap = params["organization"] as Map + org.put("@id", orgMap.containsKey("ror") ? 
orgMap.get("ror") : "organization-1") + org.put("@type", "Organization") + if(orgMap.containsKey("name")) + org.put("name", orgMap.get("name")) + + return org + } + + + /** + * Parse information about the RO-Crate publisher. + * + * @param params Nextflow parameters + * @return Publisher ID + */ + static def String getPublisherID(Map params, Map agent, Map organization) { + + if (! params.containsKey("publisher")) + return null + + Map publisherMap = params["publisher"] as Map + if (! publisherMap.containsKey("id")) + return null + + String publisherID = publisherMap.get("id") + String agentID = "" + String organizationID = "" + if (agent) + agentID = agent.get("@id") + if (organization) + organizationID = organization.get("@id") + + // Check if the publisher ID references either the organization or the agent + if (publisherID != agentID && publisherID != organizationID) + return null + + return publisherID + } + + + /** + * Check if a groovy object contains nested structures, e.g. will not be flattened when serialized as JSON + * + * @param obj The object to be checked + * @return true if the object contains nested structures + */ + static def boolean isNested(Object obj) { + return (obj instanceof Map || obj instanceof List) + } +} From e541606a76d7bbc83c0464b580ff80a4b04ec7fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 18 Nov 2024 15:53:59 +0100 Subject: [PATCH 02/54] add encodingFormat for nextflow.config --- .../nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 87a15a4..8e3e7f1 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -384,10 +384,11 @@ class WrrocRenderer implements Renderer { final configFile = [ - "@id" : "nextflow.config", - 
"@type" : "File", - "name" : "Effective Nextflow configuration", - "description": "This is the effective configuration during runtime compiled from all configuration sources. " + "@id" : "nextflow.config", + "@type" : "File", + "name" : "Effective Nextflow configuration", + "description" : "This is the effective configuration during runtime compiled from all configuration sources.", + "encodingFormat": "text/plain", ] final wrroc = [ From 9889a692360e435193d917efc9145264a2744445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 18 Nov 2024 16:01:32 +0100 Subject: [PATCH 03/54] add encodingFormat for main.nf --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 8e3e7f1..48438cb 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -388,7 +388,7 @@ class WrrocRenderer implements Renderer { "@type" : "File", "name" : "Effective Nextflow configuration", "description" : "This is the effective configuration during runtime compiled from all configuration sources.", - "encodingFormat": "text/plain", + "encodingFormat": "text/plain" ] final wrroc = [ @@ -456,6 +456,7 @@ class WrrocRenderer implements Renderer { [ "@id" : metadata.projectName, "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], + "encodingFormat" : "application/nextflow", "name" : metadata.projectName, "programmingLanguage": ["@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"], "hasPart" : wfSofwareApplications.collect(sa -> From 91fc7e2f3f25f6c4bf2daa199838266f349bedc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Tue, 3 Dec 2024 14:03:37 +0100 Subject: [PATCH 04/54] feat: add wrroc to valid formats --- 
plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy index f508959..7dca105 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy @@ -40,7 +40,7 @@ import nextflow.trace.TraceRecord @CompileStatic class ProvObserver implements TraceObserver { - public static final List VALID_FORMATS = ['bco', 'dag', 'legacy'] + public static final List VALID_FORMATS = ['bco', 'dag', 'legacy', 'wrroc'] private Session session @@ -71,6 +71,9 @@ class ProvObserver implements TraceObserver { if( name == 'legacy' ) return new LegacyRenderer(opts) + if( name == 'wrroc' ) + return new WrrocRenderer(opts) + throw new IllegalArgumentException("Invalid provenance format -- valid formats are ${VALID_FORMATS.join(', ')}") } From 141683307f3a951e8992199c3a679b990f7886f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20B=C3=A4uerle?= <45968370+famosab@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:30:42 +0100 Subject: [PATCH 05/54] fix: make getIntermediateOutputFiles work again (#18) * fx: make getIntermediateOutputFiles work again * Fix bugs fixes #16 fixes #17 --------- Co-authored-by: fbartusch --- .../main/nextflow/prov/WrrocRenderer.groovy | 79 +++++++++++++------ 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 48438cb..b6b9c22 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -41,8 +41,11 @@ import nextflow.processor.TaskRun class WrrocRenderer implements Renderer { private Path path + // The final RO-Crate directory private Path crateRootDir + // Nextflow work directory private 
Path workdir + // Nextflow pipeline directory (contains main.nf, assets, etc.) private Path projectDir private LinkedHashMap agent @@ -110,7 +113,7 @@ class WrrocRenderer implements Renderer { if (!Files.exists(dest)) Files.copy(source, dest) } catch (Exception e) { - println "Failed to copy $source to $dest: ${e.message}" + println "workflowInput: Failed to copy $source to $dest: ${e.message}" } } } @@ -141,7 +144,7 @@ class WrrocRenderer implements Renderer { Files.createDirectories(dest.getParent()) Files.copy(source, dest, StandardCopyOption.REPLACE_EXISTING) } catch (Exception e) { - println "Failed to copy $source to $dest: ${e.message}" + println "workflowOutput Failed to copy $source to $dest: ${e.message}" } } } @@ -284,18 +287,29 @@ class WrrocRenderer implements Renderer { def createActions = tasks .collect { task -> - List resultFileIDs = [] + + // Collect output files of the path + List outputFileList = [] for (taskOutputParam in task.getOutputsByType(FileOutParam)) { + + if (taskOutputParam.getValue() instanceof Path) { + outputFileList.add(taskOutputParam.getValue() as Path) + continue + } + for (taskOutputFile in taskOutputParam.getValue()) { // Path to file in workdir - Path taskOutputFilePath = Path.of(taskOutputFile.toString()) + outputFileList.add(Path.of(taskOutputFile.toString())) + } + } - if (workflowOutputs.containsKey(taskOutputFilePath)) { - resultFileIDs.add(crateRootDir.relativize(workflowOutputs.get(taskOutputFilePath)).toString()) - } else { - System.out.println("taskOutput not contained in workflowOutputs list: " + taskOutputFilePath) - } + // Check if the output files have a mapping in workflowOutputs + for (outputFile in outputFileList) { + if (workflowOutputs.containsKey(outputFile)) { + resultFileIDs.add(crateRootDir.relativize(workflowOutputs.get(outputFile)).toString()) + } else { + System.out.println("taskOutput not contained in workflowOutputs list: " + outputFile) } } @@ -553,29 +567,41 @@ class WrrocRenderer implements 
Renderer { } def Map getIntermediateOutputFiles(Set tasks, Map workflowOutputs) { - Map intermediateInputFiles = [:] - tasks.collect { task -> + List intermediateOutputFilesList = [] + Map intermediateOutputFilesMap = [:] + + tasks.each { task -> for (taskOutputParam in task.getOutputsByType(FileOutParam)) { + + // If the param is a Path, just add it to the intermediate list + if (taskOutputParam.getValue() instanceof Path) { + intermediateOutputFilesList.add(taskOutputParam.getValue() as Path) + continue + } + for (taskOutputFile in taskOutputParam.getValue()) { - // Path to file in workdir - Path taskOutputFilePath = Path.of(taskOutputFile.toString()) + intermediateOutputFilesList.add(taskOutputFile as Path) + } + } + } - if (! workflowOutputs.containsKey(taskOutputFilePath)) { + // Iterate over the file list and create the mapping + for (outputFile in intermediateOutputFilesList) { + if (!workflowOutputs.containsKey(outputFile)) { - // Find the relative path from workdir - Path relativePath = workdir.relativize(taskOutputFilePath) + // Find the relative path from workdir + Path relativePath = workdir.relativize(outputFile) - // Build the new path by combining crateRootDir and the relative part - Path outputFileInCrate = crateRootDir.resolve(workdir.fileName).resolve(relativePath) + // Build the new path by combining crateRootDir and the relative part + Path outputFileInCrate = crateRootDir.resolve(workdir.fileName).resolve(relativePath) - intermediateInputFiles.put(taskOutputFilePath, outputFileInCrate) - } - } + Files.createDirectories(outputFileInCrate.parent) + intermediateOutputFilesMap.put(outputFile, outputFileInCrate) } } - return intermediateInputFiles + return intermediateOutputFilesMap } /** @@ -606,15 +632,18 @@ class WrrocRenderer implements Renderer { parentDir = assetDir else if (inputPath.startsWith(pipelineInfoDir)) parentDir = pipelineInfoDir - else { - System.out.println("Unknown parentDir: " + inputPath.toString()) - } + // Ignore file with 
unkown (e.g. null) parentDir if(parentDir) { Path relativePath = parentDir.relativize(inputPath) Path outputFileInCrate = crateRootDir.resolve(parentDir.fileName).resolve(relativePath) workflowInputMapping.put(inputPath, outputFileInCrate) + } else { + // All other files are simple copied into the crate with their absolute path into the crate root + Path relativePath = Path.of(inputPath.toString().substring(1)) + Path outputFileInCrate = crateRootDir.resolve(relativePath) + workflowInputMapping.put(inputPath, outputFileInCrate) } } From 416d920838ec74784a2d30e0057b2a9c1e5a6320 Mon Sep 17 00:00:00 2001 From: fbartusch Date: Sun, 8 Dec 2024 19:01:03 +0100 Subject: [PATCH 06/54] Check in input and output if file or directory * fix #7 Signed-off-by: fbartusch --- .../main/nextflow/prov/WrrocRenderer.groovy | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index b6b9c22..1e81fea 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -235,7 +235,7 @@ class WrrocRenderer implements Renderer { .collect { source, target -> [ "@id" : crateRootDir.relativize(target).toString(), - "@type" : "File", + "@type" : getType(source), "name" : target.name, "description" : "", "encodingFormat": Files.probeContentType(source) ?: "", @@ -249,7 +249,7 @@ class WrrocRenderer implements Renderer { .collect { source, target -> [ "@id" : crateRootDir.relativize(target).toString(), - "@type" : "File", + "@type" : getType(source), "name" : target.name, "description" : "", "encodingFormat": Files.probeContentType(target) ?: "", @@ -738,4 +738,18 @@ class WrrocRenderer implements Renderer { static def boolean isNested(Object obj) { return (obj instanceof Map || obj instanceof List) } + + /** + * Check if a Path is a file or a directory and return 
corresponding "@type" + * @param path The path to be checked + * @return type Either "File" or "Directory" + */ + static def String getType(Path path) { + String type = "File" + + if(path.isDirectory()) + type = "Directory" + + return type + } } From 7560d4c41b04ece2987114b9a5ae36189500c5ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20B=C3=A4uerle?= <45968370+famosab@users.noreply.github.com> Date: Mon, 9 Dec 2024 08:05:20 +0100 Subject: [PATCH 07/54] feat: add README to crate (#14) * feat: add README to create * feat: ignore vscode * fix: make getIntermediateOutputFiles work again (#18) (#19) * fx: make getIntermediateOutputFiles work again * Fix bugs fixes #16 fixes #17 --------- Co-authored-by: fbartusch * feat: add README to json * feat: check first if readme exists * Add readme to hasPart Signed-off-by: fbartusch --------- Signed-off-by: fbartusch Co-authored-by: fbartusch --- .gitignore | 3 ++ .../main/nextflow/prov/WrrocRenderer.groovy | 42 ++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 26b9135..cdf34dc 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ gradle.properties build work results + +# Ignore vscode dirs +.vscode \ No newline at end of file diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index b6b9c22..621d73c 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -31,6 +31,8 @@ import groovy.transform.CompileStatic import nextflow.Session import nextflow.processor.TaskRun +import org.apache.commons.io.FilenameUtils; + /** * Renderer for the Provenance Run RO Crate format. 
* @@ -154,6 +156,42 @@ class WrrocRenderer implements Renderer { FileWriter configFileWriter = new FileWriter(configFilePath.toString()) configMap.toConfigObject().writeTo(configFileWriter) + // get workflow README file and store it in crate + boolean readmeExists = false + List readmeFiles = ["README.md", "README.txt", "readme.md", "readme.txt", "Readme.md", "Readme.txt", "README"] + Path readmeFilePath = null + String readmeFileName = null + String readmeFileExtension = null + String readmeFileEncoding = null + + for (String fileName : readmeFiles) { + Path potentialReadmePath = projectDir.resolve(fileName) + if (Files.exists(potentialReadmePath)) { + readmeExists = true + readmeFilePath = potentialReadmePath + readmeFileName = fileName + if (FilenameUtils.getExtension(fileName).equals("md")) + readmeFileEncoding = "text/markdown" + else + readmeFileEncoding = "text/plain" + break + } + } + def readmeFile = null + + // Copy the README file into RO-Crate if it exists + if (readmeExists) { + Files.copy(readmeFilePath, crateRootDir.resolve(readmeFileName), StandardCopyOption.REPLACE_EXISTING) + readmeFile = + [ + "@id" : readmeFileName, + "@type" : "File", + "name" : readmeFileName, + "description" : "This is the README file of the workflow.", + "encodingFormat": readmeFileEncoding + ] + } + // get workflow metadata final metadata = session.workflowMetadata this.normalizer = new PathNormalizer(metadata) @@ -434,8 +472,9 @@ class WrrocRenderer implements Renderer { "hasPart" : [ ["@id": metadata.projectName], ["@id": "nextflow.config"], + readmeExists ? 
["@id": readmeFile.get("@id")] : null, *uniqueInputOutputFiles.collect(file -> ["@id": file["@id"]]) - ], + ].findAll { it != null }, "mainEntity" : ["@id": metadata.projectName], "mentions" : [ ["@id": "#${session.uniqueId}"], @@ -537,6 +576,7 @@ class WrrocRenderer implements Renderer { *controlActions, *createActions, configFile, + readmeFile, *uniqueInputOutputFiles, *propertyValues, license From e6e3844962d96a85e2f1ccdb450ad384ea21db78 Mon Sep 17 00:00:00 2001 From: fbartusch Date: Mon, 9 Dec 2024 08:58:21 +0100 Subject: [PATCH 08/54] Set correct MIME types * Add getEncodingFormat function that return the encoding format for a file * handle YAML files manually Signed-off-by: fbartusch --- .../main/nextflow/prov/WrrocRenderer.groovy | 86 ++++++++++++++++--- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 1e81fea..2eed1fe 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -30,6 +30,7 @@ import groovy.json.JsonOutput import groovy.transform.CompileStatic import nextflow.Session import nextflow.processor.TaskRun +import org.apache.commons.io.FilenameUtils /** * Renderer for the Provenance Run RO Crate format. 
@@ -221,14 +222,13 @@ class WrrocRenderer implements Renderer { "additionalType": "String", // "defaultValue": "", "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], - "description" : "", - // TODO: apply only if type is Path - // "encodingFormat": "text/plain", + "description" : null, + "encodingFormat": getEncodingFormat(value), // TODO: match to output if type is Path // "workExample": ["@id": outputId], "name" : name, // "valueRequired": "True" - ] + ].findAll { it.value != null } } final inputFiles = workflowInputMapping @@ -237,12 +237,12 @@ class WrrocRenderer implements Renderer { "@id" : crateRootDir.relativize(target).toString(), "@type" : getType(source), "name" : target.name, - "description" : "", - "encodingFormat": Files.probeContentType(source) ?: "", - "fileType": "whatever", + "description" : null, + "encodingFormat": getEncodingFormat(source, target), + //"fileType": "whatever", // TODO: apply if matching param is found // "exampleOfWork": ["@id": paramId] - ] + ].findAll { it.value != null } } final outputFiles = workflowOutputs @@ -251,16 +251,17 @@ class WrrocRenderer implements Renderer { "@id" : crateRootDir.relativize(target).toString(), "@type" : getType(source), "name" : target.name, - "description" : "", - "encodingFormat": Files.probeContentType(target) ?: "", + "description" : null, + "encodingFormat": getEncodingFormat(source, target), // TODO: create FormalParameter for each output file? // "exampleOfWork": {"@id": "#reversed"} - ] + ].findAll { it.value != null } } // Combine both, inputFiles and outputFiles into one list. Remove duplicates that occur when an intermediate // file is output of a task and input of another task. 
- Map> combinedInputOutputMap = [:] + //Map> combinedInputOutputMap = [:] + Map combinedInputOutputMap = [:] inputFiles.each { entry -> combinedInputOutputMap[entry['@id']] = entry @@ -269,7 +270,7 @@ class WrrocRenderer implements Renderer { outputFiles.each { entry -> combinedInputOutputMap[entry['@id']] = entry } - List> uniqueInputOutputFiles = combinedInputOutputMap.values().toList() + List uniqueInputOutputFiles = combinedInputOutputMap.values().toList() final propertyValues = params .collect { name, value -> @@ -430,7 +431,7 @@ class WrrocRenderer implements Renderer { ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] ], "name" : "Workflow run of ${metadata.projectName}", - "description": manifest.description ?: "", + "description": manifest.description ?: null, "hasPart" : [ ["@id": metadata.projectName], ["@id": "nextflow.config"], @@ -741,6 +742,7 @@ class WrrocRenderer implements Renderer { /** * Check if a Path is a file or a directory and return corresponding "@type" + * * @param path The path to be checked * @return type Either "File" or "Directory" */ @@ -752,4 +754,60 @@ class WrrocRenderer implements Renderer { return type } + + /** + * Get the encodingFormat of a file as MIME Type. + * + * @param object An object that may be a file + * @return the MIME type of the object or null, if it's not a file. + */ + static def String getEncodingFormat(Object object) { + + // Check if the object is a string and convert it to a Path + if (object instanceof String) { + Path path = Paths.get((String) object); + return getEncodingFormat(path, null) + } else { + return null + } + } + + + /** + * Get the encodingFormat of a file as MIME Type. + * A file can exist at two places. At the source where Nextflow or the user stored the file, + * or in the RO-Crate (i.e. target) location. The method takes both locations as arguments, if one + * of the locations does not exist any more. 
+ * + * @param source Path to file + * @param target Path to file + * @return the MIME type of the file or null, if it's not a file. + */ + static def String getEncodingFormat(Path source, Path target) { + String mime = null + + if(source && source.exists() && source.isFile()) + mime = Files.probeContentType(source) ?: null + else if(target && target.exists() && target.isFile()) + mime = Files.probeContentType(target) ?: null + else { + return mime + } + + // It seems that YAML has a media type only since beginning of 2024 + // Set this by hand if this is run on older systems: + // https://httptoolkit.com/blog/yaml-media-type-rfc/ + if(!mime) { + String extension = null + if(source) + extension = FilenameUtils.getExtension(source.toString()) + else if(target) + extension = FilenameUtils.getExtension(target.toString()) + + if(["yml", "yaml"].contains(extension)) + mime = "application/yaml" + } + + return mime + } } From 0d3fd2dea590d2ae5e454fd79b1421c79898cc74 Mon Sep 17 00:00:00 2001 From: Felix Bartusch Date: Fri, 13 Dec 2024 14:43:47 +0100 Subject: [PATCH 09/54] Add contactPoint for agent and organization (#21) * implements #1 Signed-off-by: fbartusch --- nextflow.config | 3 + .../main/nextflow/prov/WrrocRenderer.groovy | 64 ++++++++++++++++++- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index b4c7573..b401f6b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,9 @@ prov { agent { name = "John Doe" orcid = "https://orcid.org/0000-0000-0000-0000" + email = "john.doe@example.org" + phone = "(0)89-99998 000" + contactType = "Researcher" } organization { name = "University of XYZ" diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 621d73c..de1aeb9 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -52,6 +52,8 @@ class 
WrrocRenderer implements Renderer { private LinkedHashMap agent private LinkedHashMap organization + // List of contactPoints (people, organizations) to be added to ro-crate-metadata.json + private List contactPoints = [] private String publisherID private boolean overwrite @@ -573,6 +575,7 @@ class WrrocRenderer implements Renderer { ], *[agent], *[organization], + *contactPoints, *controlActions, *createActions, configFile, @@ -700,18 +703,28 @@ class WrrocRenderer implements Renderer { * @param params Nextflow parameters * @return Map describing agent via '@id'. 'orcid' and 'name' */ - static def LinkedHashMap parseAgentInfo(Map params) { + def LinkedHashMap parseAgentInfo(Map params) { final LinkedHashMap agent = new LinkedHashMap() if (! params.containsKey("agent")) return null Map agentMap = params["agent"] as Map + agent.put("@id", agentMap.containsKey("orcid") ? agentMap.get("orcid") : "agent-1") agent.put("@type", "Person") if(agentMap.containsKey("name")) agent.put("name", agentMap.get("name")) + // Check for contact information + if(agentMap.containsKey("email") || agentMap.containsKey("phone")) { + // Add contact point to ro-crate-metadata.json + String contactPointID = parseContactPointInfo(agentMap) + if(contactPointID) + agent.put("contactPoint", ["@id": contactPointID ]) + + } + return agent } @@ -722,7 +735,7 @@ class WrrocRenderer implements Renderer { * @param params Nextflow parameters * @return Map describing organization via '@id'. 'orcid' and 'name' */ - static def LinkedHashMap parseOrganizationInfo(Map params) { + def LinkedHashMap parseOrganizationInfo(Map params) { final LinkedHashMap org = new LinkedHashMap() if (! 
params.containsKey("organization")) @@ -734,10 +747,57 @@ class WrrocRenderer implements Renderer { if(orgMap.containsKey("name")) org.put("name", orgMap.get("name")) + // Check for contact information + if(orgMap.containsKey("email") || orgMap.containsKey("phone")) { + // Add contact point to ro-crate-metadata.json + String contactPointID = parseContactPointInfo(orgMap) + if(contactPointID) + org.put("contactPoint", ["@id": contactPointID ]) + } + return org } + /** + * Parse information about contact point and add to contactPoints list. + * + * @param params Map describing an agent or organization + * @return ID of the contactPoint + */ + def String parseContactPointInfo(Map map) { + + String contactPointID = "" + final LinkedHashMap contactPoint = new LinkedHashMap() + + // Prefer email for the contact point ID + if(map.containsKey("email")) + contactPointID = "mailto:" + map.get("email") + else if(map.containsKey("phone")) + contactPointID = map.get("phone") + else + return null + + contactPoint.put("@id", contactPointID) + contactPoint.put("@type", "ContactPoint") + if(map.containsKey("contactType")) + contactPoint.put("contactType", map.get("contactType")) + if(map.containsKey("email")) + contactPoint.put("email", map.get("email")) + if(map.containsKey("phone")) + contactPoint.put("phone", map.get("phone")) + if(map.containsKey("orcid")) + contactPoint.put("url", map.get("orcid")) + if(map.containsKey("orcid")) + contactPoint.put("url", map.get("orcid")) + if(map.containsKey("rar")) + contactPoint.put("url", map.get("rar")) + + contactPoints.add(contactPoint) + return contactPointID + } + + /** * Parse information about the RO-Crate publisher. * From 816cf17f454c7c013c481f9927c2176bb0ac217e Mon Sep 17 00:00:00 2001 From: Felix Bartusch Date: Fri, 13 Dec 2024 14:44:29 +0100 Subject: [PATCH 10/54] Fix #4 (#22) * main workflow complies (more or less) with ComputationalWorkflow profile version 1.0 (if set in manifest add license, url, version, description, ...) 
* Correct value for ActionStatus Signed-off-by: fbartusch --- .../main/nextflow/prov/WrrocRenderer.groovy | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index de1aeb9..f189c1b 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -387,7 +387,7 @@ class WrrocRenderer implements Renderer { "agent" : ["@id": agent.get("@id").toString()], "object" : objectFileIDs.collect(file -> ["@id": file]), "result" : resultFileIDs.collect(file -> ["@id": file]), - "actionStatus": task.getExitStatus() == 0 ? "CompletedActionStatus" : "FailedActionStatus" + "actionStatus": task.getExitStatus() == 0 ? "http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" ] // Add error message if there is one @@ -469,8 +469,8 @@ class WrrocRenderer implements Renderer { ["@id": "https://w3id.org/ro/wfrun/provenance/0.1"], ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] ], - "name" : "Workflow run of ${metadata.projectName}", - "description": manifest.description ?: "", + "name" : "Workflow run of " + manifest.getName() ?: metadata.projectName, + "description": manifest.description ?: null, "hasPart" : [ ["@id": metadata.projectName], ["@id": "nextflow.config"], @@ -511,9 +511,16 @@ class WrrocRenderer implements Renderer { [ "@id" : metadata.projectName, "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], - "encodingFormat" : "application/nextflow", - "name" : metadata.projectName, + "conformsTo" : ["@id": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE"], + "name" : manifest.getName() ?: metadata.projectName, + "description" : manifest.getDescription() ?: null, "programmingLanguage": ["@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"], + "creator" : 
manifest.getAuthor() ?: null, + "version" : manifest.getVersion() ?: null, + "license" : manifest.getLicense() ?: null, + "url" : manifest.getHomePage() ?: null, + "encodingFormat" : "application/nextflow", + "runtimePlatform" : manifest.getNextflowVersion() ? "Nextflow " + manifest.getNextflowVersion() : null, "hasPart" : wfSofwareApplications.collect(sa -> ["@id": sa["@id"]] ), @@ -526,7 +533,7 @@ class WrrocRenderer implements Renderer { "step" : howToSteps.collect(step -> ["@id": step["@id"]] ), - ], + ].findAll { it.value != null }, [ "@id" : "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", "@type" : "ComputerLanguage", From 7f0264fe61b0f3b5b099814f3a87c4d0527a6ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20B=C3=A4uerle?= <45968370+famosab@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:57:45 +0100 Subject: [PATCH 11/54] start with metaYaml imports (#12) * start with metaYaml imports * merge dev-wrroc into metaYaml (#23) * add encodingFormat for nextflow.config * add encodingFormat for main.nf * feat: add wrroc to valid formats * fix: make getIntermediateOutputFiles work again (#18) * fx: make getIntermediateOutputFiles work again * Fix bugs fixes #16 fixes #17 --------- Co-authored-by: fbartusch * feat: add README to crate (#14) * feat: add README to create * feat: ignore vscode * fix: make getIntermediateOutputFiles work again (#18) (#19) * fx: make getIntermediateOutputFiles work again * Fix bugs fixes #16 fixes #17 --------- Co-authored-by: fbartusch * feat: add README to json * feat: check first if readme exists * Add readme to hasPart Signed-off-by: fbartusch --------- Signed-off-by: fbartusch Co-authored-by: fbartusch --------- Signed-off-by: fbartusch Co-authored-by: fbartusch * WIP * only add from meta if meta exists * remove usage from ext args * add module name to id --------- Signed-off-by: fbartusch Co-authored-by: fbartusch --- .../main/nextflow/prov/WrrocRenderer.groovy | 84 +++++++++++++++++-- 1 file changed, 78 
insertions(+), 6 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index b2adc65..91a6b60 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -20,6 +20,7 @@ import nextflow.config.ConfigMap import nextflow.file.FileHolder import nextflow.script.params.FileInParam import nextflow.script.params.FileOutParam +import nextflow.script.ScriptMeta import java.nio.file.* import java.nio.file.attribute.BasicFileAttributes @@ -30,9 +31,10 @@ import groovy.json.JsonOutput import groovy.transform.CompileStatic import nextflow.Session import nextflow.processor.TaskRun -import org.apache.commons.io.FilenameUtils +import nextflow.processor.* +import org.yaml.snakeyaml.Yaml -import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.FilenameUtils /** * Renderer for the Provenance Run RO Crate format. @@ -407,13 +409,65 @@ class WrrocRenderer implements Renderer { final wfSofwareApplications = nextflowProcesses .collect() { process -> + def metaYaml = readMetaYaml(process) + if (metaYaml == null) { + return [ + "@id" : "#" + process.ownerScript.toString(), + "@type" : "SoftwareApplication", + "name" : process.getName(), + ] + } + + def moduleName = metaYaml.get('name') as String + def toolNames = [] + + metaYaml.get('tools')?.each { tool -> + def entry = (tool as Map).entrySet().first() + def toolName = entry.key as String + toolNames << toolName + } + [ - "@id" : "#" + process.ownerScript.toString(), - "@type": "SoftwareApplication", - "name" : process.getName() + "@id" : "#" + process.ownerScript.toString(), + "@type" : "SoftwareApplication", + "name" : process.getName(), + "hasPart": toolNames.isEmpty() ? 
null : toolNames.collect { name -> ["@id": moduleName + '-' + name] } ] } + final perTool = nextflowProcesses + .collect() { process -> + def metaYaml = readMetaYaml(process) + if (metaYaml == null) { + return null + } + + def moduleName = metaYaml.get('name') as String + def listOfToolMaps = [] + metaYaml.get('tools')?.each { tool -> listOfToolMaps.add(tool as Map) } + + def softwareMaps = listOfToolMaps.collect { toolMap -> + def entry = (toolMap as Map).entrySet().first() + def toolName = entry.key as String + def toolDescription = (entry.value as Map)?.get('description') as String + [(toolName): toolDescription] + } + + // Create a list of SoftwareApplication entries + def softwareApplications = softwareMaps.collect { softwareMap -> + def entry = (softwareMap as Map).entrySet().first() + def toolName = entry.key as String + [ + "@id" : moduleName + '-' + toolName, + "@type" : "SoftwareApplication", + "name" : toolName, + "description" : entry.value?.toString() ?: "" + ] + } + + return softwareApplications + }.findAll { it != null }.flatten() + final howToSteps = nextflowProcesses .collect() { process -> [ @@ -544,6 +598,7 @@ class WrrocRenderer implements Renderer { "version" : nextflowVersion ], *wfSofwareApplications, + *perTool, *formalParameters, [ "@id" : "#${softwareApplicationId}", @@ -590,7 +645,7 @@ class WrrocRenderer implements Renderer { readmeFile, *uniqueInputOutputFiles, *propertyValues, - license + license, ].findAll { it != null } ] @@ -836,6 +891,23 @@ class WrrocRenderer implements Renderer { return publisherID } + /** + * Read meta.yaml (nf-core style) file for a given Nextflow process. 
+ * + * @param TaskProcessor processor Nextflow process + * @return Yaml as Map + */ + static Map readMetaYaml(TaskProcessor processor) { + Path metaFile = ScriptMeta.get(processor.getOwnerScript()).getModuleDir().resolve('meta.yml') + + if (Files.exists(metaFile)) { + Yaml yaml = new Yaml() + return yaml.load(metaFile.text) as Map + } + + return null + } + /** * Check if a groovy object contains nested structures, e.g. will not be flattened when serialized as JSON From daf9725c2d8399c6b18ab3405d031a0930e42919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Wed, 18 Dec 2024 15:03:24 +0100 Subject: [PATCH 12/54] add information to README --- README.md | 60 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 0e68a96..a54b181 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Nextflow plugin to render provenance reports for pipeline runs. Now supporting [ The `nf-prov` plugin requires Nextflow version `23.04.0` or later. -*New in version 1.3.0: requires Nextflow 24.10.0 or later.* +_New in version 1.3.0: requires Nextflow 24.10.0 or later._ To enable and configure `nf-prov`, include the following snippet to your Nextflow config and update as needed. @@ -26,11 +26,11 @@ prov { } ``` -Finally, run your Nextflow pipeline. You do not need to modify your pipeline script in order to use the `nf-prov` plugin. The plugin will automatically generate a JSON file with provenance information. +Finally, run your Nextflow pipeline. You do not need to modify your pipeline script in order to use the `nf-prov` plugin. The plugin will automatically produce the specified provenance reports at the end of the workflow run. ## Configuration -*The `file`, `format`, and `overwrite` options have been deprecated since version 1.2.0. Use `formats` instead.* +_The `file`, `format`, and `overwrite` options have been deprecated since version 1.2.0. 
Use `formats` instead._ The following options are available: @@ -40,18 +40,22 @@ Create the provenance report (default: `true` if plugin is loaded). `prov.formats` -*New in version 1.2.0* +_New in version 1.2.0_ Configuration scope for the desired output formats. The following formats are available: - `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. - *New in version 1.3.0*: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. + _New in version 1.3.0_: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. - `dag`: Render the task graph as a Mermaid diagram embedded in an HTML document. Supports the `file` and `overwrite` options. - `legacy`: Render the legacy format originally defined in this plugin (default). Supports the `file` and `overwrite` options. +_New in version 1.4.0_ + +- `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). Includes all three profiles (Process, Workflow, and Provenance). + Any number of formats can be specified, for example: ```groovy @@ -65,6 +69,27 @@ prov { file = 'manifest.json' overwrite = true } + wrroc { + file = 'ro-crate-metadata.json' + overwrite = true + agent { + name = "John Doe" + orcid = "https://orcid.org/0000-0000-0000-0000" + email = "john.doe@example.org" + phone = "(0)89-99998 000" + contactType = "Researcher" + } + organization { + name = "University of XYZ" + ror = "https://ror.org/000000000" + isPublisher = true + } + publisher { + id = "https://ror.org/000000000" + } + license = "https://spdx.org/licenses/Apache-2.0" + profile = "provenance_run_crate" +} } } ``` @@ -107,23 +132,22 @@ Following these step to package, upload and publish the plugin: 1. 
Create a file named `gradle.properties` in the project root containing the following attributes (this file should not be committed in the project repository): - * `github_organization`: the GitHub organisation the plugin project is hosted - * `github_username` The GitHub username granting access to the plugin project. - * `github_access_token`: The GitHub access token required to upload and commit changes in the plugin repository. - * `github_commit_email`: The email address associated with your GitHub account. +- `github_organization`: the GitHub organisation the plugin project is hosted +- `github_username` The GitHub username granting access to the plugin project. +- `github_access_token`: The GitHub access token required to upload and commit changes in the plugin repository. +- `github_commit_email`: The email address associated with your GitHub account. 2. Update the `Plugin-Version` field in the following file with the release version: - ```bash - plugins/nf-prov/src/resources/META-INF/MANIFEST.MF - ``` + ```bash + plugins/nf-prov/src/resources/META-INF/MANIFEST.MF + ``` 3. Run the following command to package and upload the plugin in the GitHub project releases page: - ```bash - ./gradlew :plugins:nf-prov:upload - ``` - -4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) - project to make the plugin public accessible to Nextflow app. + ```bash + ./gradlew :plugins:nf-prov:upload + ``` +4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) + project to make the plugin public accessible to Nextflow app. 
From f271d910bb78b40d48f65599febd8566a8cc59f4 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 10 Jan 2025 16:01:15 -0600 Subject: [PATCH 13/54] cleanup Signed-off-by: Ben Sherman --- .gitignore | 3 - README.md | 42 ++-- nextflow.config | 7 +- .../main/nextflow/prov/WrrocRenderer.groovy | 198 ++++++++---------- 4 files changed, 100 insertions(+), 150 deletions(-) diff --git a/.gitignore b/.gitignore index cdf34dc..26b9135 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,3 @@ gradle.properties build work results - -# Ignore vscode dirs -.vscode \ No newline at end of file diff --git a/README.md b/README.md index a54b181..a00100c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Nextflow plugin to render provenance reports for pipeline runs. Now supporting [ The `nf-prov` plugin requires Nextflow version `23.04.0` or later. -_New in version 1.3.0: requires Nextflow 24.10.0 or later._ +*New in version 1.3.0: requires Nextflow 24.10.0 or later.* To enable and configure `nf-prov`, include the following snippet to your Nextflow config and update as needed. @@ -30,7 +30,7 @@ Finally, run your Nextflow pipeline. You do not need to modify your pipeline scr ## Configuration -_The `file`, `format`, and `overwrite` options have been deprecated since version 1.2.0. Use `formats` instead._ +*The `file`, `format`, and `overwrite` options have been deprecated since version 1.2.0. Use `formats` instead.* The following options are available: @@ -40,19 +40,19 @@ Create the provenance report (default: `true` if plugin is loaded). `prov.formats` -_New in version 1.2.0_ +*New in version 1.2.0* Configuration scope for the desired output formats. The following formats are available: - `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. - _New in version 1.3.0_: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. 
+ *New in version 1.3.0*: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. - `dag`: Render the task graph as a Mermaid diagram embedded in an HTML document. Supports the `file` and `overwrite` options. - `legacy`: Render the legacy format originally defined in this plugin (default). Supports the `file` and `overwrite` options. -_New in version 1.4.0_ +*New in version 1.4.0* - `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). Includes all three profiles (Process, Workflow, and Provenance). @@ -69,31 +69,12 @@ prov { file = 'manifest.json' overwrite = true } - wrroc { - file = 'ro-crate-metadata.json' - overwrite = true - agent { - name = "John Doe" - orcid = "https://orcid.org/0000-0000-0000-0000" - email = "john.doe@example.org" - phone = "(0)89-99998 000" - contactType = "Researcher" - } - organization { - name = "University of XYZ" - ror = "https://ror.org/000000000" - isPublisher = true - } - publisher { - id = "https://ror.org/000000000" - } - license = "https://spdx.org/licenses/Apache-2.0" - profile = "provenance_run_crate" -} } } ``` +See [nextflow.config](./nextflow.config) for a full example of each provenance format. + `prov.patterns` List of file patterns to include in the provenance report, from the set of published files. By default, all published files are included. @@ -132,10 +113,10 @@ Following these step to package, upload and publish the plugin: 1. Create a file named `gradle.properties` in the project root containing the following attributes (this file should not be committed in the project repository): -- `github_organization`: the GitHub organisation the plugin project is hosted -- `github_username` The GitHub username granting access to the plugin project. -- `github_access_token`: The GitHub access token required to upload and commit changes in the plugin repository. 
-- `github_commit_email`: The email address associated with your GitHub account. + * `github_organization`: the GitHub organisation the plugin project is hosted + * `github_username` The GitHub username granting access to the plugin project. + * `github_access_token`: The GitHub access token required to upload and commit changes in the plugin repository. + * `github_commit_email`: The email address associated with your GitHub account. 2. Update the `Plugin-Version` field in the following file with the release version: @@ -151,3 +132,4 @@ Following these step to package, upload and publish the plugin: 4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) project to make the plugin public accessible to Nextflow app. + diff --git a/nextflow.config b/nextflow.config index b401f6b..e3618f4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -38,8 +38,11 @@ prov { publisher { id = "https://ror.org/000000000" } - license = "https://spdx.org/licenses/Apache-2.0" profile = "provenance_run_crate" } } -} \ No newline at end of file +} + +manifest { + license = "https://spdx.org/licenses/Apache-2.0" +} diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 91a6b60..3132b8e 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -16,13 +16,12 @@ package nextflow.prov -import nextflow.config.ConfigMap -import nextflow.file.FileHolder -import nextflow.script.params.FileInParam -import nextflow.script.params.FileOutParam -import nextflow.script.ScriptMeta - -import java.nio.file.* +import java.nio.file.FileVisitResult +import java.nio.file.Files +import java.nio.file.Path +import java.nio.file.Paths +import java.nio.file.SimpleFileVisitor +import java.nio.file.StandardCopyOption import java.nio.file.attribute.BasicFileAttributes import 
java.time.LocalDateTime import java.time.format.DateTimeFormatter @@ -30,40 +29,45 @@ import java.time.format.DateTimeFormatter import groovy.json.JsonOutput import groovy.transform.CompileStatic import nextflow.Session +import nextflow.config.ConfigMap +import nextflow.file.FileHolder +import nextflow.processor.TaskProcessor import nextflow.processor.TaskRun -import nextflow.processor.* -import org.yaml.snakeyaml.Yaml - +import nextflow.script.params.FileInParam +import nextflow.script.params.FileOutParam +import nextflow.script.ScriptMeta import org.apache.commons.io.FilenameUtils +import org.yaml.snakeyaml.Yaml /** * Renderer for the Provenance Run RO Crate format. * * @author Ben Sherman * @author Felix Bartusch + * @author Famke Bäuerle */ @CompileStatic class WrrocRenderer implements Renderer { private Path path + + private boolean overwrite + + @Delegate + private PathNormalizer normalizer + // The final RO-Crate directory private Path crateRootDir // Nextflow work directory private Path workdir // Nextflow pipeline directory (contains main.nf, assets, etc.) 
private Path projectDir - - private LinkedHashMap agent - private LinkedHashMap organization + private Map agent + private Map organization // List of contactPoints (people, organizations) to be added to ro-crate-metadata.json - private List contactPoints = [] + private List contactPoints = [] private String publisherID - private boolean overwrite - - @Delegate - private PathNormalizer normalizer - WrrocRenderer(Map opts) { path = opts.file as Path overwrite = opts.overwrite as Boolean @@ -71,11 +75,12 @@ class WrrocRenderer implements Renderer { ProvHelper.checkFileOverwrite(path, overwrite) } - @Override - void render(Session session, Set tasks, Map workflowOutputs) { + private static final List README_FILENAMES = List.of("README.md", "README.txt", "readme.md", "readme.txt", "Readme.md", "Readme.txt", "README") - final params = session.getBinding().getParams() as Map - final configMap = new ConfigMap(session.getConfig()) + @Override + void render(Session session, Set tasks, Map workflowOutputs) { + final params = session.params + final configMap = session.config // Set RO-Crate Root and workdir this.crateRootDir = Path.of(params['outdir'].toString()).toAbsolutePath() @@ -87,16 +92,15 @@ class WrrocRenderer implements Renderer { final workflowInputs = ProvHelper.getWorkflowInputs(tasks, taskLookup) // Add intermediate input files (produced by workflow tasks and consumed by other tasks) - workflowInputs.addAll(getIntermediateInputFiles(tasks, workflowInputs)); - final Map workflowInputMapping = getWorkflowInputMapping(workflowInputs) + workflowInputs.addAll(getIntermediateInputFiles(tasks, workflowInputs)) + final workflowInputMapping = getWorkflowInputMapping(workflowInputs) // Add intermediate output files (produced by workflow tasks and consumed by other tasks) - workflowOutputs.putAll(getIntermediateOutputFiles(tasks, workflowOutputs)); + workflowOutputs.putAll(getIntermediateOutputFiles(tasks, workflowOutputs)) // Copy workflow input files into RO-Crate - 
workflowInputMapping.each {source, dest -> - - if (Files.isDirectory(source)) { + workflowInputMapping.each { source, dest -> + if( Files.isDirectory(source) ) { // Recursively copy directory and its contents Files.walkFileTree(source, new SimpleFileVisitor() { @Override @@ -117,7 +121,7 @@ class WrrocRenderer implements Renderer { } else { try { Files.createDirectories(dest.getParent()) - if (!Files.exists(dest)) + if( !Files.exists(dest) ) Files.copy(source, dest) } catch (Exception e) { println "workflowInput: Failed to copy $source to $dest: ${e.message}" @@ -126,9 +130,8 @@ class WrrocRenderer implements Renderer { } // Copy workflow output files into RO-Crate - workflowOutputs.each {source, dest -> - - if (Files.isDirectory(source)) { + workflowOutputs.each { source, dest -> + if( Files.isDirectory(source) ) { // Recursively copy directory and its contents Files.walkFileTree(source, new SimpleFileVisitor() { @Override @@ -157,44 +160,30 @@ class WrrocRenderer implements Renderer { } // get workflow config and store it in crate - Path configFilePath = crateRootDir.resolve("nextflow.config") - FileWriter configFileWriter = new FileWriter(configFilePath.toString()) + final configFilePath = crateRootDir.resolve("nextflow.config") + final configFileWriter = new FileWriter(configFilePath.toString()) configMap.toConfigObject().writeTo(configFileWriter) // get workflow README file and store it in crate - boolean readmeExists = false - List readmeFiles = ["README.md", "README.txt", "readme.md", "readme.txt", "Readme.md", "Readme.txt", "README"] - Path readmeFilePath = null - String readmeFileName = null - String readmeFileExtension = null - String readmeFileEncoding = null - - for (String fileName : readmeFiles) { - Path potentialReadmePath = projectDir.resolve(fileName) - if (Files.exists(potentialReadmePath)) { - readmeExists = true - readmeFilePath = potentialReadmePath - readmeFileName = fileName - if (FilenameUtils.getExtension(fileName).equals("md")) - 
readmeFileEncoding = "text/markdown" - else - readmeFileEncoding = "text/plain" - break - } - } - def readmeFile = null - - // Copy the README file into RO-Crate if it exists - if (readmeExists) { - Files.copy(readmeFilePath, crateRootDir.resolve(readmeFileName), StandardCopyOption.REPLACE_EXISTING) - readmeFile = - [ - "@id" : readmeFileName, - "@type" : "File", - "name" : readmeFileName, - "description" : "This is the README file of the workflow.", - "encodingFormat": readmeFileEncoding - ] + Map readmeFile = null + + for( final fileName : README_FILENAMES ) { + final readmeFilePath = projectDir.resolve(fileName) + if( !Files.exists(readmeFilePath) ) + continue + + final encoding = FilenameUtils.getExtension(fileName).equals("md") + ? "text/markdown" + : "text/plain" + readmeFile = [ + "@id" : fileName, + "@type" : "File", + "name" : fileName, + "description" : "This is the README file of the workflow.", + "encodingFormat": encoding + ] + Files.copy(readmeFilePath, crateRootDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) + break } // get workflow metadata @@ -209,7 +198,7 @@ class WrrocRenderer implements Renderer { final dateStarted = formatter.format(metadata.start) final dateCompleted = formatter.format(metadata.complete) final nextflowVersion = nextflowMeta.version.toString() - final wrrocParams = session.config.prov["formats"]["wrroc"] as Map + final wrrocParams = session.config.navigate('prov.formats.wrroc', [:]) as Map // Copy workflow into crate directory Files.copy(scriptFile, crateRootDir.resolve(scriptFile.getFileName()), StandardCopyOption.REPLACE_EXISTING) @@ -217,10 +206,9 @@ class WrrocRenderer implements Renderer { // Copy nextflow_schema_json into crate if it exists final schemaFile = scriptFile.getParent().resolve("nextflow_schema.json") // TODO Add to crate metadata - if (Files.exists(schemaFile)) + if( Files.exists(schemaFile) ) Files.copy(schemaFile, crateRootDir.resolve(schemaFile.getFileName()), 
StandardCopyOption.REPLACE_EXISTING) - // create manifest final softwareApplicationId = UUID.randomUUID() final organizeActionId = UUID.randomUUID() @@ -229,31 +217,14 @@ class WrrocRenderer implements Renderer { agent = parseAgentInfo(wrrocParams) organization = parseOrganizationInfo(wrrocParams) publisherID = getPublisherID(wrrocParams, agent, organization) - if(organization) + if( organization ) agent.put("affiliation", ["@id": organization.get("@id")]) - //license = parseLicenseInfo(wrrocParams) - // license information - boolean licenseURLvalid = false - String licenseString = null; - URI licenseURL = null - Map license = null - if (wrrocParams.containsKey("license")) { - licenseString = wrrocParams.get("license") - try { - licenseURL = new URL(licenseString).toURI(); - licenseURLvalid = true - - // Entity for license URL - license = [ - "@id" : licenseURL.toString(), - "@type": "CreativeWork" - ] - } catch (Exception e) { - licenseURLvalid = false - } - } + final license = [ + "@id" : manifest.license, + "@type": "CreativeWork" + ] final formalParameters = params .collect { name, value -> @@ -302,7 +273,6 @@ class WrrocRenderer implements Renderer { // Combine both, inputFiles and outputFiles into one list. Remove duplicates that occur when an intermediate // file is output of a task and input of another task. - //Map> combinedInputOutputMap = [:] Map combinedInputOutputMap = [:] inputFiles.each { entry -> @@ -328,7 +298,7 @@ class WrrocRenderer implements Renderer { // Maps used for finding tasks/CreateActions corresponding to a Nextflow process Map processToTasks = [:].withDefault { [] } - def createActions = tasks + final createActions = tasks .collect { task -> List resultFileIDs = [] @@ -529,7 +499,7 @@ class WrrocRenderer implements Renderer { "hasPart" : [ ["@id": metadata.projectName], ["@id": "nextflow.config"], - readmeExists ? ["@id": readmeFile.get("@id")] : null, + readmeFile ? 
["@id": readmeFile["@id"]] : null, *uniqueInputOutputFiles.collect(file -> ["@id": file["@id"]]) ].findAll { it != null }, "mainEntity" : ["@id": metadata.projectName], @@ -537,7 +507,7 @@ class WrrocRenderer implements Renderer { ["@id": "#${session.uniqueId}"], *createActions.collect(createAction -> ["@id": createAction["@id"]]) ], - "license" : licenseURLvalid ? ["@id": licenseURL.toString()] : licenseString + "license" : license ].findAll { it.value != null }, [ "@id" : "https://w3id.org/ro/wfrun/process/0.1", @@ -672,7 +642,7 @@ class WrrocRenderer implements Renderer { return intermediateInputFiles } - def Map getIntermediateOutputFiles(Set tasks, Map workflowOutputs) { + Map getIntermediateOutputFiles(Set tasks, Map workflowOutputs) { List intermediateOutputFilesList = [] Map intermediateOutputFilesMap = [:] @@ -716,7 +686,7 @@ class WrrocRenderer implements Renderer { * @param paths Input file paths on the file system * @return Map of input file paths into the RO-Crate */ - def Map getWorkflowInputMapping(Set paths) { + Map getWorkflowInputMapping(Set paths) { // The resulting mapping Map workflowInputMapping = [:] @@ -766,8 +736,8 @@ class WrrocRenderer implements Renderer { * @param params Nextflow parameters * @return Map describing agent via '@id'. 'orcid' and 'name' */ - def LinkedHashMap parseAgentInfo(Map params) { - final LinkedHashMap agent = new LinkedHashMap() + Map parseAgentInfo(Map params) { + final agent = [:] if (! params.containsKey("agent")) return null @@ -785,7 +755,6 @@ class WrrocRenderer implements Renderer { String contactPointID = parseContactPointInfo(agentMap) if(contactPointID) agent.put("contactPoint", ["@id": contactPointID ]) - } return agent @@ -798,8 +767,8 @@ class WrrocRenderer implements Renderer { * @param params Nextflow parameters * @return Map describing organization via '@id'. 
'orcid' and 'name' */ - def LinkedHashMap parseOrganizationInfo(Map params) { - final LinkedHashMap org = new LinkedHashMap() + Map parseOrganizationInfo(Map params) { + final org = [:] if (! params.containsKey("organization")) return null @@ -828,10 +797,10 @@ class WrrocRenderer implements Renderer { * @param params Map describing an agent or organization * @return ID of the contactPoint */ - def String parseContactPointInfo(Map map) { + String parseContactPointInfo(Map map) { String contactPointID = "" - final LinkedHashMap contactPoint = new LinkedHashMap() + final contactPoint = [:] // Prefer email for the contact point ID if(map.containsKey("email")) @@ -867,7 +836,7 @@ class WrrocRenderer implements Renderer { * @param params Nextflow parameters * @return Publisher ID */ - static def String getPublisherID(Map params, Map agent, Map organization) { + static String getPublisherID(Map params, Map agent, Map organization) { if (! params.containsKey("publisher")) return null @@ -906,8 +875,7 @@ class WrrocRenderer implements Renderer { } return null - } - + } /** * Check if a groovy object contains nested structures, e.g. will not be flattened when serialized as JSON @@ -915,7 +883,7 @@ class WrrocRenderer implements Renderer { * @param obj The object to be checked * @return true if the object contains nested structures */ - static def boolean isNested(Object obj) { + static boolean isNested(Object obj) { return (obj instanceof Map || obj instanceof List) } @@ -925,7 +893,7 @@ class WrrocRenderer implements Renderer { * @param path The path to be checked * @return type Either "File" or "Directory" */ - static def String getType(Path path) { + static String getType(Path path) { String type = "File" if(path.isDirectory()) @@ -940,18 +908,17 @@ class WrrocRenderer implements Renderer { * @param object An object that may be a file * @return the MIME type of the object or null, if it's not a file. 
*/ - static def String getEncodingFormat(Object object) { + static String getEncodingFormat(Object object) { // Check if the object is a string and convert it to a Path if (object instanceof String) { - Path path = Paths.get((String) object); + Path path = Paths.get((String) object) return getEncodingFormat(path, null) } else { return null } } - /** * Get the encodingFormat of a file as MIME Type. * A file can exist at two places. At the source where Nextflow or the user stored the file, @@ -962,7 +929,7 @@ class WrrocRenderer implements Renderer { * @param target Path to file * @return the MIME type of the file or null, if it's not a file. */ - static def String getEncodingFormat(Path source, Path target) { + static String getEncodingFormat(Path source, Path target) { String mime = null if(source && source.exists() && source.isFile()) @@ -989,4 +956,5 @@ class WrrocRenderer implements Renderer { return mime } + } From 70a5f51c46fe46ad6572541a7197765548064d26 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Mon, 13 Jan 2025 18:19:26 -0600 Subject: [PATCH 14/54] Set root crate dir to parent of wrroc path Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 3132b8e..3b057c4 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -83,7 +83,7 @@ class WrrocRenderer implements Renderer { final configMap = session.config // Set RO-Crate Root and workdir - this.crateRootDir = Path.of(params['outdir'].toString()).toAbsolutePath() + this.crateRootDir = path.getParent() this.workdir = session.getWorkDir() this.projectDir = session.getWorkflowMetadata().getProjectDir() From 8f060ca5e396c17b6a5b0cde33d19be094fef218 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: 
Mon, 13 Jan 2025 18:24:28 -0600 Subject: [PATCH 15/54] cleanup Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 73 ++++++++----------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 3b057c4..c62ca26 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -19,7 +19,6 @@ package nextflow.prov import java.nio.file.FileVisitResult import java.nio.file.Files import java.nio.file.Path -import java.nio.file.Paths import java.nio.file.SimpleFileVisitor import java.nio.file.StandardCopyOption import java.nio.file.attribute.BasicFileAttributes @@ -62,14 +61,11 @@ class WrrocRenderer implements Renderer { private Path workdir // Nextflow pipeline directory (contains main.nf, assets, etc.) private Path projectDir - private Map agent - private Map organization // List of contactPoints (people, organizations) to be added to ro-crate-metadata.json private List contactPoints = [] - private String publisherID WrrocRenderer(Map opts) { - path = opts.file as Path + path = (opts.file as Path).complete() overwrite = opts.overwrite as Boolean ProvHelper.checkFileOverwrite(path, overwrite) @@ -214,9 +210,9 @@ class WrrocRenderer implements Renderer { final organizeActionId = UUID.randomUUID() // Process wrroc configuration options - agent = parseAgentInfo(wrrocParams) - organization = parseOrganizationInfo(wrrocParams) - publisherID = getPublisherID(wrrocParams, agent, organization) + final agent = parseAgentInfo(wrrocParams) + final organization = parseOrganizationInfo(wrrocParams) + final publisherID = getPublisherID(wrrocParams, agent, organization) if( organization ) agent.put("affiliation", ["@id": organization.get("@id")]) @@ -282,7 +278,7 @@ class WrrocRenderer implements Renderer { outputFiles.each { entry -> 
combinedInputOutputMap[entry['@id']] = entry } - List uniqueInputOutputFiles = combinedInputOutputMap.values().toList() + final uniqueInputOutputFiles = combinedInputOutputMap.values().toList() final propertyValues = params .collect { name, value -> @@ -375,7 +371,8 @@ class WrrocRenderer implements Renderer { .collect { task -> processToTasks[task.getProcessor().getId().toString()].add("#${task.getHash().toString()}") return task.getProcessor() - }.unique() + } + .unique() final wfSofwareApplications = nextflowProcesses .collect() { process -> @@ -436,7 +433,9 @@ class WrrocRenderer implements Renderer { } return softwareApplications - }.findAll { it != null }.flatten() + } + .findAll { it != null } + .flatten() final howToSteps = nextflowProcesses .collect() { process -> @@ -461,14 +460,13 @@ class WrrocRenderer implements Renderer { ] } - final configFile = - [ - "@id" : "nextflow.config", - "@type" : "File", - "name" : "Effective Nextflow configuration", - "description" : "This is the effective configuration during runtime compiled from all configuration sources.", - "encodingFormat": "text/plain" - ] + final configFile = [ + "@id" : "nextflow.config", + "@type" : "File", + "name" : "Effective Nextflow configuration", + "description" : "This is the effective configuration during runtime compiled from all configuration sources.", + "encodingFormat": "text/plain" + ] final wrroc = [ "@context": "https://w3id.org/ro/crate/1.1/context", @@ -575,7 +573,6 @@ class WrrocRenderer implements Renderer { "@type": "SoftwareApplication", "name" : "Nextflow ${nextflowVersion}" ], - *howToSteps, [ "@id" : "#${organizeActionId}", @@ -606,8 +603,8 @@ class WrrocRenderer implements Renderer { ["@id": file["@id"]] ) ], - *[agent], - *[organization], + agent, + organization, *contactPoints, *controlActions, *createActions, @@ -820,8 +817,6 @@ class WrrocRenderer implements Renderer { contactPoint.put("phone", map.get("phone")) if(map.containsKey("orcid")) contactPoint.put("url", 
map.get("orcid")) - if(map.containsKey("orcid")) - contactPoint.put("url", map.get("orcid")) if(map.containsKey("rar")) contactPoint.put("url", map.get("rar")) @@ -867,14 +862,10 @@ class WrrocRenderer implements Renderer { * @return Yaml as Map */ static Map readMetaYaml(TaskProcessor processor) { - Path metaFile = ScriptMeta.get(processor.getOwnerScript()).getModuleDir().resolve('meta.yml') - - if (Files.exists(metaFile)) { - Yaml yaml = new Yaml() - return yaml.load(metaFile.text) as Map - } - - return null + final metaFile = ScriptMeta.get(processor.getOwnerScript()).getModuleDir().resolve('meta.yml') + return Files.exists(metaFile) + ? new Yaml().load(metaFile.text) as Map + : null } /** @@ -894,12 +885,9 @@ class WrrocRenderer implements Renderer { * @return type Either "File" or "Directory" */ static String getType(Path path) { - String type = "File" - - if(path.isDirectory()) - type = "Directory" - - return type + return path.isDirectory() + ? "Directory" + : "File" } /** @@ -911,12 +899,9 @@ class WrrocRenderer implements Renderer { static String getEncodingFormat(Object object) { // Check if the object is a string and convert it to a Path - if (object instanceof String) { - Path path = Paths.get((String) object) - return getEncodingFormat(path, null) - } else { - return null - } + return object instanceof String + ? 
getEncodingFormat(Path.of(object), null) + : null } /** From 98d2cc42f99d2120aefd41d5e6e58a3b23298eda Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Mon, 13 Jan 2025 18:58:57 -0600 Subject: [PATCH 16/54] Add helper functions Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 175 +++++++++--------- 1 file changed, 83 insertions(+), 92 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index c62ca26..1764a7a 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -224,7 +224,7 @@ class WrrocRenderer implements Renderer { final formalParameters = params .collect { name, value -> - [ + withoutNulls([ "@id" : "#${name}", "@type" : "FormalParameter", // TODO: infer type from value at runtime @@ -237,12 +237,12 @@ class WrrocRenderer implements Renderer { // "workExample": ["@id": outputId], "name" : name, // "valueRequired": "True" - ].findAll { it.value != null } + ]) } final inputFiles = workflowInputMapping .collect { source, target -> - [ + withoutNulls([ "@id" : crateRootDir.relativize(target).toString(), "@type" : getType(source), "name" : target.name, @@ -251,12 +251,12 @@ class WrrocRenderer implements Renderer { //"fileType": "whatever", // TODO: apply if matching param is found // "exampleOfWork": ["@id": paramId] - ].findAll { it.value != null } + ]) } final outputFiles = workflowOutputs .collect { source, target -> - [ + withoutNulls([ "@id" : crateRootDir.relativize(target).toString(), "@type" : getType(source), "name" : target.name, @@ -264,12 +264,12 @@ class WrrocRenderer implements Renderer { "encodingFormat": getEncodingFormat(source, target), // TODO: create FormalParameter for each output file? // "exampleOfWork": {"@id": "#reversed"} - ].findAll { it.value != null } + ]) } // Combine both, inputFiles and outputFiles into one list. 
Remove duplicates that occur when an intermediate // file is output of a task and input of another task. - Map combinedInputOutputMap = [:] + final combinedInputOutputMap = [:] inputFiles.each { entry -> combinedInputOutputMap[entry['@id']] = entry @@ -353,9 +353,9 @@ class WrrocRenderer implements Renderer { // TODO: Same as for startTime //"endTime": "", "instrument" : ["@id": "#" + task.getProcessor().ownerScript.toString()], - "agent" : ["@id": agent.get("@id").toString()], - "object" : objectFileIDs.collect(file -> ["@id": file]), - "result" : resultFileIDs.collect(file -> ["@id": file]), + "agent" : ["@id": agent.get("@id")], + "object" : objectFileIDs.collect(id -> ["@id": id]), + "result" : resultFileIDs.collect(id -> ["@id": id]), "actionStatus": task.getExitStatus() == 0 ? "http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" ] @@ -374,9 +374,9 @@ class WrrocRenderer implements Renderer { } .unique() - final wfSofwareApplications = nextflowProcesses + final workflowSofwareApplications = nextflowProcesses .collect() { process -> - def metaYaml = readMetaYaml(process) + final metaYaml = readMetaYaml(process) if (metaYaml == null) { return [ "@id" : "#" + process.ownerScript.toString(), @@ -384,58 +384,49 @@ class WrrocRenderer implements Renderer { "name" : process.getName(), ] } - - def moduleName = metaYaml.get('name') as String - def toolNames = [] - - metaYaml.get('tools')?.each { tool -> - def entry = (tool as Map).entrySet().first() - def toolName = entry.key as String - toolNames << toolName - } - - [ + + final moduleName = metaYaml.get('name') as String + final toolNames = metaYaml.containsKey('tools') + ? metaYaml.get('tools').collect { tool -> + final entry = (tool as Map).entrySet().first() + entry.key as String + } + : [] + + final parts = !toolNames.isEmpty() + ? 
toolNames.collect { name -> ["@id": moduleName + '-' + name] } + : null + + return [ "@id" : "#" + process.ownerScript.toString(), "@type" : "SoftwareApplication", "name" : process.getName(), - "hasPart": toolNames.isEmpty() ? null : toolNames.collect { name -> ["@id": moduleName + '-' + name] } + "hasPart": parts ] } - final perTool = nextflowProcesses - .collect() { process -> - def metaYaml = readMetaYaml(process) - if (metaYaml == null) { - return null - } - - def moduleName = metaYaml.get('name') as String - def listOfToolMaps = [] - metaYaml.get('tools')?.each { tool -> listOfToolMaps.add(tool as Map) } - - def softwareMaps = listOfToolMaps.collect { toolMap -> - def entry = (toolMap as Map).entrySet().first() - def toolName = entry.key as String - def toolDescription = (entry.value as Map)?.get('description') as String - [(toolName): toolDescription] - } - - // Create a list of SoftwareApplication entries - def softwareApplications = softwareMaps.collect { softwareMap -> - def entry = (softwareMap as Map).entrySet().first() - def toolName = entry.key as String - [ - "@id" : moduleName + '-' + toolName, - "@type" : "SoftwareApplication", - "name" : toolName, - "description" : entry.value?.toString() ?: "" - ] - } - - return softwareApplications + final toolSoftwareApplications = nextflowProcesses + .collect { process -> readMetaYaml(process) } + .findAll { metaYaml -> metaYaml != null } + .collectMany { metaYaml -> + final moduleName = metaYaml.get('name') as String + final toolMaps = metaYaml.containsKey('tools') + ? 
metaYaml.get('tools').collect { tool -> tool as Map } + : [] + + return toolMaps + .collect { toolMap -> + final entry = toolMap.entrySet().first() + final toolName = entry.key as String + final toolDescription = (entry.value as Map)?.get('description') as String + return [ + "@id" : moduleName + '-' + toolName, + "@type" : "SoftwareApplication", + "name" : toolName, + "description" : entry.value?.toString() ?: "" + ] + } } - .findAll { it != null } - .flatten() final howToSteps = nextflowProcesses .collect() { process -> @@ -454,9 +445,7 @@ class WrrocRenderer implements Renderer { "@type" : "ControlAction", "instrument": ["@id": "${metadata.projectName}#main/${process.getName()}"], "name" : "orchestrate " + "${metadata.projectName}#main/${process.getName()}", - "object" : processToTasks[process.getId().toString()].collect({ taskID -> - ["@id": taskID] - }) + "object" : asReferences(processToTasks[process.getId().toString()]) ] } @@ -470,7 +459,7 @@ class WrrocRenderer implements Renderer { final wrroc = [ "@context": "https://w3id.org/ro/crate/1.1/context", - "@graph" : [ + "@graph" : withoutNulls([ [ "@id" : path.name, "@type" : "CreativeWork", @@ -480,10 +469,10 @@ class WrrocRenderer implements Renderer { ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] ] ], - [ + withoutNulls([ "@id" : "./", "@type" : "Dataset", - "author" : ["@id": agent.get("@id").toString()], + "author" : ["@id": agent.get("@id")], "publisher" : publisherID ? ["@id": publisherID] : null, "datePublished": getDatePublished(), "conformsTo" : [ @@ -494,19 +483,19 @@ class WrrocRenderer implements Renderer { ], "name" : "Workflow run of " + manifest.getName() ?: metadata.projectName, "description": manifest.description ?: null, - "hasPart" : [ + "hasPart" : withoutNulls([ ["@id": metadata.projectName], ["@id": "nextflow.config"], readmeFile ? 
["@id": readmeFile["@id"]] : null, - *uniqueInputOutputFiles.collect(file -> ["@id": file["@id"]]) - ].findAll { it != null }, + *asReferences(uniqueInputOutputFiles) + ]), "mainEntity" : ["@id": metadata.projectName], "mentions" : [ ["@id": "#${session.uniqueId}"], - *createActions.collect(createAction -> ["@id": createAction["@id"]]) + *asReferences(createActions) ], "license" : license - ].findAll { it.value != null }, + ]), [ "@id" : "https://w3id.org/ro/wfrun/process/0.1", "@type" : "CreativeWork", @@ -531,7 +520,7 @@ class WrrocRenderer implements Renderer { "name" : "Workflow RO-Crate", "version": "1.0" ], - [ + withoutNulls([ "@id" : metadata.projectName, "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], "conformsTo" : ["@id": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE"], @@ -544,19 +533,13 @@ class WrrocRenderer implements Renderer { "url" : manifest.getHomePage() ?: null, "encodingFormat" : "application/nextflow", "runtimePlatform" : manifest.getNextflowVersion() ? 
"Nextflow " + manifest.getNextflowVersion() : null, - "hasPart" : wfSofwareApplications.collect(sa -> - ["@id": sa["@id"]] - ), - "input" : formalParameters.collect(fp -> - ["@id": fp["@id"]] - ), + "hasPart" : asReferences(workflowSofwareApplications), + "input" : asReferences(formalParameters), "output" : [ // TODO: id of FormalParameter for each output file ], - "step" : howToSteps.collect(step -> - ["@id": step["@id"]] - ), - ].findAll { it.value != null }, + "step" : asReferences(howToSteps), + ]), [ "@id" : "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", "@type" : "ComputerLanguage", @@ -565,8 +548,8 @@ class WrrocRenderer implements Renderer { "url" : "https://www.nextflow.io/", "version" : nextflowVersion ], - *wfSofwareApplications, - *perTool, + *workflowSofwareApplications, + *toolSoftwareApplications, *formalParameters, [ "@id" : "#${softwareApplicationId}", @@ -577,12 +560,10 @@ class WrrocRenderer implements Renderer { [ "@id" : "#${organizeActionId}", "@type" : "OrganizeAction", - "agent" : ["@id": agent.get("@id").toString()], + "agent" : ["@id": agent.get("@id")], "instrument": ["@id": "#${softwareApplicationId}"], "name" : "Run of Nextflow ${nextflowVersion}", - "object" : [ - *controlActions.collect(action -> ["@id": action["@id"]]) - ], + "object" : asReferences(controlActions), "result" : ["@id": "#${session.uniqueId}"], "startTime" : dateStarted, "endTime" : dateCompleted @@ -590,18 +571,16 @@ class WrrocRenderer implements Renderer { [ "@id" : "#${session.uniqueId}", "@type" : "CreateAction", - "agent" : ["@id": agent.get("@id").toString()], + "agent" : ["@id": agent.get("@id")], "name" : "Nextflow workflow run ${session.uniqueId}", "startTime" : dateStarted, "endTime" : dateCompleted, "instrument": ["@id": metadata.projectName], "object" : [ - *inputFiles.collect(file -> ["@id": file["@id"]]), - *propertyValues.collect(pv -> ["@id": pv["@id"]]) + *asReferences(inputFiles), + *asReferences(propertyValues) ], - "result" : 
outputFiles.collect(file -> - ["@id": file["@id"]] - ) + "result" : asReferences(outputFiles) ], agent, organization, @@ -613,7 +592,7 @@ class WrrocRenderer implements Renderer { *uniqueInputOutputFiles, *propertyValues, license, - ].findAll { it != null } + ]) ] // render manifest to JSON file @@ -942,4 +921,16 @@ class WrrocRenderer implements Renderer { return mime } + private static List asReferences(List values) { + return values.collect { value -> ["@id": value["@id"]] } + } + + private static List withoutNulls(List list) { + return list.findAll { v -> v != null } + } + + private static Map withoutNulls(Map map) { + return map.findAll { k, v -> v != null } + } + } From 4410cba00d2d1b184036f825a46a24f55c52c8cf Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 09:19:36 -0600 Subject: [PATCH 17/54] Don't copy intermediate files into crate, normalize inputs against project dir Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 423 ++++-------------- 1 file changed, 92 insertions(+), 331 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 1764a7a..e828fc9 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -16,24 +16,17 @@ package nextflow.prov -import java.nio.file.FileVisitResult import java.nio.file.Files import java.nio.file.Path -import java.nio.file.SimpleFileVisitor import java.nio.file.StandardCopyOption -import java.nio.file.attribute.BasicFileAttributes import java.time.LocalDateTime import java.time.format.DateTimeFormatter import groovy.json.JsonOutput import groovy.transform.CompileStatic import nextflow.Session -import nextflow.config.ConfigMap -import nextflow.file.FileHolder import nextflow.processor.TaskProcessor import nextflow.processor.TaskRun -import nextflow.script.params.FileInParam -import 
nextflow.script.params.FileOutParam import nextflow.script.ScriptMeta import org.apache.commons.io.FilenameUtils import org.yaml.snakeyaml.Yaml @@ -75,92 +68,39 @@ class WrrocRenderer implements Renderer { @Override void render(Session session, Set tasks, Map workflowOutputs) { - final params = session.params - final configMap = session.config - - // Set RO-Crate Root and workdir - this.crateRootDir = path.getParent() - this.workdir = session.getWorkDir() - this.projectDir = session.getWorkflowMetadata().getProjectDir() - // get workflow inputs final taskLookup = ProvHelper.getTaskLookup(tasks) final workflowInputs = ProvHelper.getWorkflowInputs(tasks, taskLookup) - // Add intermediate input files (produced by workflow tasks and consumed by other tasks) - workflowInputs.addAll(getIntermediateInputFiles(tasks, workflowInputs)) - final workflowInputMapping = getWorkflowInputMapping(workflowInputs) - - // Add intermediate output files (produced by workflow tasks and consumed by other tasks) - workflowOutputs.putAll(getIntermediateOutputFiles(tasks, workflowOutputs)) - - // Copy workflow input files into RO-Crate - workflowInputMapping.each { source, dest -> - if( Files.isDirectory(source) ) { - // Recursively copy directory and its contents - Files.walkFileTree(source, new SimpleFileVisitor() { - @Override - FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { - Path targetDir = dest.resolve(source.relativize(dir)) - Files.createDirectories(targetDir) - return FileVisitResult.CONTINUE - } + // get workflow metadata + final metadata = session.workflowMetadata + this.crateRootDir = path.getParent() + this.workdir = session.workDir + this.projectDir = metadata.projectDir + this.normalizer = new PathNormalizer(metadata) - @Override - FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - Path targetFile = dest.resolve(source.relativize(file)) - if (!Files.exists(targetFile)) - Files.copy(file, 
targetFile) - return FileVisitResult.CONTINUE - } - }) - } else { - try { - Files.createDirectories(dest.getParent()) - if( !Files.exists(dest) ) - Files.copy(source, dest) - } catch (Exception e) { - println "workflowInput: Failed to copy $source to $dest: ${e.message}" - } - } - } + final manifest = metadata.manifest + final nextflowMeta = metadata.nextflow + final scriptFile = metadata.getScriptFile() - // Copy workflow output files into RO-Crate - workflowOutputs.each { source, dest -> - if( Files.isDirectory(source) ) { - // Recursively copy directory and its contents - Files.walkFileTree(source, new SimpleFileVisitor() { - @Override - FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { - Path targetDir = dest.resolve(source.relativize(dir)) - Files.createDirectories(targetDir) - return FileVisitResult.CONTINUE - } + final formatter = DateTimeFormatter.ISO_OFFSET_DATE_TIME + final dateStarted = formatter.format(metadata.start) + final dateCompleted = formatter.format(metadata.complete) + final nextflowVersion = nextflowMeta.version.toString() + final params = session.params + final wrrocParams = session.config.navigate('prov.formats.wrroc', [:]) as Map - @Override - FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - Path targetFile = dest.resolve(source.relativize(file)) - if (!Files.exists(targetFile)) - Files.copy(file, targetFile) - return FileVisitResult.CONTINUE - } - }) - } else { - try { - Files.createDirectories(dest.getParent()) - Files.copy(source, dest, StandardCopyOption.REPLACE_EXISTING) - } catch (Exception e) { - println "workflowOutput Failed to copy $source to $dest: ${e.message}" - } - } + // warn about any output files outside of the crate directory + workflowOutputs.each { source, target -> + if( !target.startsWith(crateRootDir) ) + println "Workflow output file $target is outside of the RO-crate directory" } - // get workflow config and store it in crate - final 
configFilePath = crateRootDir.resolve("nextflow.config") - final configFileWriter = new FileWriter(configFilePath.toString()) - configMap.toConfigObject().writeTo(configFileWriter) + // save resolved config + final configPath = crateRootDir.resolve("nextflow.config") + configPath.text = session.config.toConfigObject() - // get workflow README file and store it in crate + // save pipeline README file Map readmeFile = null for( final fileName : README_FILENAMES ) { @@ -182,20 +122,6 @@ class WrrocRenderer implements Renderer { break } - // get workflow metadata - final metadata = session.workflowMetadata - this.normalizer = new PathNormalizer(metadata) - - final manifest = metadata.manifest - final nextflowMeta = metadata.nextflow - final scriptFile = metadata.getScriptFile() - - final formatter = DateTimeFormatter.ISO_OFFSET_DATE_TIME - final dateStarted = formatter.format(metadata.start) - final dateCompleted = formatter.format(metadata.complete) - final nextflowVersion = nextflowMeta.version.toString() - final wrrocParams = session.config.navigate('prov.formats.wrroc', [:]) as Map - // Copy workflow into crate directory Files.copy(scriptFile, crateRootDir.resolve(scriptFile.getFileName()), StandardCopyOption.REPLACE_EXISTING) @@ -240,110 +166,64 @@ class WrrocRenderer implements Renderer { ]) } - final inputFiles = workflowInputMapping - .collect { source, target -> + final inputFiles = workflowInputs + .collect { source -> withoutNulls([ - "@id" : crateRootDir.relativize(target).toString(), + "@id" : normalizePath(source), "@type" : getType(source), - "name" : target.name, + "name" : source.name, "description" : null, - "encodingFormat": getEncodingFormat(source, target), + "encodingFormat": getEncodingFormat(source), //"fileType": "whatever", // TODO: apply if matching param is found // "exampleOfWork": ["@id": paramId] ]) } + final intermediateFiles = tasks.collectMany { task -> + ProvHelper.getTaskOutputs(task).collect { target -> + withoutNulls([ + "@id" : 
normalizePath(target), + "@type" : getType(target), + "name" : target.name, + "encodingFormat": getEncodingFormat(target), + ]) + } + } + final outputFiles = workflowOutputs .collect { source, target -> withoutNulls([ "@id" : crateRootDir.relativize(target).toString(), - "@type" : getType(source), + "@type" : getType(target), "name" : target.name, "description" : null, - "encodingFormat": getEncodingFormat(source, target), + "encodingFormat": getEncodingFormat(target), // TODO: create FormalParameter for each output file? // "exampleOfWork": {"@id": "#reversed"} ]) } - // Combine both, inputFiles and outputFiles into one list. Remove duplicates that occur when an intermediate - // file is output of a task and input of another task. - final combinedInputOutputMap = [:] - - inputFiles.each { entry -> - combinedInputOutputMap[entry['@id']] = entry - } - // Overwriting if 'id' already exists - outputFiles.each { entry -> - combinedInputOutputMap[entry['@id']] = entry - } - final uniqueInputOutputFiles = combinedInputOutputMap.values().toList() - final propertyValues = params .collect { name, value -> - [ + final normalized = + (value instanceof List || value instanceof Map) ? JsonOutput.toJson(value) + : value instanceof CharSequence ? normalizePath(value.toString()) + : value + + return [ "@id" : "#${name}-pv", "@type" : "PropertyValue", "exampleOfWork": ["@id": "#${name}"], "name" : name, - "value" : isNested(value) ? 
JsonOutput.toJson(value) : value + "value" : normalized ] } - // Maps used for finding tasks/CreateActions corresponding to a Nextflow process - Map processToTasks = [:].withDefault { [] } - final createActions = tasks .collect { task -> - List resultFileIDs = [] - - // Collect output files of the path - List outputFileList = [] - for (taskOutputParam in task.getOutputsByType(FileOutParam)) { - - if (taskOutputParam.getValue() instanceof Path) { - outputFileList.add(taskOutputParam.getValue() as Path) - continue - } - - for (taskOutputFile in taskOutputParam.getValue()) { - // Path to file in workdir - outputFileList.add(Path.of(taskOutputFile.toString())) - } - } - - // Check if the output files have a mapping in workflowOutputs - for (outputFile in outputFileList) { - if (workflowOutputs.containsKey(outputFile)) { - resultFileIDs.add(crateRootDir.relativize(workflowOutputs.get(outputFile)).toString()) - } else { - System.out.println("taskOutput not contained in workflowOutputs list: " + outputFile) - } - } - - List objectFileIDs = [] - for (taskInputParam in task.getInputsByType(FileInParam)) { - for (taskInputFileHolder in taskInputParam.getValue()) { - FileHolder holder = (FileHolder) taskInputFileHolder - Path taskInputFilePath = holder.getStorePath() - - if (workflowInputs.contains(taskInputFilePath)) { - // The mapping of input files to their path in the RO-Crate is only available for files we - // expect (e.g. files in workdir and pipeline assets). Have to handle unexpected files ... 
- try { - objectFileIDs.add(crateRootDir.relativize(workflowInputMapping.get(taskInputFilePath)).toString()) - } catch(Exception e) { - System.out.println("Unexpected input file: " + taskInputFilePath.toString()) - } - } else { - System.out.println("taskInput not contained in workflowInputs list: " + taskInputFilePath) - } - } - } - - def createAction = [ - "@id" : "#" + task.getHash().toString(), + final createAction = [ + "@id" : "#" + task.hash.toString(), "@type" : "CreateAction", "name" : task.getName(), // TODO: There is no description for Nextflow processes? @@ -352,10 +232,14 @@ class WrrocRenderer implements Renderer { //"startTime": "". // TODO: Same as for startTime //"endTime": "", - "instrument" : ["@id": "#" + task.getProcessor().ownerScript.toString()], + "instrument" : ["@id": "#" + task.processor.ownerScript.toString()], "agent" : ["@id": agent.get("@id")], - "object" : objectFileIDs.collect(id -> ["@id": id]), - "result" : resultFileIDs.collect(id -> ["@id": id]), + "object" : task.getInputFilesMap().collect { name, source -> + ["@id": normalizePath(source)] + }, + "result" : ProvHelper.getTaskOutputs(task).collect { target -> + ["@id": normalizePath(target)] + }, "actionStatus": task.getExitStatus() == 0 ? 
"http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" ] @@ -367,14 +251,11 @@ class WrrocRenderer implements Renderer { return createAction } - final nextflowProcesses = tasks - .collect { task -> - processToTasks[task.getProcessor().getId().toString()].add("#${task.getHash().toString()}") - return task.getProcessor() - } + final processes = tasks + .collect { task -> task.processor } .unique() - final workflowSofwareApplications = nextflowProcesses + final workflowSofwareApplications = processes .collect() { process -> final metaYaml = readMetaYaml(process) if (metaYaml == null) { @@ -405,7 +286,7 @@ class WrrocRenderer implements Renderer { ] } - final toolSoftwareApplications = nextflowProcesses + final toolSoftwareApplications = processes .collect { process -> readMetaYaml(process) } .findAll { metaYaml -> metaYaml != null } .collectMany { metaYaml -> @@ -428,7 +309,7 @@ class WrrocRenderer implements Renderer { } } - final howToSteps = nextflowProcesses + final howToSteps = processes .collect() { process -> [ "@id" : metadata.projectName + "#main/" + process.getName(), @@ -438,14 +319,18 @@ class WrrocRenderer implements Renderer { ] } - final controlActions = nextflowProcesses + final controlActions = processes .collect() { process -> - [ + final taskIds = tasks + .findAll { task -> task.processor == process } + .collect { task -> ["@id": "#" + task.hash.toString()] } + + return [ "@id" : "#" + UUID.randomUUID(), "@type" : "ControlAction", "instrument": ["@id": "${metadata.projectName}#main/${process.getName()}"], "name" : "orchestrate " + "${metadata.projectName}#main/${process.getName()}", - "object" : asReferences(processToTasks[process.getId().toString()]) + "object" : taskIds ] } @@ -487,7 +372,9 @@ class WrrocRenderer implements Renderer { ["@id": metadata.projectName], ["@id": "nextflow.config"], readmeFile ? 
["@id": readmeFile["@id"]] : null, - *asReferences(uniqueInputOutputFiles) + *asReferences(inputFiles), + *asReferences(intermediateFiles), + *asReferences(outputFiles) ]), "mainEntity" : ["@id": metadata.projectName], "mentions" : [ @@ -589,7 +476,9 @@ class WrrocRenderer implements Renderer { *createActions, configFile, readmeFile, - *uniqueInputOutputFiles, + *inputFiles, + *intermediateFiles, + *outputFiles, *propertyValues, license, ]) @@ -599,109 +488,6 @@ class WrrocRenderer implements Renderer { path.text = JsonOutput.prettyPrint(JsonOutput.toJson(wrroc)) } - static Set getIntermediateInputFiles(Set tasks, Set workflowInputs) { - Set intermediateInputFiles = [] - - tasks.collect { task -> - for (taskInputParam in task.getInputsByType(FileInParam)) { - for (taskInputFileHolder in taskInputParam.getValue()) { - FileHolder holder = (FileHolder) taskInputFileHolder - Path taskInputFilePath = holder.getStorePath() - - if (!workflowInputs.contains(taskInputFilePath)) { - intermediateInputFiles.add(taskInputFilePath) - } - } - } - } - - return intermediateInputFiles - } - - Map getIntermediateOutputFiles(Set tasks, Map workflowOutputs) { - - List intermediateOutputFilesList = [] - Map intermediateOutputFilesMap = [:] - - tasks.each { task -> - for (taskOutputParam in task.getOutputsByType(FileOutParam)) { - - // If the param is a Path, just add it to the intermediate list - if (taskOutputParam.getValue() instanceof Path) { - intermediateOutputFilesList.add(taskOutputParam.getValue() as Path) - continue - } - - for (taskOutputFile in taskOutputParam.getValue()) { - intermediateOutputFilesList.add(taskOutputFile as Path) - } - } - } - - // Iterate over the file list and create the mapping - for (outputFile in intermediateOutputFilesList) { - if (!workflowOutputs.containsKey(outputFile)) { - - // Find the relative path from workdir - Path relativePath = workdir.relativize(outputFile) - - // Build the new path by combining crateRootDir and the relative part - Path 
outputFileInCrate = crateRootDir.resolve(workdir.fileName).resolve(relativePath) - - Files.createDirectories(outputFileInCrate.parent) - intermediateOutputFilesMap.put(outputFile, outputFileInCrate) - } - } - - return intermediateOutputFilesMap - } - - /** - * Map input files from Nextflow workdir into the RO-Crate. - * - * @param paths Input file paths on the file system - * @return Map of input file paths into the RO-Crate - */ - Map getWorkflowInputMapping(Set paths) { - - // The resulting mapping - Map workflowInputMapping = [:] - - // Nextflow asset directory - Path assetDir = projectDir.resolve("assets") - - // pipeline_info directory. Although located in the result directory, it is used as input for MultiQC - Path pipelineInfoDir = crateRootDir.resolve("pipeline_info") - - paths.collect { inputPath -> - - // Depending on where the input file is stored, use different Paths for the parent directory. - // We assume that an input file is either stored in the workdir or in the pipeline's asset directory. - Path parentDir = null - if (inputPath.startsWith(workdir)) - parentDir = workdir - else if (inputPath.startsWith(assetDir)) - parentDir = assetDir - else if (inputPath.startsWith(pipelineInfoDir)) - parentDir = pipelineInfoDir - - - // Ignore file with unkown (e.g. 
null) parentDir - if(parentDir) { - Path relativePath = parentDir.relativize(inputPath) - Path outputFileInCrate = crateRootDir.resolve(parentDir.fileName).resolve(relativePath) - workflowInputMapping.put(inputPath, outputFileInCrate) - } else { - // All other files are simple copied into the crate with their absolute path into the crate root - Path relativePath = Path.of(inputPath.toString().substring(1)) - Path outputFileInCrate = crateRootDir.resolve(relativePath) - workflowInputMapping.put(inputPath, outputFileInCrate) - } - } - - return workflowInputMapping - } - static String getDatePublished() { return LocalDateTime.now().format(DateTimeFormatter.ISO_DATE) } @@ -847,16 +633,6 @@ class WrrocRenderer implements Renderer { : null } - /** - * Check if a groovy object contains nested structures, e.g. will not be flattened when serialized as JSON - * - * @param obj The object to be checked - * @return true if the object contains nested structures - */ - static boolean isNested(Object obj) { - return (obj instanceof Map || obj instanceof List) - } - /** * Check if a Path is a file or a directory and return corresponding "@type" * @@ -872,53 +648,38 @@ class WrrocRenderer implements Renderer { /** * Get the encodingFormat of a file as MIME Type. * - * @param object An object that may be a file - * @return the MIME type of the object or null, if it's not a file. + * @param value A value that may be a file + * @return the MIME type of the value, or null if it's not a file. */ - static String getEncodingFormat(Object object) { + static String getEncodingFormat(Object value) { - // Check if the object is a string and convert it to a Path - return object instanceof String - ? getEncodingFormat(Path.of(object), null) + return value instanceof String + ? getEncodingFormat(Path.of(value)) : null } /** * Get the encodingFormat of a file as MIME Type. - * A file can exist at two places. At the source where Nextflow or the user stored the file, - * or in the RO-Crate (i.e. 
target) location. The method takes both locations as arguments, if one - * of the locations does not exist any more. * * @param source Path to file - * @param target Path to file - * @return the MIME type of the file or null, if it's not a file. + * @return the MIME type of the file, or null if it's not a file. */ - static String getEncodingFormat(Path source, Path target) { - String mime = null - - if(source && source.exists() && source.isFile()) - mime = Files.probeContentType(source) ?: null - else if(target && target.exists() && target.isFile()) - mime = Files.probeContentType(target) ?: null - else { + static String getEncodingFormat(Path source) { + if( !(source && source.exists() && source.isFile()) ) + return null + + String mime = Files.probeContentType(source) + if( mime ) return mime - } // It seems that YAML has a media type only since beginning of 2024 // Set this by hand if this is run on older systems: // https://httptoolkit.com/blog/yaml-media-type-rfc/ - if(!mime) { - String extension = null - if(source) - extension = FilenameUtils.getExtension(source.toString()) - else if(target) - extension = FilenameUtils.getExtension(target.toString()) - - if(["yml", "yaml"].contains(extension)) - mime = "application/yaml" - } - - return mime + final extension = FilenameUtils.getExtension(source.toString()) + if( ["yml", "yaml"].contains(extension) ) + return "application/yaml" + + return null } private static List asReferences(List values) { From 9a9a816c0d7b7f8b266c56246d2762cc870ba18c Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 09:53:13 -0600 Subject: [PATCH 18/54] Fix resolved config Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index e828fc9..7b7dd5b 100644 --- 
a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -28,6 +28,7 @@ import nextflow.Session import nextflow.processor.TaskProcessor import nextflow.processor.TaskRun import nextflow.script.ScriptMeta +import nextflow.util.ConfigHelper import org.apache.commons.io.FilenameUtils import org.yaml.snakeyaml.Yaml @@ -98,7 +99,7 @@ class WrrocRenderer implements Renderer { // save resolved config final configPath = crateRootDir.resolve("nextflow.config") - configPath.text = session.config.toConfigObject() + configPath.text = ConfigHelper.toCanonicalString(session.config, true) // save pipeline README file Map readmeFile = null From 4220097f666c44405314952e85d41c9759e6887f Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 10:20:21 -0600 Subject: [PATCH 19/54] Add CreateAction's for publishing outputs Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 7b7dd5b..6e32e25 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -227,12 +227,8 @@ class WrrocRenderer implements Renderer { "@id" : "#" + task.hash.toString(), "@type" : "CreateAction", "name" : task.getName(), - // TODO: There is no description for Nextflow processes? + // TODO: get description from meta yaml or (future) docstring //"description" : "", - // TODO: task doesn't contain startTime information. TaskHandler does, but is not available to WrrocRenderer - //"startTime": "". 
- // TODO: Same as for startTime - //"endTime": "", "instrument" : ["@id": "#" + task.processor.ownerScript.toString()], "agent" : ["@id": agent.get("@id")], "object" : task.getInputFilesMap().collect { name, source -> @@ -252,6 +248,19 @@ class WrrocRenderer implements Renderer { return createAction } + final publishCreateActions = workflowOutputs + .collect { source, target -> + [ + "@id" : "publish#" + normalizePath(source), + "@type" : "CreateAction", + "name" : "publish", + "instrument" : ["@id": "#${softwareApplicationId}"], + "object" : ["@id": normalizePath(source)], + "result" : ["@id": crateRootDir.relativize(target).toString()], + "actionStatus": "http://schema.org/CompletedActionStatus" + ] + } + final processes = tasks .collect { task -> task.processor } .unique() @@ -380,7 +389,8 @@ class WrrocRenderer implements Renderer { "mainEntity" : ["@id": metadata.projectName], "mentions" : [ ["@id": "#${session.uniqueId}"], - *asReferences(createActions) + *asReferences(taskCreateActions), + *asReferences(publishCreateActions) ], "license" : license ]), @@ -474,7 +484,8 @@ class WrrocRenderer implements Renderer { organization, *contactPoints, *controlActions, - *createActions, + *taskCreateActions, + *publishCreateActions, configFile, readmeFile, *inputFiles, From 3e0b70f3543ba6a0481b9549ffbd10e37600dbc9 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 10:21:03 -0600 Subject: [PATCH 20/54] minor fix Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 6e32e25..3977152 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -221,7 +221,7 @@ class WrrocRenderer implements Renderer { ] } - final createActions = tasks + final 
taskCreateActions = tasks .collect { task -> final createAction = [ "@id" : "#" + task.hash.toString(), From d1ba473a4abf3cca6a335cc34f540ef87a36eba3 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 12:00:31 -0600 Subject: [PATCH 21/54] Improve ids for modules, processes, tools, replace main script with repository URL + commit hash Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 159 +++++++++++------- 1 file changed, 101 insertions(+), 58 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 3977152..addcde2 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -27,6 +27,7 @@ import groovy.transform.CompileStatic import nextflow.Session import nextflow.processor.TaskProcessor import nextflow.processor.TaskRun +import nextflow.script.ProcessDef import nextflow.script.ScriptMeta import nextflow.util.ConfigHelper import org.apache.commons.io.FilenameUtils @@ -123,9 +124,6 @@ class WrrocRenderer implements Renderer { break } - // Copy workflow into crate directory - Files.copy(scriptFile, crateRootDir.resolve(scriptFile.getFileName()), StandardCopyOption.REPLACE_EXISTING) - // Copy nextflow_schema_json into crate if it exists final schemaFile = scriptFile.getParent().resolve("nextflow_schema.json") // TODO Add to crate metadata @@ -133,8 +131,8 @@ class WrrocRenderer implements Renderer { Files.copy(schemaFile, crateRootDir.resolve(schemaFile.getFileName()), StandardCopyOption.REPLACE_EXISTING) // create manifest - final softwareApplicationId = UUID.randomUUID() - final organizeActionId = UUID.randomUUID() + final softwareApplicationId = metadata.projectName + '#sa' + final organizeActionId = metadata.projectName + '#organize' // Process wrroc configuration options final agent = parseAgentInfo(wrrocParams) @@ -229,7 +227,7 @@ class 
WrrocRenderer implements Renderer { "name" : task.getName(), // TODO: get description from meta yaml or (future) docstring //"description" : "", - "instrument" : ["@id": "#" + task.processor.ownerScript.toString()], + "instrument" : ["@id": getModuleId(task.processor)], "agent" : ["@id": agent.get("@id")], "object" : task.getInputFilesMap().collect { name, source -> ["@id": normalizePath(source)] @@ -254,64 +252,65 @@ class WrrocRenderer implements Renderer { "@id" : "publish#" + normalizePath(source), "@type" : "CreateAction", "name" : "publish", - "instrument" : ["@id": "#${softwareApplicationId}"], + "instrument" : ["@id": softwareApplicationId], "object" : ["@id": normalizePath(source)], "result" : ["@id": crateRootDir.relativize(target).toString()], "actionStatus": "http://schema.org/CompletedActionStatus" ] } - final processes = tasks + final taskProcessors = tasks .collect { task -> task.processor } .unique() - final workflowSofwareApplications = processes + final processDefs = taskProcessors + .collect { process -> ScriptMeta.get(process.getOwnerScript()) } + .unique() + .collectMany { meta -> + meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } + } as List + + final moduleSoftwareApplications = processDefs .collect() { process -> final metaYaml = readMetaYaml(process) if (metaYaml == null) { return [ - "@id" : "#" + process.ownerScript.toString(), + "@id" : getModuleId(process), "@type" : "SoftwareApplication", "name" : process.getName(), ] } final moduleName = metaYaml.get('name') as String - final toolNames = metaYaml.containsKey('tools') - ? metaYaml.get('tools').collect { tool -> - final entry = (tool as Map).entrySet().first() - entry.key as String - } - : [] - - final parts = !toolNames.isEmpty() - ? 
toolNames.collect { name -> ["@id": moduleName + '-' + name] } - : null + final tools = metaYaml.getOrDefault('tools', []) as List + final parts = tools.collect { tool -> + final entry = (tool as Map).entrySet().first() + final toolName = entry.key as String + ["@id": getToolId(moduleName, toolName)] + } return [ - "@id" : "#" + process.ownerScript.toString(), + "@id" : getModuleId(process), "@type" : "SoftwareApplication", - "name" : process.getName(), - "hasPart": parts + "name" : process.getBaseName(), + "hasPart": !parts.isEmpty() ? parts : null ] } - final toolSoftwareApplications = processes + final toolSoftwareApplications = processDefs .collect { process -> readMetaYaml(process) } .findAll { metaYaml -> metaYaml != null } .collectMany { metaYaml -> final moduleName = metaYaml.get('name') as String - final toolMaps = metaYaml.containsKey('tools') - ? metaYaml.get('tools').collect { tool -> tool as Map } - : [] + final tools = metaYaml.getOrDefault('tools', []) as List - return toolMaps - .collect { toolMap -> - final entry = toolMap.entrySet().first() + return tools + .collect { tool -> + final entry = (tool as Map).entrySet().first() final toolName = entry.key as String final toolDescription = (entry.value as Map)?.get('description') as String return [ - "@id" : moduleName + '-' + toolName, + "@id" : getToolId(moduleName, toolName), "@type" : "SoftwareApplication", "name" : toolName, "description" : entry.value?.toString() ?: "" @@ -319,27 +318,27 @@ class WrrocRenderer implements Renderer { } } - final howToSteps = processes + final howToSteps = taskProcessors .collect() { process -> [ - "@id" : metadata.projectName + "#main/" + process.getName(), + "@id" : getProcessHowToId(metadata.projectName, process), "@type" : "HowToStep", - "workExample": ["@id": "#" + process.ownerScript.toString()], - "position" : process.getId().toString() + "workExample": ["@id": getModuleId(process)], + "position" : process.getId() ] } - final controlActions = processes + final 
controlActions = taskProcessors .collect() { process -> final taskIds = tasks .findAll { task -> task.processor == process } .collect { task -> ["@id": "#" + task.hash.toString()] } return [ - "@id" : "#" + UUID.randomUUID(), + "@id" : getProcessControlId(metadata.projectName, process), "@type" : "ControlAction", - "instrument": ["@id": "${metadata.projectName}#main/${process.getName()}"], - "name" : "orchestrate " + "${metadata.projectName}#main/${process.getName()}", + "instrument": ["@id": getProcessHowToId(metadata.projectName, process)], + "name" : "Orchestrate process " + process.getName(), "object" : taskIds ] } @@ -376,7 +375,7 @@ class WrrocRenderer implements Renderer { ["@id": "https://w3id.org/ro/wfrun/provenance/0.1"], ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] ], - "name" : "Workflow run of " + manifest.getName() ?: metadata.projectName, + "name" : "Workflow run of " + manifest.name ?: metadata.projectName, "description": manifest.description ?: null, "hasPart" : withoutNulls([ ["@id": metadata.projectName], @@ -422,19 +421,20 @@ class WrrocRenderer implements Renderer { "@id" : metadata.projectName, "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], "conformsTo" : ["@id": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE"], - "name" : manifest.getName() ?: metadata.projectName, - "description" : manifest.getDescription() ?: null, + "name" : manifest.name ?: metadata.projectName, + "description" : manifest.description, "programmingLanguage": ["@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"], - "creator" : manifest.getAuthor() ?: null, - "version" : manifest.getVersion() ?: null, - "license" : manifest.getLicense() ?: null, - "url" : manifest.getHomePage() ?: null, + "creator" : manifest.author, + "codeRepository" : metadata.repository, + "version" : metadata.commitId, + "license" : manifest.license, + "url" : manifest.homePage, "encodingFormat" : "application/nextflow", - 
"runtimePlatform" : manifest.getNextflowVersion() ? "Nextflow " + manifest.getNextflowVersion() : null, - "hasPart" : asReferences(workflowSofwareApplications), + "runtimePlatform" : "Nextflow " + metadata.nextflow.version.toString(), + "hasPart" : asReferences(moduleSoftwareApplications), "input" : asReferences(formalParameters), "output" : [ - // TODO: id of FormalParameter for each output file + // TODO: workflow output targets ], "step" : asReferences(howToSteps), ]), @@ -446,20 +446,20 @@ class WrrocRenderer implements Renderer { "url" : "https://www.nextflow.io/", "version" : nextflowVersion ], - *workflowSofwareApplications, + *moduleSoftwareApplications, *toolSoftwareApplications, *formalParameters, [ - "@id" : "#${softwareApplicationId}", + "@id" : softwareApplicationId, "@type": "SoftwareApplication", "name" : "Nextflow ${nextflowVersion}" ], *howToSteps, [ - "@id" : "#${organizeActionId}", + "@id" : organizeActionId, "@type" : "OrganizeAction", "agent" : ["@id": agent.get("@id")], - "instrument": ["@id": "#${softwareApplicationId}"], + "instrument": ["@id": softwareApplicationId], "name" : "Run of Nextflow ${nextflowVersion}", "object" : asReferences(controlActions), "result" : ["@id": "#${session.uniqueId}"], @@ -633,13 +633,56 @@ class WrrocRenderer implements Renderer { } /** - * Read meta.yaml (nf-core style) file for a given Nextflow process. + * Get the canonical name of a module script. + * + * @param process + */ + String getModuleId(ProcessDef process) { + final scriptPath = ScriptMeta.get(process.getOwner()).getScriptPath().normalize() + return normalizePath(scriptPath) + } + + /** + * Get the canonical name of a module script. + * + * @param process + */ + String getModuleId(TaskProcessor process) { + final scriptPath = ScriptMeta.get(process.getOwnerScript()).getScriptPath().normalize() + return normalizePath(scriptPath) + } + + /** + * Get the canonical name of a tool used by a module. 
+ * + * @param moduleName + * @param toolName + */ + String getToolId(String moduleName, String toolName) { + return "${moduleName}#${toolName}" + } + + /** + * Get the canonical name of a process in the workflow DAG. + * + * @param projectName + * @param process + */ + static String getProcessControlId(String projectName, TaskProcessor process) { + return "${projectName}#control#${process.getName()}" + } + + static String getProcessHowToId(String projectName, TaskProcessor process) { + return "${projectName}#howto#${process.getName()}" + } + + /** + * Get the nf-core meta.yml of a Nextflow module as a map. * - * @param TaskProcessor processor Nextflow process - * @return Yaml as Map + * @param process */ - static Map readMetaYaml(TaskProcessor processor) { - final metaFile = ScriptMeta.get(processor.getOwnerScript()).getModuleDir().resolve('meta.yml') + static Map readMetaYaml(ProcessDef process) { + final metaFile = ScriptMeta.get(process.getOwner()).getModuleDir().resolve('meta.yml') return Files.exists(metaFile) ? 
new Yaml().load(metaFile.text) as Map : null From b24737f1042ad6f1fc43daab84887f6d9d24cee6 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 12:48:19 -0600 Subject: [PATCH 22/54] cleanup Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 239 +++++++++--------- 1 file changed, 118 insertions(+), 121 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index addcde2..d17a660 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -30,7 +30,6 @@ import nextflow.processor.TaskRun import nextflow.script.ProcessDef import nextflow.script.ScriptMeta import nextflow.util.ConfigHelper -import org.apache.commons.io.FilenameUtils import org.yaml.snakeyaml.Yaml /** @@ -43,6 +42,8 @@ import org.yaml.snakeyaml.Yaml @CompileStatic class WrrocRenderer implements Renderer { + private static final List README_FILENAMES = List.of("README.md", "README.txt", "readme.md", "readme.txt", "Readme.md", "Readme.txt", "README") + private Path path private boolean overwrite @@ -51,7 +52,7 @@ class WrrocRenderer implements Renderer { private PathNormalizer normalizer // The final RO-Crate directory - private Path crateRootDir + private Path createDir // Nextflow work directory private Path workdir // Nextflow pipeline directory (contains main.nf, assets, etc.) 
@@ -66,8 +67,6 @@ class WrrocRenderer implements Renderer { ProvHelper.checkFileOverwrite(path, overwrite) } - private static final List README_FILENAMES = List.of("README.md", "README.txt", "readme.md", "readme.txt", "Readme.md", "Readme.txt", "README") - @Override void render(Session session, Set tasks, Map workflowOutputs) { // get workflow inputs @@ -76,63 +75,84 @@ class WrrocRenderer implements Renderer { // get workflow metadata final metadata = session.workflowMetadata - this.crateRootDir = path.getParent() + this.createDir = path.getParent() this.workdir = session.workDir this.projectDir = metadata.projectDir this.normalizer = new PathNormalizer(metadata) final manifest = metadata.manifest - final nextflowMeta = metadata.nextflow final scriptFile = metadata.getScriptFile() final formatter = DateTimeFormatter.ISO_OFFSET_DATE_TIME final dateStarted = formatter.format(metadata.start) final dateCompleted = formatter.format(metadata.complete) - final nextflowVersion = nextflowMeta.version.toString() + final nextflowVersion = metadata.nextflow.version.toString() final params = session.params final wrrocParams = session.config.navigate('prov.formats.wrroc', [:]) as Map // warn about any output files outside of the crate directory workflowOutputs.each { source, target -> - if( !target.startsWith(crateRootDir) ) + if( !target.startsWith(createDir) ) println "Workflow output file $target is outside of the RO-crate directory" } - // save resolved config - final configPath = crateRootDir.resolve("nextflow.config") - configPath.text = ConfigHelper.toCanonicalString(session.config, true) + // create manifest + final softwareApplicationId = metadata.projectName + '#sa' + final organizeActionId = metadata.projectName + '#organize' + final datasetParts = [] - // save pipeline README file - Map readmeFile = null + // -- license + final license = [ + "@id" : manifest.license, + "@type": "CreativeWork" + ] + + datasetParts.add(license) + // -- readme file for( final fileName 
: README_FILENAMES ) { - final readmeFilePath = projectDir.resolve(fileName) - if( !Files.exists(readmeFilePath) ) + final readmePath = projectDir.resolve(fileName) + if( !Files.exists(readmePath) ) continue - final encoding = FilenameUtils.getExtension(fileName).equals("md") - ? "text/markdown" - : "text/plain" - readmeFile = [ + Files.copy(readmePath, createDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) + + datasetParts.add([ "@id" : fileName, "@type" : "File", "name" : fileName, - "description" : "This is the README file of the workflow.", - "encodingFormat": encoding - ] - Files.copy(readmeFilePath, crateRootDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) + "description" : "The README file of the workflow.", + "encodingFormat": getEncodingFormat(readmePath) + ]) break } - // Copy nextflow_schema_json into crate if it exists - final schemaFile = scriptFile.getParent().resolve("nextflow_schema.json") - // TODO Add to crate metadata - if( Files.exists(schemaFile) ) - Files.copy(schemaFile, crateRootDir.resolve(schemaFile.getFileName()), StandardCopyOption.REPLACE_EXISTING) + // -- parameter schema + final schemaPath = scriptFile.getParent().resolve("nextflow_schema.json") + if( Files.exists(schemaPath) ) { + final fileName = schemaPath.name - // create manifest - final softwareApplicationId = metadata.projectName + '#sa' - final organizeActionId = metadata.projectName + '#organize' + Files.copy(schemaPath, createDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) + datasetParts.add([ + "@id" : fileName, + "@type" : "File", + "name" : fileName, + "description" : "The parameter schema of the workflow.", + "encodingFormat": "application/json" + ]) + } + + // -- resolved config + final configPath = createDir.resolve("nextflow.config") + configPath.text = ConfigHelper.toCanonicalString(session.config, true) + + datasetParts.add([ + "@id" : "nextflow.config", + "@type" : "File", + "name" : "Resolved Nextflow configuration", + 
"description" : "The resolved Nextflow configuration for the workflow run.", + "encodingFormat": "text/plain" + ]) // Process wrroc configuration options final agent = parseAgentInfo(wrrocParams) @@ -141,30 +161,37 @@ class WrrocRenderer implements Renderer { if( organization ) agent.put("affiliation", ["@id": organization.get("@id")]) - // license information - final license = [ - "@id" : manifest.license, - "@type": "CreativeWork" - ] - + // -- pipeline parameters + // TODO: use parameter schema to populate additional fields + // TODO: use parameter schema to add file params to crate final formalParameters = params .collect { name, value -> withoutNulls([ "@id" : "#${name}", "@type" : "FormalParameter", - // TODO: infer type from value at runtime - "additionalType": "String", - // "defaultValue": "", "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], - "description" : null, "encodingFormat": getEncodingFormat(value), - // TODO: match to output if type is Path - // "workExample": ["@id": outputId], "name" : name, - // "valueRequired": "True" ]) } + final propertyValues = params + .collect { name, value -> + final normalized = + (value instanceof List || value instanceof Map) ? JsonOutput.toJson(value) + : value instanceof CharSequence ? 
normalizePath(value.toString()) + : value + + return [ + "@id" : "#${name}-pv", + "@type" : "PropertyValue", + "exampleOfWork": ["@id": "#${name}"], + "name" : name, + "value" : normalized + ] + } + + // -- input, output, and intermediate files final inputFiles = workflowInputs .collect { source -> withoutNulls([ @@ -193,7 +220,7 @@ class WrrocRenderer implements Renderer { final outputFiles = workflowOutputs .collect { source, target -> withoutNulls([ - "@id" : crateRootDir.relativize(target).toString(), + "@id" : createDir.relativize(target).toString(), "@type" : getType(target), "name" : target.name, "description" : null, @@ -203,62 +230,7 @@ class WrrocRenderer implements Renderer { ]) } - final propertyValues = params - .collect { name, value -> - final normalized = - (value instanceof List || value instanceof Map) ? JsonOutput.toJson(value) - : value instanceof CharSequence ? normalizePath(value.toString()) - : value - - return [ - "@id" : "#${name}-pv", - "@type" : "PropertyValue", - "exampleOfWork": ["@id": "#${name}"], - "name" : name, - "value" : normalized - ] - } - - final taskCreateActions = tasks - .collect { task -> - final createAction = [ - "@id" : "#" + task.hash.toString(), - "@type" : "CreateAction", - "name" : task.getName(), - // TODO: get description from meta yaml or (future) docstring - //"description" : "", - "instrument" : ["@id": getModuleId(task.processor)], - "agent" : ["@id": agent.get("@id")], - "object" : task.getInputFilesMap().collect { name, source -> - ["@id": normalizePath(source)] - }, - "result" : ProvHelper.getTaskOutputs(task).collect { target -> - ["@id": normalizePath(target)] - }, - "actionStatus": task.getExitStatus() == 0 ? 
"http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" - ] - - // Add error message if there is one - if (task.getExitStatus() != 0) { - createAction.put("error", task.getStderr()) - } - - return createAction - } - - final publishCreateActions = workflowOutputs - .collect { source, target -> - [ - "@id" : "publish#" + normalizePath(source), - "@type" : "CreateAction", - "name" : "publish", - "instrument" : ["@id": softwareApplicationId], - "object" : ["@id": normalizePath(source)], - "result" : ["@id": crateRootDir.relativize(target).toString()], - "actionStatus": "http://schema.org/CompletedActionStatus" - ] - } - + // -- workflow definition final taskProcessors = tasks .collect { task -> task.processor } .unique() @@ -343,13 +315,42 @@ class WrrocRenderer implements Renderer { ] } - final configFile = [ - "@id" : "nextflow.config", - "@type" : "File", - "name" : "Effective Nextflow configuration", - "description" : "This is the effective configuration during runtime compiled from all configuration sources.", - "encodingFormat": "text/plain" - ] + // -- workflow execution + final taskCreateActions = tasks + .collect { task -> + final result = [ + "@id" : "#" + task.hash.toString(), + "@type" : "CreateAction", + "name" : task.getName(), + // TODO: get description from meta yaml + //"description" : "", + "instrument" : ["@id": getModuleId(task.processor)], + "agent" : ["@id": agent.get("@id")], + "object" : task.getInputFilesMap().collect { name, source -> + ["@id": normalizePath(source)] + }, + "result" : ProvHelper.getTaskOutputs(task).collect { target -> + ["@id": normalizePath(target)] + }, + "actionStatus": task.exitStatus == 0 ? 
"http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" + ] + if( task.exitStatus != 0 ) + result["error"] = task.stderr + return result + } + + final publishCreateActions = workflowOutputs + .collect { source, target -> + [ + "@id" : "publish#" + normalizePath(source), + "@type" : "CreateAction", + "name" : "publish", + "instrument" : ["@id": softwareApplicationId], + "object" : ["@id": normalizePath(source)], + "result" : ["@id": createDir.relativize(target).toString()], + "actionStatus": "http://schema.org/CompletedActionStatus" + ] + } final wrroc = [ "@context": "https://w3id.org/ro/crate/1.1/context", @@ -379,8 +380,7 @@ class WrrocRenderer implements Renderer { "description": manifest.description ?: null, "hasPart" : withoutNulls([ ["@id": metadata.projectName], - ["@id": "nextflow.config"], - readmeFile ? ["@id": readmeFile["@id"]] : null, + *asReferences(datasetParts), *asReferences(inputFiles), *asReferences(intermediateFiles), *asReferences(outputFiles) @@ -391,7 +391,7 @@ class WrrocRenderer implements Renderer { *asReferences(taskCreateActions), *asReferences(publishCreateActions) ], - "license" : license + "license" : manifest.license ]), [ "@id" : "https://w3id.org/ro/wfrun/process/0.1", @@ -430,7 +430,7 @@ class WrrocRenderer implements Renderer { "license" : manifest.license, "url" : manifest.homePage, "encodingFormat" : "application/nextflow", - "runtimePlatform" : "Nextflow " + metadata.nextflow.version.toString(), + "runtimePlatform" : "Nextflow " + nextflowVersion, "hasPart" : asReferences(moduleSoftwareApplications), "input" : asReferences(formalParameters), "output" : [ @@ -486,13 +486,11 @@ class WrrocRenderer implements Renderer { *controlActions, *taskCreateActions, *publishCreateActions, - configFile, - readmeFile, + *datasetParts, *inputFiles, *intermediateFiles, *outputFiles, *propertyValues, - license, ]) ] @@ -716,22 +714,21 @@ class WrrocRenderer implements Renderer { /** * Get the encodingFormat of a file 
as MIME Type. * - * @param source Path to file + * @param path Path to file * @return the MIME type of the file, or null if it's not a file. */ - static String getEncodingFormat(Path source) { - if( !(source && source.exists() && source.isFile()) ) + static String getEncodingFormat(Path path) { + if( !(path && path.exists() && path.isFile()) ) return null - String mime = Files.probeContentType(source) + String mime = Files.probeContentType(path) if( mime ) return mime // It seems that YAML has a media type only since beginning of 2024 // Set this by hand if this is run on older systems: // https://httptoolkit.com/blog/yaml-media-type-rfc/ - final extension = FilenameUtils.getExtension(source.toString()) - if( ["yml", "yaml"].contains(extension) ) + if( ["yml", "yaml"].contains(path.getExtension()) ) return "application/yaml" return null From ce8235e5d048887c79b092cb4ed41a744f7b67d6 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 13:13:51 -0600 Subject: [PATCH 23/54] cleanup config parsing Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 186 ++++++++---------- 1 file changed, 86 insertions(+), 100 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index d17a660..d7d2f0a 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -88,7 +88,14 @@ class WrrocRenderer implements Renderer { final dateCompleted = formatter.format(metadata.complete) final nextflowVersion = metadata.nextflow.version.toString() final params = session.params - final wrrocParams = session.config.navigate('prov.formats.wrroc', [:]) as Map + + // parse wrroc configuration + final wrrocOpts = session.config.navigate('prov.formats.wrroc', [:]) as Map + final agent = parseAgentInfo(wrrocOpts) + final organization = parseOrganizationInfo(wrrocOpts) + final publisherId = 
getPublisherId(wrrocOpts, agent, organization) + if( organization ) + agent["affiliation"] = ["@id": organization.get("@id")] // warn about any output files outside of the crate directory workflowOutputs.each { source, target -> @@ -154,13 +161,6 @@ class WrrocRenderer implements Renderer { "encodingFormat": "text/plain" ]) - // Process wrroc configuration options - final agent = parseAgentInfo(wrrocParams) - final organization = parseOrganizationInfo(wrrocParams) - final publisherID = getPublisherID(wrrocParams, agent, organization) - if( organization ) - agent.put("affiliation", ["@id": organization.get("@id")]) - // -- pipeline parameters // TODO: use parameter schema to populate additional fields // TODO: use parameter schema to add file params to crate @@ -368,7 +368,7 @@ class WrrocRenderer implements Renderer { "@id" : "./", "@type" : "Dataset", "author" : ["@id": agent.get("@id")], - "publisher" : publisherID ? ["@id": publisherID] : null, + "publisher" : publisherId ? ["@id": publisherId] : null, "datePublished": getDatePublished(), "conformsTo" : [ ["@id": "https://w3id.org/ro/wfrun/process/0.1"], @@ -498,136 +498,122 @@ class WrrocRenderer implements Renderer { path.text = JsonOutput.prettyPrint(JsonOutput.toJson(wrroc)) } - static String getDatePublished() { + private static String getDatePublished() { return LocalDateTime.now().format(DateTimeFormatter.ISO_DATE) } /** - * Parse information about agent running the workflow from parameters + * Parse information about the agent running the workflow. * - * @param params Nextflow parameters - * @return Map describing agent via '@id'. 'orcid' and 'name' + * @param opts */ - Map parseAgentInfo(Map params) { - final agent = [:] + private Map parseAgentInfo(Map opts) { + final result = [:] - if (! params.containsKey("agent")) + if( !opts.agent ) return null - Map agentMap = params["agent"] as Map - - agent.put("@id", agentMap.containsKey("orcid") ? 
agentMap.get("orcid") : "agent-1") - agent.put("@type", "Person") - if(agentMap.containsKey("name")) - agent.put("name", agentMap.get("name")) + final agentOpts = opts.agent as Map + result["@id"] = agentOpts.getOrDefault("orcid", "agent-1") + result["@type"] = "Person" + if( agentOpts.name ) + result.name = agentOpts.name // Check for contact information - if(agentMap.containsKey("email") || agentMap.containsKey("phone")) { - // Add contact point to ro-crate-metadata.json - String contactPointID = parseContactPointInfo(agentMap) - if(contactPointID) - agent.put("contactPoint", ["@id": contactPointID ]) + if( agentOpts.email || agentOpts.phone ) { + final contactPointId = parseContactPointInfo(agentOpts) + if( contactPointId ) + result.contactPoint = ["@id": contactPointId] } - return agent + return result } - /** - * Parse information about organization agent running the workflow belongs to. + * Parse information about the organization of the agent running the workflow. * - * @param params Nextflow parameters - * @return Map describing organization via '@id'. 'orcid' and 'name' + * @param opts */ - Map parseOrganizationInfo(Map params) { - final org = [:] + private Map parseOrganizationInfo(Map opts) { + final result = [:] - if (! params.containsKey("organization")) + if( !opts.organization ) return null - Map orgMap = params["organization"] as Map - org.put("@id", orgMap.containsKey("ror") ? 
orgMap.get("ror") : "organization-1") - org.put("@type", "Organization") - if(orgMap.containsKey("name")) - org.put("name", orgMap.get("name")) + final orgOpts = opts.organization as Map + result["@id"] = orgOpts.getOrDefault("ror", "organization-1") + result["@type"] = "Organization" + if( orgOpts.name ) + result.name = orgOpts.name // Check for contact information - if(orgMap.containsKey("email") || orgMap.containsKey("phone")) { - // Add contact point to ro-crate-metadata.json - String contactPointID = parseContactPointInfo(orgMap) - if(contactPointID) - org.put("contactPoint", ["@id": contactPointID ]) + if( orgOpts.email || orgOpts.phone ) { + final contactPointId = parseContactPointInfo(orgOpts) + if( contactPointId ) + result.contactPoint = ["@id": contactPointId] } - return org + return result } - /** - * Parse information about contact point and add to contactPoints list. + * Parse a contact point and add it to the list of contact points. * - * @param params Map describing an agent or organization - * @return ID of the contactPoint + * @param opts */ - String parseContactPointInfo(Map map) { - - String contactPointID = "" - final contactPoint = [:] - + private String parseContactPointInfo(Map opts) { // Prefer email for the contact point ID - if(map.containsKey("email")) - contactPointID = "mailto:" + map.get("email") - else if(map.containsKey("phone")) - contactPointID = map.get("phone") - else + String contactPointId = null + if( opts.email ) + contactPointId = "mailto:" + opts.email + else if( opts.phone ) + contactPointId = opts.phone + + if( !contactPointId ) return null - contactPoint.put("@id", contactPointID) - contactPoint.put("@type", "ContactPoint") - if(map.containsKey("contactType")) - contactPoint.put("contactType", map.get("contactType")) - if(map.containsKey("email")) - contactPoint.put("email", map.get("email")) - if(map.containsKey("phone")) - contactPoint.put("phone", map.get("phone")) - if(map.containsKey("orcid")) - 
contactPoint.put("url", map.get("orcid")) - if(map.containsKey("rar")) - contactPoint.put("url", map.get("rar")) + final contactPoint = [:] + contactPoint["@id"] = contactPointId + contactPoint["@type"] = "ContactPoint" + if( opts.contactType ) + contactPoint.contactType = opts.contactType + if( opts.email ) + contactPoint.email = opts.email + if( opts.phone ) + contactPoint.phone = opts.phone + if( opts.orcid ) + contactPoint.url = opts.orcid + if( opts.rar ) + contactPoint.url = opts.rar contactPoints.add(contactPoint) - return contactPointID + return contactPointId } - /** * Parse information about the RO-Crate publisher. * - * @param params Nextflow parameters - * @return Publisher ID + * @param opts + * @param agent + * @param organization */ - static String getPublisherID(Map params, Map agent, Map organization) { - - if (! params.containsKey("publisher")) + private static String getPublisherId(Map opts, Map agent, Map organization) { + if( !opts.publisher ) return null - Map publisherMap = params["publisher"] as Map - if (! 
publisherMap.containsKey("id")) + final publisherOpts = opts.publisher as Map + if( !publisherOpts.containsKey("id") ) return null - String publisherID = publisherMap.get("id") - String agentID = "" - String organizationID = "" - if (agent) - agentID = agent.get("@id") - if (organization) - organizationID = organization.get("@id") + final publisherId = publisherOpts.id - // Check if the publisher ID references either the organization or the agent - if (publisherID != agentID && publisherID != organizationID) + // Check if the publisher id references either the agent or the organization + final agentId = agent?["@id"] + final organizationId = organization?["@id"] + if( publisherId != agentId && publisherId != organizationId ) return null - return publisherID + return publisherId } /** @@ -635,7 +621,7 @@ class WrrocRenderer implements Renderer { * * @param process */ - String getModuleId(ProcessDef process) { + private String getModuleId(ProcessDef process) { final scriptPath = ScriptMeta.get(process.getOwner()).getScriptPath().normalize() return normalizePath(scriptPath) } @@ -645,7 +631,7 @@ class WrrocRenderer implements Renderer { * * @param process */ - String getModuleId(TaskProcessor process) { + private String getModuleId(TaskProcessor process) { final scriptPath = ScriptMeta.get(process.getOwnerScript()).getScriptPath().normalize() return normalizePath(scriptPath) } @@ -656,7 +642,7 @@ class WrrocRenderer implements Renderer { * @param moduleName * @param toolName */ - String getToolId(String moduleName, String toolName) { + private static String getToolId(String moduleName, String toolName) { return "${moduleName}#${toolName}" } @@ -666,11 +652,11 @@ class WrrocRenderer implements Renderer { * @param projectName * @param process */ - static String getProcessControlId(String projectName, TaskProcessor process) { + private static String getProcessControlId(String projectName, TaskProcessor process) { return "${projectName}#control#${process.getName()}" } - 
static String getProcessHowToId(String projectName, TaskProcessor process) { + private static String getProcessHowToId(String projectName, TaskProcessor process) { return "${projectName}#howto#${process.getName()}" } @@ -679,7 +665,7 @@ class WrrocRenderer implements Renderer { * * @param process */ - static Map readMetaYaml(ProcessDef process) { + private static Map readMetaYaml(ProcessDef process) { final metaFile = ScriptMeta.get(process.getOwner()).getModuleDir().resolve('meta.yml') return Files.exists(metaFile) ? new Yaml().load(metaFile.text) as Map @@ -692,7 +678,7 @@ class WrrocRenderer implements Renderer { * @param path The path to be checked * @return type Either "File" or "Directory" */ - static String getType(Path path) { + private static String getType(Path path) { return path.isDirectory() ? "Directory" : "File" @@ -704,7 +690,7 @@ class WrrocRenderer implements Renderer { * @param value A value that may be a file * @return the MIME type of the value, or null if it's not a file. */ - static String getEncodingFormat(Object value) { + private static String getEncodingFormat(Object value) { return value instanceof String ? getEncodingFormat(Path.of(value)) @@ -717,7 +703,7 @@ class WrrocRenderer implements Renderer { * @param path Path to file * @return the MIME type of the file, or null if it's not a file. 
*/ - static String getEncodingFormat(Path path) { + private static String getEncodingFormat(Path path) { if( !(path && path.exists() && path.isFile()) ) return null From 42da1dda687ce95617303662773e55ba16dd6339 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 15:16:23 -0600 Subject: [PATCH 24/54] cleanup Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index d7d2f0a..a048427 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -51,13 +51,7 @@ class WrrocRenderer implements Renderer { @Delegate private PathNormalizer normalizer - // The final RO-Crate directory - private Path createDir - // Nextflow work directory - private Path workdir - // Nextflow pipeline directory (contains main.nf, assets, etc.) 
- private Path projectDir - // List of contactPoints (people, organizations) to be added to ro-crate-metadata.json + // List of contact points (people, organizations) to be added private List contactPoints = [] WrrocRenderer(Map opts) { @@ -75,9 +69,8 @@ class WrrocRenderer implements Renderer { // get workflow metadata final metadata = session.workflowMetadata - this.createDir = path.getParent() - this.workdir = session.workDir - this.projectDir = metadata.projectDir + final crateDir = path.getParent() + final projectDir = metadata.projectDir this.normalizer = new PathNormalizer(metadata) final manifest = metadata.manifest @@ -99,7 +92,7 @@ class WrrocRenderer implements Renderer { // warn about any output files outside of the crate directory workflowOutputs.each { source, target -> - if( !target.startsWith(createDir) ) + if( !target.startsWith(crateDir) ) println "Workflow output file $target is outside of the RO-crate directory" } @@ -122,7 +115,7 @@ class WrrocRenderer implements Renderer { if( !Files.exists(readmePath) ) continue - Files.copy(readmePath, createDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) + Files.copy(readmePath, crateDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) datasetParts.add([ "@id" : fileName, @@ -139,7 +132,7 @@ class WrrocRenderer implements Renderer { if( Files.exists(schemaPath) ) { final fileName = schemaPath.name - Files.copy(schemaPath, createDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) + Files.copy(schemaPath, crateDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) datasetParts.add([ "@id" : fileName, "@type" : "File", @@ -150,7 +143,7 @@ class WrrocRenderer implements Renderer { } // -- resolved config - final configPath = createDir.resolve("nextflow.config") + final configPath = crateDir.resolve("nextflow.config") configPath.text = ConfigHelper.toCanonicalString(session.config, true) datasetParts.add([ @@ -220,7 +213,7 @@ class WrrocRenderer implements Renderer { final 
outputFiles = workflowOutputs .collect { source, target -> withoutNulls([ - "@id" : createDir.relativize(target).toString(), + "@id" : crateDir.relativize(target).toString(), "@type" : getType(target), "name" : target.name, "description" : null, @@ -347,7 +340,7 @@ class WrrocRenderer implements Renderer { "name" : "publish", "instrument" : ["@id": softwareApplicationId], "object" : ["@id": normalizePath(source)], - "result" : ["@id": createDir.relativize(target).toString()], + "result" : ["@id": crateDir.relativize(target).toString()], "actionStatus": "http://schema.org/CompletedActionStatus" ] } From 0ffea5de3e5a917743cac6ba2361530060778d08 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 15:37:35 -0600 Subject: [PATCH 25/54] Improve canonical ids Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index a048427..845ad60 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -157,10 +157,11 @@ class WrrocRenderer implements Renderer { // -- pipeline parameters // TODO: use parameter schema to populate additional fields // TODO: use parameter schema to add file params to crate + // TODO: formal parameters for workflow output targets final formalParameters = params .collect { name, value -> withoutNulls([ - "@id" : "#${name}", + "@id" : getFormalParameterId(metadata.projectName, name), "@type" : "FormalParameter", "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], "encodingFormat": getEncodingFormat(value), @@ -176,9 +177,9 @@ class WrrocRenderer implements Renderer { : value return [ - "@id" : "#${name}-pv", + "@id" : "#${name}", "@type" : "PropertyValue", - "exampleOfWork": ["@id": "#${name}"], + 
"exampleOfWork": ["@id": getFormalParameterId(metadata.projectName, name)], "name" : name, "value" : normalized ] @@ -286,7 +287,7 @@ class WrrocRenderer implements Renderer { final howToSteps = taskProcessors .collect() { process -> [ - "@id" : getProcessHowToId(metadata.projectName, process), + "@id" : getProcessStepId(metadata.projectName, process), "@type" : "HowToStep", "workExample": ["@id": getModuleId(process)], "position" : process.getId() @@ -302,7 +303,7 @@ class WrrocRenderer implements Renderer { return [ "@id" : getProcessControlId(metadata.projectName, process), "@type" : "ControlAction", - "instrument": ["@id": getProcessHowToId(metadata.projectName, process)], + "instrument": ["@id": getProcessStepId(metadata.projectName, process)], "name" : "Orchestrate process " + process.getName(), "object" : taskIds ] @@ -468,8 +469,8 @@ class WrrocRenderer implements Renderer { "endTime" : dateCompleted, "instrument": ["@id": metadata.projectName], "object" : [ + *asReferences(propertyValues), *asReferences(inputFiles), - *asReferences(propertyValues) ], "result" : asReferences(outputFiles) ], @@ -480,10 +481,10 @@ class WrrocRenderer implements Renderer { *taskCreateActions, *publishCreateActions, *datasetParts, + *propertyValues, *inputFiles, *intermediateFiles, *outputFiles, - *propertyValues, ]) ] @@ -609,6 +610,16 @@ class WrrocRenderer implements Renderer { return publisherId } + /** + * Get the canonical name of a module script. + * + * @param projectName + * @param name + */ + private String getFormalParameterId(String projectName, String name) { + return "${projectName}#param#${name}" + } + /** * Get the canonical name of a module script. 
* @@ -649,8 +660,8 @@ class WrrocRenderer implements Renderer { return "${projectName}#control#${process.getName()}" } - private static String getProcessHowToId(String projectName, TaskProcessor process) { - return "${projectName}#howto#${process.getName()}" + private static String getProcessStepId(String projectName, TaskProcessor process) { + return "${projectName}#step#${process.getName()}" } /** From fe5d4c0a26380a3b619b0c6dea9a02d228989e00 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 14 Jan 2025 16:45:34 -0600 Subject: [PATCH 26/54] Use parameter schema to populate formal parameters, copy input files Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 147 ++++++++++++++---- 1 file changed, 114 insertions(+), 33 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 845ad60..e0f4b03 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -18,12 +18,13 @@ package nextflow.prov import java.nio.file.Files import java.nio.file.Path -import java.nio.file.StandardCopyOption import java.time.LocalDateTime import java.time.format.DateTimeFormatter import groovy.json.JsonOutput +import groovy.json.JsonSlurper import groovy.transform.CompileStatic +import groovy.util.logging.Slf4j import nextflow.Session import nextflow.processor.TaskProcessor import nextflow.processor.TaskRun @@ -39,6 +40,7 @@ import org.yaml.snakeyaml.Yaml * @author Felix Bartusch * @author Famke Bäuerle */ +@Slf4j @CompileStatic class WrrocRenderer implements Renderer { @@ -84,8 +86,8 @@ class WrrocRenderer implements Renderer { // parse wrroc configuration final wrrocOpts = session.config.navigate('prov.formats.wrroc', [:]) as Map - final agent = parseAgentInfo(wrrocOpts) - final organization = parseOrganizationInfo(wrrocOpts) + final agent = getAgentInfo(wrrocOpts) + final 
organization = getOrganizationInfo(wrrocOpts) final publisherId = getPublisherId(wrrocOpts, agent, organization) if( organization ) agent["affiliation"] = ["@id": organization.get("@id")] @@ -93,7 +95,7 @@ class WrrocRenderer implements Renderer { // warn about any output files outside of the crate directory workflowOutputs.each { source, target -> if( !target.startsWith(crateDir) ) - println "Workflow output file $target is outside of the RO-crate directory" + log.warn "Workflow output file $target is outside of the RO-crate directory" } // create manifest @@ -115,8 +117,7 @@ class WrrocRenderer implements Renderer { if( !Files.exists(readmePath) ) continue - Files.copy(readmePath, crateDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) - + readmePath.copyTo(crateDir.resolve(fileName)) datasetParts.add([ "@id" : fileName, "@type" : "File", @@ -129,10 +130,11 @@ class WrrocRenderer implements Renderer { // -- parameter schema final schemaPath = scriptFile.getParent().resolve("nextflow_schema.json") + Map paramSchema = [:] if( Files.exists(schemaPath) ) { final fileName = schemaPath.name - Files.copy(schemaPath, crateDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING) + schemaPath.copyTo(crateDir.resolve(fileName)) datasetParts.add([ "@id" : fileName, "@type" : "File", @@ -140,6 +142,7 @@ class WrrocRenderer implements Renderer { "description" : "The parameter schema of the workflow.", "encodingFormat": "application/json" ]) + paramSchema = getParameterSchema(schemaPath) } // -- resolved config @@ -155,17 +158,24 @@ class WrrocRenderer implements Renderer { ]) // -- pipeline parameters - // TODO: use parameter schema to populate additional fields - // TODO: use parameter schema to add file params to crate // TODO: formal parameters for workflow output targets final formalParameters = params .collect { name, value -> - withoutNulls([ + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + final encoding = type 
== "File" + ? getEncodingFormat(Path.of(value.toString())) + : null + + return withoutNulls([ "@id" : getFormalParameterId(metadata.projectName, name), "@type" : "FormalParameter", + "additionalType": type, "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], - "encodingFormat": getEncodingFormat(value), + "encodingFormat": encoding, "name" : name, + "defaultValue" : schema.default, + "description" : schema.description, ]) } @@ -185,6 +195,19 @@ class WrrocRenderer implements Renderer { ] } + // -- copy input files from params to crate + params.each { name, value -> + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + if( type == "File" || type == "Directory" ) { + final source = Path.of(value.toString()).toAbsolutePath() + // don't copy params.outdir into itself... + if( source == crateDir ) + return + source.copyTo(crateDir) + } + } + // -- input, output, and intermediate files final inputFiles = workflowInputs .collect { source -> @@ -194,7 +217,6 @@ class WrrocRenderer implements Renderer { "name" : source.name, "description" : null, "encodingFormat": getEncodingFormat(source), - //"fileType": "whatever", // TODO: apply if matching param is found // "exampleOfWork": ["@id": paramId] ]) @@ -238,7 +260,7 @@ class WrrocRenderer implements Renderer { final moduleSoftwareApplications = processDefs .collect() { process -> - final metaYaml = readMetaYaml(process) + final metaYaml = getModuleSchema(process) if (metaYaml == null) { return [ "@id" : getModuleId(process), @@ -264,7 +286,7 @@ class WrrocRenderer implements Renderer { } final toolSoftwareApplications = processDefs - .collect { process -> readMetaYaml(process) } + .collect { process -> getModuleSchema(process) } .findAll { metaYaml -> metaYaml != null } .collectMany { metaYaml -> final moduleName = metaYaml.get('name') as String @@ -501,7 +523,7 @@ class WrrocRenderer implements Renderer { * * @param opts */ - private Map 
parseAgentInfo(Map opts) { + private Map getAgentInfo(Map opts) { final result = [:] if( !opts.agent ) @@ -515,7 +537,7 @@ class WrrocRenderer implements Renderer { // Check for contact information if( agentOpts.email || agentOpts.phone ) { - final contactPointId = parseContactPointInfo(agentOpts) + final contactPointId = getContactPointInfo(agentOpts) if( contactPointId ) result.contactPoint = ["@id": contactPointId] } @@ -528,7 +550,7 @@ class WrrocRenderer implements Renderer { * * @param opts */ - private Map parseOrganizationInfo(Map opts) { + private Map getOrganizationInfo(Map opts) { final result = [:] if( !opts.organization ) @@ -542,7 +564,7 @@ class WrrocRenderer implements Renderer { // Check for contact information if( orgOpts.email || orgOpts.phone ) { - final contactPointId = parseContactPointInfo(orgOpts) + final contactPointId = getContactPointInfo(orgOpts) if( contactPointId ) result.contactPoint = ["@id": contactPointId] } @@ -555,7 +577,7 @@ class WrrocRenderer implements Renderer { * * @param opts */ - private String parseContactPointInfo(Map opts) { + private String getContactPointInfo(Map opts) { // Prefer email for the contact point ID String contactPointId = null if( opts.email ) @@ -610,6 +632,78 @@ class WrrocRenderer implements Renderer { return publisherId } + /** + * Get the parameter schema of a pipeline as a map. 
+ * + * @param path + */ + private static Map getParameterSchema(Path path) { + final schema = new JsonSlurper().parseText(path.text) as Map + + Map defs = null + if( schema['$defs'] ) + defs = schema['$defs'] as Map + else if( schema['defs'] ) + defs = schema['defs'] as Map + else if( schema['definitions'] ) + defs = schema['definitions'] as Map + + if( !defs ) + return [:] + + final schemaProps = schema.properties as Map ?: [:] + final defsProps = defs.values().collect { defn -> + (defn as Map).properties ?: [:] + } as List + final allProps = [schemaProps] + defsProps + final entries = allProps.collectMany { props -> + (props as Map).entrySet() + } as Map.Entry[] + + return Map.ofEntries(entries) + } + + /** + * Determine the type of a parameter based on its + * schema and/or runtime value. + * + * @param name + * @param value + * @param schema + */ + private static String getParameterType(String name, Object value, Map schema) { + // infer from schema + if( schema ) { + final type = schema.type + final format = schema.format + + switch( type ) { + case "boolean": + return "Boolean" + case "integer": + case "number": + return "Number" + case "string": + return \ + format == "file-path" ? "File" : + format == "directory-path" ? "Directory" : + "Text" + } + } + + // infer from runtime value + switch( value ) { + case Boolean: + return "Boolean" + case Number: + return "Number" + case CharSequence: + return "Text" + default: + return null + } + } + /** * Get the canonical name of a module script. * @@ -669,7 +763,7 @@ class WrrocRenderer implements Renderer { * * @param process */ - private static Map readMetaYaml(ProcessDef process) { + private static Map getModuleSchema(ProcessDef process) { final metaFile = ScriptMeta.get(process.getOwner()).getModuleDir().resolve('meta.yml') return Files.exists(metaFile) ? 
new Yaml().load(metaFile.text) as Map @@ -688,19 +782,6 @@ class WrrocRenderer implements Renderer { : "File" } - /** - * Get the encodingFormat of a file as MIME Type. - * - * @param value A value that may be a file - * @return the MIME type of the value, or null if it's not a file. - */ - private static String getEncodingFormat(Object value) { - - return value instanceof String - ? getEncodingFormat(Path.of(value)) - : null - } - /** * Get the encodingFormat of a file as MIME Type. * From 90f9cbfcc94da9646c1d248e61fe3fbfad8b7ef4 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 16:22:55 -0600 Subject: [PATCH 27/54] Exclude null property values Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index e0f4b03..1600b6f 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -180,6 +180,7 @@ class WrrocRenderer implements Renderer { } final propertyValues = params + .findAll { name, value -> value != null } .collect { name, value -> final normalized = (value instanceof List || value instanceof Map) ? 
JsonOutput.toJson(value) From 34b4d3af0b73007d3eb565eb8385545b04489726 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 16:23:41 -0600 Subject: [PATCH 28/54] Include permalink to main script Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 1600b6f..cc18b49 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -445,7 +445,7 @@ class WrrocRenderer implements Renderer { "codeRepository" : metadata.repository, "version" : metadata.commitId, "license" : manifest.license, - "url" : manifest.homePage, + "url" : normalizePath(metadata.scriptFile), "encodingFormat" : "application/nextflow", "runtimePlatform" : "Nextflow " + nextflowVersion, "hasPart" : asReferences(moduleSoftwareApplications), From 3ff0d2731334852173604bad358f985502226f05 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 16:26:05 -0600 Subject: [PATCH 29/54] Add cases for lists and maps for formal parameters Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 3 +++ 1 file changed, 3 insertions(+) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index cc18b49..9adaaf3 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -700,6 +700,9 @@ class WrrocRenderer implements Renderer { return "Number" case CharSequence: return "Text" + case List: + case Map: + return "Text" default: return null } From 0f510611b8f3b462ba91e8d6d79bd5b030eca5d0 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 16:29:54 -0600 Subject: [PATCH 30/54] Don't 
download remote input files into crate Signed-off-by: Ben Sherman --- .../nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 9adaaf3..03aab9a 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -16,6 +16,7 @@ package nextflow.prov +import java.nio.file.FileSystems import java.nio.file.Files import java.nio.file.Path import java.time.LocalDateTime @@ -164,7 +165,7 @@ class WrrocRenderer implements Renderer { final schema = paramSchema[name] ?: [:] final type = getParameterType(name, value, schema) final encoding = type == "File" - ? getEncodingFormat(Path.of(value.toString())) + ? getEncodingFormat(value as Path) : null return withoutNulls([ @@ -201,7 +202,10 @@ class WrrocRenderer implements Renderer { final schema = paramSchema[name] ?: [:] final type = getParameterType(name, value, schema) if( type == "File" || type == "Directory" ) { - final source = Path.of(value.toString()).toAbsolutePath() + final source = (value as Path).complete() + // don't try to download remote files... + if( source.fileSystem != FileSystems.default ) + return // don't copy params.outdir into itself... 
if( source == crateDir ) return From 7888bbc3fe0e39cfce152c670846608966956f81 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 16:30:07 -0600 Subject: [PATCH 31/54] Include main script to satisfy WRROC requirement Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/WrrocRenderer.groovy | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 03aab9a..cc74677 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -129,6 +129,17 @@ class WrrocRenderer implements Renderer { break } + // -- main script + metadata.scriptFile.copyTo(crateDir) + + datasetParts.add([ + "@id" : "main.nf", + "@type" : "File", + "name" : "Main script", + "description" : "The main script of the workflow.", + "encodingFormat": "text/plain" + ]) + // -- parameter schema final schemaPath = scriptFile.getParent().resolve("nextflow_schema.json") Map paramSchema = [:] From 0d48ce0354a72d180cadab6931c827af18d31c05 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 16:31:16 -0600 Subject: [PATCH 32/54] Replace "Directory" -> "Dataset" Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/WrrocRenderer.groovy | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index cc74677..cca75f1 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -212,7 +212,7 @@ class WrrocRenderer implements Renderer { params.each { name, value -> final schema = paramSchema[name] ?: [:] final type = getParameterType(name, value, schema) - if( type == "File" || type == "Directory" ) { + if( type == "File" || type == "Dataset" ) { final 
source = (value as Path).complete() // don't try to download remote files... if( source.fileSystem != FileSystems.default ) @@ -702,7 +702,7 @@ class WrrocRenderer implements Renderer { case "string": return \ format == "file-path" ? "File" : - format == "directory-path" ? "Directory" : + format == "directory-path" ? "Dataset" : "Text" } } @@ -790,14 +790,14 @@ class WrrocRenderer implements Renderer { } /** - * Check if a Path is a file or a directory and return corresponding "@type" + * Get the RO-crate "@type" of a path based on whether + * it is a file or directory. * - * @param path The path to be checked - * @return type Either "File" or "Directory" + * @param path */ private static String getType(Path path) { return path.isDirectory() - ? "Directory" + ? "Dataset" : "File" } From 4b5161ec3385c8e51368f2bdcd1f2696831d2557 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 19:04:37 -0600 Subject: [PATCH 33/54] Exclude (and warn about) published files outside of crate directory Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/WrrocRenderer.groovy | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index cca75f1..f50e8b8 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -93,12 +93,6 @@ class WrrocRenderer implements Renderer { if( organization ) agent["affiliation"] = ["@id": organization.get("@id")] - // warn about any output files outside of the crate directory - workflowOutputs.each { source, target -> - if( !target.startsWith(crateDir) ) - log.warn "Workflow output file $target is outside of the RO-crate directory" - } - // create manifest final softwareApplicationId = metadata.projectName + '#sa' final organizeActionId = metadata.projectName + '#organize' @@ -250,6 +244,13 @@ class 
WrrocRenderer implements Renderer { } final outputFiles = workflowOutputs + .findAll { source, target -> + // warn about any output files outside of the crate directory + final result = target.startsWith(crateDir) + if( !result ) + log.warn "Excluding workflow output $target because it is outside of the RO-crate directory" + return result + } .collect { source, target -> withoutNulls([ "@id" : crateDir.relativize(target).toString(), @@ -515,11 +516,11 @@ class WrrocRenderer implements Renderer { agent, organization, *contactPoints, + *datasetParts, + *propertyValues, *controlActions, *taskCreateActions, *publishCreateActions, - *datasetParts, - *propertyValues, *inputFiles, *intermediateFiles, *outputFiles, From 1c9e9f8c85fce83cd1ca29e9549630bc77d44ed3 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 19:08:03 -0600 Subject: [PATCH 34/54] Fix tool description, cleanup Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 70 +++++++++---------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index f50e8b8..b00ef09 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -91,7 +91,7 @@ class WrrocRenderer implements Renderer { final organization = getOrganizationInfo(wrrocOpts) final publisherId = getPublisherId(wrrocOpts, agent, organization) if( organization ) - agent["affiliation"] = ["@id": organization.get("@id")] + agent["affiliation"] = ["@id": organization["@id"]] // create manifest final softwareApplicationId = metadata.projectName + '#sa' @@ -131,7 +131,7 @@ class WrrocRenderer implements Renderer { "@type" : "File", "name" : "Main script", "description" : "The main script of the workflow.", - "encodingFormat": "text/plain" + "encodingFormat": "application/nextflow" ]) // -- parameter schema @@ -227,8 
+227,6 @@ class WrrocRenderer implements Renderer { "name" : source.name, "description" : null, "encodingFormat": getEncodingFormat(source), - // TODO: apply if matching param is found - // "exampleOfWork": ["@id": paramId] ]) } @@ -258,8 +256,6 @@ class WrrocRenderer implements Renderer { "name" : target.name, "description" : null, "encodingFormat": getEncodingFormat(target), - // TODO: create FormalParameter for each output file? - // "exampleOfWork": {"@id": "#reversed"} ]) } @@ -277,36 +273,36 @@ class WrrocRenderer implements Renderer { final moduleSoftwareApplications = processDefs .collect() { process -> + final result = [ + "@id" : getModuleId(process), + "@type" : "SoftwareApplication", + "name" : process.getName(), + ] + final metaYaml = getModuleSchema(process) - if (metaYaml == null) { - return [ - "@id" : getModuleId(process), - "@type" : "SoftwareApplication", - "name" : process.getName(), - ] - } + if( metaYaml ) { + final moduleName = metaYaml.name as String + final tools = metaYaml.getOrDefault('tools', []) as List + final parts = tools.collect { tool -> + final entry = (tool as Map).entrySet().first() + final toolName = entry.key as String + ["@id": getToolId(moduleName, toolName)] + } - final moduleName = metaYaml.get('name') as String - final tools = metaYaml.getOrDefault('tools', []) as List - final parts = tools.collect { tool -> - final entry = (tool as Map).entrySet().first() - final toolName = entry.key as String - ["@id": getToolId(moduleName, toolName)] + if( parts ) + result.hasPart = parts } - return [ - "@id" : getModuleId(process), - "@type" : "SoftwareApplication", - "name" : process.getBaseName(), - "hasPart": !parts.isEmpty() ? 
parts : null - ] + return result } final toolSoftwareApplications = processDefs - .collect { process -> getModuleSchema(process) } - .findAll { metaYaml -> metaYaml != null } - .collectMany { metaYaml -> - final moduleName = metaYaml.get('name') as String + .collectMany { process -> + final metaYaml = getModuleSchema(process) + if( !metaYaml ) + return [] + + final moduleName = metaYaml.name as String final tools = metaYaml.getOrDefault('tools', []) as List return tools @@ -318,7 +314,7 @@ class WrrocRenderer implements Renderer { "@id" : getToolId(moduleName, toolName), "@type" : "SoftwareApplication", "name" : toolName, - "description" : entry.value?.toString() ?: "" + "description" : toolDescription ] } } @@ -355,8 +351,6 @@ class WrrocRenderer implements Renderer { "@id" : "#" + task.hash.toString(), "@type" : "CreateAction", "name" : task.getName(), - // TODO: get description from meta yaml - //"description" : "", "instrument" : ["@id": getModuleId(task.processor)], "agent" : ["@id": agent.get("@id")], "object" : task.getInputFilesMap().collect { name, source -> @@ -400,7 +394,7 @@ class WrrocRenderer implements Renderer { withoutNulls([ "@id" : "./", "@type" : "Dataset", - "author" : ["@id": agent.get("@id")], + "author" : ["@id": agent["@id"]], "publisher" : publisherId ? 
["@id": publisherId] : null, "datePublished": getDatePublished(), "conformsTo" : [ @@ -479,19 +473,19 @@ class WrrocRenderer implements Renderer { "url" : "https://www.nextflow.io/", "version" : nextflowVersion ], - *moduleSoftwareApplications, - *toolSoftwareApplications, - *formalParameters, [ "@id" : softwareApplicationId, "@type": "SoftwareApplication", "name" : "Nextflow ${nextflowVersion}" ], + *moduleSoftwareApplications, + *toolSoftwareApplications, + *formalParameters, *howToSteps, [ "@id" : organizeActionId, "@type" : "OrganizeAction", - "agent" : ["@id": agent.get("@id")], + "agent" : ["@id": agent["@id"]], "instrument": ["@id": softwareApplicationId], "name" : "Run of Nextflow ${nextflowVersion}", "object" : asReferences(controlActions), @@ -502,7 +496,7 @@ class WrrocRenderer implements Renderer { [ "@id" : "#${session.uniqueId}", "@type" : "CreateAction", - "agent" : ["@id": agent.get("@id")], + "agent" : ["@id": agent["@id"]], "name" : "Nextflow workflow run ${session.uniqueId}", "startTime" : dateStarted, "endTime" : dateCompleted, From 310b115cdc7123b467d2095742694bfedb26069f Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 19:09:22 -0600 Subject: [PATCH 35/54] Add param input files to dataset parts Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index b00ef09..ea3702e 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -214,6 +214,12 @@ class WrrocRenderer implements Renderer { // don't copy params.outdir into itself... 
if( source == crateDir ) return + datasetParts.add(withoutNulls([ + "@id" : source.name, + "@type" : type, + "description" : "Input file specified by params.${name}", + "encodingFormat": getEncodingFormat(source) + ])) source.copyTo(crateDir) } } From 9a4ff8a9ce411f00625102bcae9922a29e466af4 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 19:09:51 -0600 Subject: [PATCH 36/54] Improve canonical ids for tasks and task outputs Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 108 ++++++++++++------ 1 file changed, 76 insertions(+), 32 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index ea3702e..7d4377d 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -224,7 +224,7 @@ class WrrocRenderer implements Renderer { } } - // -- input, output, and intermediate files + // -- input and output files final inputFiles = workflowInputs .collect { source -> withoutNulls([ @@ -236,17 +236,6 @@ class WrrocRenderer implements Renderer { ]) } - final intermediateFiles = tasks.collectMany { task -> - ProvHelper.getTaskOutputs(task).collect { target -> - withoutNulls([ - "@id" : normalizePath(target), - "@type" : getType(target), - "name" : target.name, - "encodingFormat": getEncodingFormat(target), - ]) - } - } - final outputFiles = workflowOutputs .findAll { source, target -> // warn about any output files outside of the crate directory @@ -339,7 +328,7 @@ class WrrocRenderer implements Renderer { .collect() { process -> final taskIds = tasks .findAll { task -> task.processor == process } - .collect { task -> ["@id": "#" + task.hash.toString()] } + .collect { task -> ["@id": getTaskId(task)] } return [ "@id" : getProcessControlId(metadata.projectName, process), @@ -353,18 +342,23 @@ class WrrocRenderer implements Renderer { // -- workflow execution 
final taskCreateActions = tasks .collect { task -> + final inputs = task.getInputFilesMap().collect { name, source -> + final id = source in taskLookup + ? getTaskOutputId(taskLookup[source], source) + : normalizePath(source) + ["@id": id] + } + final outputs = ProvHelper.getTaskOutputs(task).collect { target -> + ["@id": getTaskOutputId(task, target)] + } final result = [ - "@id" : "#" + task.hash.toString(), + "@id" : getTaskId(task), "@type" : "CreateAction", "name" : task.getName(), "instrument" : ["@id": getModuleId(task.processor)], - "agent" : ["@id": agent.get("@id")], - "object" : task.getInputFilesMap().collect { name, source -> - ["@id": normalizePath(source)] - }, - "result" : ProvHelper.getTaskOutputs(task).collect { target -> - ["@id": normalizePath(target)] - }, + "agent" : ["@id": agent["@id"]], + "object" : inputs, + "result" : outputs, "actionStatus": task.exitStatus == 0 ? "http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" ] if( task.exitStatus != 0 ) @@ -372,14 +366,30 @@ class WrrocRenderer implements Renderer { return result } + final taskOutputs = tasks.collectMany { task -> + ProvHelper.getTaskOutputs(task).collect { target -> + final name = getTaskOutputName(task, target) + + return withoutNulls([ + "@id" : getTaskOutputId(task, name), + "@type" : getType(target), + "name" : name, + "encodingFormat": getEncodingFormat(target), + ]) + } + } + final publishCreateActions = workflowOutputs .collect { source, target -> - [ - "@id" : "publish#" + normalizePath(source), + final task = taskLookup[source] + final sourceName = getTaskOutputName(task, source) + + return [ + "@id" : "publish#${task.hash}/${sourceName}", "@type" : "CreateAction", "name" : "publish", "instrument" : ["@id": softwareApplicationId], - "object" : ["@id": normalizePath(source)], + "object" : ["@id": getTaskOutputId(task, sourceName)], "result" : ["@id": crateDir.relativize(target).toString()], "actionStatus": 
"http://schema.org/CompletedActionStatus" ] @@ -415,14 +425,14 @@ class WrrocRenderer implements Renderer { ["@id": metadata.projectName], *asReferences(datasetParts), *asReferences(inputFiles), - *asReferences(intermediateFiles), *asReferences(outputFiles) ]), "mainEntity" : ["@id": metadata.projectName], "mentions" : [ ["@id": "#${session.uniqueId}"], *asReferences(taskCreateActions), - *asReferences(publishCreateActions) + *asReferences(taskOutputs), + *asReferences(publishCreateActions), ], "license" : manifest.license ]), @@ -520,9 +530,9 @@ class WrrocRenderer implements Renderer { *propertyValues, *controlActions, *taskCreateActions, + *taskOutputs, *publishCreateActions, *inputFiles, - *intermediateFiles, *outputFiles, ]) ] @@ -725,7 +735,7 @@ class WrrocRenderer implements Renderer { } /** - * Get the canonical name of a module script. + * Get the canonical id of a module script. * * @param projectName * @param name @@ -735,7 +745,7 @@ class WrrocRenderer implements Renderer { } /** - * Get the canonical name of a module script. + * Get the canonical id of a module script. * * @param process */ @@ -745,7 +755,7 @@ class WrrocRenderer implements Renderer { } /** - * Get the canonical name of a module script. + * Get the canonical id of a module script. * * @param process */ @@ -755,7 +765,7 @@ class WrrocRenderer implements Renderer { } /** - * Get the canonical name of a tool used by a module. + * Get the canonical id of a tool used by a module. * * @param moduleName * @param toolName @@ -765,7 +775,7 @@ class WrrocRenderer implements Renderer { } /** - * Get the canonical name of a process in the workflow DAG. + * Get the canonical id of a process in the workflow DAG. * * @param projectName * @param process @@ -778,6 +788,40 @@ class WrrocRenderer implements Renderer { return "${projectName}#step#${process.getName()}" } + /** + * Get the canonical id of a task. 
+ * + * @param task + */ + private static String getTaskId(TaskRun task) { + return 'task#' + task.hash.toString() + } + + /** + * Get the relative name of a task output. + * + * @param task + * @param target + */ + private static String getTaskOutputName(TaskRun task, Path target) { + final workDir = task.workDir.toUriString() + return target.toUriString().replace(workDir + '/', '') + } + + /** + * Get the canonical id of a task output. + * + * @param task + * @param name + */ + private static String getTaskOutputId(TaskRun task, String name) { + return "task#${task.hash}/${name}" + } + + private static String getTaskOutputId(TaskRun task, Path target) { + return "task#${task.hash}/${getTaskOutputName(task, target)}" + } + /** * Get the nf-core meta.yml of a Nextflow module as a map. * From 85c089eb9e948c0f8ee90e4536b8d9c1bfd04228 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 16 Jan 2025 19:52:53 -0600 Subject: [PATCH 37/54] Fix validation issues Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 7d4377d..d05b473 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -124,16 +124,9 @@ class WrrocRenderer implements Renderer { } // -- main script + final mainScriptId = metadata.scriptFile.name metadata.scriptFile.copyTo(crateDir) - datasetParts.add([ - "@id" : "main.nf", - "@type" : "File", - "name" : "Main script", - "description" : "The main script of the workflow.", - "encodingFormat": "application/nextflow" - ]) - // -- parameter schema final schemaPath = scriptFile.getParent().resolve("nextflow_schema.json") Map paramSchema = [:] @@ -173,6 +166,9 @@ class WrrocRenderer implements Renderer { ? 
getEncodingFormat(value as Path) : null + if( !type ) + log.warn "Could not determine type of parameter `${name}` for Workflow Run RO-crate" + return withoutNulls([ "@id" : getFormalParameterId(metadata.projectName, name), "@type" : "FormalParameter", @@ -422,16 +418,16 @@ class WrrocRenderer implements Renderer { "name" : "Workflow run of " + manifest.name ?: metadata.projectName, "description": manifest.description ?: null, "hasPart" : withoutNulls([ - ["@id": metadata.projectName], + ["@id": mainScriptId], *asReferences(datasetParts), *asReferences(inputFiles), + *asReferences(taskOutputs), *asReferences(outputFiles) ]), - "mainEntity" : ["@id": metadata.projectName], + "mainEntity" : ["@id": mainScriptId], "mentions" : [ ["@id": "#${session.uniqueId}"], *asReferences(taskCreateActions), - *asReferences(taskOutputs), *asReferences(publishCreateActions), ], "license" : manifest.license @@ -461,7 +457,7 @@ class WrrocRenderer implements Renderer { "version": "1.0" ], withoutNulls([ - "@id" : metadata.projectName, + "@id" : mainScriptId, "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], "conformsTo" : ["@id": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE"], "name" : manifest.name ?: metadata.projectName, @@ -471,7 +467,7 @@ class WrrocRenderer implements Renderer { "codeRepository" : metadata.repository, "version" : metadata.commitId, "license" : manifest.license, - "url" : normalizePath(metadata.scriptFile), + "url" : metadata.repository ? 
normalizePath(metadata.scriptFile) : null, "encodingFormat" : "application/nextflow", "runtimePlatform" : "Nextflow " + nextflowVersion, "hasPart" : asReferences(moduleSoftwareApplications), @@ -516,7 +512,7 @@ class WrrocRenderer implements Renderer { "name" : "Nextflow workflow run ${session.uniqueId}", "startTime" : dateStarted, "endTime" : dateCompleted, - "instrument": ["@id": metadata.projectName], + "instrument": ["@id": mainScriptId], "object" : [ *asReferences(propertyValues), *asReferences(inputFiles), From 9972771a54828a2373c8ddf95c8d58d761639e2e Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 17 Jan 2025 10:36:08 -0600 Subject: [PATCH 38/54] Separate staged input files from workflow inputs Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/ProvHelper.groovy | 20 ++++++ .../main/nextflow/prov/WrrocRenderer.groovy | 64 +++++++++++++------ 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy index 0a3ceba..cbd4fde 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy @@ -19,6 +19,7 @@ package nextflow.prov import java.nio.file.Path import groovy.transform.CompileStatic +import nextflow.Session import nextflow.exception.AbortOperationException import nextflow.file.FileHelper import nextflow.processor.TaskRun @@ -49,6 +50,25 @@ class ProvHelper { } } + /** + * Get the remote file staging directory for a workflow run. + * + * @param session + */ + static Path getStageDir(Session session) { + return session.workDir.resolve("stage-${session.uniqueId}") + } + + /** + * Determine whether a task input file was staged into the work directory. + * + * @param source + * @param session + */ + static boolean isStagedInput(Path source, Session session) { + return source.startsWith(getStageDir(session)) + } + /** * Get the list of output files for a task. 
* diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index d05b473..b8a88b6 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -198,6 +198,19 @@ class WrrocRenderer implements Renderer { ] } + // -- input files + final inputFiles = workflowInputs + .findAll { source -> !ProvHelper.isStagedInput(source, session) } + .collect { source -> + withoutNulls([ + "@id" : normalizePath(source), + "@type" : getType(source), + "name" : source.name, + "description" : null, + "encodingFormat": getEncodingFormat(source), + ]) + } + // -- copy input files from params to crate params.each { name, value -> final schema = paramSchema[name] ?: [:] @@ -210,7 +223,7 @@ class WrrocRenderer implements Renderer { // don't copy params.outdir into itself... if( source == crateDir ) return - datasetParts.add(withoutNulls([ + inputFiles.add(withoutNulls([ "@id" : source.name, "@type" : type, "description" : "Input file specified by params.${name}", @@ -220,18 +233,7 @@ class WrrocRenderer implements Renderer { } } - // -- input and output files - final inputFiles = workflowInputs - .collect { source -> - withoutNulls([ - "@id" : normalizePath(source), - "@type" : getType(source), - "name" : source.name, - "description" : null, - "encodingFormat": getEncodingFormat(source), - ]) - } - + // -- output files final outputFiles = workflowOutputs .findAll { source, target -> // warn about any output files outside of the crate directory @@ -245,7 +247,6 @@ class WrrocRenderer implements Renderer { "@id" : crateDir.relativize(target).toString(), "@type" : getType(target), "name" : target.name, - "description" : null, "encodingFormat": getEncodingFormat(target), ]) } @@ -336,11 +337,25 @@ class WrrocRenderer implements Renderer { } // -- workflow execution + final stagedInputs = workflowInputs + .findAll { source -> 
ProvHelper.isStagedInput(source, session) } + .collect { source -> + final name = getStagedInputName(source, session) + + withoutNulls([ + "@id" : "stage#${name}", + "@type" : getType(source), + "name" : name, + "encodingFormat": getEncodingFormat(source), + ]) + } + final taskCreateActions = tasks .collect { task -> final inputs = task.getInputFilesMap().collect { name, source -> - final id = source in taskLookup - ? getTaskOutputId(taskLookup[source], source) + final id = + source in taskLookup ? getTaskOutputId(taskLookup[source], source) + : ProvHelper.isStagedInput(source, session) ? "stage#${getStagedInputName(source, session)}" : normalizePath(source) ["@id": id] } @@ -421,6 +436,7 @@ class WrrocRenderer implements Renderer { ["@id": mainScriptId], *asReferences(datasetParts), *asReferences(inputFiles), + *asReferences(stagedInputs), *asReferences(taskOutputs), *asReferences(outputFiles) ]), @@ -526,6 +542,7 @@ class WrrocRenderer implements Renderer { *propertyValues, *controlActions, *taskCreateActions, + *stagedInputs, *taskOutputs, *publishCreateActions, *inputFiles, @@ -784,6 +801,17 @@ class WrrocRenderer implements Renderer { return "${projectName}#step#${process.getName()}" } + /** + * Get the relative name of a staged input. + * + * @param source + * @param session + */ + private static String getStagedInputName(Path source, Session session) { + final stageDir = ProvHelper.getStageDir(session) + return stageDir.relativize(source).toString() + } + /** * Get the canonical id of a task. 
* @@ -869,11 +897,11 @@ class WrrocRenderer implements Renderer { return values.collect { value -> ["@id": value["@id"]] } } - private static List withoutNulls(List list) { + private static List withoutNulls(List list) { return list.findAll { v -> v != null } } - private static Map withoutNulls(Map map) { + private static Map withoutNulls(Map map) { return map.findAll { k, v -> v != null } } From 3c4aa374dff303fcf2e05f9a458d3f5edda51b5d Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 17 Jan 2025 11:02:16 -0600 Subject: [PATCH 39/54] Update docs, fix issues with wrroc config options Signed-off-by: Ben Sherman --- README.md | 6 +-- WRROC.md | 45 +++++++++++++++++++ nextflow.config | 16 ------- .../main/nextflow/prov/WrrocRenderer.groovy | 10 ++--- 4 files changed, 50 insertions(+), 27 deletions(-) create mode 100644 WRROC.md diff --git a/README.md b/README.md index a00100c..970566a 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,7 @@ Create the provenance report (default: `true` if plugin is loaded). Configuration scope for the desired output formats. The following formats are available: -- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. - - *New in version 1.3.0*: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. +- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. See [BCO.md](./BCO.md) for more information about the additional config options for BCO. - `dag`: Render the task graph as a Mermaid diagram embedded in an HTML document. Supports the `file` and `overwrite` options. @@ -54,7 +52,7 @@ Configuration scope for the desired output formats. The following formats are av *New in version 1.4.0* -- `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). 
Includes all three profiles (Process, Workflow, and Provenance). +- `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). Includes all three profiles (Process, Workflow, and Provenance). See [WRROC.md](./WRROC.md) for more information about the additional config options for WRROC. Any number of formats can be specified, for example: diff --git a/WRROC.md b/WRROC.md new file mode 100644 index 0000000..cc38963 --- /dev/null +++ b/WRROC.md @@ -0,0 +1,45 @@ +# Additional WRROC configuration + +*New in version 1.4.0* + +The `wrroc` format supports additional options to configure certain aspects of the Workflow Run RO-Crate. These fields cannot be inferred automatically from the pipeline or the run, and so must be entered through the config. + +The following config options are supported: + +- `prov.formats.wrroc.agent.contactType` +- `prov.formats.wrroc.agent.email` +- `prov.formats.wrroc.agent.name` +- `prov.formats.wrroc.agent.orcid` +- `prov.formats.wrroc.agent.phone` +- `prov.formats.wrroc.agent.ror` +- `prov.formats.wrroc.organization.contactType` +- `prov.formats.wrroc.organization.email` +- `prov.formats.wrroc.organization.name` +- `prov.formats.wrroc.organization.phone` +- `prov.formats.wrroc.organization.ror` +- `prov.formats.wrroc.publisher` + +Refer to the [WRROC User Guide](https://www.researchobject.org/workflow-run-crate/) for more information about the associated RO-Crate entities. 
+ +Here is an example config: + +```groovy +prov { + formats { + wrroc { + agent { + name = "John Doe" + orcid = "https://orcid.org/0000-0000-0000-0000" + email = "john.doe@example.org" + phone = "(0)89-99998 000" + contactType = "Researcher" + } + organization { + name = "University of XYZ" + ror = "https://ror.org/000000000" + } + publisher = "https://ror.org/000000000" + } + } +} +``` diff --git a/nextflow.config b/nextflow.config index e3618f4..82b9e4d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,22 +23,6 @@ prov { wrroc { file = "${params.outdir}/ro-crate-metadata.json" overwrite = true - agent { - name = "John Doe" - orcid = "https://orcid.org/0000-0000-0000-0000" - email = "john.doe@example.org" - phone = "(0)89-99998 000" - contactType = "Researcher" - } - organization { - name = "University of XYZ" - ror = "https://ror.org/000000000" - isPublisher = true - } - publisher { - id = "https://ror.org/000000000" - } - profile = "provenance_run_crate" } } } diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index b8a88b6..c0d0862 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -639,8 +639,8 @@ class WrrocRenderer implements Renderer { contactPoint.phone = opts.phone if( opts.orcid ) contactPoint.url = opts.orcid - if( opts.rar ) - contactPoint.url = opts.rar + if( opts.ror ) + contactPoint.url = opts.ror contactPoints.add(contactPoint) return contactPointId @@ -657,11 +657,7 @@ class WrrocRenderer implements Renderer { if( !opts.publisher ) return null - final publisherOpts = opts.publisher as Map - if( !publisherOpts.containsKey("id") ) - return null - - final publisherId = publisherOpts.id + final publisherId = opts.publisher // Check if the publisher id references either the agent or the organization final agentId = agent?["@id"] From 
5c2b3f1e03c02e44c82391533d1c9245c8112c9e Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 17 Jan 2025 11:19:06 -0600 Subject: [PATCH 40/54] Fix null reference error Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 2 ++ 1 file changed, 2 insertions(+) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index c0d0862..832a4cc 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -213,6 +213,8 @@ class WrrocRenderer implements Renderer { // -- copy input files from params to crate params.each { name, value -> + if( !value ) + return final schema = paramSchema[name] ?: [:] final type = getParameterType(name, value, schema) if( type == "File" || type == "Dataset" ) { From 8bf3c9cc8da4065d1f22b8e6052c28f15a3084ef Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 17 Jan 2025 11:23:21 -0600 Subject: [PATCH 41/54] Don't copy directories specified by params into crate Signed-off-by: Ben Sherman --- .../nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 832a4cc..0ad9452 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -217,13 +217,13 @@ class WrrocRenderer implements Renderer { return final schema = paramSchema[name] ?: [:] final type = getParameterType(name, value, schema) - if( type == "File" || type == "Dataset" ) { + if( type == "File" ) { final source = (value as Path).complete() - // don't try to download remote files... + // don't try to download remote files if( source.fileSystem != FileSystems.default ) return - // don't copy params.outdir into itself... 
- if( source == crateDir ) + // don't try to copy local directories + if( !source.isFile() ) return inputFiles.add(withoutNulls([ "@id" : source.name, @@ -231,6 +231,7 @@ class WrrocRenderer implements Renderer { "description" : "Input file specified by params.${name}", "encodingFormat": getEncodingFormat(source) ])) + log.debug "Copying input file specified by params.${name} into RO-Crate: ${source.toUriString()}" source.copyTo(crateDir) } } From f6831dc835f5620496070ec7533a8a4b9fd7c61e Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 17 Jan 2025 11:23:46 -0600 Subject: [PATCH 42/54] minor edits Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/WrrocRenderer.groovy | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 0ad9452..82f5c77 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -112,13 +112,13 @@ class WrrocRenderer implements Renderer { if( !Files.exists(readmePath) ) continue - readmePath.copyTo(crateDir.resolve(fileName)) + readmePath.copyTo(crateDir) datasetParts.add([ "@id" : fileName, "@type" : "File", "name" : fileName, "description" : "The README file of the workflow.", - "encodingFormat": getEncodingFormat(readmePath) + "encodingFormat": getEncodingFormat(readmePath) ?: "text/plain" ]) break } @@ -133,7 +133,7 @@ class WrrocRenderer implements Renderer { if( Files.exists(schemaPath) ) { final fileName = schemaPath.name - schemaPath.copyTo(crateDir.resolve(fileName)) + schemaPath.copyTo(crateDir) datasetParts.add([ "@id" : fileName, "@type" : "File", @@ -167,7 +167,7 @@ class WrrocRenderer implements Renderer { : null if( !type ) - log.warn "Could not determine type of parameter `${name}` for Workflow Run RO-crate" + log.warn "Could not determine type of parameter `${name}` for Workflow Run 
RO-Crate" return withoutNulls([ "@id" : getFormalParameterId(metadata.projectName, name), @@ -206,7 +206,6 @@ class WrrocRenderer implements Renderer { "@id" : normalizePath(source), "@type" : getType(source), "name" : source.name, - "description" : null, "encodingFormat": getEncodingFormat(source), ]) } @@ -242,7 +241,7 @@ class WrrocRenderer implements Renderer { // warn about any output files outside of the crate directory final result = target.startsWith(crateDir) if( !result ) - log.warn "Excluding workflow output $target because it is outside of the RO-crate directory" + log.warn "Excluding workflow output ${target} because it is outside of the RO-Crate directory -- make sure that the workflow output directory and RO-Crate directory are the same" return result } .collect { source, target -> @@ -858,7 +857,7 @@ class WrrocRenderer implements Renderer { } /** - * Get the RO-crate "@type" of a path based on whether + * Get the RO-Crate "@type" of a path based on whether * it is a file or directory. 
* * @param path From 3c33bc92fcad9acc9bdc166682d11f70e9625cb7 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 17 Jan 2025 12:48:06 -0600 Subject: [PATCH 43/54] Improve entity ids Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 88 ++++++++++--------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 82f5c77..7438dd4 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -94,8 +94,6 @@ class WrrocRenderer implements Renderer { agent["affiliation"] = ["@id": organization["@id"]] // create manifest - final softwareApplicationId = metadata.projectName + '#sa' - final organizeActionId = metadata.projectName + '#organize' final datasetParts = [] // -- license @@ -125,6 +123,8 @@ class WrrocRenderer implements Renderer { // -- main script final mainScriptId = metadata.scriptFile.name + final softwareApplicationId = "${mainScriptId}#software-application" + final organizeActionId = "${mainScriptId}#organize" metadata.scriptFile.copyTo(crateDir) // -- parameter schema @@ -170,7 +170,7 @@ class WrrocRenderer implements Renderer { log.warn "Could not determine type of parameter `${name}` for Workflow Run RO-Crate" return withoutNulls([ - "@id" : getFormalParameterId(metadata.projectName, name), + "@id" : getFormalParameterId(name), "@type" : "FormalParameter", "additionalType": type, "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], @@ -184,15 +184,16 @@ class WrrocRenderer implements Renderer { final propertyValues = params .findAll { name, value -> value != null } .collect { name, value -> + final paramId = getFormalParameterId(name) final normalized = (value instanceof List || value instanceof Map) ? JsonOutput.toJson(value) : value instanceof CharSequence ? 
normalizePath(value.toString()) : value return [ - "@id" : "#${name}", + "@id" : "${paramId}/value", "@type" : "PropertyValue", - "exampleOfWork": ["@id": getFormalParameterId(metadata.projectName, name)], + "exampleOfWork": ["@id": paramId], "name" : name, "value" : normalized ] @@ -262,27 +263,37 @@ class WrrocRenderer implements Renderer { .collect { process -> ScriptMeta.get(process.getOwnerScript()) } .unique() .collectMany { meta -> - meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } - } as List + meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } as List + } + + final processLookup = taskProcessors + .inject([:] as Map) { acc, processor -> + final simpleName = processor.name.split(':').last() + acc[processor] = ScriptMeta.get(processor.getOwnerScript()).getProcess(simpleName) + acc + } final moduleSoftwareApplications = processDefs .collect() { process -> final result = [ "@id" : getModuleId(process), "@type" : "SoftwareApplication", - "name" : process.getName(), + "name" : process.baseName, + "url" : getModuleUrl(process), ] final metaYaml = getModuleSchema(process) if( metaYaml ) { - final moduleName = metaYaml.name as String + final name = metaYaml.name as String final tools = metaYaml.getOrDefault('tools', []) as List final parts = tools.collect { tool -> final entry = (tool as Map).entrySet().first() final toolName = entry.key as String - ["@id": getToolId(moduleName, toolName)] + ["@id": getToolId(process.baseName, toolName)] } + if( name ) + result.name = name if( parts ) result.hasPart = parts } @@ -296,16 +307,14 @@ class WrrocRenderer implements Renderer { if( !metaYaml ) return [] - final moduleName = metaYaml.name as String final tools = metaYaml.getOrDefault('tools', []) as List - return tools .collect { tool -> final entry = (tool as Map).entrySet().first() final toolName = entry.key as String final toolDescription = (entry.value as Map)?.get('description') as String return [ - "@id" : getToolId(moduleName, 
toolName), + "@id" : getToolId(process.baseName, toolName), "@type" : "SoftwareApplication", "name" : toolName, "description" : toolDescription @@ -316,9 +325,9 @@ class WrrocRenderer implements Renderer { final howToSteps = taskProcessors .collect() { process -> [ - "@id" : getProcessStepId(metadata.projectName, process), + "@id" : getProcessStepId(process), "@type" : "HowToStep", - "workExample": ["@id": getModuleId(process)], + "workExample": ["@id": getModuleId(processLookup[process])], "position" : process.getId() ] } @@ -330,10 +339,10 @@ class WrrocRenderer implements Renderer { .collect { task -> ["@id": getTaskId(task)] } return [ - "@id" : getProcessControlId(metadata.projectName, process), + "@id" : getProcessControlId(process), "@type" : "ControlAction", - "instrument": ["@id": getProcessStepId(metadata.projectName, process)], - "name" : "Orchestrate process " + process.getName(), + "instrument": ["@id": getProcessStepId(process)], + "name" : "Orchestrate process ${process.name}", "object" : taskIds ] } @@ -345,7 +354,7 @@ class WrrocRenderer implements Renderer { final name = getStagedInputName(source, session) withoutNulls([ - "@id" : "stage#${name}", + "@id" : "#stage/${name}", "@type" : getType(source), "name" : name, "encodingFormat": getEncodingFormat(source), @@ -357,7 +366,7 @@ class WrrocRenderer implements Renderer { final inputs = task.getInputFilesMap().collect { name, source -> final id = source in taskLookup ? getTaskOutputId(taskLookup[source], source) - : ProvHelper.isStagedInput(source, session) ? "stage#${getStagedInputName(source, session)}" + : ProvHelper.isStagedInput(source, session) ? 
"#stage/${getStagedInputName(source, session)}" : normalizePath(source) ["@id": id] } @@ -367,8 +376,8 @@ class WrrocRenderer implements Renderer { final result = [ "@id" : getTaskId(task), "@type" : "CreateAction", - "name" : task.getName(), - "instrument" : ["@id": getModuleId(task.processor)], + "name" : task.name, + "instrument" : ["@id": getModuleId(processLookup[task.processor])], "agent" : ["@id": agent["@id"]], "object" : inputs, "result" : outputs, @@ -398,7 +407,7 @@ class WrrocRenderer implements Renderer { final sourceName = getTaskOutputName(task, source) return [ - "@id" : "publish#${task.hash}/${sourceName}", + "@id" : "#publish/${task.hash}/${sourceName}", "@type" : "CreateAction", "name" : "publish", "instrument" : ["@id": softwareApplicationId], @@ -432,7 +441,7 @@ class WrrocRenderer implements Renderer { ["@id": "https://w3id.org/ro/wfrun/provenance/0.1"], ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] ], - "name" : "Workflow run of " + manifest.name ?: metadata.projectName, + "name" : "Workflow run of ${manifest.name ?: metadata.projectName}", "description": manifest.description ?: null, "hasPart" : withoutNulls([ ["@id": mainScriptId], @@ -748,11 +757,10 @@ class WrrocRenderer implements Renderer { /** * Get the canonical id of a module script. * - * @param projectName * @param name */ - private String getFormalParameterId(String projectName, String name) { - return "${projectName}#param#${name}" + private String getFormalParameterId(String name) { + return "#param/${name}" } /** @@ -761,17 +769,16 @@ class WrrocRenderer implements Renderer { * @param process */ private String getModuleId(ProcessDef process) { - final scriptPath = ScriptMeta.get(process.getOwner()).getScriptPath().normalize() - return normalizePath(scriptPath) + return "#module/${process.baseName}" } /** - * Get the canonical id of a module script. + * Get the canonical url of a module script. 
* * @param process */ - private String getModuleId(TaskProcessor process) { - final scriptPath = ScriptMeta.get(process.getOwnerScript()).getScriptPath().normalize() + private String getModuleUrl(ProcessDef process) { + final scriptPath = ScriptMeta.get(process.getOwner()).getScriptPath().normalize() return normalizePath(scriptPath) } @@ -782,21 +789,20 @@ class WrrocRenderer implements Renderer { * @param toolName */ private static String getToolId(String moduleName, String toolName) { - return "${moduleName}#${toolName}" + return "#module/${moduleName}/${toolName}" } /** * Get the canonical id of a process in the workflow DAG. * - * @param projectName * @param process */ - private static String getProcessControlId(String projectName, TaskProcessor process) { - return "${projectName}#control#${process.getName()}" + private static String getProcessControlId(TaskProcessor process) { + return "#process-control/${process.name}" } - private static String getProcessStepId(String projectName, TaskProcessor process) { - return "${projectName}#step#${process.getName()}" + private static String getProcessStepId(TaskProcessor process) { + return "#process-step/${process.name}" } /** @@ -816,7 +822,7 @@ class WrrocRenderer implements Renderer { * @param task */ private static String getTaskId(TaskRun task) { - return 'task#' + task.hash.toString() + return "#task/${task.hash}" } /** @@ -837,11 +843,11 @@ class WrrocRenderer implements Renderer { * @param name */ private static String getTaskOutputId(TaskRun task, String name) { - return "task#${task.hash}/${name}" + return "#task/${task.hash}/${name}" } private static String getTaskOutputId(TaskRun task, Path target) { - return "task#${task.hash}/${getTaskOutputName(task, target)}" + return "#task/${task.hash}/${getTaskOutputName(task, target)}" } /** From 6f0ffeda309347fc4f70c5174f3356dee4f7d097 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Fri, 17 Jan 2025 12:48:58 -0600 Subject: [PATCH 44/54] Make intermediate outputs 
into contextual entities (CreativeWork) Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/WrrocRenderer.groovy | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 7438dd4..d818e6e 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -355,7 +355,7 @@ class WrrocRenderer implements Renderer { withoutNulls([ "@id" : "#stage/${name}", - "@type" : getType(source), + "@type" : "CreativeWork", "name" : name, "encodingFormat": getEncodingFormat(source), ]) @@ -394,7 +394,7 @@ class WrrocRenderer implements Renderer { return withoutNulls([ "@id" : getTaskOutputId(task, name), - "@type" : getType(target), + "@type" : "CreativeWork", "name" : name, "encodingFormat": getEncodingFormat(target), ]) @@ -447,14 +447,14 @@ class WrrocRenderer implements Renderer { ["@id": mainScriptId], *asReferences(datasetParts), *asReferences(inputFiles), - *asReferences(stagedInputs), - *asReferences(taskOutputs), *asReferences(outputFiles) ]), "mainEntity" : ["@id": mainScriptId], "mentions" : [ ["@id": "#${session.uniqueId}"], + *asReferences(stagedInputs), *asReferences(taskCreateActions), + *asReferences(taskOutputs), *asReferences(publishCreateActions), ], "license" : manifest.license @@ -552,8 +552,8 @@ class WrrocRenderer implements Renderer { *datasetParts, *propertyValues, *controlActions, - *taskCreateActions, *stagedInputs, + *taskCreateActions, *taskOutputs, *publishCreateActions, *inputFiles, From 8b13a59eba6dedb831c86dc1165cccf3efe14a0b Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Wed, 22 Jan 2025 10:18:50 -0600 Subject: [PATCH 45/54] Exclude license entity if it is not specified Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/WrrocRenderer.groovy | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff 
--git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index d818e6e..164ab00 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -97,12 +97,12 @@ class WrrocRenderer implements Renderer { final datasetParts = [] // -- license - final license = [ - "@id" : manifest.license, - "@type": "CreativeWork" - ] - - datasetParts.add(license) + if( manifest.license ) { + datasetParts.add([ + "@id" : manifest.license, + "@type": "CreativeWork" + ]) + } // -- readme file for( final fileName : README_FILENAMES ) { From 98b3639555228c898b1aeb354ea1f85cbbd3ef3a Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Wed, 22 Jan 2025 10:19:07 -0600 Subject: [PATCH 46/54] Fix null reference error when agent is not specified Signed-off-by: Ben Sherman --- .../nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 164ab00..fb568f0 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -378,7 +378,7 @@ class WrrocRenderer implements Renderer { "@type" : "CreateAction", "name" : task.name, "instrument" : ["@id": getModuleId(processLookup[task.processor])], - "agent" : ["@id": agent["@id"]], + "agent" : agent ? ["@id": agent["@id"]] : null, "object" : inputs, "result" : outputs, "actionStatus": task.exitStatus == 0 ? "http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" @@ -432,7 +432,7 @@ class WrrocRenderer implements Renderer { withoutNulls([ "@id" : "./", "@type" : "Dataset", - "author" : ["@id": agent["@id"]], + "author" : agent ? ["@id": agent["@id"]] : null, "publisher" : publisherId ? 
["@id": publisherId] : null, "datePublished": getDatePublished(), "conformsTo" : [ @@ -524,7 +524,7 @@ class WrrocRenderer implements Renderer { [ "@id" : organizeActionId, "@type" : "OrganizeAction", - "agent" : ["@id": agent["@id"]], + "agent" : agent ? ["@id": agent["@id"]] : null, "instrument": ["@id": softwareApplicationId], "name" : "Run of Nextflow ${nextflowVersion}", "object" : asReferences(controlActions), @@ -535,7 +535,7 @@ class WrrocRenderer implements Renderer { [ "@id" : "#${session.uniqueId}", "@type" : "CreateAction", - "agent" : ["@id": agent["@id"]], + "agent" : agent ? ["@id": agent["@id"]] : null, "name" : "Nextflow workflow run ${session.uniqueId}", "startTime" : dateStarted, "endTime" : dateCompleted, From 2478427340de02fba6b030944c9104032e12a312 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 23 Jan 2025 12:32:53 -0600 Subject: [PATCH 47/54] Add warning if pipeline repo URL can't be determined Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 3 +++ 1 file changed, 3 insertions(+) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index fb568f0..b62f1ba 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -127,6 +127,9 @@ class WrrocRenderer implements Renderer { final organizeActionId = "${mainScriptId}#organize" metadata.scriptFile.copyTo(crateDir) + if( !metadata.repository ) + log.warn "Could not determine pipeline repository URL for Workflow Run RO-Crate -- launch the pipeline with canonical URL (e.g. 
`nextflow run nextflow-io/hello`) to ensure that the pipeline repository URL is recorded in the crate" + // -- parameter schema final schemaPath = scriptFile.getParent().resolve("nextflow_schema.json") Map paramSchema = [:] From f30394117eb11b053b2aa147a3207136e22e4277 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Mon, 27 Jan 2025 15:16:37 -0600 Subject: [PATCH 48/54] Use heuristic to identify original definition of task processor Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/WrrocRenderer.groovy | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index b62f1ba..e544a5b 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -271,8 +271,13 @@ class WrrocRenderer implements Renderer { final processLookup = taskProcessors .inject([:] as Map) { acc, processor -> - final simpleName = processor.name.split(':').last() - acc[processor] = ScriptMeta.get(processor.getOwnerScript()).getProcess(simpleName) + // HACK: when the owner script of a processor defines only one process, that must be the definition + final meta = ScriptMeta.get(processor.getOwnerScript()) + final defs = meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } as List + final processDef = defs.size() == 1 ? 
defs.first() : null + if( !processDef ) + log.warn "Could not identify process definition for `${processor.name}` -- resulting RO-Crate may be invalid (hint: define each process in a separate module script to fix this issue)" + acc[processor] = processDef acc } @@ -327,10 +332,11 @@ class WrrocRenderer implements Renderer { final howToSteps = taskProcessors .collect() { process -> + final processDef = processLookup[process] [ "@id" : getProcessStepId(process), "@type" : "HowToStep", - "workExample": ["@id": getModuleId(processLookup[process])], + "workExample": processDef ? ["@id": getModuleId(processDef)] : null, "position" : process.getId() ] } @@ -366,6 +372,7 @@ class WrrocRenderer implements Renderer { final taskCreateActions = tasks .collect { task -> + final processDef = processLookup[task.processor] final inputs = task.getInputFilesMap().collect { name, source -> final id = source in taskLookup ? getTaskOutputId(taskLookup[source], source) @@ -380,7 +387,7 @@ class WrrocRenderer implements Renderer { "@id" : getTaskId(task), "@type" : "CreateAction", "name" : task.name, - "instrument" : ["@id": getModuleId(processLookup[task.processor])], + "instrument" : processDef ? ["@id": getModuleId(processDef)] : null, "agent" : agent ? 
["@id": agent["@id"]] : null, "object" : inputs, "result" : outputs, From 9ee072bfae339b6accf69b0aff51d4a1addeac53 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Mon, 27 Jan 2025 15:31:23 -0600 Subject: [PATCH 49/54] Encode missing input files as absolute URIs Signed-off-by: Ben Sherman --- .../main/nextflow/prov/PathNormalizer.groovy | 14 ++++----- .../main/nextflow/prov/WrrocRenderer.groovy | 29 +++++++++++++++++-- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy b/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy index f0dc26f..5d57dac 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy @@ -32,8 +32,6 @@ class PathNormalizer { private String commitId - private String launchDir - private String projectDir private String workDir @@ -42,14 +40,12 @@ class PathNormalizer { repository = metadata.repository ? new URL(metadata.repository) : null commitId = metadata.commitId projectDir = metadata.projectDir.toUriString() - launchDir = metadata.launchDir.toUriString() workDir = metadata.workDir.toUriString() } /** - * Normalize paths so that local absolute paths become - * relative paths, and local paths derived from remote URLs - * become the URLs. + * Normalize paths against the original remote URL, or + * work directory, where appropriate. 
* * @param path */ @@ -66,9 +62,9 @@ class PathNormalizer { if( repository && path.startsWith(projectDir) ) return getProjectSourceUrl(path) - // replace launch directory with relative path - if( path.startsWith(launchDir) ) - return path.replace(launchDir + '/', '') + // encode local absolute paths as file URLs + if( path.startsWith('/') ) + return 'file://' + path return path } diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index e544a5b..46f1f65 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -203,18 +203,43 @@ class WrrocRenderer implements Renderer { } // -- input files + Map paramInputFiles = [:] + + params.each { name, value -> + if( !value ) + return + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + if( type != "File" ) + return + final source = (value as Path).complete() + // don't try to download remote files + if( source.fileSystem != FileSystems.default ) + return + // don't try to copy local directories + if( !source.isFile() ) + return + paramInputFiles.put(source, name) + } + final inputFiles = workflowInputs .findAll { source -> !ProvHelper.isStagedInput(source, session) } .collect { source -> + final paramName = paramInputFiles[source] + if( paramName ) { + log.debug "Copying input file specified by `params.${paramName}` into RO-Crate: ${source.toUriString()}" + source.copyTo(crateDir) + } + withoutNulls([ - "@id" : normalizePath(source), + "@id" : paramName ? 
source.name : normalizePath(source), "@type" : getType(source), "name" : source.name, "encodingFormat": getEncodingFormat(source), ]) } - // -- copy input files from params to crate + // -- copy local input files specified by params to crate params.each { name, value -> if( !value ) return From ee7ee8c9c2b892a36ea778d6b2af21cc6a6bb84f Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Mon, 3 Feb 2025 12:31:55 -0500 Subject: [PATCH 50/54] Separate ro-crate license from pipeline license Signed-off-by: Ben Sherman --- WRROC.md | 2 ++ .../nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/WRROC.md b/WRROC.md index cc38963..c26e21a 100644 --- a/WRROC.md +++ b/WRROC.md @@ -17,6 +17,7 @@ The following config options are supported: - `prov.formats.wrroc.organization.name` - `prov.formats.wrroc.organization.phone` - `prov.formats.wrroc.organization.ror` +- `prov.formats.wrroc.license` - `prov.formats.wrroc.publisher` Refer to the [WRROC User Guide](https://www.researchobject.org/workflow-run-crate/) for more information about the associated RO-Crate entities. 
@@ -38,6 +39,7 @@ prov { name = "University of XYZ" ror = "https://ror.org/000000000" } + license = "https://spdx.org/licenses/MIT" publisher = "https://ror.org/000000000" } } diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 46f1f65..855da39 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -97,12 +97,15 @@ class WrrocRenderer implements Renderer { final datasetParts = [] // -- license - if( manifest.license ) { + if( wrrocOpts.license ) { datasetParts.add([ - "@id" : manifest.license, + "@id" : wrrocOpts.license, "@type": "CreativeWork" ]) } + else { + log.warn "Missing license for Workflow Run RO-Crate -- the resulting crate will be invalid" + } // -- readme file for( final fileName : README_FILENAMES ) { @@ -492,7 +495,7 @@ class WrrocRenderer implements Renderer { *asReferences(taskOutputs), *asReferences(publishCreateActions), ], - "license" : manifest.license + "license" : wrrocOpts.license ]), [ "@id" : "https://w3id.org/ro/wfrun/process/0.1", From 0394e925f585b5fd50044df2904c67431d59ce80 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Mon, 3 Feb 2025 12:58:59 -0500 Subject: [PATCH 51/54] Normalize durations and memory units as raw numbers Signed-off-by: Ben Sherman --- .../main/nextflow/prov/WrrocRenderer.groovy | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 855da39..2bde7d3 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -32,6 +32,8 @@ import nextflow.processor.TaskRun import nextflow.script.ProcessDef import nextflow.script.ScriptMeta import nextflow.util.ConfigHelper +import nextflow.util.Duration +import 
nextflow.util.MemoryUnit import org.yaml.snakeyaml.Yaml /** @@ -191,10 +193,7 @@ class WrrocRenderer implements Renderer { .findAll { name, value -> value != null } .collect { name, value -> final paramId = getFormalParameterId(name) - final normalized = - (value instanceof List || value instanceof Map) ? JsonOutput.toJson(value) - : value instanceof CharSequence ? normalizePath(value.toString()) - : value + final normalized = normalizeParamValue(value) return [ "@id" : "${paramId}/value", @@ -717,6 +716,31 @@ class WrrocRenderer implements Renderer { return publisherId } + /** + * Normalize a parameter value. + * + * @param value + */ + private Object normalizeParamValue(Object value) { + switch( value ) { + case Boolean: + case Number: + return value + case CharSequence: + return normalizePath(value.toString()) + case List: + case Map: + return JsonOutput.toJson(value) + case Duration: + return ((Duration) value).toMillis() + case MemoryUnit: + return ((MemoryUnit) value).toBytes() + default: + log.warn "Workflow Run RO-Crate encountered parameter value of type ${value.class.name} -- JSON serialization might be incorrect" + return value + } + } + /** * Get the parameter schema of a pipeline as a map. 
* @@ -781,6 +805,8 @@ class WrrocRenderer implements Renderer { case Boolean: return "Boolean" case Number: + case Duration: + case MemoryUnit: return "Number" case CharSequence: return "Text" From 86c74ccb4ca0d9e3268bb97c8bd1c2c15232a1d2 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 4 Feb 2025 09:43:09 -0500 Subject: [PATCH 52/54] Update warning about unknown parameter type Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index 2bde7d3..f1372b7 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -175,7 +175,7 @@ class WrrocRenderer implements Renderer { : null if( !type ) - log.warn "Could not determine type of parameter `${name}` for Workflow Run RO-Crate" + log.warn "Could not determine type of parameter `${name}` for Workflow Run RO-Crate -- the resulting crate will be invalid" return withoutNulls([ "@id" : getFormalParameterId(name), From dd3f0e1d91e0e169e8dbfe059f0b74c5d73de0ae Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 4 Feb 2025 09:45:35 -0500 Subject: [PATCH 53/54] Handle task inputs from work/tmp/ Signed-off-by: Ben Sherman --- .../src/main/nextflow/prov/ProvHelper.groovy | 32 +++++++++++++------ .../main/nextflow/prov/WrrocRenderer.groovy | 18 ++++++++++- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy index cbd4fde..d4ed030 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy @@ -59,16 +59,6 @@ class ProvHelper { return session.workDir.resolve("stage-${session.uniqueId}") } - /** - * Determine whether a task 
input file was staged into the work directory. - * - * @param source - * @param session - */ - static boolean isStagedInput(Path source, Session session) { - return source.startsWith(getStageDir(session)) - } - /** * Get the list of output files for a task. * @@ -118,4 +108,26 @@ class ProvHelper { return result } + /** + * Determine whether a task input file was staged into the work directory. + * + * @param source + * @param session + */ + static boolean isStagedInput(Path source, Session session) { + return source.startsWith(getStageDir(session)) + } + + /** + * Determine whether a task input file was created in the work/tmp/ + * directory (i.e. by a collectFile operator). + * + * @param source + * @param session + */ + static boolean isTmpInput(Path source, Session session) { + final tmpDir = session.workDir.resolve('tmp') + return source.startsWith(tmpDir) + } + } diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index f1372b7..ed745f3 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -225,7 +225,9 @@ class WrrocRenderer implements Renderer { } final inputFiles = workflowInputs - .findAll { source -> !ProvHelper.isStagedInput(source, session) } + .findAll { source -> + !ProvHelper.isStagedInput(source, session) && !ProvHelper.isTmpInput(source, session) + } .collect { source -> final paramName = paramInputFiles[source] if( paramName ) { @@ -397,6 +399,17 @@ class WrrocRenderer implements Renderer { ]) } + final tmpInputs = workflowInputs + .findAll { source -> ProvHelper.isTmpInput(source, session) } + .collect { source -> + withoutNulls([ + "@id" : "#tmp/${source.name}", + "@type" : "CreativeWork", + "name" : source.name, + "encodingFormat": getEncodingFormat(source), + ]) + } + final taskCreateActions = tasks .collect { task -> final processDef = processLookup[task.processor] @@ 
-404,6 +417,7 @@ class WrrocRenderer implements Renderer { final id = source in taskLookup ? getTaskOutputId(taskLookup[source], source) : ProvHelper.isStagedInput(source, session) ? "#stage/${getStagedInputName(source, session)}" + : ProvHelper.isTmpInput(source, session) ? "#tmp/${source.name}" : normalizePath(source) ["@id": id] } @@ -490,6 +504,7 @@ class WrrocRenderer implements Renderer { "mentions" : [ ["@id": "#${session.uniqueId}"], *asReferences(stagedInputs), + *asReferences(tmpInputs), *asReferences(taskCreateActions), *asReferences(taskOutputs), *asReferences(publishCreateActions), @@ -590,6 +605,7 @@ class WrrocRenderer implements Renderer { *propertyValues, *controlActions, *stagedInputs, + *tmpInputs, *taskCreateActions, *taskOutputs, *publishCreateActions, From e11e50fc5c5b13e54fe129aea32ad1f5cf111e24 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 4 Feb 2025 09:55:14 -0500 Subject: [PATCH 54/54] Exclude parameters set to null Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy index ed745f3..4a4fc4d 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -167,6 +167,7 @@ class WrrocRenderer implements Renderer { // -- pipeline parameters // TODO: formal parameters for workflow output targets final formalParameters = params + .findAll { name, value -> value != null } .collect { name, value -> final schema = paramSchema[name] ?: [:] final type = getParameterType(name, value, schema)