# NOTE(review): reconstructed from a whitespace-mangled ``git format-patch``
# file.  Original patch metadata, preserved here for provenance:
#   From f78c5a29189b2da0f19a37fd1dfc1d0738a4a1ef Mon Sep 17 00:00:00 2001
#   From: Giordon Stark
#   Date: Wed, 12 Mar 2025 11:17:42 -0700
#   Subject: [PATCH] initial commit from the old code i dug out before
# The patch also contains two hunks not representable in this block:
#   * src/pyhf/cli/cli.py: registers the new command via
#     ``pyhf.add_command(spec.hs3_to_hifa)``
#   * src/pyhf/cli/spec.py (top of file): adds ``import re`` and
#     ``from math import sqrt``


def stripname(name, stripcomponents):
    """Strip the given components out of a name.

    Repeatedly removes each component in ``stripcomponents`` from ``name``:
    a component is cut off the end or the start (stripping any joining
    ``_``), or collapsed out of the middle when it appears in the
    ``_<component>_`` form.  Iterates until no component matches.

    Args:
        name (str): the name to clean up.
        stripcomponents (iterable of str): substrings to remove.

    Returns:
        str: the stripped name.
    """
    while True:
        for component in stripcomponents:
            if name.endswith(component):
                name = name[: -len(component)].strip("_")
                break
            if name.startswith(component):
                name = name[len(component) :].strip("_")
                break
            # Bugfix: the original tested ``component in name`` here, which
            # set the match flag even when the replace below was a no-op
            # (component present but not in the ``_component_`` form, e.g.
            # name "amodelb" with component "model"), so the ``while True``
            # loop never terminated.  Test for the exact form we replace.
            if "_" + component + "_" in name:
                name = name.replace("_" + component + "_", "_")
                break
        else:
            # No component matched in this pass: nothing left to strip.
            return name


@cli.command()
@click.argument('workspace', default='-')
@click.option(
    '--output-file',
    help='The location of the output json file. If not specified, prints to screen.',
    default=None,
)
@click.option("--poi", default=None, help="Parameter of Interest")
@click.option("--data", default="obsData", help="Name of the dataset")
@click.option("--fix-other-pois", is_flag=True, help="Fix other POIs")
@click.option(
    "--nf-blacklist",
    multiple=True,
    default=["binWidth_.*", "one"],
    help="Blacklist for normfactors",
)
@click.option(
    "--sys-blacklist", multiple=True, default=["zero"], help="Blacklist for systematics"
)
@click.option(
    "--strip-name-components",
    multiple=True,
    default=["model"],
    help="Components to strip from names",
)
@click.option(
    "--defactorize", is_flag=True, help="Merge OverallSys and Histosys of the same name"
)
def hs3_to_hifa(
    workspace,
    output_file,
    poi,
    data,
    fix_other_pois,
    nf_blacklist,
    sys_blacklist,
    strip_name_components,
    defactorize,
):
    """
    Convert the HS3 workspace to a HiFa JSON workspace for use with pyhf.

    Taken from: https://gitlab.cern.ch/cburgard/RooFitUtils/-/blob/bbb079c1f597b138e2b638b0d98cff3835596240/scripts/json-roofit2pyhf.py

    Example:

    .. code-block:: shell

        $ curl -sL https://raw.githubusercontent.com/root-project/root/12b7ffe/tutorials/roofit/roofit/rf515_hfJSON.json | pyhf sort | jq '.' | md5
        8be5186ec249d2704e14dd29ef05ffb0

    """
    # NOTE(review): --sys-blacklist and --defactorize are accepted but not
    # acted on anywhere below -- TODO(review): implement or document as
    # reserved (kept in the signature for interface compatibility).
    with click.open_file(workspace, "r", encoding="utf-8") as specstream:
        spec = json.load(specstream)

    variables = spec.get("variables", {})

    # Parameter bounds keyed by parameter name, taken from the HS3 domains.
    bounds = {}
    for domain in spec.get("domains", []):
        for parameter in domain["axes"]:
            bounds[parameter["name"]] = [parameter["min"], parameter["max"]]

    # Collect the POIs declared by the analyses.
    # Bugfix: the original used ``poi`` as the loop variable, clobbering the
    # --poi CLI option, so an explicitly requested POI was silently replaced
    # by whichever analysis POI happened to be iterated last.
    pois = set()
    for analysis in spec["analyses"]:
        for analysis_poi in analysis["parameters_of_interest"]:
            pois.add(analysis_poi)

    # setup the main structure
    measurement = {"name": "meas", "config": {"parameters": []}}
    if poi:
        measurement["config"]["poi"] = poi
    elif len(pois) > 0:
        measurement["config"]["poi"] = next(iter(pois))
    output_json = {
        "channels": [],
        "measurements": [measurement],
        "observations": [],
        "version": "1.0.0",
    }

    # some bookkeeping
    nps = set()
    nfs = set()

    # NOTE(review): the original also looped over a ``this_data`` dict that
    # was initialized to ``{}`` and never filled, so the loop body (which
    # built a ``channelnames`` list and preliminary observations) was dead
    # code; it has been removed.  Observations are matched directly against
    # ``spec["data"]`` below, and the final ``output_json["observations"]``
    # assignment at the end overwrites any earlier entries anyway.
    observations = []
    parameters = {}

    # define model / pdf
    for pdf in sorted(spec.get("distributions", []), key=lambda x: x["name"]):
        if pdf["type"] != "histfactory_dist":
            continue
        if "name" in pdf.keys():
            key = pdf["name"]
            channelname = stripname(key, strip_name_components)

        # Find the observed dataset whose name mentions both the requested
        # dataset (--data) and this channel.
        for any_data in spec["data"]:
            if data in any_data["name"] and channelname in any_data["name"]:
                observations.append({"data": any_data["contents"], "name": channelname})

        out_channel = {"name": channelname, "samples": []}
        output_json["channels"].append(out_channel)

        # Bugfix: replaced ``print(...); exit(1)`` with the idiomatic click
        # error (message on stderr, exit code 1); also unified the two
        # previously inconsistent error strings.
        if "samples" not in pdf.keys():
            raise click.ClickException("workspace is no histfactory workspace")

        # Accumulate total yield and squared error over every sample that
        # carries a staterror modifier: the pyhf staterror data below is the
        # per-bin relative uncertainty of this summed yield.
        sum_values = None
        sum_errors2 = None
        for sample in pdf["samples"]:
            if "data" not in sample.keys():
                raise click.ClickException("workspace is no histfactory workspace")
            has_staterror = any(
                modifier["type"] == "staterror" for modifier in sample["modifiers"]
            )
            if not has_staterror:
                continue

            values = sample["data"]["contents"]
            if sum_values:
                for i in range(len(sum_values)):
                    sum_values[i] += values[i]
            else:
                sum_values = list(values)

            errors = sample["data"]["errors"]
            if sum_errors2:
                for i in range(len(sum_errors2)):
                    sum_errors2[i] += errors[i] * errors[i]
            else:
                sum_errors2 = [e * e for e in errors]

        # Translate each sample and its modifiers into HiFa form.
        for sample in sorted(pdf["samples"], key=lambda x: x["name"]):
            values = sample["data"]["contents"]
            bins = len(values)

            out_sample = {
                "name": stripname(sample["name"], strip_name_components),
                "modifiers": [],
            }
            out_channel["samples"].append(out_sample)
            out_sample["data"] = values

            modifiers = out_sample["modifiers"]
            for modifier in sorted(sample["modifiers"], key=lambda x: x["name"]):
                # Constant-constrained modifiers carry no free parameter.
                if modifier.get("constraint", None) == "Const":
                    continue
                if modifier["name"] == "Lumi":
                    modifiers.append({"name": "lumi", "type": "lumi", "data": None})
                    nps.add("lumi")
                    parameters["lumi"] = {
                        "fixed": False,
                        "inits": [1.0],
                        "auxdata": [1.0],
                        **(
                            {"bounds": [bounds[modifier["name"]]]}
                            if modifier["name"] in bounds
                            else {}
                        ),
                    }
                elif modifier["type"] == "staterror":
                    parname = "staterror_" + channelname
                    modifiers.append(
                        {
                            "name": parname,
                            "type": "staterror",
                            # Relative uncertainty of the summed yield,
                            # scaled back to this sample's bin contents.
                            "data": [
                                sqrt(sum_errors2[i]) / sum_values[i] * values[i]
                                for i in range(bins)
                            ],
                        }
                    )
                    nps.add(parname)
                    parameters[parname] = {
                        "fixed": False,
                        "auxdata": [1.0 for i in range(bins)],
                        "inits": [1.0 for i in range(bins)],
                        "bounds": [[-5.0, 5.0] for i in range(bins)],
                    }
                elif modifier["type"] == "normfactor":
                    modifiers.append(
                        {
                            "name": modifier["name"],
                            "type": "normfactor",
                            "data": None,
                        }
                    )
                    nfs.add(modifier["name"])
                    parameters[modifier["name"]] = {
                        "fixed": False,
                        "inits": [1.0],
                        **(
                            {"bounds": [bounds[modifier["name"]]]}
                            if modifier["name"] in bounds
                            else {}
                        ),
                    }
                elif modifier["type"] == "normsys":
                    modifiers.append(
                        {
                            "name": modifier["name"],
                            "type": "normsys",
                            "data": modifier["data"],
                        }
                    )
                    nps.add(modifier["name"])
                    parameters[modifier["name"]] = {
                        "fixed": False,
                        "auxdata": [0.0],
                        "inits": [0.0],
                        **(
                            {"bounds": [bounds[modifier["name"]]]}
                            if modifier["name"] in bounds
                            else {}
                        ),
                    }
                elif modifier["type"] == "shapesys":
                    parname = modifier["name"]
                    nps.add(parname)
                    modifiers.append(
                        {
                            "name": parname,
                            "type": "shapesys",
                            "data": modifier["data"]["vals"],
                        }
                    )
                    # NOTE(review): inits of 0.0 with auxdata of 1.0 looks
                    # inconsistent for a shapesys gamma parameter -- TODO
                    # confirm against the pyhf schema.  Left as-is here.
                    parameters[parname] = {
                        "fixed": False,
                        "auxdata": [1.0 for i in range(bins)],
                        "inits": [0.0 for i in range(bins)],
                        "bounds": [[-5.0, 5.0] for i in range(bins)],
                    }
                    # Bugfix: removed leftover debug statement
                    # ``if bins == 9: print(parname)``.
                elif modifier["type"] == "histosys":
                    modifiers.append(
                        {
                            "name": modifier["name"],
                            "type": "histosys",
                            "data": {
                                "hi_data": modifier["data"]["hi"]["contents"],
                                "lo_data": modifier["data"]["lo"]["contents"],
                            },
                        }
                    )
                    nps.add(modifier["name"])
                    parameters[modifier["name"]] = {
                        "fixed": False,
                        "auxdata": [0.0],
                        "inits": [0.0],
                        **(
                            {"bounds": [bounds[modifier["name"]]]}
                            if modifier["name"] in bounds
                            else {}
                        ),
                    }
                else:
                    raise click.ClickException(
                        f"workspace contains unknown modifier type: {modifier['type']}"
                    )

    # define parameters
    # Bugfix: iterate the sets in sorted order so the emitted parameter list
    # (and therefore the output JSON) is deterministic across runs.
    for par in sorted(nps):
        parameters[par]["name"] = par
        measurement["config"]["parameters"].append(parameters[par])
    for par in sorted(nfs):
        if any(re.match(b, par) for b in nf_blacklist):
            continue
        parameters[par]["name"] = par
        measurement["config"]["parameters"].append(parameters[par])

    # some post-processing: apply constness from the HS3 variables section,
    # and optionally fix every POI other than the selected one.
    for p in measurement["config"]["parameters"]:
        pname = p["name"]
        if pname in variables.keys():
            if "const" in variables[pname].keys():
                p["fixed"] = variables[pname]["const"]
        if fix_other_pois:
            for this_poi in pois:
                if this_poi == poi:
                    continue
                if pname == this_poi:
                    p["fixed"] = True

    # write observations
    output_json["observations"] = observations

    if output_file is None:
        click.echo(json.dumps(output_json, indent=4, sort_keys=True))
    else:
        with open(output_file, "w+", encoding="utf-8") as out_file:
            json.dump(output_json, out_file, indent=4, sort_keys=True)
        log.debug(f"Written to {output_file}")