diff --git a/Config/Delta_Config.py_headnodetesting b/Config/Delta_Config.py_headnodetesting new file mode 100644 index 00000000..35ed2f8b --- /dev/null +++ b/Config/Delta_Config.py_headnodetesting @@ -0,0 +1,235 @@ +# This is the config file for specifying tables necessary to configure XALT: + +# The patterns listed here are the hosts that can track executable with XALT. +# Typical usage is that compute nodes track executable with XALT while login +# nodes do not. + +import sys + +# Note that linking an executable is everywhere and is independent of +# hostname_patterns + +hostname_patterns = [ + ['KEEP', r'cn[0-1][0-9][0-9]\..*'], # compute nodes + ['KEEP', r'gpu[a-b][0-1][0-9][0-9]\..*'], # gpu nodes type a and b + ['KEEP', r'gpuc0[0-6]\..*'], # gpu nodes type c + ['KEEP', r'gpud0[0-6]\..*'], # gpu nodes type d + ['KEEP', r'dt-login0[0-5]\..*'], #track login nodes for debugging + ] + +#------------------------------------------------------------ +# This "table" is use to filter executables by their path +# The value on the left is either KEEP or SKIP. If the value +# is KEEP then if the path matches the regular expression then +# the executable is acceptable as far as the path test goes. +# If the value on the left is SKIP then if the path matches +# the regular expression then executable is not acceptable so +# no XALT tracking is done for that path. + +# This "table" is used to generate a flex routine that processes +# the paths. So the regular express must follow flex rules. +# In particular, in order to match the pattern must match the whole path +# No partial matches allowed. Also do not use $ to match the +# end of the string. Finally slash is a special character and must +# be quoted with a backslash. + +# The path are conceptionally matched from the first regular +# expression to the last. Once a match is found no other later +# matches are checked. 
The upshot of this is that if you want to +# track say /usr/bin/ddt, but ignore everything in /usr, then keep +# /usr/bin/ddt first and skip /usr/.* after. + +# If a path does not match any patterns it is marked as KEEP. + +# Programs like R, MATLAB and python* are marked as PKGS. These programs +# can optionally track the internal "import" that are used. + +path_patterns = [ + # mark anything with R, matlab, or python/conda in path for PKG tracking + ['SKIP', r'.*\/bin\/pip[0-9]'], + ['PKGS', r'.*\/r*'], + ['PKGS', r'.*\/matlab*'], + ['PKGS', r'.*\/anaconda*'], + ['PKGS', r'.*\/python[0-9.]*'], + ['PKGS', r'.*\/python-*'], + ['PKGS', r'\/usr\/libexec\/.*-python[0-9.]*'], + # track programs run in user directories + ['KEEP', r'^\/u\/*'], + # track module usage + ['KEEP', r'^\/sw\/spack\/.*'], + ['KEEP', r'^\/sw\/external\/.*'], + # track default programs in /usr/bin + ['KEEP', r'^\/usr\/bin\/apptainer*'], + ['KEEP', r'^\/usr\/bin\/c?make*'], + ['KEEP', r'^\/usr\/bin\/gcc*'], + ['KEEP', r'^\/usr\/bin\/gfortran*'], + ['KEEP', r'^\/usr\/bin\/go*'], + ['KEEP', r'^\/usr\/bin\/g++*'], + ['KEEP', r'^\/usr\/bin\/pip*'], + ['KEEP', r'^\/usr\/local\/*'], + ['KEEP', r'^\/usr\/bin\/srun'], + ['KEEP', r'^\/usr\/bin\/salloc'], + # other user spaces that might hold executables + ['KEEP', r'^\/scratch\/*'], + ['KEEP', r'^\/delta\/scratch\/*'], + ['KEEP', r'^\/projects\/*'], + # root directories that do not need to be tracked + ['SKIP', r'^\/usr\/.*'], + ['SKIP', r'^\/boot\/.*'], + ['SKIP', r'^\/dev\/.*'], + ['SKIP', r'^\/delta\/.*'], + ['SKIP', r'^\/etc\/.*'], + ['SKIP', r'^\/ime\/.*'], + ['SKIP', r'^\/install\/.*'], + ['SKIP', r'^\/lib\/.*'], + ['SKIP', r'^\/lib64\/.*'], + ['SKIP', r'^\/media\/.*'], + ['SKIP', r'^\/mnt\/.*'], + ['SKIP', r'^\/opt\/.*'], # double check this! 
+ ['SKIP', r'^\/proc\/.*'], + ['SKIP', r'^\/root\/.*'], + ['SKIP', r'^\/run\/.*'], + ['SKIP', r'^\/sbin\/.*'], + ['SKIP', r'^\/srv\/.*'], + ['KEEP', r'^\/sw\/.*'], + ['SKIP', r'^\/sys\/.*'], + ['SKIP', r'^\/taiga\/.*'], # double check here + ['SKIP', r'^\/tmp\/.*'], + ['SKIP', r'^\/var\/.*'], + ['SKIP', r'^\/xcatpost\/.*'], + ['SKIP', r'^\/bin\/.*'] + ] + +#------------------------------------------------------------ +# XALT samples almost all executions (both MPI and scalar) +# based on this table below. Note that an MPI execution is where +# the number of tasks is greater than 1. There is no check to +# see if there are MPI libraries in the executable. Note that +# the number of tasks are MPI tasks not threads. + +# Any time there are a number of short rapid executions these +# have to be sampled. However, there are MPI executions with large +# number of tasks that are always recorded. This is to allow the +# tracking of long running MPI tasks that never produce an end +# record. By default MPI_ALWAYS_RECORD = 1. Namely that all MPI +# tasks are recorded. + +MPI_ALWAYS_RECORD = 128 + +#------------------------------------------------------------ +# The array of array used by interval_array has the following +# structure: +# +# interval_array = [ +# [ t_0, probability_0], +# [ t_1, probability_1], +# ... +# [ t_n, probability_n], +# [ 1.0e308, 1.0], +# +# +# The first number is the left edge of the time range. The +# second number is the probability of being sampled. Where a +# probability of 1.0 means a 100% chance of being recorded and a +# value of 0.01 means a 1% chance of being recorded. +# +# So a table that looks like this: +# interval_array = [ +# [ 0.0, 0.0001 ], +# [ 300.0, 0.01 ], +# [ 600.0, 1.0 ], +# [ sys.float_info.max, 1.0 ] +# ] +# +# would say that program with execution time that is between +# 0.0 and 300.0 seconds has a 0.01% chance of being recorded. 
+# Execution times between 300.0 and 600.0 seconds have a 1%
+# chance of being recorded and programs that take longer
+# than 600 seconds will always be recorded.
+#
+# The absolute minimum table would look like:
+#
+# interval_array = [
+#    [ 0.0, 1.0 ],
+#    [ sys.float_info.max, 1.0 ]
+# ]
+#
+# which says to record every scalar (non-mpi) program no matter
+# the execution time.
+#
+# Note that scalar execution only uses this table IFF
+# $XALT_SAMPLING equals yes
+
+
+interval_array = [
+    [ 0.0, 1.0 ],
+    [ 600.0, 0.05 ],  # 10 min
+    [ 1800.0, 0.1 ],  # 30 min
+    [ 7200.0, 1.0 ],  # 2 hours
+    [ sys.float_info.max, 1.0 ]  # End of time
+]
+
+
+#------------------------------------------------------------
+# Sites can also define a different sampling specification
+# for mpi programs different from interval_array.  If no
+# mpi_interval_array is given then the interval_array is used
+# for both scalar and mpi programs.
+
+mpi_interval_array = [
+    [ 0.0, 1.0 ],
+    [ 600.0, 0.10 ],  # 10 min
+    [ 900.0, 0.2 ],   # 15 min
+    [ 1800.0, 1.0 ],  # 30 min
+    [ sys.float_info.max, 1.0 ]  # End of time
+]
+
+
+
+#------------------------------------------------------------
+# XALT filter environment variables.  Those variables
+# which pass through the filter are saved in an SQL table that is
+# searchable via sql commands.  The environment variables are passed
+# to this filter routine as:
+#
+#     env_key=env_value
+#
+# So the regular expression patterns must match the whole string.
+
+
+# The value on the left is either KEEP or SKIP.  If the value
+# is KEEP then if the environment string matches the regular
+# expression then the variable is stored. If the value on the left
+# is SKIP then if the variable matches it is not stored.
+
+# Order of the list matters.  The first match is used even if a
+# later pattern would also match.  The upshot is that special pattern
+# matches should appear first and general ones later.
+ +# If the environment string does not match any pattern then it is +# marked as SKIP. + +# TODO - modify these +env_patterns = [ + [ 'KEEP', r'.*' ], # keep all environment variables + ] + +#------------------------------------------------------------ +# Python pattern for python package tracking + +# Note that sys, os, re, and subprocess can not be tracked due to the way that python tracking works. +# TODO modify these paths +python_pkg_patterns = [ + { 'k_s' : 'SKIP', 'kind' : 'path', 'patt' : r"^[^/]" }, # SKIP all built-in packages + { 'k_s' : 'SKIP', 'kind' : 'name', 'patt' : r"^_" }, # SKIP names that start with a underscore + { 'k_s' : 'SKIP', 'kind' : 'name', 'patt' : r".*\." }, # SKIP all names that are divided with periods: a.b. + { 'k_s' : 'KEEP', 'kind' : 'path', 'patt' : r".*conda\/.*" }, # KEEP all packages installed by users + { 'k_s' : 'KEEP', 'kind' : 'path', 'patt' : r".*\/site-packages\/.*" }, # KEEP all site-packages + { 'k_s' : 'KEEP', 'kind' : 'path', 'patt' : r"^\/delta/scratch\/.*" }, # KEEP all packages the system project directories + { 'k_s' : 'KEEP', 'kind' : 'path', 'patt' : r"^\/scratch\/.*" }, # KEEP all packages the system scratch directories + { 'k_s' : 'KEEP', 'kind' : 'path', 'patt' : r"^\/projects\/.*" }, # KEEP all packages the system project directories + { 'k_s' : 'KEEP', 'kind' : 'path', 'patt' : r"^\/u\/.*" }, # KEEP all packages installed by users + { 'k_s' : 'SKIP', 'kind' : 'path', 'patt' : r"^\/opt" }, # SKIP all python packages in /opt except for ones in .*/site-packages/ + { 'k_s' : 'SKIP', 'kind' : 'path', 'patt' : r"^\/home" }, # SKIP all other packages in user locations + { 'k_s' : 'SKIP', 'kind' : 'path', 'patt' : r"^\/work" }, # SKIP all other packages in user locations +] diff --git a/ncsa_build/3.0.2.local.lua b/ncsa_build/3.0.2.local.lua new file mode 100644 index 00000000..060de501 --- /dev/null +++ b/ncsa_build/3.0.2.local.lua @@ -0,0 +1,61 @@ +-- XALT 3.0.2 Modfile + +-- Setting as sticky to bypass module 
purge on OOD launch script +add_property("lmod","sticky") + + +-- Get MMYYYY for XALT +local currentDate = os.date("*t") +local month = string.format("%02d", currentDate.month) +local year = currentDate.year +local formattedDate = year .. month + +-- Filepath used for current configuration of XALT +-- LOC local base = "/sw/workload/xalt2/xalt/xalt" --> Change to match your site!!! +local base = "/u/csteffen/xalt_base/xalt2/xalt/xalt" --> Change to match your site!!! +local bin = pathJoin(base,"bin") +local pythonpath = pathJoin(base,"site_packages") +local lib_dir = "/lib64" -- this is referenced to the internals of a container; doesn't need to be updated per local paths + +-- LOC local record_dir = "/sw/workload/delta/json" +local record_dir_base = "/u/csteffen/xalt_base/records" +local record_dir = pathJoin(record_dir_base,"json") + +-- Comma seperated as specified by https://apptainer.org/docs/user/main/bind_paths_and_mounts.html +-- LOC local apptainer_bind_dir = "/sw/workload/xalt2/xalt/xalt, /sw/workload/delta" +local apptainer_bind_dir = base..", "..record_dir_base + + +-- Turn on Module Tracking +setenv("XALT_EXECUTABLE_TRACKING", "yes") + +-- Environment variables for XALT to run on a Compute Node +prepend_path{"PATH", bin, priority="100"} +prepend_path("XALT_DIR", base) +prepend_path("LD_PRELOAD", pathJoin(base, "lib64/libxalt_init.so")) +prepend_path("COMPILER_PATH", bin) + +-- XAlT_DATE_TIME creation +setenv("XALT_FILE_PREFIX", pathJoin(record_dir,formattedDate)) + +-- XAlT_DATE_TIME creation +setenv("XALT_ALWAYS_CREATE_START", pathJoin(record_dir,formattedDate)) + + +-- Variable needed for Python tracking outside a container +prepend_path("PYTHONPATH", pythonpath) + + +-- Variables needed for XALT to get included into containers +prepend_path("APPTAINER_BINDPATH", apptainer_bind_dir) +setenv("APPTAINERENV_LD_PRELOAD", pathJoin(base, lib_dir, "libxalt_init.so")) +setenv("APPTAINERENV_PYTHONPATH", pythonpath) + + 
+------------------------------------------------------------ +-- Only set this in production not for testing!!! +-- setenv("XALT_SAMPLING", "yes") + +-- Uncomment this to track GPU usage +-- setenv("XALT_GPU_TRACKING", "yes") + diff --git a/ncsa_build/DeltaUploadClasses.py b/ncsa_build/DeltaUploadClasses.py new file mode 100644 index 00000000..cb7aed2b --- /dev/null +++ b/ncsa_build/DeltaUploadClasses.py @@ -0,0 +1,270 @@ +from typing import List, Dict, Tuple +from datetime import datetime + +class PkgObj(): +# def __init__(self, data, path): + def __init__(self, data): +# self.path = path + self.xalt_run_uuid = data.get('xalt_run_uuid') + self.pkg_version = data.get('package_version') + self.pkg_name = data.get('package_name') + self.pkg_path = data.get('package_path') +# self.pkg_version = data.get('pkg_version') +# self.pkg_name = data.get('pkg_name') +# self.pkg_path = data.get('pkg_path') + def writeToDB(self, conn): + """ + Write new package data to the database using provided MariaDB connection. + Uses xalt_run_uuid directly as run_id and 'python' as program. 
+ + Args: + conn: MariaDB connection object + + Returns: + pkg_id: The ID of the inserted package record + """ + print(f"run UUID:{self.xalt_run_uuid}") + print(f"package name:{self.pkg_name}") + cursor = conn.cursor() + try: + cursor.execute(""" + INSERT INTO xalt_pkg + (run_uuid, program, pkg_name, pkg_version, pkg_path) + VALUES (%s, 'python', %s, %s, %s) + """, ( +# int(self.xalt_run_uuid), # Ensure run_id is an integer since it's int(11) + self.xalt_run_uuid, # Craig removed int() of this value + self.pkg_name, # Respect varchar(64) limit + self.pkg_version if self.pkg_version else None, # Respect varchar(32) limit + self.pkg_path if self.pkg_path else None # Respect varchar(1024) limit +# self.pkg_name[:64], # Respect varchar(64) limit +# self.pkg_version[:32] if self.pkg_version else None, # Respect varchar(32) limit +# self.pkg_path[:1024] if self.pkg_path else None # Respect varchar(1024) limit + )) + + pkg_id = cursor.lastrowid + conn.commit() + + return pkg_id + + except Exception as e: + conn.rollback() + raise Exception(f"Error writing package to database: {str(e)}") + finally: + cursor.close() + + +class LinkObj: + def __init__(self, json_data: dict, path): + self.path = path + resultT = json_data["resultT"] # Direct access since the field is guaranteed to exist + print(resultT) + self.xalt_run_uuid = resultT["uuid"] + self.crc = json_data["crc"] + + # Extract resultT fields + + + self.link_program = resultT["link_program"] + self.link_path = resultT["link_path"] + self.build_user = resultT["build_user"] + self.build_epoch = resultT["build_epoch"] + self.exec_path = resultT["exec_path"] + self.hash_id = resultT["hash_id"] + self.wd = resultT["wd"] + self.build_syshost = resultT["build_syshost"] + + # Extract linkA, function, and link_line fields + self.linkA = json_data["linkA"] + self.function = json_data["function"] + self.link_line = json_data["link_line"] + + def __repr__(self): + return (f"LinkObj(xalt_run_uuid={self.xalt_run_uuid}, 
crc={self.crc}, link_program={self.link_program}, "
+                f"link_path={self.link_path}, build_user={self.build_user}, build_epoch={self.build_epoch}, "
+                f"exec_path={self.exec_path}, hash_id={self.hash_id}, wd={self.wd}, build_syshost={self.build_syshost}, "
+                f"linkA={self.linkA}, function={self.function}, link_line={self.link_line})")
+    def writeToDB(self, conn):
+        """
+        Write link data to the database using provided MariaDB connection.
+        Uses xalt_run_uuid directly as uuid and respects the column constraints.
+
+        Args:
+            conn: MariaDB connection object
+
+        Returns:
+            link_id: The ID of the inserted link record
+        """
+        cursor = conn.cursor()
+        try:
+            cursor.execute("""
+                INSERT INTO links
+                (hash_id, date, link_program, link_path, link_module_name,
+                link_line, cwd, build_user, build_syshost, build_epoch,
+                exec_path, uuid)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+            """, (
+                self.hash_id[:40],  # Ensure hash_id is 40 chars (char(40))
+                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # date value -- 12 placeholders require 12 bound values
+                self.link_program[:64],  # Respect varchar(64) limit for link_program
+                self.link_path[:1024],  # Respect varchar(1024) limit for link_path
+                self.linkA[:64] if self.linkA else None,  # Respect varchar(64) limit for link_module_name
+                self.link_line,  # link_line is a blob, can be passed as-is
+                self.wd[:1024] if self.wd else None,  # Respect varchar(1024) limit for cwd
+                self.build_user[:64],  # Respect varchar(64) limit for build_user
+                self.build_syshost[:64],  # Respect varchar(64) limit for build_syshost
+                self.build_epoch,  # build_epoch is a double
+                self.exec_path[:1024],  # Respect varchar(1024) limit for exec_path
+                self.xalt_run_uuid  # uuid for the link record
+            ))
+
+            link_id = cursor.lastrowid  # Get the ID of the inserted link record
+            conn.commit()  # Commit the transaction
+
+            return link_id
+
+        except Exception as e:
+            conn.rollback()  # Rollback in case of error
+            raise Exception(f"Error writing link to database: {str(e)}")
+
+        finally:
+            cursor.close()  # close the cursor; a `return` here would silently swallow the exception re-raised above
+
+
+
+
+
+class RunObj:
+#    def __init__(self, json_data: dict, path):
+    def __init__(self, json_data: dict):
+        # Extract fields from the json_data dictionary
+#        self.path = path
+        self.crc = json_data.get("crc", "")
+        self.cmdlineA = json_data.get("cmdlineA", [])
+        self.hash_id = json_data.get("hash_id", "")
+
+        # libA is a list of strings
+        self.libA = json_data.get("libA", [])
+
+        # ptA is a list of dictionaries with each dictionary having cmd_name, cmd_path, pid, and cmdlineA
+        self.ptA = [
+            {
+                "cmd_name": pt.get("cmd_name", ""),
+                "cmd_path": pt.get("cmd_path", ""),
+                "pid": pt.get("pid", 0),
+                "cmdlineA": pt.get("cmdlineA", [])
+            }
+            for pt in json_data.get("ptA", [])
+        ]
+
+        # envT is a dictionary of unlimited key-value pairs (environment variables)
+        self.envT = json_data.get("envT", {})
+
+        # userT is a dictionary with information about the user and job environment
+        self.userT = {
+            "syshost": json_data.get("userT", {}).get("syshost", ""),
+            "run_uuid": json_data.get("userT", {}).get("run_uuid", ""),
+            "exec_path": json_data.get("userT", {}).get("exec_path", ""),
+            "exec_type": json_data.get("userT", {}).get("exec_type", ""),
+            "cwd": json_data.get("userT", {}).get("cwd", ""),
+            "currentEpoch": json_data.get("userT", {}).get("currentEpoch", ""),
+            "start_date": json_data.get("userT", {}).get("start_date", ""),
+            "user": json_data.get("userT", {}).get("user", ""),
+            "execModify": json_data.get("userT", {}).get("execModify", ""),
+            "scheduler": json_data.get("userT", {}).get("scheduler", ""),
+            "account": json_data.get("userT", {}).get("account", ""),
+            "job_id": json_data.get("userT", {}).get("job_id", ""),
+            "queue": json_data.get("userT", {}).get("queue", ""),
+            "submit_host": json_data.get("userT", {}).get("submit_host", "")
+        }
+
+        # userDT is a dictionary containing runtime and task-related data
+        self.userDT = {
+            "start_time": json_data.get("userDT", {}).get("start_time", 0.0),
+            "end_time":
json_data.get("userDT", {}).get("end_time", 0.0), + "run_time": json_data.get("userDT", {}).get("run_time", 0.0), + "probability": json_data.get("userDT", {}).get("probability", 1.0), + "num_tasks": json_data.get("userDT", {}).get("num_tasks", 1.0), + "num_gpus": json_data.get("userDT", {}).get("num_gpus", 0.0), + "exec_epoch": json_data.get("userDT", {}).get("exec_epoch", 0.0), + "num_threads": json_data.get("userDT", {}).get("num_threads", 1.0), + "num_cores": json_data.get("userDT", {}).get("num_cores", 1.0), + "num_nodes": json_data.get("userDT", {}).get("num_nodes", 1.0) + } + + # XALT_measureT is a dictionary of measurement values for different steps in the process + self.XALT_measureT = json_data.get("XALT_measureT", {}) + + # XALT_qaT is a dictionary with QA-specific information + self.XALT_qaT = json_data.get("XALT_qaT", {}) + self.xalt_run_uuid = self.userT['run_uuid'] + + def __repr__(self): + return ( + f"RunObj(crc={self.crc}, cmdlineA={self.cmdlineA}, hash_id={self.hash_id}, " + f"libA={self.libA}, ptA={self.ptA}, envT={self.envT}, userT={self.userT}, " + f"userDT={self.userDT}, XALT_measureT={self.XALT_measureT}, XALT_qaT={self.XALT_qaT})" + ) + def writeToDB(self, conn): + """ + structure copied from Prakhar's link object definition with mods for + minimal output testing 2026Feb + """ + print('starting writetoDB() test') + cursor = conn.cursor() + # try: + # cursor.execute(""" + # INSERT INTO links + # (run_uuid,start_time,user,cwd,cmdline) + # VALUES (%s, %s, %s, %s, %s) + # """, ( + + # test + my_user=self.userT["user"] + my_start_date=self.userT["start_date"] + command_count=0 + print(f'user={my_user} date={my_start_date}') + print('writeToDB test: about to output type') + print(type(self.ptA)) + print(f'ptA length={len(self.ptA)}') + print('writeToDB test: about to loop') + # for command_dict in self.ptA.blahblah: + # command_count += 1 + # my_command=command_dict[cmd_name] + # print(f'Command {command_count} = {my_command}') + + print('finished 
looping') + + # print('writetoDB for run test') + + # return 0 + + + cursor.execute(""" + INSERT INTO xalt_run + (run_uuid,date,syshost,start_time,user,cwd) + VALUES (%s, %s, %s, %s, %s, %s) + """, ( + self.userT["run_uuid"], +# self.userT["start_date"], # original; + datetime.fromtimestamp(self.userDT["start_time"]).strftime('%Y-%m-%d %H:%M:%S'), # Current timestamp for date + self.userT["syshost"], + self.userDT["start_time"], + my_user, + self.userT["cwd"] + )) +# ),use_pure=True) + + print('cursor ran; return value:{link_id}') + link_id = cursor.lastrowid # new index just inserted + conn.commit() + print('commit() ran; returning from writeToDB()') + + return link_id + +# except Exception as e: +# conn.rollback() # undo in case of error +# raise exception(f"Error writing run to database: {str(e)}") + +# finally: +# return diff --git a/ncsa_build/Delta_create_DB_classes.py b/ncsa_build/Delta_create_DB_classes.py new file mode 100644 index 00000000..55617b20 --- /dev/null +++ b/ncsa_build/Delta_create_DB_classes.py @@ -0,0 +1,55 @@ +from typing import List, Dict, Tuple +from datetime import datetime + + + +class PkgObj(): +# def __init__(self, data, path): + def __init__(self, data): +# self.path = path + self.xalt_run_uuid = data.get('xalt_run_uuid') + self.pkg_version = data.get('pkg_version') + self.pkg_name = data.get('pkg_name') + self.pkg_path = data.get('pkg_path') + def writeToDB(self, conn): + """ + Write new package data to the database using provided MariaDB connection. + Uses xalt_run_uuid directly as run_id and 'python' as program. 
+ + Args: + conn: MariaDB connection object + + Returns: + pkg_id: The ID of the inserted package record + """ + cursor = conn.cursor() + try: + + + +cursor.execute(""" + CREATE TABLE IF NOT EXISTS `xalt_pkg` ( + `pkg_id` bigint(20) unsigned NOT NULL auto_increment, + `run_id` int(11) unsigned NOT NULL, + `program` varchar(12) NOT NULL, + `pkg_name` varchar(64) NOT NULL, + `pkg_version` varchar(32) , + `pkg_path` varchar(1024) , + PRIMARY KEY (`pkg_id`), + FOREIGN KEY (`run_id`) REFERENCES `xalt_run`(`run_id`) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1 + """) + print("(%d) create xalt_pkg table" % idx); idx += 1 + + + pkg_id = cursor.lastrowid + conn.commit() + + return pkg_id + + except Exception as e: + conn.rollback() + raise Exception(f"Error writing package to database: {str(e)}") + finally: + cursor.close() + diff --git a/ncsa_build/NCSA_README b/ncsa_build/NCSA_README new file mode 100644 index 00000000..044e6cc8 --- /dev/null +++ b/ncsa_build/NCSA_README @@ -0,0 +1,24 @@ +Build a new installation, as of 2026 March 17 on Delta + +get clone the repo +make a target directory somewhere +set and export shell variable XALT_BASE_DIRECTORY to the target +cd into the repo +"git checkout" the branch you want + +Modify Config/Delta_Config.py in the repo to your wishes as far as target executable and paths and hostnames +run build script + +Modify the following in module file in repo according to above + local base = + + -- LOC local record_dir = "/sw/workload/delta/json" + local record_dir_base = "/work/hdd/bbka/csteffen/xalt_tests/00/log_repo/delta" + local record_dir = pathJoin(record_dir_base,"json") + + -- LOC local apptainer_bind_dir = "/sw/workload/xalt2/xalt/xalt, /sw/workload/delta" + local apptainer_bind_dir = base..", "..record_dir_base + +add module directory to MODULEPATH + +load module diff --git a/ncsa_build/build_xalt.sh b/ncsa_build/build_xalt.sh index 52f5ef7b..60c4e502 100755 --- a/ncsa_build/build_xalt.sh +++ 
b/ncsa_build/build_xalt.sh @@ -108,11 +108,15 @@ if [ $? -eq 0 ]; then echo "about to verify module directory exists" mkdir -p $build_dir/module/xalt cp $src_dir/ncsa_build/$module_ver.lua $build_dir/module/xalt/$module_ver.lua - echo "Add ${base_dir}/module to MODULEPATH to begin using ${module_name}" +# echo "Add ${base_dir}/module to MODULEPATH to begin using ${module_name}" + echo "Add ${build_dir}/module to MODULEPATH to begin using ${module_name}" + echo "For testing, you may need to customize the file in ${build_dir}/module/xalt/" cp $src_dir/ncsa_build/build_xalt.sh $build_dir/build_xalt.sh else echo "Install Failed" + echo "If you need to add include directories to make the build work," + echo " add them to CPATH" fi cd $orig_dir diff --git a/ncsa_build/delta_upload_xalt_records.py b/ncsa_build/delta_upload_xalt_records.py new file mode 100644 index 00000000..7df8ee7b --- /dev/null +++ b/ncsa_build/delta_upload_xalt_records.py @@ -0,0 +1,296 @@ +import sys +import os +import re +from pathlib import Path +try: + import configparser +except: + import ConfigParser as configparser +#import mariadb +import json +from DeltaUploadClasses import PkgObj, LinkObj, RunObj +import mysql.connector +from mysql.connector import Error + + +def connectDB(my_config): + """ + Connect to the MariaDB database using provided credentials. 
+ """ + try: +# conn = mariadb.connect( +# user=username, +# password=password, +# host=hostname, +# port=3306, # Use integer here +# database=dbname +# ) +# my_config.print() + my_database=my_config.get("MYSQL","DB") + my_host=my_config.get("MYSQL","HOST") + my_user=my_config.get("MYSQL","USER") + my_password=my_config.get("MYSQL","PASSWD") + print(f'parsed config host=>>{my_host}<<') + print(f'parsed config user=>>{my_user}<<') + print(f'parsed config pwd=>>{my_password}<<') + print(f'parsed config db=>>{my_database}<<') + + + print('about to try DB connection.') + +# conn = mysql.connector.connect( +# my_host=my_config.get("MYSQL","HOST"), +# user=my_config.get("MYSQL","USER"), +# password=my_config.get("MYSQL","PASSWD"), +# database=my_config.get("MYSQL","DB") +# ) + conn = mysql.connector.connect( + host=my_host, + user=my_user, + password=my_password, + database=my_database, + use_pure=True + ) + print('back from connect call.') + print(conn) + + print("DB connection established.") + return conn + except mysql.Error as e: +# finally: + + #print(f"Error connecting to database: {e}") + print(f"Error connecting to database:") + sys.exit(1) + +def createLogList(): + """ + Create a list of log files based on the XALT_FILE_PREFIX environment variable. 
+ """ + xalt_log_dir = os.getenv("XALT_FILE_PREFIX") + if xalt_log_dir is None: + print("Error: XALT_FILE_PREFIX environment variable not set.") + sys.exit(1) + xalt_logs = list(Path(xalt_log_dir).rglob('*delta*.json')) + return xalt_logs + +def splitLogs(file_paths): + run_files = [] + link_files = [] + pkg_files = [] + + # Updated patterns + run_pattern = re.compile(r"^run\.delta") + link_pattern = re.compile(r"^link\.delta") + pkg_pattern = re.compile(r"^pkg\.delta") + + for file_path in file_paths: + file_name = Path(file_path).name + + if run_pattern.match(file_name): + run_files.append(file_path) + elif link_pattern.match(file_name): + link_files.append(file_path) + elif pkg_pattern.match(file_name): + pkg_files.append(file_path) + else: + print(f"No match: {file_name}") + + return run_files, link_files, pkg_files + + + +def ingestPkgRecords(pkg_paths): + pkg_dict = {} + + for pkg in pkg_paths: + with open(pkg, 'r') as file: + data = json.load(file) + obj = PkgObj(data) + + if obj.xalt_run_uuid in pkg_dict: + pkg_dict[obj.xalt_run_uuid].append(obj) + else: + pkg_dict[obj.xalt_run_uuid] = [obj] # Initialize as an array to bundle pkgs for the same run record + + return pkg_dict + + +def ingestLinkRecords(link_paths): + link_dict = {} + for link in link_paths: + print(link) + with open(link, 'r') as file: + data = json.load(file) + obj = LinkObj(data) + + if obj.xalt_run_uuid in link_dict: + link_dict[obj.xalt_run_uuid].append(obj) + else: + link_dict[obj.xalt_run_uuid] = [obj] # Initialize as an array to bundle links for the same run record + + return link_dict + +def ingestRunRecords(run_paths): + run_dict = {} + + for run in run_paths: + with open(run, 'r') as file: + data = json.load(file) + obj = RunObj(data) + + if obj.xalt_run_uuid in run_dict: + run_dict[obj.xalt_run_uuid].append(obj) + else: + run_dict[obj.xalt_run_uuid] = [obj] # Initialize as an array to bundle runs for the same run record + + return run_dict + + +def main(): + +# log_root_directory = 
os.environ.get("XALT_LOG_FILE_DIR") +# if not log_root_directory : +# print() +# print("WARNING: XALT_LOG_FILE_DIR unset! Nowhere to read files from.") +# print("aborting") +# print() +# exit() + + XALT_ETC_DIR = os.environ.get("XALT_ETC_DIR") + if not XALT_ETC_DIR : + print() + print("WARNING: XALT_ETC_DIR unset! Without that we have no configuration.") + print("aborting") + print() + exit() + + Configfilename = os.path.join(XALT_ETC_DIR,"xalt_db.conf") + + print("config filename:") + print(Configfilename) + print("filename done") + + print("about to read config file") + config = configparser.ConfigParser() + config.read(Configfilename) + print("configuration read; printing sections: ") + config_sections_list=config.sections() + n_sections=len(config_sections_list) + print(f"The config file contained {n_sections} non-default sections.") + + + + +# # Validate command-line arguments +# if len(sys.argv) < 5: +# print("Usage: script.py ") +# sys.exit(1) + + # Parse command-line arguments +# username, password, hostname, dbname = sys.argv[1:5] + +# # Fetch log files + logfiles = createLogList() + print(f"Found {len(logfiles)} log files to process.") +# print("here is the log file list:") +# print(logfiles) +# print("finished log file list") + + # # Perform operations with `conn` and `logfiles` as needed. + + run, link, pkg = splitLogs(logfiles) + # Establish database connection + conn = connectDB(config) + + if conn.is_connected(): + print("we're connected to database!") + else: + print("we are NOT connected. :-(") + +# print("regardless, exit to test.") +# sys.exit(1) + + + link_dict = ingestLinkRecords(link) + pkg_dict = ingestPkgRecords(pkg) + run_dict = ingestRunRecords(run) + + + + """ + Now we can index into records using the RUN UUID. 
+ The general ingestion workflow can go like this + """ + + + + print('about to run dictionary upload of Run objects') + for run_key,run_obj in run_dict.items(): + print(f"******* RUN using key {run_key}") + #print(run_obj) + print('run object is of type') + print(type(run_obj)) + print('finish type') + for run_subobject in run_obj: + print('run subobject is of type') + print(type(run_subobject)) + print('finish subobject; now print') + print(run_subobject) + print('done printing subobject') + +# serialized_run_obj=json.loads(run_obj) +# print(json.dumps(serialized_run_obj,indent=5)) + print("about to write to DB") + run_obj[0].writeToDB(conn) + + + + print('done uploading RUN dictionary') + + print('finished test run on RUN dictionary') + + + print('about to run dictionary upload of Pkg objects') + for pkg_key,pkg_obj in pkg_dict.items(): + print(f"******* PKG using key {run_key}") + #print(run_obj) + print('pkg object is of type') + print(type(pkg_obj)) + print('finish type') + for pkg_subobject in pkg_obj: + print('run subobject is of type') + print(type(pkg_subobject)) + print('finish subobject; now print') + print(pkg_subobject) + print('done printing subobject') + +# serialized_run_obj=json.loads(run_obj) +# print(json.dumps(serialized_run_obj,indent=5)) + print("about to write to DB") + pkg_obj[0].writeToDB(conn) + + + + print('done uploading PKG dictionary') + + print('finished test run on PKG dictionary') + + +# print('about to run dictionary upload of Package objects') +# for pkg_key,pkg_obj in pkg_dict.items(): +# print(f"using key {pkg_key}") +# print(pkg_obj) +# print("about to write to DB") +# pkg_obj[0].writeToDB(conn) +# print('done uploading PKG dictionary') +# print('about to run dictionary upload of Link objects') +# for link_key,link_obj in link_dict.items(): +# print(f"using key {link_key}") +# print(link_obj) +# print("about to write to DB") +# link_obj[0].writeToDB(conn) +# print ('done uploading LINK objects') + +if __name__ == "__main__": + 
main() diff --git a/ncsa_build/orm.py b/ncsa_build/orm.py index 20c2c05f..4d44cb4a 100644 --- a/ncsa_build/orm.py +++ b/ncsa_build/orm.py @@ -2,27 +2,70 @@ import os import re from pathlib import Path -import mariadb +try: + import configparser +except: + import ConfigParser as configparser +#import mariadb import json +<<<<<<< Updated upstream:ncsa_build/orm.py from ClassHelper import PkgObj, LinkObj, RunObj +======= +from DeltaUploadClasses import PkgObj, LinkObj, RunObj +import mysql.connector +from mysql.connector import Error +>>>>>>> Stashed changes:ncsa_build/delta_upload_xalt_records.py -def connectDB(username, password, hostname, dbname): + +def connectDB(my_config): """ Connect to the MariaDB database using provided credentials. """ try: - conn = mariadb.connect( - user=username, - password=password, - host=hostname, - port=3306, # Use integer here - database=dbname - ) - print("MariaDB connection established.") - return conn - except mariadb.Error as e: - print(f"Error connecting to MariaDB: {e}") - sys.exit(1) +# conn = mariadb.connect( +# user=username, +# password=password, +# host=hostname, +# port=3306, # Use integer here +# database=dbname +# ) +# my_config.print() + my_database=my_config.get("MYSQL","DB") + my_host=my_config.get("MYSQL","HOST") + my_user=my_config.get("MYSQL","USER") + my_password=my_config.get("MYSQL","PASSWD") + print(f'parsed config host=>>{my_host}<<') + print(f'parsed config user=>>{my_user}<<') + print(f'parsed config pwd=>>{my_password}<<') + print(f'parsed config db=>>{my_database}<<') + + + print('about to try DB connection.') + +# conn = mysql.connector.connect( +# my_host=my_config.get("MYSQL","HOST"), +# user=my_config.get("MYSQL","USER"), +# password=my_config.get("MYSQL","PASSWD"), +# database=my_config.get("MYSQL","DB") +# ) + conn = mysql.connector.connect( + host=my_host, + user=my_user, + password=my_password, + database=my_database, + use_pure=True + ) + print('back from connect call.') + print(conn) + + 
+ print("DB connection established.") + return conn + except Error as e: +# finally: + + #print(f"Error connecting to database: {e}") + print(f"Error connecting to database: {e}") + sys.exit(1) def createLogList(): """ @@ -109,23 +152,70 @@ def ingestRunRecords(run_paths): def main(): - # Validate command-line arguments - if len(sys.argv) < 5: - print("Usage: script.py ") - sys.exit(1) + +# log_root_directory = os.environ.get("XALT_LOG_FILE_DIR") +# if not log_root_directory : +# print() +# print("WARNING: XALT_LOG_FILE_DIR unset! Nowhere to read files from.") +# print("aborting") +# print() +# exit() + + XALT_ETC_DIR = os.environ.get("XALT_ETC_DIR") + if not XALT_ETC_DIR : + print() + print("WARNING: XALT_ETC_DIR unset! Without that we have no configuration.") + print("aborting") + print() + exit() + + Configfilename = os.path.join(XALT_ETC_DIR,"xalt_db.conf") + + print("config filename:") + print(Configfilename) + print("filename done") + + print("about to read config file") + config = configparser.ConfigParser() + config.read(Configfilename) + print("configuration read; printing sections: ") + config_sections_list=config.sections() + n_sections=len(config_sections_list) + print(f"The config file contained {n_sections} non-default sections.") + + + + +# # Validate command-line arguments +# if len(sys.argv) < 5: +# print("Usage: script.py ") +# sys.exit(1) # Parse command-line arguments - username, password, hostname, dbname = sys.argv[1:5] +# username, password, hostname, dbname = sys.argv[1:5] - # Fetch log files +# # Fetch log files logfiles = createLogList() print(f"Found {len(logfiles)} log files to process.") - # Perform operations with `conn` and `logfiles` as needed. +# print("here is the log file list:") +# print(logfiles) +# print("finished log file list") + + # # Perform operations with `conn` and `logfiles` as needed. 
run, link, pkg = splitLogs(logfiles) # Establish database connection - # conn = connectDB(username, password, hostname, dbname) + conn = connectDB(config) + + if conn.is_connected(): + print("we're connected to database!") + else: + print("we are NOT connected. :-(") + +# print("regardless, exit to test.") +# sys.exit(1) + link_dict = ingestLinkRecords(link) pkg_dict = ingestPkgRecords(pkg) run_dict = ingestRunRecords(run) @@ -137,5 +227,74 @@ def main(): The general ingestion workflow can go like this """ + + + print('about to run dictionary upload of Run objects') + for run_key,run_obj in run_dict.items(): + print(f"******* RUN using key {run_key}") + #print(run_obj) + print('run object is of type') + print(type(run_obj)) + print('finish type') + for run_subobject in run_obj: + print('run subobject is of type') + print(type(run_subobject)) + print('finish subobject; now print') + print(run_subobject) + print('done printing subobject') + +# serialized_run_obj=json.loads(run_obj) +# print(json.dumps(serialized_run_obj,indent=5)) + print("about to write to DB") + run_obj[0].writeToDB(conn) + + + + print('done uploading RUN dictionary') + + print('finished test run on RUN dictionary') + + + print('about to run dictionary upload of Pkg objects') + for pkg_key,pkg_obj in pkg_dict.items(): + print(f"******* PKG using key {pkg_key}") + #print(pkg_obj) + print('pkg object is of type') + print(type(pkg_obj)) + print('finish type') + for pkg_subobject in pkg_obj: + print('pkg subobject is of type') + print(type(pkg_subobject)) + print('finish subobject; now print') + print(pkg_subobject) + print('done printing subobject') + +# serialized_run_obj=json.loads(run_obj) +# print(json.dumps(serialized_run_obj,indent=5)) + print("about to write to DB") + pkg_obj[0].writeToDB(conn) + + + + print('done uploading PKG dictionary') + + print('finished test run on PKG dictionary') + + +# print('about to run dictionary upload of Package objects') +# for pkg_key,pkg_obj in 
pkg_dict.items(): +# print(f"using key {pkg_key}") +# print(pkg_obj) +# print("about to write to DB") +# pkg_obj[0].writeToDB(conn) +# print('done uploading PKG dictionary') +# print('about to run dictionary upload of Link objects') +# for link_key,link_obj in link_dict.items(): +# print(f"using key {link_key}") +# print(link_obj) +# print("about to write to DB") +# link_obj[0].writeToDB(conn) +# print ('done uploading LINK objects') + if __name__ == "__main__": main() diff --git a/ncsa_build/table_schema.sql b/ncsa_build/table_schema.sql index c422f1a3..40246443 100644 --- a/ncsa_build/table_schema.sql +++ b/ncsa_build/table_schema.sql @@ -177,12 +177,13 @@ CREATE TABLE IF NOT EXISTS `xalt_total_env` ( CREATE TABLE IF NOT EXISTS `xalt_pkg` ( `pkg_id` bigint(20) unsigned NOT NULL auto_increment, `run_id` int(11) unsigned NOT NULL, + `run_uuid` char(36) NOT NULL, `program` varchar(12) NOT NULL, `pkg_name` varchar(64) NOT NULL, `pkg_version` varchar(32) , `pkg_path` varchar(1024) , PRIMARY KEY (`pkg_id`), - FOREIGN KEY (`run_id`) REFERENCES `xalt_run`(`run_id`) + FOREIGN KEY (`run_uuid`) REFERENCES `xalt_run`(`run_uuid`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1