Skip to content

Commit 4c5bad6

Browse files
committed
Tested the case where no email is provided in the h5 file. Fixed scicat login to try a different method if the first try fails (there is a discrepancy between the deployed scicat instance and the latest scicatlive version 3.2.5). Including two very small h5 files (with and without email) in the examples/ folder for future tests.
1 parent f1d7a63 commit 4c5bad6

File tree

4 files changed

+120
-31
lines changed

4 files changed

+120
-31
lines changed

examples/tomo_scan_email.h5

14 MB
Binary file not shown.

examples/tomo_scan_no_email.h5

14 MB
Binary file not shown.

orchestration/flows/bl832/ingest_tomo832.py

Lines changed: 81 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def upload_raw_dataset(
130130
file_name = scicat_metadata.get("/measurement/sample/file_name")
131131
description = build_search_terms(file_name)
132132
appended_keywords = description.split()
133-
133+
logger.info(f"email: {scicat_metadata.get('/measurement/sample/experimenter/email')}")
134134
dataset = RawDataset(
135135
owner=scicat_metadata.get("/measurement/sample/experiment/pi") or "Unknown",
136136
contactEmail=clean_email(scicat_metadata.get("/measurement/sample/experimenter/email"))
@@ -266,14 +266,58 @@ def _get_data_sample(file, sample_size=10):
266266
return data_sample
267267

268268

269-
def clean_email(email: str):
270-
if email:
271-
if not email or email.upper() == "NONE":
272-
# this is a brutal case, but the beamline sometimes puts in "None" and
273-
# the new scicat backend hates that.
274-
return UNKNOWN_EMAIL
275-
return email.replace(" ", "").replace(",", "").replace("'", "")
276-
return None
269+
def clean_email(email: object) -> str:
    """
    Clean the provided email address.

    Surrounding whitespace is stripped, and stray spaces, commas and
    single quotes are removed from the value. The default unknown email
    (``UNKNOWN_EMAIL``) is returned if:
    - The input is not a string,
    - The input is empty after cleaning,
    - The input equals "NONE" (case-insensitive), or
    - The input does not contain an "@" symbol.

    Parameters
    ----------
    email : object
        The raw email value extracted from metadata. Any type is accepted;
        non-string values fall back to the default.

    Returns
    -------
    str
        The cleaned email address if usable, otherwise ``UNKNOWN_EMAIL``.

    Example
    -------
    >>> clean_email(" user@example.com ")
    'user@example.com'
    >>> clean_email("'user@example.com',")
    'user@example.com'
    """
    # Anything that is not a string (None, numbers, ...) cannot be an email.
    if not isinstance(email, str):
        logger.info(f"Input email is not a string. Returning {UNKNOWN_EMAIL}")
        return UNKNOWN_EMAIL

    # Strip surrounding whitespace, then drop characters that are never valid
    # here: inner spaces plus the commas and single quotes that can leak in
    # from the raw metadata value.
    cleaned = email.strip().replace(" ", "").replace(",", "").replace("'", "")

    # Fall back if nothing usable is left, the value is the literal "None",
    # or it has no "@" and therefore cannot be an address.
    if not cleaned or cleaned.upper() == "NONE" or "@" not in cleaned:
        logger.info(f"Invalid email address. Returning {UNKNOWN_EMAIL}")
        return UNKNOWN_EMAIL

    return cleaned
277321

278322

279323
scicat_metadata_keys = [
@@ -351,16 +395,32 @@ def clean_email(email: str):
351395
]
352396

353397

398+
def test_ingest_raw_tomo() -> bool:
    """
    Run a SciCat ingest of the bundled example scan that has no email.

    Calls ``ingest_dataset`` with the example file and this module's
    ingestor path, printing progress and any failure to stdout.

    Returns
    -------
    bool
        True if ``ingest_dataset`` returned without raising, False otherwise.
    """
    # Imported here rather than at module level, as in the original.
    from orchestration.flows.scicat.ingest import ingest_dataset

    ingestor_module = "orchestration.flows.bl832.ingest_tomo832"
    example_file = "examples/tomo_scan_no_email.h5"
    print(f"Ingesting {example_file} with {ingestor_module}")

    try:
        ingest_dataset(example_file, ingestor_module)
    except Exception as err:
        # Report the failure instead of propagating so the caller gets a bool.
        print(f"SciCat ingest failed with {err}")
        return False
    return True
409+
410+
354411
if __name__ == "__main__":
355-
ingest(
356-
ScicatClient(
357-
# "http://localhost:3000/api/v3",
358-
os.environ.get("SCICAT_API_URL"),
359-
None,
360-
os.environ.get("SCICAT_INGEST_USER"),
361-
os.environ.get("SCICAT_INGEST_PASSWORD"),
362-
),
363-
"/Users/dylanmcreynolds/data/beamlines/8.3.2/raw/20231013_065251_MSB_Book1_Proj77_Cell3_Gen2_Li_R2G_FastCharge_DuringCharge0.h5",
364-
[],
365-
log_level="DEBUG",
366-
)
412+
# ingest(
413+
# ScicatClient(
414+
# # "http://localhost:3000/api/v3",
415+
# os.environ.get("SCICAT_API_URL"),
416+
# None,
417+
# os.environ.get("SCICAT_INGEST_USER"),
418+
# os.environ.get("SCICAT_INGEST_PASSWORD"),
419+
# ),
420+
# "/Users/dylanmcreynolds/data/beamlines/8.3.2/raw/"
421+
# "20231013_065251_MSB_Book1_Proj77_Cell3_Gen2_Li_R2G_FastCharge_DuringCharge0.h5",
422+
# [],
423+
# log_level="DEBUG",
424+
# )
425+
426+
test_ingest_raw_tomo()

orchestration/flows/scicat/ingest.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
import os
33
from typing import List
44

5-
from pyscicat.client import from_credentials
5+
from pyscicat.client import ScicatClient, from_credentials
66
from prefect import flow, task, get_run_logger
77

88
from orchestration.flows.scicat.utils import Issue
99

10+
1011
@flow(name="scicat_dataset_ingest")
1112
def ingest_dataset(file_path: str, ingestor: str):
1213
""" Ingest a file into SciCat.
@@ -23,7 +24,7 @@ def ingest_dataset(file_path: str, ingestor: str):
2324

2425
@task(name="ingest_scicat")
2526
def ingest_dataset_task(file_path: str, ingestor_module: str):
26-
""" Ingest a file into SciCat.
27+
""" Ingest a file into SciCat.
2728
2829
Parameters
2930
----------
@@ -33,14 +34,15 @@ def ingest_dataset_task(file_path: str, ingestor_module: str):
3334
The python module that contains the ingest function, e.g. "foo.bar.ingestor"
3435
"""
3536
logger = get_run_logger()
37+
import dotenv
38+
dotenv.load_dotenv()
3639
SCICAT_API_URL = os.getenv("SCICAT_API_URL")
3740
SCICAT_INGEST_USER = os.getenv("SCICAT_INGEST_USER")
3841
SCICAT_INGEST_PASSWORD = os.getenv("SCICAT_INGEST_PASSWORD")
3942

43+
# files come in with the full path on the server that they
44+
# were loaded from.
4045

41-
# files come in with the full pasth on the server that they
42-
# were loaded from.
43-
4446
# relative path: raw/...
4547
# ingestor api maps /global/cfs/cdirs/als/data_mover to /data_mover
4648
# so we want to prepend /data_mover/8.3.2
@@ -50,10 +52,37 @@ def ingest_dataset_task(file_path: str, ingestor_module: str):
5052
logger.info(
5153
f"Sending ingest job to {SCICAT_API_URL} for file {file_path}"
5254
)
53-
scicat_client = from_credentials(
54-
SCICAT_API_URL,
55-
SCICAT_INGEST_USER,
56-
SCICAT_INGEST_PASSWORD)
55+
try:
56+
scicat_client = from_credentials(
57+
SCICAT_API_URL,
58+
SCICAT_INGEST_USER,
59+
SCICAT_INGEST_PASSWORD)
60+
except Exception as e:
61+
logger.warning(f"Failed to create SciCat client using pyscicat method: {e}")
62+
63+
try:
64+
import requests
65+
from urllib.parse import urljoin
66+
67+
url = urljoin(SCICAT_API_URL, "auth/login")
68+
logger.info(url)
69+
response = requests.post(
70+
url=url,
71+
json={"username": SCICAT_INGEST_USER, "password": SCICAT_INGEST_PASSWORD},
72+
stream=False,
73+
verify=True,
74+
)
75+
logger.info(f"Login response: {response.json()}")
76+
scicat_client = ScicatClient(SCICAT_API_URL, response.json()["access_token"])
77+
logger.info("Logged in to SciCat.")
78+
79+
except requests.exceptions.RequestException as e:
80+
logger.error(f"Failed to log in to SciCat: {e}")
81+
raise e
82+
except Exception as e:
83+
logger.error(f"Failed to log in to SciCat: {e}")
84+
raise e
85+
5786
ingestor_module = importlib.import_module(ingestor_module)
5887
issues: List[Issue] = []
5988
new_dataset_id = ingestor_module.ingest(
@@ -74,4 +103,4 @@ def ingest_dataset_task(file_path: str, ingestor_module: str):
74103

75104
from dotenv import load_dotenv
76105
load_dotenv()
77-
ingest_dataset(sys.argv[1], sys.argv[2])
106+
ingest_dataset(sys.argv[1], sys.argv[2])

0 commit comments

Comments
 (0)