Skip to content

Commit e596d70

Browse files
authored
Merge pull request #565 from mapping-commons/report-prefix-issues
Extend parse_sssom_table to report wrong prefixes and metadata
2 parents e625411 + 34856f1 commit e596d70

File tree

3 files changed

+165
-0
lines changed

3 files changed

+165
-0
lines changed

src/sssom/parsers.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,98 @@ def _get_seperator_symbol_from_file_path(file):
182182
return None
183183

184184

185+
def _is_check_valid_extension_slot(slot_name, meta):
186+
extension_definitions = meta.get("extension_definitions", [])
187+
return any(
188+
"property" in entry and entry.get("slot_name") == slot_name
189+
for entry in extension_definitions
190+
)
191+
192+
193+
def _is_irregular_metadata(metadata_list: List[Dict]):
    """
    Report whether any metadata dictionary contains an undeclared, non-standard key.

    A key is accepted if it is one of the mapping set slots of the SSSOM schema
    object, or if the same dictionary declares it through
    _is_check_valid_extension_slot. Every other key is logged as a warning.

    Args:
        metadata_list (List[Dict]): The metadata dictionaries to inspect.

    Returns:
        bool: True if at least one irregular key was found, False otherwise.
            Note the polarity: True signals a problem, not validity.
    """
    # The set of standard slots does not depend on the key being checked,
    # so it is looked up once instead of once per key.
    standard_slots = _get_sssom_schema_object().mapping_set_slots
    fail_metadata = False
    for m in metadata_list:
        for key in m:
            if key in standard_slots or _is_check_valid_extension_slot(key, m):
                continue
            logging.warning(
                f"Metadata key '{key}' is not a standard SSSOM mapping set metadata field. See "
                f"https://mapping-commons.github.io/sssom/spec-model/#non-standard-slots on how to "
                f"specify additional, non-standard fields in a SSSOM file."
            )
            fail_metadata = True
    return fail_metadata
206+
207+
208+
def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):
    """
    Warn about provided prefix maps that conflict with the built-in prefix map.

    Two kinds of conflict are reported for each provided prefix map:
    a built-in prefix bound to a different URI expansion, and a built-in URI
    expansion bound to a different prefix.

    Args:
        sssom_metadata (dict): Metadata that may carry a CURIE_MAP entry.
        meta (dict): Additional metadata that may carry a CURIE_MAP entry.
        prefix_map: A prefix map in any form accepted by ensure_converter.

    Returns:
        bool: True if no conflict was found, False otherwise.
    """
    # There are three ways in which prefixes can be communicated, so we will check all of them.
    # This is a bit overly draconian, as in the end, only the highest priority one gets picked.
    # But since this only constitutes a (logging) warning, I think it's worth reporting.
    builtin_converter = _get_built_in_prefix_map()
    sssom_metadata_converter = _get_converter_pop_replace_curie_map(sssom_metadata)
    meta_converter = _get_converter_pop_replace_curie_map(meta)
    prefix_map_converter = ensure_converter(prefix_map, use_defaults=False)
    is_valid_prefixes = True

    for converter in [sssom_metadata_converter, meta_converter, prefix_map_converter]:
        provided_bimap = converter.bimap
        # The reverse lookup must be built from the *provided* map: reversing the
        # built-in map would compare it against itself and never find a conflict.
        # NOTE during refactor replace the following line by https://github.com/biopragmatics/curies/pull/136
        reverse_bimap = {value: key for key, value in provided_bimap.items()}
        for builtin_prefix, builtin_uri in builtin_converter.bimap.items():
            if builtin_prefix in provided_bimap:
                if builtin_uri != provided_bimap[builtin_prefix]:
                    logging.warning(
                        f"A built-in prefix ({builtin_prefix}) was provided, "
                        f"but the provided URI expansion ({provided_bimap[builtin_prefix]}) does not correspond "
                        f"to the required URI expansion: {builtin_uri}. The prefix will be ignored."
                    )
                    is_valid_prefixes = False
            if builtin_uri in reverse_bimap:
                if builtin_prefix != reverse_bimap[builtin_uri]:
                    logging.warning(
                        f"A built-in URI namespace ({builtin_uri}) was used in (one of) the provided prefix map(s), "
                        f"but the provided prefix ({reverse_bimap[builtin_uri]}) does not correspond to the "
                        f"standard prefix: {builtin_prefix}. The prefix will be ignored."
                    )
                    is_valid_prefixes = False
    return is_valid_prefixes
240+
241+
242+
def _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata):
243+
report = ""
244+
if not is_valid_built_in_prefixes:
245+
report += "STRONG WARNING: The prefix map provided contains built-in prefixes that were redefined.+\n"
246+
if not is_valid_metadata:
247+
report += (
248+
"STRONG WARNING: The metadata provided contains non-standard and undefined metadata.+\n"
249+
)
250+
251+
if report:
252+
raise ValueError(report)
253+
254+
255+
def _get_converter_pop_replace_curie_map(sssom_metadata):
    """
    Build a Converter from the CURIE_MAP entry of sssom_metadata.

    The dictionary is only read, never modified: a missing, empty or None
    CURIE_MAP entry is treated as an empty prefix map and left as it is.

    Args:
        sssom_metadata (dict): The metadata dictionary.

    Returns:
        Converter: A Converter object created from the CURIE_MAP.
    """
    # Reading instead of popping and re-inserting keeps the caller's dictionary
    # intact even when the entry is empty or the conversion below raises.
    curie_map = sssom_metadata.get(CURIE_MAP) or {}
    return Converter.from_prefix_map(curie_map)
275+
276+
185277
def parse_sssom_table(
186278
file_path: Union[str, Path, TextIO],
187279
prefix_map: ConverterHint = None,
@@ -197,6 +289,12 @@ def parse_sssom_table(
197289
if meta is None:
198290
meta = {}
199291

292+
is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
293+
is_valid_metadata = _is_irregular_metadata([sssom_metadata, meta])
294+
295+
if kwargs.get("strict"):
296+
_fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)
297+
200298
# The priority order for combining prefix maps are:
201299
# 1. Built-in prefix map
202300
# 2. Internal prefix map inside the document

tests/data/basic_strict_fail.tsv

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# curie_map:
2+
# HP: http://purl.obolibrary.org/obo/HP_
3+
# MP: http://purl.obolibrary.org/obo/MP_
4+
# owl: http://www.w3.org/2002/07/owl#
5+
# rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
6+
# rdfs: http://www.w3.org/2000/01/rdf-schema_fail#
7+
# semapv: https://w3id.org/semapv/vocab/
8+
# skos: http://www.w3.org/2004/02/skos/core#
9+
# sssom: https://w3id.org/sssom/
10+
# license_fail: https://creativecommons.org/publicdomain/zero/1.0/
11+
# mapping_provider: http://purl.obolibrary.org/obo/upheno.owl
12+
# mapping_set_id: https://w3id.org/sssom/mappings/27f85fe9-8a72-4e76-909b-7ba4244d9ede
13+
subject_id subject_label predicate_id object_id object_label mapping_fail_justification
14+
HP:0000175 Cleft palate skos:exactMatch MP:0000111 cleft palate semapv:LexicalMatching
15+
HP:0000252 Microcephaly skos:exactMatch MP:0000433 microcephaly semapv:LexicalMatching

tests/test_parsers.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,3 +447,55 @@ def test_round_trip_rdf(self):
447447
def test_round_trip_tsv(self):
448448
"""Test writing then reading TSV."""
449449
self._basic_round_trip("tsv")
450+
451+
def test_strict_parsing(self):
    """Test Strict parsing mode."""
    input_path = f"{test_data_dir}/basic_strict_fail.tsv"
    with open(input_path, "r") as file:
        input_string = file.read()

    # Each parse gets its own stream, so the non-strict parse below does not
    # depend on where the failed strict parse left the stream position.
    with self.assertRaises(ValueError):
        parse_sssom_table(io.StringIO(input_string), strict=True)

    # Make sure it parses in non-strict mode
    msdf = parse_sssom_table(io.StringIO(input_string))
    self.assertEqual(len(msdf.df), 2)
464+
465+
def test_check_irregular_metadata(self):
    """Test if irregular metadata check works according to https://w3id.org/sssom/spec."""
    from sssom.parsers import _is_check_valid_extension_slot, _is_irregular_metadata

    # "ext_test" is used without any extension definition.
    undeclared_extension_meta = {
        "licenses": "http://licen.se",
        "mapping_set_id": "http://mapping.set/id1",
        "ext_test": "value",
    }
    # "ext_test" is defined, but the definition lacks the "property" key.
    extension_without_property_meta = {
        "license": "http://licen.se",
        "mapping_set_id": "http://mapping.set/id1",
        "ext_test": "value",
        "extension_definitions": [{"slot_name": "ext_test"}],
    }
    # "ext_test" is defined together with a property.
    well_formed_meta = {
        "license": "http://licen.se",
        "mapping_set_id": "http://mapping.set/id1",
        "ext_test": "value",
        "extension_definitions": [
            {"slot_name": "ext_test", "property": "skos:fantasyRelation"}
        ],
    }

    undeclared_is_irregular = _is_irregular_metadata([undeclared_extension_meta])
    extension_is_valid = _is_check_valid_extension_slot("ext_test", well_formed_meta)
    well_formed_is_irregular = _is_irregular_metadata([well_formed_meta])
    missing_property_is_irregular = _is_irregular_metadata(
        [extension_without_property_meta]
    )

    self.assertTrue(undeclared_is_irregular)
    self.assertTrue(missing_property_is_irregular)
    self.assertTrue(extension_is_valid)
    self.assertFalse(well_formed_is_irregular)

0 commit comments

Comments
 (0)