Skip to content

Commit 7d2c718

Browse files
authored
feat: added catalog functions to list and search variables (#5)
* feat: added catalog functions to list and search variables
* chore: prepare next version
* feat: doc updated
* fix: search returned type
* chore: code review
1 parent 6f4a317 commit 7d2c718

File tree

5 files changed

+153
-63
lines changed

5 files changed

+153
-63
lines changed

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
install:
22
uv sync --all-extras
33

4+
update:
5+
rm -f uv.lock
6+
uv sync
7+
48
test:
59
uv run --all-extras pytest
610

datashield/api.py

Lines changed: 90 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def close(self, save: str = None) -> None:
140140
for conn in self.conns:
141141
try:
142142
if save:
143-
conn.save_workspace(f"{conn.name}:{save}")
143+
conn.save_workspace(f"{conn.get_name()}:{save}")
144144
conn.disconnect()
145145
except DSError:
146146
# silently fail
@@ -162,7 +162,7 @@ def get_connection_names(self) -> list[str]:
162162
:return: The list of opened connection names
163163
"""
164164
if self.conns:
165-
return [conn.name for conn in self.conns]
165+
return [conn.get_name() for conn in self.conns]
166166
else:
167167
return []
168168

@@ -194,7 +194,53 @@ def tables(self) -> dict:
194194
"""
195195
rval = {}
196196
for conn in self.conns:
197-
rval[conn.name] = conn.list_tables()
197+
rval[conn.get_name()] = conn.list_tables()
198+
return rval
199+
200+
def variables(self, table: str = None, tables: dict = None) -> dict:
201+
"""
202+
List available variables from the data repository, for a given table.
203+
204+
:param table: The default name of the table to list variables for
205+
:param tables: A mapping of server name to the name of the table to list variables for. For servers not present in this mapping, 'table' is used.
206+
:return: The available variables from the data repository, for a given table, per remote server name
207+
"""
208+
rval = {}
209+
for conn in self.conns:
210+
name = table
211+
if tables and conn.get_name() in tables:
212+
name = tables[conn.get_name()]
213+
if name:
214+
rval[conn.get_name()] = conn.list_table_variables(name)
215+
else:
216+
rval[conn.get_name()] = None
217+
return rval
218+
219+
def taxonomies(self) -> dict:
220+
"""
221+
List available taxonomies from the data repository. A taxonomy is a hierarchical structure of vocabulary
222+
terms that can be used to annotate variables in the data repository.
223+
Depending on the data repository's capabilities, taxonomies can be used to perform structured
224+
queries when searching for variables.
225+
226+
:return: The available taxonomies from the data repository, per remote server name
227+
"""
228+
rval = {}
229+
for conn in self.conns:
230+
rval[conn.get_name()] = conn.list_taxonomies()
231+
return rval
232+
233+
def search_variables(self, query: str) -> dict:
234+
"""
235+
Search for variable names matching a given query across all tables in the data repository.
236+
237+
:param query: The query to search for in variable names, e.g., a full-text search and/or structured
238+
query (based on taxonomy terms), depending on the data repository's capabilities
239+
:return: The matching variable names from the data repository, per remote server name
240+
"""
241+
rval = {}
242+
for conn in self.conns:
243+
rval[conn.get_name()] = conn.search_variables(query)
198244
return rval
199245

200246
def resources(self) -> dict:
@@ -205,7 +251,7 @@ def resources(self) -> dict:
205251
"""
206252
rval = {}
207253
for conn in self.conns:
208-
rval[conn.name] = conn.list_resources()
254+
rval[conn.get_name()] = conn.list_resources()
209255
return rval
210256

211257
def profiles(self) -> dict:
@@ -216,7 +262,7 @@ def profiles(self) -> dict:
216262
"""
217263
rval = {}
218264
for conn in self.conns:
219-
rval[conn.name] = conn.list_profiles()
265+
rval[conn.get_name()] = conn.list_profiles()
220266
return rval
221267

222268
def packages(self) -> dict:
@@ -227,7 +273,7 @@ def packages(self) -> dict:
227273
"""
228274
rval = {}
229275
for conn in self.conns:
230-
rval[conn.name] = conn.list_packages()
276+
rval[conn.get_name()] = conn.list_packages()
231277
return rval
232278

233279
def methods(self, type: str = "aggregate") -> dict:
@@ -239,7 +285,7 @@ def methods(self, type: str = "aggregate") -> dict:
239285
"""
240286
rval = {}
241287
for conn in self.conns:
242-
rval[conn.name] = conn.list_methods(type)
288+
rval[conn.get_name()] = conn.list_methods(type)
243289
return rval
244290

245291
#
@@ -254,7 +300,7 @@ def workspaces(self) -> dict:
254300
"""
255301
rval = {}
256302
for conn in self.conns:
257-
rval[conn.name] = conn.list_workspaces()
303+
rval[conn.get_name()] = conn.list_workspaces()
258304
return rval
259305

260306
def workspace_save(self, name: str) -> None:
@@ -264,7 +310,7 @@ def workspace_save(self, name: str) -> None:
264310
:param name: The name of the workspace
265311
"""
266312
for conn in self.conns:
267-
conn.save_workspace(f"{conn.name}:{name}")
313+
conn.save_workspace(f"{conn.get_name()}:{name}")
268314

269315
def workspace_restore(self, name: str) -> None:
270316
"""
@@ -274,7 +320,7 @@ def workspace_restore(self, name: str) -> None:
274320
:param name: The name of the workspace
275321
"""
276322
for conn in self.conns:
277-
conn.restore_workspace(f"{conn.name}:{name}")
323+
conn.restore_workspace(f"{conn.get_name()}:{name}")
278324

279325
def workspace_rm(self, name: str) -> None:
280326
"""
@@ -284,7 +330,7 @@ def workspace_rm(self, name: str) -> None:
284330
:param name: The name of the workspace
285331
"""
286332
for conn in self.conns:
287-
conn.rm_workspace(f"{conn.name}:{name}")
333+
conn.rm_workspace(f"{conn.get_name()}:{name}")
288334

289335
#
290336
# R session
@@ -321,17 +367,17 @@ def sessions(self) -> dict:
321367
if not conn.has_session():
322368
conn.start_session(asynchronous=True)
323369
except Exception as e:
324-
logging.warning(f"Failed to start session: {conn.name} - {e}")
325-
excluded_conns.append(conn.name)
370+
logging.warning(f"Failed to start session: {conn.get_name()} - {e}")
371+
excluded_conns.append(conn.get_name())
326372

327373
# check for session status and wait until all are started
328-
for conn in [c for c in self.conns if c.name not in excluded_conns]:
374+
for conn in [c for c in self.conns if c.get_name() not in excluded_conns]:
329375
try:
330376
if conn.is_session_started():
331-
started_conns.append(conn.name)
377+
started_conns.append(conn.get_name())
332378
except Exception as e:
333-
logging.warning(f"Failed to check session status: {conn.name} - {e}")
334-
excluded_conns.append(conn.name)
379+
logging.warning(f"Failed to check session status: {conn.get_name()} - {e}")
380+
excluded_conns.append(conn.get_name())
335381

336382
# wait until all sessions are started, excluding those that have failed to start or check status
337383
start_time = time.time()
@@ -340,23 +386,25 @@ def sessions(self) -> dict:
340386
raise DSError("Timed out waiting for R sessions to start")
341387
time.sleep(self.start_delay)
342388
remaining_conns = [
343-
conn for conn in self.conns if conn.name not in started_conns and conn.name not in excluded_conns
389+
conn
390+
for conn in self.conns
391+
if conn.get_name() not in started_conns and conn.get_name() not in excluded_conns
344392
]
345393
for conn in remaining_conns:
346394
try:
347395
if conn.is_session_started():
348-
started_conns.append(conn.name)
396+
started_conns.append(conn.get_name())
349397
except Exception as e:
350-
logging.warning(f"Failed to check session status: {conn.name} - {e}")
351-
excluded_conns.append(conn.name)
398+
logging.warning(f"Failed to check session status: {conn.get_name()} - {e}")
399+
excluded_conns.append(conn.get_name())
352400

353401
# at this point, all sessions that could be started have been started, and those that failed to start or check status have been excluded
354402
for conn in self.conns:
355-
if conn.name in started_conns:
356-
rval[conn.name] = conn.get_session()
403+
if conn.get_name() in started_conns:
404+
rval[conn.get_name()] = conn.get_session()
357405
if len(excluded_conns) > 0:
358406
logging.error(f"Some sessions have been excluded due to errors: {', '.join(excluded_conns)}")
359-
self.conns = [conn for conn in self.conns if conn.name not in excluded_conns]
407+
self.conns = [conn for conn in self.conns if conn.get_name() not in excluded_conns]
360408
if len(self.conns) == 0:
361409
raise DSError("No sessions could be started successfully.")
362410
return rval
@@ -372,10 +420,10 @@ def ls(self) -> dict:
372420
rval = {}
373421
for conn in self.conns:
374422
try:
375-
rval[conn.name] = conn.list_symbols()
423+
rval[conn.get_name()] = conn.list_symbols()
376424
except Exception as e:
377425
self._append_error(conn, e)
378-
rval[conn.name] = None
426+
rval[conn.get_name()] = None
379427
self._check_errors()
380428
return rval
381429

@@ -418,12 +466,12 @@ def assign_table(
418466
cmd = {}
419467
for conn in self.conns:
420468
name = table
421-
if tables and conn.name in tables:
422-
name = tables[conn.name]
469+
if tables and conn.get_name() in tables:
470+
name = tables[conn.get_name()]
423471
if name:
424472
try:
425473
res = conn.assign_table(symbol, name, variables, missings, identifiers, id_name, asynchronous)
426-
cmd[conn.name] = res
474+
cmd[conn.get_name()] = res
427475
except Exception as e:
428476
self._append_error(conn, e)
429477
self._do_wait(cmd)
@@ -445,12 +493,12 @@ def assign_resource(
445493
cmd = {}
446494
for conn in self.conns:
447495
name = resource
448-
if resources and conn.name in resources:
449-
name = resources[conn.name]
496+
if resources and conn.get_name() in resources:
497+
name = resources[conn.get_name()]
450498
if name:
451499
try:
452500
res = conn.assign_resource(symbol, name, asynchronous)
453-
cmd[conn.name] = res
501+
cmd[conn.get_name()] = res
454502
except Exception as e:
455503
self._append_error(conn, e)
456504
self._do_wait(cmd)
@@ -470,7 +518,7 @@ def assign_expr(self, symbol: str, expr: str, asynchronous: bool = True) -> None
470518
for conn in self.conns:
471519
try:
472520
res = conn.assign_expr(symbol, expr, asynchronous)
473-
cmd[conn.name] = res
521+
cmd[conn.get_name()] = res
474522
except Exception as e:
475523
self._append_error(conn, e)
476524
self._do_wait(cmd)
@@ -492,10 +540,10 @@ def aggregate(self, expr: str, asynchronous: bool = True) -> dict:
492540
for conn in self.conns:
493541
try:
494542
res = conn.aggregate(expr, asynchronous)
495-
cmd[conn.name] = res
543+
cmd[conn.get_name()] = res
496544
except Exception as e:
497545
self._append_error(conn, e)
498-
rval[conn.name] = None
546+
rval[conn.get_name()] = None
499547
rval = self._do_wait(cmd)
500548
self._check_errors()
501549
return rval
@@ -511,15 +559,15 @@ def _do_wait(self, cmd: dict) -> dict:
511559
rval = {}
512560
while cmd:
513561
for conn in self.conns:
514-
if conn.name in cmd:
515-
res = cmd[conn.name]
516-
# print(f"..checking {conn.name} -> {res.is_completed()}")
562+
if conn.get_name() in cmd:
563+
res = cmd[conn.get_name()]
564+
# print(f"..checking {conn.get_name()} -> {res.is_completed()}")
517565
if res.is_completed():
518566
try:
519-
rval[conn.name] = res.fetch()
567+
rval[conn.get_name()] = res.fetch()
520568
except Exception as e:
521569
self._append_error(conn, e)
522-
cmd.pop(conn.name, None)
570+
cmd.pop(conn.get_name(), None)
523571
else:
524572
conn.keep_alive()
525573
time.sleep(0.1)
@@ -535,8 +583,8 @@ def _append_error(self, conn: DSConnection, error: Exception) -> None:
535583
"""
536584
Append an error.
537585
"""
538-
logging.error(f"[{conn.name}] {error}")
539-
self.errors[conn.name] = error
586+
logging.error(f"[{conn.get_name()}] {error}")
587+
self.errors[conn.get_name()] = error
540588

541589
def _check_errors(self) -> None:
542590
"""

datashield/interface.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,14 @@ class DSConnection:
194194
Connection class to a DataSHIELD server.
195195
"""
196196

197+
def get_name(self) -> str:
198+
"""
199+
Get the name of the connection, which is typically the name of the server or data repository.
200+
201+
:return: The name of the connection
202+
"""
203+
raise NotImplementedError("DSConnection function not available")
204+
197205
#
198206
# Content listing
199207
#
@@ -215,6 +223,36 @@ def has_table(self, name: str) -> bool:
215223
"""
216224
raise NotImplementedError("DSConnection function not available")
217225

226+
def list_table_variables(self, table: str) -> list:
227+
"""
228+
List available variables for a given table from the data repository.
229+
230+
:param table: The name of the table to list variables for
231+
:return: The list of available variables for the given table
232+
"""
233+
raise NotImplementedError("DSConnection function not available")
234+
235+
def list_taxonomies(self) -> list:
236+
"""
237+
List available taxonomies from the data repository. A taxonomy is a hierarchical structure of vocabulary
238+
terms that can be used to annotate variables in the data repository.
239+
Depending on the data repository's capabilities, taxonomies can be used to perform structured
240+
queries when searching for variables.
241+
242+
:return: The list of available taxonomy names
243+
"""
244+
raise NotImplementedError("DSConnection function not available")
245+
246+
def search_variables(self, query: str) -> dict:
247+
"""
248+
Search for variable names matching a given query across all tables in the data repository.
249+
250+
:param query: The query to search for in variable names, e.g., a full-text search and/or structured
251+
query (based on taxonomy terms), depending on the data repository's capabilities
252+
:return: The search result for variables matching the given query across all tables
253+
"""
254+
raise NotImplementedError("DSConnection function not available")
255+
218256
def list_resources(self) -> list:
219257
"""
220258
List available resource names from the data repository.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "datashield"
3-
version = "0.3.0"
3+
version = "0.4.0"
44
description = "DataSHIELD Client Interface in Python."
55
authors = [
66
{name = "Yannick Marcon", email = "yannick.marcon@obiba.org"}

0 commit comments

Comments
 (0)