Skip to content

Commit 6f874ca

Browse files
authored
Merge pull request #6 from The-AI-Alliance/dataset-explorer
initial implementation of dataset explorer Python
2 parents 17aaf72 + 4ad22e5 commit 6f874ca

38 files changed

+1636
-48
lines changed

.github/workflows/test-code-code_profiler.yml

+15-9
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ on:
1212
tags:
1313
- "*"
1414
paths:
15+
- ".make.*"
16+
- "transforms/.make.transforms"
1517
- "transforms/code/code_profiler/**"
1618
- "data-processing-lib/**"
1719
- "!transforms/code/code_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow
@@ -26,6 +28,8 @@ on:
2628
- "dev"
2729
- "releases/**"
2830
paths:
31+
- ".make.*"
32+
- "transforms/.make.transforms"
2933
- "transforms/code/code_profiler/**"
3034
- "data-processing-lib/**"
3135
- "!transforms/code/code_profiler/**/kfp_ray/**" # This is/will be tested in separate workflow
@@ -36,6 +40,11 @@ on:
3640
- "!**/images/**"
3741
- "!**.gitignore"
3842

43+
# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
44+
concurrency:
45+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
46+
cancel-in-progress: true
47+
3948
jobs:
4049
check_if_push_image:
4150
# check whether the Docker images should be pushed to the remote repository
@@ -63,9 +72,8 @@ jobs:
6372
steps:
6473
- name: Checkout
6574
uses: actions/checkout@v4
66-
- name:
67-
Free up space in github runner
68-
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
75+
- name: Free up space in github runner
76+
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
6977
run: |
7078
df -h
7179
sudo rm -rf "/usr/local/share/boost"
@@ -90,9 +98,8 @@ jobs:
9098
steps:
9199
- name: Checkout
92100
uses: actions/checkout@v4
93-
- name:
94-
Free up space in github runner
95-
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
101+
- name: Free up space in github runner
102+
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
96103
run: |
97104
df -h
98105
sudo rm -rf /opt/ghc
@@ -111,9 +118,8 @@ jobs:
111118
else
112119
echo "transforms/code/code_profiler/Makefile not found - testing disabled for this transform."
113120
fi
114-
- name:
115-
Print space
116-
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
121+
- name: Print space
122+
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
117123
run: |
118124
df -h
119125
docker images
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
#
2+
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
3+
#
4+
name: Test - transforms/universal/hf_dataset_explorer
5+
6+
on:
7+
workflow_dispatch:
8+
push:
9+
branches:
10+
- "dev"
11+
- "releases/**"
12+
tags:
13+
- "*"
14+
paths:
15+
- ".make.*"
16+
- "transforms/.make.transforms"
17+
- "transforms/universal/hf_dataset_explorer/**"
18+
- "data-processing-lib/**"
19+
- "!transforms/universal/hf_dataset_explorer/**/kfp_ray/**" # This is/will be tested in separate workflow
20+
- "!data-processing-lib/**/test/**"
21+
- "!data-processing-lib/**/test-data/**"
22+
- "!**.md"
23+
- "!**/doc/**"
24+
- "!**/images/**"
25+
- "!**.gitignore"
26+
pull_request:
27+
branches:
28+
- "dev"
29+
- "releases/**"
30+
paths:
31+
- ".make.*"
32+
- "transforms/.make.transforms"
33+
- "transforms/universal/hf_dataset_explorer/**"
34+
- "data-processing-lib/**"
35+
- "!transforms/universal/hf_dataset_explorer/**/kfp_ray/**" # This is/will be tested in separate workflow
36+
- "!data-processing-lib/**/test/**"
37+
- "!data-processing-lib/**/test-data/**"
38+
- "!**.md"
39+
- "!**/doc/**"
40+
- "!**/images/**"
41+
- "!**.gitignore"
42+
43+
# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
44+
concurrency:
45+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
46+
cancel-in-progress: true
47+
48+
jobs:
49+
check_if_push_image:
50+
# check whether the Docker images should be pushed to the remote repository
51+
# The images are pushed if it is a merge to dev branch or a new tag is created.
52+
# The latter being part of the release process.
53+
# The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
54+
runs-on: ubuntu-22.04
55+
outputs:
56+
publish_images: ${{ steps.version.outputs.publish_images }}
57+
steps:
58+
- id: version
59+
run: |
60+
publish_images='false'
61+
if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
62+
then
63+
publish_images='true'
64+
fi
65+
if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
66+
then
67+
publish_images='true'
68+
fi
69+
echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
70+
test-src:
71+
runs-on: ubuntu-22.04
72+
steps:
73+
- name: Checkout
74+
uses: actions/checkout@v4
75+
- name: Free up space in github runner
76+
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
77+
run: |
78+
df -h
79+
sudo rm -rf "/usr/local/share/boost"
80+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
81+
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
82+
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
83+
df -h
84+
- name: Test transform source in transforms/universal/hf_dataset_explorer
85+
run: |
86+
if [ -e "transforms/universal/hf_dataset_explorer/Makefile" ]; then
87+
make -C transforms/universal/hf_dataset_explorer DOCKER=docker test-src
88+
else
89+
echo "transforms/universal/hf_dataset_explorer/Makefile not found - source testing disabled for this transform."
90+
fi
91+
test-image:
92+
needs: [check_if_push_image]
93+
runs-on: ubuntu-22.04
94+
timeout-minutes: 120
95+
env:
96+
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
97+
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
98+
steps:
99+
- name: Checkout
100+
uses: actions/checkout@v4
101+
- name: Free up space in github runner
102+
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
103+
run: |
104+
df -h
105+
sudo rm -rf /opt/ghc
106+
sudo rm -rf "/usr/local/share/boost"
107+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
108+
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
109+
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
110+
df -h
111+
- name: Test transform image in transforms/universal/hf_dataset_explorer
112+
run: |
113+
if [ -e "transforms/universal/hf_dataset_explorer/Makefile" ]; then
114+
if [ -d "transforms/universal/hf_dataset_explorer/spark" ]; then
115+
make -C data-processing-lib/spark DOCKER=docker image
116+
fi
117+
make -C transforms/universal/hf_dataset_explorer DOCKER=docker test-image
118+
else
119+
echo "transforms/universal/hf_dataset_explorer/Makefile not found - testing disabled for this transform."
120+
fi
121+
- name: Print space
122+
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
123+
run: |
124+
df -h
125+
docker images
126+
- name: Publish images
127+
if: needs.check_if_push_image.outputs.publish_images == 'true'
128+
run: |
129+
if [ -e "transforms/universal/hf_dataset_explorer/Makefile" ]; then
130+
make -C transforms/universal/hf_dataset_explorer publish
131+
else
132+
echo "transforms/universal/hf_dataset_explorer/Makefile not found - publishing disabled for this transform."
133+
fi

data-processing-lib/python/src/data_processing/data_access/data_access_hf.py

+21-4
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
# limitations under the License.
1111
################################################################################
1212
import json
13-
from typing import Any
13+
from typing import Any, Iterable, Union
1414

1515
import pyarrow as pa
1616
from data_processing.data_access import DataAccess
1717
from data_processing.utils import TransformUtils, UnrecoverableException, get_logger
18-
from huggingface_hub import HfFileSystem, RepoCard
18+
from huggingface_hub import DatasetInfo, HfApi, HfFileSystem, RepoCard
1919
from huggingface_hub.errors import EntryNotFoundError
2020

2121

@@ -71,6 +71,7 @@ def __init__(
7171
hf_token = None
7272
self.hf_config = hf_config
7373
self.fs = HfFileSystem(token=hf_token)
74+
self.apis = HfApi()
7475

7576
logger.debug(f"hf input folder: {self.input_folder}")
7677
logger.debug(f"hf output folder: {self.output_folder}")
@@ -223,7 +224,7 @@ def get_file(self, path: str) -> tuple[bytes, int]:
223224
return f.read(), 0
224225
except Exception as e:
225226
logger.error(f"Error reading file {path}: {e}")
226-
raise e
227+
return None, 0
227228

228229
def save_file(self, path: str, data: bytes) -> tuple[dict[str, Any], int]:
229230
"""
@@ -250,6 +251,14 @@ def save_file(self, path: str, data: bytes) -> tuple[dict[str, Any], int]:
250251
logger.error(f"Error saving bytes to file {path}: {e}")
251252
return None, 0
252253

254+
def readme_to_repocard(self, content: str) -> RepoCard:
255+
"""
256+
Convert readme file to data card
257+
:param content: readme content
258+
:return: data card
259+
"""
260+
return RepoCard(content=content)
261+
253262
def get_dataset_card(self, ds_name: str) -> RepoCard:
254263
"""
255264
Get the Repo card for the data set
@@ -264,7 +273,7 @@ def get_dataset_card(self, ds_name: str) -> RepoCard:
264273
# read README file
265274
with self.fs.open(path=path, mode="r", newline="", encoding="utf-8") as f:
266275
data = f.read()
267-
return RepoCard(content=data)
276+
return self.readme_to_repocard(content=data)
268277

269278
def update_data_set_card(self, ds_name: str, content: str) -> None:
270279
"""
@@ -292,3 +301,11 @@ def update_data_set_card(self, ds_name: str, content: str) -> None:
292301
# write new Readme file
293302
with self.fs.open(path=path, mode="w", newline="", encoding="utf-8") as f:
294303
f.write(content)
304+
305+
def get_datasets_list(self, filter: Union[str, Iterable[str], None] = None) -> Iterable[DatasetInfo]:
306+
"""
307+
List datasets hosted on the Huggingface Hub, given some filters.
308+
:param filter: a string or list of string to filter datasets on the hub.
309+
:return: Iterator of data set info
310+
"""
311+
return self.apis.list_datasets(filter=filter)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
REPOROOT=../../..
2+
# Use make help, to see the available rules
3+
include $(REPOROOT)/.make.defaults
4+
5+
setup::
6+
@# Help: Recursively make $@ all subdirs
7+
$(MAKE) RULE=$@ .recurse
8+
9+
clean::
10+
@# Help: Recursively make $@ all subdirs
11+
$(MAKE) RULE=$@ .recurse
12+
13+
build::
14+
@# Help: Recursively make $@ in subdirs
15+
$(MAKE) RULE=$@ .recurse
16+
venv::
17+
@# Help: Recursively make $@ in subdirs
18+
$(MAKE) RULE=$@ .recurse
19+
20+
image::
21+
@# Help: Recursively make $@ in all subdirs
22+
@$(MAKE) RULE=$@ .recurse
23+
24+
set-versions:
25+
@# Help: Recursively $@ in all subdirs
26+
@$(MAKE) RULE=$@ .recurse
27+
28+
publish::
29+
@# Help: Recursively make $@ in all subdirs
30+
@$(MAKE) RULE=$@ .recurse
31+
32+
test-image::
33+
@# Help: Recursively make $@ in all subdirs
34+
@$(MAKE) RULE=$@ .recurse
35+
36+
test::
37+
@# Help: Recursively make $@ in all subdirs
38+
@$(MAKE) RULE=$@ .recurse
39+
40+
test-src::
41+
@# Help: Recursively make $@ in all subdirs
42+
$(MAKE) RULE=$@ .recurse
43+
44+
kind-load-image::
45+
@# Help: Recursively make $@ in all subdirs
46+
$(MAKE) RULE=$@ .recurse
47+
48+
docker-load-image::
49+
@# Help: Recursively make $@ in all subdirs
50+
$(MAKE) RULE=$@ .recurse
51+
52+
docker-save-image::
53+
@# Help: Recursively make $@ in all subdirs
54+
$(MAKE) RULE=$@ .recurse
55+
56+
.PHONY: workflow-venv
57+
workflow-venv:
58+
if [ -e kfp_ray ]; then \
59+
$(MAKE) -C kfp_ray workflow-venv; \
60+
fi
61+
62+
.PHONY: workflow-test
63+
workflow-test:
64+
if [ -e kfp_ray ]; then \
65+
$(MAKE) -C kfp_ray workflow-test; \
66+
fi
67+
68+
.PHONY: workflow-upload
69+
workflow-upload:
70+
if [ -e kfp_ray ]; then \
71+
$(MAKE) -C kfp_ray workflow-upload; \
72+
fi
73+
74+
.PHONY: workflow-build
75+
workflow-build:
76+
if [ -e kfp_ray ]; then \
77+
$(MAKE) -C kfp_ray workflow-build; \
78+
fi
79+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# HF Dataset Explorer Transform
2+
The HF dataset explorer transform iterates through the datasets, rather than a list of
3+
files, reading the Readme file and extracting the Dataset card. At the moment it only extracts the license
4+
information from the card, but can be extended to extract additional information.
5+
Per the set of
6+
[transform project conventions](../../README.md#transform-project-conventions)
7+
the following runtimes are available:
8+
9+
* [python](python/README.md) - provides the base python-based transformation
10+
implementation.
11+
* [ray](ray/README.md) - enables the running of the base python transformation
12+
in a Ray runtime
13+
* [spark](spark/README.md) - enables the running of a spark-based transformation
14+
in a Spark runtime.
15+
* [kfp](kfp_ray/README.md) - enables running the ray docker image
16+
in a kubernetes cluster using a generated `yaml` file.
17+
18+
## Summary
19+
20+
This transform goes through all of the HF datasets, extracting each dataset card. It further extracts
21+
the value of the `license`. As a result it publishes a metadata file, containing counts and
22+
the percentage of every license (name) across all datasets.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
venv/

0 commit comments

Comments
 (0)