Skip to content
10 changes: 10 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/hex/hex.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ class HexSourceConfig(
default=DATAHUB_API_PAGE_SIZE_DEFAULT,
description="Number of items to fetch per DataHub API call.",
)
category_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Regex pattern for categories to filter in ingestion. This will exclude any project or component that has any category denied or not explicitly allowed.",
)

@model_validator(mode="before")
@classmethod
Expand Down Expand Up @@ -256,6 +260,12 @@ def get_report(self) -> HexReport:
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
with self.report.new_stage("Fetch Hex assets from Hex API"):
for project_or_component in self.hex_api.fetch_projects():
if project_or_component.categories and any(
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alokr-dhub where would you suggest putting this logic if it were extracted into a new function? I ask because keeping it here matches the pattern of the checks at lines 270 and 278, so I didn't find a natural place to put the function.
I could add it to the HexAPI class if you think that's a good spot for it.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, we can keep it here for now. Ideally, all the checks should be in a separate internal method in the Hex source class.

not self.source_config.category_pattern.allowed(c.name)
for c in project_or_component.categories
):
continue

if isinstance(project_or_component, Project):
if self.source_config.project_title_pattern.allowed(
project_or_component.title
Expand Down
11 changes: 11 additions & 0 deletions metadata-ingestion/tests/unit/hex/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import json
import os
from pathlib import Path


def load_json_data(filename):
    """Load and parse a JSON fixture from the ``test_data`` directory.

    The directory is located relative to this module's own file, so the
    lookup does not depend on the current working directory.

    Args:
        filename: Name of the JSON file inside ``test_data``.

    Returns:
        The decoded JSON document, exactly as ``json.load`` produces it.

    Raises:
        FileNotFoundError: If the requested fixture does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    test_dir = Path(os.path.dirname(os.path.abspath(__file__)))
    file_path = test_dir / "test_data" / filename
    # JSON is UTF-8 by specification; naming the encoding keeps the read
    # independent of the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
16 changes: 4 additions & 12 deletions metadata-ingestion/tests/unit/hex/test_api.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import json
import os
import unittest
from datetime import datetime, timezone
from pathlib import Path
from unittest.mock import MagicMock, patch

import requests
Expand All @@ -17,14 +14,7 @@
Component,
Project,
)


# Helper to load test data from JSON files
def load_json_data(filename):
    """Load and parse a JSON fixture from the ``test_data`` directory.

    The directory is located relative to this module's own file, so the
    lookup does not depend on the current working directory.

    Args:
        filename: Name of the JSON file inside ``test_data``.

    Returns:
        The decoded JSON document, exactly as ``json.load`` produces it.

    Raises:
        FileNotFoundError: If the requested fixture does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    test_dir = Path(os.path.dirname(os.path.abspath(__file__)))
    file_path = test_dir / "test_data" / filename
    # JSON is UTF-8 by specification; naming the encoding keeps the read
    # independent of the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
from tests.unit.hex.conftest import load_json_data


class TestHexAPI(unittest.TestCase):
Expand Down Expand Up @@ -52,7 +42,9 @@ def test_fetch_projects_pagination(self):

# Mock the session.get method after the session is created
with patch.object(
hex_api.session, "get", side_effect=[mock_response1, mock_response2]
hex_api.session,
"get",
side_effect=[mock_response1, mock_response2],
) as mock_get:
results = list(hex_api.fetch_projects())

Expand Down
212 changes: 212 additions & 0 deletions metadata-ingestion/tests/unit/hex/test_data/hex_projects_page3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
{
"values": [
{
"id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cg",
"title": "PlayNotebook",
"description": "This is a play project\n\n",
"type": "PROJECT",
"creator": {
"email": "user5@example.com"
},
"owner": {
"email": "user5@example.com"
},
"status": {
"name": "In development"
},
"categories": [
{
"name": "Scratchpad",
"description": "Personal scratchpad for a team member. Not intended for broad consumption"
}
],
"reviews": {
"required": false
},
"analytics": {
"appViews": {
"allTime": 16,
"lastSevenDays": 1,
"lastFourteenDays": 1,
"lastThirtyDays": 1
},
"lastViewedAt": "2025-03-17T14:55:34.717Z",
"publishedResultsUpdatedAt": "2025-03-18T12:11:34.907Z"
},
"lastEditedAt": "2024-12-18T08:38:03.873Z",
"lastPublishedAt": "2024-12-13T23:26:27.466Z",
"createdAt": "2024-12-03T09:54:58.471Z",
"archivedAt": null,
"trashedAt": null,
"schedules": [
{
"cadence": "WEEKLY",
"enabled": true,
"hourly": null,
"daily": null,
"weekly": {
"dayOfWeek": "TUESDAY",
"hour": 17,
"minute": 40,
"timezone": "+05:30"
},
"monthly": null,
"custom": null
}
],
"sharing": {
"users": [
{
"user": {
"email": "user4a@example.com"
},
"access": "FULL_ACCESS"
},
{
"user": {
"email": "user5@example.com"
},
"access": "FULL_ACCESS"
}
],
"collections": [],
"groups": [],
"workspace": {
"access": "NONE"
},
"publicWeb": {
"access": "NONE"
},
"support": {
"access": "NONE"
}
}
},
{
"id": "e8d7c5a3-2b4f-4e21-9823-1a3b5c7d9e0f",
"title": "Mixed Categories Project",
"description": "Project with both Scratchpad and Keep_Scratchpad categories",
"type": "PROJECT",
"creator": {
"email": "user6@example.com"
},
"owner": {
"email": "user6@example.com"
},
"status": {
"name": "Published"
},
"categories": [
{
"name": "Scratchpad",
"description": "Personal scratchpad for a team member. Not intended for broad consumption"
},
{
"name": "Keep_Scratchpad",
"description": "Intended for broad consumption"
}
],
"reviews": {
"required": false
},
"analytics": {
"appViews": {
"allTime": 25,
"lastSevenDays": 5,
"lastFourteenDays": 10,
"lastThirtyDays": 20
},
"lastViewedAt": "2025-03-20T10:30:00.000Z",
"publishedResultsUpdatedAt": "2025-03-21T12:00:00.000Z"
},
"lastEditedAt": "2024-12-20T14:30:00.000Z",
"lastPublishedAt": "2024-12-15T09:00:00.000Z",
"createdAt": "2024-12-05T11:00:00.000Z",
"archivedAt": null,
"trashedAt": null,
"schedules": [],
"sharing": {
"users": [
{
"user": {
"email": "user6@example.com"
},
"access": "FULL_ACCESS"
}
],
"collections": [],
"groups": [],
"workspace": {
"access": "NONE"
},
"publicWeb": {
"access": "NONE"
},
"support": {
"access": "NONE"
}
}
},
{
"id": "4759f33c-1ab9-403d-92e8-9bef48de00cg",
"title": "Cancelled Orders",
"description": null,
"type": "COMPONENT",
"creator": {
"email": "user4a@example.com"
},
"owner": {
"email": "user4a@example.com"
},
"status": null,
"categories": [{
"name": "Keep_Scratchpad",
"description": "Intended for broad consumption"
}],
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add more test cases with a mix of allowed and denied categories for more rigorous testing.

"reviews": {
"required": false
},
"analytics": {
"appViews": {
"allTime": 0,
"lastSevenDays": 0,
"lastFourteenDays": 0,
"lastThirtyDays": 0
},
"lastViewedAt": null,
"publishedResultsUpdatedAt": "2024-12-13T23:22:48.995Z"
},
"lastEditedAt": "2024-12-13T23:22:58.183Z",
"lastPublishedAt": "2024-12-13T23:22:58.189Z",
"createdAt": "2024-12-09T15:39:26.093Z",
"archivedAt": null,
"trashedAt": null,
"schedules": [],
"sharing": {
"users": [
{
"user": {
"email": "user4a@example.com"
},
"access": "FULL_ACCESS"
}
],
"collections": [],
"groups": [],
"workspace": {
"access": "NONE"
},
"publicWeb": {
"access": "NONE"
},
"support": {
"access": "NONE"
}
}
}
],
"pagination": {
"after": "djI6WyIyMDI0LTEyLTA5VDE1OjM5OjI2LjA5M1oiLCI0NzU5ZjMzYy0xYWI5LTQwM2QtOTJlOC05YmVmNDhkZTAwYzQiXQ",
"before": "djI6WyIyMDI0LTA4LTIxVDE3OjA3OjE5LjI3NVoiLCJjOGY4MTVjOC04OGMyLTRkZWEtOTgxZi02OWY1NDRkNjE2NWQiXQ"
}
}
Loading
Loading