diff --git a/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py b/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py
index 4a6ef3af4af32e..218750ae64093c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/hex/hex.py
@@ -119,6 +119,10 @@ class HexSourceConfig(
         default=DATAHUB_API_PAGE_SIZE_DEFAULT,
         description="Number of items to fetch per DataHub API call.",
     )
+    category_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex pattern for categories to filter in ingestion. This will exclude any project or component that has any category denied, or that has categories but none of them explicitly allowed. Projects and components without categories are not filtered.",
+    )
 
     @model_validator(mode="before")
     @classmethod
@@ -256,6 +260,17 @@ def get_report(self) -> HexReport:
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         with self.report.new_stage("Fetch Hex assets from Hex API"):
             for project_or_component in self.hex_api.fetch_projects():
+                # Category filter: skip the asset when any of its categories is
+                # denied, or when none of its categories matches the allow list.
+                # Assets without categories are never skipped here.
+                category_pattern = self.source_config.category_pattern
+                names = [c.name for c in project_or_component.categories or []]
+                if names and (
+                    any(category_pattern.denied(n) for n in names)
+                    or not any(category_pattern.allowed(n) for n in names)
+                ):
+                    continue
+
                 if isinstance(project_or_component, Project):
                     if self.source_config.project_title_pattern.allowed(
                         project_or_component.title
diff --git a/metadata-ingestion/tests/unit/hex/conftest.py b/metadata-ingestion/tests/unit/hex/conftest.py
new file mode 100644
index 00000000000000..bedb803c09302c
--- /dev/null
+++ b/metadata-ingestion/tests/unit/hex/conftest.py
@@ -0,0 +1,11 @@
+import json
+import os
+from pathlib import Path
+
+
+def load_json_data(filename):
+    """Load test data from JSON files in the test_data directory."""
+    test_dir = Path(os.path.dirname(os.path.abspath(__file__)))
+    file_path = test_dir / "test_data" / filename
+    with open(file_path, "r") as f:
+        return json.load(f)
diff --git a/metadata-ingestion/tests/unit/hex/test_api.py b/metadata-ingestion/tests/unit/hex/test_api.py
index 5b83b01fc2d1be..97d5826864cd46
100644 --- a/metadata-ingestion/tests/unit/hex/test_api.py +++ b/metadata-ingestion/tests/unit/hex/test_api.py @@ -1,8 +1,5 @@ -import json -import os import unittest from datetime import datetime, timezone -from pathlib import Path from unittest.mock import MagicMock, patch import requests @@ -17,14 +14,7 @@ Component, Project, ) - - -# Helper to load test data from JSON files -def load_json_data(filename): - test_dir = Path(os.path.dirname(os.path.abspath(__file__))) - file_path = test_dir / "test_data" / filename - with open(file_path, "r") as f: - return json.load(f) +from tests.unit.hex.conftest import load_json_data class TestHexAPI(unittest.TestCase): @@ -52,7 +42,9 @@ def test_fetch_projects_pagination(self): # Mock the session.get method after the session is created with patch.object( - hex_api.session, "get", side_effect=[mock_response1, mock_response2] + hex_api.session, + "get", + side_effect=[mock_response1, mock_response2], ) as mock_get: results = list(hex_api.fetch_projects()) diff --git a/metadata-ingestion/tests/unit/hex/test_data/hex_projects_page3.json b/metadata-ingestion/tests/unit/hex/test_data/hex_projects_page3.json new file mode 100644 index 00000000000000..a1c89688931f44 --- /dev/null +++ b/metadata-ingestion/tests/unit/hex/test_data/hex_projects_page3.json @@ -0,0 +1,212 @@ +{ + "values": [ + { + "id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cg", + "title": "PlayNotebook", + "description": "This is a play project\n\n", + "type": "PROJECT", + "creator": { + "email": "user5@example.com" + }, + "owner": { + "email": "user5@example.com" + }, + "status": { + "name": "In development" + }, + "categories": [ + { + "name": "Scratchpad", + "description": "Personal scratchpad for a team member. 
Not intended for broad consumption" + } + ], + "reviews": { + "required": false + }, + "analytics": { + "appViews": { + "allTime": 16, + "lastSevenDays": 1, + "lastFourteenDays": 1, + "lastThirtyDays": 1 + }, + "lastViewedAt": "2025-03-17T14:55:34.717Z", + "publishedResultsUpdatedAt": "2025-03-18T12:11:34.907Z" + }, + "lastEditedAt": "2024-12-18T08:38:03.873Z", + "lastPublishedAt": "2024-12-13T23:26:27.466Z", + "createdAt": "2024-12-03T09:54:58.471Z", + "archivedAt": null, + "trashedAt": null, + "schedules": [ + { + "cadence": "WEEKLY", + "enabled": true, + "hourly": null, + "daily": null, + "weekly": { + "dayOfWeek": "TUESDAY", + "hour": 17, + "minute": 40, + "timezone": "+05:30" + }, + "monthly": null, + "custom": null + } + ], + "sharing": { + "users": [ + { + "user": { + "email": "user4a@example.com" + }, + "access": "FULL_ACCESS" + }, + { + "user": { + "email": "user5@example.com" + }, + "access": "FULL_ACCESS" + } + ], + "collections": [], + "groups": [], + "workspace": { + "access": "NONE" + }, + "publicWeb": { + "access": "NONE" + }, + "support": { + "access": "NONE" + } + } + }, + { + "id": "e8d7c5a3-2b4f-4e21-9823-1a3b5c7d9e0f", + "title": "Mixed Categories Project", + "description": "Project with both Scratchpad and Keep_Scratchpad categories", + "type": "PROJECT", + "creator": { + "email": "user6@example.com" + }, + "owner": { + "email": "user6@example.com" + }, + "status": { + "name": "Published" + }, + "categories": [ + { + "name": "Scratchpad", + "description": "Personal scratchpad for a team member. 
Not intended for broad consumption" + }, + { + "name": "Keep_Scratchpad", + "description": "Intended for broad consumption" + } + ], + "reviews": { + "required": false + }, + "analytics": { + "appViews": { + "allTime": 25, + "lastSevenDays": 5, + "lastFourteenDays": 10, + "lastThirtyDays": 20 + }, + "lastViewedAt": "2025-03-20T10:30:00.000Z", + "publishedResultsUpdatedAt": "2025-03-21T12:00:00.000Z" + }, + "lastEditedAt": "2024-12-20T14:30:00.000Z", + "lastPublishedAt": "2024-12-15T09:00:00.000Z", + "createdAt": "2024-12-05T11:00:00.000Z", + "archivedAt": null, + "trashedAt": null, + "schedules": [], + "sharing": { + "users": [ + { + "user": { + "email": "user6@example.com" + }, + "access": "FULL_ACCESS" + } + ], + "collections": [], + "groups": [], + "workspace": { + "access": "NONE" + }, + "publicWeb": { + "access": "NONE" + }, + "support": { + "access": "NONE" + } + } + }, + { + "id": "4759f33c-1ab9-403d-92e8-9bef48de00cg", + "title": "Cancelled Orders", + "description": null, + "type": "COMPONENT", + "creator": { + "email": "user4a@example.com" + }, + "owner": { + "email": "user4a@example.com" + }, + "status": null, + "categories": [{ + "name": "Keep_Scratchpad", + "description": "Intended for broad consumption" + }], + "reviews": { + "required": false + }, + "analytics": { + "appViews": { + "allTime": 0, + "lastSevenDays": 0, + "lastFourteenDays": 0, + "lastThirtyDays": 0 + }, + "lastViewedAt": null, + "publishedResultsUpdatedAt": "2024-12-13T23:22:48.995Z" + }, + "lastEditedAt": "2024-12-13T23:22:58.183Z", + "lastPublishedAt": "2024-12-13T23:22:58.189Z", + "createdAt": "2024-12-09T15:39:26.093Z", + "archivedAt": null, + "trashedAt": null, + "schedules": [], + "sharing": { + "users": [ + { + "user": { + "email": "user4a@example.com" + }, + "access": "FULL_ACCESS" + } + ], + "collections": [], + "groups": [], + "workspace": { + "access": "NONE" + }, + "publicWeb": { + "access": "NONE" + }, + "support": { + "access": "NONE" + } + } + } + ], + "pagination": { + 
"after": "djI6WyIyMDI0LTEyLTA5VDE1OjM5OjI2LjA5M1oiLCI0NzU5ZjMzYy0xYWI5LTQwM2QtOTJlOC05YmVmNDhkZTAwYzQiXQ", + "before": "djI6WyIyMDI0LTA4LTIxVDE3OjA3OjE5LjI3NVoiLCJjOGY4MTVjOC04OGMyLTRkZWEtOTgxZi02OWY1NDRkNjE2NWQiXQ" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/hex/test_hex.py b/metadata-ingestion/tests/unit/hex/test_hex.py index f3a03d69d6873d..62f6f57bb6c4ad 100644 --- a/metadata-ingestion/tests/unit/hex/test_hex.py +++ b/metadata-ingestion/tests/unit/hex/test_hex.py @@ -1,7 +1,10 @@ import unittest from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, patch -from datahub.ingestion.source.hex.hex import HexSourceConfig +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.hex.hex import HexSource, HexSourceConfig +from tests.unit.hex.conftest import load_json_data def datetime_approx_equal( @@ -155,3 +158,157 @@ def test_lineage_config(self): config.lineage_end_time, datetime.now(tz=timezone.utc) ) ) + + def test_category_pattern_filtering(self): + """Test that category_pattern filters projects/components correctly using page3_data""" + page3_data = load_json_data("hex_projects_page3.json") + + mock_response = MagicMock() + mock_response.json.return_value = page3_data + + # Configure to deny exact "Scratchpad" match but not "Keep_Scratchpad" + config = { + **self.minimum_input_config, + "category_pattern": { + "deny": ["^Scratchpad$"], + }, + "include_lineage": False, + } + + ctx = PipelineContext(run_id="test-category-filtering") + source = HexSource.create(config, ctx) + + # Mock the API to return page3_data + with patch.object(source.hex_api.session, "get", return_value=mock_response): + list(source.get_workunits_internal()) + + # Verify: project with "Scratchpad" category should be filtered out + assert "d73da67d-c87b-4dd8-9e7f-b79cb7f822cg" not in source.project_registry + + # Verify: project with both "Scratchpad" and "Keep_Scratchpad" should be filtered 
out + # (deny takes precedence) + assert "e8d7c5a3-2b4f-4e21-9823-1a3b5c7d9e0f" not in source.project_registry + + # Verify: component with "Keep_Scratchpad" category should be kept + assert "4759f33c-1ab9-403d-92e8-9bef48de00cg" in source.component_registry + + def test_category_pattern_allow(self): + """Test that category_pattern allow list works correctly using page3_data""" + page3_data = load_json_data("hex_projects_page3.json") + + mock_response = MagicMock() + mock_response.json.return_value = page3_data + + # Configure to only allow "Keep_Scratchpad" category + config = { + **self.minimum_input_config, + "category_pattern": { + "allow": ["^Keep_Scratchpad$"], + }, + "include_lineage": False, + } + + ctx = PipelineContext(run_id="test-category-allow") + source = HexSource.create(config, ctx) + + with patch.object(source.hex_api.session, "get", return_value=mock_response): + list(source.get_workunits_internal()) + + # Verify: project with "Scratchpad" should be filtered out (not in allow list) + assert "d73da67d-c87b-4dd8-9e7f-b79cb7f822cg" not in source.project_registry + + # Verify: project with both "Scratchpad" and "Keep_Scratchpad" should be kept + # (has at least one allowed category) + assert "e8d7c5a3-2b4f-4e21-9823-1a3b5c7d9e0f" in source.project_registry + + # Verify: only component with "Keep_Scratchpad" should be kept + assert "4759f33c-1ab9-403d-92e8-9bef48de00cg" in source.component_registry + + def test_category_pattern_mixed_categories_deny_precedence(self): + """Test that deny patterns take precedence over having allowed categories""" + page3_data = load_json_data("hex_projects_page3.json") + + mock_response = MagicMock() + mock_response.json.return_value = page3_data + + # Configure with both allow and deny patterns + config = { + **self.minimum_input_config, + "category_pattern": { + "allow": ["^Keep_Scratchpad$"], + "deny": ["^Scratchpad$"], + }, + "include_lineage": False, + } + + ctx = PipelineContext(run_id="test-mixed-categories-deny") 
+ source = HexSource.create(config, ctx) + + with patch.object(source.hex_api.session, "get", return_value=mock_response): + list(source.get_workunits_internal()) + + # Verify: project with only "Scratchpad" is filtered out + assert "d73da67d-c87b-4dd8-9e7f-b79cb7f822cg" not in source.project_registry + + # Verify: project with both categories is filtered out (deny takes precedence) + assert "e8d7c5a3-2b4f-4e21-9823-1a3b5c7d9e0f" not in source.project_registry + + # Verify: component with only "Keep_Scratchpad" is kept + assert "4759f33c-1ab9-403d-92e8-9bef48de00cg" in source.component_registry + + def test_category_pattern_mixed_categories_no_deny(self): + """Test that items with multiple categories are kept if any category matches allow""" + page3_data = load_json_data("hex_projects_page3.json") + + mock_response = MagicMock() + mock_response.json.return_value = page3_data + + # Configure with only allow pattern (no deny) + config = { + **self.minimum_input_config, + "category_pattern": { + "allow": ["^Keep_Scratchpad$"], + }, + "include_lineage": False, + } + + ctx = PipelineContext(run_id="test-mixed-categories-allow-only") + source = HexSource.create(config, ctx) + + with patch.object(source.hex_api.session, "get", return_value=mock_response): + list(source.get_workunits_internal()) + + # Verify: project with only "Scratchpad" is filtered out (not in allow list) + assert "d73da67d-c87b-4dd8-9e7f-b79cb7f822cg" not in source.project_registry + + # Verify: project with both categories is kept (has allowed category) + assert "e8d7c5a3-2b4f-4e21-9823-1a3b5c7d9e0f" in source.project_registry + + # Verify: component with "Keep_Scratchpad" is kept + assert "4759f33c-1ab9-403d-92e8-9bef48de00cg" in source.component_registry + + def test_no_category_pattern_filtering(self): + """Test that all items are kept when no category_pattern is configured""" + page3_data = load_json_data("hex_projects_page3.json") + + mock_response = MagicMock() + mock_response.json.return_value 
= page3_data + + # Configure without any category_pattern + config = { + **self.minimum_input_config, + "include_lineage": False, + } + + ctx = PipelineContext(run_id="test-no-category-filter") + source = HexSource.create(config, ctx) + + with patch.object(source.hex_api.session, "get", return_value=mock_response): + list(source.get_workunits_internal()) + + # Verify: all projects are kept regardless of categories + assert "d73da67d-c87b-4dd8-9e7f-b79cb7f822cg" in source.project_registry + assert "e8d7c5a3-2b4f-4e21-9823-1a3b5c7d9e0f" in source.project_registry + + # Verify: all components are kept + assert "4759f33c-1ab9-403d-92e8-9bef48de00cg" in source.component_registry