From 8828d7a1033b62ad5c65b0ffc1fe095aa5a37a29 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 14 Oct 2025 11:54:59 -0400 Subject: [PATCH 1/3] Continue draft --- ...1105-a8f36f185694_add_url_scheme_column.py | 63 +++++++++++++++++++ src/api/endpoints/annotate/_shared/extract.py | 2 +- src/api/endpoints/collector/manual/query.py | 7 ++- src/api/endpoints/submit/url/queries/core.py | 12 ++-- src/api/endpoints/url/get/query.py | 2 +- src/collectors/queries/insert/url.py | 6 +- src/collectors/queries/insert/urls/query.py | 2 +- src/core/tasks/handler.py | 1 - .../impl/huggingface/queries/get/core.py | 2 +- .../internet_archives/probe/queries/cte.py | 2 +- .../save/queries/shared/get_valid_entries.py | 2 +- .../queries/ctes/whitelisted_root_urls.py | 2 +- .../tasks/url/operators/html/queries/get.py | 2 +- .../queries/urls/not_probed/get/query.py | 4 +- .../url/operators/screenshot/queries/cte.py | 2 +- .../operators/submit_approved/queries/get.py | 2 +- .../operators/submit_meta_urls/queries/cte.py | 2 +- src/db/client/async_.py | 7 ++- src/db/client/sync.py | 6 +- src/db/models/impl/url/core/sqlalchemy.py | 9 +++ src/util/clean.py | 10 --- src/util/models/__init__.py | 0 src/util/models/url_and_scheme.py | 6 ++ src/util/url.py | 28 +++++++++ .../impl/huggingface/setup/queries/setup.py | 1 + tests/helpers/setup/populate.py | 3 +- 26 files changed, 151 insertions(+), 34 deletions(-) create mode 100644 alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py delete mode 100644 src/util/clean.py create mode 100644 src/util/models/__init__.py create mode 100644 src/util/models/url_and_scheme.py create mode 100644 src/util/url.py diff --git a/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py new file mode 100644 index 00000000..3e302ea4 --- /dev/null +++ b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py @@ -0,0 +1,63 @@ +"""Add url scheme 
column + +Revision ID: a8f36f185694 +Revises: 7aace6587d1a +Create Date: 2025-10-14 11:05:28.686940 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'a8f36f185694' +down_revision: Union[str, None] = '7aace6587d1a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _populate_column(): + op.execute( + """ + UPDATE urls + SET scheme = lower(split_part(url, '://', 1)) + WHERE url ~* '^[a-z][a-z0-9+.-]*://'; + """ + ) + + +def _remove_schemes_from_url_column(): + op.execute( + """ + UPDATE urls + SET url = regexp_replace(url, '(?i)^[a-z][a-z0-9+.-]*://', '') + WHERE url ~* '^[a-z][a-z0-9+.-]*://'; + """ + ) + + +def _add_check_constraint_to_url_column(): + op.execute( + """ + ALTER TABLE urls + ADD CONSTRAINT check_url_does_not_have_schema CHECK (url !~* '^[a-z][a-z0-9+.-]*://'); + """ + ) + + +def upgrade() -> None: + _add_column() + _populate_column() + _remove_schemes_from_url_column() + _add_check_constraint_to_url_column() + +def _add_column(): + op.add_column( + "urls", + sa.Column("scheme", sa.String(), nullable=True) + ) + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index 390579d9..3534c997 100644 --- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -46,7 +46,7 @@ async def extract_and_format_get_annotation_result( next_annotation=GetNextURLForAllAnnotationInnerResponse( url_info=URLMapping( url_id=url.id, - url=url.url + url=url.full_url ), html_info=html_response_info, url_type_suggestions=url_type_suggestions, diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 4f8956dc..029b5ecb 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py 
@@ -12,6 +12,8 @@ from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class UploadManualBatchQueryBuilder(QueryBuilderBase): @@ -43,8 +45,11 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: duplicate_urls: list[str] = [] for entry in self.dto.entries: + url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url) + url = URL( - url=entry.url, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 081b5456..4d0269dd 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -19,7 +19,8 @@ from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.validate import is_valid_url -from src.util.clean import clean_url +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import clean_url, get_url_and_scheme class SubmitURLQueryBuilder(QueryBuilderBase): @@ -41,11 +42,13 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: if not valid: return convert_invalid_url_to_url_response(url_original) - # Clean URLs + # Clean URL url_clean: str = clean_url(url_original) + url_and_scheme: URLAndScheme = get_url_and_scheme(url_clean) + # Check if duplicate - is_duplicate: bool = await DeduplicateURLQueryBuilder(url=url_clean).run(session) + is_duplicate: bool = await DeduplicateURLQueryBuilder(url=url_and_scheme.url).run(session) if is_duplicate: return convert_duplicate_urls_to_url_response( 
clean_url=url_clean, @@ -56,7 +59,8 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: # Add URL url_insert = URL( - url=url_clean, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, source=URLSource.MANUAL, status=URLStatus.OK, ) diff --git a/src/api/endpoints/url/get/query.py b/src/api/endpoints/url/get/query.py index d7198612..6885ef64 100644 --- a/src/api/endpoints/url/get/query.py +++ b/src/api/endpoints/url/get/query.py @@ -50,7 +50,7 @@ async def run(self, session: AsyncSession) -> GetURLsResponseInfo: GetURLsResponseInnerInfo( id=result.id, batch_id=result.batch.id if result.batch is not None else None, - url=result.url, + url=result.full_url, status=URLStatus(result.status), collector_metadata=result.collector_metadata, updated_at=result.updated_at, diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index af72a3aa..8e9e75d3 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -4,6 +4,8 @@ from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class InsertURLQueryBuilder(QueryBuilderBase): @@ -15,8 +17,10 @@ def __init__(self, url_info: URLInfo): async def run(self, session: AsyncSession) -> int: """Insert a new URL into the database.""" + url_and_scheme: URLAndScheme = get_url_and_scheme(self.url_info.url) url_entry = URL( - url=self.url_info.url, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, collector_metadata=self.url_info.collector_metadata, status=self.url_info.status.value, source=self.url_info.source diff --git a/src/collectors/queries/insert/urls/query.py b/src/collectors/queries/insert/urls/query.py index 75176158..d4165001 100644 --- a/src/collectors/queries/insert/urls/query.py +++ 
b/src/collectors/queries/insert/urls/query.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager -from src.util.clean import clean_url +from src.util.url import clean_url from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo diff --git a/src/core/tasks/handler.py b/src/core/tasks/handler.py index 92b96103..7ed0d230 100644 --- a/src/core/tasks/handler.py +++ b/src/core/tasks/handler.py @@ -2,7 +2,6 @@ from discord_poster import DiscordPoster -from src.core.enums import BatchStatus from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.enums import TaskOperatorOutcome from src.db.client.async_ import AsyncDatabaseClient diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 5b6bd08d..802e8ea5 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -33,7 +33,7 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut query = ( select( URL.id.label(label_url_id), - URL.url.label(label_url), + URL.full_url.label(label_url), URLRecordType.record_type.label(label_record_type_fine), URLCompressedHTML.compressed_html.label(label_html), FlagURLValidated.type.label(label_type) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py index 7de8b290..e6886134 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py @@ -12,7 +12,7 @@ def __init__(self): self._cte = ( select( URL.id.label("url_id"), - URL.url + URL.full_url.label("url") ) 
.where( or_( diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py index b0f9eeea..1ce9c1d9 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py @@ -9,7 +9,7 @@ IA_SAVE_VALID_ENTRIES_QUERY = ( select( URL.id, - URL.url, + URL.full_url.label("url"), (URLInternetArchivesSaveMetadata.url_id.is_(None)).label("is_new"), ) # URL must have been previously probed for its online status. diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py index 272717b5..dd7a5a8c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py @@ -34,7 +34,7 @@ # The connected URLs must be Meta URLs FlagURLValidated.type == URLType.META_URL, # Root URL can't be "https://catalog.data.gov" - URL.url != "https://catalog.data.gov" + URL.url != "catalog.data.gov" ) .group_by( URL.id diff --git a/src/core/tasks/url/operators/html/queries/get.py b/src/core/tasks/url/operators/html/queries/get.py index 832d9917..a6cbe4a8 100644 --- a/src/core/tasks/url/operators/html/queries/get.py +++ b/src/core/tasks/url/operators/html/queries/get.py @@ -19,7 +19,7 @@ async def run(self, session: AsyncSession) -> list[URLInfo]: url_info = URLInfo( id=url.id, batch_id=url.batch.id if url.batch is not None else None, - url=url.url, + url=url.full_url, collector_metadata=url.collector_metadata, status=url.status, created_at=url.created_at, diff --git 
a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index 36450252..0ecc50b3 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -4,7 +4,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import override, final -from src.util.clean import clean_url +from src.util.url import clean_url from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata @@ -20,7 +20,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: query = ( select( URL.id.label("url_id"), - URL.url + URL.full_url.label("url") ) .outerjoin( URLWebMetadata, diff --git a/src/core/tasks/url/operators/screenshot/queries/cte.py b/src/core/tasks/url/operators/screenshot/queries/cte.py index d961aabf..f1b3b1d2 100644 --- a/src/core/tasks/url/operators/screenshot/queries/cte.py +++ b/src/core/tasks/url/operators/screenshot/queries/cte.py @@ -13,7 +13,7 @@ def __init__(self): self._cte: CTE = ( select( URL.id.label("url_id"), - URL.url, + URL.full_url.label("url"), ) .join( URLWebMetadata, diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index d4138f9a..fb43dd34 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -55,7 +55,7 @@ async def _process_result(url: URL) -> SubmitApprovedURLTDO: supplying_entity = optional_metadata.supplying_entity tdo = SubmitApprovedURLTDO( url_id=url.id, - url=url.url, + url=url.full_url, name=url.name, agency_ids=agency_ids, description=url.description, diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py 
b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py index d350258c..54b1edf8 100644 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py +++ b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py @@ -16,7 +16,7 @@ def __init__(self): self._cte = ( select( URL.id.label("url_id"), - URL.url, + URL.full_url.label("url"), LinkURLAgency.agency_id, ) # Validated as Meta URL diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 93c36544..2a15267e 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -102,6 +102,8 @@ from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class AsyncDatabaseClient: @@ -828,9 +830,10 @@ async def upload_manual_batch( @session_manager async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse: - query = select(URL).where(URL.url == url) + url_and_scheme: URLAndScheme = get_url_and_scheme(url) + query = select(URL).where(URL.url == url_and_scheme.url) raw_results = await session.execute(query) - url = raw_results.scalars().one_or_none() + url: URL | None = raw_results.scalars().one_or_none() if url is None: return SearchURLResponse( found=False, diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 006d6f0e..407cb3f4 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -23,6 +23,8 @@ from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme # Database Client @@ -116,8 +118,10 @@ def get_url_info_by_url( @session_manager def insert_url(self, session, url_info: URLInfo) -> int: 
"""Insert a new URL into the database.""" + url_and_scheme: URLAndScheme = get_url_and_scheme(url_info.url) url_entry = URL( - url=url_info.url, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, collector_metadata=url_info.collector_metadata, status=url_info.status, name=url_info.name, diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 3582dd56..98035bbf 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -1,5 +1,7 @@ from sqlalchemy import Column, Text, String, JSON +from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import relationship +from sqlalchemy.util import hybridproperty from src.collectors.enums import URLStatus from src.db.models.helpers import enum_column @@ -19,6 +21,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # The batch this URL is associated with url = Column(Text, unique=True) + scheme = Column(String) name = Column(String) description = Column(Text) # The metadata from the collector @@ -30,6 +33,12 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): nullable=False ) + @hybrid_property + def full_url(self) -> str: + if self.scheme is None: + return self.url + return f"{self.scheme}://{self.url}" + source = enum_column( URLSource, name='url_source', diff --git a/src/util/clean.py b/src/util/clean.py deleted file mode 100644 index 3c0a0f92..00000000 --- a/src/util/clean.py +++ /dev/null @@ -1,10 +0,0 @@ - - -def clean_url(url: str) -> str: - # Remove Non-breaking spaces - url = url.strip(" ") - - # Remove any fragments and everything after them - url = url.split("#")[0] - return url - diff --git a/src/util/models/__init__.py b/src/util/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/util/models/url_and_scheme.py b/src/util/models/url_and_scheme.py new file mode 100644 index 00000000..494acd49 --- /dev/null +++ b/src/util/models/url_and_scheme.py @@ -0,0 +1,6 @@ 
+from pydantic import BaseModel + + +class URLAndScheme(BaseModel): + url: str + scheme: str | None \ No newline at end of file diff --git a/src/util/url.py b/src/util/url.py new file mode 100644 index 00000000..ac4f73ca --- /dev/null +++ b/src/util/url.py @@ -0,0 +1,28 @@ +from urllib.parse import urlparse + +from src.util.models.url_and_scheme import URLAndScheme + + +def clean_url(url: str) -> str: + # Remove Non-breaking spaces + url = url.strip(" ") + + # Remove any fragments and everything after them + url = url.split("#")[0] + return url + +def get_url_and_scheme( + url: str +) -> URLAndScheme: + parsed = urlparse(url) + if parsed.scheme and url.lower().startswith(f"{parsed.scheme}://"): + remainder = url[len(parsed.scheme) + 3:] + return URLAndScheme( + url=remainder, + scheme=parsed.scheme + ) + # Handle URLs without scheme + return URLAndScheme( + url=url, + scheme=None + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 417677df..55dbeb76 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -37,6 +37,7 @@ async def run(self, session: AsyncSession) -> list[int]: description = None url = URL( url=get_test_url(i), + scheme=None, status=URLStatus.OK, name=name, description=description, diff --git a/tests/helpers/setup/populate.py b/tests/helpers/setup/populate.py index 02c364d6..d0ce5869 100644 --- a/tests/helpers/setup/populate.py +++ b/tests/helpers/setup/populate.py @@ -5,7 +5,8 @@ async def populate_database(adb_client: AsyncDatabaseClient) -> None: """Populate database with test data.""" url = URL( - url="https://www.test-data.com/static-test-data", + url="www.test-data.com/static-test-data", + scheme="https", name="Fake test data", description="Test data populated as a result of `reset_database`, " "which imitates a 
validated URL synchronized from the Data Sources App.", From 00248c4fd7a4d5cf047f3620a89ea0e138784eb8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 14 Oct 2025 13:28:38 -0400 Subject: [PATCH 2/3] Add schema column to `urls` table and update associated logic --- src/collectors/impl/example/core.py | 2 +- src/core/tasks/url/operators/root_url/extract.py | 5 +++-- src/db/models/impl/url/core/pydantic/insert.py | 1 + src/db/models/impl/url/core/sqlalchemy.py | 9 ++++++++- tests/automated/integration/api/test_manual_batch.py | 10 +++++----- .../integration/db/client/test_insert_urls.py | 10 +++++----- .../integration/db/structure/test_root_url.py | 2 +- .../impl/internet_archives/probe/constants.py | 4 ++-- .../scheduled/impl/internet_archives/save/constants.py | 4 ++-- .../integration/tasks/url/impl/html/setup/data.py | 10 +++++----- .../integration/tasks/url/impl/probe/constants.py | 4 ++-- .../tasks/url/impl/probe/no_redirect/test_two_urls.py | 4 ++-- .../url/impl/probe/redirect/test_two_urls_same_dest.py | 4 ++-- .../integration/tasks/url/impl/root_url/constants.py | 6 +++--- .../integration/tasks/url/impl/screenshot/test_core.py | 4 ++-- .../tasks/url/impl/submit_meta_urls/test_core.py | 2 +- tests/helpers/data_creator/generate.py | 3 ++- tests/helpers/simple_test_data_functions.py | 4 ++-- tests/manual/external/pdap/test_check_for_duplicate.py | 2 +- .../manual/external/url_request/test_url_screenshot.py | 2 +- 20 files changed, 51 insertions(+), 41 deletions(-) diff --git a/src/collectors/impl/example/core.py b/src/collectors/impl/example/core.py index 4bccf242..be0b8e07 100644 --- a/src/collectors/impl/example/core.py +++ b/src/collectors/impl/example/core.py @@ -24,7 +24,7 @@ async def run_implementation(self) -> None: await self.sleep() self.data = ExampleOutputDTO( message=f"Data collected by {self.batch_id}", - urls=["https://example.com", "https://example.com/2"], + urls=["example.com", "example.com/2"], parameters=self.dto.model_dump(), ) diff --git 
a/src/core/tasks/url/operators/root_url/extract.py b/src/core/tasks/url/operators/root_url/extract.py index e384fd15..9cb05c5a 100644 --- a/src/core/tasks/url/operators/root_url/extract.py +++ b/src/core/tasks/url/operators/root_url/extract.py @@ -2,6 +2,7 @@ def extract_root_url(url: str) -> str: - parsed_url: ParseResult = urlparse(url) - root_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + # URLs in DB should not have HTTPS -- add to enable url parse to function properly + parsed_url: ParseResult = urlparse(f"https://{url}") + root_url = parsed_url.netloc return root_url \ No newline at end of file diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index f04dd3df..08480b6b 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -14,6 +14,7 @@ def sa_model(cls) -> type[Base]: return URL url: str + scheme: str | None = None collector_metadata: dict | None = None name: str | None = None status: URLStatus = URLStatus.OK diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 98035bbf..e5bca30d 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, Text, String, JSON +from sqlalchemy import Column, Text, String, JSON, case, literal from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import relationship from sqlalchemy.util import hybridproperty @@ -39,6 +39,13 @@ def full_url(self) -> str: return self.url return f"{self.scheme}://{self.url}" + @full_url.expression + def full_url(cls): + return case( + (cls.scheme != None, (cls.scheme + literal("://") + cls.url)), + else_=cls.url + ) + source = enum_column( URLSource, name='url_source', diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index dae5ee4f..9be80c25 100644 
--- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -20,14 +20,14 @@ async def test_manual_batch(api_test_helper): dtos = [] for i in range(50): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i}", + url=f"example.com/{i}", ) dtos.append(dto) # Create 50 entries with URL and all optional fields for i in range(50): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+50}", + url=f"example.com/{i+50}", name=manual_batch_name, description=f"Description {i}", collector_metadata={ @@ -142,13 +142,13 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo more_dtos = [] for i in range(49): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+100}", + url=f"example.com/{i+100}", ) more_dtos.append(dto) for i in range(2): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+1}", + url=f"example.com/{i+1}", ) more_dtos.append(dto) @@ -162,7 +162,7 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo response = await ath.request_validator.submit_manual_batch(duplicate_input_dto) # Check duplicate URLs assert len(response.duplicate_urls) == 2 - assert response.duplicate_urls == ['https://example.com/1', 'https://example.com/2'] + assert response.duplicate_urls == ['example.com/1', 'example.com/2'] assert len(response.urls) == 49 # Check 149 URLs in database diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index f2d73f00..852da385 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -24,17 +24,17 @@ async def test_insert_urls( urls = [ URLInfo( - url="https://example.com/1", + url="example.com/1", collector_metadata={"name": "example_1"}, source=URLSource.COLLECTOR ), URLInfo( - url="https://example.com/2", + url="example.com/2", 
source=URLSource.COLLECTOR ), # Duplicate URLInfo( - url="https://example.com/1", + url="example.com/1", collector_metadata={"name": "example_duplicate"}, source=URLSource.COLLECTOR ) @@ -46,8 +46,8 @@ async def test_insert_urls( url_mappings = insert_urls_info.url_mappings assert len(url_mappings) == 2 - assert url_mappings[0].url == "https://example.com/1" - assert url_mappings[1].url == "https://example.com/2" + assert url_mappings[0].url == "example.com/1" + assert url_mappings[1].url == "example.com/2" assert insert_urls_info.original_count == 2 diff --git a/tests/automated/integration/db/structure/test_root_url.py b/tests/automated/integration/db/structure/test_root_url.py index 8f8be80b..62755b00 100644 --- a/tests/automated/integration/db/structure/test_root_url.py +++ b/tests/automated/integration/db/structure/test_root_url.py @@ -13,7 +13,7 @@ def test_root_url(db_data_creator: DBDataCreator): ColumnTester( column_name="url", type_=sa.String, - allowed_values=["https://example.com"] + allowed_values=["example.com"] ), ColumnTester( column_name="page_title", diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py index d41ffb48..60f762e7 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py @@ -1,4 +1,4 @@ -TEST_URL_1 = "https://test-ia-metadata.com/1" -TEST_URL_2 = "https://test-ia-metadata.com/2" \ No newline at end of file +TEST_URL_1 = "test-ia-metadata.com/1" +TEST_URL_2 = "test-ia-metadata.com/2" \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py index bc1b5a2e..658d8cb9 100644 --- 
a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py @@ -1,5 +1,5 @@ -TEST_URL_1 = "https://ia-save-test.com/1" -TEST_URL_2 = "https://ia-save-test.com/2" \ No newline at end of file +TEST_URL_1 = "ia-save-test.com/1" +TEST_URL_2 = "ia-save-test.com/2" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index 5615392c..203eb34b 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -10,7 +10,7 @@ # and their html should be stored TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://happy-path.com/pending", + url="happy-path.com/pending", status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( @@ -28,7 +28,7 @@ # and their web metadata status should be updated to 404 TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://not-found-path.com/submitted", + url="not-found-path.com/submitted", status=URLStatus.ERROR ), web_metadata_info=TestWebMetadataInfo( @@ -47,7 +47,7 @@ # URLs that give errors should be updated with the appropriate scrape status TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://error-path.com/submitted", + url="error-path.com/submitted", status=URLStatus.ERROR ), web_metadata_info=TestWebMetadataInfo( @@ -65,7 +65,7 @@ # URLs with non-200 web metadata should not be processed TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://not-200-path.com/submitted", + url="not-200-path.com/submitted", status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( @@ -82,7 +82,7 @@ # URLs with no web metadata should not be processed TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://no-web-metadata.com/submitted", + url="no-web-metadata.com/submitted", status=URLStatus.OK ), 
web_metadata_info=None, diff --git a/tests/automated/integration/tasks/url/impl/probe/constants.py b/tests/automated/integration/tasks/url/impl/probe/constants.py index 6c218e25..07ebbcc3 100644 --- a/tests/automated/integration/tasks/url/impl/probe/constants.py +++ b/tests/automated/integration/tasks/url/impl/probe/constants.py @@ -1,6 +1,6 @@ from src.db.models.impl.url.core.enums import URLSource PATCH_ROOT = "src.external.url_request.core.URLProbeManager" -TEST_URL = "https://www.example.com" -TEST_DEST_URL = "https://www.example.com/redirect" +TEST_URL = "www.example.com" +TEST_DEST_URL = "www.example.com/redirect" TEST_SOURCE = URLSource.COLLECTOR diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index cfd1f68f..c3b0c6c4 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -12,8 +12,8 @@ async def test_two_urls( setup_manager: TestURLProbeSetupManager, check_manager: TestURLProbeCheckManager ): - url_1 = "https://example.com/1" - url_2 = "https://example.com/2" + url_1 = "example.com/1" + url_2 = "example.com/2" operator = setup_manager.setup_operator( response_or_responses=[ setup_manager.setup_no_redirect_probe_response( diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index f0e113ff..bf5dab9f 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py @@ -30,12 +30,12 @@ async def test_url_probe_task_redirect_two_urls_same_dest( dest_status_code=200, dest_content_type=None, dest_error=None, - source_url="https://example.com/2", + 
source_url="example.com/2", ), ] ) source_url_id_1 = await setup_manager.setup_url(URLStatus.OK) - source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="https://example.com/2") + source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="example.com/2") run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( diff --git a/tests/automated/integration/tasks/url/impl/root_url/constants.py b/tests/automated/integration/tasks/url/impl/root_url/constants.py index dc688797..d5e38e8f 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/constants.py +++ b/tests/automated/integration/tasks/url/impl/root_url/constants.py @@ -1,5 +1,5 @@ -ROOT_URL = "https://root.com" -BRANCH_URL = "https://root.com/branch" -SECOND_BRANCH_URL = "https://root.com/second-branch" \ No newline at end of file +ROOT_URL = "root.com" +BRANCH_URL = "root.com/branch" +SECOND_BRANCH_URL = "root.com/second-branch" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py index 6f54fbf9..f65aa40d 100644 --- a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py +++ b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py @@ -40,11 +40,11 @@ async def test_core( mock_get_screenshots = AsyncMock(return_value=[ URLScreenshotResponse( - url=screenshot_mapping.url, + url=f"https://{screenshot_mapping.url}", screenshot=bytes(124536), ), URLScreenshotResponse( - url=error_mapping.url, + url=f"https://{error_mapping.url}", screenshot=None, error="error", ) diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py index 37d6e00f..92287454 100644 --- a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py +++ 
b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py @@ -51,7 +51,7 @@ async def test_submit_meta_urls( data={ "meta_urls": [ { - "url": mapping.url, + "url": f"https://{mapping.url}", "agency_id": agency_id, "status": SubmitMetaURLsStatus.SUCCESS.value, "meta_url_id": 2, diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index 1cf0a806..bee0993f 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -48,7 +48,8 @@ def generate_urls( for i in range(count): val: int = next_int() results.append(URLInsertModel( - url=f"http://example.com/{val}", + url=f"example.com/{val}", + scheme="https", status=status, source=source, name=f"Example {val}", diff --git a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py index 4d321dc5..b250dc83 100644 --- a/tests/helpers/simple_test_data_functions.py +++ b/tests/helpers/simple_test_data_functions.py @@ -10,14 +10,14 @@ def generate_test_urls(count: int) -> list[str]: results = [] for i in range(count): - url = f"https://example.com/{uuid.uuid4().hex}" + url = f"example.com/{uuid.uuid4().hex}" results.append(url) return results def generate_test_url(i: int) -> str: - return f"https://test.com/{i}" + return f"test.com/{i}" def generate_test_name(i: int | None = None) -> str: if i is None: diff --git a/tests/manual/external/pdap/test_check_for_duplicate.py b/tests/manual/external/pdap/test_check_for_duplicate.py index 34bbc317..25a8bc52 100644 --- a/tests/manual/external/pdap/test_check_for_duplicate.py +++ b/tests/manual/external/pdap/test_check_for_duplicate.py @@ -4,6 +4,6 @@ @pytest.mark.asyncio async def test_check_for_duplicate(pdap_client): - response = await pdap_client.is_url_duplicate(url_to_check="https://example.com") + response = await pdap_client.is_url_duplicate(url_to_check="example.com") print(response) diff --git a/tests/manual/external/url_request/test_url_screenshot.py 
b/tests/manual/external/url_request/test_url_screenshot.py index b16535d6..3388c09f 100644 --- a/tests/manual/external/url_request/test_url_screenshot.py +++ b/tests/manual/external/url_request/test_url_screenshot.py @@ -12,7 +12,7 @@ async def test_url_screenshot(): """ urls: list[str] = [ - "https://www.example.com" + "www.example.com" ] responses: list[URLScreenshotResponse] = await get_screenshots(urls=urls) From 16f2b662436edfd9d562384ed82652116b2cb333 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 14 Oct 2025 17:59:56 -0400 Subject: [PATCH 3/3] Address alembic duplicates --- ...1105-a8f36f185694_add_url_scheme_column.py | 289 +++++++++++++++++- 1 file changed, 282 insertions(+), 7 deletions(-) diff --git a/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py index 3e302ea4..aa73e268 100644 --- a/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py +++ b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py @@ -18,6 +18,287 @@ depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + _update_foreign_key_constraints() + + _delete_duplicate_urls() + _add_column() + _populate_column() + _remove_schemes_from_url_column() + _add_check_constraint_to_url_column() + +def _update_foreign_key_constraints(): + # URL Optional Data Source Metadata + op.execute(""" + ALTER TABLE url_optional_data_source_metadata + DROP CONSTRAINT IF EXISTS url_optional_data_source_metadata_url_id_fkey; + """) + + op.create_foreign_key( + "url_optional_data_source_metadata_url_id_fkey", + "url_optional_data_source_metadata", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Link URLs Redirect URL + # (Source URL ID) + op.execute(""" + ALTER TABLE link_urls_redirect_url + DROP CONSTRAINT IF EXISTS link_urls_redirect_url_source_url_id_fkey; + """) + + op.create_foreign_key( + "link_urls_redirect_url_source_url_id_fkey", + 
"link_urls_redirect_url", + "urls", + ["source_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # (Destination URL ID) + op.execute(""" + ALTER TABLE link_urls_redirect_url + DROP CONSTRAINT IF EXISTS link_urls_redirect_url_destination_url_id_fkey; + """) + + op.create_foreign_key( + "link_urls_redirect_url_destination_url_id_fkey", + "link_urls_redirect_url", + "urls", + ["destination_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Reviewing User URL + op.execute(""" + ALTER TABLE reviewing_user_url + DROP CONSTRAINT IF EXISTS approving_user_url_url_id_fkey; + """) + + op.create_foreign_key( + "approving_user_url_url_id_fkey", + "reviewing_user_url", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # user_url_agency_suggestions + op.execute(""" + ALTER TABLE user_url_agency_suggestions + DROP CONSTRAINT IF EXISTS user_url_agency_suggestions_url_id_fkey; + """) + + op.create_foreign_key( + "user_url_agency_suggestions_url_id_fkey", + "user_url_agency_suggestions", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Duplicates + op.execute(""" + ALTER TABLE duplicates + DROP CONSTRAINT IF EXISTS duplicates_original_url_id_fkey; + """) + + op.create_foreign_key( + "duplicates_original_url_id_fkey", + "duplicates", + "urls", + ["original_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # link_user_name_suggestions + op.execute(""" + ALTER TABLE link_user_name_suggestions + DROP CONSTRAINT IF EXISTS link_user_name_suggestions_suggestion_id_fkey; + """) + + op.create_foreign_key( + "link_user_name_suggestions_suggestion_id_fkey", + "link_user_name_suggestions", + "url_name_suggestions", + ["suggestion_id"], + ["id"], + ondelete="CASCADE" + ) + +def _delete_duplicate_urls(): + op.execute(""" + DELETE FROM urls + WHERE id IN ( + 4217, + 15902, + 3472, + 17387, + 24256, + 17617, + 17414, + 15259, + 17952, + 17651, + 18010, + 18496, + 18563, + 18587, + 18592, + 18092, + 18046, + 20467, + 24346, + 28241, + 25075, + 22508, + 22391, + 24256, + 
22486, + 28109, + 26336, + 30701, + 17387, + 19348, + 18080, + 27863, + 18855, + 28830, + 18824, + 17414, + 15259, + 20676, + 27716, + 21475, + 23442, + 28553, + 8176, + 22270, + 19161, + 21250, + 15659, + 18821, + 27067, + 27567, + 27318, + 20640, + 21840, + 3472, + 28982, + 28910, + 19527, + 28776, + 15902, + 18468, + 29557, + 22977, + 27694, + 22678, + 19094, + 27203, + 26436, + 18868, + 22813, + 25007, + 7548, + 30088, + 20924, + 22575, + 28149, + 30705, + 28179, + 30660, + 2988, + 17182, + 18893, + 30317, + 19215, + 17651, + 21117, + 17617, + 23742, + 19620, + 16865, + 19320, + 20516, + 25248, + 26122, + 30158, + 30522, + 23307, + 18621, + 27855, + 26922, + 21397, + 18010, + 18592, + 2527, + 26279, + 18563, + 18242, + 21550, + 28288, + 22361, + 24660, + 2989, + 28765, + 10627, + 19625, + 12191, + 27523, + 18373, + 28565, + 25437, + 26077, + 28554, + 23229, + 25631, + 25528, + 18092, + 10765, + 26126, + 51499, + 27375, + 24177, + 22734, + 22459, + 22439, + 18532, + 29064, + 20504, + 21643, + 21551, + 27698, + 19234, + 24308, + 22559, + 26227, + 19080, + 16010, + 3515, + 22658, + 20673, + 21854, + 19361, + 21768, + 26903, + 21253, + 23085, + 3761, + 3565 + ) + """) + def _populate_column(): op.execute( """ @@ -32,7 +313,7 @@ def _remove_schemes_from_url_column(): op.execute( """ UPDATE urls - SET url = regexp_replace(url, '^(?i)[a-z][a-z0-9+.-]*://', '') + SET url = regexp_replace(url, '^[a-z][a-z0-9+.-]*://', '', 'i') WHERE url ~* '^[a-z][a-z0-9+.-]*://'; """ ) @@ -47,12 +328,6 @@ def _add_check_constraint_to_url_column(): ) -def upgrade() -> None: - _add_column() - _populate_column() - _remove_schemes_from_url_column() - _add_check_constraint_to_url_column() - def _add_column(): op.add_column( "urls",