diff --git a/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py b/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py new file mode 100644 index 00000000..69faae2e --- /dev/null +++ b/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py @@ -0,0 +1,147 @@ +"""Add trailing slash column + +Revision ID: 7fc6502f1fa3 +Revises: ff4e8b2f6348 +Create Date: 2025-10-17 18:26:56.756915 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '7fc6502f1fa3' +down_revision: Union[str, None] = 'ff4e8b2f6348' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + _remove_duplicates() + _add_trailing_slash_column() + _migrate_trailing_slash_to_column() + _remove_trailing_slash_from_url_column() + _add_check_constraint_forbidding_trailing_slash_in_url() + +def _remove_duplicates(): + op.execute( + """ + DELETE FROM urls + WHERE id IN ( + 23504, + 29401, + 21032, + 23687, + 15760, + 17574, + 17669, + 21382, + 11697, + 18076, + 27764, + 11395, + 17702, + 26857, + 30843, + 21850, + 29471, + 26789, + 19428, + 18452, + 30547, + 24004, + 27857, + 30260, + 26968, + 27065, + 29073, + 21827, + 25615, + 28644, + 24417, + 29801, + 27625, + 15708, + 23517, + 26415, + 26081, + 7478, + 20368, + 19494, + 26624, + 3817, + 3597, + 3568, + 16113, + 24125, + 30625, + 29965, + 23134, + 19207, + 12158, + 3835, + 24730, + 17113, + 29987, + 21452, + 24605, + 5043, + 17237, + 25522, + 11065, + 12387, + 12210, + 11185, + 11961, + 4935, + 24200, + 29028, + 24371, + 28355, + 17620, + 19546, + 3598 + ) + """ + ) + +def _add_trailing_slash_column(): + op.add_column( + 'urls', + sa.Column( + 'trailing_slash', + sa.Boolean(), + nullable=False, + server_default=sa.text('false') + ) + ) + +def _migrate_trailing_slash_to_column(): + op.execute( + """ + UPDATE urls + 
SET trailing_slash = url ~ '/$' + """ + ) + +def _remove_trailing_slash_from_url_column(): + op.execute( + """ + UPDATE urls + SET url = rtrim(url, '/') + WHERE url like '%/'; + """ + ) + +def _add_check_constraint_forbidding_trailing_slash_in_url(): + op.execute( + """ + ALTER TABLE urls + ADD CONSTRAINT no_trailing_slash CHECK (url !~ '/$') + """ + ) + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index 3534c997..61e92c35 100644 --- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -15,7 +15,7 @@ from src.api.endpoints.annotate.all.get.queries.location_.core import GetLocationSuggestionsQueryBuilder from src.api.endpoints.annotate.all.get.queries.name.core import GetNameSuggestionsQueryBuilder from src.db.dto_converter import DTOConverter -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion @@ -44,7 +44,7 @@ async def extract_and_format_get_annotation_result( await GetNameSuggestionsQueryBuilder(url_id=url.id).run(session) return GetNextURLForAllAnnotationResponse( next_annotation=GetNextURLForAllAnnotationInnerResponse( - url_info=URLMapping( + url_info=SimpleURLMapping( url_id=url.id, url=url.full_url ), diff --git a/src/api/endpoints/annotate/dtos/shared/base/response.py b/src/api/endpoints/annotate/dtos/shared/base/response.py index edcc80e1..0d3ae253 100644 --- a/src/api/endpoints/annotate/dtos/shared/base/response.py +++ b/src/api/endpoints/annotate/dtos/shared/base/response.py @@ -1,14 +1,12 @@ -from typing import Optional - from pydantic import BaseModel, Field from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import 
ResponseHTMLInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class AnnotationInnerResponseInfoBase(BaseModel): - url_info: URLMapping = Field( + url_info: SimpleURLMapping = Field( title="Information about the URL" ) html_info: ResponseHTMLInfo = Field( diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 029b5ecb..6cd7d7b8 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -48,13 +48,14 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url) url = URL( - url=url_and_scheme.url, + url=url_and_scheme.url.rstrip('/'), scheme=url_and_scheme.scheme, name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, status=URLStatus.OK.value, - source=URLSource.MANUAL + source=URLSource.MANUAL, + trailing_slash=url_and_scheme.url.endswith('/'), ) diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 4d0269dd..513d26ad 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -63,6 +63,7 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: scheme=url_and_scheme.scheme, source=URLSource.MANUAL, status=URLStatus.OK, + trailing_slash=url_and_scheme.url.endswith('/'), ) session.add(url_insert) await session.flush() diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index 8e9e75d3..60f39a2c 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -19,11 +19,12 @@ async def run(self, session: AsyncSession) -> int: """Insert a new URL into the database.""" url_and_scheme: URLAndScheme = get_url_and_scheme(self.url_info.url) url_entry = URL( - url=url_and_scheme.url, + 
url=url_and_scheme.url.rstrip('/'), scheme=url_and_scheme.scheme, collector_metadata=self.url_info.collector_metadata, status=self.url_info.status.value, - source=self.url_info.source + source=self.url_info.source, + trailing_slash=url_and_scheme.url.endswith('/'), ) if self.url_info.created_at is not None: url_entry.created_at = self.url_info.created_at diff --git a/src/collectors/queries/insert/urls/query.py b/src/collectors/queries/insert/urls/query.py index d4165001..77f3fe1b 100644 --- a/src/collectors/queries/insert/urls/query.py +++ b/src/collectors/queries/insert/urls/query.py @@ -4,7 +4,7 @@ from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager from src.util.url import clean_url from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.queries.base.builder import QueryBuilderBase @@ -32,7 +32,7 @@ async def run(self, session: AsyncSession) -> InsertURLsInfo: async with session.begin_nested() as sp: url_id = await rm.insert_url(url_info) url_mappings.append( - URLMapping( + SimpleURLMapping( url_id=url_id, url=url_info.url ) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py b/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py index efd5e45c..4d4be86d 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py @@ -1,10 +1,10 @@ from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper def 
convert_ia_url_mapping_to_ia_metadata( - url_mapper: URLMapper, + url_mapper: SimpleURLMapper, ia_mapping: InternetArchivesURLMapping ) -> URLInternetArchiveMetadataPydantic: iam = ia_mapping.ia_metadata diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py b/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py index f4773417..4c58df00 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py @@ -12,7 +12,7 @@ CheckURLInternetArchivesTaskPrerequisitesQueryBuilder from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.flag.checked_for_ia.pydantic import FlagURLCheckedForInternetArchivesPydantic from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic @@ -20,7 +20,7 @@ from src.external.internet_archives.client import InternetArchivesClient from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping from src.util.progress_bar import get_progress_bar_disabled -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper class InternetArchivesProbeTaskOperator( @@ -51,10 +51,10 @@ async def inner_task_logic(self) -> None: DeleteOldUnsuccessfulIACheckedFlagsQueryBuilder() ) - url_mappings: list[URLMapping] = await self._get_url_mappings() + url_mappings: list[SimpleURLMapping] = await self._get_url_mappings() if len(url_mappings) == 0: return - mapper = URLMapper(url_mappings) + mapper = SimpleURLMapper(url_mappings) await self.link_urls_to_task(mapper.get_all_ids()) @@ -65,7 +65,7 @@ async def inner_task_logic(self) -> None: await self._add_errors_to_db(mapper, 
ia_mappings=subsets.error) await self._add_ia_metadata_to_db(mapper, ia_mappings=subsets.has_metadata) - async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: + async def _add_errors_to_db(self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: url_error_info_list: list[URLTaskErrorSmall] = [] for ia_mapping in ia_mappings: url_id = mapper.get_id(ia_mapping.url) @@ -76,7 +76,7 @@ async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetA url_error_info_list.append(url_error_info) await self.add_task_errors(url_error_info_list) - async def _get_url_mappings(self) -> list[URLMapping]: + async def _get_url_mappings(self) -> list[SimpleURLMapping]: return await self.adb_client.run_query_builder( GetURLsForInternetArchivesTaskQueryBuilder() ) @@ -93,7 +93,7 @@ async def _search_for_internet_archive_links(self, urls: list[str]) -> list[Inte async def _add_ia_metadata_to_db( self, - url_mapper: URLMapper, + url_mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping], ) -> None: insert_objects: list[URLInternetArchiveMetadataPydantic] = [ @@ -106,7 +106,7 @@ async def _add_ia_metadata_to_db( await self.adb_client.bulk_insert(insert_objects) async def _add_ia_flags_to_db( - self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: + self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: flags: list[FlagURLCheckedForInternetArchivesPydantic] = [] for ia_mapping in ia_mappings: url_id = mapper.get_id(ia_mapping.url) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py index 3306943a..a806b691 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py @@ -1,18 +1,15 @@ -from sqlalchemy import select, or_, 
exists, text, func +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.internet_archives.probe.queries.cte import CheckURLInternetArchivesCTEContainer -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.query import not_exists_url -from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives -from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh class GetURLsForInternetArchivesTaskQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: cte = CheckURLInternetArchivesCTEContainer() query = ( select( @@ -24,7 +21,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: db_mappings = await sh.mappings(session, query=query) return [ - URLMapping( + SimpleURLMapping( url_id=mapping["url_id"], url=mapping["url"] ) for mapping in db_mappings diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py b/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py index 6e4ae84e..280aa51d 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class InternetArchivesSaveTaskEntry(BaseModel): @@ -8,8 +8,8 @@ class InternetArchivesSaveTaskEntry(BaseModel): url_id: int is_new: bool - def to_url_mapping(self) -> URLMapping: - return URLMapping( + def to_url_mapping(self) -> SimpleURLMapping: + return SimpleURLMapping( url_id=self.url_id, url=self.url ) \ No newline at end of file diff 
--git a/src/core/tasks/url/operators/probe/convert.py b/src/core/tasks/url/operators/probe/convert.py index dcb211f0..e568de91 100644 --- a/src/core/tasks/url/operators/probe/convert.py +++ b/src/core/tasks/url/operators/probe/convert.py @@ -1,5 +1,6 @@ from src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMetadataPydantic]: @@ -16,3 +17,19 @@ def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMeta results.append(web_metadata_object) return results +def convert_tdos_with_functional_equivalents_to_web_metadata_list( + tdos: list[URLProbeTDO] +) -> list[URLWebMetadataPydantic]: + results: list[URLWebMetadataPydantic] = [] + for tdo in tdos: + response: URLProbeRedirectResponsePair = tdo.response.response + dest = response.destination + web_metadata_object = URLWebMetadataPydantic( + url_id=tdo.url_mapping.url_id, + accessed=dest.status_code != 404, + status_code=dest.status_code, + content_type=dest.content_type, + error_message=dest.error + ) + results.append(web_metadata_object) + return results diff --git a/src/core/tasks/url/operators/probe/core.py b/src/core/tasks/url/operators/probe/core.py index 1c961155..4f38c1d9 100644 --- a/src/core/tasks/url/operators/probe/core.py +++ b/src/core/tasks/url/operators/probe/core.py @@ -1,18 +1,25 @@ from typing import final + from typing_extensions import override from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.probe.convert import convert_tdo_to_web_metadata_list -from src.core.tasks.url.operators.probe.filter import filter_non_redirect_tdos, filter_redirect_tdos +from src.core.tasks.url.operators.probe.convert import convert_tdo_to_web_metadata_list, \ + 
convert_tdos_with_functional_equivalents_to_web_metadata_list +from src.core.tasks.url.operators.probe.filter import filter_non_redirect_tdos, filter_redirect_tdos, \ + filter_functionally_equivalent_urls +from src.core.tasks.url.operators.probe.models.subsets import RedirectTDOSubsets +from src.core.tasks.url.operators.probe.models.upsert_functional_equivalents import URLFunctionalEquivalentsUpsertModel from src.core.tasks.url.operators.probe.queries.insert_redirects.query import InsertRedirectsQueryBuilder from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic -from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.enums import TaskType +from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.external.url_request.core import URLRequestInterface + @final class URLProbeTaskOperator(URLTaskOperatorBase): @@ -36,7 +43,7 @@ async def meets_task_prerequisites(self) -> bool: return await self.has_urls_without_probe() async def get_urls_without_probe(self) -> list[URLProbeTDO]: - url_mappings: list[URLMapping] = await self.adb_client.run_query_builder( + url_mappings: list[FullURLMapping] = await self.adb_client.run_query_builder( GetURLsWithoutProbeQueryBuilder() ) return [URLProbeTDO(url_mapping=url_mapping) for url_mapping in url_mappings] @@ -57,26 +64,76 @@ async def probe_urls(self, tdos: list[URLProbeTDO]) -> None: URLProbeTDO.response """ url_to_tdo: dict[str, URLProbeTDO] 
= { - tdo.url_mapping.url: tdo for tdo in tdos + tdo.url_mapping.full_url.id_form: tdo for tdo in tdos } responses = await self.url_request_interface.probe_urls( - urls=[tdo.url_mapping.url for tdo in tdos] + urls=[tdo.url_mapping.full_url for tdo in tdos] ) # Re-associate the responses with the URL mappings for response in responses: - tdo = url_to_tdo[response.original_url] + tdo = url_to_tdo[response.original_url.id_form] tdo.response = response async def update_database(self, tdos: list[URLProbeTDO]) -> None: - non_redirect_tdos = filter_non_redirect_tdos(tdos) + none_tdos: list[URLProbeTDO] = [ + tdo for tdo in tdos if tdo.response is None + ] + await self.upload_none_errors(none_tdos) + + non_error_tdos = [ + tdo for tdo in tdos if tdo.response is not None + ] + + non_redirect_tdos = filter_non_redirect_tdos(non_error_tdos) web_metadata_objects: list[URLWebMetadataPydantic] = convert_tdo_to_web_metadata_list(non_redirect_tdos) await self.adb_client.bulk_upsert(web_metadata_objects) - redirect_tdos: list[URLProbeTDO] = filter_redirect_tdos(tdos) + redirect_tdos: list[URLProbeTDO] = filter_redirect_tdos(non_error_tdos) + + # Filter redirects into true redirects and functional equivalents + redirect_subsets: RedirectTDOSubsets = filter_functionally_equivalent_urls(redirect_tdos) + + await self._insert_true_redirects(redirect_subsets.true_redirects) - query_builder = InsertRedirectsQueryBuilder(tdos=redirect_tdos) - await self.adb_client.run_query_builder(query_builder) + await self._update_functional_equivalents(redirect_subsets.functional_equivalents) + + async def upload_none_errors( + self, + tdos: list[URLProbeTDO] + ) -> None: + error_url_ids: list[int] = [tdo.url_mapping.url_id for tdo in tdos] + task_errors = [ + URLTaskErrorSmall( + url_id=url_id, + error="TDO response is None" + ) + for url_id in error_url_ids + ] + await self.add_task_errors(task_errors) + + + async def _insert_true_redirects(self, tdos: list[URLProbeTDO]) -> None: + await 
self.adb_client.run_query_builder( + InsertRedirectsQueryBuilder(tdos=tdos) + ) + async def _update_functional_equivalents(self, tdos: list[URLProbeTDO]) -> None: + # For non-true redirects, treat the redirected URL as the true URL and update database + url_updates = [ + URLFunctionalEquivalentsUpsertModel( + id=tdo.url_mapping.url_id, + url=tdo.response.response.destination.url.without_scheme.rstrip('/'), + trailing_slash=tdo.response.response.destination.url.without_scheme.endswith('/') + ) + for tdo in tdos + ] + await self.adb_client.bulk_update(url_updates) + # For these URLs, also update web metadata + func_equiv_web_metadata_objects: list[URLWebMetadataPydantic] = \ + convert_tdos_with_functional_equivalents_to_web_metadata_list( + tdos + ) + await self.adb_client.bulk_upsert(func_equiv_web_metadata_objects) async def has_urls_without_probe(self) -> bool: return await self.adb_client.run_query_builder( diff --git a/src/core/tasks/url/operators/probe/filter.py b/src/core/tasks/url/operators/probe/filter.py index 4a129676..2f9313e8 100644 --- a/src/core/tasks/url/operators/probe/filter.py +++ b/src/core/tasks/url/operators/probe/filter.py @@ -1,8 +1,30 @@ +from src.core.tasks.url.operators.probe.models.subsets import RedirectTDOSubsets from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair +from src.util.models.full_url import FullURL def filter_non_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: return [tdo for tdo in tdos if not tdo.response.is_redirect] def filter_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: - return [tdo for tdo in tdos if tdo.response.is_redirect] \ No newline at end of file + return [tdo for tdo in tdos if tdo.response.is_redirect] + +def filter_functionally_equivalent_urls(tdos: list[URLProbeTDO]) -> RedirectTDOSubsets: + true_redirects: list[URLProbeTDO] = [] + functional_equivalents: list[URLProbeTDO] = [] + 
for tdo in tdos: + og_url: FullURL = tdo.url_mapping.full_url + response: URLProbeRedirectResponsePair = tdo.response.response + redirect_url: FullURL = response.destination.url + + if og_url.id_form != redirect_url.id_form: + true_redirects.append(tdo) + # Otherwise, they are functional equivalents. + else: + functional_equivalents.append(tdo) + + return RedirectTDOSubsets( + true_redirects=true_redirects, + functional_equivalents=functional_equivalents + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/models/__init__.py b/src/core/tasks/url/operators/probe/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/models/subsets.py b/src/core/tasks/url/operators/probe/models/subsets.py new file mode 100644 index 00000000..8cad6434 --- /dev/null +++ b/src/core/tasks/url/operators/probe/models/subsets.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO + + +class RedirectTDOSubsets(BaseModel): + true_redirects: list[URLProbeTDO] + functional_equivalents: list[URLProbeTDO] diff --git a/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py b/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py new file mode 100644 index 00000000..434f43af --- /dev/null +++ b/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py @@ -0,0 +1,22 @@ +from pydantic import BaseModel + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + + +class URLFunctionalEquivalentsUpsertModel(BulkUpsertableModel): + + @classmethod + def id_field(cls) -> str: + return "id" + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URL + + id: int + url: str + trailing_slash: bool + diff --git 
a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py index eb0597ba..80d58110 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py @@ -1,10 +1,11 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.util.models.full_url import FullURL +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme def convert_url_response_mapping_to_web_metadata_list( @@ -23,23 +24,15 @@ def convert_url_response_mapping_to_web_metadata_list( results.append(web_metadata_object) return results - -def convert_to_url_mappings(url_exists_results: list[UrlExistsResult]) -> list[URLMapping]: - return [ - URLMapping( - url=url_exists_result.url, - url_id=url_exists_result.url_id - ) for url_exists_result in url_exists_results - ] - - -def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: - results = [] +def convert_to_url_insert_models(urls: list[FullURL]) -> list[URLInsertModel]: + results: list[URLInsertModel] = [] for url in urls: results.append( URLInsertModel( - url=url, - source=URLSource.REDIRECT + url=url.without_scheme.rstrip('/'), + scheme=url.scheme, + source=URLSource.REDIRECT, + trailing_slash=url.without_scheme.endswith('/') ) ) return results diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py 
b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py index 3de66e85..1f6d83e5 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py @@ -1,5 +1,4 @@ from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py deleted file mode 100644 index 1f36893d..00000000 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py +++ /dev/null @@ -1,14 +0,0 @@ -from src.db.dtos.url.mapping import URLMapping - - -def filter_new_dest_urls( - url_mappings_in_db: list[URLMapping], - all_dest_urls: list[str] -) -> list[str]: - extant_destination_urls: set[str] = set([url_mapping.url for url_mapping in url_mappings_in_db]) - new_dest_urls: list[str] = [ - url - for url in all_dest_urls - if url not in extant_destination_urls - ] - return new_dest_urls \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py index 53f2b2e1..3f83e941 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py @@ -1,15 +1,16 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.models.full_url import FullURL def map_url_mappings_to_probe_responses( - url_mappings: list[URLMapping], - url_to_probe_responses: 
dict[str, URLProbeResponse] + url_mappings: list[FullURLMapping], + url_to_probe_responses: dict[FullURL, URLProbeResponse] ) -> list[URLResponseMapping]: results = [] for url_mapping in url_mappings: - response = url_to_probe_responses[url_mapping.url] + response = url_to_probe_responses[url_mapping.full_url] results.append( URLResponseMapping( url_mapping=url_mapping, diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py new file mode 100644 index 00000000..c5b26c24 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.util.models.full_url import FullURL + + +class DestinationURLSubsets(BaseModel): + new_urls: list[FullURL] + exist_with_alterations: list[FullURL] + exist_as_is: list[FullURL] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py index efbd5db8..fd90ab65 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py @@ -1,9 +1,10 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.external.url_request.probe.models.response import URLProbeResponse class URLResponseMapping(BaseModel): - url_mapping: URLMapping + url_mapping: FullURLMapping response: URLProbeResponse \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py index 0ba70c47..8dd4f693 100644 --- 
a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py @@ -1,14 +1,15 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.probe.queries.insert_redirects.extract import extract_response_pairs -from src.core.tasks.url.operators.probe.queries.insert_redirects.filter import filter_new_dest_urls from src.core.tasks.url.operators.probe.queries.insert_redirects.request_manager import InsertRedirectsRequestManager +from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse -from src.util.url_mapper import URLMapper +from src.util.models.full_url import FullURL +from src.util.url_mapper_.full import FullURLMapper class InsertRedirectsQueryBuilder(QueryBuilderBase): @@ -19,7 +20,7 @@ def __init__( super().__init__() self.tdos = tdos self.source_url_mappings = [tdo.url_mapping for tdo in self.tdos] - self._mapper = URLMapper(self.source_url_mappings) + self._mapper = FullURLMapper(self.source_url_mappings) self._response_pairs: list[URLProbeRedirectResponsePair] = extract_response_pairs(self.tdos) @@ -27,12 +28,12 @@ def __init__( pair.destination for pair in self._response_pairs ] - self._destination_urls: list[str] = [ + self._destination_urls: list[FullURL] = [ response.url for response in self._destination_probe_responses ] - self._destination_url_to_probe_response_mapping: dict[str, URLProbeResponse] = { + self._destination_url_to_probe_response_mapping: dict[FullURL, URLProbeResponse] = { response.url: response for response in 
self._destination_probe_responses } @@ -50,29 +51,39 @@ async def run(self, session: AsyncSession) -> None: session=session ) - # Get all destination URLs already in the database - dest_url_mappings_in_db: list[URLMapping] = await rm.get_url_mappings_in_db( + url_exist_results: list[URLExistsResult] = await rm.check_if_urls_exist_in_db( urls=self._destination_urls ) - # Filter out to only have those URLs that are new in the database - new_dest_urls: list[str] = filter_new_dest_urls( - url_mappings_in_db=dest_url_mappings_in_db, - all_dest_urls=self._destination_urls - ) + # Two Options: + # - URLs that do not exist in any form in the database + # - URLs that exist as-is or in slightly modified version (url scheme or trailing slash differs) + new_urls: list[FullURL] = [] + extant_url_mappings: list[FullURLMapping] = [] + for result in url_exist_results: + if not result.exists: + new_urls.append(result.query_url) + else: + extant_url_mappings.append( + FullURLMapping( + full_url=result.query_url, + url_id=result.url_id + ) + ) # Add the new URLs - new_dest_url_mappings: list[URLMapping] = await rm.insert_new_urls( - urls=new_dest_urls + new_dest_url_mappings: list[FullURLMapping] = await rm.insert_new_urls( + urls=new_urls ) - all_dest_url_mappings: list[URLMapping] = dest_url_mappings_in_db + new_dest_url_mappings - self._mapper.add_mappings(all_dest_url_mappings) + all_url_mappings: list[FullURLMapping] = extant_url_mappings + new_dest_url_mappings + + self._mapper.add_mappings(all_url_mappings) # Add web metadata for new URLs await rm.add_web_metadata( - all_dest_url_mappings=all_dest_url_mappings, + all_dest_url_mappings=all_url_mappings, dest_url_to_probe_response_mappings=self._destination_url_to_probe_response_mapping, tdos=self.tdos ) diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py index 35dfded5..45eaa8e3 100644 --- 
a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py @@ -3,23 +3,23 @@ from sqlalchemy import select, tuple_, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.probe.queries.insert_redirects.convert import convert_to_url_mappings, \ - convert_to_url_insert_models, convert_tdo_to_url_response_mappings, \ +from src.core.tasks.url.operators.probe.queries.insert_redirects.convert import convert_to_url_insert_models, \ + convert_tdo_to_url_response_mappings, \ convert_url_response_mapping_to_web_metadata_list from src.core.tasks.url.operators.probe.queries.insert_redirects.map import map_url_mappings_to_probe_responses from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult +from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult from src.core.tasks.url.operators.probe.queries.urls.exist.query import URLsExistInDBQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL -from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse -from src.util.url_mapper import URLMapper +from src.util.models.full_url import FullURL +from src.util.url_mapper_.full import 
FullURLMapper class InsertRedirectsRequestManager: @@ -27,24 +27,23 @@ class InsertRedirectsRequestManager: def __init__(self, session: AsyncSession): self.session = session - async def get_url_mappings_in_db( + async def check_if_urls_exist_in_db( self, - urls: list[str], - ): - results: list[UrlExistsResult] = await URLsExistInDBQueryBuilder( - urls=urls + urls: list[FullURL], + ) -> list[URLExistsResult]: + results: list[URLExistsResult] = await URLsExistInDBQueryBuilder( + full_urls=urls ).run(self.session) - extant_urls = [result for result in results if result.exists] - return convert_to_url_mappings(extant_urls) + return results - async def insert_new_urls(self, urls: list[str]) -> list[URLMapping]: + async def insert_new_urls(self, urls: list[FullURL]) -> list[FullURLMapping]: if len(urls) == 0: return [] deduplicated_urls = list(set(urls)) insert_models = convert_to_url_insert_models(deduplicated_urls) url_ids = await sh.bulk_insert(self.session, models=insert_models, return_ids=True) url_mappings = [ - URLMapping(url=url, url_id=url_id) + FullURLMapping(full_url=url, url_id=url_id) for url, url_id in zip(deduplicated_urls, url_ids) ] @@ -52,8 +51,8 @@ async def insert_new_urls(self, urls: list[str]) -> list[URLMapping]: async def add_web_metadata( self, - all_dest_url_mappings: list[URLMapping], - dest_url_to_probe_response_mappings: dict[str, URLProbeResponse], + all_dest_url_mappings: list[FullURLMapping], + dest_url_to_probe_response_mappings: dict[FullURL, URLProbeResponse], tdos: list[URLProbeTDO], ) -> None: dest_url_response_mappings = map_url_mappings_to_probe_responses( @@ -72,7 +71,7 @@ async def add_web_metadata( async def add_redirect_links( self, response_pairs: list[URLProbeRedirectResponsePair], - mapper: URLMapper + mapper: FullURLMapper ) -> None: # Get all existing links and exclude link_tuples: list[tuple[int, int]] = [] diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/model.py 
class URLExistsResult(BaseModel):
    """Outcome of looking up one queried URL in the database.

    One instance is produced per queried URL, whether or not a row matched.
    """

    class Config:
        # The fields below use the project class FullURL, so pydantic has to
        # be told to accept arbitrary (non-pydantic) types.
        arbitrary_types_allowed = True

    # The URL that was looked up.
    query_url: FullURL
    # URL of the matched database row; None when no row matched.
    db_url: FullURL | None
    # Primary key of the matched database row; None when no row matched.
    url_id: int | None

    @property
    def exists(self) -> bool:
        """Return True when a database row matched ``query_url``."""
        return self.url_id is not None

    @property
    def urls_match(self) -> bool:
        """Return True when the queried URL and the stored URL share an id form.

        Returns False when no row matched: ``db_url`` is None in that case
        and there is nothing to compare against.
        """
        if self.db_url is None:
            return False
        return self.query_url.id_form == self.db_url.id_form


class URLsExistInDBQueryBuilder(QueryBuilderBase):
    """Checks if URLs exist in the database."""

    def __init__(self, full_urls: list[FullURL]):
        """Store the URLs to look up together with their ``id_form`` values.

        Args:
            full_urls: URLs to look up; one result is returned per entry.
        """
        super().__init__()
        self.full_urls = full_urls
        # NOTE(review): id_form is presumably the scheme-less, slash-less
        # identity of a URL -- confirm against FullURL.
        self.id_form_urls = [
            url.id_form
            for url in full_urls
        ]

    async def run(self, session: AsyncSession) -> list[URLExistsResult]:
        """Look up every queried URL and report whether a row matches it.

        Args:
            session: Session the lookup query is executed on.

        Returns:
            One URLExistsResult per entry of ``full_urls``, in the same
            order. ``url_id`` and ``db_url`` are None for URLs without a
            matching row.
        """
        # Nothing to look up: skip the round trip (and the empty IN-list).
        if not self.full_urls:
            return []

        # Compare against the stored URL with trailing slashes trimmed.
        norm_url = func.rtrim(URL.url, '/').label("norm_url")

        query = select(
            URL.id,
            norm_url
        ).where(
            norm_url.in_(self.id_form_urls)
        )
        db_mappings = await sh.mappings(session, query=query)

        url_to_id_map: dict[str, int] = {
            row["norm_url"]: row["id"]
            for row in db_mappings
        }
        id_to_db_url_map: dict[int, FullURL] = {
            row["id"]: FullURL(row["norm_url"])
            for row in db_mappings
        }
        results: list[URLExistsResult] = []
        for full_url in self.full_urls:
            url_id: int | None = url_to_id_map.get(full_url.id_form)
            # url_id is None for unmatched URLs, which maps to db_url=None.
            db_url: FullURL | None = id_to_db_url_map.get(url_id)
            results.append(
                URLExistsResult(
                    query_url=full_url,
                    db_url=db_url,
                    url_id=url_id
                )
            )

        return results
src.db.queries.base.builder import QueryBuilderBase +from src.util.models.full_url import FullURL @final class GetURLsWithoutProbeQueryBuilder(QueryBuilderBase): @override - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[FullURLMapping]: query = ( select( URL.id.label("url_id"), - URL.full_url.label("url") + URL.full_url ) .outerjoin( URLWebMetadata, @@ -36,8 +36,8 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: ) db_mappings = await sh.mappings(session, query=query) return [ - URLMapping( + FullURLMapping( url_id=mapping["url_id"], - url=clean_url(mapping["url"]) + full_url=FullURL(mapping["full_url"]) ) for mapping in db_mappings ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/tdo.py b/src/core/tasks/url/operators/probe/tdo.py index 5208fd80..0fcb806c 100644 --- a/src/core/tasks/url/operators/probe/tdo.py +++ b/src/core/tasks/url/operators/probe/tdo.py @@ -1,9 +1,12 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper class URLProbeTDO(BaseModel): - url_mapping: URLMapping + class Config: + arbitrary_types_allowed = True + + url_mapping: FullURLMapping response: URLProbeResponseOuterWrapper | None = None diff --git a/src/core/tasks/url/operators/root_url/convert.py b/src/core/tasks/url/operators/root_url/convert.py index 405cbc49..1c7a3cdc 100644 --- a/src/core/tasks/url/operators/root_url/convert.py +++ b/src/core/tasks/url/operators/root_url/convert.py @@ -1,17 +1,17 @@ from src.core.tasks.url.operators.root_url.extract import extract_root_url from src.core.tasks.url.operators.root_url.models.root_mapping import URLRootURLMapping -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from 
src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper def convert_to_flag_root_url_pydantic(url_ids: list[int]) -> list[FlagRootURLPydantic]: return [FlagRootURLPydantic(url_id=url_id) for url_id in url_ids] -def convert_to_url_root_url_mapping(url_mappings: list[URLMapping]) -> list[URLRootURLMapping]: +def convert_to_url_root_url_mapping(url_mappings: list[SimpleURLMapping]) -> list[URLRootURLMapping]: return [ URLRootURLMapping( url=mapping.url, @@ -22,18 +22,19 @@ def convert_to_url_root_url_mapping(url_mappings: list[URLMapping]) -> list[URLR def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: return [ URLInsertModel( - url=url, - source=URLSource.ROOT_URL + url=url.rstrip('/'), + source=URLSource.ROOT_URL, + trailing_slash=url.endswith('/') ) for url in urls ] def convert_to_root_url_links( - root_db_mappings: list[URLMapping], - branch_db_mappings: list[URLMapping], + root_db_mappings: list[SimpleURLMapping], + branch_db_mappings: list[SimpleURLMapping], url_root_url_mappings: list[URLRootURLMapping] ) -> list[LinkURLRootURLPydantic]: - root_mapper = URLMapper(root_db_mappings) - branch_mapper = URLMapper(branch_db_mappings) + root_mapper = SimpleURLMapper(root_db_mappings) + branch_mapper = SimpleURLMapper(branch_db_mappings) results: list[LinkURLRootURLPydantic] = [] for url_root_url_mapping in url_root_url_mappings: diff --git a/src/core/tasks/url/operators/root_url/core.py b/src/core/tasks/url/operators/root_url/core.py index e32654da..ece5929f 100644 --- a/src/core/tasks/url/operators/root_url/core.py +++ b/src/core/tasks/url/operators/root_url/core.py @@ -11,12 +11,12 @@ from 
src.core.tasks.url.operators.root_url.queries.lookup.response import LookupRootsURLResponse from src.core.tasks.url.operators.root_url.queries.prereq import CheckPrereqsForRootURLTaskQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper @final @@ -37,14 +37,14 @@ def task_type(self) -> TaskType: @override async def inner_task_logic(self) -> None: - all_task_mappings: list[URLMapping] = await self._get_urls_for_root_url_task() + all_task_mappings: list[SimpleURLMapping] = await self._get_urls_for_root_url_task() await self.link_urls_to_task( url_ids=[mapping.url_id for mapping in all_task_mappings] ) # Get the Root URLs for all URLs - mapper = URLMapper(all_task_mappings) + mapper = SimpleURLMapper(all_task_mappings) # -- Identify and Derive Root URLs -- @@ -65,7 +65,7 @@ async def inner_task_logic(self) -> None: for response in derived_root_url_lookup_responses if response.url_id is None ] - new_derived_root_url_mappings: list[URLMapping] = await self._add_new_urls(derived_root_urls_not_in_db) + new_derived_root_url_mappings: list[SimpleURLMapping] = await self._add_new_urls(derived_root_urls_not_in_db) # Add these to the mapper mapper.add_mappings(new_derived_root_url_mappings) @@ -105,7 +105,7 @@ async def inner_task_logic(self) -> None: async def _add_root_url_links( self, - mapper: URLMapper, + mapper: SimpleURLMapper, root_url_mappings: list[URLRootURLMapping], ): # For all task URLs that are not root URLs (i.e. 
'branch' URLs): @@ -115,8 +115,8 @@ async def _add_root_url_links( branch_urls: list[str] = [mapping.url for mapping in root_url_mappings] root_urls: list[str] = [mapping.root_url for mapping in root_url_mappings] - root_url_db_mappings: list[URLMapping] = await self._lookup_root_urls(root_urls) - task_url_db_mappings: list[URLMapping] = mapper.get_mappings_by_url(branch_urls) + root_url_db_mappings: list[SimpleURLMapping] = await self._lookup_root_urls(root_urls) + task_url_db_mappings: list[SimpleURLMapping] = mapper.get_mappings_by_url(branch_urls) links: list[LinkURLRootURLPydantic] = convert_to_root_url_links( root_db_mappings=root_url_db_mappings, @@ -131,7 +131,7 @@ async def _flag_root_urls( ): await self._flag_as_root_urls(url_ids) - async def _get_urls_for_root_url_task(self) -> list[URLMapping]: + async def _get_urls_for_root_url_task(self) -> list[SimpleURLMapping]: builder = GetURLsForRootURLTaskQueryBuilder() return await self.adb_client.run_query_builder(builder) @@ -139,15 +139,15 @@ async def _lookup_root_urls(self, urls: list[str]) -> list[LookupRootsURLRespons builder = LookupRootURLsQueryBuilder(urls=list(set(urls))) return await self.adb_client.run_query_builder(builder) - async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]: + async def _add_new_urls(self, urls: list[str]) -> list[SimpleURLMapping]: if len(urls) == 0: return [] insert_models: list[URLInsertModel] = convert_to_url_insert_models(urls) url_ids: list[int] = await self.adb_client.bulk_insert(insert_models, return_ids=True) - mappings: list[URLMapping] = [] + mappings: list[SimpleURLMapping] = [] for url, url_id in zip(urls, url_ids): mappings.append( - URLMapping( + SimpleURLMapping( url=url, url_id=url_id ) diff --git a/src/core/tasks/url/operators/root_url/extract.py b/src/core/tasks/url/operators/root_url/extract.py index 9cb05c5a..67a66c6f 100644 --- a/src/core/tasks/url/operators/root_url/extract.py +++ b/src/core/tasks/url/operators/root_url/extract.py @@ -5,4 
def extract_root_url(url: str) -> str:
    """Return the root (network location) of *url*.

    Args:
        url: A URL with or without a leading ``scheme://``.

    Returns:
        The netloc part of the URL, e.g. ``"example.com"`` for
        ``"example.com/a/b"``.
    """
    # A URL that already carries a scheme parses correctly as-is. Prefixing
    # it again would make urlparse report the scheme ("https:") as the netloc.
    parsed_url: ParseResult = urlparse(url)
    if not (parsed_url.scheme and parsed_url.netloc):
        # URLs in DB should not have HTTPS -- add to enable url parse to
        # function properly (without "//" the host is parsed as a path).
        parsed_url = urlparse(f"https://{url}")
    root_url = parsed_url.netloc
    return root_url.rstrip("/")
--git a/src/core/tasks/url/operators/screenshot/core.py b/src/core/tasks/url/operators/screenshot/core.py index 96627ab8..2afea9ed 100644 --- a/src/core/tasks/url/operators/screenshot/core.py +++ b/src/core/tasks/url/operators/screenshot/core.py @@ -8,7 +8,7 @@ from src.core.tasks.url.operators.screenshot.queries.get import GetURLsForScreenshotTaskQueryBuilder from src.core.tasks.url.operators.screenshot.queries.prereq import URLsForScreenshotTaskPrerequisitesQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.url.screenshot.pydantic import URLScreenshotPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall @@ -31,7 +31,7 @@ async def meets_task_prerequisites(self) -> bool: URLsForScreenshotTaskPrerequisitesQueryBuilder() ) - async def get_urls_without_screenshot(self) -> list[URLMapping]: + async def get_urls_without_screenshot(self) -> list[SimpleURLMapping]: return await self.adb_client.run_query_builder( GetURLsForScreenshotTaskQueryBuilder() ) @@ -47,7 +47,7 @@ async def upload_errors(self, outcomes: list[URLScreenshotOutcome]) -> None: await self.add_task_errors(insert_models) async def inner_task_logic(self) -> None: - url_mappings: list[URLMapping] = await self.get_urls_without_screenshot() + url_mappings: list[SimpleURLMapping] = await self.get_urls_without_screenshot() await self.link_urls_to_task( url_ids=[url_mapping.url_id for url_mapping in url_mappings] ) diff --git a/src/core/tasks/url/operators/screenshot/get.py b/src/core/tasks/url/operators/screenshot/get.py index 7c0d6a42..7598c43e 100644 --- a/src/core/tasks/url/operators/screenshot/get.py +++ b/src/core/tasks/url/operators/screenshot/get.py @@ -1,12 +1,12 @@ from src.core.tasks.url.operators.screenshot.models.outcome import URLScreenshotOutcome -from src.db.dtos.url.mapping import 
URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.external.url_request.dtos.screenshot_response import URLScreenshotResponse from src.external.url_request.screenshot_.core import get_screenshots -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper -async def get_url_screenshots(mappings: list[URLMapping]) -> list[URLScreenshotOutcome]: - mapper = URLMapper(mappings) +async def get_url_screenshots(mappings: list[SimpleURLMapping]) -> list[URLScreenshotOutcome]: + mapper = SimpleURLMapper(mappings) responses: list[URLScreenshotResponse] = await get_screenshots( urls=mapper.get_all_urls() ) diff --git a/src/core/tasks/url/operators/screenshot/queries/get.py b/src/core/tasks/url/operators/screenshot/queries/get.py index e2dd94df..f3bf2839 100644 --- a/src/core/tasks/url/operators/screenshot/queries/get.py +++ b/src/core/tasks/url/operators/screenshot/queries/get.py @@ -1,18 +1,18 @@ -from typing import Any, Sequence +from typing import Sequence from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.screenshot.constants import TASK_URL_LIMIT from src.core.tasks.url.operators.screenshot.queries.cte import URLScreenshotPrerequisitesCTEContainer -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh class GetURLsForScreenshotTaskQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: cte = URLScreenshotPrerequisitesCTEContainer() query = select( @@ -22,4 +22,4 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - return [URLMapping(**mapping) for mapping in 
mappings] + return [SimpleURLMapping(**mapping) for mapping in mappings] diff --git a/src/core/tasks/url/operators/submit_meta_urls/core.py b/src/core/tasks/url/operators/submit_meta_urls/core.py index e06901da..ae41d56b 100644 --- a/src/core/tasks/url/operators/submit_meta_urls/core.py +++ b/src/core/tasks/url/operators/submit_meta_urls/core.py @@ -3,7 +3,7 @@ from src.core.tasks.url.operators.submit_meta_urls.queries.prereq import \ MeetsMetaURLSSubmissionPrerequisitesQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.url.ds_meta_url.pydantic import URLDSMetaURLPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall @@ -11,7 +11,7 @@ from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest from src.external.pdap.impl.meta_urls.response import SubmitMetaURLsResponse -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper class SubmitMetaURLsTaskOperator(URLTaskOperatorBase): @@ -38,15 +38,15 @@ async def inner_task_logic(self) -> None: GetMetaURLsForSubmissionQueryBuilder() ) - url_mappings: list[URLMapping] = [ - URLMapping( + url_mappings: list[SimpleURLMapping] = [ + SimpleURLMapping( url=request.url, url_id=request.url_id, ) for request in requests ] - mapper = URLMapper(url_mappings) + mapper = SimpleURLMapper(url_mappings) await self.link_urls_to_task(mapper.get_all_ids()) diff --git a/src/core/tasks/url/operators/validate/queries/insert.py b/src/core/tasks/url/operators/validate/queries/insert.py index 31bdfa74..00dc36ac 100644 --- a/src/core/tasks/url/operators/validate/queries/insert.py +++ b/src/core/tasks/url/operators/validate/queries/insert.py @@ -4,14 +4,14 @@ from sqlalchemy.ext.asyncio import AsyncSession 
from src.core.tasks.url.operators.validate.queries.get.models.response import GetURLsForAutoValidationResponse +from src.db.helpers.session import session_helper as sh from src.db.models.impl.flag.auto_validated.pydantic import FlagURLAutoValidatedPydantic from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.impl.url.core.pydantic.upsert import URLUpsertModel from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh + class InsertURLAutoValidationsQueryBuilder(QueryBuilderBase): diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 87fcb057..d1d093a8 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -171,7 +171,6 @@ async def bulk_update( session: AsyncSession, models: list[Base], ): - # Note, mapping must include primary key await sh.bulk_update(session=session, models=models) @session_manager diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 407cb3f4..966d4bbd 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -1,17 +1,16 @@ from functools import wraps from typing import List -from sqlalchemy import create_engine, update, Select +from sqlalchemy import create_engine, Select from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import sessionmaker, scoped_session, Session -from src.collectors.enums import URLStatus from src.db.config_manager import ConfigManager from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.log.pydantic.info import LogInfo -from src.db.dtos.url.mapping import URLMapping +from 
src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.templates_.base import Base @@ -125,6 +124,7 @@ def insert_url(self, session, url_info: URLInfo) -> int: collector_metadata=url_info.collector_metadata, status=url_info.status, name=url_info.name, + trailing_slash=url_and_scheme.url.endswith('/'), source=url_info.source ) if url_info.created_at is not None: @@ -147,7 +147,7 @@ def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo url_info.batch_id = batch_id try: url_id = self.insert_url(url_info) - url_mappings.append(URLMapping(url_id=url_id, url=url_info.url)) + url_mappings.append(SimpleURLMapping(url_id=url_id, url=url_info.url)) except IntegrityError as e: orig_url_info = self.get_url_info_by_url(url_info.url) duplicate_info = DuplicateInsertInfo( diff --git a/src/db/dtos/url/insert.py b/src/db/dtos/url/insert.py index f3143668..672cbb9f 100644 --- a/src/db/dtos/url/insert.py +++ b/src/db/dtos/url/insert.py @@ -1,10 +1,10 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class InsertURLsInfo(BaseModel): - url_mappings: list[URLMapping] + url_mappings: list[SimpleURLMapping] url_ids: list[int] total_count: int = 0 original_count: int = 0 diff --git a/src/db/dtos/url/mapping_/__init__.py b/src/db/dtos/url/mapping_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/mapping_/full.py b/src/db/dtos/url/mapping_/full.py new file mode 100644 index 00000000..c60f367c --- /dev/null +++ b/src/db/dtos/url/mapping_/full.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel, ConfigDict + +from src.util.models.full_url import FullURL + + +class FullURLMapping(BaseModel): + """Mapping between full URL and url_id""" + model_config = ConfigDict( + 
arbitrary_types_allowed=True, + frozen=True # <- makes it immutable & hashable + ) + + full_url: FullURL + url_id: int \ No newline at end of file diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping_/simple.py similarity index 84% rename from src/db/dtos/url/mapping.py rename to src/db/dtos/url/mapping_/simple.py index d48a4649..ff2e4f6b 100644 --- a/src/db/dtos/url/mapping.py +++ b/src/db/dtos/url/mapping_/simple.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, ConfigDict -class URLMapping(BaseModel): +class SimpleURLMapping(BaseModel): """Mapping between url and url_id.""" model_config = ConfigDict(frozen=True) # <- makes it immutable & hashable diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index 43369ff3..f451f30c 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -52,6 +52,12 @@ async def bulk_upsert( session: AsyncSession, models: list[BulkUpsertableModel], ) -> None: + """Bulk update sqlalchemy models via their pydantic counterparts. + + WARNING: All non-id fields in the model will be updated on conflict. Do not include + attributes in the BulkUpdatableModel unless you intend to update them. + + """ if len(models) == 0: return # Parse models to get sa_model and id_field @@ -205,15 +211,19 @@ async def bulk_update( session: AsyncSession, models: list[BulkUpdatableModel], ): - """Bulk update sqlalchemy models via their pydantic counterparts.""" + """Bulk update sqlalchemy models via their pydantic counterparts. + + WARNING: All non-id fields in the model will be updated. Do not include + attributes in the BulkUpdatableModel unless you intend to update them. 
@hybrid_property
def full_url(self) -> str:
    """Return the URL with its scheme and trailing slash re-attached.

    The ``url`` column holds the URL without scheme; the scheme and the
    trailing-slash flag live in their own columns. The slash is restored
    whether or not a scheme is set, so scheme-less rows do not lose it.
    """
    url: str = self.url if self.scheme is None else f"{self.scheme}://{self.url}"
    if self.trailing_slash:
        url += "/"
    return url

@full_url.expression
def full_url(cls):
    """SQL counterpart of ``full_url``; must mirror the Python getter."""
    with_scheme = cls.scheme + literal("://") + cls.url
    # CASE branches are evaluated in order, so the later branches only see
    # rows that the earlier ones rejected.
    return case(
        (
            (cls.scheme != None) & (cls.trailing_slash == True),  # noqa: E711,E712
            with_scheme + literal("/")
        ),
        (
            cls.scheme != None,  # noqa: E711
            with_scheme
        ),
        (
            cls.trailing_slash == True,  # noqa: E712
            cls.url + literal("/")
        ),
        else_=cls.url
    )
Sequence[ClientResponse]) -> list[str]: @@ -29,7 +30,7 @@ def _extract_redirect_probe_response(cr: ClientResponse) -> URLProbeResponse | N first_url = all_urls[0] return URLProbeResponse( - url=first_url, + url=FullURL(first_url), status_code=HTTPStatus.FOUND.value, content_type=None, error=None, @@ -53,14 +54,14 @@ def _extract_destination_url(cr: ClientResponse) -> str: return str(cr.url) def convert_client_response_to_probe_response( - url: str, + url: FullURL, cr: ClientResponse ) -> URLProbeResponse | URLProbeRedirectResponsePair: error = _extract_error(cr) content_type = _extract_content_type(cr, error=error) if not _has_redirect(cr): return URLProbeResponse( - url=str(cr.url), + url=FullURL(str(cr.url)), status_code=cr.status, content_type=content_type, error=error, @@ -85,7 +86,7 @@ def convert_client_response_to_probe_response( destination_error = _extract_error(destination_cr) destination_content_type = _extract_content_type(destination_cr, error=destination_error) destination_probe_response = URLProbeResponse( - url=destination_url, + url=FullURL(destination_url), status_code=destination_cr.status, content_type=destination_content_type, error=destination_error, @@ -97,7 +98,7 @@ def convert_client_response_to_probe_response( ) def convert_to_error_response( - url: str, + url: FullURL, error: str, status_code: int | None = None ) -> URLProbeResponseOuterWrapper: diff --git a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py index 48009381..120e1b66 100644 --- a/src/external/url_request/probe/core.py +++ b/src/external/url_request/probe/core.py @@ -9,6 +9,7 @@ from src.external.url_request.probe.convert import convert_client_response_to_probe_response, convert_to_error_response from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL from src.util.progress_bar import get_progress_bar_disabled @@ -20,14 +21,14 @@ def __init__( ): self.session = session 
- async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(self, urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: return await tqdm_asyncio.gather( *[self._probe(url) for url in urls], timeout=60 * 10, # 10 minutes, disable=get_progress_bar_disabled() ) - async def _probe(self, url: str) -> URLProbeResponseOuterWrapper: + async def _probe(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: response = await self._head(url) if not response.is_redirect and response.response.status_code == HTTPStatus.OK: @@ -52,9 +53,9 @@ async def _probe(self, url: str) -> URLProbeResponseOuterWrapper: except ClientOSError as e: return convert_to_error_response(url, error=f"Client OS Error: {e.errno}. {str(e)}") - async def _head(self, url: str) -> URLProbeResponseOuterWrapper: + async def _head(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: - async with self.session.head(url, allow_redirects=True) as response: + async with self.session.head(str(url), allow_redirects=True) as response: return URLProbeResponseOuterWrapper( original_url=url, response=convert_client_response_to_probe_response( @@ -74,9 +75,9 @@ async def _head(self, url: str) -> URLProbeResponseOuterWrapper: status_code=e.status ) - async def _get(self, url: str) -> URLProbeResponseOuterWrapper: + async def _get(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: - async with self.session.get(url, allow_redirects=True) as response: + async with self.session.get(str(url), allow_redirects=True) as response: return URLProbeResponseOuterWrapper( original_url=url, response=convert_client_response_to_probe_response( diff --git a/src/external/url_request/probe/models/response.py b/src/external/url_request/probe/models/response.py index 967f1c4f..ad6eb588 100644 --- a/src/external/url_request/probe/models/response.py +++ b/src/external/url_request/probe/models/response.py @@ -1,9 +1,13 @@ from pydantic import BaseModel, Field, model_validator 
+from src.util.models.full_url import FullURL class URLProbeResponse(BaseModel): - url: str + class Config: + arbitrary_types_allowed = True + + url: FullURL status_code: int | None = Field(le=999, ge=100) content_type: str | None error: str | None = None diff --git a/src/external/url_request/probe/models/wrapper.py b/src/external/url_request/probe/models/wrapper.py index 04dbc9c4..27fd7be8 100644 --- a/src/external/url_request/probe/models/wrapper.py +++ b/src/external/url_request/probe/models/wrapper.py @@ -2,10 +2,14 @@ from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.models.full_url import FullURL class URLProbeResponseOuterWrapper(BaseModel): - original_url: str + class Config: + arbitrary_types_allowed = True + + original_url: FullURL response: URLProbeResponse | URLProbeRedirectResponsePair @property diff --git a/src/util/models/full_url.py b/src/util/models/full_url.py new file mode 100644 index 00000000..1118040e --- /dev/null +++ b/src/util/models/full_url.py @@ -0,0 +1,84 @@ +from urllib.parse import urlparse + +from src.util.url import clean_url + + +class FullURL: + __slots__ = ( + "_full_url", + "_scheme", + "_url_without_scheme" + ) + + def __init__( + self, + full_url: str + ): + if not isinstance(full_url, str): + raise ValueError("full_url must be a string") + self._full_url = full_url + self._scheme = None + self._url_without_scheme = None + + @property + def full_url(self) -> str: + return self._full_url + + def __str__(self): + return self.full_url + + def __repr__(self): + return self.id_form + + def __hash__(self): + return hash(self.id_form) + + def __eq__(self, other): + return isinstance(other, FullURL) and self.id_form == other.id_form + + def _set_url_parts(self): + """ + Modifies: + self._scheme + self._url + + """ + parse_result = urlparse(self.full_url) + self._scheme = parse_result.scheme + if 
parse_result.scheme is not None: + self._url_without_scheme = self.full_url.replace(f"{parse_result.scheme}://", "", 1) + else: + self._url_without_scheme = self.full_url + + + @property + def scheme(self) -> str | None: + if self._scheme is None: + self._set_url_parts() + return self._scheme + + @property + def without_scheme(self) -> str: + if self._url_without_scheme is None: + self._set_url_parts() + return self._url_without_scheme + + @property + def id_form(self) -> str: + """Retrieves URL in 'Identification Form' + + These are meant to be used to compare URLs with one another. + + They have the following properties: + No Scheme + No Trailing Slash + Cleaned of fragments and query parameters. + """ + no_scheme: str = self.without_scheme + no_trailing_slash: str = no_scheme.rstrip("/") + clean: str = clean_url(no_trailing_slash) + return clean + + def clean(self) -> str: + return clean_url(self.full_url) + diff --git a/src/util/url.py b/src/util/url.py index ac4f73ca..0fdf7d0b 100644 --- a/src/util/url.py +++ b/src/util/url.py @@ -26,3 +26,9 @@ def get_url_and_scheme( url=url, scheme=None ) + +def remove_url_scheme(url: str) -> str: + parsed = urlparse(url) + if parsed.scheme: + return url.replace(f"{parsed.scheme}://", "", 1) + return url \ No newline at end of file diff --git a/src/util/url_mapper_/__init__.py b/src/util/url_mapper_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/util/url_mapper_/full.py b/src/util/url_mapper_/full.py new file mode 100644 index 00000000..8f6272c2 --- /dev/null +++ b/src/util/url_mapper_/full.py @@ -0,0 +1,49 @@ +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.util.models.full_url import FullURL + + +class FullURLMapper: + + def __init__(self, mappings: list[FullURLMapping]): + self._url_to_id = { + mapping.full_url.id_form: mapping.url_id + for mapping in mappings + } + self._id_to_url = { + mapping.url_id: mapping.full_url + for mapping in mappings + } + + def get_id(self, 
full_url: FullURL) -> int: + return self._url_to_id[full_url.id_form] + + def get_ids(self, full_urls: list[FullURL]) -> list[int]: + return [ + self._url_to_id[full_url.id_form] + for full_url in full_urls + ] + + def get_all_ids(self) -> list[int]: + return list(self._url_to_id.values()) + + def get_all_urls(self) -> list[FullURL]: + return list(self._id_to_url.values()) + + def get_url(self, url_id: int) -> FullURL: + return self._id_to_url[url_id] + + def get_mappings_by_url(self, full_urls: list[FullURL]) -> list[FullURLMapping]: + return [ + FullURLMapping( + url_id=self._url_to_id[full_url.id_form], + full_url=full_url + ) for full_url in full_urls + ] + + def add_mapping(self, mapping: FullURLMapping) -> None: + self._url_to_id[mapping.full_url.id_form] = mapping.url_id + self._id_to_url[mapping.url_id] = mapping.full_url + + def add_mappings(self, mappings: list[FullURLMapping]) -> None: + for mapping in mappings: + self.add_mapping(mapping) \ No newline at end of file diff --git a/src/util/url_mapper.py b/src/util/url_mapper_/simple.py similarity index 72% rename from src/util/url_mapper.py rename to src/util/url_mapper_/simple.py index 3a399d77..2a7f7353 100644 --- a/src/util/url_mapper.py +++ b/src/util/url_mapper_/simple.py @@ -1,9 +1,9 @@ -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping -class URLMapper: +class SimpleURLMapper: - def __init__(self, mappings: list[URLMapping]): + def __init__(self, mappings: list[SimpleURLMapping]): self._url_to_id = { mapping.url: mapping.url_id for mapping in mappings @@ -31,18 +31,18 @@ def get_all_urls(self) -> list[str]: def get_url(self, url_id: int) -> str: return self._id_to_url[url_id] - def get_mappings_by_url(self, urls: list[str]) -> list[URLMapping]: + def get_mappings_by_url(self, urls: list[str]) -> list[SimpleURLMapping]: return [ - URLMapping( + SimpleURLMapping( url_id=self._url_to_id[url], url=url ) for url in urls ] - def add_mapping(self, 
mapping: URLMapping) -> None: + def add_mapping(self, mapping: SimpleURLMapping) -> None: self._url_to_id[mapping.url] = mapping.url_id self._id_to_url[mapping.url_id] = mapping.url - def add_mappings(self, mappings: list[URLMapping]) -> None: + def add_mappings(self, mappings: list[SimpleURLMapping]) -> None: for mapping in mappings: self.add_mapping(mapping) \ No newline at end of file diff --git a/tests/automated/integration/api/annotate/anonymous/test_core.py b/tests/automated/integration/api/annotate/anonymous/test_core.py index 4b747363..d2b9f691 100644 --- a/tests/automated/integration/api/annotate/anonymous/test_core.py +++ b/tests/automated/integration/api/annotate/anonymous/test_core.py @@ -7,7 +7,7 @@ from src.api.endpoints.annotate.all.post.models.name import AnnotationPostNameInfo from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.core.enums import RecordType -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation @@ -34,11 +34,11 @@ async def test_annotate_anonymous( setup_info_1 = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=True ) - url_mapping_1: URLMapping = setup_info_1.url_mapping + url_mapping_1: SimpleURLMapping = setup_info_1.url_mapping setup_info_2: FinalReviewSetupInfo = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=True ) - url_mapping_2: URLMapping = setup_info_2.url_mapping + url_mapping_2: SimpleURLMapping = setup_info_2.url_mapping get_response_1: GetNextURLForAllAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) assert 
get_response_1.next_annotation is not None diff --git a/tests/automated/integration/api/annotate/helpers.py b/tests/automated/integration/api/annotate/helpers.py index 39cfedab..92392ab1 100644 --- a/tests/automated/integration/api/annotate/helpers.py +++ b/tests/automated/integration/api/annotate/helpers.py @@ -1,10 +1,10 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping def check_url_mappings_match( - map_1: URLMapping, - map_2: URLMapping + map_1: SimpleURLMapping, + map_2: SimpleURLMapping ): assert map_1.url_id == map_2.url_id assert map_2.url == map_2.url diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py index c471b6fa..f4181629 100644 --- a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -2,7 +2,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -27,7 +27,7 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with submitted URLs batch_submitted: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - submitted_url_mappings: list[URLMapping] = await dbdc.create_submitted_urls(count=2) + submitted_url_mappings: list[SimpleURLMapping] = await dbdc.create_submitted_urls(count=2) submitted_url_ids: list[int] = [url_mapping.url_id for url_mapping in submitted_url_mappings] await 
dbdc.create_batch_url_links( batch_id=batch_submitted, @@ -39,7 +39,7 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with validated URLs batch_validated: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls( + validated_url_mappings: list[SimpleURLMapping] = await dbdc.create_validated_urls( count=2 ) validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 090896e8..97cd805e 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -3,13 +3,11 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.connect import get_postgres_connection_string +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.data_creator.create import create_batch, create_url_data_sources, create_urls, \ create_batch_url_links, create_validated_flags -from tests.helpers.setup.wipe import wipe_database @pytest.mark.asyncio @@ -25,17 +23,17 @@ async def test_get_batches_aggregated_metrics( adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_error: list[URLMapping] = await create_urls( + url_mappings_error: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.ERROR, count=4, ) - url_mappings_ok: list[URLMapping] = await create_urls( + url_mappings_ok: list[SimpleURLMapping] = await create_urls( 
adb_client=adb_client, status=URLStatus.OK, count=11, ) - url_mappings_all: list[URLMapping] = url_mappings_error + url_mappings_ok + url_mappings_all: list[SimpleURLMapping] = url_mappings_error + url_mappings_ok url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] await create_batch_url_links( adb_client=adb_client, diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index c6ef6e0b..ca05eaa1 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -1,12 +1,11 @@ from datetime import datetime, timedelta -import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ create_url_data_sources @@ -23,7 +22,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_1: list[URLMapping] = await create_urls( + url_mappings_1: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=3, ) @@ -50,13 +49,13 @@ async def test_get_batches_breakdown_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, date_generated=today - timedelta(days=14) ) - error_url_mappings: list[URLMapping] = await create_urls( + error_url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.ERROR, count=4, ) error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] - validated_url_mappings: list[URLMapping] 
= await create_urls( + validated_url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=8, ) diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index da8dccd6..09f687f5 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -2,7 +2,7 @@ import pytest from src.collectors.enums import URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.core import DBDataCreator @@ -20,7 +20,7 @@ async def test_get_backlog_metrics(api_test_helper): # Ensure that multiple days in each month are added to the backlog table, with different values batch_1_id: int = await ddc.create_batch() - url_mappings_1: list[URLMapping] = await ddc.create_urls(count=3) + url_mappings_1: list[SimpleURLMapping] = await ddc.create_urls(count=3) url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] await ddc.create_batch_url_links(url_ids=url_ids_1, batch_id=batch_1_id) submitted_url_ids_1: list[int] = url_ids_1[:2] @@ -39,14 +39,14 @@ async def test_get_backlog_metrics(api_test_helper): ) batch_2_id: int = await ddc.create_batch() - not_relevant_url_mappings_2: list[URLMapping] = await ddc.create_urls(count=6) + not_relevant_url_mappings_2: list[SimpleURLMapping] = await ddc.create_urls(count=6) not_relevant_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in not_relevant_url_mappings_2] await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) await ddc.create_validated_flags( url_ids=not_relevant_url_ids_2[:4], validation_type=URLType.NOT_RELEVANT ) - error_url_mappings_2: list[URLMapping] = await ddc.create_urls( + error_url_mappings_2: list[SimpleURLMapping] = await ddc.create_urls( 
status=URLStatus.ERROR, count=2 ) @@ -62,7 +62,7 @@ async def test_get_backlog_metrics(api_test_helper): ) batch_3_id: int = await ddc.create_batch() - url_mappings_3: list[URLMapping] = await ddc.create_urls(count=12) + url_mappings_3: list[SimpleURLMapping] = await ddc.create_urls(count=12) url_ids_3: list[int] = [url_mapping.url_id for url_mapping in url_mappings_3] await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) await ddc.create_validated_flags( diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index 64ae5ae4..1d8eb947 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -1,10 +1,9 @@ -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta -import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -33,24 +32,24 @@ async def test_get_urls_aggregated_metrics(api_test_helper): strategy=CollectorType.MANUAL, date_generated=today - timedelta(days=1) ) - url_mappings_0: list[URLMapping] = await ddc.create_urls(batch_id=batch_0) + url_mappings_0: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_0) oldest_url_id: int = url_mappings_0[0].url_id batch_1: int = await ddc.create_batch( strategy=CollectorType.MANUAL, ) - url_mappings_1_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) - url_mappings_1_submitted: list[URLMapping] = await ddc.create_submitted_urls(count=2) + url_mappings_1_ok: 
list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) + url_mappings_1_submitted: list[SimpleURLMapping] = await ddc.create_submitted_urls(count=2) url_ids_1_submitted: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1_submitted] await ddc.create_batch_url_links(url_ids=url_ids_1_submitted, batch_id=batch_1) batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, ) - url_mappings_2_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) - url_mappings_2_error: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) - url_mappings_2_validated: list[URLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) - url_mappings_2_not_relevant: list[URLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) + url_mappings_2_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + url_mappings_2_error: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) + url_mappings_2_validated: list[SimpleURLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) + url_mappings_2_not_relevant: list[SimpleURLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] url_ids_2_not_relevant: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_not_relevant] await ddc.create_batch_url_links( diff --git a/tests/automated/integration/api/submit/test_duplicate.py b/tests/automated/integration/api/submit/test_duplicate.py index c1ccfd29..0bef1091 100644 --- a/tests/automated/integration/api/submit/test_duplicate.py +++ b/tests/automated/integration/api/submit/test_duplicate.py @@ -3,7 +3,7 @@ from src.api.endpoints.submit.url.enums import 
URLSubmissionStatus from src.api.endpoints.submit.url.models.request import URLSubmissionRequest from src.api.endpoints.submit.url.models.response import URLSubmissionResponse -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator @@ -13,7 +13,7 @@ async def test_duplicate( api_test_helper: APITestHelper, db_data_creator: DBDataCreator ): - url_mapping: URLMapping = (await db_data_creator.create_urls(count=1))[0] + url_mapping: SimpleURLMapping = (await db_data_creator.create_urls(count=1))[0] response: URLSubmissionResponse = await api_test_helper.request_validator.submit_url( request=URLSubmissionRequest( diff --git a/tests/automated/integration/api/url/by_id/snapshot/test_success.py b/tests/automated/integration/api/url/by_id/snapshot/test_success.py index e3ea9d73..3109706d 100644 --- a/tests/automated/integration/api/url/by_id/snapshot/test_success.py +++ b/tests/automated/integration/api/url/by_id/snapshot/test_success.py @@ -1,6 +1,6 @@ import pytest -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot from tests.automated.integration.api._helpers.RequestValidator import RequestValidator from tests.helpers.api_test_helper import APITestHelper @@ -15,7 +15,7 @@ async def test_get_url_screenshot_success( ddc: DBDataCreator = api_test_helper.db_data_creator rv: RequestValidator = ath.request_validator - url_mapping: URLMapping = (await ddc.create_urls())[0] + url_mapping: SimpleURLMapping = (await ddc.create_urls())[0] url_id: int = url_mapping.url_id url_screenshot = URLScreenshot( diff --git a/tests/automated/integration/db/structure/test_updated_at.py b/tests/automated/integration/db/structure/test_updated_at.py index 281e6ee8..0a4c18a4 100644 --- 
a/tests/automated/integration/db/structure/test_updated_at.py +++ b/tests/automated/integration/db/structure/test_updated_at.py @@ -24,7 +24,9 @@ async def test_updated_at(db_data_creator: DBDataCreator): url_upsert = URLUpsertModel( id=url.id, - name="New Name" + name="New Name", + url=url.url, + trailing_slash=url.trailing_slash, ) await db_data_creator.adb_client.bulk_update([url_upsert]) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 55dbeb76..1d1085a5 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -41,7 +41,8 @@ async def run(self, session: AsyncSession) -> list[int]: status=URLStatus.OK, name=name, description=description, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False, ) session.add(url) await session.flush() diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py index 59b2d77c..7bc33222 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py @@ -11,11 +11,13 @@ async def add_urls(dbc: AsyncDatabaseClient) -> list[int]: insert_models: list[URLInsertModel] = [ URLInsertModel( url=TEST_URL_1, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ), URLInsertModel( url=TEST_URL_2, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) ] return await dbc.bulk_insert(insert_models, return_ids=True) diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py 
b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py index 36b1bcb9..836ee678 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py @@ -72,11 +72,13 @@ async def add_test_urls(adb_client: AsyncDatabaseClient) -> list[int]: url_inserts: list[URLInsertModel] = [ URLInsertModel( url=TEST_URL_1, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ), URLInsertModel( url=TEST_URL_2, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) ] url_ids = await adb_client.bulk_insert(url_inserts, return_ids=True) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py index 2334aa17..a592002f 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py @@ -1,7 +1,7 @@ import pytest from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.core import DBDataCreator @@ -27,7 +27,7 @@ async def test_blacklist( await db_data_creator.link_urls_to_root([url_id], root_url_id=root_url_id) # Create Meta URLs - meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + meta_urls: list[SimpleURLMapping] = await db_data_creator.create_validated_urls( count=3, validation_type=URLType.META_URL ) diff --git 
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py index 10e3f711..7575f37e 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py @@ -5,7 +5,7 @@ from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask @@ -26,7 +26,7 @@ async def test_homepage_match( """ # Create 2 root URLs - root_url_mappings: list[URLMapping] = ( + root_url_mappings: list[SimpleURLMapping] = ( await db_data_creator.create_urls(count=2) ) root_url_ids: list[int] = [url_mapping.url_id for url_mapping in root_url_mappings] @@ -60,7 +60,7 @@ async def test_homepage_match( # Create 2 Meta URLs and agencies for multi agency case - multi_meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + multi_meta_urls: list[SimpleURLMapping] = await db_data_creator.create_validated_urls( count=2, validation_type=URLType.META_URL ) @@ -84,7 +84,7 @@ async def test_homepage_match( assert not await operator.meets_task_prerequisites() # Set up eligible URLs - eligible_urls: list[URLMapping] = await db_data_creator.create_urls( + eligible_urls: list[SimpleURLMapping] = await db_data_creator.create_urls( count=2, ) 
single_url_id: int = eligible_urls[0].url_id diff --git a/tests/automated/integration/tasks/url/impl/html/setup/manager.py b/tests/automated/integration/tasks/url/impl/html/setup/manager.py index 986a9f7e..e01f7b6d 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/manager.py @@ -33,7 +33,8 @@ async def _setup_urls(self) -> list[TestURLHTMLTaskSetupRecord]: url=entry.url_info.url, name=f"Test for {entry.url_info.url}", record_type=RecordType.RESOURCES, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_insert_models.append(url_insert_model) url_ids = await self.adb_client.bulk_insert(url_insert_models, return_ids=True) diff --git a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py index 766a7ca5..e3d39db5 100644 --- a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py +++ b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py @@ -1,6 +1,6 @@ import pytest_asyncio -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.data_creator.core import DBDataCreator @@ -9,7 +9,7 @@ async def url_ids( db_data_creator: DBDataCreator, ) -> list[int]: # Create 2 URLs with compressed HTML - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) + url_mappings: list[SimpleURLMapping] = await db_data_creator.create_urls(count=2) url_ids: list[int] = [url.url_id for url in url_mappings] await db_data_creator.html_data(url_ids=url_ids) return url_ids diff --git a/tests/automated/integration/tasks/url/impl/probe/constants.py 
b/tests/automated/integration/tasks/url/impl/probe/constants.py index 07ebbcc3..93988afb 100644 --- a/tests/automated/integration/tasks/url/impl/probe/constants.py +++ b/tests/automated/integration/tasks/url/impl/probe/constants.py @@ -2,5 +2,5 @@ PATCH_ROOT = "src.external.url_request.core.URLProbeManager" TEST_URL = "www.example.com" -TEST_DEST_URL = "www.example.com/redirect" +TEST_DEST_URL = "https://www.example.com/redirect" TEST_SOURCE = URLSource.COLLECTOR diff --git a/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py b/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py index cc493274..2eb6a5d7 100644 --- a/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py +++ b/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py @@ -1,4 +1,5 @@ from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL class MockURLRequestInterface: @@ -13,10 +14,10 @@ def __init__( responses = response_or_responses self._url_to_response = { - response.original_url: response for response in responses + response.original_url.id_form: response for response in responses } - async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(self, urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: return [ - self._url_to_response[url] for url in urls + self._url_to_response[url.id_form] for url in urls ] diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index b52dce6b..7aeeb1f8 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -30,7 +30,7 @@ async def 
test_url_probe_task_redirect_dest_exists_in_db( ) ) source_url_id = await setup_manager.setup_url(URLStatus.OK) - dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL) + dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL.replace("https://", "")) # Add web metadata for destination URL, to prevent it from being pulled web_metadata = URLWebMetadataPydantic( url_id=dest_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py new file mode 100644 index 00000000..a8cb51f7 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py @@ -0,0 +1,46 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.util.models.full_url import FullURL +from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL +from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_url_probe_task_functional_equivalent( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL: + - is functionally equivalent to the original URL + The existing URL should be updated to the functional equivalent + And web metadata should be added.
+ """ + + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_redirect_probe_response( + redirect_status_code=303, + dest_status_code=303, + dest_content_type=None, + dest_error=None, + redirect_url=FullURL(TEST_URL + "/") + ) + ) + url_id = await setup_manager.setup_url(URLStatus.OK) + await run_task_and_confirm_success(operator) + + urls: list[URL] = await setup_manager.adb_client.get_all(URL) + assert len(urls) == 1 + url: URL = urls[0] + + assert url.url == TEST_URL + assert url.trailing_slash is True + + # Web metadata should be added + web_metadata: list[URLWebMetadata] = await setup_manager.adb_client.get_all(URLWebMetadata) + assert len(web_metadata) == 1 diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py deleted file mode 100644 index 5a66af3d..00000000 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest - -from src.collectors.enums import URLStatus -from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager -from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL -from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager - - -@pytest.mark.asyncio -async def test_url_probe_task_redirect_infinite( - setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager -): - """ - If a URL: - - returns a redirect response to itself - The task should add a link that points to itself - as well as web metadata response to the database URL - """ - - operator = setup_manager.setup_operator( - response_or_responses=setup_manager.setup_redirect_probe_response( - redirect_status_code=303, - dest_status_code=303, - dest_content_type=None, - dest_error=None, - redirect_url=TEST_URL - ) - ) - url_id = await 
setup_manager.setup_url(URLStatus.OK) - run_info = await operator.run_task() - await check_manager.check_url( - url_id=url_id, - expected_status=URLStatus.OK - ) - await check_manager.check_web_metadata( - url_id=url_id, - status_code=303, - content_type=None, - error=None, - accessed=True - ) - redirect_url_id = await check_manager.check_redirect( - source_url_id=url_id, - ) - assert redirect_url_id == url_id diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index bf5dab9f..1dcd98d9 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from src.util.models.full_url import FullURL from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager @@ -30,7 +31,7 @@ async def test_url_probe_task_redirect_two_urls_same_dest( dest_status_code=200, dest_content_type=None, dest_error=None, - source_url="example.com/2", + source_url=FullURL("example.com/2"), ), ] ) diff --git a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py index 50405970..44b5bd54 100644 --- a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py @@ -8,6 +8,7 @@ from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse from src.external.url_request.probe.models.wrapper 
import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL, TEST_DEST_URL, TEST_SOURCE from tests.automated.integration.tasks.url.impl.probe.mocks.url_request_interface import MockURLRequestInterface @@ -28,7 +29,8 @@ async def setup_url( url_insert_model = URLInsertModel( url=url, status=url_status, - source=TEST_SOURCE + source=TEST_SOURCE, + trailing_slash=False ) return ( await self.adb_client.bulk_insert( @@ -60,9 +62,9 @@ def setup_no_redirect_probe_response( url: str = TEST_URL ) -> URLProbeResponseOuterWrapper: return URLProbeResponseOuterWrapper( - original_url=url, + original_url=FullURL(url), response=URLProbeResponse( - url=url, + url=FullURL(url), status_code=status_code, content_type=content_type, error=error @@ -75,8 +77,8 @@ def setup_redirect_probe_response( dest_status_code: int, dest_content_type: str | None, dest_error: str | None, - source_url: str = TEST_URL, - redirect_url: str = TEST_DEST_URL + source_url: FullURL = FullURL(TEST_URL), + redirect_url: FullURL = FullURL(TEST_DEST_URL) ) -> URLProbeResponseOuterWrapper: if redirect_status_code not in (301, 302, 303, 307, 308): raise ValueError('Redirect response must be one of 301, 302, 303, 307, 308') diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py index 7e8af066..75b7f68f 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py @@ -25,7 +25,8 @@ async def test_branch_root_url_in_db( # Add URL that is a root URL, and mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) root_url_id = (await 
operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] root_model_flag_insert = FlagRootURLPydantic( @@ -36,7 +37,8 @@ async def test_branch_root_url_in_db( # Add URL that is a branch of the root URL url_insert_model = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py index 6c00f8f9..a0a43d3c 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py @@ -26,7 +26,8 @@ async def test_branch_root_url_not_in_db( # Add URL that is a branch of a root URL url_insert_model = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py index a6a56c7c..f129b582 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py @@ -23,7 +23,8 @@ async def test_is_root_url( # Add URL that is a root URL url_insert_model = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py 
b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py index be67d23e..6fe57721 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py @@ -23,7 +23,8 @@ async def test_two_branches_one_root_in_db( # Add root URL and mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] root_model_flag_insert = FlagRootURLPydantic( @@ -34,13 +35,15 @@ async def test_two_branches_one_root_in_db( # Add two URLs that are branches of that root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py index 614796e9..8a40a476 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py @@ -26,20 +26,23 @@ async def test_two_branches_one_root_in_db_not_flagged( # Add root URL but do not mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + 
source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] # Add two URLs that are branches of that root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py index f68786b9..8839905b 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py @@ -23,13 +23,15 @@ async def test_two_branches_one_root_in_db_not_flagged( # Add two URLs that are branches of a root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=BRANCH_URL.endswith('/') ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=SECOND_BRANCH_URL.endswith('/') ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py index f65aa40d..9acffd0e 100644 
--- a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py +++ b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py @@ -3,7 +3,7 @@ import pytest from src.core.tasks.url.operators.screenshot.core import URLScreenshotTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.external.url_request.dtos.screenshot_response import URLScreenshotResponse @@ -24,9 +24,9 @@ async def test_core( assert not await operator.meets_task_prerequisites() # Add two URLs to database - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) - screenshot_mapping: URLMapping = url_mappings[0] - error_mapping: URLMapping = url_mappings[1] + url_mappings: list[SimpleURLMapping] = await db_data_creator.create_urls(count=2) + screenshot_mapping: SimpleURLMapping = url_mappings[0] + error_mapping: SimpleURLMapping = url_mappings[1] url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] # Add web metadata for 200 responses diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py index 92287454..08914bed 100644 --- a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py +++ b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py @@ -5,9 +5,8 @@ from pdap_access_manager import ResponseInfo from src.collectors.enums import URLStatus -from src.core.enums import SubmitResponseStatus from src.core.tasks.url.operators.submit_meta_urls.core import SubmitMetaURLsTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from 
src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.ds_meta_url.sqlalchemy import URLDSMetaURL @@ -37,7 +36,7 @@ async def test_submit_meta_urls( # Create validated meta url agency_id: int = (await db_data_creator.create_agencies(count=1))[0] - mapping: URLMapping = (await db_data_creator.create_validated_urls( + mapping: SimpleURLMapping = (await db_data_creator.create_validated_urls( validation_type=URLType.META_URL ))[0] await db_data_creator.link_urls_to_agencies( diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index cbeb207f..6bf7df5f 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -10,7 +10,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.agency.enums import AgencyType from src.db.models.impl.agency.sqlalchemy import Agency @@ -398,8 +398,8 @@ async def create_validated_urls( record_type: RecordType = RecordType.RESOURCES, validation_type: URLType = URLType.DATA_SOURCE, count: int = 1 - ) -> list[URLMapping]: - url_mappings: list[URLMapping] = await self.create_urls( + ) -> list[SimpleURLMapping]: + url_mappings: list[SimpleURLMapping] = await self.create_urls( record_type=record_type, count=count ) @@ -414,8 +414,8 @@ async def create_submitted_urls( self, record_type: RecordType = RecordType.RESOURCES, count: int = 1 - ) -> list[URLMapping]: - url_mappings: list[URLMapping] = await self.create_urls( + ) -> list[SimpleURLMapping]: + url_mappings: list[SimpleURLMapping] = await self.create_urls( record_type=record_type, count=count ) @@ -436,9 +436,9 @@ async def create_urls( collector_metadata: dict | None = None, count: int = 1, batch_id: int | None = None - ) -> list[URLMapping]: + ) 
-> list[SimpleURLMapping]: - url_mappings: list[URLMapping] = await create_urls( + url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=self.adb_client, status=status, source=source, diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 200a34cd..57c9f9da 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -4,7 +4,7 @@ from src.core.enums import BatchStatus, RecordType from src.db import County, Locality, USState from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.batch.pydantic.insert import BatchInsertModel from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic @@ -13,7 +13,7 @@ from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic -from tests.helpers.counter import COUNTER, next_int +from tests.helpers.counter import next_int from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo @@ -37,7 +37,7 @@ async def create_urls( record_type: RecordType | None = RecordType.RESOURCES, collector_metadata: dict | None = None, count: int = 1 -) -> list[URLMapping]: +) -> list[SimpleURLMapping]: urls: list[URLInsertModel] = generate_urls( status=status, source=source, @@ -55,7 +55,7 @@ async def create_urls( ] await adb_client.bulk_insert(record_types) - return [URLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] + return [SimpleURLMapping(url_id=url_id, url=url.url) for 
url_id, url in zip(url_ids, urls)] async def create_validated_flags( adb_client: AsyncDatabaseClient, diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index bee0993f..f1eefce2 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -54,6 +54,7 @@ def generate_urls( source=source, name=f"Example {val}", collector_metadata=collector_metadata, + trailing_slash=False )) return results diff --git a/tests/helpers/data_creator/models/creation_info/url.py b/tests/helpers/data_creator/models/creation_info/url.py index 16c45a0a..67e148c0 100644 --- a/tests/helpers/data_creator/models/creation_info/url.py +++ b/tests/helpers/data_creator/models/creation_info/url.py @@ -2,14 +2,13 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.enums import URLCreationEnum class URLCreationInfo(BaseModel): - url_mappings: list[URLMapping] + url_mappings: list[SimpleURLMapping] outcome: URLCreationEnum annotation_info: Optional[AnnotationInfo] = None diff --git a/tests/helpers/setup/final_review/core.py b/tests/helpers/setup/final_review/core.py index ababae82..a3a3d42c 100644 --- a/tests/helpers/setup/final_review/core.py +++ b/tests/helpers/setup/final_review/core.py @@ -9,7 +9,7 @@ async def setup_for_get_next_url_for_final_review( db_data_creator: DBDataCreator, - annotation_count: Optional[int] = None, + annotation_count: int | None = None, include_user_annotations: bool = True, include_miscellaneous_metadata: bool = True ) -> FinalReviewSetupInfo: diff --git a/tests/helpers/setup/final_review/model.py b/tests/helpers/setup/final_review/model.py index a3e57a3c..1eac963e 100644 --- a/tests/helpers/setup/final_review/model.py +++ 
b/tests/helpers/setup/final_review/model.py @@ -1,12 +1,10 @@ -from typing import Optional - from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class FinalReviewSetupInfo(BaseModel): batch_id: int - url_mapping: URLMapping + url_mapping: SimpleURLMapping user_agency_id: int | None name_suggestion_id: int | None