Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""Add trailing slash column

Revision ID: 7fc6502f1fa3
Revises: ff4e8b2f6348
Create Date: 2025-10-17 18:26:56.756915

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '7fc6502f1fa3'  # unique id of this migration
down_revision: Union[str, None] = 'ff4e8b2f6348'  # parent revision in the migration chain
branch_labels: Union[str, Sequence[str], None] = None  # no branch labels for this revision
depends_on: Union[str, Sequence[str], None] = None  # no cross-branch dependencies


def upgrade() -> None:
    """Normalize trailing slashes in ``urls.url`` into a boolean column.

    Steps, in an order that matters:
      1. Delete known duplicate rows that would collide once slashes are
         stripped.
      2. Add the ``trailing_slash`` boolean column (default ``false``).
      3. Record, per row, whether ``url`` currently ends with ``/``.
      4. Strip trailing slashes from the ``url`` column itself.
      5. Add a CHECK constraint forbidding trailing slashes going forward.
    """
    _remove_duplicates()
    _add_trailing_slash_column()
    _migrate_trailing_slash_to_column()
    _remove_trailing_slash_from_url_column()
    _add_check_constraint_forbidding_trailing_slash_in_url()

def _remove_duplicates() -> None:
    """Delete a fixed set of ``urls`` rows by primary key.

    NOTE(review): these ids were presumably identified offline as rows whose
    ``url`` differs from another row only by a trailing slash and would
    collide once slashes are stripped — confirm against the target dataset;
    this list is specific to one database's contents.
    """
    op.execute(
        """
        DELETE FROM urls
        WHERE id IN (
            23504,
            29401,
            21032,
            23687,
            15760,
            17574,
            17669,
            21382,
            11697,
            18076,
            27764,
            11395,
            17702,
            26857,
            30843,
            21850,
            29471,
            26789,
            19428,
            18452,
            30547,
            24004,
            27857,
            30260,
            26968,
            27065,
            29073,
            21827,
            25615,
            28644,
            24417,
            29801,
            27625,
            15708,
            23517,
            26415,
            26081,
            7478,
            20368,
            19494,
            26624,
            3817,
            3597,
            3568,
            16113,
            24125,
            30625,
            29965,
            23134,
            19207,
            12158,
            3835,
            24730,
            17113,
            29987,
            21452,
            24605,
            5043,
            17237,
            25522,
            11065,
            12387,
            12210,
            11185,
            11961,
            4935,
            24200,
            29028,
            24371,
            28355,
            17620,
            19546,
            3598
        )
        """
    )

def _add_trailing_slash_column() -> None:
    """Add the non-nullable boolean ``trailing_slash`` column to ``urls``.

    The server-side default of ``false`` lets the column be added to a
    populated table without a separate backfill for NOT NULL.
    """
    trailing_slash_column = sa.Column(
        'trailing_slash',
        sa.Boolean(),
        nullable=False,
        server_default=sa.text('false'),
    )
    op.add_column('urls', trailing_slash_column)

def _migrate_trailing_slash_to_column() -> None:
    """Flag each existing row whose ``url`` ends with a slash.

    Uses the PostgreSQL regex match operator (``~``), so the boolean result
    of ``url ~ '/$'`` is assigned directly to the new column.
    """
    statement = """
        UPDATE urls
        SET trailing_slash = url ~ '/$'
        """
    op.execute(statement)

def _remove_trailing_slash_from_url_column() -> None:
    """Strip trailing slashes from ``urls.url`` in place.

    NOTE(review): ``rtrim(url, '/')`` removes *all* trailing slashes, not
    just one, while ``trailing_slash`` only records that at least one was
    present — confirm that collapsing multi-slash URLs is acceptable.
    """
    op.execute(
        """
        UPDATE urls
        SET url = rtrim(url, '/')
        WHERE url like '%/';
        """
    )

def _add_check_constraint_forbidding_trailing_slash_in_url() -> None:
    """Enforce that ``urls.url`` never ends with a slash from now on.

    Uses the PostgreSQL negated regex-match operator (``!~``) in a CHECK
    constraint named ``no_trailing_slash``.
    """
    ddl = """
        ALTER TABLE urls
        ADD CONSTRAINT no_trailing_slash CHECK (url !~ '/$')
        """
    op.execute(ddl)

def downgrade() -> None:
    """Best-effort reversal of the trailing-slash migration.

    The original implementation was a bare ``pass``, which left the new
    column and CHECK constraint in place after a downgrade. This version:
      1. Drops the ``no_trailing_slash`` CHECK constraint (must happen
         before slashes are restored, or the UPDATE would violate it).
      2. Re-appends ``/`` to every URL flagged by ``trailing_slash``.
      3. Drops the ``trailing_slash`` column.

    The duplicate rows deleted in :func:`upgrade` cannot be restored.
    """
    op.drop_constraint('no_trailing_slash', 'urls', type_='check')
    op.execute(
        """
        UPDATE urls
        SET url = url || '/'
        WHERE trailing_slash
        """
    )
    op.drop_column('urls', 'trailing_slash')
4 changes: 2 additions & 2 deletions src/api/endpoints/annotate/_shared/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from src.api.endpoints.annotate.all.get.queries.location_.core import GetLocationSuggestionsQueryBuilder
from src.api.endpoints.annotate.all.get.queries.name.core import GetNameSuggestionsQueryBuilder
from src.db.dto_converter import DTOConverter
from src.db.dtos.url.mapping import URLMapping
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion

Expand Down Expand Up @@ -44,7 +44,7 @@ async def extract_and_format_get_annotation_result(
await GetNameSuggestionsQueryBuilder(url_id=url.id).run(session)
return GetNextURLForAllAnnotationResponse(
next_annotation=GetNextURLForAllAnnotationInnerResponse(
url_info=URLMapping(
url_info=SimpleURLMapping(
url_id=url.id,
url=url.full_url
),
Expand Down
6 changes: 2 additions & 4 deletions src/api/endpoints/annotate/dtos/shared/base/response.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from typing import Optional

from pydantic import BaseModel, Field

from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo
from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo
from src.db.dtos.url.mapping import URLMapping
from src.db.dtos.url.mapping_.simple import SimpleURLMapping


class AnnotationInnerResponseInfoBase(BaseModel):
url_info: URLMapping = Field(
url_info: SimpleURLMapping = Field(
title="Information about the URL"
)
html_info: ResponseHTMLInfo = Field(
Expand Down
5 changes: 3 additions & 2 deletions src/api/endpoints/collector/manual/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,14 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url)

url = URL(
url=url_and_scheme.url,
url=url_and_scheme.url.rstrip('/'),
scheme=url_and_scheme.scheme,
name=entry.name,
description=entry.description,
collector_metadata=entry.collector_metadata,
status=URLStatus.OK.value,
source=URLSource.MANUAL
source=URLSource.MANUAL,
trailing_slash=url_and_scheme.url.endswith('/'),
)


Expand Down
1 change: 1 addition & 0 deletions src/api/endpoints/submit/url/queries/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse:
scheme=url_and_scheme.scheme,
source=URLSource.MANUAL,
status=URLStatus.OK,
trailing_slash=url_and_scheme.url.endswith('/'),
)
session.add(url_insert)
await session.flush()
Expand Down
5 changes: 3 additions & 2 deletions src/collectors/queries/insert/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ async def run(self, session: AsyncSession) -> int:
"""Insert a new URL into the database."""
url_and_scheme: URLAndScheme = get_url_and_scheme(self.url_info.url)
url_entry = URL(
url=url_and_scheme.url,
url=url_and_scheme.url.rstrip('/'),
scheme=url_and_scheme.scheme,
collector_metadata=self.url_info.collector_metadata,
status=self.url_info.status.value,
source=self.url_info.source
source=self.url_info.source,
trailing_slash=url_and_scheme.url.endswith('/'),
)
if self.url_info.created_at is not None:
url_entry.created_at = self.url_info.created_at
Expand Down
4 changes: 2 additions & 2 deletions src/collectors/queries/insert/urls/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager
from src.util.url import clean_url
from src.db.dtos.url.insert import InsertURLsInfo
from src.db.dtos.url.mapping import URLMapping
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo
from src.db.models.impl.url.core.pydantic.info import URLInfo
from src.db.queries.base.builder import QueryBuilderBase
Expand Down Expand Up @@ -32,7 +32,7 @@ async def run(self, session: AsyncSession) -> InsertURLsInfo:
async with session.begin_nested() as sp:
url_id = await rm.insert_url(url_info)
url_mappings.append(
URLMapping(
SimpleURLMapping(
url_id=url_id,
url=url_info.url
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic
from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping
from src.util.url_mapper import URLMapper
from src.util.url_mapper_.simple import SimpleURLMapper


def convert_ia_url_mapping_to_ia_metadata(
url_mapper: URLMapper,
url_mapper: SimpleURLMapper,
ia_mapping: InternetArchivesURLMapping
) -> URLInternetArchiveMetadataPydantic:
iam = ia_mapping.ia_metadata
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
CheckURLInternetArchivesTaskPrerequisitesQueryBuilder
from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase
from src.db.client.async_ import AsyncDatabaseClient
from src.db.dtos.url.mapping import URLMapping
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
from src.db.enums import TaskType
from src.db.models.impl.flag.checked_for_ia.pydantic import FlagURLCheckedForInternetArchivesPydantic
from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic
from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall
from src.external.internet_archives.client import InternetArchivesClient
from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping
from src.util.progress_bar import get_progress_bar_disabled
from src.util.url_mapper import URLMapper
from src.util.url_mapper_.simple import SimpleURLMapper


class InternetArchivesProbeTaskOperator(
Expand Down Expand Up @@ -51,10 +51,10 @@
DeleteOldUnsuccessfulIACheckedFlagsQueryBuilder()
)

url_mappings: list[URLMapping] = await self._get_url_mappings()
url_mappings: list[SimpleURLMapping] = await self._get_url_mappings()
if len(url_mappings) == 0:
return
mapper = URLMapper(url_mappings)
mapper = SimpleURLMapper(url_mappings)

await self.link_urls_to_task(mapper.get_all_ids())

Expand All @@ -65,7 +65,7 @@
await self._add_errors_to_db(mapper, ia_mappings=subsets.error)
await self._add_ia_metadata_to_db(mapper, ia_mappings=subsets.has_metadata)

async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
async def _add_errors_to_db(self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
url_error_info_list: list[URLTaskErrorSmall] = []
for ia_mapping in ia_mappings:
url_id = mapper.get_id(ia_mapping.url)
Expand All @@ -76,7 +76,7 @@
url_error_info_list.append(url_error_info)
await self.add_task_errors(url_error_info_list)

async def _get_url_mappings(self) -> list[URLMapping]:
async def _get_url_mappings(self) -> list[SimpleURLMapping]:
return await self.adb_client.run_query_builder(
GetURLsForInternetArchivesTaskQueryBuilder()
)
Expand All @@ -93,7 +93,7 @@

async def _add_ia_metadata_to_db(
self,
url_mapper: URLMapper,
url_mapper: SimpleURLMapper,
ia_mappings: list[InternetArchivesURLMapping],
) -> None:
insert_objects: list[URLInternetArchiveMetadataPydantic] = [
Expand All @@ -106,7 +106,7 @@
await self.adb_client.bulk_insert(insert_objects)

async def _add_ia_flags_to_db(
self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:

Check failure on line 109 in src/core/tasks/scheduled/impl/internet_archives/probe/operator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/scheduled/impl/internet_archives/probe/operator.py#L109 <125>

continuation line with same indent as next logical line
Raw output
./src/core/tasks/scheduled/impl/internet_archives/probe/operator.py:109:9: E125 continuation line with same indent as next logical line
flags: list[FlagURLCheckedForInternetArchivesPydantic] = []
for ia_mapping in ia_mappings:
url_id = mapper.get_id(ia_mapping.url)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
from sqlalchemy import select, or_, exists, text, func
from sqlalchemy import select

Check warning on line 1 in src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py:1:1: D100 Missing docstring in public module
from sqlalchemy.ext.asyncio import AsyncSession

from src.core.tasks.scheduled.impl.internet_archives.probe.queries.cte import CheckURLInternetArchivesCTEContainer
from src.db.dtos.url.mapping import URLMapping
from src.db.helpers.query import not_exists_url
from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
from src.db.queries.base.builder import QueryBuilderBase

from src.db.helpers.session import session_helper as sh

class GetURLsForInternetArchivesTaskQueryBuilder(QueryBuilderBase):

async def run(self, session: AsyncSession) -> list[URLMapping]:
async def run(self, session: AsyncSession) -> list[SimpleURLMapping]:

Check warning on line 12 in src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py#L12 <102>

Missing docstring in public method
Raw output
./src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py:12:1: D102 Missing docstring in public method
cte = CheckURLInternetArchivesCTEContainer()
query = (
select(
Expand All @@ -24,7 +21,7 @@

db_mappings = await sh.mappings(session, query=query)
return [
URLMapping(
SimpleURLMapping(
url_id=mapping["url_id"],
url=mapping["url"]
) for mapping in db_mappings
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from pydantic import BaseModel

from src.db.dtos.url.mapping import URLMapping
from src.db.dtos.url.mapping_.simple import SimpleURLMapping


class InternetArchivesSaveTaskEntry(BaseModel):
url: str
url_id: int
is_new: bool

def to_url_mapping(self) -> URLMapping:
return URLMapping(
def to_url_mapping(self) -> SimpleURLMapping:

Check warning on line 11 in src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py#L11 <102>

Missing docstring in public method
Raw output
./src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py:11:1: D102 Missing docstring in public method
return SimpleURLMapping(
url_id=self.url_id,
url=self.url
)
17 changes: 17 additions & 0 deletions src/core/tasks/url/operators/probe/convert.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from src.core.tasks.url.operators.probe.tdo import URLProbeTDO
from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic
from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair


def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMetadataPydantic]:
Expand All @@ -16,3 +17,19 @@
results.append(web_metadata_object)
return results

def convert_tdos_with_functional_equivalents_to_web_metadata_list(

Check warning on line 20 in src/core/tasks/url/operators/probe/convert.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/probe/convert.py#L20 <103>

Missing docstring in public function
Raw output
./src/core/tasks/url/operators/probe/convert.py:20:1: D103 Missing docstring in public function
tdos: list[URLProbeTDO]
) -> list[URLWebMetadataPydantic]:
results: list[URLWebMetadataPydantic] = []
for tdo in tdos:
response: URLProbeRedirectResponsePair = tdo.response.response
dest = response.destination
web_metadata_object = URLWebMetadataPydantic(
url_id=tdo.url_mapping.url_id,
accessed=dest.status_code != 404,
status_code=dest.status_code,
content_type=dest.content_type,
error_message=dest.error
)
results.append(web_metadata_object)
return results
Loading