Skip to content

Commit fbda9c1

Browse files
authored
Merge pull request #497 from Police-Data-Accessibility-Project/mc_489_update_root_url_and_redirect_url_logic
Update root URL and redirect URL logic
2 parents fdf40f9 + 105eefa commit fbda9c1

File tree

97 files changed

+888
-375
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

97 files changed

+888
-375
lines changed
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""Add trailing slash column
2+
3+
Revision ID: 7fc6502f1fa3
4+
Revises: ff4e8b2f6348
5+
Create Date: 2025-10-17 18:26:56.756915
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = '7fc6502f1fa3'
16+
down_revision: Union[str, None] = 'ff4e8b2f6348'
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
    """Move trailing-slash information out of ``urls.url`` into a column.

    Runs five order-dependent steps: purge known duplicate rows, add the
    ``trailing_slash`` flag column, back-fill it from the current URL text,
    strip trailing slashes from ``url``, then enforce the new invariant with
    a check constraint.
    """
    # Order matters: duplicates must go before stripping slashes (stripping
    # could otherwise create new collisions), and the constraint must come last.
    steps = (
        _remove_duplicates,
        _add_trailing_slash_column,
        _migrate_trailing_slash_to_column,
        _remove_trailing_slash_from_url_column,
        _add_check_constraint_forbidding_trailing_slash_in_url,
    )
    for step in steps:
        step()
28+
def _remove_duplicates():
    """Delete URL rows known to duplicate another row once slashes are stripped.

    The IDs were identified ahead of time: each listed row's ``url`` differs
    from another row's only by a trailing slash, so removing the slash (done
    in a later step) would otherwise violate uniqueness.
    """
    # NOTE(review): hard-coded IDs are environment-specific — this migration
    # presumably targets the production dataset; verify before reuse elsewhere.
    op.execute(
        """
        DELETE FROM urls
        WHERE id IN (
            23504,
            29401,
            21032,
            23687,
            15760,
            17574,
            17669,
            21382,
            11697,
            18076,
            27764,
            11395,
            17702,
            26857,
            30843,
            21850,
            29471,
            26789,
            19428,
            18452,
            30547,
            24004,
            27857,
            30260,
            26968,
            27065,
            29073,
            21827,
            25615,
            28644,
            24417,
            29801,
            27625,
            15708,
            23517,
            26415,
            26081,
            7478,
            20368,
            19494,
            26624,
            3817,
            3597,
            3568,
            16113,
            24125,
            30625,
            29965,
            23134,
            19207,
            12158,
            3835,
            24730,
            17113,
            29987,
            21452,
            24605,
            5043,
            17237,
            25522,
            11065,
            12387,
            12210,
            11185,
            11961,
            4935,
            24200,
            29028,
            24371,
            28355,
            17620,
            19546,
            3598
        )
        """
    )
109+
110+
def _add_trailing_slash_column():
    """Add a boolean ``trailing_slash`` column to ``urls``.

    Defaults to ``false`` at the server level so existing rows get a value
    without a table rewrite; a later step back-fills the true values.
    """
    trailing_slash_column = sa.Column(
        'trailing_slash',
        sa.Boolean(),
        nullable=False,
        server_default=sa.text('false'),
    )
    op.add_column('urls', trailing_slash_column)
120+
121+
def _migrate_trailing_slash_to_column():
    """Back-fill ``trailing_slash`` from the current ``url`` text.

    Uses the PostgreSQL regex match operator ``~``: ``url ~ '/$'`` is true
    exactly when the stored URL currently ends with a slash.
    """
    op.execute(
        """
        UPDATE urls
        SET trailing_slash = url ~ '/$'
        """
    )
128+
129+
def _remove_trailing_slash_from_url_column():
    """Strip trailing slashes from ``url`` for rows that have one.

    NOTE(review): ``rtrim(url, '/')`` removes ALL trailing slashes, not just
    one — e.g. ``'a//'`` becomes ``'a'``. Presumably intended, since the
    check constraint added next forbids any trailing slash; confirm that
    multi-slash URLs are acceptable to collapse.
    """
    op.execute(
        """
        UPDATE urls
        SET url = rtrim(url, '/')
        WHERE url like '%/';
        """
    )
137+
138+
def _add_check_constraint_forbidding_trailing_slash_in_url():
    """Enforce the new invariant: ``urls.url`` may never end with a slash.

    ``!~`` is PostgreSQL's negated regex match, so the constraint rejects
    any value matching ``/$``. Must run after the back-fill and strip steps,
    otherwise existing rows would violate it.
    """
    op.execute(
        """
        ALTER TABLE urls
        ADD CONSTRAINT no_trailing_slash CHECK (url !~ '/$')
        """
    )
145+
146+
def downgrade() -> None:
    """Reverse the schema changes made by :func:`upgrade`.

    Drops the ``no_trailing_slash`` check constraint, re-appends a slash to
    every URL whose ``trailing_slash`` flag is set, and removes the column.

    The original ``pass`` body made this migration silently irreversible;
    the schema portion is mechanically reversible, so reverse it. The rows
    deleted by ``_remove_duplicates`` cannot be restored — that data loss
    is permanent and is intentionally not addressed here.
    """
    # Constraint must go first, since restoring slashes would violate it.
    op.execute(
        """
        ALTER TABLE urls
        DROP CONSTRAINT no_trailing_slash
        """
    )
    # Restore the trailing slash recorded during upgrade's back-fill step.
    op.execute(
        """
        UPDATE urls
        SET url = url || '/'
        WHERE trailing_slash
        """
    )
    op.drop_column('urls', 'trailing_slash')

src/api/endpoints/annotate/_shared/extract.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from src.api.endpoints.annotate.all.get.queries.location_.core import GetLocationSuggestionsQueryBuilder
1616
from src.api.endpoints.annotate.all.get.queries.name.core import GetNameSuggestionsQueryBuilder
1717
from src.db.dto_converter import DTOConverter
18-
from src.db.dtos.url.mapping import URLMapping
18+
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
1919
from src.db.models.impl.url.core.sqlalchemy import URL
2020
from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
2121

@@ -44,7 +44,7 @@ async def extract_and_format_get_annotation_result(
4444
await GetNameSuggestionsQueryBuilder(url_id=url.id).run(session)
4545
return GetNextURLForAllAnnotationResponse(
4646
next_annotation=GetNextURLForAllAnnotationInnerResponse(
47-
url_info=URLMapping(
47+
url_info=SimpleURLMapping(
4848
url_id=url.id,
4949
url=url.full_url
5050
),

src/api/endpoints/annotate/dtos/shared/base/response.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
1-
from typing import Optional
2-
31
from pydantic import BaseModel, Field
42

53
from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo
64
from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo
7-
from src.db.dtos.url.mapping import URLMapping
5+
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
86

97

108
class AnnotationInnerResponseInfoBase(BaseModel):
11-
url_info: URLMapping = Field(
9+
url_info: SimpleURLMapping = Field(
1210
title="Information about the URL"
1311
)
1412
html_info: ResponseHTMLInfo = Field(

src/api/endpoints/collector/manual/query.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
4848
url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url)
4949

5050
url = URL(
51-
url=url_and_scheme.url,
51+
url=url_and_scheme.url.rstrip('/'),
5252
scheme=url_and_scheme.scheme,
5353
name=entry.name,
5454
description=entry.description,
5555
collector_metadata=entry.collector_metadata,
5656
status=URLStatus.OK.value,
57-
source=URLSource.MANUAL
57+
source=URLSource.MANUAL,
58+
trailing_slash=url_and_scheme.url.endswith('/'),
5859
)
5960

6061

src/api/endpoints/submit/url/queries/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse:
6363
scheme=url_and_scheme.scheme,
6464
source=URLSource.MANUAL,
6565
status=URLStatus.OK,
66+
trailing_slash=url_and_scheme.url.endswith('/'),
6667
)
6768
session.add(url_insert)
6869
await session.flush()

src/collectors/queries/insert/url.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@ async def run(self, session: AsyncSession) -> int:
1919
"""Insert a new URL into the database."""
2020
url_and_scheme: URLAndScheme = get_url_and_scheme(self.url_info.url)
2121
url_entry = URL(
22-
url=url_and_scheme.url,
22+
url=url_and_scheme.url.rstrip('/'),
2323
scheme=url_and_scheme.scheme,
2424
collector_metadata=self.url_info.collector_metadata,
2525
status=self.url_info.status.value,
26-
source=self.url_info.source
26+
source=self.url_info.source,
27+
trailing_slash=url_and_scheme.url.endswith('/'),
2728
)
2829
if self.url_info.created_at is not None:
2930
url_entry.created_at = self.url_info.created_at

src/collectors/queries/insert/urls/query.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager
55
from src.util.url import clean_url
66
from src.db.dtos.url.insert import InsertURLsInfo
7-
from src.db.dtos.url.mapping import URLMapping
7+
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
88
from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo
99
from src.db.models.impl.url.core.pydantic.info import URLInfo
1010
from src.db.queries.base.builder import QueryBuilderBase
@@ -32,7 +32,7 @@ async def run(self, session: AsyncSession) -> InsertURLsInfo:
3232
async with session.begin_nested() as sp:
3333
url_id = await rm.insert_url(url_info)
3434
url_mappings.append(
35-
URLMapping(
35+
SimpleURLMapping(
3636
url_id=url_id,
3737
url=url_info.url
3838
)

src/core/tasks/scheduled/impl/internet_archives/probe/convert.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic
22
from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping
3-
from src.util.url_mapper import URLMapper
3+
from src.util.url_mapper_.simple import SimpleURLMapper
44

55

66
def convert_ia_url_mapping_to_ia_metadata(
7-
url_mapper: URLMapper,
7+
url_mapper: SimpleURLMapper,
88
ia_mapping: InternetArchivesURLMapping
99
) -> URLInternetArchiveMetadataPydantic:
1010
iam = ia_mapping.ia_metadata

src/core/tasks/scheduled/impl/internet_archives/probe/operator.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@
1212
CheckURLInternetArchivesTaskPrerequisitesQueryBuilder
1313
from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase
1414
from src.db.client.async_ import AsyncDatabaseClient
15-
from src.db.dtos.url.mapping import URLMapping
15+
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
1616
from src.db.enums import TaskType
1717
from src.db.models.impl.flag.checked_for_ia.pydantic import FlagURLCheckedForInternetArchivesPydantic
1818
from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic
1919
from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall
2020
from src.external.internet_archives.client import InternetArchivesClient
2121
from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping
2222
from src.util.progress_bar import get_progress_bar_disabled
23-
from src.util.url_mapper import URLMapper
23+
from src.util.url_mapper_.simple import SimpleURLMapper
2424

2525

2626
class InternetArchivesProbeTaskOperator(
@@ -51,10 +51,10 @@ async def inner_task_logic(self) -> None:
5151
DeleteOldUnsuccessfulIACheckedFlagsQueryBuilder()
5252
)
5353

54-
url_mappings: list[URLMapping] = await self._get_url_mappings()
54+
url_mappings: list[SimpleURLMapping] = await self._get_url_mappings()
5555
if len(url_mappings) == 0:
5656
return
57-
mapper = URLMapper(url_mappings)
57+
mapper = SimpleURLMapper(url_mappings)
5858

5959
await self.link_urls_to_task(mapper.get_all_ids())
6060

@@ -65,7 +65,7 @@ async def inner_task_logic(self) -> None:
6565
await self._add_errors_to_db(mapper, ia_mappings=subsets.error)
6666
await self._add_ia_metadata_to_db(mapper, ia_mappings=subsets.has_metadata)
6767

68-
async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
68+
async def _add_errors_to_db(self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
6969
url_error_info_list: list[URLTaskErrorSmall] = []
7070
for ia_mapping in ia_mappings:
7171
url_id = mapper.get_id(ia_mapping.url)
@@ -76,7 +76,7 @@ async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetA
7676
url_error_info_list.append(url_error_info)
7777
await self.add_task_errors(url_error_info_list)
7878

79-
async def _get_url_mappings(self) -> list[URLMapping]:
79+
async def _get_url_mappings(self) -> list[SimpleURLMapping]:
8080
return await self.adb_client.run_query_builder(
8181
GetURLsForInternetArchivesTaskQueryBuilder()
8282
)
@@ -93,7 +93,7 @@ async def _search_for_internet_archive_links(self, urls: list[str]) -> list[Inte
9393

9494
async def _add_ia_metadata_to_db(
9595
self,
96-
url_mapper: URLMapper,
96+
url_mapper: SimpleURLMapper,
9797
ia_mappings: list[InternetArchivesURLMapping],
9898
) -> None:
9999
insert_objects: list[URLInternetArchiveMetadataPydantic] = [
@@ -106,7 +106,7 @@ async def _add_ia_metadata_to_db(
106106
await self.adb_client.bulk_insert(insert_objects)
107107

108108
async def _add_ia_flags_to_db(
109-
self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
109+
self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None:
110110
flags: list[FlagURLCheckedForInternetArchivesPydantic] = []
111111
for ia_mapping in ia_mappings:
112112
url_id = mapper.get_id(ia_mapping.url)

src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,15 @@
1-
from sqlalchemy import select, or_, exists, text, func
1+
from sqlalchemy import select
22
from sqlalchemy.ext.asyncio import AsyncSession
33

44
from src.core.tasks.scheduled.impl.internet_archives.probe.queries.cte import CheckURLInternetArchivesCTEContainer
5-
from src.db.dtos.url.mapping import URLMapping
6-
from src.db.helpers.query import not_exists_url
7-
from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives
8-
from src.db.models.impl.url.core.sqlalchemy import URL
5+
from src.db.dtos.url.mapping_.simple import SimpleURLMapping
96
from src.db.queries.base.builder import QueryBuilderBase
107

118
from src.db.helpers.session import session_helper as sh
129

1310
class GetURLsForInternetArchivesTaskQueryBuilder(QueryBuilderBase):
1411

15-
async def run(self, session: AsyncSession) -> list[URLMapping]:
12+
async def run(self, session: AsyncSession) -> list[SimpleURLMapping]:
1613
cte = CheckURLInternetArchivesCTEContainer()
1714
query = (
1815
select(
@@ -24,7 +21,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]:
2421

2522
db_mappings = await sh.mappings(session, query=query)
2623
return [
27-
URLMapping(
24+
SimpleURLMapping(
2825
url_id=mapping["url_id"],
2926
url=mapping["url"]
3027
) for mapping in db_mappings

0 commit comments

Comments
 (0)