Skip to content

Commit 2626db8

Browse files
authored
Merge pull request #490 from Police-Data-Accessibility-Project/mc_456_add_schema_column
mc_456_add_schema_column
2 parents cc581bf + 16f2b66 commit 2626db8

File tree

45 files changed

+477
-75
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+477
-75
lines changed
Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
"""Add url scheme column
2+
3+
Revision ID: a8f36f185694
4+
Revises: 7aace6587d1a
5+
Create Date: 2025-10-14 11:05:28.686940
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
14+
# revision identifiers, used by Alembic.
# `revision` is this migration's own id and `down_revision` is the revision it
# revises; both match the "Revision ID" / "Revises" lines in the module
# docstring.
revision: str = 'a8f36f185694'
down_revision: Union[str, None] = '7aace6587d1a'
# No branch labels and no dependencies on other revisions are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
22+
23+
def upgrade() -> None:
    """Apply this revision by running its steps in a fixed order.

    The steps are, in sequence: recreate foreign keys with cascading deletes,
    delete the hard-coded list of URL rows, add the ``scheme`` column, fill it
    from the existing URLs, strip the scheme prefix from ``urls.url``, and
    finally add the check constraint that rejects scheme-prefixed URLs.
    """
    # Order matters: the column must exist before it is populated, and the
    # check constraint is added only after the prefixes have been removed.
    migration_steps = (
        _update_foreign_key_constraints,
        _delete_duplicate_urls,
        _add_column,
        _populate_column,
        _remove_schemes_from_url_column,
        _add_check_constraint_to_url_column,
    )
    for run_step in migration_steps:
        run_step()
31+
32+
def _update_foreign_key_constraints():
    """Recreate a fixed set of foreign keys with ``ON DELETE CASCADE``.

    For every entry in the table below, the named constraint is dropped if it
    exists and then created again on the same column, referencing ``id`` of
    the referent table, with ``ondelete="CASCADE"``. Entries are processed in
    the order listed.

    All identifiers are hard-coded constants of this migration, so
    interpolating them into the SQL text does not involve untrusted input.
    """
    # (constraint name, source table, referent table, local column)
    cascade_foreign_keys = (
        # URL Optional Data Source Metadata
        (
            "url_optional_data_source_metadata_url_id_fkey",
            "url_optional_data_source_metadata",
            "urls",
            "url_id",
        ),
        # Link URLs Redirect URL (Source URL ID)
        (
            "link_urls_redirect_url_source_url_id_fkey",
            "link_urls_redirect_url",
            "urls",
            "source_url_id",
        ),
        # Link URLs Redirect URL (Destination URL ID)
        (
            "link_urls_redirect_url_destination_url_id_fkey",
            "link_urls_redirect_url",
            "urls",
            "destination_url_id",
        ),
        # Reviewing User URL -- the constraint name uses "approving_user_url"
        # while the table is "reviewing_user_url".
        (
            "approving_user_url_url_id_fkey",
            "reviewing_user_url",
            "urls",
            "url_id",
        ),
        # user_url_agency_suggestions
        (
            "user_url_agency_suggestions_url_id_fkey",
            "user_url_agency_suggestions",
            "urls",
            "url_id",
        ),
        # Duplicates
        (
            "duplicates_original_url_id_fkey",
            "duplicates",
            "urls",
            "original_url_id",
        ),
        # link_user_name_suggestions -- the only entry that references
        # url_name_suggestions rather than urls.
        (
            "link_user_name_suggestions_suggestion_id_fkey",
            "link_user_name_suggestions",
            "url_name_suggestions",
            "suggestion_id",
        ),
    )

    for constraint_name, source_table, referent_table, local_column in cascade_foreign_keys:
        # Raw SQL is used for the drop so that a missing constraint is
        # tolerated (IF EXISTS).
        op.execute(f"""
            ALTER TABLE {source_table}
            DROP CONSTRAINT IF EXISTS {constraint_name};
        """)

        op.create_foreign_key(
            constraint_name,
            source_table,
            referent_table,
            [local_column],
            ["id"],
            ondelete="CASCADE",
        )
138+
139+
def _delete_duplicate_urls():
    """Delete a fixed, hard-coded list of rows from ``urls`` by ``id``.

    The ids are literal primary-key values embedded in the SQL; nothing is
    computed from the table contents. Some ids appear more than once in the
    list (for example 15902, 3472, 17387 and 24256), which has no effect on
    an ``IN`` clause.

    NOTE(review): judging by the function name these are presumably rows that
    duplicate another URL -- confirm. Because the ids are hard-coded, running
    this against a database with different contents deletes whatever rows
    happen to carry these ids.
    """
    op.execute("""
    DELETE FROM urls
    WHERE id IN (
        4217,
        15902,
        3472,
        17387,
        24256,
        17617,
        17414,
        15259,
        17952,
        17651,
        18010,
        18496,
        18563,
        18587,
        18592,
        18092,
        18046,
        20467,
        24346,
        28241,
        25075,
        22508,
        22391,
        24256,
        22486,
        28109,
        26336,
        30701,
        17387,
        19348,
        18080,
        27863,
        18855,
        28830,
        18824,
        17414,
        15259,
        20676,
        27716,
        21475,
        23442,
        28553,
        8176,
        22270,
        19161,
        21250,
        15659,
        18821,
        27067,
        27567,
        27318,
        20640,
        21840,
        3472,
        28982,
        28910,
        19527,
        28776,
        15902,
        18468,
        29557,
        22977,
        27694,
        22678,
        19094,
        27203,
        26436,
        18868,
        22813,
        25007,
        7548,
        30088,
        20924,
        22575,
        28149,
        30705,
        28179,
        30660,
        2988,
        17182,
        18893,
        30317,
        19215,
        17651,
        21117,
        17617,
        23742,
        19620,
        16865,
        19320,
        20516,
        25248,
        26122,
        30158,
        30522,
        23307,
        18621,
        27855,
        26922,
        21397,
        18010,
        18592,
        2527,
        26279,
        18563,
        18242,
        21550,
        28288,
        22361,
        24660,
        2989,
        28765,
        10627,
        19625,
        12191,
        27523,
        18373,
        28565,
        25437,
        26077,
        28554,
        23229,
        25631,
        25528,
        18092,
        10765,
        26126,
        51499,
        27375,
        24177,
        22734,
        22459,
        22439,
        18532,
        29064,
        20504,
        21643,
        21551,
        27698,
        19234,
        24308,
        22559,
        26227,
        19080,
        16010,
        3515,
        22658,
        20673,
        21854,
        19361,
        21768,
        26903,
        21253,
        23085,
        3761,
        3565
    )
    """)
301+
302+
def _populate_column():
    """Fill ``urls.scheme`` from the scheme prefix of each stored URL.

    Only rows whose ``url`` begins with a ``<scheme>://`` prefix (matched
    case-insensitively) are updated; the value written is the lower-cased text
    before the first ``://``. Rows without such a prefix are left untouched.
    """
    populate_scheme_sql = """
        UPDATE urls
        SET scheme = lower(split_part(url, '://', 1))
        WHERE url ~* '^[a-z][a-z0-9+.-]*://';
        """
    op.execute(populate_scheme_sql)
310+
311+
312+
def _remove_schemes_from_url_column():
    """Strip the leading ``<scheme>://`` prefix from ``urls.url``.

    Applies to rows whose ``url`` starts with a scheme prefix (matched
    case-insensitively); the prefix is replaced with the empty string and the
    remainder of the URL is kept as is.
    """
    strip_scheme_sql = """
        UPDATE urls
        SET url = regexp_replace(url, '^[a-z][a-z0-9+.-]*://', '', 'i')
        WHERE url ~* '^[a-z][a-z0-9+.-]*://';
        """
    op.execute(strip_scheme_sql)
320+
321+
322+
def _add_check_constraint_to_url_column():
    """Add a CHECK constraint forbidding scheme-prefixed values in ``urls.url``.

    The constraint rejects any ``url`` that matches a leading ``<scheme>://``
    prefix, case-insensitively. Its name ends in "schema" (not "scheme"); the
    name is kept exactly as written because it is the identifier stored in
    the database.
    """
    add_constraint_sql = """
        ALTER TABLE urls
        ADD CONSTRAINT check_url_does_not_have_schema CHECK (url !~* '^[a-z][a-z0-9+.-]*://');
        """
    op.execute(add_constraint_sql)
329+
330+
331+
def _add_column():
    """Add the nullable string column ``scheme`` to the ``urls`` table."""
    scheme_column = sa.Column("scheme", sa.String(), nullable=True)
    op.add_column("urls", scheme_column)
336+
337+
def downgrade() -> None:
    """Do nothing: this revision defines no downgrade steps.

    NOTE(review): because nothing is reverted, the ``scheme`` column and the
    ``check_url_does_not_have_schema`` constraint remain in place after a
    downgrade, so running ``upgrade`` again would attempt to add them a second
    time -- confirm this is acceptable for the project's workflow.
    """
    return None

src/api/endpoints/annotate/_shared/extract.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ async def extract_and_format_get_annotation_result(
4646
next_annotation=GetNextURLForAllAnnotationInnerResponse(
4747
url_info=URLMapping(
4848
url_id=url.id,
49-
url=url.url
49+
url=url.full_url
5050
),
5151
html_info=html_response_info,
5252
url_type_suggestions=url_type_suggestions,

src/api/endpoints/collector/manual/query.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
1313
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
1414
from src.db.queries.base.builder import QueryBuilderBase
15+
from src.util.models.url_and_scheme import URLAndScheme
16+
from src.util.url import get_url_and_scheme
1517

1618

1719
class UploadManualBatchQueryBuilder(QueryBuilderBase):
@@ -43,8 +45,11 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO:
4345
duplicate_urls: list[str] = []
4446

4547
for entry in self.dto.entries:
48+
url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url)
49+
4650
url = URL(
47-
url=entry.url,
51+
url=url_and_scheme.url,
52+
scheme=url_and_scheme.scheme,
4853
name=entry.name,
4954
description=entry.description,
5055
collector_metadata=entry.collector_metadata,

0 commit comments

Comments
 (0)