From 4340d4ac1286fa3e48c4758ca9e793b2f2ca6d5d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 18 Oct 2025 15:26:07 -0400 Subject: [PATCH] Update URL Status View Enum --- README.md | 1 + ...d35_update_url_status_materialized_view.py | 104 ++++++++++++++++++ .../query/subqueries/oldest_pending_url.py | 3 +- src/db/models/views/url_status/enums.py | 3 +- 4 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py diff --git a/README.md b/README.md index ae2263dc..4fa95b40 100644 --- a/README.md +++ b/README.md @@ -156,3 +156,4 @@ if it detects any missing docstrings or type hints in files that you have modifi These will *not* block any Pull request, but exist primarily as advisory comments to encourage good coding standards. Note that `python_checks.yml` will only function on pull requests made from within the repo, not from a forked repo. + diff --git a/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py b/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py new file mode 100644 index 00000000..2a7db8e5 --- /dev/null +++ b/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py @@ -0,0 +1,104 @@ +"""Update URL Status Materialized View + +Revision ID: 9d57b3b79d35 +Revises: 7fc6502f1fa3 +Create Date: 2025-10-18 15:17:23.653448 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '9d57b3b79d35' +down_revision: Union[str, None] = '7fc6502f1fa3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("DROP MATERIALIZED VIEW IF EXISTS url_status_mat_view") + op.execute(""" + CREATE MATERIALIZED VIEW url_status_mat_view as + with + urls_with_relevant_errors as ( + select + ute.url_id + from + url_task_error ute + where + ute.task_type in ( + 'Screenshot', + 'HTML', + 'URL Probe' + ) + ) + , status_text as ( + select + u.id as url_id, + case + when ( + -- Validated as not relevant, individual record, or not found + fuv.type in ('not relevant', 'individual record', 'not found') + ) Then 'Accepted' + when ( + (fuv.type = 'data source' and uds.url_id is null) + OR + (fuv.type = 'meta url' and udmu.url_id is null) + ) Then 'Awaiting Submission' + when ( + (fuv.type = 'data source' and uds.url_id is not null) + OR + (fuv.type = 'meta url' and udmu.url_id is not null) + ) Then 'Submitted' + when ( + -- Has compressed HTML + uch.url_id is not null + AND + -- Has web metadata + uwm.url_id is not null + AND + -- Has screenshot + us.url_id is not null + ) THEN 'Community Labeling' + when uwre.url_id is not null then 'Error' + ELSE 'Intake' + END as status + + from + urls u + left join urls_with_relevant_errors uwre + on u.id = uwre.url_id + left join url_screenshot us + on u.id = us.url_id + left join url_compressed_html uch + on u.id = uch.url_id + left join url_web_metadata uwm + on u.id = uwm.url_id + left join flag_url_validated fuv + on u.id = fuv.url_id + left join url_ds_meta_url udmu + on u.id = udmu.url_id + left join url_data_source uds + on u.id = uds.url_id + ) + select + url_id, + status, + CASE status + WHEN 'Intake' THEN 100 + WHEN 'Error' THEN 110 + WHEN 'Community Labeling' THEN 200 + WHEN 'Accepted' THEN 300 + WHEN 'Awaiting Submission' THEN 380 + WHEN 'Submitted' THEN 390 + ELSE -1 + END as code + from status_text + """) + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py index 2a951b4a..e086b752 100644 --- a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py @@ -27,8 +27,9 @@ async def run( ).where( URLStatusMatView.status.not_in( [ - URLStatusViewEnum.SUBMITTED_PIPELINE_COMPLETE.value, + URLStatusViewEnum.SUBMITTED.value, URLStatusViewEnum.ACCEPTED.value, + URLStatusViewEnum.AWAITING_SUBMISSION.value, ] ) ).order_by( diff --git a/src/db/models/views/url_status/enums.py b/src/db/models/views/url_status/enums.py index 82995812..a467a33d 100644 --- a/src/db/models/views/url_status/enums.py +++ b/src/db/models/views/url_status/enums.py @@ -4,6 +4,7 @@ class URLStatusViewEnum(Enum): INTAKE = "Intake" ACCEPTED = "Accepted" - SUBMITTED_PIPELINE_COMPLETE = "Submitted/Pipeline Complete" + AWAITING_SUBMISSION = "Awaiting Submission" + SUBMITTED = "Submitted" ERROR = "Error" COMMUNITY_LABELING = "Community Labeling" \ No newline at end of file