From 1e0ac578db76260e9a26e5649ac9178f7550f545 Mon Sep 17 00:00:00 2001
From: Sam-24-dev
Date: Tue, 24 Mar 2026 02:16:58 -0500
Subject: [PATCH 1/2] fix(ci): prefer fresh repo reddit baseline

Snapshot the repo's Reddit CSVs and bridge JSON files into repo_baseline/
before the previous aggregate history is downloaded. When the Reddit job
reports status 'failed', scripts/restore_reddit_baseline.py restores from
whichever candidate root (repo_baseline or prev_artifacts) reports the
newest latest_snapshot_date in its bridge files.

---
 .github/workflows/etl_semanal.yml     |  40 ++++--
 scripts/restore_reddit_baseline.py    | 204 ++++++++++++++++++++++++++
 tests/test_restore_reddit_baseline.py |  96 +++++++++++++
 tests/test_workflow_etl_contract.py   |   3 +
 4 files changed, 334 insertions(+), 9 deletions(-)
 create mode 100644 scripts/restore_reddit_baseline.py
 create mode 100644 tests/test_restore_reddit_baseline.py

diff --git a/.github/workflows/etl_semanal.yml b/.github/workflows/etl_semanal.yml
index 0e0ec5d..5f5cd6c 100644
--- a/.github/workflows/etl_semanal.yml
+++ b/.github/workflows/etl_semanal.yml
@@ -266,6 +266,22 @@ jobs:
           name: reddit-data
           path: artifacts/reddit
 
+      - name: Snapshot repo Reddit baseline
+        shell: bash
+        run: |
+          mkdir -p repo_baseline
+          for file in \
+            datos/reddit_sentimiento_frameworks.csv \
+            datos/reddit_temas_emergentes.csv \
+            datos/interseccion_github_reddit.csv \
+            frontend/assets/data/reddit_temas_history.json \
+            frontend/assets/data/reddit_interseccion_history.json; do
+            if [ -f "$file" ]; then
+              mkdir -p "repo_baseline/$(dirname "$file")"
+              cp "$file" "repo_baseline/$file"
+            fi
+          done
+
       - name: Download latest valid aggregate history (main)
         continue-on-error: true
         env:
@@ -344,6 +360,16 @@ jobs:
             exit 1
           fi
 
+      - name: Restore Reddit source baseline on fallback
+        if: ${{ always() && needs.job_reddit.outputs.status == 'failed' }}
+        shell: bash
+        run: |
+          python scripts/restore_reddit_baseline.py \
+            --project-root . \
+            --candidate-root repo_baseline \
+            --candidate-root prev_artifacts \
+            --mode source
+
       - name: Run Trend Score
         run: python backend/trend_score.py
 
@@ -354,15 +380,11 @@ jobs:
         if: ${{ always() && needs.job_reddit.outputs.status == 'failed' }}
         shell: bash
         run: |
-          if [ -d "prev_artifacts/frontend/assets/data" ]; then
-            for bridge_file in \
-              reddit_temas_history.json \
-              reddit_interseccion_history.json; do
-              if [ -f "prev_artifacts/frontend/assets/data/${bridge_file}" ]; then
-                cp "prev_artifacts/frontend/assets/data/${bridge_file}" "frontend/assets/data/${bridge_file}"
-              fi
-            done
-          fi
+          python scripts/restore_reddit_baseline.py \
+            --project-root . \
+            --candidate-root repo_baseline \
+            --candidate-root prev_artifacts \
+            --mode bridges
 
       - name: Validate CSV contract headers
         run: python backend/validate_csv_contract.py
diff --git a/scripts/restore_reddit_baseline.py b/scripts/restore_reddit_baseline.py
new file mode 100644
index 0000000..c14aeae
--- /dev/null
+++ b/scripts/restore_reddit_baseline.py
@@ -0,0 +1,204 @@
+"""Restore the freshest valid Reddit baseline into the aggregate workspace."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+from dataclasses import dataclass
+from datetime import date
+from pathlib import Path
+
+
+CSV_SPECS = {
+    "reddit_sentimiento_frameworks.csv": (
+        "reddit_sentimiento",
+        "reddit_sentimiento_frameworks.csv",
+    ),
+    "reddit_temas_emergentes.csv": (
+        "reddit_temas",
+        "reddit_temas_emergentes.csv",
+    ),
+    "interseccion_github_reddit.csv": (
+        "interseccion",
+        "interseccion_github_reddit.csv",
+    ),
+}
+
+BRIDGE_FILES = (
+    "reddit_temas_history.json",
+    "reddit_interseccion_history.json",
+)
+
+
+@dataclass(frozen=True)
+class Candidate:
+    """A baseline root and the snapshot date its bridge files agree on."""
+    root: Path
+    latest_snapshot_date: date
+
+
+def _load_json(path: Path) -> dict:
+    """Parse the UTF-8 JSON file at ``path``."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _parse_snapshot_date(value: object) -> date | None:
+    """Parse an ISO date string; return ``None`` for anything else."""
+    if not isinstance(value, str) or not value:
+        return None
+    try:
+        return date.fromisoformat(value)
+    except ValueError:
+        return None
+
+
+def _discover_candidate(root: Path) -> Candidate | None:
+    """Validate ``root`` as a baseline; return ``None`` if it is unusable."""
+    topics_bridge = root / "frontend" / "assets" / "data" / "reddit_temas_history.json"
+    intersection_bridge = (
+        root / "frontend" / "assets" / "data" / "reddit_interseccion_history.json"
+    )
+
+    if not topics_bridge.exists() or not intersection_bridge.exists():
+        return None
+
+    for csv_name in CSV_SPECS:
+        if not (root / "datos" / csv_name).exists():
+            return None
+
+    topics_date = _parse_snapshot_date(
+        _load_json(topics_bridge).get("latest_snapshot_date")
+    )
+    intersection_date = _parse_snapshot_date(
+        _load_json(intersection_bridge).get("latest_snapshot_date")
+    )
+    if topics_date is None or intersection_date is None:
+        return None
+    if topics_date != intersection_date:
+        return None
+
+    return Candidate(root=root, latest_snapshot_date=topics_date)
+
+
+def _select_best_candidate(candidate_roots: list[Path]) -> Candidate:
+    """Return the newest valid candidate; raise ValueError if none is valid."""
+    candidates: list[Candidate] = []
+    for root in candidate_roots:
+        candidate = _discover_candidate(root)
+        if candidate is not None:
+            candidates.append(candidate)
+
+    if not candidates:
+        raise ValueError("No valid Reddit baseline candidate was found.")
+
+    return max(candidates, key=lambda candidate: candidate.latest_snapshot_date)
+
+
+def _copy_file(source: Path, target: Path) -> None:
+    """Copy ``source`` to ``target``, creating parent directories as needed."""
+    target.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(source, target)
+
+
+def _replace_dir(source: Path, target: Path) -> None:
+    """Delete ``target``, then copy ``source`` over it when ``source`` exists."""
+    if target.exists():
+        shutil.rmtree(target)
+    if source.exists():
+        shutil.copytree(source, target)
+
+
+def restore_reddit_source_baseline(
+    project_root: Path,
+    candidate_roots: list[Path],
+) -> dict[str, object]:
+    """Restore Reddit CSVs and history from the newest valid candidate root."""
+    candidate = _select_best_candidate(candidate_roots)
+
+    latest_snapshot = candidate.latest_snapshot_date
+    year = latest_snapshot.strftime("%Y")
+    month = latest_snapshot.strftime("%m")
+    day = latest_snapshot.strftime("%d")
+
+    for csv_name, (history_dataset, history_filename) in CSV_SPECS.items():
+        source_csv = candidate.root / "datos" / csv_name
+        target_csv = project_root / "datos" / csv_name
+        target_latest = project_root / "datos" / "latest" / csv_name
+
+        _copy_file(source_csv, target_csv)
+        _copy_file(source_csv, target_latest)
+
+        target_history_root = project_root / "datos" / "history" / history_dataset
+        source_history_root = candidate.root / "datos" / "history" / history_dataset
+        if source_history_root.exists():
+            _replace_dir(source_history_root, target_history_root)
+        else:
+            if target_history_root.exists():
+                shutil.rmtree(target_history_root)
+            target_history_file = (
+                target_history_root
+                / f"year={year}"
+                / f"month={month}"
+                / f"day={day}"
+                / history_filename
+            )
+            _copy_file(source_csv, target_history_file)
+
+    return {
+        "mode": "source",
+        "selected_root": str(candidate.root),
+        "latest_snapshot_date": latest_snapshot.isoformat(),
+    }
+
+
+def restore_reddit_bridges(
+    project_root: Path,
+    candidate_roots: list[Path],
+) -> dict[str, object]:
+    """Restore the Reddit bridge JSON files from the newest valid candidate root."""
+    candidate = _select_best_candidate(candidate_roots)
+
+    for bridge_file in BRIDGE_FILES:
+        source_bridge = candidate.root / "frontend" / "assets" / "data" / bridge_file
+        target_bridge = project_root / "frontend" / "assets" / "data" / bridge_file
+        _copy_file(source_bridge, target_bridge)
+
+    return {
+        "mode": "bridges",
+        "selected_root": str(candidate.root),
+        "latest_snapshot_date": candidate.latest_snapshot_date.isoformat(),
+    }
+
+
+def main() -> int:
+    """Parse CLI arguments, run the selected restore mode and print a JSON summary."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--project-root", required=True)
+    parser.add_argument(
+        "--candidate-root",
+        action="append",
+        required=True,
+        help="Candidate baseline root (can be passed multiple times).",
+    )
+    parser.add_argument(
+        "--mode",
+        choices=("source", "bridges"),
+        required=True,
+    )
+    args = parser.parse_args()
+
+    project_root = Path(args.project_root).resolve()
+    candidate_roots = [Path(value).resolve() for value in args.candidate_root]
+
+    if args.mode == "source":
+        summary = restore_reddit_source_baseline(project_root, candidate_roots)
+    else:
+        summary = restore_reddit_bridges(project_root, candidate_roots)
+
+    print(json.dumps(summary, ensure_ascii=False))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_restore_reddit_baseline.py b/tests/test_restore_reddit_baseline.py
new file mode 100644
index 0000000..3afa166
--- /dev/null
+++ b/tests/test_restore_reddit_baseline.py
@@ -0,0 +1,96 @@
+import json
+from pathlib import Path
+
+from scripts.restore_reddit_baseline import (
+    restore_reddit_bridges,
+    restore_reddit_source_baseline,
+)
+
+
+def _write(path: Path, content: str) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(content, encoding="utf-8")
+
+
+def _create_candidate(root: Path, latest_snapshot_date: str) -> None:
+    for csv_name in (
+        "reddit_sentimiento_frameworks.csv",
+        "reddit_temas_emergentes.csv",
+        "interseccion_github_reddit.csv",
+    ):
+        _write(root / "datos" / csv_name, "col\n1\n")
+
+    _write(
+        root / "frontend" / "assets" / "data" / "reddit_temas_history.json",
+        json.dumps(
+            {
+                "latest_snapshot_date": latest_snapshot_date,
+                "previous_snapshot_date": "2026-03-22",
+            }
+        ),
+    )
+    _write(
+        root / "frontend" / "assets" / "data" / "reddit_interseccion_history.json",
+        json.dumps(
+            {
+                "latest_snapshot_date": latest_snapshot_date,
+                "previous_snapshot_date": "2026-03-22",
+            }
+        ),
+    )
+
+
+def test_restore_reddit_source_baseline_prefers_fresher_repo_candidate(tmp_path):
+    project_root = tmp_path / "project"
+    repo_baseline = tmp_path / "repo_baseline"
+    prev_artifacts = tmp_path / "prev_artifacts"
+
+    _create_candidate(repo_baseline, "2026-03-24")
+    _create_candidate(prev_artifacts, "2026-03-16")
+
+    summary = restore_reddit_source_baseline(
+        project_root,
+        [repo_baseline, prev_artifacts],
+    )
+
+    assert summary["latest_snapshot_date"] == "2026-03-24"
+    assert "repo_baseline" in summary["selected_root"]
+    assert (
+        project_root / "datos" / "latest" / "reddit_temas_emergentes.csv"
+    ).exists()
+    assert (
+        project_root
+        / "datos"
+        / "history"
+        / "reddit_temas"
+        / "year=2026"
+        / "month=03"
+        / "day=24"
+        / "reddit_temas_emergentes.csv"
+    ).exists()
+
+
+def test_restore_reddit_bridges_copies_selected_candidate(tmp_path):
+    project_root = tmp_path / "project"
+    repo_baseline = tmp_path / "repo_baseline"
+    prev_artifacts = tmp_path / "prev_artifacts"
+
+    _create_candidate(repo_baseline, "2026-03-24")
+    _create_candidate(prev_artifacts, "2026-03-16")
+
+    summary = restore_reddit_bridges(
+        project_root,
+        [prev_artifacts, repo_baseline],
+    )
+
+    restored = json.loads(
+        (
+            project_root
+            / "frontend"
+            / "assets"
+            / "data"
+            / "reddit_temas_history.json"
+        ).read_text(encoding="utf-8")
+    )
+    assert summary["latest_snapshot_date"] == "2026-03-24"
+    assert restored["latest_snapshot_date"] == "2026-03-24"
diff --git a/tests/test_workflow_etl_contract.py b/tests/test_workflow_etl_contract.py
index b08dd94..d34f6e9 100644
--- a/tests/test_workflow_etl_contract.py
+++ b/tests/test_workflow_etl_contract.py
@@ -41,6 +41,7 @@ def test_workflow_artifact_handoff_contract_is_defined():
     assert "python scripts/check_bridge_integrity.py" in content
     assert "python scripts/download_valid_aggregate_artifact.py" in content
     assert "python scripts/hydrate_aggregate_history_seed.py --project-root ." in content
+    assert "python scripts/restore_reddit_baseline.py" in content
     assert "steps.previous_history.outputs.expect_previous_history == '1'" in content
     assert "dawidd6/action-download-artifact" not in content
     assert "Stage Reddit artifact payload" in content
@@ -99,6 +100,8 @@ def test_workflow_reddit_job_resets_stale_outputs_and_requires_fresh_latest_file
     assert "datos/latest/reddit_sentimiento_frameworks.csv" in content
     assert "datos/latest/reddit_temas_emergentes.csv" in content
     assert "datos/latest/interseccion_github_reddit.csv" in content
+    assert "Snapshot repo Reddit baseline" in content
+    assert "Restore Reddit source baseline on fallback" in content
     assert "Restore previous Reddit bridges on source fallback" in content
     assert "reddit_temas_history.json" in content
     assert "reddit_interseccion_history.json" in content

From 093d5ffef96807b644374dbc71e709885c13b5a9 Mon Sep 17 00:00:00 2001
From: Sam-24-dev
Date: Sat, 28 Mar 2026 16:43:45 -0500
Subject: [PATCH 2/2] fix(ci): preserve reddit history on baseline fallback

Skip a candidate whose bridge JSON cannot be read or parsed, or whose
payload is not a JSON object, instead of aborting the whole selection.
Keep the existing datos/history tree when the selected candidate ships no
history directory of its own.

---
 scripts/restore_reddit_baseline.py    | 19 +++--
 tests/test_restore_reddit_baseline.py | 95 +++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/scripts/restore_reddit_baseline.py b/scripts/restore_reddit_baseline.py
index c14aeae..f0f4344 100644
--- a/scripts/restore_reddit_baseline.py
+++ b/scripts/restore_reddit_baseline.py
@@ -67,11 +67,20 @@ def _discover_candidate(root: Path) -> Candidate | None:
         if not (root / "datos" / csv_name).exists():
             return None
 
-    topics_date = _parse_snapshot_date(
-        _load_json(topics_bridge).get("latest_snapshot_date")
-    )
+    try:
+        topics_payload = _load_json(topics_bridge)
+        intersection_payload = _load_json(intersection_bridge)
+    except (OSError, ValueError):
+        return None
+    # Valid JSON can still be a non-object (e.g. a list), which has no .get();
+    # treat such a bridge as an invalid candidate instead of crashing.
+    payloads = (topics_payload, intersection_payload)
+    if not all(isinstance(payload, dict) for payload in payloads):
+        return None
+
+    topics_date = _parse_snapshot_date(topics_payload.get("latest_snapshot_date"))
     intersection_date = _parse_snapshot_date(
-        _load_json(intersection_bridge).get("latest_snapshot_date")
+        intersection_payload.get("latest_snapshot_date")
     )
     if topics_date is None or intersection_date is None:
         return None
@@ -134,8 +143,6 @@ def restore_reddit_source_baseline(
         if source_history_root.exists():
             _replace_dir(source_history_root, target_history_root)
         else:
-            if target_history_root.exists():
-                shutil.rmtree(target_history_root)
             target_history_file = (
                 target_history_root
                 / f"year={year}"
diff --git a/tests/test_restore_reddit_baseline.py b/tests/test_restore_reddit_baseline.py
index 3afa166..4496ae7 100644
--- a/tests/test_restore_reddit_baseline.py
+++ b/tests/test_restore_reddit_baseline.py
@@ -70,6 +70,101 @@ def test_restore_reddit_source_baseline_prefers_fresher_repo_candidate(tmp_path)
     ).exists()
 
 
+def test_restore_reddit_source_baseline_preserves_existing_history_when_candidate_lacks_history(
+    tmp_path,
+):
+    project_root = tmp_path / "project"
+    repo_baseline = tmp_path / "repo_baseline"
+
+    _create_candidate(repo_baseline, "2026-03-24")
+    existing_history_file = (
+        project_root
+        / "datos"
+        / "history"
+        / "reddit_temas"
+        / "year=2026"
+        / "month=03"
+        / "day=22"
+        / "reddit_temas_emergentes.csv"
+    )
+    _write(existing_history_file, "col\nlegacy\n")
+
+    summary = restore_reddit_source_baseline(project_root, [repo_baseline])
+
+    assert summary["latest_snapshot_date"] == "2026-03-24"
+    assert existing_history_file.exists()
+    assert (
+        project_root
+        / "datos"
+        / "history"
+        / "reddit_temas"
+        / "year=2026"
+        / "month=03"
+        / "day=24"
+        / "reddit_temas_emergentes.csv"
+    ).exists()
+
+
+def test_restore_reddit_bridges_skips_corrupt_fresher_candidate(tmp_path):
+    project_root = tmp_path / "project"
+    corrupt_repo_baseline = tmp_path / "repo_baseline"
+    prev_artifacts = tmp_path / "prev_artifacts"
+
+    _create_candidate(corrupt_repo_baseline, "2026-03-24")
+    _create_candidate(prev_artifacts, "2026-03-16")
+    _write(
+        corrupt_repo_baseline
+        / "frontend"
+        / "assets"
+        / "data"
+        / "reddit_temas_history.json",
+        "{not-json",
+    )
+
+    summary = restore_reddit_bridges(
+        project_root,
+        [corrupt_repo_baseline, prev_artifacts],
+    )
+
+    restored = json.loads(
+        (
+            project_root
+            / "frontend"
+            / "assets"
+            / "data"
+            / "reddit_temas_history.json"
+        ).read_text(encoding="utf-8")
+    )
+    assert summary["latest_snapshot_date"] == "2026-03-16"
+    assert "prev_artifacts" in summary["selected_root"]
+    assert restored["latest_snapshot_date"] == "2026-03-16"
+
+
+def test_restore_reddit_bridges_skips_non_object_bridge_payload(tmp_path):
+    project_root = tmp_path / "project"
+    malformed_repo_baseline = tmp_path / "repo_baseline"
+    prev_artifacts = tmp_path / "prev_artifacts"
+
+    _create_candidate(malformed_repo_baseline, "2026-03-24")
+    _create_candidate(prev_artifacts, "2026-03-16")
+    _write(
+        malformed_repo_baseline
+        / "frontend"
+        / "assets"
+        / "data"
+        / "reddit_temas_history.json",
+        "[]",
+    )
+
+    summary = restore_reddit_bridges(
+        project_root,
+        [malformed_repo_baseline, prev_artifacts],
+    )
+
+    assert summary["latest_snapshot_date"] == "2026-03-16"
+    assert "prev_artifacts" in summary["selected_root"]
+
+
 def test_restore_reddit_bridges_copies_selected_candidate(tmp_path):
     project_root = tmp_path / "project"
     repo_baseline = tmp_path / "repo_baseline"