Adding support to GTN import for events #2938

Merged · 1 commit · Feb 14, 2025

44 changes: 44 additions & 0 deletions .github/workflows/gtn-import.yml
@@ -0,0 +1,44 @@
---
# Pull news/events from the Galaxy Training Network's RSS feed and add them as
# Galaxy Hub posts.
name: Galaxy Training Network news/events

on:
workflow_dispatch:
schedule:
- cron: "0 3 * * *"

jobs:
collect:
name: Collect news/events from the Galaxy Training Network
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_NAME: ${{ github.repository }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r utils/gtn-import-requirements.txt

- name: Parse GTN news RSS feed
env:
FEED_URL: https://training.galaxyproject.org/training-material/feed.xml
IMPORT_TYPE: news
run: python utils/gtn-import.py

- name: Parse GTN events RSS feed
env:
FEED_URL: https://training.galaxyproject.org/training-material/events/feed.xml
IMPORT_TYPE: events
START_DATE: "2025-02-14"
run: python utils/gtn-import.py
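The workflow drives utils/gtn-import.py entirely through environment variables. A minimal local-run sketch, not part of the PR, assuming the script reads only the variables the workflow sets (GITHUB_TOKEN and REPO_NAME must also be present, or the script exits early):

```python
# Hypothetical local run mirroring the "Parse GTN events RSS feed" step above.
import os
import subprocess
import sys

env = dict(
    os.environ,
    FEED_URL="https://training.galaxyproject.org/training-material/events/feed.xml",
    IMPORT_TYPE="events",
    START_DATE="2025-02-14",
)
subprocess.run([sys.executable, "utils/gtn-import.py"], env=env, check=True)
```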
35 changes: 0 additions & 35 deletions .github/workflows/gtn-news.yml

This file was deleted.

2 changes: 2 additions & 0 deletions utils/gtn-import-requirements.txt
@@ -2,3 +2,5 @@ feedparser
PyYAML
PyGithub
python-dateutil
geopy
country-converter
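The two new dependencies cover the event location handling added below: geopy reverse-geocodes the feed's georss coordinates to a city and country, and country-converter maps that country to a continent name. An illustrative sketch, separate from the diff; the coordinates are arbitrary example values:

```python
# Illustration of the new dependencies; values are arbitrary examples.
from country_converter import CountryConverter
from geopy.geocoders import Nominatim

# Reverse-geocode a latitude/longitude pair to a human-readable address.
address = Nominatim(user_agent="GTN").reverse((48.8566, 2.3522), language="en").raw["address"]
print(address.get("city"), address.get("country"))  # e.g. Paris France

# Map the country name to a continent name (e.g. "Europe").
print(CountryConverter().convert(names=address.get("country"), to="continent"))
```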
113 changes: 91 additions & 22 deletions utils/gtn-import.py
@@ -1,17 +1,20 @@
import html
import logging
import os
import re
import sys
from datetime import datetime

import feedparser
import yaml
from country_converter import CountryConverter
from dateutil.parser import isoparse
from geopy.geocoders import Nominatim
from github import Github, GithubException

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

feed = feedparser.parse(os.getenv("GTN_NEWS_FEED_URL"))
feed = feedparser.parse(os.getenv("FEED_URL"))

g = Github(os.getenv("GITHUB_TOKEN") or sys.exit("GITHUB_TOKEN not set"))
repo = g.get_repo(os.getenv("REPO_NAME") or sys.exit("REPO_NAME not set"))
@@ -22,21 +25,32 @@
for file in pr.get_files()
]

branch_name = f"import-gtn-posts-{datetime.now().strftime('%Y%m%d%H%M%S')}"
import_type = os.getenv("IMPORT_TYPE")
if import_type not in {"news", "events"}:
sys.exit("IMPORT_TYPE should be either news or events")

branch_name = f"import-gtn-{import_type}-{datetime.now().strftime('%Y%m%d%H%M%S')}"
repo.create_git_ref(
ref=f"refs/heads/{branch_name}", sha=repo.get_branch(default_branch).commit.sha
)


created_files = []
for entry in feed.entries:
for entry in feed.get("entries", []):
title = html.unescape(entry.get("title", "Untitled"))
date_ymd = isoparse(
entry.get("published") or entry.get("pubDate") or entry.get("updated")
).strftime("%Y-%m-%d")

tags = {"training", "gtn-news"} | {
tag["term"] for tag in entry.get("tags", []) if "term" in tag
}
start_date = os.getenv("START_DATE")
if start_date and date_ymd < start_date:
logging.info(f"Skipping post {title} published on {date_ymd}")
continue

tags = {"training", "gtn-news"} if import_type == "news" else set()
for tag in entry.get("tags", []):
if "term" in tag:
tags.add(tag["term"])
if "already-on-hub" in tags:
continue

@@ -45,7 +59,7 @@
summary = html.unescape(entry.get("summary", ""))

slug = os.path.splitext(os.path.basename(link))[0]
folder = f"{date_ymd}-{slug}"
folder = f"{date_ymd}-{slug}" if import_type == "news" else f"{slug}"

pr_exists = False
for pr_url, file_path in existing_files:
@@ -56,23 +70,78 @@
if pr_exists:
continue

folder_path = os.path.join("content", "news", folder)
folder_path = os.path.join("content", import_type, folder)
if os.path.exists(folder_path):
logging.info(f"Folder Already exists: {folder}")
continue

logging.info(f"New post: {folder}")
created_files.append(f"[{title}]({link})")
meta = {
"subsites": ["all"],
"main_subsite": "global",
"date": date_ymd,
"tags": list(tags),
"title": str(title),
"authors": authors,
"external_url": link,
"tease": str(summary.split(". ")[0]),
}

logging.info(f"New {import_type}: {folder}")

if import_type == "news":
meta = {
"subsites": ["all"],
"main_subsite": "global",
"date": date_ymd,
"tags": list(tags),
"title": str(title),
"authors": authors,
"external_url": link,
"tease": str(summary.split(". ")[0]),
}
elif import_type == "events":
event_str = title.replace("\u2009", " ").replace("–", "-").strip()
pattern = r"\[(\w+)\s+(\d{1,2})\s*-\s*(\d{1,2}),\s*(\d{4})\]\s*(.+)"
match = re.match(pattern, event_str)
if match:
month, start_day, end_day, year, title = match.groups()
start_date = datetime.strptime(f"{start_day} {month} {year}", "%d %B %Y")
end_date = datetime.strptime(f"{end_day} {month} {year}", "%d %B %Y")
duration = (end_date - start_date).days + 1
date = start_date.strftime("%Y-%m-%d")
else:
date = date_ymd
duration = 1

gtn = "external" not in entry.get("category")

if geo := entry.get("georss"):
location_raw = (
Nominatim(user_agent="GTN")
.reverse(map(float, geo.split()), language="en")
.raw
)
city = location_raw.get("address", {}).get("city")
country = location_raw.get("address", {}).get("country")
location = f"{city}, {country}"
continent_name = CountryConverter().convert(names=country, to="continent")
continent_map = {
"Africa": "AF",
"Asia": "AS",
"Australia": "AU",
"Europe": "EU",
"North America": "NA",
"South America": "SA",
"Oceania": "AU",
}
continent = continent_map.get(continent_name, "GL")
else:
location = "Online"
continent = "GL"
meta = {
"subsites": ["all"],
"gtn": gtn,
"date": date,
"days": duration,
"tags": list(tags),
"title": str(title),
"contact": authors,
"location": {"name": location},
"continent": continent,
"external_url": link,
"tease": str(summary.split(". ")[0]),
}
md_config = yaml.dump(
meta, default_flow_style=False, sort_keys=False, allow_unicode=True
)
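A worked example of the title parsing above, using a made-up feed title in the bracketed date-range format the regex expects:

```python
# Made-up title; demonstrates how the pattern above yields the start date,
# the duration in days, and the cleaned title.
import re
from datetime import datetime

pattern = r"\[(\w+)\s+(\d{1,2})\s*-\s*(\d{1,2}),\s*(\d{4})\]\s*(.+)"
event_str = "[February 10 - 14, 2025] Example Galaxy Training Week"

month, start_day, end_day, year, title = re.match(pattern, event_str).groups()
start_date = datetime.strptime(f"{start_day} {month} {year}", "%d %B %Y")
end_date = datetime.strptime(f"{end_day} {month} {year}", "%d %B %Y")
print(start_date.strftime("%Y-%m-%d"), (end_date - start_date).days + 1, title)
# -> 2025-02-10 5 Example Galaxy Training Week
```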
@@ -85,13 +154,13 @@

try:
pr = repo.create_pull(
title="Import GTN Posts",
body=f"This PR imports new GTN posts.\nNew posts:\n{"\n".join(created_files)}",
title=f"Import GTN {import_type.capitalize()}",
body=f"This PR imports new GTN {import_type.capitalize()}.\n\n{"\n".join(created_files)}",
head=branch_name,
base=default_branch,
)
logging.info(
f"Pull request created: {pr.html_url}\nTotal new posts: {len(created_files)}"
f"Pull request created: {pr.html_url}\nTotal new {import_type}: {len(created_files)}"
)
except GithubException as e:
repo.get_git_ref(f"heads/{branch_name}").delete()
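For reference, the kind of event front matter the script ends up committing: a hand-written sample dumped with the same yaml.dump options as above; all field values are invented.

```python
# Invented sample values; only the dump call matches the script above.
import yaml

meta = {
    "subsites": ["all"],
    "gtn": True,
    "date": "2025-02-10",
    "days": 5,
    "tags": ["training"],
    "title": "Example Galaxy Training Week",
    "contact": ["Jane Doe"],
    "location": {"name": "Paris, France"},
    "continent": "EU",
    "external_url": "https://training.galaxyproject.org/training-material/events/example.html",
    "tease": "A made-up teaser sentence.",
}
print(yaml.dump(meta, default_flow_style=False, sort_keys=False, allow_unicode=True))
```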