From 19bf7906028def904be3646ef28e1abbdcdd9165 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 11 Feb 2025 16:17:39 +0100 Subject: [PATCH] Adding support to GTN import for events --- .github/workflows/gtn-import.yml | 44 ++++++++++++ .github/workflows/gtn-news.yml | 35 --------- utils/gtn-import-requirements.txt | 2 + utils/gtn-import.py | 113 ++++++++++++++++++++++++------ 4 files changed, 137 insertions(+), 57 deletions(-) create mode 100644 .github/workflows/gtn-import.yml delete mode 100644 .github/workflows/gtn-news.yml diff --git a/.github/workflows/gtn-import.yml b/.github/workflows/gtn-import.yml new file mode 100644 index 0000000000..0fe005a13f --- /dev/null +++ b/.github/workflows/gtn-import.yml @@ -0,0 +1,44 @@ +--- +# Pull news/events from the Galaxy Training Network's RSS feed and add them as +# Galaxy Hub posts. +name: Galaxy Training Network news/events + +on: + workflow_dispatch: + schedule: + - cron: "0 3 * * *" + +jobs: + collect: + name: Collect news/events from the Galaxy Training Network + runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_NAME: ${{ github.repository }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r utils/gtn-import-requirements.txt + + - name: Parse GTN news RSS feed + env: + FEED_URL: https://training.galaxyproject.org/training-material/feed.xml + IMPORT_TYPE: news + run: python utils/gtn-import.py + + - name: Parse GTN events RSS feed + env: + FEED_URL: https://training.galaxyproject.org/training-material/events/feed.xml + IMPORT_TYPE: events + START_DATE: "2025-02-14" + run: python utils/gtn-import.py diff --git a/.github/workflows/gtn-news.yml b/.github/workflows/gtn-news.yml deleted file mode 100644 index 7b43a2bebd..0000000000 --- a/.github/workflows/gtn-news.yml +++ /dev/null @@ -1,35 +0,0 @@ ---- -# Pull news from the Galaxy Training Network's RSS feed and add them as -# Galaxy Hub posts. -name: Galaxy Training Network news - -on: - workflow_dispatch: - schedule: - - cron: "0 3 * * *" - -jobs: - collect: - name: Collect news from the Galaxy Training Network - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r utils/gtn-import-requirements.txt - - - name: Parse GTN RSS feed - env: - GTN_NEWS_FEED_URL: https://training.galaxyproject.org/training-material/feed.xml - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_NAME: ${{ github.repository }} - run: python utils/gtn-import.py diff --git a/utils/gtn-import-requirements.txt b/utils/gtn-import-requirements.txt index de9c38b1f4..78d182dd45 100644 --- a/utils/gtn-import-requirements.txt +++ b/utils/gtn-import-requirements.txt @@ -2,3 +2,5 @@ feedparser PyYAML PyGithub python-dateutil +geopy +country-converter diff --git a/utils/gtn-import.py b/utils/gtn-import.py index 7898ca6736..132f49cadf 100644 --- a/utils/gtn-import.py +++ b/utils/gtn-import.py @@ -1,17 +1,20 @@ import html import logging import os +import re import sys from datetime import datetime import feedparser import yaml +from country_converter import CountryConverter from dateutil.parser import isoparse +from geopy.geocoders import Nominatim from github import Github, GithubException logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") -feed = feedparser.parse(os.getenv("GTN_NEWS_FEED_URL")) +feed = feedparser.parse(os.getenv("FEED_URL")) g = Github(os.getenv("GITHUB_TOKEN") or sys.exit("GITHUB_TOKEN not set")) repo = g.get_repo(os.getenv("REPO_NAME") or sys.exit("REPO_NAME not set")) @@ -22,21 +25,32 @@ for file in pr.get_files() ] -branch_name = f"import-gtn-posts-{datetime.now().strftime('%Y%m%d%H%M%S')}" +import_type = os.getenv("IMPORT_TYPE") +if import_type not in {"news", "events"}: + sys.exit("IMPORT_TYPE should be either news or events") + +branch_name = f"import-gtn-{import_type}-{datetime.now().strftime('%Y%m%d%H%M%S')}" repo.create_git_ref( ref=f"refs/heads/{branch_name}", sha=repo.get_branch(default_branch).commit.sha ) + created_files = [] -for entry in feed.entries: +for entry in feed.get("entries", []): title = html.unescape(entry.get("title", "Untitled")) date_ymd = isoparse( entry.get("published") or entry.get("pubDate") or entry.get("updated") ).strftime("%Y-%m-%d") - tags = {"training", "gtn-news"} | { - tag["term"] for tag in entry.get("tags", []) if "term" in tag - } + start_date = os.getenv("START_DATE") + if start_date and date_ymd < start_date: + logging.info(f"Skipping post {title} published on {date_ymd}") + continue + + tags = {"training", "gtn-news"} if import_type == "news" else set() + for tag in entry.get("tags", []): + if "term" in tag: + tags.add(tag["term"]) if "already-on-hub" in tags: continue @@ -45,7 +59,7 @@ summary = html.unescape(entry.get("summary", "")) slug = os.path.splitext(os.path.basename(link))[0] - folder = f"{date_ymd}-{slug}" + folder = f"{date_ymd}-{slug}" if import_type == "news" else f"{slug}" pr_exists = False for pr_url, file_path in existing_files: @@ -56,23 +70,78 @@ if pr_exists: continue - folder_path = os.path.join("content", "news", folder) + folder_path = os.path.join("content", import_type, folder) if os.path.exists(folder_path): logging.info(f"Folder Already exists: {folder}") continue - logging.info(f"New post: {folder}") created_files.append(f"[{title}]({link})") - meta = { - "subsites": ["all"], - "main_subsite": "global", - "date": date_ymd, - "tags": list(tags), - "title": str(title), - "authors": authors, - "external_url": link, - "tease": str(summary.split(". ")[0]), - } + + logging.info(f"New {import_type}: {folder}") + + if import_type == "news": + meta = { + "subsites": ["all"], + "main_subsite": "global", + "date": date_ymd, + "tags": list(tags), + "title": str(title), + "authors": authors, + "external_url": link, + "tease": str(summary.split(". ")[0]), + } + elif import_type == "events": + event_str = title.replace("\u2009", " ").replace("–", "-").strip() + pattern = r"\[(\w+)\s+(\d{1,2})\s*-\s*(\d{1,2}),\s*(\d{4})\]\s*(.+)" + match = re.match(pattern, event_str) + if match: + month, start_day, end_day, year, title = match.groups() + start_date = datetime.strptime(f"{start_day} {month} {year}", "%d %B %Y") + end_date = datetime.strptime(f"{end_day} {month} {year}", "%d %B %Y") + duration = (end_date - start_date).days + 1 + date = start_date.strftime("%Y-%m-%d") + else: + date = date_ymd + duration = 1 + + gtn = "external" not in entry.get("category") + + if geo := entry.get("georss"): + location_raw = ( + Nominatim(user_agent="GTN") + .reverse(map(float, geo.split()), language="en") + .raw + ) + city = location_raw.get("address", {}).get("city") + country = location_raw.get("address", {}).get("country") + location = f"{city}, {country}" + continent_name = CountryConverter().convert(names=country, to="continent") + continent_map = { + "Africa": "AF", + "Asia": "AS", + "Australia": "AU", + "Europe": "EU", + "North America": "NA", + "South America": "SA", + "Oceania": "AU", + } + continent = continent_map.get(continent_name, "GL") + else: + location = "Online" + continent = "GL" + meta = { + "subsites": ["all"], + "gtn": gtn, + "date": date, + "days": duration, + "tags": list(tags), + "title": str(title), + "contact": authors, + "location": {"name": location}, + "continent": continent, + "external_url": link, + "tease": str(summary.split(". ")[0]), + } md_config = yaml.dump( meta, default_flow_style=False, sort_keys=False, allow_unicode=True ) @@ -85,13 +154,13 @@ try: pr = repo.create_pull( - title="Import GTN Posts", - body=f"This PR imports new GTN posts.\nNew posts:\n{"\n".join(created_files)}", + title=f"Import GTN {import_type.capitalize()}", + body=f"This PR imports new GTN {import_type.capitalize()}.\n\n{"\n".join(created_files)}", head=branch_name, base=default_branch, ) logging.info( - f"Pull request created: {pr.html_url}\nTotal new posts: {len(created_files)}" + f"Pull request created: {pr.html_url}\nTotal new {import_type}: {len(created_files)}" ) except GithubException as e: repo.get_git_ref(f"heads/{branch_name}").delete()