pypi · djwooten · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/.github/workflows/account_recovery.yml b/.github/workflows/account_recovery.yml
@@ -0,0 +1,31 @@
+name: Issue Label Trigger
+
+on:
+  issues:
+    types: [labeled]
+
+jobs:
+  parse-issue:
+    runs-on: ubuntu-latest
+    # Run only when the label that was just applied is 'account-recovery'. Testing the issue's full label list
+    # instead would re-run the job (and post another comment) every time any other label is added later.
+    if: github.event.label.name == 'account-recovery'
+    # The script comments on and labels the issue, so the job token needs write access to issues.
+    permissions:
+      contents: read
+      issues: write
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r .github/workflows/autoreplies/requirements.txt
+
+    - name: Run Python script
+      run: python .github/workflows/autoreplies/check_account_recovery.py
+      env:
+        ISSUE_NUMBER: ${{ github.event.issue.number }}
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        GITHUB_ISSUE_OWNER: ${{ github.repository_owner }}
+        GITHUB_ISSUE_REPO: ${{ github.event.repository.name }}
diff --git a/.github/workflows/autoreplies/__init__.py b/.github/workflows/autoreplies/__init__.py
diff --git a/.github/workflows/autoreplies/check_account_recovery.py b/.github/workflows/autoreplies/check_account_recovery.py
@@ -0,0 +1,187 @@
+"""Parse a GitHub issue to automatically aggregate package ownership information to facilitate account recovery.
+
+Steps
+1) finds all PyPI packages maintained by the user
+2) checks each PyPI package to see if its source code repository listed at PyPI belongs to the github user
+3) adds a comment to the issue summarizing the package ownership information
+
+If the github user owns the source code repositories for all of the PyPI packages, or is an administrator for the github
+organization that owns them, then the issue is automatically labeled with "fasttrack".
+
+Environment Variables
+---------------------
+GITHUB_ISSUE_OWNER
+    The owner (e.g., "pypi") of the issue repository
+
+GITHUB_ISSUE_REPO
+    The repository (e.g., "support") where the issue is located
+
+ISSUE_NUMBER
+    The number of the issue to process
+
+GITHUB_TOKEN
+    (Optional) A GitHub token with permissions to comment on the issue and read the repository.
+"""
+
+import os
+import sys
+
+import pypi_utils
+import gh_utils
+
+
+# Issue body headers
+# Markdown h3 header whose section of the issue body holds the requester's PyPI username
+PYPI_USER_HEADER = "PyPI Username"
+
+# Ownership status levels
+# format_markdown_table() uses each value as a column offset (the "X" goes in column 2 + level), and the
+# fast-track decision treats every level greater than ORG_ADMIN as unverified, so the numeric order matters.
+BELONGS = 0
+ORG_ADMIN = 1
+ORG_MEMBER = 2
+# NOTE(review): "OWERNSHIP" is a misspelling of "OWNERSHIP"; renaming it requires updating every use.
+UNKNOWN_OWERNSHIP = 3
+NO_REPO = 4
+
+# This notice indicates that the final determination of account recovery rests with the PyPI team
+BOT_NOTICE = (
+    "### NOTE\n\n"
+    "_This action was performed automatically by a bot and **does not guarantee account recovery**. Account recovery"
+    " requires manual approval processing by the PyPI team._"
+)
+
+
+def sanitize_pypi_user(username: str) -> str:
+    """Return the username without markdown backticks or surrounding whitespace.
+
+    Some users write their usernames like:
+    `username`
+    for pretty markdown purposes, but we don't want the backticks.
+
+    Parameters
+    ----------
+    username: str
+        The raw username text taken from the issue body.
+
+    Returns
+    -------
+    str
+        The cleaned username.
+    """
+    # Remove the backticks first: stripping first would leave the whitespace inside "` name `" untouched.
+    return username.replace("`", "").strip()
+
+
+def format_markdown_table(rows: list) -> str:
+    """Render package ownership rows as a markdown table.
+
+    Parameters
+    ----------
+    rows: list
+        One entry per package, each of the form [package_link, repo_url, ownership_level]. ownership_level is an
+        int offset selecting which of the status columns (Owner, Admin, Member, Unknown, No Repo) gets the "X".
+
+    Returns
+    -------
+    str
+        The header line, the alignment line and one line per row, joined by newlines.
+    """
+    columns = ["Package", "Repository", "Owner", "Admin", "Member", "Unknown", "No Repo"]
+    num_status_columns = len(columns) - 2
+
+    # The two text columns use default alignment; the status columns are centered
+    alignment = ["---", "---"] + [":-:"] * num_status_columns
+    lines = [" | ".join(columns), " | ".join(alignment)]
+
+    for row in rows:
+        cells = [row[0], row[1]] + [""] * num_status_columns
+        cells[2 + row[2]] = "X"
+        lines.append(" | ".join(cells))
+    return "\n".join(lines)
+
+
+def format_markdown_package_link(package_name: str) -> str:
+    """Return a markdown link to the package's PyPI project page, labelled with the package name."""
+    project_url = f"https://pypi.org/project/{package_name}"
+    return f"[{package_name}]({project_url})"
+
+
+def format_markdown_pypi_user_link(pypi_user: str) -> str:
+    """Return a markdown link to the user's PyPI profile page, labelled with the username."""
+    profile_url = f"https://pypi.org/user/{pypi_user}/"
+    return f"[{pypi_user}]({profile_url})"
+
+
+def format_markdown_gh_user_link(gh_user: str) -> str:
+    """Return a markdown link to the user's GitHub profile, labelled with the username."""
+    profile_url = f"https://github.com/{gh_user}/"
+    return f"[{gh_user}]({profile_url})"
+
+
+if __name__ == "__main__":
+    # The workflow passes these in through the environment; the fallback values are presumably for local runs.
+    issue_number = os.environ.get("ISSUE_NUMBER", "4386")
+    github_token = os.environ.get("GITHUB_TOKEN", None)
+    github_issue_owner = os.environ.get("GITHUB_ISSUE_OWNER", "pypi")
+    github_issue_repo = os.environ.get("GITHUB_ISSUE_REPO", "support")
+
+    issue_data = gh_utils.fetch_issue_details(
+        github_issue_owner, github_issue_repo, issue_number, github_token=github_token
+    )
+
+    # The GitHub account that opened the issue is the one whose repository ownership gets checked
+    gh_user = issue_data["user"]
+
+    if PYPI_USER_HEADER not in issue_data["body"]:
+        raise ValueError(f"Issue body does not contain expected header: {PYPI_USER_HEADER}")
+
+    # Read the section through the same constant that was just checked, so the two cannot drift apart
+    pypi_user = sanitize_pypi_user(issue_data["body"][PYPI_USER_HEADER])
+    pypi_user_link = format_markdown_pypi_user_link(pypi_user)
+
+    # Any ValueError raised here propagates and fails the run
+    packages = pypi_utils.get_packages_by_user(pypi_user)
+
+    # If the pypi user is not a maintainer for any packages
+    if not packages:
+        gh_utils.add_issue_comment(
+            f"User {pypi_user_link} has no packages",
+            github_issue_owner,
+            github_issue_repo,
+            issue_number,
+            github_token=github_token,
+        )
+        sys.exit()
+
+    # Loop over all packages to see if they belong to the user
+    package_ownership = []  # List of [package_name, repo_url, ownership_status]
+    for package_name in packages:
+        pypi_package_link = format_markdown_package_link(package_name)
+        package = pypi_utils.get_pypi_project_info(package_name)
+
+        # Package has no source code repo listed at PyPI
+        package_repo_url = package.get("repository_url")
+        if not package_repo_url:
+            package_ownership.append([pypi_package_link, "", NO_REPO])
+            continue
+
+        # Package source repo directly belongs to the gh_user
+        if gh_utils.does_user_own_repo(package_repo_url, gh_user):
+            package_ownership.append([pypi_package_link, package_repo_url, BELONGS])
+            continue
+
+        # If package source repo belongs to an organization - check if the gh_user is a member or admin.
+        # The token is forwarded so that these API requests are authenticated like every other request here.
+        org_status = gh_utils.get_user_role_in_org(package_repo_url, gh_user, github_token=github_token)
+        if org_status == "admin":
+            ownership_level = ORG_ADMIN
+        elif org_status == "member":
+            ownership_level = ORG_MEMBER
+        else:
+            # Otherwise the source repo may not belong to the gh_user
+            ownership_level = UNKNOWN_OWERNSHIP
+        package_ownership.append([pypi_package_link, package_repo_url, ownership_level])
+
+    # Add a comment to the issue with the package ownership information
+    table = format_markdown_table(package_ownership)
+    comment = "\n\n".join(["### Package Ownership", table, BOT_NOTICE])
+
+    # Fast-track only when every package is owned by the user or by an organization the user administers
+    all_verified = all(row[2] <= ORG_ADMIN for row in package_ownership)
+    label = "fasttrack" if all_verified else ""
+
+    # Commenting and labelling are best-effort: a failure is reported in the log but does not abort the run
+    try:
+        gh_utils.add_issue_comment(
+            comment, github_issue_owner, github_issue_repo, issue_number, github_token=github_token
+        )
+    except Exception as e:
+        print(f"Failed to add comment to issue {issue_number}: {e}")
+        print("Comment:")
+        print(comment)
+
+    if label:
+        try:
+            gh_utils.add_label_to_issue(
+                label, github_issue_owner, github_issue_repo, issue_number, github_token=github_token
+            )
+        except Exception as e:
+            print(f"Failed to add label to issue {issue_number}: {e}")
diff --git a/.github/workflows/autoreplies/gh_utils.py b/.github/workflows/autoreplies/gh_utils.py
@@ -0,0 +1,202 @@
+import re
+from urllib.parse import urlparse
+
+import requests
+
+
+def fetch_issue_details(gh_user: str, repo_name: str, issue_number, github_token=None) -> dict:
+    """Fetch issue details using the GitHub API.
+
+    Parameters
+    ----------
+    gh_user: str
+        The owner of the repository that contains the issue.
+
+    repo_name: str
+        The name of the repository that contains the issue.
+
+    issue_number
+        The number of the issue to fetch.
+
+    github_token: str
+        (Optional) Token sent in the Authorization header.
+
+    Returns
+    -------
+    dict
+        The fields extracted by parse_issue_details().
+
+    Raises
+    ------
+    ValueError
+        If the API does not answer with status 200.
+    """
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+
+    url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}"
+    # Without a timeout an unresponsive server would block the run indefinitely
+    response = requests.get(url, headers=headers, timeout=30)
+    if response.status_code == 200:
+        return parse_issue_details(response.json())
+    raise ValueError(f"Failed to fetch issue details: {response.status_code}")
+
+
+def parse_issue_details(issue: dict) -> dict:
+    """Parse a GitHub issue metadata to retrieve relevant fields.
+
+    Parameters
+    ----------
+    issue: dict
+        The issue object as returned by the GitHub API.
+
+    Returns
+    -------
+    dict
+        A dictionary with the keys "created_at", "user" (the login of the issue author), "url" and "body"
+        (the issue text split into sections by parse_issue_body()).
+    """
+    # An issue submitted without any text has a null body; treat it as empty instead of crashing the parser
+    body = parse_issue_body(issue.get("body") or "")
+    return {
+        "created_at": issue["created_at"],
+        "user": issue["user"]["login"],
+        "url": issue["html_url"],
+        "body": body,
+    }
+
+
+def parse_issue_body(body: str) -> dict:
+    """Parse the body of a GitHub issue into a dictionary.
+
+    This function works well with the issue templates, though may run into trouble if users include "### " in their own
+    body text.
+
+    For example:
+
+        ### Some header
+
+        abcd 123 ab
+        cdefg
+
+        ### Another header
+
+        Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+
+    will get processed to:
+    {
+        "Some header": "abcd 123 ab\ncdefg",
+        "Another header": "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+    }
+
+    Blank lines are dropped, every kept line is stripped of surrounding whitespace, and any text that appears before
+    the first header is ignored. If a header occurs more than once, the last occurrence wins.
+
+    Parameters
+    ----------
+    body: str
+        The body of the issue. None or an empty string yields an empty dictionary.
+
+    Returns
+    -------
+        dict
+            A dictionary with the issue text keyed by the markdown headers (h3)
+    """
+    RE_GH_ISSUE_HEADER = re.compile(r"### (?P<key>.+)")  # This finds lines beginning with "### " to use as keys
+    body_dict = {}
+    cur_key = None
+    cur_lines = []
+    for line in (body or "").strip().split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+        header_match = RE_GH_ISSUE_HEADER.match(line)
+        if header_match:
+            if cur_key is not None:
+                body_dict[cur_key] = "\n".join(cur_lines)
+            cur_key = header_match.group("key")
+            # Always start a fresh section, so text preceding the first header cannot leak into it
+            cur_lines = []
+        elif cur_key is not None:
+            cur_lines.append(line)
+
+    # The last section has no following header to trigger its storage, so store it here
+    if cur_key is not None:
+        body_dict[cur_key] = "\n".join(cur_lines)
+    return body_dict
+
+
+def _sanitize_url(url: str) -> str:
+    """Ensure the URL starts with "http://" or "https://", and lowercases the URL since GitHub is case-insensitive.
+
+    Surrounding whitespace is removed. A URL without one of the two schemes gets "https://" prepended.
+    """
+    url = url.strip().lower()
+    # Test for the full scheme: a bare "http" prefix would also match hosts such as "httpbin.org"
+    if not url.startswith(("http://", "https://")):
+        url = f"https://{url}"
+    return url
+
+
+def _is_user_in_org(org_name, username, github_token=None):
+    """Return True if the user is a publicly listed member of the organization.
+
+    Queries the organization members endpoint of the GitHub API and reports membership only when the final
+    response has status 204.
+    """
+    url = f"https://api.github.com/orgs/{org_name}/members/{username}"
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    # Without a timeout an unresponsive server would block the run indefinitely
+    response = requests.get(url, headers=headers, timeout=30)
+    return response.status_code == 204
+
+
+def _is_user_owner_of_org(org_name, username, github_token=None):
+    """Return True if the user is an owner of the organization.
+
+    Queries the organization memberships endpoint of the GitHub API; the user counts as an owner when the
+    request succeeds with status 200 and the returned "role" is "admin". Any other response yields False.
+    """
+    url = f"https://api.github.com/orgs/{org_name}/memberships/{username}"
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    # Without a timeout an unresponsive server would block the run indefinitely
+    response = requests.get(url, headers=headers, timeout=30)
+    if response.status_code != 200:
+        return False
+    return response.json().get("role") == "admin"
+
+
+def _is_github_pages_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool:
+    """Return True if the URL is a GitHub Pages URL for the GitHub user's account.
+
+    The comparison is case-insensitive and ignores a leading "www." on the hostname.
+    """
+    parsed_url = urlparse(_sanitize_url(code_repo_url))
+
+    # Normalize domain. Only a leading "www." is dropped: removing the substring anywhere would corrupt
+    # hostnames such as "awww.github.io".
+    hostname = parsed_url.hostname or ""
+    if hostname.startswith("www."):
+        hostname = hostname[len("www."):]
+    return hostname == f"{gh_user}.github.io".lower()
+
+
+def _is_github_repo_belonging_to_owner(code_repo_url: str, gh_user: str) -> bool:
+    """Return True if the URL is a GitHub repo associated to the GitHub user's account.
+
+    The URL must point at github.com (a leading "www." is ignored) and the first path segment must equal the
+    username, compared case-insensitively.
+    """
+    parsed_url = urlparse(_sanitize_url(code_repo_url))
+
+    # Normalize domain. Only a leading "www." is dropped: removing the substring anywhere could turn an
+    # unrelated hostname into "github.com".
+    hostname = parsed_url.hostname or ""
+    if hostname.startswith("www."):
+        hostname = hostname[len("www."):]
+
+    # Check if the domain is github.com
+    if hostname != "github.com":
+        return False
+
+    # The first path segment of a repository URL is the owning account
+    repo_owner = parsed_url.path.strip("/").split("/")[0]
+
+    # A URL without a path has an empty owner segment, which must never match
+    return bool(repo_owner) and repo_owner == gh_user.lower()
+
+
+def get_user_role_in_org(code_repo_url: str, gh_user: str, github_token=None) -> str:
+    """Determines the role of the user in an organization.
+
+    The organization name is taken from the URL: the subdomain of a "<name>.github.io" address, or the first
+    path segment of a github.com address.
+
+    Parameters
+    ----------
+    code_repo_url: str
+        The URL of the repository. This can be a GitHub Pages URL or a GitHub repository URL.
+
+    gh_user: str
+        The GitHub username to check for.
+
+    github_token: str
+        The GitHub token to use for API requests.
+
+    Returns
+    -------
+    str
+        "member" or "admin", or an empty string if the user is not in the organization or the URL does not
+        point at GitHub.
+    """
+    parsed_url = urlparse(_sanitize_url(code_repo_url))
+
+    # Normalize domain. Only a leading "www." is dropped: removing the substring anywhere would corrupt
+    # hostnames such as "awww.github.io".
+    hostname = (parsed_url.hostname or "").lower()
+    if hostname.startswith("www."):
+        hostname = hostname[len("www."):]
+
+    RE_GH_PAGES = re.compile(r"^(?P<org_name>.+)\.github\.io$")
+    pages_match = RE_GH_PAGES.match(hostname)
+    if pages_match:
+        org_name = pages_match.group("org_name")
+    elif hostname == "github.com":
+        org_name = parsed_url.path.strip("/").split("/")[0]
+    else:
+        return ""
+
+    # A bare "github.com" URL names no organization, so there is nothing to look up
+    if not org_name:
+        return ""
+
+    if not _is_user_in_org(org_name, gh_user, github_token=github_token):
+        return ""
+    if _is_user_owner_of_org(org_name, gh_user, github_token=github_token):
+        return "admin"
+    return "member"
+
+
+def does_user_own_repo(code_repo_url: str, gh_user: str) -> bool:
+    """Return True if the URL is a repository or a GitHub Pages site under the GitHub user's own account."""
+    if _is_github_repo_belonging_to_owner(code_repo_url, gh_user):
+        return True
+    return _is_github_pages_belonging_to_owner(code_repo_url, gh_user)
+
+
+def add_issue_comment(comment: str, gh_user: str, repo_name: str, issue_number, github_token=None):
+    """Add a comment to a GitHub issue.
+
+    Parameters
+    ----------
+    comment: str
+        The markdown text of the comment.
+
+    gh_user: str
+        The owner of the repository that contains the issue.
+
+    repo_name: str
+        The name of the repository that contains the issue.
+
+    issue_number
+        The number of the issue to comment on.
+
+    github_token: str
+        (Optional) Token sent in the Authorization header.
+
+    Returns
+    -------
+        The decoded JSON response of the API.
+
+    Raises
+    ------
+    ValueError
+        If the API does not answer with status 201.
+    """
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}/comments"
+    # Without a timeout an unresponsive server would block the run indefinitely
+    response = requests.post(url, json={"body": comment}, headers=headers, timeout=30)
+    if response.status_code != 201:
+        raise ValueError(f"Failed to add comment: {response.status_code}")
+    return response.json()
+
+
+def add_label_to_issue(label: str, gh_user: str, repo_name: str, issue_number, github_token=None):
+    """Add a label to a GitHub issue.
+
+    Parameters
+    ----------
+    label: str
+        The name of the label to add.
+
+    gh_user: str
+        The owner of the repository that contains the issue.
+
+    repo_name: str
+        The name of the repository that contains the issue.
+
+    issue_number
+        The number of the issue to label.
+
+    github_token: str
+        (Optional) Token sent in the Authorization header.
+
+    Returns
+    -------
+        The decoded JSON response of the API.
+
+    Raises
+    ------
+    ValueError
+        If the API does not answer with status 200.
+    """
+    headers = {"Authorization": f"token {github_token}"} if github_token else {}
+    url = f"https://api.github.com/repos/{gh_user}/{repo_name}/issues/{issue_number}/labels"
+    # Without a timeout an unresponsive server would block the run indefinitely
+    response = requests.post(url, json=[label], headers=headers, timeout=30)
+    if response.status_code != 200:
+        raise ValueError(f"Failed to add label: {response.status_code}")
+    return response.json()
diff --git a/.github/workflows/autoreplies/pypi_utils.py b/.github/workflows/autoreplies/pypi_utils.py
@@ -0,0 +1,91 @@
+import re
+import time
+from typing import Dict
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_packages_by_user(username: str) -> list:
+    """Parse html to get a list of packages for a given PyPI user.
+
+    The pypi api does not provide a way to get a list of packages for a user, hence crawling the html.
+
+    Steps:
+    1) Queries the PyPI user page for the given username.
+    2) Parses the html to get the number of projects and the list of packages. This assumes that the number of projects
+        listed on the page is in the first <h2> tag, in the form "X project" or "X projects".
+    3) Loops over all elements of <a class="package-snippet"> to get the package names.
+    4) Ensure that the number of packages found is equal to the number of projects reported. If not, raise an error.
+    5) Return the list of package names.
+
+    Step 2 is to avoid having to handle pagination of projects. As of now the user with the most projects I have seen
+    has 43, and there was no pagination. If pagination is detected, this function will raise an error.
+
+    Parameters
+    ----------
+        username: str
+            The PyPI username to search for.
+
+    Returns
+    -------
+        list
+            A list of package names
+
+    Raises
+    ------
+        ValueError
+            If the user page cannot be retrieved, the project count cannot be read from the page, or the count
+            does not match the number of packages found.
+    """
+    # NOTE(review): the pause is presumably there to avoid hammering pypi.org - confirm before removing
+    time.sleep(1)
+    url = f"https://pypi.org/user/{username}/"
+    # Without a timeout an unresponsive server would block the run indefinitely
+    response = requests.get(url, timeout=30)
+    if response.status_code != 200:
+        raise ValueError(f"Error retrieving project data for user {username}")
+
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    # Get the reported number of projects maintained by this user, to ensure we later don't miss any
+    project_count_header = soup.find("h2")
+    if project_count_header is None:
+        # A page without any <h2> would otherwise surface as an unrelated AttributeError
+        raise ValueError(f"Could not determine the number of projects for user {username}")
+    num_projects_text = project_count_header.text.lower()
+    num_projects_text = num_projects_text.replace("no projects", "0 projects")
+
+    RE_PROJECT_COUNT = re.compile(r"\s*(?P<num_projects>\d+)\s*project(?:s)?")
+    re_num_project_match = RE_PROJECT_COUNT.match(num_projects_text)
+    if not re_num_project_match:
+        raise ValueError(f"Could not determine the number of projects for user {username}")
+
+    num_projects = int(re_num_project_match.group("num_projects"))
+    # The first line of each snippet's text is taken as the package name
+    packages = [a.text.strip().split("\n")[0] for a in soup.find_all("a", class_="package-snippet")]
+    # Check for pagination: if num_projects > len(packages) then there are probably more pages
+    # which aren't handled here yet
+    if len(packages) != num_projects:
+        raise ValueError(f"num_projects {num_projects} != num_packages {len(packages)} for user {username}")
+    return packages
+
+
+def get_pypi_project_info(package_name: str) -> Dict[str, str]:
+    """Retrieve relevant information about a PyPI project.
+
+    Parameters
+    ----------
+        package_name: str
+            The name of the package to query.
+
+    Returns
+    -------
+        Dict[str, str]
+            A dictionary containing the following keys:
+                - repository_url ("" if no repository or homepage is listed)
+                - author
+                - author_email
+
+    Raises
+    ------
+        ValueError
+            If the project information cannot be retrieved.
+    """
+    # NOTE(review): the pause is presumably there to avoid hammering pypi.org - confirm before removing
+    time.sleep(1)
+    url = f"https://pypi.org/pypi/{package_name}/json"
+    # Without a timeout an unresponsive server would block the run indefinitely
+    response = requests.get(url, timeout=30)
+    if response.status_code != 200:
+        raise ValueError(f"Error retrieving project info for {package_name}")
+
+    data = response.json()
+    info = data.get("info", {}) or {}
+    project_urls = info.get("project_urls", {}) or {}
+
+    # Prefer the explicit source link, then the homepage from project_urls, then the top-level "home_page"
+    # field, so a project that lists only that field is not reported as having no repository. Empty or null
+    # entries are skipped so the result is always a string.
+    repository_url = project_urls.get("Source") or project_urls.get("Homepage") or info.get("home_page") or ""
+    return {
+        "repository_url": repository_url,
+        "author": info.get("author"),
+        "author_email": info.get("author_email"),
+    }
diff --git a/.github/workflows/autoreplies/requirements.txt b/.github/workflows/autoreplies/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4>=4.9.1
+requests>=2.24.0
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,163 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+