Skip to content

Stats Scraper

Stats Scraper #151

Workflow file for this run

name: Stats Scraper

on:
  # Run daily at midnight UTC.
  schedule:
    - cron: '0 0 * * *'
  # Allow manual triggering.
  workflow_dispatch:
  # Run for pushes to main and for pull requests targeting main.
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
jobs:
  # Static checks: black formatting, isort import order, and flake8.
  lint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as the string "3.10".
          python-version: '3.10'
          cache: 'pip'

      - name: Install linting dependencies
        run: |
          python -m pip install --upgrade pip
          pip install \
            flake8 flake8-unused-arguments flake8-import-order \
            flake8-builtins flake8-docstrings \
            black isort

      - name: Check formatting with black
        run: black --check .

      - name: Check import order with isort
        run: isort --check --diff .

      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # check for unused imports and variables
          flake8 . --count --select=F401,F841 --show-source --statistics
          # exit-zero treats all errors as warnings
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=100 --statistics

  # Runs each scraper script under scripts/ in sequence.
  # NOTE(review): this job has no `if:` guard, so it runs for every trigger
  # of the workflow, not only the schedule. Runs triggered from forks do not
  # receive repository secrets - confirm that running the scrapers on
  # push / pull_request events is intended.
  scrape_stats:
    runs-on: ubuntu-latest
    # Non-secret settings shared by every step of this job. An `env` block on
    # a single step does not carry over to later steps, so shared values must
    # be declared at job level. Each value falls back to the default after
    # `||` when the repository variable is unset.
    env:
      # GitHub configuration
      GITHUB_OWNER: ${{ vars.GITHUB_OWNER || 'apache' }}
      GITHUB_REPO: ${{ vars.GITHUB_REPO || 'superset' }}
      # Database configuration
      DATABASE_TYPE: ${{ vars.DATABASE_TYPE || 'motherduck' }}
      MOTHERDUCK_DATABASE: ${{ vars.MOTHERDUCK_DATABASE || 'superset_stats' }}
      # Matomo configuration
      MATOMO_SITE_ID: ${{ vars.MATOMO_SITE_ID || '22' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as the string "3.10".
          python-version: '3.10'
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Create config.yaml from example if it doesn't exist
      - name: Create config file
        run: |
          if [ ! -f config.yaml ]; then
            cp config.yaml.example config.yaml
          fi

      # Secrets are passed per step, so each scraper only receives the
      # credentials listed in its own `env` block.

      # --- GitHub scrapers ---
      - name: Run star ranking scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/star_ranking.py

      - name: Run repository visitors scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_visitors.py

      - name: Run new contributors scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/new_contributors.py

      - name: Run new issue creators scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/new_issue_creators.py

      - name: Run repository activity scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_activity.py

      - name: Run repository events scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_events.py

      - name: Run repository releases scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_releases.py

      - name: Run open issues scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/open_issues.py

      - name: Run open PRs scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/open_prs.py

      - name: Run PRs last year scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/prs_last_year.py

      # --- Slack scrapers ---
      - name: Run Slack workspace stats scraper
        env:
          SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/slack_workspace_stats.py

      - name: Run Slack channel stats scraper
        env:
          SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/slack_channel_stats.py

      # --- Matomo scrapers ---
      - name: Run Matomo analytics scraper
        env:
          MATOMO_KEY: ${{ secrets.MATOMO_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/matomo_analytics.py

      - name: Run Matomo visitor map scraper
        env:
          MATOMO_KEY: ${{ secrets.MATOMO_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/matomo_visitor_map.py

      - name: Run Matomo top pages scraper
        env:
          MATOMO_KEY: ${{ secrets.MATOMO_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/matomo_top_pages.py

      # --- Other scrapers ---
      - name: Run community calendar scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/community_calendar.py

      - name: Run Kapa activity scraper
        env:
          KAPA_KEY: ${{ secrets.KAPA_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/kapa_activity.py