# Stats Scraper #151
# NOTE: GitHub's file viewer flagged this file as containing hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than it appears. Review it in an editor that reveals hidden Unicode characters.
# Workflow display name shown in the Actions tab.
name: Stats Scraper

# Triggers: a daily schedule, manual dispatch, and pushes / pull requests
# that target the main branch.
on:
  schedule:
    # Run daily at midnight UTC
    - cron: '0 0 * * *'
  # Allow manual triggering
  workflow_dispatch:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
# Two jobs are defined and neither declares `needs`:
#   lint         - formatting, import-order and flake8 checks
#   scrape_stats - runs every scraper script under scripts/ in sequence
jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: '3.10'
          cache: 'pip'

      - name: Install linting dependencies
        run: |
          python -m pip install --upgrade pip
          pip install \
            flake8 flake8-unused-arguments flake8-import-order \
            flake8-builtins flake8-docstrings black isort

      - name: Check formatting with black
        run: black --check .

      - name: Check import order with isort
        run: isort --check --diff .

      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # check for unused imports and variables
          flake8 . --count --select=F401,F841 --show-source --statistics
          # exit-zero treats all errors as warnings
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=100 --statistics

  scrape_stats:
    runs-on: ubuntu-latest
    # Skip pull_request runs: GitHub does not pass repository secrets to
    # workflows triggered from forked pull requests, so every scraper below
    # would run without its credentials. Scheduled, manual and push runs are
    # unaffected.
    # NOTE(review): this also skips same-repo PRs, which previously ran the
    # scrapers with real credentials - confirm that is the desired behaviour.
    if: github.event_name != 'pull_request'
    # Non-secret configuration shared by every step of this job. A step-level
    # `env` applies only to the step that declares it, so shared values are
    # declared once here instead of being repeated on each scraper step.
    # Secrets are deliberately NOT hoisted: each step below receives only the
    # credentials it needs, and the dependency-install step receives none.
    # NOTE(review): GitHub appears to disallow configuration variable names
    # that start with `GITHUB_`, so `vars.GITHUB_OWNER` / `vars.GITHUB_REPO`
    # may never be settable and the fallbacks would always apply - confirm.
    env:
      # GitHub configuration
      GITHUB_OWNER: ${{ vars.GITHUB_OWNER || 'apache' }}
      GITHUB_REPO: ${{ vars.GITHUB_REPO || 'superset' }}
      # Database configuration
      DATABASE_TYPE: ${{ vars.DATABASE_TYPE || 'motherduck' }}
      MOTHERDUCK_DATABASE: ${{ vars.MOTHERDUCK_DATABASE || 'superset_stats' }}
      # Matomo configuration
      MATOMO_SITE_ID: ${{ vars.MATOMO_SITE_ID || '22' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Create config.yaml from example if it doesn't exist
      - name: Create config file
        run: |
          if [ ! -f config.yaml ]; then
            cp config.yaml.example config.yaml
          fi

      # --- GitHub scrapers: GitHub token + database token ---
      - name: Run star ranking scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/star_ranking.py

      - name: Run repository visitors scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_visitors.py

      - name: Run new contributors scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/new_contributors.py

      - name: Run new issue creators scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/new_issue_creators.py

      - name: Run repository activity scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_activity.py

      - name: Run repository events scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_events.py

      - name: Run repository releases scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/repo_releases.py

      - name: Run open issues scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/open_issues.py

      - name: Run open PRs scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/open_prs.py

      - name: Run PRs last year scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/prs_last_year.py

      # --- Slack scrapers: Slack token + database token ---
      - name: Run Slack workspace stats scraper
        env:
          SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/slack_workspace_stats.py

      - name: Run Slack channel stats scraper
        env:
          SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/slack_channel_stats.py

      # --- Matomo scrapers: Matomo key + database token ---
      - name: Run Matomo analytics scraper
        env:
          MATOMO_KEY: ${{ secrets.MATOMO_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/matomo_analytics.py

      - name: Run Matomo visitor map scraper
        env:
          MATOMO_KEY: ${{ secrets.MATOMO_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/matomo_visitor_map.py

      - name: Run Matomo top pages scraper
        env:
          MATOMO_KEY: ${{ secrets.MATOMO_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/matomo_top_pages.py

      # --- Remaining scrapers ---
      - name: Run community calendar scraper
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/community_calendar.py

      - name: Run Kapa activity scraper
        env:
          KAPA_KEY: ${{ secrets.KAPA_KEY }}
          MOTHERDUCK_TOKEN: ${{ secrets.MOTHERDUCK_TOKEN }}
        run: python scripts/kapa_activity.py