Integration tests with multiple configuration files #49

Merged · 11 commits · Mar 4, 2025
103 changes: 75 additions & 28 deletions .github/workflows/test.yml
@@ -1,4 +1,4 @@
name: Scapyd-k8s CI
name: Scrapyd-k8s CI
on:
push:
branches:
@@ -46,15 +46,20 @@ jobs:
- name: Pull example spider
run: docker pull ghcr.io/q-m/scrapyd-k8s-spider-example

- name: Run scrapyd-k8s
run: |
cp scrapyd_k8s.sample-docker.conf scrapyd_k8s.conf
python -m scrapyd_k8s &
while ! nc -q 1 localhost 6800 </dev/null; do sleep 1; done
curl http://localhost:6800/daemonstatus.json

- name: Run tests
run: pytest -vv --color=yes scrapyd_k8s/tests/integration/
run: |
for test in scrapyd_k8s/tests/integration/test_*.py; do
echo; echo "# $test"
# run scrapyd-k8s with test-specific configuration file
cfg=`echo "$test" | sed 's/\.py$/.conf/'`
python -m scrapyd_k8s -c scrapyd_k8s.sample-docker.conf -c "$cfg" &
# wait for scrapyd-k8s to become ready
curl -s --retry 30 --retry-delay 1 --retry-all-errors http://localhost:6800/daemonstatus.json
# run test
pytest -vv --color=yes "$test"
# stop scrapyd-k8s again
kill %1; wait %1 || true
done

test-manifest:
container:
@@ -100,17 +105,54 @@ jobs:
sed -i 's/\(image:\s*\)ghcr\.io\/q-m\/scrapyd-k8s:/\1test:/' kubernetes.yaml
sed -i 's/\(type:\s*\)ClusterIP/\1NodePort/' kubernetes.yaml
kubectl create -f kubernetes.yaml
# and wait for scrapyd-k8s to become ready
kubectl wait --for=condition=Available deploy/scrapyd-k8s --timeout=60s
curl --retry 10 --retry-delay 2 --retry-all-errors `minikube service scrapyd-k8s --url`/daemonstatus.json
# don't start deployment just yet, as we want to run it with test-specific configuration
kubectl scale --replicas=0 deploy/scrapyd-k8s
# add second configuration file for test-specific configuration
kubectl patch deploy scrapyd-k8s --type=json -p='[
{
"op": "add",
"path": "/spec/template/spec/volumes/-",
"value": { "configMap": { "name": "scrapyd-k8s-testcfg" }, "name": "scrapyd-k8s-testcfg" }
},
{
"op": "add",
"path": "/spec/template/spec/containers/0/volumeMounts/-",
"value": { "name": "scrapyd-k8s-testcfg", "mountPath": "/opt/app/scrapyd_k8s.test.conf", "readOnly": true, "subPath": "scrapyd_k8s.test.conf" }
},
{
"op": "replace",
"path": "/spec/template/spec/containers/0/command",
"value": ["python3", "-m", "scrapyd_k8s", "-c", "scrapyd_k8s.conf", "-c", "scrapyd_k8s.test.conf"]
}
]'

- name: Run tests
run: |
TEST_WITH_K8S=1 \
TEST_BASE_URL=`minikube service scrapyd-k8s --url` \
TEST_MAX_WAIT=60 \
TEST_AVAILABLE_VERSIONS=latest,`skopeo list-tags docker://ghcr.io/q-m/scrapyd-k8s-spider-example | jq -r '.Tags | map(select(. != "latest" and (startswith("sha-") | not))) | join(",")'` \
pytest -vv --color=yes scrapyd_k8s/tests/integration/
# setup for in-cluster k8s
# for each integration test file
for test in scrapyd_k8s/tests/integration/test_*.py; do
echo; echo "# $test"
# run scrapyd-k8s with test-specific configuration file
cfg=`echo "$test" | sed 's/\.py$/.conf/'`
kubectl create cm scrapyd-k8s-testcfg --from-file=scrapyd_k8s.test.conf="$cfg"
kubectl scale --replicas=1 deploy/scrapyd-k8s
# wait for scrapyd-k8s to become ready
kubectl wait --for=condition=Available deploy/scrapyd-k8s --timeout=60s
curl -s --retry 10 --retry-delay 2 --retry-all-errors `minikube service scrapyd-k8s --url`/daemonstatus.json
# run test
TEST_WITH_K8S=1 \
TEST_BASE_URL=`minikube service scrapyd-k8s --url` \
TEST_MAX_WAIT=60 \
TEST_AVAILABLE_VERSIONS=latest,`skopeo list-tags docker://ghcr.io/q-m/scrapyd-k8s-spider-example | jq -r '.Tags | map(select(. != "latest" and (startswith("sha-") | not))) | join(",")'` \
pytest -vv --color=yes "$test"
# delete all jobs to start with a clean slate next time
kubectl delete job --all
# stop scrapyd-k8s and delete test-specific configmap
kubectl scale --replicas=0 deploy/scrapyd-k8s
kubectl wait --for=delete pod -l app.kubernetes.io/name=scrapyd-k8s --timeout=90s
kubectl delete cm scrapyd-k8s-testcfg --wait
done

test-k8s:
container:
runs-on: ubuntu-latest
@@ -139,16 +181,21 @@
# already pull image so we don't have to wait for it later
minikube image pull ghcr.io/q-m/scrapyd-k8s-spider-example:latest

- name: Run scrapyd-k8s
run: |
cp scrapyd_k8s.sample-k8s.conf scrapyd_k8s.conf
python -m scrapyd_k8s &
while ! nc -q 1 localhost 6800 </dev/null; do sleep 1; done
curl http://localhost:6800/daemonstatus.json

- name: Run tests
run: |
TEST_WITH_K8S=1 \
TEST_MAX_WAIT=60 \
TEST_AVAILABLE_VERSIONS=latest,`skopeo list-tags docker://ghcr.io/q-m/scrapyd-k8s-spider-example | jq -r '.Tags | map(select(. != "latest" and (startswith("sha-") | not))) | join(",")'` \
pytest -vv --color=yes scrapyd_k8s/tests/integration/
for test in scrapyd_k8s/tests/integration/test_*.py; do
echo "# $test"
# run scrapyd-k8s with test-specific configuration file
cfg=`echo "$test" | sed 's/\.py$/.conf/'`
[ -e "$cfg" ] || cfg=/dev/null
python -m scrapyd_k8s -c scrapyd_k8s.sample-k8s.conf -c "$cfg" &
# wait for scrapyd-k8s to become ready
curl -s --retry 30 --retry-delay 1 --retry-all-errors http://localhost:6800/daemonstatus.json
# run test
TEST_WITH_K8S=1 \
TEST_MAX_WAIT=60 \
TEST_AVAILABLE_VERSIONS=latest,`skopeo list-tags docker://ghcr.io/q-m/scrapyd-k8s-spider-example | jq -r '.Tags | map(select(. != "latest" and (startswith("sha-") | not))) | join(",")'` \
pytest -vv --color=yes "$test"
# stop scrapyd-k8s again
kill %1; wait %1 || true
done
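
The loop's structure is the same in all three jobs: derive a per-test configuration file from the test filename, start scrapyd-k8s with the sample config plus that file, poll daemonstatus.json until ready, run the one test file, then stop the server so the next test starts clean. For illustration only, a hypothetical Python rendering of the same orchestration (the workflow itself uses the shell version above):

import glob
import subprocess
import time

import requests

for test in sorted(glob.glob('scrapyd_k8s/tests/integration/test_*.py')):
    print('#', test)
    # run scrapyd-k8s with the test-specific configuration file
    cfg = test[:-3] + '.conf'
    proc = subprocess.Popen(['python', '-m', 'scrapyd_k8s',
                             '-c', 'scrapyd_k8s.sample-docker.conf', '-c', cfg])
    # wait for scrapyd-k8s to become ready, as curl --retry does above
    for _ in range(30):
        try:
            requests.get('http://localhost:6800/daemonstatus.json')
            break
        except requests.ConnectionError:
            time.sleep(1)
    # run the single test file, then stop scrapyd-k8s again
    subprocess.run(['pytest', '-vv', '--color=yes', test])
    proc.terminate()
    proc.wait()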
17 changes: 16 additions & 1 deletion scrapyd_k8s/__main__.py
@@ -1,4 +1,19 @@
from .api import run
import argparse

from .api import config, run

def argparser():
parser = argparse.ArgumentParser(
prog='scrapyd-k8s',
description='Deploying and running spiders on container infrastructure, with the scrapyd protocol.'
)
parser.add_argument('-c', '--config', action='append', default=['scrapyd_k8s.conf'],
help='Load configuration file (can be multiple)')
return parser

if __name__ == "__main__":
parser = argparser()
args = parser.parse_args()
config.read(args.config)

run()
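
A detail worth knowing here: with action='append' and a list default, argparse keeps the default entry and appends every -c value after it, so scrapyd_k8s.conf is always read first and later files override it. A minimal sketch of the resulting precedence (file names taken from the CI setup above):

parser = argparser()
args = parser.parse_args(['-c', 'scrapyd_k8s.sample-docker.conf',
                          '-c', 'scrapyd_k8s/tests/integration/test_auth.conf'])
print(args.config)
# ['scrapyd_k8s.conf', 'scrapyd_k8s.sample-docker.conf',
#  'scrapyd_k8s/tests/integration/test_auth.conf']

Because ConfigParser.read() silently skips files that do not exist, the leading default entry is harmless when no scrapyd_k8s.conf is present, as in CI.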
34 changes: 17 additions & 17 deletions scrapyd_k8s/api.py
@@ -5,18 +5,10 @@
from flask_basicauth import BasicAuth
from natsort import natsort_keygen, ns

# setup logging before anything else
from .config import Config
from .logging import setup_logging
config = Config()
log_level = config.scrapyd().get('log_level', 'INFO')
setup_logging(log_level)

app = Flask(__name__)
repository = (config.repository_cls())(config)
launcher = (config.launcher_cls())(config)
scrapyd_config = config.scrapyd()


@app.get("/")
def home():
@@ -28,9 +20,9 @@ def healthz():

@app.get("/daemonstatus.json")
def api_daemonstatus():
jobs = list(launcher.listjobs())
jobs = list(config.launcher().listjobs())
return {
"node_name": config.scrapyd().get("node_name", launcher.get_node_name()),
"node_name": config.scrapyd().get("node_name", config.launcher().get_node_name()),
"status": "ok",
"pending": len([j for j in jobs if j['state'] == 'pending']),
"running": len([j for j in jobs if j['state'] == 'running']),
@@ -55,7 +47,7 @@ def api_schedule():
# any other parameter is passed as spider argument
args = { k: v for k, v in request.form.items() if k not in ('project', 'spider', 'setting', 'jobid', 'priority', '_version') }
env_config, env_secret = project.env_config(), project.env_secret()
jobid = launcher.schedule(project, _version, spider, job_id, settings, args)
jobid = config.launcher().schedule(project, _version, spider, job_id, settings, args)
return { 'status': 'ok', 'jobid': job_id }

@app.post("/cancel.json")
@@ -67,7 +59,7 @@ def api_cancel():
if not job_id:
return error('job missing in form parameters', status=400)
signal = request.form.get('signal', 'TERM') # TODO validate signal?
prevstate = launcher.cancel(project_id, job_id, signal)
prevstate = config.launcher().cancel(project_id, job_id, signal)
if not prevstate:
return error('job not found', status=404)
return { 'status': 'ok', 'prevstate': prevstate }
@@ -84,7 +76,7 @@ def api_listversions():
project = config.project(project_id)
if not project:
return error('project not found in configuration', status=404)
tags = repository.listtags(project.repository())
tags = config.repository().listtags(project.repository())
tags = [t for t in tags if not t.startswith('sha-')]
tags.sort(key=natsort_keygen(alg=ns.NUMAFTER))
return { 'status': 'ok', 'versions': tags }
@@ -98,15 +90,15 @@ def api_listspiders():
if not project:
return error('project not found in configuration', status=404)
_version = request.args.get('_version', 'latest') # TODO allow customizing latest tag
spiders = repository.listspiders(project.repository(), project_id, _version)
spiders = config.repository().listspiders(project.repository(), project_id, _version)
if spiders is None:
return error('project version not found in repository', status=404)
return { 'status': 'ok', 'spiders': spiders }

@app.get("/listjobs.json")
def api_listjobs():
project_id = request.args.get('project')
jobs = launcher.listjobs(project_id)
jobs = config.launcher().listjobs(project_id)
pending = [j for j in jobs if j['state'] == 'pending']
running = [j for j in jobs if j['state'] == 'running']
finished = [j for j in jobs if j['state'] == 'finished']
@@ -133,21 +125,29 @@ def api_delproject():
def after_request(response: Response):
if response.is_json:
data = response.json
data["node_name"] = config.scrapyd().get("node_name", launcher.get_node_name())
data["node_name"] = config.scrapyd().get("node_name", config.launcher().get_node_name())
response.data = jsonify(data).data
return response

def error(msg, status=200):
return { 'status': 'error', 'message': msg }, status

def enable_authentication(app, config_username, config_password):
basic_auth = BasicAuth(app)

# workaround for https://github.com/jpvanhal/flask-basicauth/issues/11
class BasicAuthExceptHealthz(BasicAuth):
def authenticate(self):
return request.path == "/healthz" or super().authenticate()

basic_auth = BasicAuthExceptHealthz(app)
app.config["BASIC_AUTH_USERNAME"] = config_username
app.config["BASIC_AUTH_PASSWORD"] = config_password
app.config["BASIC_AUTH_FORCE"] = True
return basic_auth

def run():
scrapyd_config = config.scrapyd()

# where to listen
host = scrapyd_config.get('bind_address', '127.0.0.1')
port = scrapyd_config.get('http_port', '6800')
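
With BASIC_AUTH_FORCE set, every endpoint rejects anonymous requests; a minimal usage sketch against a locally running instance, assuming the credentials from test_auth.conf (foo/secret):

import requests

BASE_URL = 'http://localhost:6800'  # assumes scrapyd-k8s was started with test_auth.conf

print(requests.get(BASE_URL + '/daemonstatus.json').status_code)  # 401 without credentials
r = requests.get(BASE_URL + '/daemonstatus.json', auth=('foo', 'secret'))
print(r.status_code, r.json()['status'])  # 200 ok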
29 changes: 25 additions & 4 deletions scrapyd_k8s/config.py
@@ -2,21 +2,42 @@
from configparser import ConfigParser
from importlib import import_module

from .logging import setup_logging

class Config:
def __init__(self, file='scrapyd_k8s.conf'):
def __init__(self):
self._config = ConfigParser(empty_lines_in_values=False)
self._config.read(file)
self._projects = []
self._launcher = None
self._repository = None

def read(self, files=['scrapyd_k8s.conf']):
self._config.read(files)
self._update()

def _update(self):
self._projects = [s[8:] for s in self._config.sections() if re.match(r'^project\.[^\.]+$', s)]
setup_logging(self.scrapyd().get('log_level', 'INFO'))

def scrapyd(self):
return self._config['scrapyd']

def repository_cls(self):
def repository(self):
if not self._repository:
self._repository = (self._repository_cls())(self)
return self._repository

def _repository_cls(self):
repo = self._config['scrapyd'].get('repository', 'scrapyd_k8s.repository.Remote')
pkg, cls = repo.rsplit('.', 1)
return getattr(import_module(pkg), cls)

def launcher_cls(self):
def launcher(self):
if not self._launcher:
self._launcher = (self._launcher_cls())(self)
return self._launcher

def _launcher_cls(self):
repo = self._config['scrapyd'].get('launcher', 'scrapyd_k8s.launcher.K8s')
pkg, cls = repo.rsplit('.', 1)
return getattr(import_module(pkg), cls)
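
The multiple-file behaviour leans entirely on ConfigParser: read() accepts a list of files, silently skips the ones that do not exist, and lets later files override earlier ones key by key. A standalone sketch of those semantics (not scrapyd-k8s code; file names are made up):

from configparser import ConfigParser
from pathlib import Path

Path('base.conf').write_text('[scrapyd]\nbind_address = 127.0.0.1\nhttp_port = 6800\n')
Path('override.conf').write_text('[scrapyd]\nhttp_port = 7000\n')

cfg = ConfigParser(empty_lines_in_values=False)
loaded = cfg.read(['base.conf', 'override.conf', 'does-not-exist.conf'])
print(loaded)                          # ['base.conf', 'override.conf']
print(cfg['scrapyd']['bind_address'])  # 127.0.0.1, kept from base.conf
print(cfg['scrapyd']['http_port'])     # 7000, overridden by override.conf

The new launcher() and repository() accessors are lazy for the same reason: instances must only be created after all configuration files have been read, not at import time.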
2 changes: 2 additions & 0 deletions scrapyd_k8s/tests/integration/test_api.conf
@@ -0,0 +1,2 @@
# additional scrapyd-k8s configuration for test_api.py
# (empty, i.e. there is no additional configuration for this test)
4 changes: 4 additions & 0 deletions scrapyd_k8s/tests/integration/test_auth.conf
@@ -0,0 +1,4 @@
# additional scrapyd-k8s configuration for test_auth.py
[scrapyd]
username = foo
password = secret
50 changes: 50 additions & 0 deletions scrapyd_k8s/tests/integration/test_auth.py
@@ -0,0 +1,50 @@
#!/usr/bin/env python3
import os
import requests

BASE_URL = os.getenv('TEST_BASE_URL', 'http://localhost:6800')

def test_root_no_auth():
response = requests.get(BASE_URL)
assert response.status_code == 401
assert 'scrapyd-k8s' not in response.text

def test_root_incorrect_auth():
session = requests.Session()
session.auth = ('nonexistent', 'incorrect')
response = session.get(BASE_URL)
assert response.status_code == 401
assert 'scrapyd-k8s' not in response.text

def test_root_correct_auth():
session = requests.Session()
session.auth = ('foo', 'secret') # needs to match test_auth.conf
response = session.get(BASE_URL)
assert response.status_code == 200
assert response.headers['Content-Type'] == 'text/html; charset=utf-8'
assert 'scrapyd-k8s' in response.text
assert '</html>' in response.text

# TODO this currently fails (!)
#def test_healthz_ok():
# response = requests.get(BASE_URL + '/healthz')
# assert response.status_code == 200

def test_daemonstatus_no_auth():
response = requests.get(BASE_URL + '/daemonstatus.json')
assert response.status_code == 401

def test_daemonstatus_incorrect_auth():
session = requests.Session()
session.auth = ('nonexistent', 'incorrect')
response = session.get(BASE_URL + '/daemonstatus.json')
assert response.status_code == 401
assert 'ok' not in response.text

def test_daemonstatus_correct_auth():
session = requests.Session()
session.auth = ('foo', 'secret') # needs to match test_auth.conf
response = session.get(BASE_URL + '/daemonstatus.json')
assert response.status_code == 200
assert response.headers['Content-Type'] == 'application/json'
assert response.json()['status'] == 'ok'