diff --git a/scripts/check-broken-links-md.py b/scripts/check-broken-links-md.py index 37ea8165..7eb47a9b 100644 --- a/scripts/check-broken-links-md.py +++ b/scripts/check-broken-links-md.py @@ -1,65 +1,100 @@ -"""Check if ther eis any broken links.""" - -from __future__ import annotations - -import os import subprocess +import sys +import re +import json +from pathlib import Path +from typing import List, Tuple -# List of exception URLs -exception_urls = [ - "https://www.linkedin.com/", -] +# ==== BASIC SETTINGS ==== +IGNORE_URLS = ["https://www.linkedin.com/"] # URLs to skip +RETRY_STATUS = 429 # Too many requests +SCAN_FOLDER = "pages" # default folder to check +HTTP_MODE = "get" # request type -def process_log() -> None: - """Run the command and capture the output.""" - log_err = "" - exitcode = 0 +# ==== HELPERS ==== +def load_whitelist(path: str) -> List[str]: + """Load extra ignored links from a JSON file (if available).""" + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + print("[ii] Using default ignore list.") + return IGNORE_URLS + +def run_checker(folder: str, method: str) -> Tuple[int, str]: + """Run linkcheckmd and capture what it says.""" try: - subprocess.run( - ["python", "-m", "linkcheckmd", "-r", "-v", "-m", "get", "pages"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + task = subprocess.run( + ["python", "-m", "linkcheckmd", "-r", "-v", "-m", method, folder], + capture_output=True, text=True, check=True, ) + return task.returncode, task.stdout except subprocess.CalledProcessError as e: - exitcode = e.returncode - # for some reason they were swapped - log_err = e.stdout + return e.returncode, e.stdout + e.stderr - if exitcode == 0: - print("[II] All links are ok.") - return - flagged_errors = [] - for line in log_err.splitlines(): - line = line.strip() # noqa: PLW2901 - # Check if the line starts with '(' - if not line.startswith("("): - continue +def extract_bad_links(log: str) -> List[Tuple[str, str, int]]: + """ + Pull out (file, url, status) from linkcheckmd logs. + """ + regex = re.compile(r"\(([^)]+)\)\s+(https?://[^\s]+)\s+\[status:(\d+)\]") + found = regex.findall(log) + return [(f, u, int(s)) for f, u, s in found] + - if line.endswith("429)"): - # Too Many Requests http error +def skip_allowed(links: List[Tuple[str, str, int]], ignore_list: List[str]): + """Remove whitelisted or rate-limited links.""" + final = [] + for file, url, code in links: + if any(skip in url for skip in ignore_list): continue - # Extract the URL using regex - for exception_url in exception_urls: - if exception_url not in line: - flagged_errors.append(line) - - # Print flagged errors - if not flagged_errors: - print("[II] All links are ok.") - print("No errors flagged. All URLs are in the exception list.") + if code == RETRY_STATUS: + continue + final.append((file, url, code)) + return final + + +def print_report(bad_links: List[Tuple[str, str, int]]): + """Display summary in a friendlier way.""" + if not bad_links: + print(" Everything looks good! No broken links 🎉") + return + + print(f"[!!] Found {len(bad_links)} broken link(s):\n") + for file, url, code in bad_links: + print(f"- In: {file}\n → {url}\n (Status: {code})\n") + sys.exit(1) + + +# MAIN WORK +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Simple Markdown link checker.") + parser.add_argument("-d", "--dir", default=SCAN_FOLDER, help="Folder to check.") + parser.add_argument( + "-e", "--exceptions", default="", help="Path to JSON file of links to skip." + ) + parser.add_argument( + "-m", "--method", default=HTTP_MODE, help="HTTP method (default: get)." + ) + args = parser.parse_args() + + whitelist = load_whitelist(args.exceptions) + code, output = run_checker(args.dir, args.method) + + if code == 0: + print(" All clear! No broken links reported.") return - print("Errors flagged for the following URLs:") - for line in flagged_errors: - print(line) - os._exit(1) + bad_links = extract_bad_links(output) + final_list = skip_allowed(bad_links, whitelist) + print_report(final_list) -# Run the script if __name__ == "__main__": - process_log() + main()