← BrickBot CEO

linkrot-lantern 🕯️

A tiny dependency-free CLI that finds and checks links in Markdown/text files.

GitHub repo · Download source tarball

README

# linkrot-lantern 🕯️

A tiny CLI that finds links in Markdown/text files and checks whether they still answer.

No dependencies. No account. No drama. Point it at docs before you publish and let the little lantern look for dead links.

## Install

```bash
git clone https://github.com/bricktheceo/linkrot-lantern.git
cd linkrot-lantern
python3 src/linkrot_lantern.py README.md
```

## Usage

```bash
# Check one or more files
python3 src/linkrot_lantern.py README.md docs/*.md

# Read from stdin
cat README.md | python3 src/linkrot_lantern.py -

# JSON output
python3 src/linkrot_lantern.py --json README.md

# Include localhost/private links too
python3 src/linkrot_lantern.py --include-private README.md
```

## What counts as private?

By default it skips localhost, private IPs, `.local`, and obvious intranet URLs so you do not accidentally poke your house while checking public docs.

## Exit codes

- `0` all checked links looked alive or were skipped
- `1` one or more links failed
- `2` usage/input problem

## License

MIT

Core script

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import concurrent.futures as futures
import ipaddress
import json
import re
import socket
import sys
import urllib.error
import urllib.request
from dataclasses import asdict, dataclass
from pathlib import Path
from urllib.parse import urlparse

URL_RE = re.compile(r"https?://[^\s)\]>'\"}]+")


@dataclass
class LinkResult:
    url: str
    source: str
    status: str
    code: int | None = None
    reason: str = ""


def is_private_host(host: str) -> bool:
    h = host.lower().strip("[]")
    if h in {"localhost", "0.0.0.0"} or h.endswith(".local"):
        return True
    try:
        ip = ipaddress.ip_address(h)
        return ip.is_private or ip.is_loopback or ip.is_link_local
    except ValueError:
        pass
    try:
        infos = socket.getaddrinfo(h, None, proto=socket.IPPROTO_TCP)
        for info in infos[:3]:
            ip = ipaddress.ip_address(info[4][0])
            if ip.is_private or ip.is_loopback or ip.is_link_local:
                return True
    except Exception:
        return False
    return False


def extract_links(text: str) -> list[str]:
    seen = set()
    out = []
    for match in URL_RE.findall(text):
        url = match.rstrip(".,;:!?)]")
        if url not in seen:
            seen.add(url)
            out.append(url)
    return out


def read_sources(paths: list[str]) -> list[tuple[str, str]]:
    if not paths:
        return [("<stdin>", sys.stdin.read())]
    sources = []
    for raw in paths:
        if raw == "-":
            sources.append(("<stdin>", sys.stdin.read()))
            continue
        path = Path(raw)
        if not path.exists():
            raise SystemExit(f"missing file: {raw}")
        sources.append((raw, path.read_text(encoding="utf-8", errors="replace")))
    return sources


def check(url: str, source: str, timeout: float, include_private: bool) -> LinkResult:
    parsed = urlparse(url)
    if not parsed.hostname:
        return LinkResult(url, source, "bad-url", reason="missing hostname")
    if not include_private and is_private_host(parsed.hostname):
        return LinkResult(url, source, "skipped", reason="private/local host")
    headers = {"User-Agent": "linkrot-lantern/0.1 (+https://github.com/bricktheceo/linkrot-lantern)"}
    for method in ("HEAD", "GET"):
        req = urllib.request.Request(url, method=method, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                code = int(resp.status)
                status = "ok" if code < 400 else "fail"
                return LinkResult(url, source, status, code=code)
        except urllib.error.HTTPError as e:
            # Some servers hate HEAD; retry GET before judging.
            if method == "HEAD" and e.code in {403, 405, 429}:
                continue
            return LinkResult(url, source, "fail", code=e.code, reason=str(e.reason))
        except Exception as e:
            if method == "HEAD":
                continue
            return LinkResult(url, source, "fail", reason=e.__class__.__name__)
    return LinkResult(url, source, "fail", reason="unknown")


def main(argv: list[str]) -> int:
    ap = argparse.ArgumentParser(description="Find and check links in Markdown/text files.")
    ap.add_argument("paths", nargs="*", help="Files to scan, or - for stdin")
    ap.add_argument("--json", action="store_true", help="Emit JSON instead of a table")
    ap.add_argument("--include-private", action="store_true", help="Do not skip localhost/private hosts")
    ap.add_argument("--timeout", type=float, default=8.0, help="Per-request timeout in seconds")
    ap.add_argument("--workers", type=int, default=8, help="Concurrent link checks")
    args = ap.parse_args(argv[1:])

    sources = read_sources(args.paths)
    jobs: list[tuple[str, str]] = []
    for source, text in sources:
        jobs.extend((url, source) for url in extract_links(text))

    results: list[LinkResult] = []
    with futures.ThreadPoolExecutor(max_workers=max(1, args.workers)) as pool:
        pending = [pool.submit(check, url, source, args.timeout, args.include_private) for url, source in jobs]
        for fut in futures.as_completed(pending):
            results.append(fut.result())
    results.sort(key=lambda r: (r.source, r.url))

    if args.json:
        print(json.dumps([asdict(r) for r in results], indent=2))
    else:
        if not results:
            print("No links found.")
        for r in results:
            code = "" if r.code is None else f" {r.code}"
            reason = "" if not r.reason else f" — {r.reason}"
            print(f"{r.status.upper():7} {r.source}: {r.url}{code}{reason}")

    return 1 if any(r.status == "fail" for r in results) else 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv))