From b8ac703bf060aea532130571520cb65b6e887c56 Mon Sep 17 00:00:00 2001 From: Simon Bennetts Date: Thu, 7 May 2026 17:27:42 +0100 Subject: [PATCH] Compare spiders script A script which makes it easy to compare the 2 modern spiders. I'm expecting this to be run locally rather than in CI/CD as the results will be too variable. I haven't done a detailed check of the comparison code - I'm expecting us to use this as an indication as to what we need to focus on in the client spider. And when we start looking into things in detail then if the comparison turns out to have bugs in it then that would be the time to fix them :grin: Signed-off-by: Simon Bennetts --- scans/spider-cmp/README.md | 97 +++++++++ scans/spider-cmp/ajax.yaml | 34 ++++ scans/spider-cmp/client.yaml | 33 +++ scans/spider-cmp/compare_spiders.py | 299 ++++++++++++++++++++++++++++ scans/spider-cmp/std-sites.txt | 5 + 5 files changed, 468 insertions(+) create mode 100644 scans/spider-cmp/README.md create mode 100644 scans/spider-cmp/ajax.yaml create mode 100644 scans/spider-cmp/client.yaml create mode 100644 scans/spider-cmp/compare_spiders.py create mode 100644 scans/spider-cmp/std-sites.txt diff --git a/scans/spider-cmp/README.md b/scans/spider-cmp/README.md new file mode 100644 index 00000000..2b74c242 --- /dev/null +++ b/scans/spider-cmp/README.md @@ -0,0 +1,97 @@ +# Spider Comparison Script + +A script for comparing ZAP's two modern spiders — the **AJAX spider** and the **Client spider** — across one or more target sites. + +## Overview + +`compare_spiders.py` runs both spiders against each target, exports the Site Tree and Client Map from each run, then prints a side-by-side comparison table showing how many nodes each spider found, how many were unique to each, and how long each run took. 
+ +## Requirements + +- Python 3 +- [PyYAML](https://pypi.org/project/PyYAML/): `pip install pyyaml` +- A ZAP installation directory containing `zap.sh` + +## Usage + +```bash +# Single site +python3 compare_spiders.py --zap /path/to/zap --site https://example.com/ + +# Multiple sites from a file +python3 compare_spiders.py --zap /path/to/zap --sites std-sites.txt + +# Set ZAP path via environment variable instead +export ZAP_DIR=/path/to/zap +python3 compare_spiders.py --sites std-sites.txt + +# Custom output directory and port +python3 compare_spiders.py --zap /path/to/zap --sites std-sites.txt --out my-results --port 8090 +``` + +### Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--site URL` | — | Single target URL (mutually exclusive with `--sites`) | +| `--sites FILE` | — | File containing target URLs, one per line (lines starting with `#` are ignored) | +| `--zap DIR` | `$ZAP_DIR` | ZAP installation directory (must contain `zap.sh`) | +| `--port PORT` | `9090` | Port ZAP listens on | +| `--out DIR` | `results` | Root directory for scan output files | + +## Output + +Results are printed as a table after all sites have been scanned. The table ends with a `Summary` row (not shown below) giving the average of each time column and the total of each count column: + +``` +| Site | Atime | Ctime | StCom | StAo | StCo | CmCom | CmAo | CmCo | ++-----------------------------------+-------+-------+-------+------+------+-------+------+------+ +| https://brokencrystals.com/ | 2:24 | 1:39 | 103 | 2 | 3 | 104 | 8 | 12 | +| https://ginandjuice.shop/ | 0:59 | 0:36 | 92 | 0 | 20 | 109 | 0 | 31 | +| https://juice-shop.herokuapp.com/ | 10:02 | 12:23 | 603 | 10 | 0 | 500 | 6 | 0 | +``` + +### Column definitions + +| Column | Description | +|--------|-------------| +| `Atime` | Time taken by the AJAX spider (mm:ss) | +| `Ctime` | Time taken by the Client spider (mm:ss) | +| `StCom` | Site Tree nodes found by **both** spiders | +| `StAo` | Site Tree nodes found by the **AJAX spider only** | +| `StCo` | Site Tree nodes found by the **Client spider only** | +| `CmCom` | 
Client Map nodes found by **both** spiders | +| `CmAo` | Client Map nodes found by the **AJAX spider only** | +| `CmCo` | Client Map nodes found by the **Client spider only** | + +## Output files + +For each site scanned, the following files are written under `--out`: + +``` +results/ + {site}/ + ajax-site.yaml ← Site Tree export from the AJAX spider run + ajax-map.yaml ← Client Map export from the AJAX spider run + client-site.yaml ← Site Tree export from the Client spider run + client-map.yaml ← Client Map export from the Client spider run + ajax-data/ ← ZAP home directory for the AJAX spider run + client-data/ ← ZAP home directory for the Client spider run +``` + +## Configuration + +The spiders are configured by `ajax.yaml` and `client.yaml` in this directory. Both run 6 browser instances with a 5-minute maximum duration. Edit these files to adjust spider parameters such as `numberOfBrowsers` or `maxDuration`. + +## Sites file format + +`std-sites.txt` contains a set of public test targets. 
Lines beginning with `#` are treated as comments: + +``` +# Public test sites +https://brokencrystals.com/ +https://ginandjuice.shop/ +https://juice-shop.herokuapp.com/ +https://public-firing-range.appspot.com/ +https://security-crawl-maze.app/ +``` diff --git a/scans/spider-cmp/ajax.yaml b/scans/spider-cmp/ajax.yaml new file mode 100644 index 00000000..fe32e848 --- /dev/null +++ b/scans/spider-cmp/ajax.yaml @@ -0,0 +1,34 @@ +--- +env: + contexts: + - name: "Target" + urls: + - "${target}" + includePaths: [] + excludePaths: [] + parameters: + failOnError: true + failOnWarning: false + progressToStdout: true + vars: {} +jobs: +- type: spiderAjax + name: "spiderAjax" + parameters: + numberOfBrowsers: 6 + maxDuration: 5 + enableExtensions: true +- type: passiveScan-wait + name: "passiveScan-wait" +- type: export + parameters: + source: sitestree + context: Target + type: yaml + fileName: ajax-site.yaml +- type: export + parameters: + source: clientmap + context: Target + type: yaml + fileName: ajax-map.yaml diff --git a/scans/spider-cmp/client.yaml b/scans/spider-cmp/client.yaml new file mode 100644 index 00000000..58576cd3 --- /dev/null +++ b/scans/spider-cmp/client.yaml @@ -0,0 +1,33 @@ +--- +env: + contexts: + - name: "Target" + urls: + - "${target}" + includePaths: [] + excludePaths: [] + parameters: + failOnError: true + failOnWarning: false + progressToStdout: true + vars: {} +jobs: +- type: spiderClient + name: "spiderClient" + parameters: + numberOfBrowsers: 6 + maxDuration: 5 +- type: passiveScan-wait + name: "passiveScan-wait" +- type: export + parameters: + source: sitestree + context: Target + type: yaml + fileName: client-site.yaml +- type: export + parameters: + source: clientmap + context: Target + type: yaml + fileName: client-map.yaml diff --git a/scans/spider-cmp/compare_spiders.py b/scans/spider-cmp/compare_spiders.py new file mode 100644 index 00000000..6d04780d --- /dev/null +++ b/scans/spider-cmp/compare_spiders.py @@ -0,0 +1,299 @@ 
#!/usr/bin/env python3
"""Compare ZAP AJAX spider and Client spider across one or more target sites.

Usage:
    compare_spiders.py --zap /path/to/zap --site https://example.com/
    compare_spiders.py --zap /path/to/zap --sites std-sites.txt

Requires: pyyaml (pip install pyyaml)
"""

import argparse
import os
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path

try:
    import yaml
except ImportError:
    # Deferred so that main() can report the missing dependency with a clear
    # message instead of a traceback; the pure helpers stay importable.
    yaml = None

if yaml is not None:
    # PyYAML treats bare '=' as the YAML 1.1 value type, which SafeLoader has
    # no constructor for. ZAP exports can produce this in POST data fields.
    yaml.SafeLoader.add_constructor(
        "tag:yaml.org,2002:value",
        lambda loader, node: loader.construct_scalar(node),
    )

HEADERS = ["Site", "Atime", "Ctime", "StCom", "StAo", "StCo", "CmCom", "CmAo", "CmCo"]
KEYS = ["site", "atime", "ctime", "st_com", "st_ao", "st_co", "cm_com", "cm_ao", "cm_co"]

# Export files written by the jobs in ajax.yaml and client.yaml.
EXPORT_FILES = (
    "ajax-site.yaml",
    "ajax-map.yaml",
    "client-site.yaml",
    "client-map.yaml",
)


def parse_args():
    """Parse the command line; exactly one of --site / --sites is required."""
    p = argparse.ArgumentParser(
        description="Compare ZAP AJAX spider vs Client spider across target sites"
    )
    sites_group = p.add_mutually_exclusive_group(required=True)
    sites_group.add_argument("--site", metavar="URL", help="Single target URL")
    sites_group.add_argument(
        "--sites", metavar="FILE", help="File with target URLs, one per line"
    )
    p.add_argument(
        "--zap",
        metavar="DIR",
        default=os.environ.get("ZAP_DIR"),
        help="ZAP installation directory containing zap.sh (or set ZAP_DIR env var)",
    )
    p.add_argument("--port", type=int, default=9090, help="ZAP port (default: 9090)")
    p.add_argument(
        "--out",
        metavar="DIR",
        default="results",
        help="Root output directory for scan results (default: results)",
    )
    return p.parse_args()


def load_sites(args):
    """Return the list of target URLs selected by the parsed arguments.

    With --site the single URL is returned. With --sites the file is read one
    URL per line; blank lines and lines whose first non-blank character is
    '#' are skipped.
    """
    if args.site:
        return [args.site.strip()]
    with open(args.sites, encoding="utf-8") as f:
        # Strip before testing for '#', so indented comments are skipped too.
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]


def url_to_dirname(url):
    """Convert a URL to a safe filesystem directory name.

    The scheme separator and slashes become '_', dots become '-', and any
    other character that is not a word character or '-' (port colons, query
    string punctuation, ...) becomes '_'. Falls back to "site" if nothing
    usable is left.
    """
    name = url.replace("://", "_").replace("/", "_").replace(".", "-")
    name = re.sub(r"[^\w\-]", "_", name).strip("_")
    return name or "site"


def make_run_config(template_path, output_dir):
    """
    Write a temp YAML config derived from template_path with each export
    job's fileName replaced by an absolute path under output_dir.
    Returns the temp file path; caller must delete it.
    """
    with open(template_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

    # 'jobs:' or 'parameters:' may be present but empty (None) in a template.
    for job in config.get("jobs") or []:
        if job.get("type") != "export":
            continue
        params = job.get("parameters") or {}
        if "fileName" in params:
            params["fileName"] = str(Path(output_dir) / params["fileName"])

    fd, tmp_path = tempfile.mkstemp(suffix=".yaml", prefix="zap_run_")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)
    return Path(tmp_path)


def run_zap(zap_sh, port, autorun_yaml, target, data_dir):
    """Run ZAP in cmd mode against target. Returns elapsed seconds.

    The target URL is passed to ZAP through the 'target' environment
    variable, which the templates reference as ${target}. A non-zero exit
    code is reported on stderr but does not raise.
    """
    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env["target"] = target
    cmd = [
        str(zap_sh),
        "-cmd",
        "-port", str(port),
        "-autorun", str(autorun_yaml),
        "-dir", str(data_dir),
    ]
    t0 = time.monotonic()
    result = subprocess.run(cmd, env=env)
    elapsed = time.monotonic() - t0
    if result.returncode != 0:
        print(f" WARNING: ZAP exited with code {result.returncode}", file=sys.stderr)
    return elapsed


def extract_nodes(data, prefix=""):
    """Recursively collect all hierarchical node paths from a ZAP YAML export.

    Each dict carrying a "node" key contributes "<prefix>/<node>" (or just
    the node name at the top level); its "children" are walked with that
    path as their prefix. Anything else contributes nothing.
    """
    nodes = set()
    if isinstance(data, list):
        for item in data:
            nodes.update(extract_nodes(item, prefix))
    elif isinstance(data, dict) and "node" in data:
        path = f"{prefix}/{data['node']}" if prefix else str(data["node"])
        nodes.add(path)
        # 'children' may be absent or explicitly null in the export.
        for child in data.get("children") or []:
            nodes.update(extract_nodes(child, path))
    return nodes


def load_nodes(path):
    """Load the node set from a ZAP YAML export. Returns empty set if missing."""
    path = Path(path)
    if not path.exists():
        return set()
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return extract_nodes(data) if data else set()


def compare_exports(ajax_path, client_path):
    """Return (common, ajax_only, client_only) node counts for two exports."""
    ajax_nodes = load_nodes(ajax_path)
    client_nodes = load_nodes(client_path)
    return (
        len(ajax_nodes & client_nodes),
        len(ajax_nodes - client_nodes),
        len(client_nodes - ajax_nodes),
    )


def fmt_time(seconds):
    """Format a duration in seconds as m:ss (fractions are truncated)."""
    m, s = divmod(int(seconds), 60)
    return f"{m}:{s:02d}"


def compute_col_widths(results):
    """Compute column widths wide enough for headers and all data values.

    Each width includes two characters of padding. With no rows the widths
    are derived from the headers alone.
    """
    widths = []
    for header, key in zip(HEADERS, KEYS):
        longest = max((len(str(r[key])) for r in results), default=0)
        widths.append(max(len(header), longest) + 2)
    return widths


def format_cell(value, width, align):
    """Return a table cell string of exactly `width` chars with 1-space padding.

    `align` is "left" or "right"; any other value centres the text.
    """
    inner = width - 2
    s = str(value)
    if align == "left":
        return f" {s:<{inner}} "
    if align == "right":
        return f" {s:>{inner}} "
    return f" {s:^{inner}} "


def build_summary_row(results):
    """Average the time columns and total the count columns across results.

    Expects raw rows (times in seconds). With no rows, the averages are
    reported as 0:00 and the totals as 0.
    """
    n = len(results)

    def average(key):
        return fmt_time(sum(r[key] for r in results) / n) if n else fmt_time(0)

    summary = {"site": "Summary", "atime": average("atime"), "ctime": average("ctime")}
    for key in KEYS[3:]:  # the six count columns
        summary[key] = sum(r[key] for r in results)
    return summary


def print_table(results):
    """Print one row per result followed by a separator and a Summary row."""
    if not results:
        return

    display_rows = [
        {**r, "atime": fmt_time(r["atime"]), "ctime": fmt_time(r["ctime"])}
        for r in results
    ]
    summary = build_summary_row(results)
    all_rows = display_rows + [summary]

    widths = compute_col_widths(all_rows)

    header_aligns = ["left"] + ["center"] * (len(HEADERS) - 1)
    header_cells = [
        format_cell(h, w, a) for h, w, a in zip(HEADERS, widths, header_aligns)
    ]
    sep = "+" + "+".join("-" * w for w in widths) + "+"

    print("|" + "|".join(header_cells) + "|")
    print(sep)

    data_aligns = ["left"] + ["right"] * (len(KEYS) - 1)
    for r in display_rows:
        cells = [
            format_cell(r[k], w, a)
            for k, w, a in zip(KEYS, widths, data_aligns)
        ]
        print("|" + "|".join(cells) + "|")

    print(sep)
    summary_cells = [
        format_cell(summary[k], w, a)
        for k, w, a in zip(KEYS, widths, data_aligns)
    ]
    print("|" + "|".join(summary_cells) + "|")


def _run_spider(label, template, zap_sh, port, site, site_dir, data_name):
    """Run one spider template against site and return the elapsed seconds."""
    print(f" Running {label} spider ...")
    cfg = make_run_config(template, site_dir)
    try:
        elapsed = run_zap(zap_sh, port, cfg, site, site_dir / data_name)
    finally:
        # The generated config is only needed while ZAP is running.
        cfg.unlink(missing_ok=True)
    print(f" Done in {fmt_time(elapsed)}")
    return elapsed


def main():
    """Scan every requested site with both spiders and print the comparison."""
    args = parse_args()

    if yaml is None:
        sys.exit("ERROR: PyYAML is required (pip install pyyaml)")

    if not args.zap:
        sys.exit("ERROR: specify --zap DIR or set ZAP_DIR environment variable")

    zap_sh = Path(args.zap) / "zap.sh"
    if not zap_sh.exists():
        sys.exit(f"ERROR: zap.sh not found at {zap_sh}")

    script_dir = Path(__file__).parent.resolve()
    ajax_tmpl = script_dir / "ajax.yaml"
    client_tmpl = script_dir / "client.yaml"

    for f, name in [(ajax_tmpl, "ajax.yaml"), (client_tmpl, "client.yaml")]:
        if not f.exists():
            sys.exit(f"ERROR: {name} not found at {f}")

    sites = load_sites(args)
    if not sites:
        sys.exit("ERROR: no target sites to scan")

    out_dir = Path(args.out).resolve()
    results = []

    for site in sites:
        print(f"\nProcessing: {site}")
        site_dir = out_dir / url_to_dirname(site)
        site_dir.mkdir(parents=True, exist_ok=True)

        # Remove exports left by an earlier run so that a failed scan cannot
        # be silently compared against stale data.
        for name in EXPORT_FILES:
            (site_dir / name).unlink(missing_ok=True)

        atime = _run_spider(
            "AJAX", ajax_tmpl, zap_sh, args.port, site, site_dir, "ajax-data"
        )
        ctime = _run_spider(
            "Client", client_tmpl, zap_sh, args.port, site, site_dir, "client-data"
        )

        missing = [name for name in EXPORT_FILES if not (site_dir / name).exists()]
        if missing:
            print(
                f" WARNING: export(s) not produced: {', '.join(missing)}",
                file=sys.stderr,
            )

        st_com, st_ao, st_co = compare_exports(
            site_dir / "ajax-site.yaml", site_dir / "client-site.yaml"
        )
        cm_com, cm_ao, cm_co = compare_exports(
            site_dir / "ajax-map.yaml", site_dir / "client-map.yaml"
        )

        results.append({
            "site": site,
            "atime": atime,
            "ctime": ctime,
            "st_com": st_com,
            "st_ao": st_ao,
            "st_co": st_co,
            "cm_com": cm_com,
            "cm_ao": cm_ao,
            "cm_co": cm_co,
        })

    print()
    print_table(results)


if __name__ == "__main__":
    main()