Broken Link Crawler (Scan Your Site for 404s)

March 5, 2026 NEW

What this is: A lightweight broken-link crawler you can drop into your site and run on-demand. It crawls your pages, checks internal + external links, and generates a clean report showing what’s broken and where it was found.

Why it’s useful: Broken links quietly hurt SEO and user trust. This tool is a quick “site hygiene” scan you can run anytime — especially after migrations, URL changes, or content imports.

What it does:

  • Crawls internal pages (same domain) up to configurable limits (MAX_PAGES = 300 pages and MAX_DEPTH = 7 link levels by default).
  • Checks link status codes (200/301/404/500, etc.).
  • Reports broken links with the source page they appeared on.
  • Skips common non-links (mailto:, tel:, anchors, JS links).
  • No database — just runs and shows results.

Install:

  1. Create: /tools/broken-crawler/
  2. Save the script below as: /tools/broken-crawler/index.php
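  3. Change ADMIN_TOKEN near the top of the script from CHANGE_ME_TOKEN to a long, random secret.
  4. Run it: /tools/broken-crawler/?run=TOKEN&start=/ (replace TOKEN with your ADMIN_TOKEN value; start is the path the crawl begins from).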
<?php
declare(strict_types=1);

/**
 * Broken Link Crawler (No DB)
 * File: /tools/broken-crawler/index.php
 *
 * Run:
 *   /tools/broken-crawler/?run=CHANGE_ME_TOKEN&start=/
 *
 * Notes:
 * - Keep this protected (token + optional IP allowlist).
 * - Crawling large sites can be heavy. Increase limits slowly.
 */

// Hardening headers sent with every response from this tool.
header('X-Content-Type-Options: nosniff');
header('Referrer-Policy: strict-origin-when-cross-origin');

// Secret expected in the ?run= query parameter. Change it before deploying.
const ADMIN_TOKEN = 'CHANGE_ME_TOKEN';

// Optional client-IP allowlist. An empty list accepts any address.
const ALLOW_IPS = [
  // '123.123.123.123',
];

// Crawl limits
const MAX_PAGES = 300;   // maximum number of internal pages fetched per run
const MAX_DEPTH = 7;     // maximum link distance from the start page
const REQ_TIMEOUT = 8;   // per-request connect and total timeout, in seconds
const USER_AGENT = 'BrokenLinkCrawler/1.0';

// Link filters: hrefs starting with one of these prefixes are never checked.
const SKIP_PREFIXES = [
  'mailto:',
  'tel:',
  'javascript:',
  'data:',
  'sms:',
];

// Link filters: hrefs ending in one of these extensions are never checked.
const SKIP_EXTENSIONS = [
  // images
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
  // front-end assets
  '.css', '.js', '.map',
  // documents and archives
  '.pdf', '.zip', '.rar', '.7z',
  // audio and video
  '.mp4', '.webm', '.mov', '.mp3', '.wav',
];

/**
 * Escape a string for safe output in HTML text and attribute contexts.
 *
 * ENT_SUBSTITUTE replaces invalid UTF-8 sequences with U+FFFD. Without it,
 * htmlspecialchars() returns an empty string for the whole input, which would
 * blank out a crawled URL that contains a stray byte.
 *
 * @param string $s Raw text.
 * @return string HTML-escaped text.
 */
function h(string $s): string { return htmlspecialchars($s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); }

/**
 * Check the client address against the optional ALLOW_IPS allowlist.
 *
 * @return bool True when the allowlist is empty or contains REMOTE_ADDR exactly.
 */
function ok_ip(): bool {
  if (ALLOW_IPS) {
    $clientIp = $_SERVER['REMOTE_ADDR'] ?? '';
    return in_array($clientIp, ALLOW_IPS, true);
  }

  // No allowlist configured: every client passes this check.
  return true;
}

/**
 * Report whether the current request arrived over HTTPS.
 *
 * Checks the server's own HTTPS flag first, then the X-Forwarded-Proto header
 * as set by a reverse proxy.
 *
 * @return bool True when either indicator says HTTPS.
 */
function is_https(): bool {
  $native = $_SERVER['HTTPS'] ?? null;
  if (!empty($native) && $native !== 'off') {
    return true;
  }

  $forwarded = $_SERVER['HTTP_X_FORWARDED_PROTO'] ?? null;
  // Prefix match so a list such as "https,http" still counts as HTTPS.
  return !empty($forwarded) && stripos((string)$forwarded, 'https') === 0;
}

/**
 * Build the scheme-and-host prefix of the site serving this request.
 *
 * @return string "https://" or "http://" followed by the Host header value,
 *                with no trailing slash.
 */
function site_base(): string {
  $scheme = is_https() ? 'https://' : 'http://';
  return $scheme . ($_SERVER['HTTP_HOST'] ?? '');
}

/**
 * Decide whether an href should be ignored by the crawler.
 *
 * Skipped: empty values, pure fragment links, hrefs starting with one of
 * SKIP_PREFIXES, and hrefs whose path ends in one of SKIP_EXTENSIONS.
 *
 * The extension test looks at the URL path only. Matching the raw href missed
 * "photo.jpg?v=2" and wrongly skipped "download.php?file=a.pdf" as well as
 * bare hosts on TLDs such as "https://example.zip".
 *
 * @param string $href Raw href attribute value.
 * @return bool True when the link must not be checked.
 */
function should_skip_href(string $href): bool {
  $href = trim($href);
  if ($href === '' || $href[0] === '#') return true;

  foreach (SKIP_PREFIXES as $p) {
    if (stripos($href, $p) === 0) return true;
  }

  // parse_url() returns null when there is no path and false when malformed.
  $path = parse_url($href, PHP_URL_PATH);
  if ($path === false) {
    // Malformed URL: fall back to cutting off the query string and fragment.
    $path = preg_replace('/[?#].*$/s', '', $href) ?? $href;
  }

  $lower = strtolower((string)$path);
  foreach (SKIP_EXTENSIONS as $ext) {
    if (substr($lower, -strlen($ext)) === $ext) return true;
  }

  return false;
}

/**
 * Collapse "." and ".." segments in an absolute URL path (RFC 3986 §5.2.4).
 *
 * @param string $path Path beginning with "/".
 * @return string Normalised path; never climbs above the root.
 */
function abs_url_remove_dot_segments(string $path): string {
  $segments = explode('/', $path);
  $last = count($segments) - 1;
  $out = [];

  foreach ($segments as $i => $seg) {
    if ($seg === '.' || $seg === '..') {
      // Keep index 0 (the empty segment before the leading slash) intact.
      if ($seg === '..' && count($out) > 1) array_pop($out);
      // A trailing dot segment refers to a directory: keep the final slash.
      if ($i === $last) $out[] = '';
      continue;
    }
    $out[] = $seg;
  }

  return implode('/', $out);
}

/**
 * Resolve an href found on a page into an absolute URL.
 *
 * Differences from a naive dirname()-based join:
 *  - a page URL ending in "/" is treated as a directory, so "post.html" on
 *    "/blog/" resolves to "/blog/post.html", not "/post.html";
 *  - query-only references ("?page=2") keep the current page's path;
 *  - "./" and "../" segments are collapsed, so one page is not crawled again
 *    under ever-longer aliases;
 *  - protocol-relative links inherit the scheme of the page they appear on;
 *  - hrefs with a non-HTTP scheme (ftp:, whatsapp:, ...) return '' because
 *    they cannot be checked over HTTP.
 *
 * @param string $base       Scheme + host of the site, without trailing slash.
 * @param string $href       Raw href value.
 * @param string $currentUrl Absolute URL of the page containing the link.
 * @return string Absolute URL without fragment, or '' when there is nothing to check.
 */
function abs_url(string $base, string $href, string $currentUrl): string {
  $href = trim($href);
  // strip fragment
  $href = preg_replace('/#.*$/', '', $href) ?? $href;
  if ($href === '') return '';

  // absolute
  if (preg_match('~^https?://~i', $href)) return $href;

  // any other explicit scheme is not something this crawler can verify
  if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) return '';

  $cur = parse_url($currentUrl);
  if (!is_array($cur)) $cur = [];

  // protocol-relative: reuse the scheme of the containing page
  if (strpos($href, '//') === 0) {
    $scheme = $cur['scheme'] ?? parse_url($base, PHP_URL_SCHEME);
    if (!is_string($scheme) || $scheme === '') $scheme = 'http';
    return strtolower($scheme) . ':' . $href;
  }

  $curPath = (string)($cur['path'] ?? '/');
  if ($curPath === '' || $curPath[0] !== '/') $curPath = '/' . $curPath;

  // query-only reference: same document, different query string
  if ($href[0] === '?') return $base . $curPath . $href;

  // Split off the query so dot-segment removal only touches the path.
  $qpos = strpos($href, '?');
  $path = $qpos === false ? $href : substr($href, 0, $qpos);
  $query = $qpos === false ? '' : substr($href, $qpos);

  // relative: resolve against the directory of the current path
  if ($path[0] !== '/') {
    $dir = substr($curPath, 0, (int)strrpos($curPath, '/') + 1);
    $path = $dir . $path;
  }

  return $base . abs_url_remove_dot_segments($path) . $query;
}

/**
 * Rebuild an internal URL on top of $base, keeping only its path and query.
 *
 * The scheme, host, port and fragment of $url are discarded. An empty or
 * missing path becomes "/".
 *
 * @param string $base Scheme + host prefix, without trailing slash.
 * @param string $url  URL to rewrite.
 * @return string $base . path . optional "?query"; $url unchanged if it cannot be parsed.
 */
function clean_internal_url(string $base, string $url): string {
  $parts = parse_url($url);
  if (empty($parts)) {
    return $url;
  }

  $path = $parts['path'] ?? '/';
  if (!$path) {
    $path = '/';
  }

  $query = '';
  if (isset($parts['query'])) {
    $query = '?' . $parts['query'];
  }

  return $base . $path . $query;
}

/**
 * Check whether $url points at the given host.
 *
 * $host is typically the Host request header and may carry a ":port" suffix,
 * while parse_url() reports host and port separately. Comparing them directly
 * made every link look external on a non-default port. When $host names a
 * port, the URL must use it too (explicitly or as the default for its scheme).
 * When it does not, only host names are compared.
 *
 * @param string $url  Absolute URL to test.
 * @param string $host Host name, optionally followed by ":port".
 * @return bool True when the URL belongs to that host.
 */
function same_host(string $url, string $host): bool {
  $u = parse_url($url);
  if (!$u) return false;

  $wantHost = strtolower($host);
  $wantPort = null;
  // Trailing ":digits" is a port; a bracketed IPv6 literal such as "[::1]" is not.
  if (preg_match('/^(.+):(\d+)$/', $wantHost, $m)) {
    $wantHost = $m[1];
    $wantPort = (int)$m[2];
  }

  $h = strtolower((string)($u['host'] ?? ''));
  if ($h !== $wantHost) return false;
  if ($wantPort === null) return true;

  $scheme = strtolower((string)($u['scheme'] ?? ''));
  $urlPort = (int)($u['port'] ?? ($scheme === 'https' ? 443 : 80));
  return $urlPort === $wantPort;
}

/**
 * GET a URL with cURL and return its status, content type and body.
 *
 * Redirects are followed (up to 5), so the reported code and content type are
 * those of the final response.
 *
 * @param string $url Absolute URL to fetch.
 * @return array Keys: ok (bool, false on transport failure), code (int),
 *               ctype (string), body (string without headers), err (string).
 */
function http_fetch_html(string $url): array {
  $handle = curl_init();
  curl_setopt_array($handle, [
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS      => 5,
    CURLOPT_CONNECTTIMEOUT => REQ_TIMEOUT,
    CURLOPT_TIMEOUT        => REQ_TIMEOUT,
    CURLOPT_USERAGENT      => USER_AGENT,
    CURLOPT_HEADER         => true,
  ]);

  $response = curl_exec($handle);

  // Transport-level failure: no usable response at all.
  if ($response === false) {
    $message = curl_error($handle);
    curl_close($handle);
    return ['ok'=>false,'code'=>0,'ctype'=>'','body'=>'','err'=>$message ?: 'fetch failed'];
  }

  $status      = (int)curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
  $contentType = (string)curl_getinfo($handle, CURLINFO_CONTENT_TYPE);
  $headerBytes = (int)curl_getinfo($handle, CURLINFO_HEADER_SIZE);
  curl_close($handle);

  // CURLOPT_HEADER prepends the header block(s); cut them off to get the body.
  return [
    'ok'    => true,
    'code'  => $status,
    'ctype' => $contentType,
    'body'  => substr($response, $headerBytes),
    'err'   => '',
  ];
}

/**
 * Determine the final HTTP status of a URL.
 *
 * A cheap HEAD request is tried first. If it fails at the transport level or
 * returns an error status (>= 400), the URL is re-checked with GET, because
 * some servers answer HEAD with 405/501/403 while serving GET normally.
 * Falling back only on transport failure reported such links as broken.
 *
 * @param string $url Absolute URL to check.
 * @return array Keys: code (int, 0 when no response was received), err (string).
 */
function http_status(string $url): array {
  $ch = curl_init();
  curl_setopt_array($ch, [
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS      => 5,
    CURLOPT_CONNECTTIMEOUT => REQ_TIMEOUT,
    CURLOPT_TIMEOUT        => REQ_TIMEOUT,
    CURLOPT_USERAGENT      => USER_AGENT,
    CURLOPT_NOBODY         => true,
    CURLOPT_HEADER         => true,
  ]);
  $raw = curl_exec($ch);
  $err = curl_error($ch);
  $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
  curl_close($ch);

  // HEAD succeeded with a non-error status: trust it.
  if ($raw !== false && $code > 0 && $code < 400) {
    return ['code'=>$code, 'err'=>''];
  }

  // HEAD failed or reported an error: confirm with a real GET.
  $g = http_fetch_html($url);
  if (!$g['ok'] && $code > 0) {
    // GET got no response at all; the HEAD status is the best information left.
    return ['code'=>$code, 'err'=>$err ?: ''];
  }
  return ['code'=>(int)$g['code'], 'err'=>(string)$g['err']];
}

/**
 * Collect the href values of all <a> elements in an HTML document.
 *
 * An empty body is handled up front: DOMDocument::loadHTML('') throws a
 * ValueError on PHP 8, which would abort the crawl on a page that returns
 * 200 with no content. The caller's libxml error mode is restored afterwards.
 *
 * @param string $html Raw HTML markup.
 * @return array List of non-empty href strings in document order, unresolved.
 */
function extract_links(string $html): array {
  if (trim($html) === '') return [];

  $previous = libxml_use_internal_errors(true);
  $dom = new DOMDocument();
  $loaded = $dom->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR);
  libxml_clear_errors();
  libxml_use_internal_errors($previous);

  if ($loaded === false) return [];

  $links = [];
  foreach ($dom->getElementsByTagName('a') as $a) {
    $href = (string)$a->getAttribute('href');
    if ($href !== '') $links[] = $href;
  }
  return $links;
}

// -------- auth --------
// Gate 1: optional IP allowlist.
if (!ok_ip()) {
  http_response_code(403);
  header('Content-Type: text/plain; charset=utf-8');
  echo "Forbidden (IP).\n";
  exit;
}

// Gate 2: fail closed while the token is still the published placeholder (or
// empty). Otherwise anyone who has read the install notes could run the crawler.
if (ADMIN_TOKEN === '' || ADMIN_TOKEN === 'CHANGE_ME_TOKEN') {
  http_response_code(403);
  header('Content-Type: text/plain; charset=utf-8');
  echo "Forbidden (set ADMIN_TOKEN first).\n";
  exit;
}

// Gate 3: the caller must present the token. A non-string value such as
// ?run[]=x is treated as missing rather than cast to a string.
$runParam = $_GET['run'] ?? '';
$tok = is_string($runParam) ? $runParam : '';
if ($tok === '' || !hash_equals(ADMIN_TOKEN, $tok)) {
  http_response_code(403);
  header('Content-Type: text/plain; charset=utf-8');
  echo "Forbidden.\n";
  exit;
}

// -------- crawl --------
// Breadth-first crawl of internal pages starting at ?start= (a root-relative
// path). Every link found on a crawled page is status-checked once. Broken
// links are recorded for each page they appear on.
$base = site_base();
$host = (string)($_SERVER['HTTP_HOST'] ?? '');
$startParam = $_GET['start'] ?? '/';
$start = is_string($startParam) ? $startParam : '/';
if ($start === '' || $start[0] !== '/') $start = '/';
$startUrl = $base . $start;

// FIFO queue read through a moving index, which avoids array_shift()
// re-indexing the whole array on every dequeue.
$queue = [['url'=>$startUrl,'depth'=>0,'from'=>$startUrl]];
$head = 0;

// URLs already queued or crawled. Marking at enqueue time keeps a page linked
// from many places from being queued once per link.
$seenPages = [$startUrl => true];

$checked = [];   // url => status code
$notes = [];     // url => transport error text from the status check
$broken = [];    // rows: status, link, foundOn, note
$reported = [];  // "link\nfoundOn" => true, so each pair is listed once

$pagesCrawled = 0;

while ($head < count($queue) && $pagesCrawled < MAX_PAGES) {
  $item = $queue[$head++];
  $url = (string)$item['url'];
  $depth = (int)$item['depth'];
  $pagesCrawled++;

  $res = http_fetch_html($url);
  $code = (int)$res['code'];
  $ctype = strtolower((string)$res['ctype']);

  if (!$res['ok'] || $code >= 400) {
    $key = $url . "\n" . (string)$item['from'];
    if (!isset($reported[$key])) {
      $reported[$key] = true;
      $broken[] = ['status'=>$code ?: 0, 'link'=>$url, 'foundOn'=>(string)$item['from'], 'note'=>$res['err'] ?: 'page fetch failed'];
    }
    continue;
  }

  // Only HTML documents can contain links to follow.
  if (strpos($ctype, 'text/html') === false) continue;

  foreach (extract_links((string)$res['body']) as $href) {
    if (should_skip_href($href)) continue;

    $abs = abs_url($base, $href, $url);
    if ($abs === '') continue;

    // One network check per distinct URL; later sightings reuse the result.
    if (!isset($checked[$abs])) {
      $st = http_status($abs);
      $checked[$abs] = (int)$st['code'];
      $notes[$abs] = (string)$st['err'];
    }

    $status = $checked[$abs];
    if ($status >= 400 || $status === 0) {
      // Record the link for this page too, not only where it was first seen.
      $key = $abs . "\n" . $url;
      if (!isset($reported[$key])) {
        $reported[$key] = true;
        $broken[] = ['status'=>$status, 'link'=>$abs, 'foundOn'=>$url, 'note'=>$notes[$abs]];
      }
      // Already known to be broken: crawling it would only add a duplicate row.
      continue;
    }

    if ($depth < MAX_DEPTH && same_host($abs, $host)) {
      $next = clean_internal_url($base, $abs);
      if (!isset($seenPages[$next])) {
        $seenPages[$next] = true;
        $queue[] = ['url'=>$next,'depth'=>$depth+1,'from'=>$url];
      }
    }
  }
}

// -------- output --------
// Render the HTML report. Reads $startUrl, $pagesCrawled, $checked and $broken
// as left by the crawl section. Dynamic values are escaped with h() or cast to
// int before being echoed. When $broken is empty a "no broken links" message
// is shown instead of the results table.
header('Content-Type: text/html; charset=utf-8');

?><!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<title>Broken Link Crawler</title>
<meta name="robots" content="noindex,nofollow" />
<style>
  :root{
    --bg:#0b0f16; --panel:#111a26; --panel2:#0c1420;
    --text:#e8eef8; --muted:rgba(232,238,248,.72);
    --line:rgba(255,255,255,.10); --accent:#2a8cff;
    --shadow:0 18px 55px rgba(0,0,0,.45); --r:16px;
    --bad:#ff7c7c; --ok:#7cff9a;
  }
  *{box-sizing:border-box}
  body{margin:0;background:var(--bg);color:var(--text);font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif}
  .wrap{max-width:1100px;margin:0 auto;padding:18px}
  .card{background:var(--panel);border:1px solid var(--line);border-radius:var(--r);padding:14px;margin:12px 0;box-shadow:var(--shadow)}
  .muted{color:var(--muted)}
  .pill{display:inline-block;padding:4px 10px;border-radius:999px;background:rgba(42,140,255,.14);border:1px solid rgba(42,140,255,.25);font-weight:900;font-size:12px}
  .code{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace;font-size:12px}
  .bad{color:var(--bad);font-weight:950}
  .ok{color:var(--ok);font-weight:950}
  table{width:100%;border-collapse:collapse}
  th,td{padding:10px;border-bottom:1px solid rgba(255,255,255,.08);vertical-align:top}
  th{text-align:left;font-size:12px;opacity:.75}
  a{color:#8ad1ff;text-decoration:none}
  a:hover{text-decoration:underline}
</style>
</head>
<body>
<div class="wrap">
  <div class="card">
    <div class="pill">Site Tool</div>
    <h1 style="margin:10px 0 6px;font-size:22px">Broken Link Crawler</h1>
    <div class="muted">
      Start: <span class="code"><?php echo h($startUrl); ?></span><br>
      Crawled pages: <b><?php echo (int)$pagesCrawled; ?></b> •
      Checked links: <b><?php echo (int)count($checked); ?></b> •
      Broken found: <b class="<?php echo count($broken)?'bad':'ok'; ?>"><?php echo (int)count($broken); ?></b>
    </div>
    <div class="muted" style="margin-top:8px;font-size:13px">
      Tip: Keep this private. If you need a bigger crawl, increase <span class="code">MAX_PAGES</span> slowly.
    </div>
  </div>

  <div class="card">
    <?php if (!$broken): ?>
      <div class="muted">No broken links found in this crawl window.</div>
    <?php else: ?>
      <table>
        <thead>
          <tr>
            <th>Status</th>
            <th>Broken link</th>
            <th>Found on page</th>
            <th>Note</th>
          </tr>
        </thead>
        <tbody>
          <?php foreach ($broken as $b): ?>
            <tr>
              <td class="bad"><?php echo (int)$b['status']; ?></td>
              <td class="code"><a href="<?php echo h((string)$b['link']); ?>" target="_blank" rel="noopener"><?php echo h((string)$b['link']); ?></a></td>
              <td class="code"><a href="<?php echo h((string)$b['foundOn']); ?>" target="_blank" rel="noopener"><?php echo h((string)$b['foundOn']); ?></a></td>
              <td class="muted"><?php echo h((string)$b['note']); ?></td>
            </tr>
          <?php endforeach; ?>
        </tbody>
      </table>
    <?php endif; ?>
  </div>
</div>
</body>
</html>

Comments (0)

No comments yet — be the first.

← Back to all scripts