Broken Link Finder

February 14, 2026 NEW

A lightweight broken-link finder you can drop into /tools/broken-links/. It crawls your site starting from a URL, follows internal links, and reports broken pages and broken outbound links it encounters.

Why it’s useful: Broken links quietly kill UX and SEO. This script finds them fast, shows you where they were found, and outputs a clean table you can work through.

What it does:

  • Crawls internal links (same-host) up to a max page limit.
  • Checks the final HTTP status (after following up to 5 redirects) for pages and links; 4xx/5xx responses and failed requests count as broken.
  • Reports broken links with the page they were found on.
  • Skips common junk (mailto:, tel:, javascript:, hash-only links).
  • Not included yet: honoring robots meta nofollow or robots.txt (kept minimal here).

Install:

  1. Create: /tools/broken-links/
  2. Save the script below as: /tools/broken-links/index.php
  3. Edit the ADMIN_TOKEN at the top.
  4. Run it:
    • /tools/broken-links/?run=TOKEN
    • Optional start path (must begin with /): &start=/some-page

Tip: Start small (like / or a category page) and bump the limits when you’re confident. Crawling a huge site on shared hosting can be heavy.

<?php
declare(strict_types=1);

/**
 * Tiny Broken Link Finder (No DB)
 * File: /tools/broken-links/index.php
 *
 * Crawl your site and report broken links.
 *
 * Run:
 *   /tools/broken-links/?run=CHANGE_ME_TOKEN
 *   /tools/broken-links/?run=CHANGE_ME_TOKEN&start=/some-page
 */

// Response headers; emitted before the token check, so the 403 reply carries them too.
header('X-Content-Type-Options: nosniff');
header('Referrer-Policy: strict-origin-when-cross-origin');

// Shared secret that must be supplied as ?run=... (change before deploying).
const ADMIN_TOKEN = 'CHANGE_ME_TOKEN';

// ---- Crawl limits ----
// Upper bound on internal pages fetched in one run.
const MAX_PAGES = 250;
// Links found on a page at this depth are still checked but no longer queued.
const MAX_DEPTH = 6;
// Seconds; used for both the connect timeout and the total request timeout.
const REQ_TIMEOUT = 8;
// User-Agent string sent with every request.
const USER_AGENT = 'TinyBrokenLinkFinder/1.0';

// ---- Link filters ----
// hrefs starting with one of these (case-insensitive) are ignored.
const SKIP_PREFIXES = [
  'mailto:',
  'tel:',
  'javascript:',
  'data:',
  'sms:',
];

// hrefs ending in one of these extensions are ignored (static assets and downloads).
const SKIP_EXTENSIONS = [
  // images
  '.jpg',
  '.jpeg',
  '.png',
  '.gif',
  '.webp',
  '.svg',
  '.ico',
  // front-end assets
  '.css',
  '.js',
  '.map',
  // documents and archives
  '.pdf',
  '.zip',
  '.rar',
  '.7z',
  // audio and video
  '.mp4',
  '.webm',
  '.mov',
  '.mp3',
  '.wav',
];

/**
 * Escape a string for output in HTML text and quoted attribute contexts.
 *
 * ENT_SUBSTITUTE is required here: without it htmlspecialchars() returns an
 * empty string as soon as the input contains an invalid UTF-8 sequence, which
 * would silently blank out a URL or note in the report. With it, the bad bytes
 * are replaced by U+FFFD and the rest of the value stays visible.
 *
 * @param string $s Raw text (URLs, error messages, ...).
 * @return string   HTML-escaped text; both quote styles are encoded.
 */
function h(string $s): string {
  return htmlspecialchars($s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * Report whether the current request should be treated as HTTPS.
 *
 * True when $_SERVER['HTTPS'] is non-empty and not the literal 'off', or when
 * the X-Forwarded-Proto request header starts with "https" (case-insensitive).
 * NOTE(review): the forwarded header is taken as received; it is only
 * trustworthy when a proxy in front of the app sets it.
 */
function is_https(): bool {
  $direct = $_SERVER['HTTPS'] ?? null;
  if (!empty($direct) && $direct !== 'off') {
    return true;
  }

  $forwarded = $_SERVER['HTTP_X_FORWARDED_PROTO'] ?? null;
  return !empty($forwarded) && stripos((string)$forwarded, 'https') === 0;
}

/**
 * Build the origin of the current request, e.g. "https://example.com".
 *
 * The scheme comes from is_https(); the host is the Host request header
 * (empty string when the header is absent). No trailing slash is added.
 */
function site_base(): string {
  $scheme = 'http://';
  if (is_https()) {
    $scheme = 'https://';
  }

  return $scheme . ($_SERVER['HTTP_HOST'] ?? '');
}

/**
 * Turn an href found in a page into an absolute http(s) URL.
 *
 * Changes compared with a plain "$base . '/' . $href" join:
 *  - relative hrefs ("next.html", "../x", "?page=2") are resolved against the
 *    page they were found on when $pageUrl is given, instead of against the
 *    site root (which produced false 404s for links on nested pages);
 *  - "." and ".." path segments are collapsed;
 *  - hrefs with a non-http(s) scheme (ftp:, whatsapp:, ...) yield '' so the
 *    caller skips them instead of requesting "$base/ftp://...".
 *
 * @param string $base    Site origin without trailing slash, e.g. "https://site.com".
 * @param string $href    Raw href attribute value.
 * @param string $pageUrl URL of the page containing the link. Optional; when
 *                        empty, relative hrefs are resolved against "/".
 * @return string Absolute URL without fragment, or '' when there is nothing to check.
 */
function abs_url(string $base, string $href, string $pageUrl = ''): string {
  $href = trim($href);
  if ($href === '') return '';

  // strip fragment
  $hashPos = strpos($href, '#');
  if ($hashPos !== false) $href = substr($href, 0, $hashPos);
  if ($href === '') return '';

  // already absolute
  if (preg_match('~^https?://~i', $href)) return $href;

  // any other explicit scheme cannot be checked over HTTP
  if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) return '';

  // protocol-relative: reuse the scheme of $base
  if (strpos($href, '//') === 0) {
    if (preg_match('~^(https?):~i', $base, $m)) {
      $scheme = strtolower($m[1]) . ':';
    } else {
      // $base carries no usable scheme; fall back to the current request
      $scheme = is_https() ? 'https:' : 'http:';
    }
    return $scheme . $href;
  }

  // root-relative
  if ($href[0] === '/') return $base . $href;

  // path of the containing page ("/" when unknown)
  $pagePath = '/';
  if ($pageUrl !== '') {
    $p = parse_url($pageUrl, PHP_URL_PATH);
    if (is_string($p) && $p !== '' && $p[0] === '/') $pagePath = $p;
  }

  // query-only href: same path, new query
  if ($href[0] === '?') return $base . $pagePath . $href;

  // keep the query out of dot-segment processing
  $query = '';
  $qPos = strpos($href, '?');
  if ($qPos !== false) {
    $query = substr($href, $qPos);
    $href  = substr($href, 0, $qPos);
  }

  // directory of the page = everything up to and including the last "/"
  $dir = substr($pagePath, 0, (int)strrpos($pagePath, '/') + 1);

  // $dir starts with "/", so the first segment is '' and acts as the root marker
  $segments = explode('/', $dir . $href);
  $last = count($segments) - 1;
  $stack = [];
  foreach ($segments as $i => $seg) {
    if ($seg === '.' || $seg === '..') {
      // never pop the root marker: "../" above the root stays at the root
      if ($seg === '..' && count($stack) > 1) array_pop($stack);
      // a trailing "." or ".." denotes a directory, keep the trailing slash
      if ($i === $last) $stack[] = '';
      continue;
    }
    $stack[] = $seg;
  }

  return $base . implode('/', $stack) . $query;
}

/**
 * Reduce a URL to its path plus query string ("/path?x=1").
 *
 * Scheme, host, port and fragment are dropped. A URL that cannot be parsed,
 * or whose path and query are both empty, yields "/".
 */
function norm_internal_path(string $url): string {
  $parts = parse_url($url);
  if (!$parts) {
    return '/';
  }

  $result = $parts['path'] ?? '/';
  if (isset($parts['query'])) {
    $result .= '?' . $parts['query'];
  }

  return $result === '' ? '/' : $result;
}

/**
 * Decide whether $url points at the host named by $host.
 *
 * $host is typically the Host request header, which may include a port
 * ("example.com:8080"), while parse_url() reports host and port separately.
 * Comparing only the host names therefore never matched on a non-default
 * port, and the crawl could not leave the start page. The comparison now
 * accepts either a bare host match or a "host:port" match, using 443/80 as
 * the implied port for https/other schemes.
 *
 * @param string $url  Absolute URL to test.
 * @param string $host Host name, optionally with ":port"; case-insensitive.
 * @return bool True when the URL is on that host.
 */
function same_host(string $url, string $host): bool {
  $parts = parse_url($url);
  if (!$parts) return false;

  $urlHost = strtolower((string)($parts['host'] ?? ''));
  $wanted  = strtolower($host);

  // plain host-name match
  if ($urlHost === $wanted) return true;
  if ($urlHost === '') return false;

  // $wanted may be "host:port": compare against the URL's effective port
  if (isset($parts['port'])) {
    $port = (int)$parts['port'];
  } else {
    $port = strtolower((string)($parts['scheme'] ?? '')) === 'https' ? 443 : 80;
  }

  return $urlHost . ':' . $port === $wanted;
}

/**
 * Decide whether an href should be ignored by the crawler.
 *
 * Skipped: empty or "#" hrefs, hrefs starting with one of SKIP_PREFIXES
 * (case-insensitive), and hrefs whose path ends in one of SKIP_EXTENSIONS.
 *
 * The extension test looks at the path only. Testing the raw href let
 * "photo.jpg?v=2" and "manual.pdf#page=2" through, and wrongly skipped pages
 * such as "/download?file=x.pdf".
 *
 * @param string $href Raw href attribute value.
 * @return bool True when the link should not be checked.
 */
function should_skip_href(string $href): bool {
  $href = trim($href);
  if ($href === '' || $href === '#') return true;

  foreach (SKIP_PREFIXES as $p) {
    if (stripos($href, $p) === 0) return true;
  }

  // drop query string and fragment before looking at the extension
  $lower = strtolower($href);
  $path = substr($lower, 0, strcspn($lower, '?#'));
  if ($path === '') return false;

  foreach (SKIP_EXTENSIONS as $ext) {
    if (substr($path, -strlen($ext)) === $ext) return true;
  }

  return false;
}

/**
 * GET a URL with cURL and return status, content type and body.
 *
 * Redirects are followed (at most 5). REQ_TIMEOUT bounds both the connect
 * phase and the whole transfer.
 *
 * @param string $url Absolute URL to request.
 * @return array Keys: 'ok' (bool, false when the transfer itself failed),
 *               'code' (int, 0 on failure), 'ctype' (string), 'body' (string
 *               without response headers), 'err' (string, '' on success).
 */
function http_fetch(string $url): array {
  $options = [
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS      => 5,
    CURLOPT_CONNECTTIMEOUT => REQ_TIMEOUT,
    CURLOPT_TIMEOUT        => REQ_TIMEOUT,
    CURLOPT_USERAGENT      => USER_AGENT,
    CURLOPT_HEADER         => true,
    CURLOPT_NOBODY         => false,
  ];

  $handle = curl_init();
  curl_setopt_array($handle, $options);

  $response  = curl_exec($handle);
  $error     = curl_error($handle);
  $status    = (int)curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
  $mime      = (string)curl_getinfo($handle, CURLINFO_CONTENT_TYPE);
  $headerLen = (int)curl_getinfo($handle, CURLINFO_HEADER_SIZE);
  curl_close($handle);

  if ($response !== false) {
    // The response starts with the header block(s); cut them off to get the body.
    return [
      'ok'    => true,
      'code'  => $status,
      'ctype' => $mime,
      'body'  => substr($response, $headerLen),
      'err'   => '',
    ];
  }

  return [
    'ok'    => false,
    'code'  => 0,
    'ctype' => '',
    'body'  => '',
    'err'   => $error ?: 'fetch failed',
  ];
}

/**
 * Determine the HTTP status of a URL, preferring a cheap HEAD request.
 *
 * HEAD is tried first. A GET via http_fetch() is issued as a second opinion
 * when the HEAD transfer fails, yields no status, or returns a status >= 400:
 * servers that do not support HEAD commonly answer it with 403/405/501 while
 * serving GET normally, and those links were previously reported as broken.
 * If the confirming GET itself fails at transport level, the HEAD status is kept.
 *
 * @param string $url Absolute URL to check.
 * @return array Keys: 'code' (int, 0 when no response was obtained),
 *               'ctype' (string), 'err' (string, '' when there was no cURL error).
 */
function http_head_status(string $url): array {
  $ch = curl_init();
  curl_setopt_array($ch, [
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS      => 5,
    CURLOPT_CONNECTTIMEOUT => REQ_TIMEOUT,
    CURLOPT_TIMEOUT        => REQ_TIMEOUT,
    CURLOPT_USERAGENT      => USER_AGENT,
    CURLOPT_NOBODY         => true,
    CURLOPT_HEADER         => true,
  ]);

  $raw = curl_exec($ch);
  $err = curl_error($ch);
  $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
  $ctype = (string)curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
  curl_close($ch);

  $headFailed = ($raw === false || $code === 0);

  if ($headFailed || $code >= 400) {
    // confirm with GET before calling the link broken
    $g = http_fetch($url);
    if ($headFailed || $g['ok']) {
      return ['code'=>(int)$g['code'], 'ctype'=>(string)$g['ctype'], 'err'=>(string)$g['err']];
    }
    // GET could not complete, but HEAD did return a real status: report that one
  }

  return ['code'=>$code, 'ctype'=>$ctype, 'err'=>$err ?: ''];
}

/**
 * Collect the href value of every <a> element in an HTML document.
 *
 * An empty or whitespace-only document returns [] without invoking the parser:
 * DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8, a
 * warning on PHP 7), which would otherwise abort the crawl on a page with an
 * empty body. The libxml error-handling mode is restored afterwards so the
 * function does not leave global state changed.
 *
 * @param string $html Raw HTML markup.
 * @return array List of non-empty href strings in document order (not
 *               resolved, not de-duplicated).
 */
function extract_links_from_html(string $html): array {
  if (trim($html) === '') return [];

  $links = [];

  // collect parse errors internally instead of emitting warnings on imperfect HTML
  $previous = libxml_use_internal_errors(true);
  $dom = new DOMDocument();
  $dom->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR);
  libxml_clear_errors();
  libxml_use_internal_errors($previous);

  foreach ($dom->getElementsByTagName('a') as $a) {
    $href = (string)$a->getAttribute('href');
    if ($href !== '') $links[] = $href;
  }

  return $links;
}

// -------- auth --------
// The crawl only runs when ?run= matches ADMIN_TOKEN.
// Two extra guards:
//  - a non-string value (?run[]=x) is treated as "no token" instead of being
//    cast to the string "Array";
//  - the tool refuses to run while ADMIN_TOKEN is empty or still the published
//    placeholder, because that value is known to anyone who has seen this script.
$tok = $_GET['run'] ?? '';
if (!is_string($tok)) $tok = '';

$tokenConfigured = (ADMIN_TOKEN !== '' && ADMIN_TOKEN !== 'CHANGE_ME_TOKEN');

// hash_equals() compares in constant time, so timing does not reveal the token
if (!$tokenConfigured || $tok === '' || !hash_equals(ADMIN_TOKEN, $tok)) {
  http_response_code(403);
  header('Content-Type: text/plain; charset=utf-8');
  echo "Forbidden.\n";
  exit;
}

// -------- crawl setup --------
// Breadth-first crawl from the start page. Every link found on a crawled page
// is status-checked once (cached in $checkedLinks); same-host links are also
// queued for crawling until MAX_DEPTH / MAX_PAGES is reached.
// Results consumed by the output section: $startUrl, $pagesCrawled,
// $checkedLinks, $broken.
$base = site_base();
$host = (string)($_SERVER['HTTP_HOST'] ?? '');

// ?start= must be a root-relative path; anything else (including ?start[]=)
// falls back to the home page.
$start = $_GET['start'] ?? '/';
if (!is_string($start) || $start === '' || $start[0] !== '/') $start = '/';

$startUrl = $base . $start;

$queue = [];
$queued = [$startUrl => true]; // every URL ever put on the queue (prevents re-queuing)
$seenPages = [];
$queue[] = ['url'=>$startUrl, 'depth'=>0, 'from'=>$startUrl];

$broken = []; // rows: foundOn, link, status, note
$reported = []; // "foundOn\nlink" => true, so each finding is listed once
$checkedLinks = []; // cache by URL => [code, err]

$pagesCrawled = 0;

while ($queue && $pagesCrawled < MAX_PAGES) {
  $item = array_shift($queue);
  $url = (string)$item['url'];
  $depth = (int)$item['depth'];

  if (isset($seenPages[$url])) continue;
  $seenPages[$url] = true;

  $pagesCrawled++;

  $res = http_fetch($url);
  $code = (int)$res['code'];
  $ctype = strtolower((string)$res['ctype']);

  // If the page itself is broken, log and continue (still a useful find).
  // The link check on the referring page has usually reported it already;
  // the $reported key keeps it from being listed a second time.
  if (!$res['ok'] || $code >= 400) {
    $key = (string)$item['from'] . "\n" . $url;
    if (!isset($reported[$key])) {
      $reported[$key] = true;
      $broken[] = [
        'foundOn' => (string)$item['from'],
        'link'    => $url,
        'status'  => $code ?: 0,
        'note'    => $res['err'] ?: 'page fetch failed',
      ];
    }
    continue;
  }

  // only parse HTML
  if (strpos($ctype, 'text/html') === false) continue;

  $hrefs = extract_links_from_html((string)$res['body']);

  foreach ($hrefs as $href) {
    if (should_skip_href($href)) continue;

    // The page URL is passed as a third argument so relative hrefs can be
    // resolved against the page they appear on. PHP ignores surplus arguments
    // to user-defined functions, so this is harmless if abs_url() only
    // declares two parameters.
    $abs = abs_url($base, $href, $url);
    if ($abs === '') continue;

    // check status (cache)
    if (!isset($checkedLinks[$abs])) {
      $st = http_head_status($abs);
      $checkedLinks[$abs] = [
        'code' => (int)$st['code'],
        'err'  => (string)$st['err'],
      ];
    }

    $stc = (int)$checkedLinks[$abs]['code'];

    if ($stc >= 400 || $stc === 0) {
      $key = $url . "\n" . $abs;
      if (!isset($reported[$key])) {
        $reported[$key] = true;
        $broken[] = [
          'foundOn' => $url,
          'link'    => $abs,
          'status'  => $stc,
          'note'    => $checkedLinks[$abs]['err'] ?: '',
        ];
      }
    }

    // if internal, enqueue for crawling (once per URL)
    if ($depth < MAX_DEPTH && same_host($abs, $host)) {
      $nextUrl = $base . norm_internal_path($abs);
      if (!isset($queued[$nextUrl])) {
        $queued[$nextUrl] = true;
        $queue[] = ['url'=>$nextUrl, 'depth'=>$depth+1, 'from'=>$url];
      }
    }
  }
}

// -------- output --------
// Renders the report page. Reads $startUrl, $pagesCrawled, $checkedLinks and
// $broken from the crawl above; every dynamic value goes through h() or an
// (int) cast before it is echoed.
// The stylesheet defines a .ok rule: the summary counter uses class "ok" when
// nothing is broken, and without the rule that class had no effect.
header('Content-Type: text/html; charset=utf-8');

?><!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Tiny Broken Link Finder</title>
<meta name="robots" content="noindex,nofollow">
<style>
  :root{
    --bg:#0b0f16; --panel:#111a26; --panel2:#0c1420;
    --text:#e8eef8; --muted:rgba(232,238,248,.72);
    --line:rgba(255,255,255,.10); --accent:#2a8cff;
    --shadow:0 18px 55px rgba(0,0,0,.45); --r:16px;
    --bad:#ff7c7c; --ok:#7cff9a;
  }
  *{box-sizing:border-box}
  body{margin:0;background:var(--bg);color:var(--text);font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif}
  .wrap{max-width:1100px;margin:0 auto;padding:18px}
  .card{background:var(--panel);border:1px solid var(--line);border-radius:var(--r);padding:14px;margin:12px 0;box-shadow:var(--shadow)}
  .muted{color:var(--muted)}
  .pill{display:inline-block;padding:4px 10px;border-radius:999px;background:rgba(42,140,255,.14);border:1px solid rgba(42,140,255,.25);font-weight:900;font-size:12px}
  table{width:100%;border-collapse:collapse}
  th,td{padding:10px;border-bottom:1px solid rgba(255,255,255,.08);vertical-align:top}
  th{text-align:left;font-size:12px;opacity:.75}
  .code{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace;font-size:12px}
  .bad{color:var(--bad);font-weight:900}
  .ok{color:var(--ok);font-weight:900}
  a{color:#8ad1ff;text-decoration:none}
  a:hover{text-decoration:underline}
</style>
</head>
<body>
<div class="wrap">
  <div class="card">
    <div class="pill">Tool</div>
    <h1 style="margin:10px 0 6px;font-size:22px">Tiny Broken Link Finder</h1>
    <div class="muted">
      Start: <span class="code"><?php echo h($startUrl); ?></span><br>
      Crawled pages: <b><?php echo (int)$pagesCrawled; ?></b> •
      Checked links: <b><?php echo (int)count($checkedLinks); ?></b> •
      Broken found: <b class="<?php echo count($broken)?'bad':'ok'; ?>"><?php echo (int)count($broken); ?></b>
    </div>
    <div class="muted" style="margin-top:8px;font-size:13px">
      Tip: Keep this private. It’s a crawler—running it too often on a big site can be heavy.
    </div>
  </div>

  <div class="card">
    <?php if (!$broken): ?>
      <div class="muted">No broken links found in this crawl window.</div>
    <?php else: ?>
      <table>
        <thead>
          <tr>
            <th>Status</th>
            <th>Broken link</th>
            <th>Found on page</th>
            <th>Note</th>
          </tr>
        </thead>
        <tbody>
          <?php foreach ($broken as $b): ?>
            <tr>
              <td class="bad"><?php echo (int)$b['status']; ?></td>
              <td class="code"><a href="<?php echo h((string)$b['link']); ?>" target="_blank" rel="noopener"><?php echo h((string)$b['link']); ?></a></td>
              <td class="code"><a href="<?php echo h((string)$b['foundOn']); ?>" target="_blank" rel="noopener"><?php echo h((string)$b['foundOn']); ?></a></td>
              <td class="muted"><?php echo h((string)$b['note']); ?></td>
            </tr>
          <?php endforeach; ?>
        </tbody>
      </table>
    <?php endif; ?>
  </div>

  <div class="card">
    <div class="muted" style="font-size:13px;line-height:1.55">
      <b>Want to level it up?</b>
      <ul style="margin:8px 0 0; padding-left:18px">
        <li>Export to CSV.</li>
        <li>Respect robots.txt.</li>
        <li>Store results and compare crawls over time.</li>
      </ul>
    </div>
  </div>
</div>
</body>
</html>

Comments (0)

No comments yet — be the first.

← Back to all scripts