Broken Link Crawler (Scan Your Site for 404s)
What this is: A lightweight broken-link crawler you can drop into your site and run on-demand. It crawls your pages, checks internal + external links, and generates a clean report showing what’s broken and where it was found.
Why it’s useful: Broken links quietly hurt SEO and user trust. This tool is a quick “site hygiene” scan you can run anytime — especially after migrations, URL changes, or content imports.
What it does:
- Crawls internal pages (same domain) up to a safe limit.
- Checks link status codes (200/301/404/500, etc.).
- Reports broken links with the source page they appeared on.
- Skips common non-links (mailto:, tel:, anchors, JS links).
- No database — just runs and shows results.
Install:
- Create the directory: /tools/broken-crawler/
- Save the script below as: /tools/broken-crawler/index.php
- Edit the ADMIN_TOKEN constant near the top of the script.
- Run it: /tools/broken-crawler/?run=TOKEN&start=/
<?php
declare(strict_types=1);
/**
* Broken Link Crawler (No DB)
* File: /tools/broken-crawler/index.php
*
* Run:
* /tools/broken-crawler/?run=CHANGE_ME_TOKEN&start=/
*
* Notes:
* - Keep this protected (token + optional IP allowlist).
* - Crawling large sites can be heavy. Increase limits slowly.
*/
// Defensive response headers; sent up front, before any output exists.
header('X-Content-Type-Options: nosniff');
header('Referrer-Policy: strict-origin-when-cross-origin');

/** Shared secret that must be passed as ?run=... to start a crawl. */
const ADMIN_TOKEN = 'CHANGE_ME_TOKEN';

/** Client IPs allowed to run the tool; an empty list allows any address. */
const ALLOW_IPS = [
    // '123.123.123.123',
];

// ---- Crawl limits ----

/** Upper bound on internal pages fetched in one run. */
const MAX_PAGES = 300;

/** Links found on pages at this depth are still checked, but not followed. */
const MAX_DEPTH = 7;

/** Connect timeout and total timeout for each request, in seconds. */
const REQ_TIMEOUT = 8;

/** User-Agent header sent with every request. */
const USER_AGENT = 'BrokenLinkCrawler/1.0';

// ---- Link filters ----

/** hrefs starting with any of these (case-insensitive) are ignored. */
const SKIP_PREFIXES = [
    'mailto:',
    'tel:',
    'javascript:',
    'data:',
    'sms:',
];

/** hrefs ending in any of these (case-insensitive) are ignored. */
const SKIP_EXTENSIONS = [
    // images
    '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
    // front-end assets
    '.css', '.js', '.map',
    // documents and archives
    '.pdf', '.zip', '.rar', '.7z',
    // audio / video
    '.mp4', '.webm', '.mov', '.mp3', '.wav',
];
/**
 * Escape a string for output in HTML text or a quoted attribute value.
 *
 * ENT_SUBSTITUTE matters here: without it htmlspecialchars() returns an
 * empty string as soon as the input contains an invalid UTF-8 sequence,
 * which would silently blank out scraped URLs or error notes in the report.
 * With it, each invalid sequence is replaced by U+FFFD instead.
 *
 * @param string $s Raw text.
 * @return string HTML-escaped text.
 */
function h(string $s): string
{
    return htmlspecialchars($s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}
/**
 * Check the requesting client against the ALLOW_IPS allowlist.
 *
 * @return bool True when the allowlist is empty, or when REMOTE_ADDR is
 *              listed in it (exact string match).
 */
function ok_ip(): bool
{
    if (ALLOW_IPS) {
        // A missing REMOTE_ADDR is treated as '' and so never matches.
        $remote = $_SERVER['REMOTE_ADDR'] ?? '';

        return in_array($remote, ALLOW_IPS, true);
    }

    // Empty allowlist: the IP restriction is switched off.
    return true;
}
/**
 * Decide whether the current request arrived over HTTPS.
 *
 * Looks at $_SERVER['HTTPS'] first (any non-empty value other than 'off'),
 * then at an X-Forwarded-Proto header whose value starts with "https".
 *
 * @return bool True when either signal indicates HTTPS.
 */
function is_https(): bool
{
    $direct = $_SERVER['HTTPS'] ?? null;
    if (!empty($direct) && $direct !== 'off') {
        return true;
    }

    $forwarded = $_SERVER['HTTP_X_FORWARDED_PROTO'] ?? null;

    return !empty($forwarded) && stripos((string)$forwarded, 'https') === 0;
}
/**
 * Build the scheme-and-host prefix of the site serving this request.
 *
 * @return string "http://" or "https://" (as decided by is_https())
 *                followed by the Host header, or by '' when it is absent.
 */
function site_base(): string
{
    $scheme = 'http://';
    if (is_https()) {
        $scheme = 'https://';
    }

    return $scheme . ($_SERVER['HTTP_HOST'] ?? '');
}
/**
 * Decide whether an href should be ignored by the crawler.
 *
 * An href is skipped when it is empty, is a pure fragment (#...), starts
 * with one of SKIP_PREFIXES, or its path ends in one of SKIP_EXTENSIONS.
 *
 * The extension is tested against the part before any "?" or "#", so
 * cache-busted assets such as "/img/logo.png?v=3" or "/doc.pdf#page=2"
 * are filtered as well instead of being fetched as if they were pages.
 *
 * @param string $href Raw href attribute value.
 * @return bool True when the link must not be checked or followed.
 */
function should_skip_href(string $href): bool
{
    $href = trim($href);
    if ($href === '' || $href[0] === '#') {
        return true;
    }

    foreach (SKIP_PREFIXES as $prefix) {
        if (stripos($href, $prefix) === 0) {
            return true;
        }
    }

    // Keep only the path-ish part: everything before the first ? or #.
    $lower = strtolower($href);
    $path = substr($lower, 0, strcspn($lower, '?#'));

    foreach (SKIP_EXTENSIONS as $ext) {
        if (substr($path, -strlen($ext)) === $ext) {
            return true;
        }
    }

    return false;
}
/**
 * Turn an href found on $currentUrl into an absolute URL.
 *
 * Handles, in order:
 *  - fragments: removed; a fragment-only href yields ''
 *  - absolute http(s) URLs: returned unchanged
 *  - any other scheme (ftp:, whatsapp:, ...): '' so the caller skips it
 *    rather than mis-resolving it as a relative path
 *  - protocol-relative (//host/path): prefixed with the scheme of $base
 *  - root-relative (/path): appended to $base
 *  - query-only (?a=b): applied to the current document's path
 *  - relative paths: resolved against the current document's directory,
 *    with "." and ".." segments collapsed
 *
 * The "directory" is everything up to and including the last "/" of the
 * current path, so a link "post" on "/blog/" resolves to "/blog/post" and
 * on "/blog/index.html" also to "/blog/post".
 *
 * @param string $base       Scheme and host of the site, no trailing slash.
 * @param string $href       Raw href attribute value.
 * @param string $currentUrl Absolute URL of the page the href was found on.
 * @return string Absolute URL, or '' when there is nothing to check.
 */
function abs_url(string $base, string $href, string $currentUrl): string
{
    $href = trim($href);

    // Fragments are never sent to the server, so drop them first.
    $hashPos = strpos($href, '#');
    if ($hashPos !== false) {
        $href = substr($href, 0, $hashPos);
    }
    if ($href === '') {
        return '';
    }

    // Already absolute.
    if (preg_match('~^https?://~i', $href)) {
        return $href;
    }

    // Some other URI scheme: not something an HTTP status check applies to.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
        return '';
    }

    // Protocol-relative: reuse the scheme the site itself is served on.
    if (strpos($href, '//') === 0) {
        $scheme = parse_url($base, PHP_URL_SCHEME);
        if (!is_string($scheme) || $scheme === '') {
            $scheme = is_https() ? 'https' : 'http';
        }
        return strtolower($scheme) . ':' . $href;
    }

    // Root-relative.
    if ($href[0] === '/') {
        return $base . $href;
    }

    $path = parse_url($currentUrl, PHP_URL_PATH);
    if (!is_string($path) || $path === '') {
        $path = '/';
    }
    if ($path[0] !== '/') {
        $path = '/' . $path;
    }

    // Query-only reference: same document, different query string.
    if ($href[0] === '?') {
        return $base . $path . $href;
    }

    // Keep the query aside so dot-segment handling only sees the path.
    $query = '';
    $queryPos = strpos($href, '?');
    if ($queryPos !== false) {
        $query = substr($href, $queryPos);
        $href = substr($href, 0, $queryPos);
    }

    // Directory of the current document, including its trailing slash.
    $dir = substr($path, 0, (int)strrpos($path, '/') + 1);

    // Collapse "." and ".." segments; $resolved[0] is the empty root segment.
    $parts = explode('/', $dir . $href);
    $lastIndex = count($parts) - 1;
    $resolved = [];
    foreach ($parts as $i => $segment) {
        if ($segment === '.' || $segment === '..') {
            if ($segment === '..' && count($resolved) > 1) {
                array_pop($resolved);
            }
            // A trailing dot segment refers to a directory: keep the slash.
            if ($i === $lastIndex) {
                $resolved[] = '';
            }
            continue;
        }
        $resolved[] = $segment;
    }

    return $base . implode('/', $resolved) . $query;
}
/**
 * Normalise an internal URL to "$base + path + ?query".
 *
 * Scheme, host, port, credentials and fragment of $url are discarded and
 * replaced by $base; a missing or empty path becomes "/". Input that
 * parse_url() cannot handle is returned unchanged.
 *
 * @param string $base Scheme and host prefix to rebuild the URL on.
 * @param string $url  URL to normalise.
 * @return string The rebuilt URL, or $url itself when it cannot be parsed.
 */
function clean_internal_url(string $base, string $url): string
{
    $parts = parse_url($url);
    if (!$parts) {
        return $url;
    }

    $path = $parts['path'] ?? '/';
    if (!$path) {
        $path = '/';
    }

    $query = '';
    if (isset($parts['query'])) {
        $query = '?' . $parts['query'];
    }

    return $base . $path . $query;
}
/**
 * Tell whether $url points at the site identified by $host.
 *
 * $host is normally the request's Host header, which may carry a port
 * ("localhost:8080"), whereas the host component parse_url() extracts from
 * a URL never does. Comparing the two verbatim made every internal link
 * look external on non-default ports, so the host name and the port are
 * compared separately:
 *  - host names must match case-insensitively;
 *  - when $host names a port, the URL's port must equal it (an absent URL
 *    port means 443 for https and 80 otherwise);
 *  - when $host names no port, the URL's port is not considered.
 *
 * @param string $url  Absolute URL to test.
 * @param string $host Host name, optionally followed by ":port".
 * @return bool True when $url belongs to that host.
 */
function same_host(string $url, string $host): bool
{
    $u = parse_url($url);
    if (!$u) {
        return false;
    }
    $urlHost = strtolower((string)($u['host'] ?? ''));

    // Let parse_url() split "name:port" (also copes with "[::1]:8080").
    $wanted = parse_url('//' . $host);
    if (!is_array($wanted) || !isset($wanted['host'])) {
        // Unparseable or empty $host: fall back to the plain comparison.
        return $urlHost === strtolower($host);
    }

    if ($urlHost !== strtolower((string)$wanted['host'])) {
        return false;
    }
    if (!isset($wanted['port'])) {
        return true;
    }

    $scheme = strtolower((string)($u['scheme'] ?? ''));
    $urlPort = $u['port'] ?? ($scheme === 'https' ? 443 : 80);

    return (int)$urlPort === (int)$wanted['port'];
}
/**
 * GET a URL with cURL and return its status, content type and body.
 *
 * Up to 5 redirects are followed. Response headers are requested together
 * with the body and then cut off using the header size cURL reports.
 *
 * @param string $url URL to fetch.
 * @return array On success: ok=true, code (HTTP status), ctype (Content-Type),
 *               body, err=''. On transport failure: ok=false, code=0,
 *               ctype='', body='', err = the cURL error text or 'fetch failed'.
 */
function http_fetch_html(string $url): array
{
    $options = [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => REQ_TIMEOUT,
        CURLOPT_TIMEOUT        => REQ_TIMEOUT,
        CURLOPT_USERAGENT      => USER_AGENT,
        CURLOPT_HEADER         => true,
    ];

    $handle = curl_init();
    curl_setopt_array($handle, $options);

    $response = curl_exec($handle);

    // Read everything needed from the handle before it is closed.
    $error       = curl_error($handle);
    $status      = (int)curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
    $contentType = (string)curl_getinfo($handle, CURLINFO_CONTENT_TYPE);
    $headerBytes = (int)curl_getinfo($handle, CURLINFO_HEADER_SIZE);
    curl_close($handle);

    if ($response !== false) {
        return [
            'ok'    => true,
            'code'  => $status,
            'ctype' => $contentType,
            'body'  => substr($response, $headerBytes),
            'err'   => '',
        ];
    }

    return [
        'ok'    => false,
        'code'  => 0,
        'ctype' => '',
        'body'  => '',
        'err'   => $error ?: 'fetch failed',
    ];
}
/**
 * Determine the HTTP status of a URL as cheaply as possible.
 *
 * A HEAD request is tried first. The check is repeated with a full GET
 * (via http_fetch_html()) when HEAD cannot be trusted:
 *  - the transfer failed or produced no status code, or
 *  - the server answered 405 (Method Not Allowed) or 501 (Not Implemented),
 *    i.e. it rejects HEAD itself rather than the resource. Reporting those
 *    as broken links would be a false positive.
 *
 * @param string $url URL to check.
 * @return array code = final HTTP status (0 when no response was obtained),
 *               err = transport error text, '' if none.
 */
function http_status(string $url): array
{
    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => REQ_TIMEOUT,
        CURLOPT_TIMEOUT        => REQ_TIMEOUT,
        CURLOPT_USERAGENT      => USER_AGENT,
        CURLOPT_NOBODY         => true, // HEAD
        CURLOPT_HEADER         => true,
    ]);
    $raw = curl_exec($ch);
    $err = curl_error($ch);
    $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    curl_close($ch);

    $headRejected = ($code === 405 || $code === 501);

    if ($raw === false || $code === 0 || $headRejected) {
        $g = http_fetch_html($url);
        return ['code' => (int)$g['code'], 'err' => (string)$g['err']];
    }

    return ['code' => $code, 'err' => $err ?: ''];
}
/**
 * Collect the href values of all <a> elements in an HTML document.
 *
 * Blank input is answered with an empty list without touching the parser:
 * on PHP 8 DOMDocument::loadHTML() throws a ValueError for an empty
 * string, which would otherwise abort the whole crawl on a page with an
 * empty body. The libxml error mode that was active before the call is
 * restored afterwards.
 *
 * @param string $html HTML source.
 * @return array List of non-empty href strings in document order, exactly
 *               as written in the markup (not resolved, not filtered).
 */
function extract_links(string $html): array
{
    if (trim($html) === '') {
        return [];
    }

    // Real-world markup is rarely valid; collect parser errors silently.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if ($loaded === false) {
        return [];
    }

    $links = [];
    foreach ($dom->getElementsByTagName('a') as $a) {
        $href = (string)$a->getAttribute('href');
        if ($href !== '') {
            $links[] = $href;
        }
    }

    return $links;
}
// -------- auth --------
// Both gates must pass before any crawling starts: the optional IP
// allowlist, then the shared token from ?run=...
if (!ok_ip()) {
    http_response_code(403);
    header('Content-Type: text/plain; charset=utf-8');
    echo "Forbidden (IP).\n";
    exit;
}

// The published placeholder token is public knowledge; refuse to run until
// it has been replaced, otherwise anyone can trigger crawls from this server.
if (ADMIN_TOKEN === 'CHANGE_ME_TOKEN') {
    http_response_code(403);
    header('Content-Type: text/plain; charset=utf-8');
    echo "Forbidden (set ADMIN_TOKEN first).\n";
    exit;
}

// ?run[]=x arrives as an array; treat anything but a string as "no token".
$tok = $_GET['run'] ?? '';
if (!is_string($tok)) {
    $tok = '';
}
if ($tok === '' || !hash_equals(ADMIN_TOKEN, $tok)) {
    http_response_code(403);
    header('Content-Type: text/plain; charset=utf-8');
    echo "Forbidden.\n";
    exit;
}

// -------- crawl --------
$base = site_base();
$host = (string)($_SERVER['HTTP_HOST'] ?? '');

// The start point must be a root-relative path on this site.
$start = $_GET['start'] ?? '/';
if (!is_string($start) || $start === '' || $start[0] !== '/') {
    $start = '/';
}
$startUrl = $base . $start;

// Breadth-first crawl state.
$queue = [['url' => $startUrl, 'depth' => 0, 'from' => $startUrl]];
$queued = [$startUrl => true]; // every URL ever enqueued, so none is queued twice
$seenPages = [];               // URLs already fetched as pages
$checked = [];                 // url => status code
$broken = [];                  // rows: status, link, foundOn, note
$pagesCrawled = 0;

while ($queue && $pagesCrawled < MAX_PAGES) {
    $item = array_shift($queue);
    $url = (string)$item['url'];
    $depth = (int)$item['depth'];

    if (isset($seenPages[$url])) {
        continue;
    }
    $seenPages[$url] = true;
    $pagesCrawled++;

    $res = http_fetch_html($url);
    $code = (int)$res['code'];
    $ctype = strtolower((string)$res['ctype']);

    if (!$res['ok'] || $code >= 400) {
        $broken[] = [
            'status'  => $code ?: 0,
            'link'    => $url,
            'foundOn' => (string)$item['from'],
            'note'    => $res['err'] ?: 'page fetch failed',
        ];
        continue;
    }

    // Only HTML documents can contain further links.
    if (strpos($ctype, 'text/html') === false) {
        continue;
    }

    foreach (extract_links((string)$res['body']) as $href) {
        if (should_skip_href($href)) {
            continue;
        }
        $abs = abs_url($base, $href, $url);
        if ($abs === '') {
            continue;
        }

        // Each distinct URL is status-checked once; a broken one is reported
        // against the first page it was seen on.
        if (!isset($checked[$abs])) {
            $st = http_status($abs);
            $checked[$abs] = (int)$st['code'];
            if ($checked[$abs] >= 400 || $checked[$abs] === 0) {
                $broken[] = [
                    'status'  => $checked[$abs],
                    'link'    => $abs,
                    'foundOn' => $url,
                    'note'    => $st['err'] ?: '',
                ];
            }
        }

        // A link that failed its check is already in the report; crawling
        // it as a page would only fail again and list it a second time.
        $status = $checked[$abs];
        if ($status === 0 || $status >= 400) {
            continue;
        }

        if ($depth < MAX_DEPTH && same_host($abs, $host)) {
            $next = clean_internal_url($base, $abs);
            if (!isset($queued[$next])) {
                $queued[$next] = true;
                $queue[] = ['url' => $next, 'depth' => $depth + 1, 'from' => $url];
            }
        }
    }
}
// -------- output --------
// Render the report: a summary card, then either a "nothing broken" note or
// one table row per entry in $broken. Every dynamic value goes through h()
// or is an integer.
header('Content-Type: text/html; charset=utf-8');
$checkedTotal = count($checked);
$brokenTotal = count($broken);
$brokenClass = $brokenTotal ? 'bad' : 'ok';
?><!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<title>Broken Link Crawler</title>
<meta name="robots" content="noindex,nofollow" />
<style>
:root{
--bg:#0b0f16; --panel:#111a26; --panel2:#0c1420;
--text:#e8eef8; --muted:rgba(232,238,248,.72);
--line:rgba(255,255,255,.10); --accent:#2a8cff;
--shadow:0 18px 55px rgba(0,0,0,.45); --r:16px;
--bad:#ff7c7c; --ok:#7cff9a;
}
*{box-sizing:border-box}
body{margin:0;background:var(--bg);color:var(--text);font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif}
.wrap{max-width:1100px;margin:0 auto;padding:18px}
.card{background:var(--panel);border:1px solid var(--line);border-radius:var(--r);padding:14px;margin:12px 0;box-shadow:var(--shadow)}
.muted{color:var(--muted)}
.pill{display:inline-block;padding:4px 10px;border-radius:999px;background:rgba(42,140,255,.14);border:1px solid rgba(42,140,255,.25);font-weight:900;font-size:12px}
.code{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace;font-size:12px}
.bad{color:var(--bad);font-weight:950}
.ok{color:var(--ok);font-weight:950}
table{width:100%;border-collapse:collapse}
th,td{padding:10px;border-bottom:1px solid rgba(255,255,255,.08);vertical-align:top}
th{text-align:left;font-size:12px;opacity:.75}
a{color:#8ad1ff;text-decoration:none}
a:hover{text-decoration:underline}
</style>
</head>
<body>
<div class="wrap">
<div class="card">
<div class="pill">Site Tool</div>
<h1 style="margin:10px 0 6px;font-size:22px">Broken Link Crawler</h1>
<div class="muted">
Start: <span class="code"><?= h($startUrl) ?></span><br>
Crawled pages: <b><?= (int)$pagesCrawled ?></b> •
Checked links: <b><?= $checkedTotal ?></b> •
Broken found: <b class="<?= $brokenClass ?>"><?= $brokenTotal ?></b>
</div>
<div class="muted" style="margin-top:8px;font-size:13px">
Tip: Keep this private. If you need a bigger crawl, increase <span class="code">MAX_PAGES</span> slowly.
</div>
</div>
<div class="card">
<?php if ($brokenTotal === 0): ?>
<div class="muted">No broken links found in this crawl window.</div>
<?php else: ?>
<table>
<thead>
<tr>
<th>Status</th>
<th>Broken link</th>
<th>Found on page</th>
<th>Note</th>
</tr>
</thead>
<tbody>
<?php foreach ($broken as $row):
    // Escape once; each value is used both as the href and as the link text.
    $linkHtml = h((string)$row['link']);
    $pageHtml = h((string)$row['foundOn']);
?>
<tr>
<td class="bad"><?= (int)$row['status'] ?></td>
<td class="code"><a href="<?= $linkHtml ?>" target="_blank" rel="noopener"><?= $linkHtml ?></a></td>
<td class="code"><a href="<?= $pageHtml ?>" target="_blank" rel="noopener"><?= $pageHtml ?></a></td>
<td class="muted"><?= h((string)$row['note']) ?></td>
</tr>
<?php endforeach; ?>
</tbody>
</table>
<?php endif; ?>
</div>
</div>
</body>
</html>
Comments (0)
No comments yet — be the first.