Broken Link Finder
A lightweight broken-link finder you can drop into /tools/broken-links/. It crawls your site starting from a URL, follows internal links, and reports broken pages and broken outbound links it encounters.
Why it’s useful: Broken links quietly kill UX and SEO. This script finds them fast, shows you where they were found, and outputs a clean table you can work through.
What it does:
- Crawls internal links (same-host) up to a max page limit.
- Checks status codes (200/301/404/500 etc.) for pages and links.
- Reports broken links with the page they were found on.
- Skips common junk (mailto:, tel:, javascript:, hash-only links).
- Not implemented yet: honoring robots meta nofollow (left out to keep the script minimal).
Install:
- Create the folder: /tools/broken-links/
- Save the script below as: /tools/broken-links/index.php
- Edit the ADMIN_TOKEN at the top.
- Run it: /tools/broken-links/?run=TOKEN
- Optional start URL: append &start=/
Tip: Start small (like / or a category page) and bump the limits when you’re confident. Crawling a huge site on shared hosting can be heavy.
<?php
declare(strict_types=1);
/**
* Tiny Broken Link Finder (No DB)
* File: /tools/broken-links/index.php
*
* Crawl your site and report broken links.
*
* Run:
* /tools/broken-links/?run=CHANGE_ME_TOKEN
* /tools/broken-links/?run=CHANGE_ME_TOKEN&start=/some-page
*/
// Response hardening for the report page: disable MIME sniffing and set the referrer policy.
header('X-Content-Type-Options: nosniff');
header('Referrer-Policy: strict-origin-when-cross-origin');
// Shared secret: the script answers 403 unless ?run=... equals this value. Change it before deploying.
const ADMIN_TOKEN = 'CHANGE_ME_TOKEN';
// Crawl limits
const MAX_PAGES = 250; // how many internal pages to crawl
const MAX_DEPTH = 6; // link depth
const REQ_TIMEOUT = 8; // seconds; used for both the connect timeout and the total request timeout
const USER_AGENT = 'TinyBrokenLinkFinder/1.0';
// Link filters
// hrefs starting with one of these (case-insensitive) are never checked.
const SKIP_PREFIXES = [
'mailto:', 'tel:', 'javascript:', 'data:', 'sms:',
];
// hrefs ending with one of these (compared in lowercase) are never checked.
const SKIP_EXTENSIONS = [
'.jpg','.jpeg','.png','.gif','.webp','.svg','.ico',
'.css','.js','.map',
'.pdf','.zip','.rar','.7z',
'.mp4','.webm','.mov','.mp3','.wav',
];
/**
 * Escape a string for output in HTML text or attribute context.
 *
 * ENT_SUBSTITUTE replaces invalid UTF-8 sequences with U+FFFD. Without it
 * htmlspecialchars() returns an empty string for such input, which would make
 * a crawled URL containing stray bytes disappear from the report.
 *
 * @param string $s Raw text.
 * @return string HTML-escaped text (both quote styles encoded).
 */
function h(string $s): string {
    return htmlspecialchars($s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}
/**
 * Report whether the current request reached the site over HTTPS.
 *
 * Looks at the server's own HTTPS flag first, then at the X-Forwarded-Proto
 * header that a proxy may have added.
 *
 * @return bool True when either indicator says HTTPS.
 */
function is_https(): bool {
    $flag = $_SERVER['HTTPS'] ?? '';
    if (!empty($flag) && $flag !== 'off') {
        return true;
    }

    $forwarded = $_SERVER['HTTP_X_FORWARDED_PROTO'] ?? '';
    // Prefix match, case-insensitive: the header value only has to start with "https".
    return !empty($forwarded) && stripos((string)$forwarded, 'https') === 0;
}
/**
 * Build the origin of the current request: scheme plus the Host header.
 *
 * @return string e.g. "https://example.com"; the host part is empty when the
 *                request carried no Host header.
 */
function site_base(): string {
    $scheme = 'http://';
    if (is_https()) {
        $scheme = 'https://';
    }

    return $scheme . ($_SERVER['HTTP_HOST'] ?? '');
}
/**
 * Turn an href found on a page into an absolute http(s) URL.
 *
 * @param string $base Either a bare origin ("https://site.com") or the full
 *                     URL of the page the href was found on. When it carries
 *                     a path, relative hrefs are resolved against that path's
 *                     directory; with a bare origin they resolve against the
 *                     site root.
 * @param string $href Raw href attribute value.
 * @return string Absolute URL without fragment, or '' when the href is empty,
 *                fragment-only, or uses a scheme other than http/https.
 */
function abs_url(string $base, string $href): string {
    $href = trim($href);
    if ($href === '') return '';

    // strip fragment
    $href = preg_replace('/#.*$/', '', $href) ?? $href;
    if ($href === '') return '';

    // already absolute
    if (preg_match('~^https?://~i', $href)) return $href;

    // Any other explicit scheme (ftp:, whatsapp:, skype:, ...) cannot be checked
    // over HTTP; gluing it onto the site root would only produce a bogus 404.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) return '';

    $parts = parse_url($base);
    $hasOrigin = is_array($parts) && isset($parts['scheme'], $parts['host']);

    // protocol-relative: inherit the scheme of the base when it is http(s),
    // otherwise fall back to the scheme of the current request.
    if (strpos($href, '//') === 0) {
        $scheme = $hasOrigin ? strtolower((string)$parts['scheme']) : '';
        if ($scheme !== 'http' && $scheme !== 'https') {
            $scheme = is_https() ? 'https' : 'http';
        }
        return $scheme . ':' . $href;
    }

    // Split the base into origin and path; an unparseable base is used verbatim.
    $origin = $base;
    $basePath = '/';
    if ($hasOrigin) {
        $origin = $parts['scheme'] . '://' . $parts['host']
            . (isset($parts['port']) ? ':' . $parts['port'] : '');
        if (isset($parts['path']) && $parts['path'] !== '') {
            $basePath = $parts['path'];
        }
    }

    // root-relative
    if ($href[0] === '/') return $origin . $href;

    // query-only: same document, different query string
    if ($href[0] === '?') return $origin . $basePath . $href;

    // relative: merge with the directory of the base path
    $queryPos = strpos($href, '?');
    $relPath = $queryPos === false ? $href : substr($href, 0, $queryPos);
    $query = $queryPos === false ? '' : substr($href, $queryPos);

    $slash = strrpos($basePath, '/');
    $dir = $slash === false ? '/' : substr($basePath, 0, $slash + 1);

    // Collapse "." and ".." segments; ".." never climbs above the root.
    $segments = explode('/', ltrim($dir . $relPath, '/'));
    $last = end($segments);
    $stack = [];
    foreach ($segments as $seg) {
        if ($seg === '.') continue;
        if ($seg === '..') {
            array_pop($stack);
            continue;
        }
        $stack[] = $seg;
    }
    $path = '/' . implode('/', $stack);
    // A trailing "." or ".." names a directory, so keep the trailing slash.
    if (($last === '.' || $last === '..') && substr($path, -1) !== '/') {
        $path .= '/';
    }

    return $origin . $path . $query;
}
/**
 * Reduce a URL to its path plus query string.
 *
 * Scheme, host, port and fragment are dropped; what is left is the
 * site-relative form of the URL.
 *
 * @param string $url URL to reduce.
 * @return string Path with optional "?query"; "/" when the URL cannot be
 *                parsed or has neither path nor query.
 */
function norm_internal_path(string $url): string {
    $parts = parse_url($url);
    if (empty($parts)) {
        return '/';
    }

    $query = $parts['query'] ?? null;
    $result = ($parts['path'] ?? '/') . ($query === null ? '' : '?' . $query);

    return $result === '' ? '/' : $result;
}
/**
 * Decide whether a URL points at the given host.
 *
 * $host may be a bare host name or "host:port" (the form a Host header takes
 * on a non-default port). parse_url() reports host and port separately, so a
 * plain string comparison would never match "localhost:8080".
 *
 * @param string $url  Absolute URL to test.
 * @param string $host Host name, optionally followed by ":port".
 * @return bool True when the host names match case-insensitively and, if
 *              $host names a port, the URL's explicit or scheme-default port
 *              matches as well.
 */
function same_host(string $url, string $host): bool {
    $u = parse_url($url);
    if (!$u) return false;

    $urlHost = strtolower((string)($u['host'] ?? ''));
    $wantHost = strtolower($host);

    // No port in $host: compare names only (a port in the URL is ignored).
    // The bracket alternative keeps IPv6 literals such as "[::1]" intact.
    if (!preg_match('~^(\[[^\]]+\]|[^:]+):(\d+)$~', $wantHost, $m)) {
        return $urlHost === $wantHost;
    }

    if ($urlHost !== $m[1]) return false;

    // The URL may omit the port; fall back to the default for its scheme.
    $defaultPorts = ['http' => 80, 'https' => 443];
    $scheme = strtolower((string)($u['scheme'] ?? ''));
    $urlPort = $u['port'] ?? ($defaultPorts[$scheme] ?? null);

    return $urlPort === (int)$m[2];
}
/**
 * Decide whether an href should be ignored by the crawler.
 *
 * Skipped: empty or "#" hrefs, hrefs starting with one of SKIP_PREFIXES
 * (case-insensitive), and hrefs whose path ends with one of SKIP_EXTENSIONS.
 *
 * @param string $href Raw href attribute value.
 * @return bool True when the link must not be checked or crawled.
 */
function should_skip_href(string $href): bool {
    $href = trim($href);
    if ($href === '' || $href === '#') return true;

    foreach (SKIP_PREFIXES as $p) {
        if (stripos($href, $p) === 0) return true;
    }

    // Test the extension on the path only. With the query string or fragment
    // still attached, "big.zip?token=abc" or "photo.jpg#main" would not match
    // and the asset would be downloaded in full.
    $path = strtolower(substr($href, 0, strcspn($href, '?#')));
    foreach (SKIP_EXTENSIONS as $ext) {
        if (substr($path, -strlen($ext)) === $ext) return true;
    }

    return false;
}
/**
 * GET a URL with cURL and return status, content type and body.
 *
 * Redirects are followed (at most 5). Response headers are requested along
 * with the body and cut off again using the header size cURL reports.
 *
 * @param string $url URL to fetch.
 * @return array{ok:bool,code:int,ctype:string,body:string,err:string}
 *         ok is false only when the transfer itself failed; code is then 0
 *         and err holds the cURL error text.
 */
function http_fetch(string $url): array {
    $handle = curl_init();
    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_MAXREDIRS, 5);
    curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, REQ_TIMEOUT);
    curl_setopt($handle, CURLOPT_TIMEOUT, REQ_TIMEOUT);
    curl_setopt($handle, CURLOPT_USERAGENT, USER_AGENT);
    curl_setopt($handle, CURLOPT_HEADER, true);
    curl_setopt($handle, CURLOPT_NOBODY, false);

    $response = curl_exec($handle);
    $error = curl_error($handle);
    $status = (int)curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
    $contentType = (string)curl_getinfo($handle, CURLINFO_CONTENT_TYPE);
    $headerBytes = (int)curl_getinfo($handle, CURLINFO_HEADER_SIZE);
    curl_close($handle);

    if ($response === false) {
        return [
            'ok' => false,
            'code' => 0,
            'ctype' => '',
            'body' => '',
            'err' => $error ?: 'fetch failed',
        ];
    }

    return [
        'ok' => true,
        'code' => $status,
        'ctype' => $contentType,
        // Everything after the header block is the payload.
        'body' => substr($response, $headerBytes),
        'err' => '',
    ];
}
/**
 * Determine the HTTP status of a URL as cheaply as possible.
 *
 * A HEAD request is tried first. The check is repeated with a full GET
 * (http_fetch) when the HEAD transfer fails, yields no status code, or is
 * answered with 403, 405 or 501. Servers that do not support HEAD, or block
 * it, answer that way even for pages that load fine, and reporting those
 * links as broken would be a false positive.
 *
 * @param string $url URL to check.
 * @return array{code:int,ctype:string,err:string} code is 0 when no response
 *         could be obtained at all.
 */
function http_head_status(string $url): array {
    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_CONNECTTIMEOUT => REQ_TIMEOUT,
        CURLOPT_TIMEOUT => REQ_TIMEOUT,
        CURLOPT_USERAGENT => USER_AGENT,
        CURLOPT_NOBODY => true, // makes this a HEAD request
        CURLOPT_HEADER => true,
    ]);
    $raw = curl_exec($ch);
    $err = curl_error($ch);
    $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $ctype = (string)curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    // 403 Forbidden, 405 Method Not Allowed, 501 Not Implemented: typical
    // answers from servers that reject HEAD specifically.
    $headRejected = in_array($code, [403, 405, 501], true);

    if ($raw === false || $code === 0 || $headRejected) {
        // fallback: let a GET decide
        $g = http_fetch($url);
        return ['code' => (int)$g['code'], 'ctype' => (string)$g['ctype'], 'err' => (string)$g['err']];
    }

    return ['code' => $code, 'ctype' => $ctype, 'err' => $err ?: ''];
}
/**
 * Collect the href values of all <a> elements in an HTML document.
 *
 * @param string $html Markup to scan; may be imperfect.
 * @return array List of non-empty href strings in document order (duplicates
 *               kept). Empty when the input is blank or cannot be parsed.
 */
function extract_links_from_html(string $html): array {
    // DOMDocument::loadHTML() throws a ValueError for an empty string on
    // PHP 8 (and warns on PHP 7); an empty response body must not abort the crawl.
    if (trim($html) === '') return [];

    $links = [];

    // Collect parser complaints internally instead of emitting warnings, and
    // remember the previous setting so it can be restored afterwards.
    $previous = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    if ($dom->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR)) {
        foreach ($dom->getElementsByTagName('a') as $a) {
            $href = (string)$a->getAttribute('href');
            if ($href !== '') $links[] = $href;
        }
    }

    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $links;
}
// -------- auth --------
// The token must arrive as a plain string (?run[]=x yields an array and is
// rejected). hash_equals() compares it without leaking timing information.
$tok = $_GET['run'] ?? '';
if (!is_string($tok) || $tok === '' || !hash_equals(ADMIN_TOKEN, $tok)) {
    http_response_code(403);
    header('Content-Type: text/plain; charset=utf-8');
    echo "Forbidden.\n";
    exit;
}

// -------- crawl setup --------
$base = site_base();
$host = $_SERVER['HTTP_HOST'] ?? '';

// Only site-relative start paths are accepted; anything else means "/".
$start = $_GET['start'] ?? '/';
if (!is_string($start) || $start === '' || $start[0] !== '/') $start = '/';
$startUrl = $base . $start;

$queue = [];
$seenPages = [];                 // URLs already fetched as pages
$queued = [$startUrl => true];   // URLs ever put on the queue (avoids duplicate entries)
$queue[] = ['url' => $startUrl, 'depth' => 0, 'from' => $startUrl];

$broken = [];        // rows: foundOn, link, status, note
$reported = [];      // "page\nlink" pairs already listed in $broken
$checkedLinks = [];  // cache by URL => [code, err]
$pagesCrawled = 0;

// Breadth-first crawl: the queue holds internal pages still to be fetched.
while ($queue && $pagesCrawled < MAX_PAGES) {
    $item = array_shift($queue);
    $url = (string)$item['url'];
    $depth = (int)$item['depth'];

    if (isset($seenPages[$url])) continue;
    $seenPages[$url] = true;
    $pagesCrawled++;

    $res = http_fetch($url);
    $code = (int)$res['code'];
    $ctype = strtolower((string)$res['ctype']);

    // If the page itself is broken, log and continue (still a useful find)
    if (!$res['ok'] || $code >= 400) {
        $broken[] = [
            'foundOn' => (string)$item['from'],
            'link' => $url,
            'status' => $code ?: 0,
            'note' => $res['err'] ?: 'page fetch failed',
        ];
        continue;
    }

    // only parse HTML
    if (strpos($ctype, 'text/html') === false) continue;

    // Path and directory of the current page, used to resolve relative hrefs.
    // This is the requested URL; if the server redirected, the final location
    // is not known here.
    $pagePath = (string)(parse_url($url, PHP_URL_PATH) ?: '/');
    $pageDir = substr($pagePath, 0, (int)strrpos($pagePath, '/') + 1);

    $hrefs = extract_links_from_html((string)$res['body']);
    foreach ($hrefs as $href) {
        if (should_skip_href($href)) continue;

        // Rewrite page-relative hrefs ("next.html", "../x", "?page=2") as
        // root-relative ones so they resolve against the page they were found
        // on instead of against the site root.
        $href = trim($href);
        $first = $href[0] ?? '';
        $hasScheme = preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href) === 1;
        if (!$hasScheme && $first !== '' && $first !== '/' && $first !== '#') {
            $href = ($first === '?' ? $pagePath : $pageDir) . $href;
        }

        $abs = abs_url($base, $href);
        if ($abs === '') continue;

        // check status (cache)
        if (!isset($checkedLinks[$abs])) {
            $st = http_head_status($abs);
            $checkedLinks[$abs] = [
                'code' => (int)$st['code'],
                'err' => (string)$st['err'],
            ];
        }

        $stc = (int)$checkedLinks[$abs]['code'];
        if ($stc >= 400 || $stc === 0) {
            // List each (page, link) pair once even if the page repeats the link.
            $rowKey = $url . "\n" . $abs;
            if (!isset($reported[$rowKey])) {
                $reported[$rowKey] = true;
                $broken[] = [
                    'foundOn' => $url,
                    'link' => $abs,
                    'status' => $stc,
                    'note' => $checkedLinks[$abs]['err'] ?: '',
                ];
            }
            // Do not crawl a link already known to be broken: fetching it again
            // would only add a second row for the same problem.
            continue;
        }

        // if internal, enqueue for crawling
        if ($depth < MAX_DEPTH && same_host($abs, (string)$host)) {
            $nextUrl = $base . norm_internal_path($abs);
            if (!isset($seenPages[$nextUrl]) && !isset($queued[$nextUrl])) {
                $queued[$nextUrl] = true;
                $queue[] = ['url' => $nextUrl, 'depth' => $depth + 1, 'from' => $url];
            }
        }
    }
}
// -------- output --------
// Renders the crawl summary and the table of broken links.
// Reads $startUrl, $pagesCrawled, $checkedLinks and $broken; every dynamic
// value goes through h() or an int cast before it reaches the page.
header('Content-Type: text/html; charset=utf-8');
?><!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Tiny Broken Link Finder</title>
<meta name="robots" content="noindex,nofollow">
<style>
:root{
--bg:#0b0f16; --panel:#111a26; --panel2:#0c1420;
--text:#e8eef8; --muted:rgba(232,238,248,.72);
--line:rgba(255,255,255,.10); --accent:#2a8cff;
--shadow:0 18px 55px rgba(0,0,0,.45); --r:16px;
--bad:#ff7c7c; --ok:#7cff9a;
}
*{box-sizing:border-box}
body{margin:0;background:var(--bg);color:var(--text);font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif}
.wrap{max-width:1100px;margin:0 auto;padding:18px}
.card{background:var(--panel);border:1px solid var(--line);border-radius:var(--r);padding:14px;margin:12px 0;box-shadow:var(--shadow)}
.muted{color:var(--muted)}
.pill{display:inline-block;padding:4px 10px;border-radius:999px;background:rgba(42,140,255,.14);border:1px solid rgba(42,140,255,.25);font-weight:900;font-size:12px}
table{width:100%;border-collapse:collapse}
th,td{padding:10px;border-bottom:1px solid rgba(255,255,255,.08);vertical-align:top}
th{text-align:left;font-size:12px;opacity:.75}
.code{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace;font-size:12px}
.bad{color:var(--bad);font-weight:900}
.ok{color:var(--ok);font-weight:900}
a{color:#8ad1ff;text-decoration:none}
a:hover{text-decoration:underline}
</style>
</head>
<body>
<div class="wrap">
<div class="card">
<div class="pill">Tool</div>
<h1 style="margin:10px 0 6px;font-size:22px">Tiny Broken Link Finder</h1>
<div class="muted">
Start: <span class="code"><?php echo h($startUrl); ?></span><br>
Crawled pages: <b><?php echo (int)$pagesCrawled; ?></b> •
Checked links: <b><?php echo (int)count($checkedLinks); ?></b> •
Broken found: <b class="<?php echo count($broken)?'bad':'ok'; ?>"><?php echo (int)count($broken); ?></b>
</div>
<div class="muted" style="margin-top:8px;font-size:13px">
Tip: Keep this private. It’s a crawler—running it too often on a big site can be heavy.
</div>
</div>
<div class="card">
<?php if (!$broken): ?>
<div class="muted">No broken links found in this crawl window.</div>
<?php else: ?>
<table>
<thead>
<tr>
<th>Status</th>
<th>Broken link</th>
<th>Found on page</th>
<th>Note</th>
</tr>
</thead>
<tbody>
<?php /* This page's own URL contains the access token, so report links use rel="noreferrer" to keep it out of the Referer header. */ ?>
<?php foreach ($broken as $b): ?>
<tr>
<td class="bad"><?php echo (int)$b['status']; ?></td>
<td class="code"><a href="<?php echo h((string)$b['link']); ?>" target="_blank" rel="noopener noreferrer"><?php echo h((string)$b['link']); ?></a></td>
<td class="code"><a href="<?php echo h((string)$b['foundOn']); ?>" target="_blank" rel="noopener noreferrer"><?php echo h((string)$b['foundOn']); ?></a></td>
<td class="muted"><?php echo h((string)$b['note']); ?></td>
</tr>
<?php endforeach; ?>
</tbody>
</table>
<?php endif; ?>
</div>
<div class="card">
<div class="muted" style="font-size:13px;line-height:1.55">
<b>Want to level it up?</b>
<ul style="margin:8px 0 0; padding-left:18px">
<li>Export to CSV.</li>
<li>Respect robots.txt.</li>
<li>Store results and compare crawls over time.</li>
</ul>
</div>
</div>
</div>
</body>
</html>
Comments (0)
No comments yet — be the first.