AI Visibility Preflight (Answer Engine Readiness Scanner)
A lightweight “AI visibility” preflight tool you can host on your own site. It scans a URL and flags common issues that prevent AI search/answer engines (and regular crawlers) from understanding or surfacing your pages.
What it does:
- Checks /robots.txt for common AI/search bot tokens (and tells you if you’re blocking them).
- Finds a sitemap via Sitemap: lines or /sitemap.xml.
- Looks for /llms.txt (optional AI-friendly summary file).
- Scans the page HTML for: canonical, meta robots/noindex, title, meta description, Open Graph, and JSON-LD schema.
- Gives fix hints + ready-to-copy snippets.
Install:
- Create: /tools/ai-visibility-preflight/
- Save the script below as: /tools/ai-visibility-preflight/index.php
- Visit: /tools/ai-visibility-preflight/ and scan any URL
👉 Run it here: /tools/ai-visibility-preflight/
<?php
declare(strict_types=1);
/**
* AI Visibility Preflight (No DB)
* - One-file scanner (robots.txt, sitemap, llms.txt, meta/canonical/schema)
* - SSRF guard (blocks private/reserved IPs)
* - Small cache to avoid hammering targets
*/
// Response hardening: no MIME sniffing, and only the origin is sent as referrer cross-site.
header('X-Content-Type-Options: nosniff');
header('Referrer-Policy: strict-origin-when-cross-origin');

// Scanner limits shared by every fetch.
$CFG = [
    'timeout'   => 8,        // seconds per request
    'max_bytes' => 450000,   // body size cap per fetched resource
    'cache_ttl' => 10 * 60,  // seconds a cached response stays fresh (10 min)
];
/**
 * Escape a string for output in HTML text or attribute context.
 *
 * ENT_SUBSTITUTE replaces invalid UTF-8 sequences with U+FFFD. Without it,
 * htmlspecialchars() returns an empty string for the whole input, which would
 * silently blank values that come from remote files (e.g. a sitemap URL).
 *
 * @param string $s Raw text.
 * @return string HTML-escaped text.
 */
function h(string $s): string {
    return htmlspecialchars($s, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}
/**
 * Tell whether a string is a valid IP address outside the private and reserved ranges.
 *
 * @param string $ip Candidate IPv4/IPv6 address.
 * @return bool True only when the address validates with both range filters applied.
 */
function is_public_ip(string $ip): bool {
    $rangeFlags = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
    $validated = filter_var($ip, FILTER_VALIDATE_IP, $rangeFlags);

    return $validated !== false;
}
/**
 * Check that a host maps only to public IP addresses.
 *
 * IP-literal hosts (including bracketed IPv6 as returned by parse_url()) are
 * validated directly: they have no DNS records and gethostbyname() returns
 * them unchanged, so the lookup path below would reject every literal, public
 * or not.
 *
 * For names, A and AAAA records are collected (falling back to
 * gethostbyname() when none are found) and every address must be public.
 *
 * @param string $host Host name or IP literal.
 * @return bool False when nothing resolves or any address is private/reserved.
 */
function resolve_public(string $host): bool {
    $literal = $host;
    if (strlen($host) >= 2 && $host[0] === '[' && substr($host, -1) === ']') {
        $literal = substr($host, 1, -1);
    }
    if (filter_var($literal, FILTER_VALIDATE_IP) !== false) {
        return is_public_ip($literal);
    }

    $ips = [];
    if (function_exists('dns_get_record')) {
        // Lookups are best-effort: failures emit warnings, hence the suppression.
        $a = @dns_get_record($host, DNS_A) ?: [];
        foreach ($a as $r) {
            if (!empty($r['ip'])) $ips[] = (string)$r['ip'];
        }
        $aaaa = @dns_get_record($host, DNS_AAAA) ?: [];
        foreach ($aaaa as $r) {
            if (!empty($r['ipv6'])) $ips[] = (string)$r['ipv6'];
        }
    }
    if (!$ips) {
        // gethostbyname() returns its input unchanged when resolution fails.
        $ip = @gethostbyname($host);
        if (is_string($ip) && $ip !== $host) $ips[] = $ip;
    }

    $ips = array_values(array_unique(array_filter($ips)));
    if (!$ips) return false;

    // One private/reserved address is enough to refuse the host.
    foreach ($ips as $ip) {
        if (!is_public_ip($ip)) return false;
    }
    return true;
}
/**
 * Normalise user input into a URL string.
 *
 * Surrounding whitespace is removed and, unless the value already starts with
 * http:// or https:// (case-insensitive), "https://" is prepended so users can
 * paste a bare host/path. An empty value stays empty.
 *
 * @param string $u Raw user input.
 * @return string Trimmed URL, with a scheme when non-empty.
 */
function norm_url(string $u): string {
    $trimmed = trim($u);
    if ($trimmed === '') {
        return $trimmed;
    }

    $hasScheme = preg_match('~^https?://~i', $trimmed) === 1;

    return $hasScheme ? $trimmed : 'https://' . $trimmed;
}
/**
 * Decide whether a URL may be fetched by the scanner.
 *
 * The URL must pass FILTER_VALIDATE_URL, use the http or https scheme, have a
 * non-empty host, and that host must pass resolve_public().
 *
 * @param string $u Candidate URL.
 * @return bool True when every check passes.
 */
function is_safe_url(string $u): bool {
    if (filter_var($u, FILTER_VALIDATE_URL) === false) {
        return false;
    }

    $parts = parse_url($u);
    if (!$parts) {
        return false;
    }

    $scheme = strtolower((string)($parts['scheme'] ?? ''));
    if (!in_array($scheme, ['http', 'https'], true)) {
        return false;
    }

    $hostname = (string)($parts['host'] ?? '');

    return $hostname !== '' && resolve_public($hostname);
}
/**
 * Build the cache file path for a key.
 *
 * @param string $key Cache key (callers pass the URL).
 * @return string Path of the form <temp dir>/aivis_<sha1 of key>.json.
 */
function cache_path(string $key): string {
    $fileName = sprintf('aivis_%s.json', sha1($key));

    return implode(DIRECTORY_SEPARATOR, [sys_get_temp_dir(), $fileName]);
}
/**
 * Fetch a URL over HTTP(S) with a timeout, a body-size cap and validated redirects.
 *
 * Redirects are followed by hand (at most 3 hops) so that every hop goes
 * through is_safe_url() before it is requested. Letting the HTTP client follow
 * Location headers itself would let a public host bounce the scanner to a
 * private/reserved address. Bodies larger than $maxBytes are truncated and
 * still reported as a successful fetch.
 *
 * NOTE(review): the host is resolved once by is_safe_url() and again by the
 * HTTP client, so a DNS answer that changes between the two is not caught.
 *
 * @param string $url      Absolute http/https URL, already validated by the caller.
 * @param int    $timeout  Connect and total timeout per hop, in seconds.
 * @param int    $maxBytes Maximum number of body bytes to keep.
 * @return array{ok:bool,status:int,final:string,headers:array,body:string,error:string}
 *         'final' is the last URL requested; 'headers' holds the raw header
 *         lines of the last response.
 */
function fetch_url(string $url, int $timeout, int $maxBytes): array {
    $ua = 'AI-Visibility-Preflight/1.0 (+local-tool)';
    $maxRedirects = 3;
    $useCurl = function_exists('curl_init');
    $current = $url;

    for ($hops = 0; ; $hops++) {
        $res = $useCurl
            ? fetch_url_once_curl($current, $timeout, $maxBytes, $ua)
            : fetch_url_once_stream($current, $timeout, $maxBytes, $ua);
        $res['final'] = $current;

        if (!$res['ok'] || !in_array($res['status'], [301, 302, 303, 307, 308], true)) {
            return $res;
        }

        $location = '';
        foreach ($res['headers'] as $line) {
            if (stripos($line, 'location:') === 0) {
                $location = trim(substr($line, strlen('location:')));
                break;
            }
        }
        if ($location === '') {
            return $res; // redirect status without a target: report it as-is
        }

        $problem = '';
        $next = '';
        if ($hops >= $maxRedirects) {
            $problem = 'Too many redirects';
        } else {
            $next = fetch_url_redirect_target($current, $location);
            if ($next === '' || !is_safe_url($next)) {
                $problem = 'Redirect target is not allowed';
            }
        }
        if ($problem !== '') {
            return [
                'ok' => false,
                'status' => $res['status'],
                'final' => $current,
                'headers' => $res['headers'],
                'body' => '',
                'error' => $problem,
            ];
        }
        $current = $next;
    }
}

/** Perform one cURL request without following redirects; the body is capped at $maxBytes. */
function fetch_url_once_curl(string $url, int $timeout, int $maxBytes, string $ua): array {
    $headers = [];
    $body = '';
    $truncated = false;

    $ch = curl_init($url);
    if ($ch === false) {
        return ['ok'=>false,'status'=>0,'final'=>$url,'headers'=>[],'body'=>'','error'=>'Could not initialise cURL'];
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => false,
        CURLOPT_FOLLOWLOCATION => false, // redirects are validated and followed by fetch_url()
        CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_CONNECTTIMEOUT => $timeout,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_USERAGENT => $ua,
        CURLOPT_HEADERFUNCTION => static function ($ch, $line) use (&$headers): int {
            $headers[] = rtrim($line, "\r\n");
            return strlen($line);
        },
        CURLOPT_WRITEFUNCTION => static function ($ch, $chunk) use (&$body, &$truncated, $maxBytes): int {
            $room = $maxBytes - strlen($body);
            if (strlen($chunk) > $room) {
                // Keep what still fits, then abort the transfer by reporting a short write.
                $body .= substr($chunk, 0, max(0, $room));
                $truncated = true;
                return 0;
            }
            $body .= $chunk;
            return strlen($chunk);
        },
    ]);

    $ok = curl_exec($ch);
    $err = '';
    // Aborting at the size cap makes curl_exec() fail; that case is a truncated success.
    if ($ok === false && !$truncated) {
        $err = (string)curl_error($ch);
        if ($err === '') $err = 'Fetch failed';
    }
    $status = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    // The handle is released when $ch goes out of scope.

    return ['ok'=>($err === ''),'status'=>$status,'final'=>$url,'headers'=>$headers,'body'=>$body,'error'=>$err];
}

/** Perform one request through the HTTP stream wrapper (no cURL) without following redirects. */
function fetch_url_once_stream(string $url, int $timeout, int $maxBytes, string $ua): array {
    $ctx = stream_context_create([
        // ignore_errors keeps 4xx/5xx bodies and headers so the status can be reported.
        'http' => ['timeout'=>$timeout,'user_agent'=>$ua,'follow_location'=>0,'ignore_errors'=>true],
        'ssl' => ['verify_peer'=>true,'verify_peer_name'=>true],
    ]);
    // The length argument stops reading at the cap instead of buffering the whole body.
    $raw = @file_get_contents($url, false, $ctx, 0, max(0, $maxBytes));
    if ($raw === false) {
        return ['ok'=>false,'status'=>0,'final'=>$url,'headers'=>[],'body'=>'','error'=>'Fetch failed'];
    }

    $headers = function_exists('http_get_last_response_headers')
        ? (http_get_last_response_headers() ?? [])
        : ($http_response_header ?? []);

    $status = 0;
    foreach ($headers as $line) {
        if (preg_match('~^HTTP/\S+\s+(\d{3})~', (string)$line, $m)) {
            $status = (int)$m[1];
        }
    }

    return ['ok'=>true,'status'=>$status,'final'=>$url,'headers'=>$headers,'body'=>$raw,'error'=>''];
}

/** Turn a Location header value into an absolute URL relative to $base; '' when that is not possible. */
function fetch_url_redirect_target(string $base, string $location): string {
    $location = trim($location);
    if ($location === '') return '';

    // Already absolute: the caller vets scheme and host through is_safe_url().
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $location)) return $location;

    $b = parse_url($base);
    if (!$b || empty($b['scheme']) || empty($b['host'])) return '';

    $origin = $b['scheme'] . '://' . $b['host'] . (isset($b['port']) ? ':' . $b['port'] : '');
    $path = (string)($b['path'] ?? '/');
    if ($path === '') $path = '/';

    if (strncmp($location, '//', 2) === 0) return $b['scheme'] . ':' . $location; // scheme-relative
    if ($location[0] === '/') return $origin . $location;                         // absolute path
    if ($location[0] === '?') return $origin . $path . $location;                 // query only

    // Relative path: replace the last segment of the base path.
    $dir = substr($path, 0, (int)strrpos($path, '/') + 1);
    return $origin . ($dir === '' ? '/' : $dir) . $location;
}
/**
 * Fetch a URL through a small file cache keyed by the URL.
 *
 * A cached entry is used only when it is younger than $CFG['cache_ttl'] and has
 * the full result shape. Cache files are read back from disk, so their content
 * is treated as untrusted: a damaged or foreign file falls through to a fresh
 * fetch instead of returning a value callers cannot index safely.
 *
 * @param string $url Absolute URL to fetch.
 * @param array  $CFG Settings; reads 'timeout', 'max_bytes' and 'cache_ttl'.
 * @return array Result in the shape produced by fetch_url().
 */
function cached_fetch(string $url, array $CFG): array {
    $cp = cache_path($url);
    $ttl = (int)$CFG['cache_ttl'];
    $shape = ['ok' => 1, 'status' => 1, 'final' => 1, 'headers' => 1, 'body' => 1, 'error' => 1];

    if (is_file($cp)) {
        $raw = @file_get_contents($cp);
        $j = ($raw !== false) ? json_decode($raw, true) : null;
        $fresh = is_array($j) && !empty($j['t']) && (time() - (int)$j['t'] < $ttl);
        if ($fresh
            && isset($j['v']) && is_array($j['v'])
            && !array_diff_key($shape, $j['v'])
            && is_array($j['v']['headers'])
        ) {
            return $j['v'];
        }
    }

    $v = fetch_url($url, (int)$CFG['timeout'], (int)$CFG['max_bytes']);

    // Bodies are arbitrary bytes; substituting invalid UTF-8 keeps json_encode() from
    // failing, which previously left an empty, never-hit cache file behind.
    $json = json_encode(['t'=>time(),'v'=>$v], JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE);
    if ($json !== false) {
        // Best-effort write; LOCK_EX avoids interleaved writes from concurrent scans.
        @file_put_contents($cp, $json, LOCK_EX);
    }
    return $v;
}
/**
 * Reduce a URL to scheme://host[:port].
 *
 * The scheme defaults to "https" when absent, and the input is returned
 * unchanged when parse_url() yields nothing usable.
 *
 * @param string $url URL to reduce.
 * @return string Site root without a trailing slash.
 */
function site_root(string $url): string {
    $parts = parse_url($url);
    if (!$parts) {
        return $url;
    }

    $scheme = (string)($parts['scheme'] ?? 'https');
    $origin = $scheme . '://' . (string)($parts['host'] ?? '');

    $port = (string)($parts['port'] ?? '');
    if ($port !== '') {
        $origin .= ':' . $port;
    }

    return $origin;
}
/**
 * Look up a response header by name (case-insensitive) in raw header lines.
 *
 * A header may legitimately appear more than once (for example several
 * X-Robots-Tag lines). All non-empty values are returned joined with ", " so
 * that a directive on a later line is not missed.
 *
 * @param array  $headers Raw header lines such as "Name: value"; entries that
 *                        are not strings or contain no colon are skipped.
 * @param string $key     Header name to look for.
 * @return string Combined value, or '' when the header is absent.
 */
function header_value(array $headers, string $key): string {
    $wanted = strtolower($key);
    $values = [];

    foreach ($headers as $line) {
        if (!is_string($line)) continue;

        $pos = strpos($line, ':');
        if ($pos === false) continue;

        if (strtolower(trim(substr($line, 0, $pos))) !== $wanted) continue;

        $value = trim(substr($line, $pos + 1));
        if ($value !== '') $values[] = $value;
    }

    return implode(', ', $values);
}
/**
 * Parse robots.txt text into user-agent groups.
 *
 * Each group is ['uas' => lower-cased tokens, 'rules' => [[allow|disallow, path], ...],
 * 'sitemaps' => URLs seen while that group was current]. Parsing choices:
 *  - A leading UTF-8 BOM is dropped and CRLF, CR and LF all count as line endings.
 *  - Blank lines and comment lines are ignored; they do not end a group. A new
 *    group starts only at a user-agent line that follows at least one rule.
 *  - Sitemap lines are kept even when no user-agent line precedes them, so a
 *    group with an empty 'uas' list can be returned just to carry sitemaps.
 *  - Allow/Disallow lines that appear before any user-agent line are ignored.
 *
 * @param string $txt Raw robots.txt content.
 * @return array List of groups in file order.
 */
function parse_robots_groups(string $txt): array {
    if (strncmp($txt, "\xEF\xBB\xBF", 3) === 0) {
        $txt = substr($txt, 3);
    }
    $lines = preg_split('/\r\n|\r|\n/', $txt) ?: [];

    $blank = ['uas'=>[], 'rules'=>[], 'sitemaps'=>[]];
    $groups = [];
    $cur = $blank;
    $seenRule = false;

    foreach ($lines as $line) {
        // Drop a trailing or whole-line comment ("#" at line start or after whitespace).
        $line = trim((string)preg_replace('/(?:^|\s+)#.*$/', '', $line));
        if ($line === '') continue;
        if (!preg_match('/^(user-agent|allow|disallow|sitemap)\s*:\s*(.*)$/i', $line, $m)) continue;

        $field = strtolower($m[1]);
        $value = trim($m[2]);

        if ($field === 'user-agent') {
            // A user-agent line after rules opens a new group; consecutive ones share a group.
            if ($seenRule) {
                if ($cur['uas'] || $cur['sitemaps']) $groups[] = $cur;
                $cur = $blank;
                $seenRule = false;
            }
            if ($value !== '') $cur['uas'][] = strtolower($value);
        } elseif ($field === 'sitemap') {
            if ($value !== '') $cur['sitemaps'][] = $value;
        } elseif ($cur['uas']) {
            $seenRule = true;
            $cur['rules'][] = [$field, $value];
        }
    }

    if ($cur['uas'] || $cur['sitemaps']) $groups[] = $cur;
    return $groups;
}
/**
 * Report whether robots.txt blocks the whole site for a given crawler token.
 *
 * Simplified check: it only looks for "Disallow: /" and an overriding
 * "Allow: /" (or empty Allow), not full path precedence. Groups naming the
 * token are preferred over "*" groups, and when several groups match their
 * rules are combined, since a crawler applies all of them together.
 *
 * @param string $robotsTxt Raw robots.txt content ('' when unavailable).
 * @param string $uaToken   Crawler product token; matched case-insensitively.
 * @return array{state:string,note:string} state is 'blocked', 'allowed' or 'unknown'.
 */
function robots_status(string $robotsTxt, string $uaToken): array {
    $uaToken = strtolower($uaToken);

    $exactRules = null;
    $starRules = null;
    foreach (parse_robots_groups($robotsTxt) as $g) {
        if (in_array($uaToken, $g['uas'], true)) {
            $exactRules = array_merge($exactRules ?? [], $g['rules']);
        }
        if (in_array('*', $g['uas'], true)) {
            $starRules = array_merge($starRules ?? [], $g['rules']);
        }
    }

    $rules = $exactRules ?? $starRules;
    if ($rules === null) {
        return ['state'=>'unknown', 'note'=>'No matching group found (default allow)'];
    }

    $disAll = false;
    $allowRoot = false;
    foreach ($rules as [$k, $v]) {
        if ($k === 'disallow' && $v === '/') $disAll = true;
        if ($k === 'allow' && ($v === '/' || $v === '')) $allowRoot = true;
    }

    if ($disAll && !$allowRoot) {
        return ['state'=>'blocked', 'note'=>'Disallow: / detected'];
    }
    return ['state'=>'allowed', 'note'=>'No full-site block detected'];
}
/**
 * Return the first capture group of a pattern's first match.
 *
 * @param string $html    Text to search.
 * @param string $pattern PCRE pattern with at least one capture group.
 * @return string Captured text, or '' when there is no match or no group 1.
 */
function find_first(string $html, string $pattern): string {
    $captures = [];
    $matched = preg_match($pattern, $html, $captures);
    if (!$matched) {
        return '';
    }

    return (string)($captures[1] ?? '');
}
/*
 * Request handling: read ?u=, validate it, fetch the page plus /robots.txt,
 * /llms.txt and a sitemap, then collect every signal into $res for the template.
 */
$rawInput = $_GET['u'] ?? '';
// A query like ?u[]=x makes $_GET['u'] an array; anything but a string counts as "no input".
$u = is_string($rawInput) ? norm_url($rawInput) : '';
$scan = ($u !== '');
$err = '';
$res = null;

// True when a fetch result exists, succeeded and ended with a 2xx status.
$fetched2xx = static function (?array $r): bool {
    if (!$r || empty($r['ok'])) return false;
    $status = (int)($r['status'] ?? 0);
    return $status >= 200 && $status < 300;
};

// Read attribute $wantAttr from the first <$tag> whose $matchAttr equals $matchValue.
// Works for any attribute order and keeps apostrophes inside double-quoted values.
$tagAttr = static function (string $html, string $tag, string $matchAttr, string $matchValue, string $wantAttr): string {
    if (!preg_match_all('~<' . $tag . '\s[^>]*>~i', $html, $tags)) return '';

    $is = '~(?<![\w-])' . preg_quote($matchAttr, '~') . '\s*=\s*["\']' . preg_quote($matchValue, '~') . '["\']~i';
    $get = '~(?<![\w-])' . preg_quote($wantAttr, '~') . '\s*=\s*(?:"([^"]*)"|\'([^\']*)\')~i';

    foreach ($tags[0] as $t) {
        if (!preg_match($is, $t) || !preg_match($get, $t, $m)) continue;
        $val = trim($m[1] !== '' ? $m[1] : (string)($m[2] ?? ''));
        if ($val !== '') return $val;
    }
    return '';
};

if ($scan) {
    if (!is_safe_url($u)) {
        $err = 'That URL is not allowed (must be public http/https and not resolve to private/reserved IPs).';
    } else {
        $root = site_root($u);
        $page = cached_fetch($u, $CFG);
        $robots = cached_fetch($root . '/robots.txt', $CFG);
        $llms = cached_fetch($root . '/llms.txt', $CFG);

        // Only a 2xx body is a real robots.txt; a 404/403 error page must not be parsed as rules.
        $robotsTxt = $fetched2xx($robots) ? (string)$robots['body'] : '';
        $groups = $robotsTxt !== '' ? parse_robots_groups($robotsTxt) : [];

        // Sitemap candidates: the first Sitemap: line from robots.txt, else /sitemap.xml.
        $sitemaps = [];
        foreach ($groups as $g) {
            foreach ($g['sitemaps'] as $sm) $sitemaps[] = $sm;
        }
        $sitemaps = array_values(array_unique(array_filter($sitemaps)));

        $sitemapHit = null;
        $sitemapUrl = '';
        if ($sitemaps) {
            $sitemapUrl = $sitemaps[0];
            // The URL comes from a remote file, so it gets the same safety check as user input.
            if (is_safe_url($sitemapUrl)) $sitemapHit = cached_fetch($sitemapUrl, $CFG);
        } else {
            $sitemapUrl = $root . '/sitemap.xml';
            $sitemapHit = cached_fetch($sitemapUrl, $CFG);
        }

        $html = (string)($page['body'] ?? '');
        $xrobots = header_value((array)($page['headers'] ?? []), 'x-robots-tag');

        $metaRobots = $tagAttr($html, 'meta', 'name', 'robots', 'content');
        $canonical = $tagAttr($html, 'link', 'rel', 'canonical', 'href');
        $title = trim(strip_tags(find_first($html, '~<title[^>]*>(.*?)</title>~is')));
        $desc = $tagAttr($html, 'meta', 'name', 'description', 'content');
        $ogTitle = $tagAttr($html, 'meta', 'property', 'og:title', 'content');
        $ogDesc = $tagAttr($html, 'meta', 'property', 'og:description', 'content');

        $jsonldCount = 0;
        if ($html !== '') {
            $jsonldCount = (int)preg_match_all('~<script[^>]*type=["\']application/ld\+json["\'][^>]*>~i', $html, $m);
        }

        $metaNoindex = (stripos($metaRobots, 'noindex') !== false);
        $hdrNoindex = (stripos($xrobots, 'noindex') !== false);

        $bots = [
            'OAI-SearchBot' => robots_status($robotsTxt, 'oai-searchbot'),
            'GPTBot' => robots_status($robotsTxt, 'gptbot'),
            'ChatGPT-User' => robots_status($robotsTxt, 'chatgpt-user'),
            'PerplexityBot' => robots_status($robotsTxt, 'perplexitybot'),
            'Perplexity-User' => robots_status($robotsTxt, 'perplexity-user'),
            'Google-Extended' => robots_status($robotsTxt, 'google-extended'),
        ];

        $res = [
            'url' => $u,
            'root' => $root,
            'page_status' => (int)($page['status'] ?? 0),
            'robots_status' => (int)($robots['status'] ?? 0),
            'llms_status' => (int)($llms['status'] ?? 0),
            'sitemap_status' => (int)($sitemapHit['status'] ?? 0),
            'sitemap_url' => $sitemapUrl,
            'bots' => $bots,
            'meta_noindex' => $metaNoindex,
            'hdr_noindex' => $hdrNoindex,
            'meta_robots' => $metaRobots,
            'xrobots' => $xrobots,
            'canonical' => $canonical,
            'title' => $title,
            'desc' => $desc,
            'og_title' => $ogTitle,
            'og_desc' => $ogDesc,
            'jsonld_count' => $jsonldCount,
            'robots_txt_ok' => ($robotsTxt !== ''),
            'llms_ok' => ($fetched2xx($llms) && trim((string)($llms['body'] ?? '')) !== ''),
            'sitemap_ok' => $fetched2xx($sitemapHit),
        ];
    }
}
?>
<style>
body{margin:0;font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif;background:#0b0f16;color:#e8eef8}
.wrap{max-width:980px;margin:0 auto;padding:18px}
.card{background:#111a26;border:1px solid rgba(255,255,255,.08);border-radius:14px;padding:14px;margin:12px 0}
input{width:100%;padding:12px;border-radius:12px;border:1px solid rgba(255,255,255,.14);background:#0c1420;color:#e8eef8}
button{padding:10px 12px;border:0;border-radius:12px;background:#2a8cff;color:#08121e;font-weight:800;cursor:pointer}
.muted{color:rgba(232,238,248,.7)}
.grid{display:grid;grid-template-columns:repeat(12,minmax(0,1fr));gap:12px}
.col6{grid-column:span 6}
.col12{grid-column:span 12}
@media (max-width:900px){.col6{grid-column:span 12}}
code{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
.ok{color:#7CFF9A} .warn{color:#FFD37C} .bad{color:#FF7C7C}
.kv{display:flex;justify-content:space-between;gap:12px;flex-wrap:wrap}
.kv b{font-weight:800}
.pill{display:inline-block;padding:4px 10px;border-radius:999px;background:rgba(42,140,255,.14);border:1px solid rgba(42,140,255,.25)}
pre{white-space:pre-wrap;background:#0c1420;border:1px solid rgba(255,255,255,.10);padding:12px;border-radius:12px;margin:10px 0}
</style>
<div class="wrap">
<div class="card">
<div class="pill">AI Visibility Preflight</div>
<h1 style="margin:10px 0 6px;">Scan a URL</h1>
<div class="muted">Checks robots.txt, sitemap, llms.txt, and key meta/schema signals that affect “AI/answer engine readability”.</div>
<form method="get" style="margin-top:12px;display:flex;gap:10px;flex-wrap:wrap">
<input name="u" placeholder="example.com/page" value="<?php /* Re-populate the field with the normalised input, HTML-escaped. */ echo h($u); ?>" style="flex:1;min-width:260px">
<button type="submit">Scan</button>
</form>
<?php /* Validation error from the scan step; shown only when one was set. */ if ($err): ?>
<div class="card" style="background:#2a0f14;border-color:rgba(255,124,124,.25)"><?php echo h($err); ?></div>
<?php endif; ?>
</div>
<?php /* Results are rendered only after a successful scan filled $res. */ if ($res): ?>
<div class="grid">
<div class="card col6">
<h2 style="margin:0 0 10px;font-size:18px;">Access files</h2>
<div class="kv"><span>robots.txt</span>
<b class="<?php echo $res['robots_txt_ok'] ? 'ok':'warn'; ?>"><?php echo $res['robots_txt_ok'] ? 'Found' : 'Missing/blocked'; ?></b>
</div>
<div class="kv"><span>sitemap</span>
<b class="<?php echo $res['sitemap_ok'] ? 'ok':'warn'; ?>"><?php echo $res['sitemap_ok'] ? 'Found' : 'Not found'; ?></b>
</div>
<div class="muted" style="margin-top:6px;font-size:13px;">Sitemap tried: <code><?php echo h($res['sitemap_url']); ?></code></div>
<div class="kv" style="margin-top:10px;"><span>llms.txt (optional)</span>
<b class="<?php echo $res['llms_ok'] ? 'ok':'warn'; ?>"><?php echo $res['llms_ok'] ? 'Found' : 'Not found'; ?></b>
</div>
<div class="muted" style="margin-top:6px;font-size:13px;">Tip: add a short <code>/llms.txt</code> pointing to your best pages/docs for AI assistants.</div>
</div>
<div class="card col6">
<h2 style="margin:0 0 10px;font-size:18px;">Page signals</h2>
<div class="kv"><span>Meta noindex</span>
<b class="<?php echo $res['meta_noindex'] ? 'bad':'ok'; ?>"><?php echo $res['meta_noindex'] ? 'YES (blocked)' : 'No'; ?></b>
</div>
<div class="kv"><span>X-Robots-Tag noindex</span>
<b class="<?php echo $res['hdr_noindex'] ? 'bad':'ok'; ?>"><?php echo $res['hdr_noindex'] ? 'YES (blocked)' : 'No/unknown'; ?></b>
</div>
<div class="kv" style="margin-top:10px;"><span>Canonical</span>
<b class="<?php echo $res['canonical'] ? 'ok':'warn'; ?>"><?php echo $res['canonical'] ? 'Present' : 'Missing'; ?></b>
</div>
<div class="kv"><span>Title</span>
<b class="<?php echo $res['title'] ? 'ok':'warn'; ?>"><?php echo $res['title'] ? 'Present' : 'Missing'; ?></b>
</div>
<div class="kv"><span>Meta description</span>
<b class="<?php echo $res['desc'] ? 'ok':'warn'; ?>"><?php echo $res['desc'] ? 'Present' : 'Missing'; ?></b>
</div>
<div class="kv" style="margin-top:10px;"><span>JSON-LD schema</span>
<b class="<?php echo $res['jsonld_count'] ? 'ok':'warn'; ?>"><?php echo (int)$res['jsonld_count']; ?> found</b>
</div>
<div class="kv"><span>Open Graph</span>
<b class="<?php echo ($res['og_title'] && $res['og_desc']) ? 'ok':'warn'; ?>"><?php echo ($res['og_title'] && $res['og_desc']) ? 'OK' : 'Missing/partial'; ?></b>
</div>
</div>
<div class="card col12">
<h2 style="margin:0 0 10px;font-size:18px;">Robots.txt bot checks (simplified)</h2>
<div class="muted" style="margin-bottom:10px;">This only flags a full-site <code>Disallow: /</code>. It does not fully simulate robots precedence rules.</div>
<div class="grid">
<?php /* One card per crawler token; $st is the ['state', 'note'] pair from robots_status(). */ foreach ($res['bots'] as $name => $st): ?>
<div class="card col6" style="margin:0;background:#0c1420">
<div class="kv">
<b><?php echo h($name); ?></b>
<?php
// Map the state to a colour class: blocked -> bad, allowed -> ok, anything else -> warn.
$cls = $st['state'] === 'blocked' ? 'bad' : ($st['state'] === 'allowed' ? 'ok' : 'warn');
?>
<span class="<?php echo $cls; ?>"><?php echo h($st['state']); ?></span>
</div>
<div class="muted" style="margin-top:6px;font-size:13px;"><?php echo h($st['note']); ?></div>
</div>
<?php endforeach; ?>
</div>
<h3 style="margin:14px 0 8px;font-size:16px;">Quick fix snippets</h3>
<div class="muted" style="margin-bottom:8px;">If you’re accidentally blocking search/AI bots, add specific groups (example only):</div>
<pre><code>User-agent: OAI-SearchBot
Allow: /
User-agent: PerplexityBot
Allow: /</code></pre>
<div class="muted" style="margin-top:10px;">If you want to block model-training crawlers but still allow “search bots”, keep those groups separate.</div>
</div>
</div>
<?php endif; ?>
</div>
Comments (0)
No comments yet — be the first.