AI Visibility Preflight (Answer Engine Readiness Scanner)

January 8, 2026

A lightweight “AI visibility” preflight tool you can host on your own site. It scans a URL and flags common issues that prevent AI search/answer engines (and regular crawlers) from understanding or surfacing your pages.

What it does:

  • Checks /robots.txt for common AI/search bot tokens (and tells you if you’re blocking them).
  • Finds a sitemap via Sitemap: lines in robots.txt, falling back to /sitemap.xml.
  • Looks for /llms.txt (optional AI-friendly summary file).
  • Scans the page HTML for: canonical, meta robots/noindex, title, meta description, Open Graph, and JSON-LD schema (see the example markup after this list).
  • Gives fix hints + ready-to-copy snippets.
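
For reference, here is the kind of head markup those page-signal checks look for (illustrative values only; swap in your own URLs and text):

<title>Example Product: what it does in one line</title>
<meta name="description" content="One- or two-sentence summary of the page.">
<link rel="canonical" href="https://example.com/page">
<meta property="og:title" content="Example Product">
<meta property="og:description" content="Same summary, reused for link previews.">
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebPage","name":"Example Product","description":"One- or two-sentence summary of the page."}
</script>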

Install:

  1. Create: /tools/ai-visibility-preflight/
  2. Save the script below as: /tools/ai-visibility-preflight/index.php
  3. Visit: /tools/ai-visibility-preflight/ and scan any URL (or pass one in the query string, as shown below)
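
Query-string example (this is exactly what the form submits; the https:// scheme is added for you if you leave it off):

  /tools/ai-visibility-preflight/?u=example.com/page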

👉 Run it here: /tools/ai-visibility-preflight/

<?php
declare(strict_types=1);

/**
 * AI Visibility Preflight (No DB)
 * - One-file scanner (robots.txt, sitemap, llms.txt, meta/canonical/schema)
 * - SSRF guard (blocks private/reserved IPs)
 * - Small cache to avoid hammering targets
 */

header('X-Content-Type-Options: nosniff');
header('Referrer-Policy: strict-origin-when-cross-origin');

$CFG = [
  'timeout'   => 8,
  'max_bytes' => 450_000,
  'cache_ttl' => 600, // 10 min
];

function h(string $s): string { return htmlspecialchars($s, ENT_QUOTES, 'UTF-8'); }

function is_public_ip(string $ip): bool {
  return filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE) !== false;
}
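// e.g. is_public_ip('93.184.216.34') === true; is_public_ip('10.0.0.5') === false (private range);
// is_public_ip('127.0.0.1') === false (loopback sits in the reserved range).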

function resolve_public(string $host): bool {
  $ips = [];

  if (function_exists('dns_get_record')) {
    $a = @dns_get_record($host, DNS_A) ?: [];
    foreach ($a as $r) if (!empty($r['ip'])) $ips[] = (string)$r['ip'];

    $aaaa = @dns_get_record($host, DNS_AAAA) ?: [];
    foreach ($aaaa as $r) if (!empty($r['ipv6'])) $ips[] = (string)$r['ipv6'];
  }

  if (!$ips) {
    $ip = @gethostbyname($host);
    if (is_string($ip) && $ip !== $host) $ips[] = $ip;
  }

  $ips = array_values(array_unique(array_filter($ips)));
  if (!$ips) return false;

  foreach ($ips as $ip) {
    if (!is_public_ip($ip)) return false;
  }
  return true;
}

function norm_url(string $u): string {
  $u = trim($u);
  // allow users to paste without scheme
  if ($u !== '' && !preg_match('~^https?://~i', $u)) $u = 'https://' . $u;
  return $u;
}

function is_safe_url(string $u): bool {
  if (!filter_var($u, FILTER_VALIDATE_URL)) return false;
  $p = parse_url($u);
  if (!$p) return false;
  $sch = strtolower((string)($p['scheme'] ?? ''));
  $host = (string)($p['host'] ?? '');
  if ($sch !== 'http' && $sch !== 'https') return false;
  if ($host === '') return false;
  if (!resolve_public($host)) return false;
  return true;
}

function cache_path(string $key): string {
  return sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'aivis_' . sha1($key) . '.json';
}

function fetch_url(string $url, int $timeout, int $maxBytes): array {
  $ua = 'AI-Visibility-Preflight/1.0 (+local-tool)';
  $headers = [];
  $body = '';
  $status = 0;
  $final = $url;
  $err = '';

  if (function_exists('curl_init')) {
    $ch = curl_init($url);
    curl_setopt_array($ch, [
      CURLOPT_RETURNTRANSFER => false,
      CURLOPT_FOLLOWLOCATION => true,
      CURLOPT_MAXREDIRS      => 3,
      CURLOPT_PROTOCOLS      => CURLPROTO_HTTP | CURLPROTO_HTTPS, // keep fetches (and redirects) on http/https only
      CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
      CURLOPT_CONNECTTIMEOUT => $timeout,
      CURLOPT_TIMEOUT        => $timeout,
      CURLOPT_USERAGENT      => $ua,
      CURLOPT_HEADERFUNCTION => function($ch, $line) use (&$headers) {
        $headers[] = rtrim($line, "\r\n");
        return strlen($line);
      },
      CURLOPT_WRITEFUNCTION  => function($ch, $chunk) use (&$body, $maxBytes) {
        $body .= $chunk;
        if (strlen($body) > $maxBytes) return 0; // abort
        return strlen($chunk);
      },
    ]);

    $ok = curl_exec($ch);
    if ($ok === false) $err = (string)curl_error($ch);
    $status = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $final  = (string)curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    curl_close($ch);

    // Hitting the byte cap aborts the transfer via the write callback; treat that as a truncated success, not an error.
    if ($err !== '' && strlen($body) > $maxBytes) $err = '';

    if ($err !== '') return ['ok'=>false,'status'=>$status,'final'=>$final,'headers'=>$headers,'body'=>$body,'error'=>$err];
    return ['ok'=>true,'status'=>$status,'final'=>$final,'headers'=>$headers,'body'=>$body,'error'=>''];
  }

  // fallback (no cURL): use the http stream wrapper; ignore_errors keeps 4xx/5xx bodies readable
  $ctx = stream_context_create([
    'http' => ['timeout'=>$timeout,'user_agent'=>$ua,'follow_location'=>1,'max_redirects'=>3,'ignore_errors'=>true],
    'ssl'  => ['verify_peer'=>true,'verify_peer_name'=>true],
  ]);
  $raw = @file_get_contents($url, false, $ctx);

  // $http_response_header is populated by the wrapper; read response headers and the final status code from it
  $headers = (isset($http_response_header) && is_array($http_response_header)) ? $http_response_header : [];
  foreach ($headers as $hl) {
    if (preg_match('~^HTTP/\S+\s+(\d{3})~', $hl, $m)) $status = (int)$m[1]; // the last status line wins after redirects
  }

  if ($raw === false) return ['ok'=>false,'status'=>$status,'final'=>$url,'headers'=>$headers,'body'=>'','error'=>'Fetch failed'];
  if (strlen($raw) > $maxBytes) $raw = substr($raw, 0, $maxBytes);
  return ['ok'=>true,'status'=>$status,'final'=>$url,'headers'=>$headers,'body'=>$raw,'error'=>''];
}

function cached_fetch(string $url, array $CFG): array {
  $cp = cache_path($url);
  if (is_file($cp)) {
    $raw = @file_get_contents($cp);
    if ($raw !== false) {
      $j = json_decode($raw, true);
      if (is_array($j) && !empty($j['t']) && (time() - (int)$j['t'] < (int)$CFG['cache_ttl'])) {
        return $j['v'];
      }
    }
  }
  $v = fetch_url($url, (int)$CFG['timeout'], (int)$CFG['max_bytes']);
  @file_put_contents($cp, json_encode(['t'=>time(),'v'=>$v], JSON_UNESCAPED_SLASHES));
  return $v;
}

function site_root(string $url): string {
  $p = parse_url($url);
  if (!$p) return $url;
  $sch  = (string)($p['scheme'] ?? 'https');
  $host = (string)($p['host'] ?? '');
  $port = (string)($p['port'] ?? '');
  return $sch . '://' . $host . ($port !== '' ? ':' . $port : '');
}

function header_value(array $headers, string $key): string {
  $key = strtolower($key);
  foreach ($headers as $h) {
    $pos = strpos($h, ':');
    if ($pos === false) continue;
    $k = strtolower(trim(substr($h, 0, $pos)));
    if ($k === $key) return trim(substr($h, $pos + 1));
  }
  return '';
}

function parse_robots_groups(string $txt): array {
  $txt = str_replace("\r\n", "\n", $txt);
  $lines = explode("\n", $txt);

  $groups = [];
  $cur = ['uas'=>[], 'rules'=>[], 'sitemaps'=>[]];
  $seenRule = false;

  foreach ($lines as $line) {
    $line = preg_replace('/\s+#.*$/', '', $line);
    $line = trim((string)$line);
    if ($line === '') {
      if ($cur['uas'] || $cur['sitemaps']) { $groups[] = $cur; } // keep blocks that only carry Sitemap: lines
      $cur = ['uas'=>[], 'rules'=>[], 'sitemaps'=>[]];
      $seenRule = false;
      continue;
    }

    if (stripos($line, 'user-agent:') === 0) {
      if ($seenRule && $cur['uas']) { $groups[] = $cur; $cur = ['uas'=>[], 'rules'=>[], 'sitemaps'=>[]]; $seenRule = false; }
      $ua = strtolower(trim(substr($line, strlen('user-agent:'))));
      $cur['uas'][] = $ua;
      continue;
    }

    if (stripos($line, 'sitemap:') === 0) {
      $sm = trim(substr($line, strlen('sitemap:')));
      if ($sm !== '') $cur['sitemaps'][] = $sm;
      continue;
    }

    if (preg_match('/^(allow|disallow)\s*:\s*(.*)$/i', $line, $m)) {
      $seenRule = true;
      $cur['rules'][] = [strtolower($m[1]), trim($m[2])];
      continue;
    }
  }

  if ($cur['uas'] || $cur['sitemaps']) $groups[] = $cur;
  return $groups;
}

function robots_status(string $robotsTxt, string $uaToken): array {
  // Simplified: detect full-site block via Disallow: /
  $uaToken = strtolower($uaToken);
  $groups = parse_robots_groups($robotsTxt);

  $exact = null;
  $star  = null;

  foreach ($groups as $g) {
    $uas = $g['uas'];
    if (in_array($uaToken, $uas, true)) $exact = $g;
    if (in_array('*', $uas, true)) $star = $g;
  }

  $g = $exact ?? $star;
  if (!$g) return ['state'=>'unknown', 'note'=>'No matching group found (default allow)'];

  $disAll = false;
  $allowRoot = false;

  foreach ($g['rules'] as [$k, $v]) {
    if ($k === 'disallow' && $v === '/') $disAll = true;
    if ($k === 'allow' && ($v === '/' || $v === '')) $allowRoot = true;
  }

  if ($disAll && !$allowRoot) return ['state'=>'blocked', 'note'=>'Disallow: / detected'];
  return ['state'=>'allowed', 'note'=>'No full-site block detected'];
}
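
// Example: robots_status("User-agent: *\nDisallow: /", 'gptbot')
// returns ['state' => 'blocked', ...] because no GPTBot-specific group exists, so the '*' group applies.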

function find_first(string $html, string $pattern): string {
  if (preg_match($pattern, $html, $m)) return (string)($m[1] ?? '');
  return '';
}

$u = isset($_GET['u']) ? norm_url((string)$_GET['u']) : '';
$scan = ($u !== '');

$err = '';
$res = null;

if ($scan) {
  if (!is_safe_url($u)) {
    $err = 'That URL is not allowed (must be public http/https and not resolve to private/reserved IPs).';
  } else {
    $root = site_root($u);

    $page   = cached_fetch($u, $CFG);
    $robots = cached_fetch($root . '/robots.txt', $CFG);
    $llms   = cached_fetch($root . '/llms.txt', $CFG);

    $robotsTxt = ($robots['ok'] && (int)$robots['status'] >= 200 && (int)$robots['status'] < 300) ? (string)$robots['body'] : ''; // only a 2xx body counts; otherwise a 404 page would be parsed as robots.txt
    $groups = $robotsTxt !== '' ? parse_robots_groups($robotsTxt) : [];

    // sitemap candidates
    $sitemaps = [];
    foreach ($groups as $g) foreach ($g['sitemaps'] as $sm) $sitemaps[] = $sm;
    $sitemaps = array_values(array_unique(array_filter($sitemaps)));

    $sitemapHit = null;
    $sitemapUrl = '';
    if ($sitemaps) {
      $sitemapUrl = $sitemaps[0];
      if (is_safe_url($sitemapUrl)) $sitemapHit = cached_fetch($sitemapUrl, $CFG);
    } else {
      $sitemapUrl = $root . '/sitemap.xml';
      $sitemapHit = cached_fetch($sitemapUrl, $CFG);
    }

    $html = (string)($page['body'] ?? '');
    $xrobots = header_value((array)($page['headers'] ?? []), 'x-robots-tag');

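    // Note: these patterns assume name/property appears before content; reversed attribute order isn't matched.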
    $metaRobots = find_first($html, '~<meta\s+[^>]*name=["\']robots["\'][^>]*content=["\']([^"\']+)["\'][^>]*>~i');
    $canonical  = find_first($html, '~<link\s+[^>]*rel=["\']canonical["\'][^>]*href=["\']([^"\']+)["\'][^>]*>~i');
    $title      = trim(strip_tags(find_first($html, '~<title[^>]*>(.*?)</title>~is')));
    $desc       = find_first($html, '~<meta\s+[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\'][^>]*>~i');

    $ogTitle = find_first($html, '~<meta\s+[^>]*property=["\']og:title["\'][^>]*content=["\']([^"\']+)["\'][^>]*>~i');
    $ogDesc  = find_first($html, '~<meta\s+[^>]*property=["\']og:description["\'][^>]*content=["\']([^"\']+)["\'][^>]*>~i');

    $jsonldCount = 0;
    if ($html !== '') {
      preg_match_all('~<script[^>]*type=["\']application/ld\+json["\'][^>]*>~i', $html, $m);
      $jsonldCount = isset($m[0]) ? count($m[0]) : 0;
    }

    $metaNoindex = (stripos($metaRobots, 'noindex') !== false);
    $hdrNoindex  = (stripos($xrobots, 'noindex') !== false);

    $bots = [
      'OAI-SearchBot'    => robots_status($robotsTxt, 'oai-searchbot'),
      'GPTBot'           => robots_status($robotsTxt, 'gptbot'),
      'ChatGPT-User'     => robots_status($robotsTxt, 'chatgpt-user'),
      'PerplexityBot'    => robots_status($robotsTxt, 'perplexitybot'),
      'Perplexity-User'  => robots_status($robotsTxt, 'perplexity-user'),
      'Google-Extended'  => robots_status($robotsTxt, 'google-extended'),
    ];

    $res = [
      'url' => $u,
      'root' => $root,
      'page_status' => (int)($page['status'] ?? 0),
      'robots_status' => (int)($robots['status'] ?? 0),
      'llms_status' => (int)($llms['status'] ?? 0),
      'sitemap_status' => (int)($sitemapHit['status'] ?? 0),
      'sitemap_url' => $sitemapUrl,
      'bots' => $bots,

      'meta_noindex' => $metaNoindex,
      'hdr_noindex'  => $hdrNoindex,
      'meta_robots'  => $metaRobots,
      'xrobots'      => $xrobots,
      'canonical'    => $canonical,
      'title'        => $title,
      'desc'         => $desc,
      'og_title'     => $ogTitle,
      'og_desc'      => $ogDesc,
      'jsonld_count' => $jsonldCount,

      'robots_txt_ok' => ($robotsTxt !== ''),
      'llms_ok'       => ($llms['ok'] && (int)$llms['status'] >= 200 && (int)$llms['status'] < 400 && trim((string)$llms['body']) !== ''),
      'sitemap_ok'    => ($sitemapHit && $sitemapHit['ok'] && (int)$sitemapHit['status'] >= 200 && (int)$sitemapHit['status'] < 400),
    ];
  }
}
?>
<style>
  body{margin:0;font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif;background:#0b0f16;color:#e8eef8}
  .wrap{max-width:980px;margin:0 auto;padding:18px}
  .card{background:#111a26;border:1px solid rgba(255,255,255,.08);border-radius:14px;padding:14px;margin:12px 0}
  input{width:100%;padding:12px;border-radius:12px;border:1px solid rgba(255,255,255,.14);background:#0c1420;color:#e8eef8}
  button{padding:10px 12px;border:0;border-radius:12px;background:#2a8cff;color:#08121e;font-weight:800;cursor:pointer}
  .muted{color:rgba(232,238,248,.7)}
  .grid{display:grid;grid-template-columns:repeat(12,minmax(0,1fr));gap:12px}
  .col6{grid-column:span 6}
  .col12{grid-column:span 12}
  @media (max-width:900px){.col6{grid-column:span 12}}
  code{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
  .ok{color:#7CFF9A} .warn{color:#FFD37C} .bad{color:#FF7C7C}
  .kv{display:flex;justify-content:space-between;gap:12px;flex-wrap:wrap}
  .kv b{font-weight:800}
  .pill{display:inline-block;padding:4px 10px;border-radius:999px;background:rgba(42,140,255,.14);border:1px solid rgba(42,140,255,.25)}
  pre{white-space:pre-wrap;background:#0c1420;border:1px solid rgba(255,255,255,.10);padding:12px;border-radius:12px;margin:10px 0}
</style>

<div class="wrap">
  <div class="card">
    <div class="pill">AI Visibility Preflight</div>
    <h1 style="margin:10px 0 6px;">Scan a URL</h1>
    <div class="muted">Checks robots.txt, sitemap, llms.txt, and key meta/schema signals that affect “AI/answer engine readability”.</div>

    <form method="get" style="margin-top:12px;display:flex;gap:10px;flex-wrap:wrap">
      <input name="u" placeholder="example.com/page" value="<?php echo h($u); ?>" style="flex:1;min-width:260px">
      <button type="submit">Scan</button>
    </form>

    <?php if ($err): ?>
      <div class="card" style="background:#2a0f14;border-color:rgba(255,124,124,.25)"><?php echo h($err); ?></div>
    <?php endif; ?>
  </div>

  <?php if ($res): ?>
    <div class="grid">
      <div class="card col6">
        <h2 style="margin:0 0 10px;font-size:18px;">Access files</h2>
        <div class="kv"><span>robots.txt</span>
          <b class="<?php echo $res['robots_txt_ok'] ? 'ok':'warn'; ?>"><?php echo $res['robots_txt_ok'] ? 'Found' : 'Missing/blocked'; ?></b>
        </div>
        <div class="kv"><span>sitemap</span>
          <b class="<?php echo $res['sitemap_ok'] ? 'ok':'warn'; ?>"><?php echo $res['sitemap_ok'] ? 'Found' : 'Not found'; ?></b>
        </div>
        <div class="muted" style="margin-top:6px;font-size:13px;">Sitemap tried: <code><?php echo h($res['sitemap_url']); ?></code></div>

        <div class="kv" style="margin-top:10px;"><span>llms.txt (optional)</span>
          <b class="<?php echo $res['llms_ok'] ? 'ok':'warn'; ?>"><?php echo $res['llms_ok'] ? 'Found' : 'Not found'; ?></b>
        </div>
        <div class="muted" style="margin-top:6px;font-size:13px;">Tip: add a short <code>/llms.txt</code> pointing to your best pages/docs for AI assistants.</div>
      </div>

      <div class="card col6">
        <h2 style="margin:0 0 10px;font-size:18px;">Page signals</h2>

        <div class="kv"><span>Meta noindex</span>
          <b class="<?php echo $res['meta_noindex'] ? 'bad':'ok'; ?>"><?php echo $res['meta_noindex'] ? 'YES (blocked)' : 'No'; ?></b>
        </div>
        <div class="kv"><span>X-Robots-Tag noindex</span>
          <b class="<?php echo $res['hdr_noindex'] ? 'bad':'ok'; ?>"><?php echo $res['hdr_noindex'] ? 'YES (blocked)' : 'No/unknown'; ?></b>
        </div>

        <div class="kv" style="margin-top:10px;"><span>Canonical</span>
          <b class="<?php echo $res['canonical'] ? 'ok':'warn'; ?>"><?php echo $res['canonical'] ? 'Present' : 'Missing'; ?></b>
        </div>

        <div class="kv"><span>Title</span>
          <b class="<?php echo $res['title'] ? 'ok':'warn'; ?>"><?php echo $res['title'] ? 'Present' : 'Missing'; ?></b>
        </div>

        <div class="kv"><span>Meta description</span>
          <b class="<?php echo $res['desc'] ? 'ok':'warn'; ?>"><?php echo $res['desc'] ? 'Present' : 'Missing'; ?></b>
        </div>

        <div class="kv" style="margin-top:10px;"><span>JSON-LD schema</span>
          <b class="<?php echo $res['jsonld_count'] ? 'ok':'warn'; ?>"><?php echo (int)$res['jsonld_count']; ?> found</b>
        </div>

        <div class="kv"><span>Open Graph</span>
          <b class="<?php echo ($res['og_title'] && $res['og_desc']) ? 'ok':'warn'; ?>"><?php echo ($res['og_title'] && $res['og_desc']) ? 'OK' : 'Missing/partial'; ?></b>
        </div>
      </div>

      <div class="card col12">
        <h2 style="margin:0 0 10px;font-size:18px;">Robots.txt bot checks (simplified)</h2>
        <div class="muted" style="margin-bottom:10px;">This only flags a full-site <code>Disallow: /</code>. It does not fully simulate robots precedence rules.</div>

        <div class="grid">
          <?php foreach ($res['bots'] as $name => $st): ?>
            <div class="card col6" style="margin:0;background:#0c1420">
              <div class="kv">
                <b><?php echo h($name); ?></b>
                <?php
                  $cls = $st['state'] === 'blocked' ? 'bad' : ($st['state'] === 'allowed' ? 'ok' : 'warn');
                ?>
                <span class="<?php echo $cls; ?>"><?php echo h($st['state']); ?></span>
              </div>
              <div class="muted" style="margin-top:6px;font-size:13px;"><?php echo h($st['note']); ?></div>
            </div>
          <?php endforeach; ?>
        </div>

        <h3 style="margin:14px 0 8px;font-size:16px;">Quick fix snippets</h3>
        <div class="muted" style="margin-bottom:8px;">If you’re accidentally blocking search/AI bots, add specific groups (example only):</div>
        <pre><code>User-agent: OAI-SearchBot
Allow: /

User-agent: PerplexityBot
Allow: /</code></pre>

        <div class="muted" style="margin-top:10px;">If you want to block model-training crawlers but still allow “search bots”, keep those groups separate.</div>
      </div>
    </div>
  <?php endif; ?>
</div>
