# Robots.txt for ReVisualize Studio — https://revisualizestudio.com
# AI architectural rendering software. Rules below explicitly opt IN to AI search
# crawlers and training crawlers, because brand recognition inside large language
# models is a positive signal for a marketing-driven site. Aggressive or
# low-value scrapers (Bytespider, MJ12bot, DotBot, PetalBot) remain blocked.
#
# Audit context (CiteForge 2026-05-28):
# - P2 ai_crawler.robots_blocks_but_cdn_serves — pointless blocks were removed.
# - P3 ai_crawler.training_bots_blocked — GPTBot, CCBot are now Allowed.
# - P3 ai_crawler.disallowed_uas_receive_200 — defense moved to CDN headers, not robots.txt.

# ============================================================================
# DEFAULT — allow all, gate only true back-office paths.
#
# IMPORTANT — /app is intentionally NOT in Disallow, even though we want it kept
# out of search results. Per Google's robots.txt docs (2026), if robots.txt blocks
# a URL the crawler can never see the noindex meta or X-Robots-Tag header, so the
# URL can still surface in results from inbound links — only without a snippet.
# We instead let crawlers fetch /app and serve them an X-Robots-Tag: noindex,
# nofollow header (staticwebapp.config.json) plus a <meta name="robots"
# content="noindex, nofollow"> tag in /app/index.html. Same pattern for
# /reset-password.
#
# NOTE — this group applies ONLY to crawlers that match no named User-agent
# group. A crawler that matches a named group obeys that group alone and
# ignores this one (RFC 9309 section 2.2.1), so any named group in this file
# has to repeat the Disallow lines below to keep the back-office paths gated.
#
# Disallow lines are listed before "Allow: /" so that first-match parsers
# (e.g. Python's urllib.robotparser) reach the same verdict as longest-match
# parsers such as Google's.
# ============================================================================
User-agent: *
Disallow: /api/
Disallow: /admin/
Disallow: /private/
Allow: /

# Sitemap
Sitemap: https://revisualizestudio.com/sitemap.xml

# ============================================================================
# CORE SEARCH ENGINES
# ============================================================================

# One shared group for every core search crawler. A crawler that matches a
# named group ignores the "User-agent: *" group entirely (RFC 9309 section
# 2.2.1), so the back-office Disallow rules are repeated here; a bare
# "Allow: /" would open /api/, /admin/ and /private/ to these crawlers.
# Disallow precedes Allow so first-match parsers agree with longest-match ones.
#
# Token notes:
#   Google-Extended    Usage-control token, not a separate crawler: governs
#                      whether Googlebot-crawled content may be used for Gemini
#                      training and grounding. It does not affect Google Search;
#                      AI Overviews follow the Googlebot rules.
#   Applebot-Extended  Usage-control token: governs whether Applebot-crawled
#                      content may be used to train Apple's generative models
#                      (Apple Intelligence). It does not crawl on its own.
#   Slurp              Yahoo.
User-agent: Googlebot
User-agent: Googlebot-Image
User-agent: Google-Extended
User-agent: Bingbot
User-agent: DuckDuckBot
User-agent: Slurp
User-agent: Yandex
User-agent: Applebot
User-agent: Applebot-Extended
Disallow: /api/
Disallow: /admin/
Disallow: /private/
Allow: /

# ============================================================================
# AI SEARCH & CITATION BOTS (Answer-Engine Optimization)
# These crawlers either build an AI search index or fetch pages on demand when
# a user asks a question, then cite the source. We want every one of them
# reading the marketing pages.
# ============================================================================

# One shared group for every AI search / citation crawler. A crawler that
# matches a named group ignores the "User-agent: *" group entirely (RFC 9309
# section 2.2.1), so the back-office Disallow rules are repeated here; a bare
# "Allow: /" would open /api/, /admin/ and /private/ to these crawlers.
# Disallow precedes Allow so first-match parsers agree with longest-match ones.
#
# Token notes:
#   OAI-SearchBot      OpenAI — SERP-style search results inside ChatGPT.
#   ChatGPT-User       OpenAI — user-triggered browsing from ChatGPT. Product
#                      tokens carry no version suffix, so this one token covers
#                      every version (a "ChatGPT-User/2.0" line is not valid).
#   ClaudeBot          Anthropic — general web crawler.
#   Claude-User        Anthropic — user-triggered fetch from Claude.
#   Claude-SearchBot   Anthropic — search index crawler.
#   Claude-Web         Anthropic — legacy token. Matching is case-insensitive,
#                      so a separate lowercase "claude-web" line is redundant.
#   PerplexityBot      Perplexity AI — index crawler.
#   Perplexity-User    Perplexity AI — user-triggered fetch.
#   FacebookBot,       Meta AI crawlers.
#   meta-externalagent NOTE(review): Meta link previews are fetched by
#                      facebookexternalhit, which is not named here and so falls
#                      under the default group — confirm that is intended.
#   DuckAssistBot      DuckAssist (DuckDuckGo's AI answer).
#   YouBot             You.com.
#   MistralAI-User     Mistral.
#   cohere-ai          Cohere.
User-agent: OAI-SearchBot
User-agent: ChatGPT-User
User-agent: ClaudeBot
User-agent: Claude-User
User-agent: Claude-SearchBot
User-agent: Claude-Web
User-agent: PerplexityBot
User-agent: Perplexity-User
User-agent: FacebookBot
User-agent: meta-externalagent
User-agent: DuckAssistBot
User-agent: YouBot
User-agent: MistralAI-User
User-agent: cohere-ai
Disallow: /api/
Disallow: /admin/
Disallow: /private/
Allow: /

# ============================================================================
# AI TRAINING CRAWLERS — explicitly ALLOWED.
# Brand recognition inside foundation models is a high-value GEO signal. The
# upside of being in training data outweighs the downside for a small B2C/B2B
# marketing site. Re-block here if you need to protect proprietary text.
# ============================================================================

# One shared group for the AI training crawlers we opt in to. A crawler that
# matches a named group ignores the "User-agent: *" group entirely (RFC 9309
# section 2.2.1), so the back-office Disallow rules are repeated here; a bare
# "Allow: /" would open /api/, /admin/ and /private/ to these crawlers.
# Disallow precedes Allow so first-match parsers agree with longest-match ones.
#
# Token notes:
#   GPTBot        OpenAI training.
#   CCBot         Common Crawl — feeds most open-source LLM training pipelines.
#   anthropic-ai  Anthropic (legacy training user-agent).
#   Amazonbot     Amazon (Alexa+ / Rufus / Nova).
User-agent: GPTBot
User-agent: CCBot
User-agent: anthropic-ai
User-agent: Amazonbot
Disallow: /api/
Disallow: /admin/
Disallow: /private/
Allow: /

# ============================================================================
# SEO TOOLING — allowed so the team can run audits against the live site.
# ============================================================================

# One shared group for the SEO audit crawlers. A crawler that matches a named
# group ignores the "User-agent: *" group entirely (RFC 9309 section 2.2.1),
# so the back-office Disallow rules are repeated here; a bare "Allow: /" would
# open /api/, /admin/ and /private/ to these crawlers.
# Disallow precedes Allow so first-match parsers agree with longest-match ones.
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: DataForSeoBot
Disallow: /api/
Disallow: /admin/
Disallow: /private/
Allow: /

# ============================================================================
# BLOCKED — known-aggressive scrapers or low-quality bots.
# ============================================================================

# A single group naming four tokens: every crawler listed is denied the whole
# site.
#   Bytespider  ByteDance — historically aggressive crawl rate, low referral value.
#   MJ12bot     Low-quality SEO scraper.
#   DotBot      Low-quality SEO scraper.
#   PetalBot    Huawei — opt out unless you target China specifically.
User-agent: Bytespider
User-agent: MJ12bot
User-agent: DotBot
User-agent: PetalBot
Disallow: /