# SPDX-FileCopyrightText: The itiquette Authors
# SPDX-License-Identifier: CC0-1.0
#
# Policy: allow normal search-engine indexing AND on-demand AI answer
# engines (so a user asking ChatGPT/Claude/Perplexity "how do I configure
# gommitlint?" can be answered from the live docs). Block bulk crawlers
# that scrape the site for AI model training.
#
# This is an honor-system signal — well-behaved bots respect it; others
# won't. See also /.well-known/ai.txt and the meta tags in <head>.

# Default group: applies to every crawler that does not match one of the
# named User-agent groups elsewhere in this file. Allows the whole site.
User-agent: *
Allow: /

# ──────────────────────────────────────────────────────────────────────
# Blocked: AI training / dataset crawlers
# ──────────────────────────────────────────────────────────────────────

# Each group in this part names a single user-agent token and disallows
# the entire site for it. A crawler that matches a named group follows
# that group instead of the "User-agent: *" group.

# OpenAI training crawler
User-agent: GPTBot
Disallow: /

# Anthropic training crawlers
User-agent: ClaudeBot
Disallow: /
User-agent: Claude-Web
Disallow: /
User-agent: anthropic-ai
Disallow: /

# Google AI training (Gemini / Vertex). Does NOT affect Googlebot search indexing.
User-agent: Google-Extended
Disallow: /

# Apple AI training. Does NOT affect Applebot search indexing.
User-agent: Applebot-Extended
Disallow: /

# Meta AI training
User-agent: FacebookBot
Disallow: /
User-agent: Meta-ExternalAgent
Disallow: /

# Amazon AI training
User-agent: Amazonbot
Disallow: /

# ByteDance / TikTok training
User-agent: Bytespider
Disallow: /

# Common Crawl — feeds many training datasets
User-agent: CCBot
Disallow: /

# Perplexity index/training crawler (distinct from Perplexity-User on-demand)
# NOTE(review): Perplexity's crawler documentation appears to describe
# PerplexityBot as the indexer behind Perplexity's search results rather
# than a model-training crawler — confirm that blocking it is intended,
# given this file's policy of keeping on-demand answer engines working.
User-agent: PerplexityBot
Disallow: /

# Other training / dataset crawlers
# One group per user-agent token; each disallows the entire site.
# NOTE(review): "Kangaroo Bot", "Sidetrade indexer bot" and "Velen Crawler"
# contain spaces, which fall outside the product-token character set in
# RFC 9309 (letters, "-" and "_"). Whether those crawlers match these lines
# depends on each crawler's own parser — verify against their documentation.
User-agent: YouBot
Disallow: /
User-agent: PhindBot
Disallow: /
User-agent: cohere-ai
Disallow: /
User-agent: cohere-training-data-crawler
Disallow: /
User-agent: Diffbot
Disallow: /
User-agent: ImagesiftBot
Disallow: /
User-agent: img2dataset
Disallow: /
User-agent: omgili
Disallow: /
User-agent: Omgilibot
Disallow: /
User-agent: Timpibot
Disallow: /
User-agent: AI2Bot
Disallow: /
User-agent: AI2Bot-Dolma
Disallow: /
User-agent: Kangaroo Bot
Disallow: /
User-agent: PanguBot
Disallow: /
User-agent: Sidetrade indexer bot
Disallow: /
User-agent: Velen Crawler
Disallow: /
User-agent: webzio-extended
Disallow: /

# ──────────────────────────────────────────────────────────────────────
# Allowed: on-demand AI answer engines (user-initiated single fetch)
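# Listed explicitly so the policy is visible — the default
# "User-agent: * / Allow: /" group above would allow them anyway.
# A crawler that matches a named group ignores the "*" group, so any
# rule later added under "*" must be repeated here to apply to them.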
# ──────────────────────────────────────────────────────────────────────

# A single shared group: every User-agent line listed here is governed by
# the one "Allow: /" rule that closes the group.

# OpenAI: ChatGPT on-demand browsing, plus search-result fetching
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
# Anthropic: on-demand fetching, plus search-result fetching
User-agent: Claude-User
User-agent: Claude-SearchBot
# Perplexity: on-demand answer fetching
User-agent: Perplexity-User
# Meta: on-demand link/preview fetching
User-agent: Meta-ExternalFetcher
# Mistral: on-demand fetching
User-agent: MistralAI-User
# DuckDuckGo Assist: on-demand fetching
User-agent: DuckAssistBot
Allow: /

# Sitemap location for crawlers; not tied to any User-agent group.
Sitemap: https://docs.itiquette.org/sitemap.xml