# Uscp.io - Global Business Directory
# Crawl Budget Optimized | Last updated: 2026

# ─── Default: All Crawlers (Googlebot/Bingbot read this) ──
# We DO NOT add a separate Googlebot or Bingbot block: when a
# crawler-specific block exists Google ignores the * block, so any
# rule we forget to mirror would silently un-block the path. One
# canonical block is safer.
User-agent: *
# No catch-all "Allow: /" here: crawling is allowed by default, and parsers
# that apply rules in file order (first match wins) would stop at it and
# never reach the Disallow rules below. Sections are separated with "#"
# rather than blank lines because older parsers treat a blank line as the
# end of the record.
#
# Admin & private routes.
# Matching is prefix-based, so a bare "Disallow: /admin" would also block
# /administrator, /admin-help, etc. Each route is pinned with up to three
# forms instead:
#   /path$  the exact path only ($ anchors the end of the URL)
#   /path/  everything nested under the path
#   /path?  the path followed by any query string, e.g. /login?next=/x
#           (the $ form stops matching as soon as a query string is present)
Disallow: /admin$
Disallow: /admin/
Disallow: /admin?
Disallow: /api/
Disallow: /login$
Disallow: /login?
Disallow: /register$
Disallow: /register?
Disallow: /profile$
Disallow: /profile/
Disallow: /profile?
#
# Search results - infinite combinations, no crawl value.
# /search? catches every query-string variant (e.g. /search?page=2), not
# only the parameters listed individually further down.
Disallow: /search$
Disallow: /search/
Disallow: /search?

# Query parameters that spawn unbounded duplicate pages.
# Patterns are matched literally, so each parameter needs two rules:
#   /*?name=   when it is the first parameter
#   /*&name=   when it follows another parameter
# (a lone ?sort= rule does NOT cover /search?utm=x&sort=y).
# Pairs are listed alphabetically by parameter name.
#
# NOTE: ?page=N is intentionally absent. /categories?page=2, /cities?page=2
# and /countries?page=2 are real paginated listings linked via
# rel="next/prev" (see generateCategoriesListingPage /
# generateCitiesListingPage); blocking them would stop Google from
# discovering deep listings.
Disallow: /*?category=
Disallow: /*&category=
Disallow: /*?city=
Disallow: /*&city=
Disallow: /*?country=
Disallow: /*&country=
Disallow: /*?filter=
Disallow: /*&filter=
Disallow: /*?lang=
Disallow: /*&lang=
Disallow: /*?min_rating=
Disallow: /*&min_rating=
Disallow: /*?q=
Disallow: /*&q=
Disallow: /*?rating=
Disallow: /*&rating=
Disallow: /*?ref=
Disallow: /*&ref=
Disallow: /*?search=
Disallow: /*&search=
Disallow: /*?sort=
Disallow: /*&sort=
Disallow: /*?tab=
Disallow: /*&tab=
Disallow: /*?type=
Disallow: /*&type=
Disallow: /*?view=
Disallow: /*&view=

# Tracking parameters (UTM, ad networks, email/social trackers).
# Each shared link mints a "new" URL. Canonical tags help, but refusing
# these at the crawl layer keeps budget for high-value pages.
# Same ?name / &name pairing as the parameter rules above; pairs are listed
# alphabetically. The utm_ rules have no "=" so they match every utm_*
# parameter by prefix.
Disallow: /*?_branch_match_id=
Disallow: /*&_branch_match_id=
Disallow: /*?_gl=
Disallow: /*&_gl=
Disallow: /*?affiliate=
Disallow: /*&affiliate=
Disallow: /*?campaign=
Disallow: /*&campaign=
Disallow: /*?fbclid=
Disallow: /*&fbclid=
Disallow: /*?gclid=
Disallow: /*&gclid=
Disallow: /*?igshid=
Disallow: /*&igshid=
Disallow: /*?mc_cid=
Disallow: /*&mc_cid=
Disallow: /*?mc_eid=
Disallow: /*&mc_eid=
Disallow: /*?msclkid=
Disallow: /*&msclkid=
Disallow: /*?partner=
Disallow: /*&partner=
Disallow: /*?pk_campaign=
Disallow: /*&pk_campaign=
Disallow: /*?pk_source=
Disallow: /*&pk_source=
Disallow: /*?source=
Disallow: /*&source=
Disallow: /*?utm_
Disallow: /*&utm_
Disallow: /*?yclid=
Disallow: /*&yclid=

# Development and build artifacts: never meant to be indexed.
# "/@" and "/__vite_ping" have no trailing slash or $ anchor, so they match
# every path that begins with those characters.
Disallow: /@
Disallow: /__vite_ping
Disallow: /.vite/
Disallow: /node_modules/
Disallow: /src/

# Allow critical static assets (JS, CSS, images, fonts) so page-rendering
# resources stay crawlable even if a future Disallow accidentally overlaps
# a CDN path.
#
# NOTE(review): these Allows are not inert next to the Disallows above.
# Under longest-match precedence (RFC 9309; ties resolve to Allow), an
# extension rule beats any shorter Disallow. "/*.js$" is 6 characters and
# "/api/" and "/src/" are 5, so /api/x.js and /src/x.js are crawlable.
# Confirm that is intended; do not rely on the /api/ or /src/ blocks for
# URLs ending in one of these extensions.
# The $ anchor also means the extension patterns do not match an asset URL
# that carries a query string (e.g. /x.js?v=2); only the /assets/ and
# /images/ prefixes cover those.
Allow: /assets/
Allow: /images/
Allow: /*.js$
Allow: /*.css$
Allow: /*.png$
Allow: /*.jpg$
Allow: /*.jpeg$
Allow: /*.webp$
Allow: /*.avif$
Allow: /*.svg$
Allow: /*.woff2$
Allow: /*.ico$

# ─── AI / LLM Crawlers ────────────────────────────────────
# Allow our content to be cited by AI search products (ChatGPT, Perplexity,
# Gemini Live) — citations are a growing referral source. Block raw training
# scrapers that don't drive traffic. Adjust per business decision.
#
# The allowed crawlers listed here deliberately have NO group of their own.
# A crawler that finds a group naming it obeys only that group and skips
# "User-agent: *", so a per-bot "Allow: /" would lift every exclusion for
# it: /admin/, /api/, /search and all parameter URLs. With no dedicated
# group these bots fall back to "*": crawlable by default, with the same
# exclusions as every other crawler.
#
# Allowed through the * group (do not add "User-agent:" groups for these
# unless the full * rule set is repeated inside them):
#   GPTBot, ChatGPT-User, OAI-SearchBot
#   PerplexityBot, Perplexity-User
#   Google-Extended, GoogleOther
#   anthropic-ai, ClaudeBot, Claude-Web, Claude-User
#   Applebot-Extended, Applebot
#   Amazonbot
#   Meta-ExternalAgent, FacebookBot
#   cohere-ai
#   DuckAssistBot
#   YouBot

# Fully blocked: scrapers and resellers that take content without sending
# traffic back.
#   CCBot       Common Crawl; feeds many LLMs without sending traffic.
#   Bytespider  TikTok/ByteDance; aggressive scraping, no return traffic.
#   Diffbot     Scrape-as-a-service for competitors. We publish to LLMs
#               directly via /llms.txt + /openapi.json + sitemaps instead,
#               so this proxy layer is blocked.
#   Omgilibot / Omgili / webzio-extended
#               webz.io content aggregation reseller, no return.
# One group with several User-agent lines: the single rule applies to each.
User-agent: CCBot
User-agent: Bytespider
User-agent: Diffbot
User-agent: Omgilibot
User-agent: Omgili
User-agent: webzio-extended
Disallow: /

# ─── SEO Tool Bots (Rate Limited) ─────────────────────────
# Crawl-delay is honored by these bots (Google/Bing ignore it, so we don't
# set it for them). Slows down third-party crawl that drives no traffic.
#
# A bot that matches a group naming it does not also read "User-agent: *".
# A group holding only Crawl-delay would therefore leave the bot with no
# path rules at all, so the private and search exclusions are repeated in
# each group below. Update them whenever those routes change in the *
# group. Parameter and dev-artifact rules are not repeated here.
# Form per route: /path$ exact, /path/ subtree, /path? any query string.

# 5-second delay.
User-agent: AhrefsBot
User-agent: SemrushBot
Crawl-delay: 5
Disallow: /admin$
Disallow: /admin/
Disallow: /admin?
Disallow: /api/
Disallow: /login$
Disallow: /login?
Disallow: /register$
Disallow: /register?
Disallow: /profile$
Disallow: /profile/
Disallow: /profile?
Disallow: /search$
Disallow: /search/
Disallow: /search?

# 10-second delay.
User-agent: MJ12bot
User-agent: DotBot
User-agent: BLEXBot
Crawl-delay: 10
Disallow: /admin$
Disallow: /admin/
Disallow: /admin?
Disallow: /api/
Disallow: /login$
Disallow: /login?
Disallow: /register$
Disallow: /register?
Disallow: /profile$
Disallow: /profile/
Disallow: /profile?
Disallow: /search$
Disallow: /search/
Disallow: /search?

# ─── Sitemaps ─────────────────────────────────────────────
# Sitemap-index — Google/Bing discover all sub-sitemaps from this entry.
# Sitemap is a file-level directive: it belongs to no User-agent group, so
# it applies whichever group a crawler matches. The URL must be absolute.
Sitemap: https://uscp.io/sitemap.xml