# robots.txt for SponsorshipJobs (https://sponsorshipjobs.io)
#
# How groups are applied (RFC 9309): a crawler obeys ONLY the most specific
# group that names it, and ignores `User-agent: *` as soon as it has a group
# of its own. A named group containing just `Allow: /` therefore opens up
# /admin, /api/, /dashboard, etc. to that crawler. To avoid that, every
# crawler we name is listed in a group that carries the private-path
# Disallow rules.
#
# Formatting rules for this file:
#   * One directive per line.
#   * No blank lines inside a group (legacy parsers end the group at a blank
#     line) — use a lone `#` as a visual separator instead.

# ============================================================================
# DEFAULT GROUP — every crawler, plus the crawlers we explicitly welcome.
# ============================================================================
User-agent: *
#
# Accessibility and social networking bots
User-agent: Google-Read-Aloud
User-agent: LinkedInBot
#
# ----------------------------------------------------------------------------
# AI / LLM CRAWLERS — ALL ALLOWED (public content).
# SponsorshipJobs WANTS to be cited by ChatGPT, Claude, Perplexity, Gemini,
# Copilot, Meta AI, DuckAssist, and any other assistant that surfaces the
# UK Home Office sponsor register. Don't block these, they are our
# distribution. See /llms.txt for a machine-readable summary.
# They share the private-path Disallow rules below with everyone else.
# ----------------------------------------------------------------------------
#
# OpenAI — training crawler + real-time browsing crawler
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
#
# Anthropic — Claude training + real-time fetch
User-agent: anthropic-ai
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: Claude-User
User-agent: Claude-SearchBot
#
# Perplexity
User-agent: PerplexityBot
User-agent: Perplexity-User
#
# Google AI (AI Overviews, Bard/Gemini training)
User-agent: Google-Extended
User-agent: Gemini-Google
User-agent: GoogleOther
#
# Microsoft Copilot / Bing Chat
# (User-agent matching is case-insensitive, so one Bingbot line is enough.)
User-agent: Bingbot
User-agent: msnbot
#
# Apple Intelligence / Siri
User-agent: Applebot
User-agent: Applebot-Extended
#
# Meta AI
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
#
# Amazon (Alexa, Rufus)
User-agent: Amazonbot
#
# DuckDuckGo Assist
User-agent: DuckAssistBot
#
# Cohere
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
#
# You.com
User-agent: YouBot
#
# Common Crawl — feeds open-source LLM training (Llama, Mistral, Falcon etc).
# Blocking this cuts us out of half the open-model ecosystem.
User-agent: CCBot
#
# Diffbot (used by Bard, Bing, and enterprise AI)
User-agent: Diffbot
#
# TikTok/ByteDance AI
User-agent: Bytespider
#
# Mistral
User-agent: MistralAI-User
#
# Anchored Disallow rules — block only the exact path + its subpaths.
# Without `$` and the trailing slash, `/admin` would also match
# `/admin-anything`.
# Disallows are listed BEFORE the Allows: longest-match parsers (RFC 9309,
# Google) ignore rule order, but first-match parsers stop at the first rule
# that matches, so a leading `Allow: /` would switch every Disallow off.
Disallow: /dashboard$
Disallow: /dashboard/
Disallow: /onboarding$
Disallow: /onboarding/
Disallow: /stripe-test$
Disallow: /stripe-test/
Disallow: /login$
Disallow: /login/
Disallow: /signup$
Disallow: /signup/
Disallow: /forgot-password$
Disallow: /forgot-password/
Disallow: /reset-password$
Disallow: /reset-password/
Disallow: /api/
Disallow: /admin$
Disallow: /admin/
Disallow: /cv-editor/
Disallow: /candidate-profile$
Disallow: /candidate-profile/
# Session / token URLs, whether the parameter comes first (`?`) or later (`&`).
Disallow: /*?session=
Disallow: /*&session=
Disallow: /*?token=
Disallow: /*&token=
#
# Defensive Allow rules — make absolutely sure programmatic content is
# indexable. None of these overlap a path-based Disallow above.
# NOTE(review): under longest-match, an Allow path longer than `/*?session=`
# (e.g. /sponsor-licence-checker) outranks it, so `?session=` / `?token=`
# URLs under those paths stay crawlable — confirm that is acceptable.
Allow: /sponsor-uk/
Allow: /uk-jobs/
Allow: /uk-visa-nationals/
Allow: /sponsor/
Allow: /role/
Allow: /city/
Allow: /guide/
Allow: /uk-guide/
Allow: /industry/
Allow: /usa/
Allow: /canada/
Allow: /australia/
Allow: /sponsor-licence-checker
Allow: /sponsor-licence-revocations
Allow: /uk-sponsor-licence-atlas
Allow: /uk-immigration-glossary
Allow: /uk-visa-fees-calculator
Allow: /visa-salary-thresholds
Allow: /skilled-worker-visa-salary-checker
# Catch-all goes last so first-match parsers reach the Disallows first.
Allow: /

# ============================================================================
# Rate limiting for heavy SEO bots.
# These bots match this group INSTEAD of the default group, so the
# private-path Disallows are repeated here. Keep this list in sync with the
# Disallow list in the default group above.
# ============================================================================
User-agent: SemrushBot
User-agent: AhrefsBot
User-agent: MJ12bot
Crawl-delay: 10
Disallow: /dashboard$
Disallow: /dashboard/
Disallow: /onboarding$
Disallow: /onboarding/
Disallow: /stripe-test$
Disallow: /stripe-test/
Disallow: /login$
Disallow: /login/
Disallow: /signup$
Disallow: /signup/
Disallow: /forgot-password$
Disallow: /forgot-password/
Disallow: /reset-password$
Disallow: /reset-password/
Disallow: /api/
Disallow: /admin$
Disallow: /admin/
Disallow: /cv-editor/
Disallow: /candidate-profile$
Disallow: /candidate-profile/
Disallow: /*?session=
Disallow: /*&session=
Disallow: /*?token=
Disallow: /*&token=
Allow: /

Sitemap: https://sponsorshipjobs.io/sitemap.xml

# AI / LLM reference files — machine-readable summaries for assistants.
# Summary: https://sponsorshipjobs.io/llms.txt
# Full text: https://sponsorshipjobs.io/llms-full.txt

# IndexNow verification (Bing, Yandex, Seznam, Naver)
# Key file: https://sponsorshipjobs.io/3a7a6f2ad57fb7deaa0c9157c50410bc.txt