# As a condition of accessing this website, you agree to abide by the following
# content signals:

# (a)  If a content-signal = yes, you may collect content for the corresponding
#      use.
# (b)  If a content-signal = no, you may not collect content for the
#      corresponding use.
# (c)  If the website operator does not include a content signal for a
#      corresponding use, the website operator neither grants nor restricts
#      permission via content signal with respect to the corresponding use.

# The content signals and their meanings are:

# search:   building a search index and providing search results (e.g., returning
#           hyperlinks and short excerpts from your website's contents). Search does not
#           include providing AI-generated search summaries.
# ai-input: inputting content into one or more AI models (e.g., retrieval
#           augmented generation, grounding, or other real-time taking of content for
#           generative AI search answers).
# ai-train: training or fine-tuning AI models.

# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS RESERVATIONS OF
# RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN UNION DIRECTIVE 2019/790 ON COPYRIGHT
# AND RELATED RIGHTS IN THE DIGITAL SINGLE MARKET.

# BEGIN Cloudflare Managed content

# Default group for crawlers that do not match a more specific named group:
# search indexing is signalled as permitted, AI training as not permitted, and
# no ai-input signal is given. At the path level the whole site is allowed.
# This file contains a second `User-agent: *` group further down; parsers that
# follow RFC 9309 combine the rules of both groups.
User-Agent: *
Content-signal: search=yes,ai-train=no
Allow: /

# Each crawler named below is denied the entire site.
User-agent: Amazonbot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: meta-externalagent
Disallow: /

# END Cloudflare Managed Content

# ==============================================
# Typecho-specific robots.txt rules (tailored to this site's URL structure)
# Goal: keep feeds crawlable, steer crawlers toward posts and category pages,
# and keep them away from low-value or duplicate content.
# ==============================================

User-agent: *
# Blank lines inside this group are written as bare "#" lines because some
# legacy parsers treat an empty line as the end of a group.
#
# 1. Core paths that may be crawled
Allow: /                              # Home page
Allow: /archives/*/                   # Post pages (/archives/123/)
Allow: /category/*/                   # Category pages (/category/tech/)
Allow: /tag/*/                        # Tag pages (/tag/seo/)
Allow: /*.html                        # Standalone pages (/about.html, /contact.html)
Allow: /feed/                         # Site-wide feed (lets crawlers discover new content)
Allow: /category/*/feed/              # Per-category feeds
Allow: /tag/*/feed/                   # Per-tag feeds
Allow: /favicon.ico                   # Site icon
Allow: /robots.txt                    # This file
#
# 2. Low-value or duplicate content to keep out of the crawl
Disallow: /admin/                     # Admin backend (nothing worth indexing)
Disallow: /install.php                # Installer script
# NOTE(review): in a stock Typecho layout /usr/ appears to hold themes, plugins
# and uploads, so this rule may also block theme CSS/JS and uploaded images.
# Confirm that is intended.
Disallow: /usr/
Disallow: /var/                       # Internal/cache directory
# NOTE(review): this only matches query-string searches (/search?...). If the
# site serves path-style search URLs (/search/keyword/), they are not covered.
Disallow: /search?                    # Search result pages (dynamic, duplicate content)
# Patterns are anchored at the start of the URL path, so a leading wildcard is
# required to match comment pages nested under a post or page. The longer
# pattern also takes precedence over "Allow: /archives/*/" and "Allow: /*.html".
Disallow: /*/comment-page-            # Paginated comments (/archives/123/comment-page-1/)
Disallow: /index.php?                 # Legacy dynamic URLs (URL rewriting is enabled)
#
# 3. Crawl delay in seconds. Non-standard directive (not part of RFC 9309);
#    crawlers that do not support it, such as Googlebot, ignore it.
Crawl-delay: 2

# ==============================================
# 4. Search-engine-specific groups
# A crawler that matches a named group follows ONLY that group and ignores the
# "User-agent: *" rules, so the exclusions must be repeated in every named
# group or they do not apply to that crawler.
# ==============================================
User-agent: Googlebot
Crawl-delay: 1                        # Non-standard; Googlebot ignores this directive
Allow: /feed/                         # Let Google discover new posts through the feed
Allow: /category/*/feed/
Allow: /tag/*/feed/
Disallow: /admin/                     # Admin backend
Disallow: /install.php                # Installer script
# NOTE(review): mirrors the "*" group. In a stock Typecho layout /usr/ appears
# to hold themes and uploads, so this may block CSS/JS and images that Google
# uses for rendering. Confirm that is intended.
Disallow: /usr/
Disallow: /var/                       # Internal/cache directory
Disallow: /search?                    # Search result pages
Disallow: /*/comment-page-            # Paginated comments (/archives/123/comment-page-1/)
Disallow: /index.php?                 # Legacy dynamic URLs

# Baiduspider matches this named group and therefore ignores the
# "User-agent: *" rules; the exclusions are repeated here so they still apply.
User-agent: Baiduspider
# NOTE(review): Crawl-delay is non-standard and support varies by crawler;
# verify that Baiduspider still honors it.
Crawl-delay: 3                        # Throttle Baidu's comparatively frequent crawling
Allow: /feed/                         # Let Baidu discover new posts through the feed
Allow: /category/*/feed/
Disallow: /admin/                     # Admin backend
Disallow: /install.php                # Installer script
# NOTE(review): mirrors the "*" group; /usr/ may also contain theme assets and
# uploads. Confirm that blocking it is intended.
Disallow: /usr/
Disallow: /var/                       # Internal/cache directory
Disallow: /search?                    # Search result pages
Disallow: /*/comment-page-            # Paginated comments (/archives/123/comment-page-1/)
Disallow: /index.php?                 # Legacy dynamic URLs

# ==============================================
# 5. Sitemap location (must match the path generated by the Typecho sitemap plugin)
# ==============================================
Sitemap: https://www.nanblog.ink/sitemap.xml          # Main sitemap (posts, categories, pages)