# As a condition of accessing this website, you agree to abide by the following
# content signals:

# (a) If a content-signal = yes, you may collect content for the corresponding
#     use.
# (b) If a content-signal = no, you may not collect content for the
#     corresponding use.
# (c) If the website operator does not include a content signal for a
#     corresponding use, the website operator neither grants nor restricts
#     permission via content signal with respect to the corresponding use.

# The content signals and their meanings are:

# search:   building a search index and providing search results (e.g., returning
#           hyperlinks and short excerpts from your website's contents). Search does not
#           include providing AI-generated search summaries.
# ai-input: inputting content into one or more AI models (e.g., retrieval
#           augmented generation, grounding, or other real-time taking of content for
#           generative AI search answers).
# ai-train: training or fine-tuning AI models.

# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS RESERVATIONS OF
# RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN UNION DIRECTIVE 2019/790 ON COPYRIGHT
# AND RELATED RIGHTS IN THE DIGITAL SINGLE MARKET.

# BEGIN Cloudflare Managed content

User-Agent: *
Content-signal: search=yes,ai-train=no
Allow: /

User-agent: Amazonbot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: meta-externalagent
Disallow: /

# END Cloudflare Managed Content

# ==============================================
# Typecho-specific robots.txt (tailored to this site's URL structure)
# Goal: keep feeds crawlable, steer crawlers toward posts and categories,
# and keep low-value content out of the index.
#
# Every directive sits on its own line and every comment on a line of its
# own: a "#" comments out the rest of the physical line, so records must
# never share a line with commentary that precedes them.
# ==============================================

User-agent: *

# 1. Core paths that may be crawled (matched to this site's URL structure)
# Home page
Allow: /
# Post pages (/archives/123/)
Allow: /archives/*/
# Category pages (/category/tech/)
Allow: /category/*/
# Tag pages (/tag/seo/)
Allow: /tag/*/
# Standalone pages (/about.html, /contact.html)
Allow: /*.html
# Site-wide feed (lets crawlers discover new content)
Allow: /feed/
# Per-category feeds
Allow: /category/*/feed/
# Per-tag feeds
Allow: /tag/*/feed/
# Site icon
Allow: /favicon.ico
# This file
Allow: /robots.txt

# 2. Low-value / duplicate content to keep out
# Admin back end (nothing worth indexing)
Disallow: /admin/
# Installer script
Disallow: /install.php
# Typecho "usr" directory.
# NOTE(review): Typecho normally keeps themes and uploads under /usr/
# (CSS, JS, images); blocking it may hinder page rendering and image
# indexing. Confirm this is intended before relying on it.
Disallow: /usr/
# Cache directory
Disallow: /var/
# Search result pages (dynamic, duplicate content)
Disallow: /search?
# Paginated comments (/archives/123/comment-page-1/).
# The pattern is anchored under /archives/ so it actually matches those
# URLs, and it is longer than "Allow: /archives/*/", so under
# longest-match precedence the Disallow wins.
Disallow: /archives/*/comment-page-
# Legacy dynamic URLs (URL rewriting is enabled, so block these)
Disallow: /index.php?

# 3. Crawl delay to protect the server.
# Crawl-delay is a non-standard extension; crawlers that do not support it
# simply ignore it.
Crawl-delay: 2

# ==============================================
# 4. Per-search-engine groups
#
# A crawler obeys ONLY the most specific group that matches its name and
# does not fall back to "User-agent: *". Each group below therefore
# repeats the full Disallow list; without it, these crawlers would be free
# to fetch /admin/, /install.php, /var/, and so on.
# ==============================================

User-agent: Googlebot
# Google does not support Crawl-delay and ignores this line; kept only to
# document the intended priority.
Crawl-delay: 1
# Let Google discover new posts quickly through the feeds
Allow: /feed/
Allow: /category/*/feed/
Allow: /tag/*/feed/
Disallow: /admin/
Disallow: /install.php
Disallow: /usr/
Disallow: /var/
Disallow: /search?
Disallow: /archives/*/comment-page-
Disallow: /index.php?

User-agent: Baiduspider
# Baidu crawls frequently, so throttle it a little
Crawl-delay: 3
# Point Baidu at the feeds to encourage indexing
Allow: /feed/
Allow: /category/*/feed/
Disallow: /admin/
Disallow: /install.php
Disallow: /usr/
Disallow: /var/
Disallow: /search?
Disallow: /archives/*/comment-page-
Disallow: /index.php?

# ==============================================
# 5. Sitemap (must match the path produced by the Typecho sitemap plugin)
# ==============================================
# Main sitemap (posts + categories + pages)
Sitemap: https://www.nanblog.ink/sitemap.xml