diff --git a/docs/changelog/148665.yaml b/docs/changelog/148665.yaml new file mode 100644 index 0000000000000..e283a215838fc --- /dev/null +++ b/docs/changelog/148665.yaml @@ -0,0 +1,5 @@ +area: Ingest Node +issues: [] +pr: 148665 +summary: Add AI bots to user-agent regex +type: enhancement diff --git a/modules/user-agent/src/main/resources/regexes.yml b/modules/user-agent/src/main/resources/regexes.yml index ac309ae7ab5b6..525a1d5c50202 100644 --- a/modules/user-agent/src/main/resources/regexes.yml +++ b/modules/user-agent/src/main/resources/regexes.yml @@ -161,6 +161,42 @@ user_agent_parsers: - regex: '(Pinterest(?:bot|))/(\d+)(?:\.(\d+)|)(?:\.(\d+)|)[;\s(]+\+https://www.pinterest.com/bot.html' family_replacement: 'Pinterestbot' + # AI crawlers — named explicitly here (before the generic bot/spider/crawl matchers) because + # these tokens either: + # (a) carry a +https://... URL suffix that the generic matchers mis-read as the bot name + # (e.g. ChatGPT-User -> "com/bot"; meta-externalagent -> "crawler"; cohere-ai -> "cohere-crawler") + # (b) contain no bot/spider/crawl keyword at all, so the generic matchers return "Other" + # (e.g. MistralAI-User, Perplexity-User, Claude-User, meta-externalfetcher, Claude-Web) + # + # Bot inventory cross-referenced against: + # https://developers.cloudflare.com/ai-crawl-control/reference/bots/ + # https://platform.openai.com/docs/bots + # https://support.claude.com/en/articles/8896518 + # https://docs.perplexity.ai/guides/bots + + # OpenAI — https://platform.openai.com/docs/bots + - regex: '(ChatGPT-User)/(\d+)(?:\.(\d+)|)' + - regex: '(OAI-SearchBot)/(\d+)(?:\.(\d+)|)' + + # Anthropic — https://support.claude.com/en/articles/8896518 + # Claude-Web is the legacy UA token (deprecated 2024); kept for log compatibility. 
+ - regex: '(Claude-User)/(\d+)(?:\.(\d+)|)' + - regex: '(Claude-Web)/(\d+)(?:\.(\d+)|)' + + # Perplexity — https://docs.perplexity.ai/guides/bots + - regex: '(Perplexity-User)/(\d+)(?:\.(\d+)|)' + + # Mistral AI — https://docs.mistral.ai/robots + - regex: '(MistralAI-User)/(\d+)(?:\.(\d+)|)' + + # Meta — https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/ + # meta-externalagent carries a URL with "/crawler" that triggers the generic crawl catcher. + - regex: '(meta-externalagent)/(\d+)(?:\.(\d+)|)' + - regex: '(meta-externalfetcher)/(\d+)(?:\.(\d+)|)' + + # Cohere — the UA suffix /cohere-crawler triggers the generic crawl catcher. + - regex: '(cohere-ai)/(\d+)(?:\.(\d+)|)' + # Bots - regex: '(CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Job Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends Crawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardReader Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion 
Crawler|InternetArchive|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGator[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tiny Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg|ArcGIS Hub Indexer)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+)|)|)|)' @@ -195,7 +231,9 @@ user_agent_parsers: # Bots General matcher 'name/0.0' - regex: '(?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)|)|)' # Bots containing bot(but not CUBOT) - - regex: '((?:[A-Za-z][A-Za-z0-9 -]{0,50}|)[^C][^Uu][Bb]ot)\b(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|)' + # The (?Each case in {@code test-bot-agents.yml} specifies a {@code ua} string together with the + * expected {@code name} and (optional) {@code version} produced by the parser. This keeps the + * pattern regression list separate from Java source code and makes it straightforward to add + * new cases without modifying test logic. 
+ */ + public void testBotAgents() throws IOException { + List<Map<String, String>> cases = parseBotTestCases(); + assertThat("test-bot-agents.yml must not be empty", cases.isEmpty(), equalTo(false)); + + for (Map<String, String> tc : cases) { + String ua = tc.get("ua"); + String expectedName = tc.get("name"); + String expectedVersion = tc.get("version"); + + Details details = parser.parseUserAgentInfo(ua, true); + + assertThat("name mismatch for UA: " + ua, details.name(), equalTo(expectedName)); + if (expectedVersion == null) { + assertThat("version should be null for UA: " + ua, details.version(), nullValue()); + } else { + assertThat("version mismatch for UA: " + ua, details.version(), equalTo(expectedVersion)); + } + } + } + + private static List<Map<String, String>> parseBotTestCases() throws IOException { + InputStream stream = UserAgentParserImplTests.class.getResourceAsStream("/test-bot-agents.yml"); + assertNotNull("test-bot-agents.yml resource not found", stream); + + List<Map<String, String>> cases = new ArrayList<>(); + try (XContentParser yaml = XContentFactory.xContent(XContentType.YAML).createParser(XContentParserConfiguration.EMPTY, stream)) { + // Top-level object: { bot_test_cases: [ { ua, name, version? }, ... ] } + yaml.nextToken(); // START_OBJECT + while (yaml.nextToken() != XContentParser.Token.END_OBJECT) { + if (yaml.currentToken() == XContentParser.Token.FIELD_NAME && "bot_test_cases".equals(yaml.currentName())) { + yaml.nextToken(); // START_ARRAY + while (yaml.nextToken() != XContentParser.Token.END_ARRAY) { + cases.add(yaml.mapStrings()); + } + } else { + yaml.nextToken(); + yaml.skipChildren(); + } + } + } + return cases; + } } diff --git a/modules/user-agent/src/test/resources/test-bot-agents.yml b/modules/user-agent/src/test/resources/test-bot-agents.yml new file mode 100644 index 0000000000000..7fdeb89b48735 --- /dev/null +++ b/modules/user-agent/src/test/resources/test-bot-agents.yml @@ -0,0 +1,162 @@ +# Test cases for bot / AI-crawler user-agent parsing. 
+# +# Each entry has: +# ua - the full User-Agent string sent by the crawler +# name - expected details.name() from the parser (omit key when null is expected) +# version - expected details.version() from the parser (omit key when null is expected) +# +# Groups: +# 1. URL-suffix false positives: UA strings that embed a +https://... URL whose path +# contains a bot/spider/crawl token that the generic matchers could mis-read as the name. +# 2. No-keyword bots: UA strings whose bot name contains no bot/spider/crawl keyword, +# so they would fall through all generic patterns if not explicitly listed. +# 3. Explicitly-named AI crawlers: dedicated regex entries in regexes.yml. +# 4. Generic-matcher coverage: bots matched by the Bots General or named-alternation patterns. +# 5. Pinned edge cases: bots whose parsing behaviour is non-obvious and worth locking down. + +bot_test_cases: + + # ------------------------------------------------------------------- + # 1. URL-suffix false positives + # ------------------------------------------------------------------- + + # +https://openai.com/bot — the /bot path suffix is a generic-matcher target. + - ua: "Mozilla/5.0 AppleWebkit/537.36 (KHTML, Like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot" + name: "ChatGPT-User" + version: "1.0" + + # +https://.../webmasters/crawler — the /crawler suffix triggers the generic crawl catcher. + - ua: "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)" + name: "meta-externalagent" + version: "1.1" + + # +https://.../cohere-crawler — the /cohere-crawler suffix triggers the generic crawl catcher. + - ua: "cohere-ai/1.0; +https://docs.cohere.com/docs/cohere-crawler" + name: "cohere-ai" + version: "1.0" + + # ------------------------------------------------------------------- + # 2. 
No-keyword bots + # ------------------------------------------------------------------- + + - ua: "Mozilla/5.0 AppleWebkit/537.36 (KHTML, Like Gecko); compatible; MistralAI-User/1.0; +https://docs.mistral.ai/robots" + name: "MistralAI-User" + version: "1.0" + + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Claude-User/1.0; +Claude-User@anthropic.com)" + name: "Claude-User" + version: "1.0" + + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)" + name: "Perplexity-User" + version: "1.0" + + - ua: "meta-externalfetcher/1.0" + name: "meta-externalfetcher" + version: "1.0" + + # ------------------------------------------------------------------- + # 3. Explicitly-named AI crawlers + # ------------------------------------------------------------------- + + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot" + name: "OAI-SearchBot" + version: "1.0" + + # Claude-Web is the legacy Anthropic UA token (deprecated 2024), kept for log compatibility. + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Claude-Web/1.0; +Claude-Web@anthropic.com)" + name: "Claude-Web" + version: "1.0" + + # ------------------------------------------------------------------- + # 4. 
Generic-matcher coverage + # ------------------------------------------------------------------- + + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot" + name: "GPTBot" + version: "1.1" + + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)" + name: "ClaudeBot" + version: "1.0" + + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)" + name: "PerplexityBot" + version: "1.0" + + - ua: "Mozilla/5.0 (compatible; DuckAssistBot/1.0; +https://duckduckgo.com/duckassistbot)" + name: "DuckAssistBot" + version: "1.0" + + - ua: "Mozilla/5.0 (compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)" + name: "Amazonbot" + version: "0.1" + + - ua: "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)" + name: "AhrefsBot" + version: "7.0" + + - ua: "CCBot/2.0 (https://commoncrawl.org/faq/)" + name: "CCBot" + version: "2.0" + + - ua: "Mozilla/5.0 (compatible; Applebot/0.1; +http://www.apple.com/go/applebot)" + name: "Applebot" + version: "0.1" + + - ua: "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider/1.0; spider-feedback@bytedance.com)" + name: "Bytespider" + version: "1.0" + + - ua: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" + name: "Googlebot" + version: "2.1" + + - ua: "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" + name: "bingbot" + version: "2.0" + + - ua: "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" + name: "YandexBot" + version: "3.0" + + - ua: "Twitterbot/1.0" + name: "Twitterbot" + version: "1.0" + + # SemrushBot/7~bl — the ~bl suffix is not captured; version is just the leading digit. 
+ - ua: "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)" + name: "SemrushBot" + version: "7" + + - ua: "LinkedInBot/1.0 (compatible; Mozilla/5.0; Apache-HttpClient/4.1.1 +http://www.linkedin.com)" + name: "LinkedInBot" + version: "1.0" + + # The alternation captures the full "Yahoo! Slurp" prefix, not just "Slurp". + - ua: "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)" + name: "Yahoo! Slurp" + + - ua: "Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)" + name: "MJ12bot" + version: "1.4.8" + + - ua: "ia_archiver (+http://www.alexa.com/site/help/webmaster;crawler@alexa.com)" + name: "ia_archiver" + + - ua: "Mozilla/5.0 (compatible; Pinterestbot/1.0; +https://www.pinterest.com/bot.html)" + name: "Pinterestbot" + version: "1.0" + + # ------------------------------------------------------------------- + # 5. Pinned edge cases + # ------------------------------------------------------------------- + + # Google-Extended is Google's AI-training opt-out product token; it contains no recognisable + # bot keyword and no version number, so the parser returns null for both name and version. + - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; Google-Extended" + + # Meta's link-preview fetcher; has an explicit entry with family_replacement: 'FacebookBot'. + - ua: "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)" + name: "FacebookBot" + version: "1.1"