elastic · mhl-b · May 14, 2026 · May 9, 2026 · May 9, 2026
diff --git a/docs/changelog/148665.yaml b/docs/changelog/148665.yaml
@@ -0,0 +1,5 @@
+area: Ingest Node
+issues: []
+pr: 148665
+summary: Add AI bots to user-agent regex
+type: enhancement
diff --git a/modules/user-agent/src/main/resources/regexes.yml b/modules/user-agent/src/main/resources/regexes.yml
@@ -161,6 +161,42 @@ user_agent_parsers:
   - regex: '(Pinterest(?:bot|))/(\d+)(?:\.(\d+)|)(?:\.(\d+)|)[;\s(]+\+https://www.pinterest.com/bot.html'
     family_replacement: 'Pinterestbot'
 
+  # AI crawlers — named explicitly here (before the generic bot/spider/crawl matchers) because
+  # these tokens either:
+  #   (a) carry a +https://... URL suffix that the generic matchers mis-read as the bot name
+  #       (e.g. ChatGPT-User -> "com/bot"; meta-externalagent -> "crawler"; cohere-ai -> "cohere-crawler")
+  #   (b) contain no bot/spider/crawl keyword at all, so the generic matchers return "Other"
+  #       (e.g. MistralAI-User, Perplexity-User, Claude-User, meta-externalfetcher, Claude-Web)
+  #
+  # Bot inventory cross-referenced against:
+  #   https://developers.cloudflare.com/ai-crawl-control/reference/bots/
+  #   https://platform.openai.com/docs/bots
+  #   https://support.claude.com/en/articles/8896518
+  #   https://docs.perplexity.ai/guides/bots
+
+  # OpenAI — https://platform.openai.com/docs/bots
+  - regex: '(ChatGPT-User)/(\d+)(?:\.(\d+)|)'
+  - regex: '(OAI-SearchBot)/(\d+)(?:\.(\d+)|)'
+
+  # Anthropic — https://support.claude.com/en/articles/8896518
+  # Claude-Web is the legacy UA token (deprecated 2024); kept for log compatibility.
+  - regex: '(Claude-User)/(\d+)(?:\.(\d+)|)'
+  - regex: '(Claude-Web)/(\d+)(?:\.(\d+)|)'
+
+  # Perplexity — https://docs.perplexity.ai/guides/bots
+  - regex: '(Perplexity-User)/(\d+)(?:\.(\d+)|)'
+
+  # Mistral AI — https://docs.mistral.ai/robots
+  - regex: '(MistralAI-User)/(\d+)(?:\.(\d+)|)'
+
+  # Meta — https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/
+  # meta-externalagent carries a URL with "/crawler" that triggers the generic crawl catcher.
+  - regex: '(meta-externalagent)/(\d+)(?:\.(\d+)|)'
+  - regex: '(meta-externalfetcher)/(\d+)(?:\.(\d+)|)'
+
+  # Cohere — the UA suffix /cohere-crawler triggers the generic crawl catcher.
+  - regex: '(cohere-ai)/(\d+)(?:\.(\d+)|)'
+
   # Bots
   - regex: '(CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Job Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends Crawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardReader Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGator[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tiny Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg|ArcGIS Hub Indexer)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+)|)|)|)'
 
@@ -195,7 +231,9 @@ user_agent_parsers:
   # Bots General matcher 'name/0.0'
   - regex: '(?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)|)|)'
   # Bots containing bot(but not CUBOT)
-  - regex: '((?:[A-Za-z][A-Za-z0-9 -]{0,50}|)[^C][^Uu][Bb]ot)\b(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|)'
+  # The (?<![./]) lookbehind applies to the two characters before 'bot', rejecting them when they
+  # directly follow '.' or '/'. NOTE(review): 'com/bot' in '+https://openai.com/bot' still matches.
+  - regex: '((?:[A-Za-z][A-Za-z0-9 -]{0,50}|)(?<![./])[^C][^Uu][Bb]ot)\b(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|)'
   # Bots containing spider|scrape|Crawl
   - regex: '((?:[A-z0-9]{1,50}|[A-z\-]{1,50} ?|)(?: the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|)'
 

diff --git a/modules/user-agent/src/test/java/org/elasticsearch/useragent/UserAgentParserImplTests.java b/modules/user-agent/src/test/java/org/elasticsearch/useragent/UserAgentParserImplTests.java
@@ -12,11 +12,19 @@
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.useragent.api.Details;
+import org.elasticsearch.xcontent.XContentFactory;
+import org.elasticsearch.xcontent.XContentParser;
+import org.elasticsearch.xcontent.XContentParserConfiguration;
+import org.elasticsearch.xcontent.XContentType;
 import org.junit.BeforeClass;
 
 import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
 
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.notNullValue;
@@ -351,4 +359,55 @@ public void testPatternsLoadedFromRegexFile() {
         assertThat(parser.getOsPatterns().isEmpty(), equalTo(false));
         assertThat(parser.getDevicePatterns().isEmpty(), equalTo(false));
     }
+
+    /**
+     * Data-driven regression and spot-check suite for bot / AI-crawler user-agent strings.
+     *
+     * <p>Each case in {@code test-bot-agents.yml} specifies a {@code ua} string together with the
+     * expected {@code name} and {@code version} produced by the parser; either key may be omitted
+     * when {@code null} is expected. This keeps the pattern regression list separate from Java
+     * source code and makes it straightforward to add new cases without modifying test logic.
+     */
+    public void testBotAgents() throws IOException {
+        List<Map<String, String>> cases = parseBotTestCases();
+        assertThat("test-bot-agents.yml must not be empty", cases.isEmpty(), equalTo(false));
+
+        for (Map<String, String> tc : cases) {
+            String ua = tc.get("ua");
+            String expectedName = tc.get("name"); // null when the case omits "name"
+            String expectedVersion = tc.get("version"); // null when the case omits "version"
+
+            Details details = parser.parseUserAgentInfo(ua, true);
+
+            assertThat("name mismatch for UA: " + ua, details.name(), equalTo(expectedName));
+            if (expectedVersion == null) {
+                assertThat("version should be null for UA: " + ua, details.version(), nullValue());
+            } else {
+                assertThat("version mismatch for UA: " + ua, details.version(), equalTo(expectedVersion));
+            }
+        }
+    }
+
+    private static List<Map<String, String>> parseBotTestCases() throws IOException {
+        InputStream stream = UserAgentParserImplTests.class.getResourceAsStream("/test-bot-agents.yml");
+        assertNotNull("test-bot-agents.yml resource not found", stream);
+
+        List<Map<String, String>> cases = new ArrayList<>();
+        try (XContentParser yaml = XContentFactory.xContent(XContentType.YAML).createParser(XContentParserConfiguration.EMPTY, stream)) {
+            // Top-level object: { bot_test_cases: [ { ua, name?, version? }, ... ] }
+            yaml.nextToken(); // START_OBJECT
+            while (yaml.nextToken() != XContentParser.Token.END_OBJECT) {
+                if (yaml.currentToken() == XContentParser.Token.FIELD_NAME && "bot_test_cases".equals(yaml.currentName())) {
+                    yaml.nextToken(); // START_ARRAY
+                    while (yaml.nextToken() != XContentParser.Token.END_ARRAY) {
+                        cases.add(yaml.mapStrings()); // one { ua, name?, version? } entry per array element
+                    }
+                } else {
+                    yaml.nextToken(); // step from the unrelated field name onto its value
+                    yaml.skipChildren(); // skip any nested structure under that value
+                }
+            }
+        }
+        return cases;
+    }
 }
diff --git a/modules/user-agent/src/test/resources/test-bot-agents.yml b/modules/user-agent/src/test/resources/test-bot-agents.yml
@@ -0,0 +1,162 @@
+# Test cases for bot / AI-crawler user-agent parsing.
+#
+# Each entry has:
+#   ua      - the full User-Agent string sent by the crawler
+#   name    - expected details.name() from the parser (omit key when null is expected)
+#   version - expected details.version() from the parser (omit key when null is expected)
+#
+# Groups:
+#   1. URL-suffix false positives: UA strings that embed a +https://... URL whose path
+#      contains a bot/spider/crawl token that the generic matchers could mis-read as the name.
+#   2. No-keyword bots: UA strings whose bot name contains no bot/spider/crawl keyword,
+#      so they would fall through all generic patterns if not explicitly listed.
+#   3. Explicitly-named AI crawlers: dedicated regex entries in regexes.yml.
+#   4. Generic-matcher coverage: bots matched by the Bots General, named-alternation, or older dedicated patterns.
+#   5. Pinned edge cases: bots whose parsing behaviour is non-obvious and worth locking down.
+
+bot_test_cases:
+
+  # -------------------------------------------------------------------
+  # 1. URL-suffix false positives
+  # -------------------------------------------------------------------
+
+  # +https://openai.com/bot — the /bot path suffix is a generic-matcher target.
+  - ua: "Mozilla/5.0 AppleWebkit/537.36 (KHTML, Like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot"
+    name: "ChatGPT-User"
+    version: "1.0"
+
+  # +https://.../webmasters/crawler — the /crawler suffix triggers the generic crawl catcher.
+  - ua: "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)"
+    name: "meta-externalagent"
+    version: "1.1"
+
+  # +https://.../cohere-crawler — the /cohere-crawler suffix triggers the generic crawl catcher.
+  - ua: "cohere-ai/1.0; +https://docs.cohere.com/docs/cohere-crawler"
+    name: "cohere-ai"
+    version: "1.0"
+
+  # -------------------------------------------------------------------
+  # 2. No-keyword bots
+  # -------------------------------------------------------------------
+
+  - ua: "Mozilla/5.0 AppleWebkit/537.36 (KHTML, Like Gecko); compatible; MistralAI-User/1.0; +https://docs.mistral.ai/robots"
+    name: "MistralAI-User"
+    version: "1.0"
+
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Claude-User/1.0; +Claude-User@anthropic.com)"
+    name: "Claude-User"
+    version: "1.0"
+
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)"
+    name: "Perplexity-User"
+    version: "1.0"
+
+  - ua: "meta-externalfetcher/1.0"
+    name: "meta-externalfetcher"
+    version: "1.0"
+
+  # -------------------------------------------------------------------
+  # 3. Explicitly-named AI crawlers
+  # -------------------------------------------------------------------
+
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot"
+    name: "OAI-SearchBot"
+    version: "1.0"
+
+  # Claude-Web is the legacy Anthropic UA token (deprecated 2024), kept for log compatibility.
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Claude-Web/1.0; +Claude-Web@anthropic.com)"
+    name: "Claude-Web"
+    version: "1.0"
+
+  # -------------------------------------------------------------------
+  # 4. Generic-matcher coverage
+  # -------------------------------------------------------------------
+
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot"
+    name: "GPTBot"
+    version: "1.1"
+
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)"
+    name: "ClaudeBot"
+    version: "1.0"
+
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)"
+    name: "PerplexityBot"
+    version: "1.0"
+
+  - ua: "Mozilla/5.0 (compatible; DuckAssistBot/1.0; +https://duckduckgo.com/duckassistbot)"
+    name: "DuckAssistBot"
+    version: "1.0"
+
+  - ua: "Mozilla/5.0 (compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)"
+    name: "Amazonbot"
+    version: "0.1"
+
+  - ua: "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"
+    name: "AhrefsBot"
+    version: "7.0"
+
+  - ua: "CCBot/2.0 (https://commoncrawl.org/faq/)"
+    name: "CCBot"
+    version: "2.0"
+
+  - ua: "Mozilla/5.0 (compatible; Applebot/0.1; +http://www.apple.com/go/applebot)"
+    name: "Applebot"
+    version: "0.1"
+
+  - ua: "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider/1.0; spider-feedback@bytedance.com)"
+    name: "Bytespider"
+    version: "1.0"
+
+  - ua: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+    name: "Googlebot"
+    version: "2.1"
+
+  - ua: "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
+    name: "bingbot"
+    version: "2.0"
+
+  - ua: "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
+    name: "YandexBot"
+    version: "3.0"
+
+  - ua: "Twitterbot/1.0"
+    name: "Twitterbot"
+    version: "1.0"
+
+  # SemrushBot/7~bl — the ~bl suffix is not captured; version is just the leading digit.
+  - ua: "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)"
+    name: "SemrushBot"
+    version: "7"
+
+  - ua: "LinkedInBot/1.0 (compatible; Mozilla/5.0; Apache-HttpClient/4.1.1 +http://www.linkedin.com)"
+    name: "LinkedInBot"
+    version: "1.0"
+
+  # The alternation captures the full "Yahoo! Slurp" prefix, not just "Slurp".
+  - ua: "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
+    name: "Yahoo! Slurp"
+
+  - ua: "Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)"
+    name: "MJ12bot"
+    version: "1.4.8"
+
+  - ua: "ia_archiver (+http://www.alexa.com/site/help/webmaster;crawler@alexa.com)"
+    name: "ia_archiver"
+
+  - ua: "Mozilla/5.0 (compatible; Pinterestbot/1.0; +https://www.pinterest.com/bot.html)"
+    name: "Pinterestbot"
+    version: "1.0"
+
+  # -------------------------------------------------------------------
+  # 5. Pinned edge cases
+  # -------------------------------------------------------------------
+
+  # Google's AI-training opt-out crawler; name contains no recognisable bot keyword,
+  # so the parser returns null.
+  - ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; Google-Extended"
+
+  # Meta's link-preview fetcher; has an explicit entry with family_replacement: 'FacebookBot'.
+  - ua: "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"
+    name: "FacebookBot"
+    version: "1.1"