Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/148665.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
area: Ingest Node
issues: []
pr: 148665
summary: Add AI bots to user-agent regex
type: enhancement
40 changes: 39 additions & 1 deletion modules/user-agent/src/main/resources/regexes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,42 @@ user_agent_parsers:
- regex: '(Pinterest(?:bot|))/(\d+)(?:\.(\d+)|)(?:\.(\d+)|)[;\s(]+\+https://www.pinterest.com/bot.html'
family_replacement: 'Pinterestbot'

# AI crawlers — named explicitly here (before the generic bot/spider/crawl matchers) because
# these tokens either:
# (a) carry a +https://... URL suffix that the generic matchers mis-read as the bot name
# (e.g. ChatGPT-User -> "com/bot"; meta-externalagent -> "crawler"; cohere-ai -> "cohere-crawler")
# (b) contain no bot/spider/crawl keyword at all, so the generic matchers return "Other"
# (e.g. MistralAI-User, Perplexity-User, Claude-User, meta-externalfetcher, Claude-Web)
#
# Bot inventory cross-referenced against:
# https://developers.cloudflare.com/ai-crawl-control/reference/bots/
# https://platform.openai.com/docs/bots
# https://support.claude.com/en/articles/8896518
# https://docs.perplexity.ai/guides/bots

# OpenAI — https://platform.openai.com/docs/bots
- regex: '(ChatGPT-User)/(\d+)(?:\.(\d+)|)'
- regex: '(OAI-SearchBot)/(\d+)(?:\.(\d+)|)'

# Anthropic — https://support.claude.com/en/articles/8896518
# Claude-Web is the legacy UA token (deprecated 2024); kept for log compatibility.
- regex: '(Claude-User)/(\d+)(?:\.(\d+)|)'
- regex: '(Claude-Web)/(\d+)(?:\.(\d+)|)'

# Perplexity — https://docs.perplexity.ai/guides/bots
- regex: '(Perplexity-User)/(\d+)(?:\.(\d+)|)'

# Mistral AI — https://docs.mistral.ai/robots
- regex: '(MistralAI-User)/(\d+)(?:\.(\d+)|)'

# Meta — https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/
# meta-externalagent carries a URL with "/crawler" that triggers the generic crawl catcher.
- regex: '(meta-externalagent)/(\d+)(?:\.(\d+)|)'
- regex: '(meta-externalfetcher)/(\d+)(?:\.(\d+)|)'

# Cohere — the UA suffix /cohere-crawler triggers the generic crawl catcher.
- regex: '(cohere-ai)/(\d+)(?:\.(\d+)|)'

# Bots
- regex: '(CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Job Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends Crawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardReader Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGator[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy|Speedy Spider|Squrl 
Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tiny Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg|ArcGIS Hub Indexer)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+)|)|)|)'

Expand Down Expand Up @@ -195,7 +231,9 @@ user_agent_parsers:
# Bots General matcher 'name/0.0'
- regex: '(?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)|)|)'
# Bots containing bot(but not CUBOT)
- regex: '((?:[A-Za-z][A-Za-z0-9 -]{0,50}|)[^C][^Uu][Bb]ot)\b(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|)'
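# The (?<![./]) lookbehind is evaluated at the first of the two wildcard characters that precede
# 'bot', so it only rejects a candidate whose optional name prefix is empty and whose first
# character directly follows a '.' or '/' (e.g. the 'mybot' in 'example.com/mybot').
# NOTE(review): it does not by itself stop 'com/bot' from matching in '+https://openai.com/bot'
# (there the prefix is 'co', so the lookbehind sees 'o'); such UAs rely on the explicit AI-crawler
# entries listed earlier in this file being tried first.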
- regex: '((?:[A-Za-z][A-Za-z0-9 -]{0,50}|)(?<![./])[^C][^Uu][Bb]ot)\b(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|)'
# Bots containing spider|scrape|Crawl
- regex: '((?:[A-z0-9]{1,50}|[A-z\-]{1,50} ?|)(?: the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+)|)|)'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,19 @@
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.useragent.api.Details;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentType;
import org.junit.BeforeClass;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.notNullValue;
Expand Down Expand Up @@ -351,4 +359,55 @@ public void testPatternsLoadedFromRegexFile() {
assertThat(parser.getOsPatterns().isEmpty(), equalTo(false));
assertThat(parser.getDevicePatterns().isEmpty(), equalTo(false));
}

/**
 * Data-driven regression and spot-check suite for bot / AI-crawler user-agent strings.
 *
 * <p>Each case in {@code test-bot-agents.yml} supplies a mandatory {@code ua} string plus the
 * optional {@code name} and {@code version} the parser is expected to report. An omitted
 * {@code name} or {@code version} key means the corresponding parsed value must be {@code null}.
 * Keeping the cases in a resource file lets new user agents be added without touching test logic.
 *
 * @throws IOException if the test-case resource cannot be read or parsed
 */
public void testBotAgents() throws IOException {
    List<Map<String, String>> cases = parseBotTestCases();
    assertThat("test-bot-agents.yml must not be empty", cases.isEmpty(), equalTo(false));

    for (Map<String, String> tc : cases) {
        String ua = tc.get("ua");
        // A missing or misspelled 'ua' key would otherwise hand null to the parser; fail here
        // with a message that identifies the offending test case instead.
        assertThat("test case is missing the 'ua' key: " + tc, ua, notNullValue());

        // Absent keys come back as null from the map, which is exactly the "expect null" signal.
        String expectedName = tc.get("name");
        String expectedVersion = tc.get("version");

        Details details = parser.parseUserAgentInfo(ua, true);

        assertThat("name mismatch for UA: " + ua, details.name(), equalTo(expectedName));
        if (expectedVersion == null) {
            assertThat("version should be null for UA: " + ua, details.version(), nullValue());
        } else {
            assertThat("version mismatch for UA: " + ua, details.version(), equalTo(expectedVersion));
        }
    }
}

/**
 * Loads the bot test cases from the {@code /test-bot-agents.yml} classpath resource.
 *
 * <p>Expected layout: a top-level object whose {@code bot_test_cases} field holds an array of
 * flat string maps ({@code ua}, and optionally {@code name} / {@code version}). Any other
 * top-level field is skipped.
 *
 * @return the test cases in file order; empty if the file has no {@code bot_test_cases} entries
 * @throws IOException if the resource cannot be read or parsed
 */
private static List<Map<String, String>> parseBotTestCases() throws IOException {
    List<Map<String, String>> cases = new ArrayList<>();
    // The stream is scoped here so it is released even if creating the parser fails.
    try (InputStream stream = UserAgentParserImplTests.class.getResourceAsStream("/test-bot-agents.yml")) {
        assertNotNull("test-bot-agents.yml resource not found", stream);

        try (XContentParser yaml = XContentFactory.xContent(XContentType.YAML).createParser(XContentParserConfiguration.EMPTY, stream)) {
            // Top-level object: { bot_test_cases: [ { ua, name?, version? }, ... ] }
            // Check the structural tokens explicitly so a malformed file fails with a clear
            // message rather than being walked as an unexpected token stream.
            assertThat(
                "test-bot-agents.yml must contain a top-level mapping",
                yaml.nextToken(),
                equalTo(XContentParser.Token.START_OBJECT)
            );
            while (yaml.nextToken() != XContentParser.Token.END_OBJECT) {
                if (yaml.currentToken() == XContentParser.Token.FIELD_NAME && "bot_test_cases".equals(yaml.currentName())) {
                    assertThat("bot_test_cases must be a list", yaml.nextToken(), equalTo(XContentParser.Token.START_ARRAY));
                    while (yaml.nextToken() != XContentParser.Token.END_ARRAY) {
                        cases.add(yaml.mapStrings());
                    }
                } else {
                    // Unrelated field: step onto its value and skip any nested structure.
                    yaml.nextToken();
                    yaml.skipChildren();
                }
            }
        }
    }
    return cases;
}
}
162 changes: 162 additions & 0 deletions modules/user-agent/src/test/resources/test-bot-agents.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Test cases for bot / AI-crawler user-agent parsing.
#
# Each entry has:
# ua - the full User-Agent string sent by the crawler
# name - expected details.name() from the parser (omit key when null is expected)
# version - expected details.version() from the parser (omit key when null is expected)
#
# Groups:
# 1. URL-suffix false positives: UA strings that embed a +https://... URL whose path
# contains a bot/spider/crawl token that the generic matchers could mis-read as the name.
# 2. No-keyword bots: UA strings whose bot name contains no bot/spider/crawl keyword,
# so they would fall through all generic patterns if not explicitly listed.
# 3. Explicitly-named AI crawlers: dedicated regex entries in regexes.yml.
# 4. Generic-matcher coverage: bots matched by the Bots General or named-alternation patterns.
# 5. Pinned edge cases: bots whose parsing behaviour is non-obvious and worth locking down.

bot_test_cases:

# -------------------------------------------------------------------
# 1. URL-suffix false positives
# -------------------------------------------------------------------

# +https://openai.com/bot — the /bot path suffix is a generic-matcher target.
- ua: "Mozilla/5.0 AppleWebkit/537.36 (KHTML, Like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot"
name: "ChatGPT-User"
version: "1.0"

# +https://.../webmasters/crawler — the /crawler suffix triggers the generic crawl catcher.
- ua: "meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler)"
name: "meta-externalagent"
version: "1.1"

# +https://.../cohere-crawler — the /cohere-crawler suffix triggers the generic crawl catcher.
- ua: "cohere-ai/1.0; +https://docs.cohere.com/docs/cohere-crawler"
name: "cohere-ai"
version: "1.0"

# -------------------------------------------------------------------
# 2. No-keyword bots
# -------------------------------------------------------------------

- ua: "Mozilla/5.0 AppleWebkit/537.36 (KHTML, Like Gecko); compatible; MistralAI-User/1.0; +https://docs.mistral.ai/robots"
name: "MistralAI-User"
version: "1.0"

- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Claude-User/1.0; +Claude-User@anthropic.com)"
name: "Claude-User"
version: "1.0"

- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)"
name: "Perplexity-User"
version: "1.0"

- ua: "meta-externalfetcher/1.0"
name: "meta-externalfetcher"
version: "1.0"

# -------------------------------------------------------------------
# 3. Explicitly-named AI crawlers
# -------------------------------------------------------------------

- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot"
name: "OAI-SearchBot"
version: "1.0"

# Claude-Web is the legacy Anthropic UA token (deprecated 2024), kept for log compatibility.
- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Claude-Web/1.0; +Claude-Web@anthropic.com)"
name: "Claude-Web"
version: "1.0"

# -------------------------------------------------------------------
# 4. Generic-matcher coverage
# -------------------------------------------------------------------

- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot"
name: "GPTBot"
version: "1.1"

- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ClaudeBot/1.0; +claudebot@anthropic.com)"
name: "ClaudeBot"
version: "1.0"

- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)"
name: "PerplexityBot"
version: "1.0"

- ua: "Mozilla/5.0 (compatible; DuckAssistBot/1.0; +https://duckduckgo.com/duckassistbot)"
name: "DuckAssistBot"
version: "1.0"

- ua: "Mozilla/5.0 (compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)"
name: "Amazonbot"
version: "0.1"

- ua: "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"
name: "AhrefsBot"
version: "7.0"

- ua: "CCBot/2.0 (https://commoncrawl.org/faq/)"
name: "CCBot"
version: "2.0"

- ua: "Mozilla/5.0 (compatible; Applebot/0.1; +http://www.apple.com/go/applebot)"
name: "Applebot"
version: "0.1"

- ua: "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider/1.0; spider-feedback@bytedance.com)"
name: "Bytespider"
version: "1.0"

- ua: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
name: "Googlebot"
version: "2.1"

- ua: "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
name: "bingbot"
version: "2.0"

- ua: "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
name: "YandexBot"
version: "3.0"

- ua: "Twitterbot/1.0"
name: "Twitterbot"
version: "1.0"

# SemrushBot/7~bl — the ~bl suffix is not captured; version is just the leading digit.
- ua: "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)"
name: "SemrushBot"
version: "7"

- ua: "LinkedInBot/1.0 (compatible; Mozilla/5.0; Apache-HttpClient/4.1.1 +http://www.linkedin.com)"
name: "LinkedInBot"
version: "1.0"

# The alternation captures the full "Yahoo! Slurp" prefix, not just "Slurp".
- ua: "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
name: "Yahoo! Slurp"

- ua: "Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)"
name: "MJ12bot"
version: "1.4.8"

- ua: "ia_archiver (+http://www.alexa.com/site/help/webmaster;crawler@alexa.com)"
name: "ia_archiver"

- ua: "Mozilla/5.0 (compatible; Pinterestbot/1.0; +https://www.pinterest.com/bot.html)"
name: "Pinterestbot"
version: "1.0"

# -------------------------------------------------------------------
# 5. Pinned edge cases
# -------------------------------------------------------------------

# Google's AI-training opt-out crawler; name contains no recognisable bot keyword,
# so the parser returns null.
- ua: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; Google-Extended"

# Meta's link-preview fetcher; has an explicit entry with family_replacement: 'FacebookBot'.
- ua: "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"
name: "FacebookBot"
version: "1.1"
Loading