Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 4 additions & 48 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,37 +23,9 @@ interface SessionData {
* instead of the shared server IP.
*/
keylessClientIp?: string;
/**
* Whether the (experimental) research tools are exposed for this session.
* Enabled locally via `FIRECRAWL_RESEARCH=true`, or per-request via the
* `?research=true` query param on the MCP endpoint.
*/
research?: boolean;
[key: string]: unknown;
}

/**
 * Report whether the research tools should be exposed for a request.
 *
 * Two independent switches turn them on:
 *  - the `FIRECRAWL_RESEARCH` environment variable set to exactly `'true'`;
 *  - a `research=true` query parameter on the request URL.
 *
 * @param request - Optional request-like object; only its `url` is inspected.
 * @returns `true` when either switch is on, otherwise `false`.
 */
function isResearchEnabled(request?: { url?: string }): boolean {
  // The environment opt-in wins without looking at the request at all.
  if (process.env.FIRECRAWL_RESEARCH === 'true') {
    return true;
  }

  const rawUrl = request?.url;
  if (!rawUrl) {
    return false;
  }

  let flag: string | null;
  try {
    // The base lets path-only URLs such as `/mcp?research=true` parse.
    flag = new URL(rawUrl, 'http://localhost').searchParams.get('research');
  } catch {
    // An unparseable URL is treated as "not enabled".
    return false;
  }
  return flag === 'true';
}

function normalizeHeader(
value: string | string[] | undefined
): string | undefined {
Expand Down Expand Up @@ -290,7 +262,6 @@ const server = new FastMCP<SessionData>({
headers: IncomingHttpHeaders;
url?: string;
}): Promise<SessionData> => {
const research = isResearchEnabled(request);
// FastMCP invokes `authenticate(undefined)` for the stdio transport
// because there is no HTTP request context. Without this null guard,
// accessing `request.headers` throws a TypeError, FastMCP silently
Expand All @@ -317,13 +288,13 @@ const server = new FastMCP<SessionData>({
clientIp &&
(await keylessEligible(clientIp))
) {
return { firecrawlApiKey: undefined, research, keylessClientIp: clientIp };
return { firecrawlApiKey: undefined, keylessClientIp: clientIp };
}
throw new Error(
'Firecrawl credentials required: OAuth access token (Authorization: Bearer fco_...) or API key (x-firecrawl-api-key)'
);
}
return { firecrawlApiKey: headerCred, research };
return { firecrawlApiKey: headerCred };
}

const credential = headerCred ?? envCred;
Expand Down Expand Up @@ -352,7 +323,7 @@ const server = new FastMCP<SessionData>({
process.exit(1);
}

return { firecrawlApiKey: credential, research };
return { firecrawlApiKey: credential };
},
// Lightweight health endpoint for LB checks
health: {
Expand Down Expand Up @@ -1959,21 +1930,6 @@ if (
}

registerMonitorTools(server);

// Research tools gating. FastMCP's `canAccess` is only honored on the HTTP
// transport (the stdio path exposes every registered tool regardless), so we
// split the two cases:
// - HTTP (cloud / SSE_LOCAL / HTTP_STREAMABLE_SERVER): always register; each
// tool's `canAccess` hides it unless the session has research enabled
// (`FIRECRAWL_RESEARCH=true` env or `?research=true` on the request).
// - stdio (local): register only when `FIRECRAWL_RESEARCH=true`, since
// `canAccess` cannot hide them there.
const isHttpTransport =
process.env.CLOUD_SERVICE === 'true' ||
process.env.SSE_LOCAL === 'true' ||
process.env.HTTP_STREAMABLE_SERVER === 'true';
if (isHttpTransport || process.env.FIRECRAWL_RESEARCH === 'true') {
registerResearchTools(server, getClient);
}
registerResearchTools(server, getClient);

await server.start(args);
116 changes: 90 additions & 26 deletions src/research.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
/**
* Firecrawl Research tools (experimental).
*
* Thin MCP wrappers over the `/v2/research/*` endpoints (arXiv papers + GitHub
* history/readmes). These tools are hidden unless research is enabled for the
* session — locally via `FIRECRAWL_RESEARCH=true`, or remotely via the
* `?research=true` query param on the MCP endpoint (see `isResearchEnabled` in
* index.ts, which sets `session.research`).
* Thin MCP wrappers over the `/v2/search/research/*` endpoints (arXiv papers + GitHub
* history/readmes).
*
* The installed `@mendable/firecrawl-js` predates the SDK's `research` client,
* so we call the endpoints directly through the SDK's HTTP layer (auth +
Expand All @@ -18,7 +15,6 @@ import { z } from 'zod';

/**
 * Per-session data passed to the research tools. Both named fields are
 * optional, and the index signature admits any additional keys as `unknown`.
 */
interface SessionData {
// NOTE(review): presumably the credential `getClient` uses for the session — confirm against the caller.
firecrawlApiKey?: string;
// Flag that `canAccess` compares against `true` to decide tool visibility.
research?: boolean;
[key: string]: unknown;
}

Expand All @@ -36,7 +32,7 @@ type ClientLike = {
// the callback loosely and narrow to `ClientLike` at each call site.
type GetClient = (session?: SessionData) => unknown;

const BASE = '/v2/research';
const BASE = '/v2/search/research';

/** Append a value (or repeated array values) to a URLSearchParams instance. */
function appendParam(
Expand Down Expand Up @@ -73,18 +69,22 @@ const MAX_AFFIL_CHARS = 60;
const MAX_AUTHORS_LINE_CHARS = 400;

/**
 * Paper record as consumed by the formatters in this file. Every field is
 * optional, so each reader supplies its own placeholder for missing values.
 */
interface PaperHit {
// Snake-case id; `displayId` falls back to it after `ids.arxiv`.
paper_id?: string;
// Camel-case id; `fmtPaperMetadata` prints it on the "Paper ID:" line.
paperId?: string;
// Id read by `displayId`.
primaryId?: string;
// Ids grouped by namespace; `fmtPaperMetadata` prints each as `namespace:value`.
ids?: Record<string, string[]>;
// Formatters substitute '(untitled)' when this is missing.
title?: string;
// `fmtPaperMetadata` collapses whitespace runs and substitutes '(no abstract)' when empty.
abstract?: string;
// Search/metadata responses give a comma-joined string; some shapes give the
// structured form — handle both.
authors?: string | { name: string; affiliation?: string }[];
// `fmtPaperMetadata` joins these with ', ' on the "Categories:" line.
categories?: string[];
// Printed verbatim after "created" on the "Dates:" line.
// NOTE(review): date format is not established here — confirm against the API.
createdDate?: string;
// Printed verbatim after "updated" on the "Dates:" line.
updateDate?: string;
}

/**
 * Pick the id shown for a paper in formatted output.
 *
 * NOTE(review): this body holds two consecutive `return` statements, so only
 * the first one can run and the `primaryId` form below it is unreachable.
 * The pair looks like the before/after lines of a change — confirm which one
 * is meant to remain.
 */
function displayId(p: PaperHit): string {
// Prefer the first arXiv id, then `paper_id`, then the '?' placeholder.
return p.ids?.arxiv?.[0] ?? p.paper_id ?? '?';
// Unreachable as written: `primaryId`, or 'missing-primary-id' when absent.
return p.primaryId ?? 'missing-primary-id';
}

/** Format the authors line, accepting either the string or structured form. */
Expand Down Expand Up @@ -122,7 +122,7 @@ function fmtHits(results?: PaperHit[]): string {
if (!results || results.length === 0) return '(no results)';
return results
.map((r) => {
const lines = [`[${displayId(r)}] ${r.title ?? '(untitled)'}`];
const lines = [`## [${displayId(r)}] ${r.title ?? '(untitled)'}`];
const authors = fmtAuthors(r.authors);
if (authors) lines.push(authors);
lines.push(
Expand All @@ -135,6 +135,40 @@ function fmtHits(results?: PaperHit[]): string {
.join('\n\n');
}

/**
 * Render one paper's metadata as a small markdown document.
 *
 * Layout: an H1 title, a blank line, the "Paper ID:" line, then optional
 * "IDs:", authors, "Categories:" and "Dates:" lines (each emitted only when
 * it has content), a blank line, and an "## Abstract" section.
 *
 * @param paper - The paper to render; may be absent.
 * @returns The markdown text, or '(paper not found)' when `paper` is missing.
 */
function fmtPaperMetadata(paper?: PaperHit): string {
  if (!paper) {
    return '(paper not found)';
  }

  const out: string[] = [
    `# ${paper.title ?? '(untitled)'}`,
    '',
    `Paper ID: ${paper.paperId ?? '?'}`,
  ];

  // Flatten { namespace: [values] } into "namespace:value" pairs.
  const idPairs: string[] = [];
  for (const [ns, vals] of Object.entries(paper.ids ?? {})) {
    vals.forEach((v) => {
      idPairs.push(`${ns}:${v}`);
    });
  }
  if (idPairs.length > 0) {
    out.push(`IDs: ${idPairs.join(', ')}`);
  }

  const authorsLine = fmtAuthors(paper.authors);
  if (authorsLine) {
    out.push(authorsLine);
  }

  const cats = paper.categories;
  if (cats && cats.length > 0) {
    out.push(`Categories: ${cats.join(', ')}`);
  }

  // Only the dates that are actually present make it onto the line.
  const dateParts: string[] = [];
  if (paper.createdDate) {
    dateParts.push(`created ${paper.createdDate}`);
  }
  if (paper.updateDate) {
    dateParts.push(`updated ${paper.updateDate}`);
  }
  if (dateParts.length > 0) {
    out.push(`Dates: ${dateParts.join('; ')}`);
  }

  // Collapse whitespace runs so the abstract renders as a single paragraph.
  const abstractText = paper.abstract || '(no abstract)';
  out.push('', '## Abstract', abstractText.replace(/\s+/g, ' '));

  return out.join('\n');
}

// Cap GitHub matched content so a page of results stays within the MCP
// output-token limit. Higher than abstracts since issue/PR threads carry the
// signal (repro steps, stack traces) the agent actually needs to verify.
Expand Down Expand Up @@ -193,18 +227,13 @@ function fmtGithub(results?: GitHubItem[]): string {
.join('\n\n');
}

/**
 * Visibility gate for the research tools: passes only when the session exists
 * and carries `research` strictly equal to `true`.
 */
const canAccess = (session?: SessionData): boolean => {
  if (session === undefined || session === null) {
    return false;
  }
  return session.research === true;
};

export function registerResearchTools(
server: FastMCP<SessionData>,
getClient: GetClient
): void {
// --- search_papers ---
server.addTool({
name: 'firecrawl_research_search_papers',
canAccess,
annotations: {
title: 'Search arXiv papers',
readOnlyHint: true,
Expand Down Expand Up @@ -269,10 +298,42 @@ export function registerResearchTools(
},
});

// --- inspect_paper ---
server.addTool({
name: 'firecrawl_research_inspect_paper',
annotations: {
title: 'Inspect a paper',
readOnlyHint: true,
openWorldHint: true,
},
description:
'Fetch canonical metadata for one paper by primaryId or canonical paperId. ' +
'Use this after search/related results when you need the full title, abstract, authors, ' +
'categories, source ids, and dates rendered as markdown.',
parameters: z.object({
paperId: z
.string()
.min(1)
.describe(
'Canonical paperId or primaryId such as `arxiv:1706.03762`, `pmcid:PMC12530322`, `pmid:40953549`, or `doi:10.1016/j.neunet.2025.108095`.'
),
}),
execute: async (
args: unknown,
{ session }: { session?: SessionData; log: Logger }
): Promise<string> => {
const { paperId } = args as { paperId: string };
const client = getClient(session) as ClientLike;
const res = await client.http.get<{ paper?: PaperHit }>(
`${BASE}/papers/${encodeURIComponent(paperId)}`
);
return fmtPaperMetadata(res.data?.paper);
},
});

// --- related_papers ---
server.addTool({
name: 'firecrawl_research_related_papers',
canAccess,
annotations: {
title: 'Find related arXiv papers',
readOnlyHint: true,
Expand Down Expand Up @@ -320,7 +381,7 @@ export function registerResearchTools(
const client = getClient(session) as ClientLike;
const res = await client.http.get<{
results?: PaperHit[];
pool_size?: number;
poolSize?: number;
note?: string | null;
}>(
withQuery(
Expand All @@ -329,16 +390,15 @@ export function registerResearchTools(
)
);
const note = res.data?.note ? `\nnote: ${res.data.note}` : '';
return `${fmtHits(res.data?.results)}\n(pool_size=${res.data?.pool_size ?? 0})${note}`;
return `${fmtHits(res.data?.results)}\n(poolSize=${res.data?.poolSize ?? 0})${note}`;
},
});

// --- read_paper ---
server.addTool({
name: 'firecrawl_research_read_paper',
canAccess,
annotations: {
title: 'Read an arXiv paper',
title: 'Read a paper',
readOnlyHint: true,
openWorldHint: true,
},
Expand All @@ -348,7 +408,12 @@ export function registerResearchTools(
"reject it (e.g. 'does this paper actually use technique X / report a score on benchmark Y'). " +
"Returns the best-matching passages, or a notice if the paper's full text is unavailable.",
parameters: z.object({
arxiv_id: z.string().min(1),
paperId: z
.string()
.min(1)
.describe(
'Canonical paperId or primaryId such as `arxiv:1706.03762`, `pmcid:PMC12530322`, `pmid:40953549`, or `doi:10.1016/j.neunet.2025.108095`.'
),
question: z.string().min(1),
k: z
.number()
Expand All @@ -362,8 +427,8 @@ export function registerResearchTools(
args: unknown,
{ session }: { session?: SessionData; log: Logger }
): Promise<string> => {
const { arxiv_id, question, k } = args as {
arxiv_id: string;
const { paperId, question, k } = args as {
paperId: string;
question: string;
k?: number;
};
Expand All @@ -372,7 +437,7 @@ export function registerResearchTools(
appendParam(params, 'k', k);
const client = getClient(session) as ClientLike;
const res = await client.http.get<{ passages?: { text: string }[] }>(
withQuery(`${BASE}/papers/${encodeURIComponent(arxiv_id)}`, params)
withQuery(`${BASE}/papers/${encodeURIComponent(paperId)}`, params)
);
const passages = res.data?.passages ?? [];
return passages.length
Expand All @@ -384,7 +449,6 @@ export function registerResearchTools(
// --- search_github ---
server.addTool({
name: 'firecrawl_research_search_github',
canAccess,
annotations: {
title: 'Search GitHub history',
readOnlyHint: true,
Expand Down
Loading