From e955bdd8b70d81ebc4e38a4e6f9aa84f48f6c24a Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Mon, 29 Jun 2026 13:17:43 +0100 Subject: [PATCH 01/18] feat: scaffold security contacts worker (CM-1243) Signed-off-by: Mouad BANI --- .../V1782950400__security_contacts.sql | 27 +++++ .../apps/packages_worker/src/activities.ts | 1 + .../src/bin/security-contacts-worker.ts | 8 ++ services/apps/packages_worker/src/config.ts | 11 ++ .../src/enricher/installationPool.ts | 56 +++++++++ .../src/enricher/runEnrichmentLoop.ts | 55 +-------- .../src/security-contacts/activities.ts | 17 +++ .../src/security-contacts/processBatch.ts | 112 ++++++++++++++++++ .../src/security-contacts/schedule.ts | 41 +++++++ .../src/security-contacts/types.ts | 60 ++++++++++ .../src/security-contacts/workflows.ts | 21 ++++ .../packages_worker/src/workflows/index.ts | 1 + 12 files changed, 356 insertions(+), 54 deletions(-) create mode 100644 backend/src/osspckgs/migrations/V1782950400__security_contacts.sql create mode 100644 services/apps/packages_worker/src/bin/security-contacts-worker.ts create mode 100644 services/apps/packages_worker/src/enricher/installationPool.ts create mode 100644 services/apps/packages_worker/src/security-contacts/activities.ts create mode 100644 services/apps/packages_worker/src/security-contacts/processBatch.ts create mode 100644 services/apps/packages_worker/src/security-contacts/schedule.ts create mode 100644 services/apps/packages_worker/src/security-contacts/types.ts create mode 100644 services/apps/packages_worker/src/security-contacts/workflows.ts diff --git a/backend/src/osspckgs/migrations/V1782950400__security_contacts.sql b/backend/src/osspckgs/migrations/V1782950400__security_contacts.sql new file mode 100644 index 0000000000..fc1d62410b --- /dev/null +++ b/backend/src/osspckgs/migrations/V1782950400__security_contacts.sql @@ -0,0 +1,27 @@ +CREATE TABLE IF NOT EXISTS security_contacts ( + id BIGSERIAL PRIMARY KEY, + repo_id BIGINT NOT NULL REFERENCES repos(id) ON DELETE CASCADE, + channel TEXT NOT NULL, + value TEXT NOT NULL, + role TEXT NOT NULL, + name TEXT, + score NUMERIC(4,3) NOT NULL, + confidence TEXT NOT NULL, + provenance JSONB NOT NULL DEFAULT '[]', + last_refreshed TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS security_contacts_repo_channel_value_uq + ON security_contacts (repo_id, channel, value); + +CREATE INDEX IF NOT EXISTS security_contacts_repo_confidence_idx + ON security_contacts (repo_id, confidence); + +ALTER TABLE repos ADD COLUMN IF NOT EXISTS pvr_enabled BOOL; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS security_policy_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS vulnerability_reporting_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS bug_bounty_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS security_txt_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS contacts_last_refreshed TIMESTAMPTZ; diff --git a/services/apps/packages_worker/src/activities.ts b/services/apps/packages_worker/src/activities.ts index aa564ce3fa..01791b31bc 100644 --- a/services/apps/packages_worker/src/activities.ts +++ b/services/apps/packages_worker/src/activities.ts @@ -26,3 +26,4 @@ export { } from './cargo/activities' export { enrichGoVersionsBatch, enrichGoStatusBatch } from './go/activities' export { processNuGetBatch } from './nuget/activities' +export { processSecurityContactsBatch } from './security-contacts/activities' diff --git a/services/apps/packages_worker/src/bin/security-contacts-worker.ts b/services/apps/packages_worker/src/bin/security-contacts-worker.ts new file mode 100644 index 0000000000..387b0a7c1d --- /dev/null +++ b/services/apps/packages_worker/src/bin/security-contacts-worker.ts @@ -0,0 +1,8 @@ +import { scheduleSecurityContactsIngestion } from '../security-contacts/schedule' +import { svc } from '../service' + +setImmediate(async () => { + await svc.init() + await scheduleSecurityContactsIngestion() + await svc.start() +}) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index ab5de33808..6f64e46372 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -46,6 +46,17 @@ export function getEnricherConfig() { } } +export function getSecurityContactsConfig() { + return { + // A repo is re-evaluated once its contacts are older than this many hours. + updateIntervalHours: requireEnvInt('SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS'), + // Lower than the enricher: each repo fans out to several HTTP calls across extractors + concurrency: parseInt(process.env.SECURITY_CONTACTS_CONCURRENCY ?? '20', 10), + fetchTimeoutMs: parseInt(process.env.SECURITY_CONTACTS_FETCH_TIMEOUT_MS ?? '10000', 10), + batchSize: parseInt(process.env.SECURITY_CONTACTS_BATCH_SIZE ?? '500', 10), + } +} + export function getMavenConfig() { return { batchSize: requireEnvInt('MAVEN_FETCHER_BATCH_SIZE'), diff --git a/services/apps/packages_worker/src/enricher/installationPool.ts b/services/apps/packages_worker/src/enricher/installationPool.ts new file mode 100644 index 0000000000..7ce35016fd --- /dev/null +++ b/services/apps/packages_worker/src/enricher/installationPool.ts @@ -0,0 +1,56 @@ +import { getServiceChildLogger } from '@crowd/logging' + +const log = getServiceChildLogger('installation-pool') + +// Park an installation before GitHub starts rejecting — avoids a failed request + requeue +const PROACTIVE_PARK_REMAINING = 50 + +/** Round-robins over installations, skipping ones parked until their rate-limit reset. */ +export class InstallationPool { + private readonly parkedUntil = new Map() + private roundRobinIdx = 0 + + constructor(private readonly ids: number[]) {} + + select(): { installationId: number; waitMs: number } { + const now = Date.now() + const n = this.ids.length + + for (let i = 0; i < n; i++) { + const idx = (this.roundRobinIdx + i) % n + const id = this.ids[idx] + if ((this.parkedUntil.get(id) ?? 0) <= now) { + this.roundRobinIdx = (idx + 1) % n + return { installationId: id, waitMs: 0 } + } + } + + let soonestReset = Infinity + let soonestId = this.ids[0] + for (const id of this.ids) { + const reset = this.parkedUntil.get(id) ?? 0 + if (reset < soonestReset) { + soonestReset = reset + soonestId = id + } + } + return { installationId: soonestId, waitMs: Math.max(1_000, soonestReset - now) } + } + + park(installationId: number, untilMs: number): void { + this.parkedUntil.set(installationId, untilMs) + } + + parkIfBudgetLow( + installationId: number, + remaining: number | null | undefined, + resetAt: string | null | undefined, + ): void { + if (remaining == null || resetAt == null || remaining >= PROACTIVE_PARK_REMAINING) return + this.park(installationId, new Date(resetAt).getTime() + 5_000) + log.info( + { installationId, remaining, resetAt }, + 'Budget low — proactively parking installation', + ) + } +} diff --git a/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts b/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts index 0085e0bff9..bb68a380b9 100644 --- a/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts @@ -6,6 +6,7 @@ import { getEnricherConfig } from '../config' import { fetchActivitySnapshot } from './fetchActivitySnapshot' import { fetchLightRepo, parseGithubUrl } from './fetchLightRepo' import { GithubAppConfig, getInstallationToken } from './githubAppAuth' +import { InstallationPool } from './installationPool' import { FetchError, LightRepoResult, RepoActivitySnapshot } from './types' import { bulkUpdateEnrichedRepos, markReposSkipped } from './updateEnrichedRepos' import { bulkUpsertRepoActivitySnapshot } from './updateRepoActivitySnapshot' @@ -17,65 +18,11 @@ const DB_FETCH_SIZE = 2000 const WRITE_FLUSH_SIZE = 500 const WRITE_FLUSH_MS = 5000 const MAX_FLUSH_FAILURES = 3 -// Park an installation before GitHub starts rejecting — avoids a failed request + requeue -const PROACTIVE_PARK_REMAINING = 50 // Rate-limited snapshots retry once with another installation before being skipped const SNAPSHOT_RATE_LIMIT_RETRIES = 1 // Installations whose token mint fails (e.g. org IP allowlist) sit out for an hour const MINT_FAILURE_PARK_MS = 60 * 60 * 1000 -// ─── Installation pool ──────────────────────────────────────────────────────── - -/** Round-robins over installations, skipping ones parked until their rate-limit reset. */ -class InstallationPool { - private readonly parkedUntil = new Map() - private roundRobinIdx = 0 - - constructor(private readonly ids: number[]) {} - - select(): { installationId: number; waitMs: number } { - const now = Date.now() - const n = this.ids.length - - for (let i = 0; i < n; i++) { - const idx = (this.roundRobinIdx + i) % n - const id = this.ids[idx] - if ((this.parkedUntil.get(id) ?? 0) <= now) { - this.roundRobinIdx = (idx + 1) % n - return { installationId: id, waitMs: 0 } - } - } - - let soonestReset = Infinity - let soonestId = this.ids[0] - for (const id of this.ids) { - const reset = this.parkedUntil.get(id) ?? 0 - if (reset < soonestReset) { - soonestReset = reset - soonestId = id - } - } - return { installationId: soonestId, waitMs: Math.max(1_000, soonestReset - now) } - } - - park(installationId: number, untilMs: number): void { - this.parkedUntil.set(installationId, untilMs) - } - - parkIfBudgetLow( - installationId: number, - remaining: number | null | undefined, - resetAt: string | null | undefined, - ): void { - if (remaining == null || resetAt == null || remaining >= PROACTIVE_PARK_REMAINING) return - this.park(installationId, new Date(resetAt).getTime() + 5_000) - log.info( - { installationId, remaining, resetAt }, - 'Budget low — proactively parking installation', - ) - } -} - // ─── Fetch with retries ─────────────────────────────────────────────────────── type FetchOutcome = diff --git a/services/apps/packages_worker/src/security-contacts/activities.ts b/services/apps/packages_worker/src/security-contacts/activities.ts new file mode 100644 index 0000000000..e6be0fecde --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/activities.ts @@ -0,0 +1,17 @@ +import { getServiceChildLogger } from '@crowd/logging' + +import { getSecurityContactsConfig } from '../config' +import { getPackagesDb } from '../db' + +import { BatchResult, processBatch } from './processBatch' + +const log = getServiceChildLogger('security-contacts-activity') + +export async function processSecurityContactsBatch(): Promise { + const config = getSecurityContactsConfig() + const qx = await getPackagesDb() + + const result = await processBatch(qx, config) + log.info({ ...result }, 'Security contacts batch activity complete') + return result +} diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts new file mode 100644 index 0000000000..af57da8c8e --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -0,0 +1,112 @@ +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceChildLogger } from '@crowd/logging' + +import { getSecurityContactsConfig } from '../config' + +import { RepoPackage, RepoTarget } from './types' + +const log = getServiceChildLogger('security-contacts') + +type Config = ReturnType + +export interface BatchResult { + /** Repos evaluated in this batch. 0 signals the workflow there is no more work. */ + processed: number +} + +interface SweepRow { + id: string + url: string + homepage: string | null + packages: RepoPackage[] | null +} + +/** + * One page of github repos linked to an is_critical package whose contacts are + * missing or stale. Progress is driven by repos.contacts_last_refreshed (bumped + * once a repo is evaluated), so each invocation naturally advances to the next + * unprocessed repos — no cross-invocation cursor needed. + */ +async function fetchBatch(qx: QueryExecutor, config: Config): Promise { + return qx.select( + ` + SELECT r.id::text AS id, + r.url, + r.homepage, + json_agg(json_build_object('purl', p.purl, 'ecosystem', p.ecosystem)) AS packages + FROM repos r + JOIN package_repos pr ON pr.repo_id = r.id + JOIN packages p ON p.id = pr.package_id AND p.is_critical + WHERE r.host = 'github' + AND ( + r.contacts_last_refreshed IS NULL + OR r.contacts_last_refreshed < NOW() - INTERVAL '$(updateIntervalHours) hours' + ) + GROUP BY r.id + ORDER BY r.id + LIMIT $(batchSize) + `, + { batchSize: config.batchSize, updateIntervalHours: config.updateIntervalHours }, + ) +} + +function toTarget(row: SweepRow): RepoTarget { + return { + repoId: row.id, + url: row.url, + homepage: row.homepage, + packages: row.packages ?? [], + } +} + +async function runWithConcurrency( + items: T[], + concurrency: number, + task: (item: T) => Promise, +): Promise { + let idx = 0 + await Promise.all( + Array.from({ length: Math.min(concurrency, items.length) }, async () => { + while (idx < items.length) { + const item = items[idx++] + await task(item) + } + }), + ) +} + +/** + * Step 2 placeholder pipeline: builds the RepoTarget and logs it. Extractors, + * reconcile, score and write are wired in later steps; until then we bump + * contacts_last_refreshed so the sweep makes progress and the workflow can drain. + */ +async function processRepo(target: RepoTarget): Promise { + log.trace( + { repoId: target.repoId, url: target.url, packages: target.packages.length }, + 'Evaluated repo (pipeline not yet wired)', + ) +} + +/** Marks repos as evaluated. In Step 6 this moves into the per-repo write tx. */ +async function markRefreshed(qx: QueryExecutor, repoIds: string[]): Promise { + if (repoIds.length === 0) return + await qx.result( + `UPDATE repos SET contacts_last_refreshed = NOW() WHERE id = ANY($(repoIds)::bigint[])`, + { repoIds }, + ) +} + +export async function processBatch(qx: QueryExecutor, config: Config): Promise { + const batch = await fetchBatch(qx, config) + if (batch.length === 0) return { processed: 0 } + + const targets = batch.map(toTarget) + await runWithConcurrency(targets, config.concurrency, processRepo) + await markRefreshed( + qx, + targets.map((t) => t.repoId), + ) + + log.info({ processed: targets.length }, 'Security contacts batch complete') + return { processed: targets.length } +} diff --git a/services/apps/packages_worker/src/security-contacts/schedule.ts b/services/apps/packages_worker/src/security-contacts/schedule.ts new file mode 100644 index 0000000000..1ea3a4ec5b --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/schedule.ts @@ -0,0 +1,41 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../service' +import { ingestSecurityContacts } from '../workflows' + +export async function scheduleSecurityContactsIngestion(): Promise { + const { temporal } = svc + if (!temporal) throw new Error('Temporal client not initialized') + + try { + await temporal.schedule.create({ + scheduleId: 'security-contacts-ingestion', + spec: { + cronExpressions: ['0 6 * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 hour', + }, + action: { + type: 'startWorkflow', + workflowType: ingestSecurityContacts, + workflowId: 'security-contacts-daily', + taskQueue: 'packages-worker', + workflowRunTimeout: '24 hours', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule security-contacts-ingestion already exists, skipping creation.') + } else { + throw err + } + } +} diff --git a/services/apps/packages_worker/src/security-contacts/types.ts b/services/apps/packages_worker/src/security-contacts/types.ts new file mode 100644 index 0000000000..7df1a3a54c --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/types.ts @@ -0,0 +1,60 @@ +export type ContactChannel = 'email' | 'github-pvr' | 'url' | 'github-handle' | 'web-form' + +export type ContactRole = 'security-team' | 'maintainer' | 'admin' | 'committer' | 'org-owner' + +export type ConfidenceBand = 'PRIMARY' | 'SECONDARY' | 'FALLBACK' | 'NONE' + +export type SourceTier = 'A' | 'B' | 'C' | 'D' + +/** Where a single contact value was observed, for auditability and corroboration scoring. */ +export interface ProvenanceEntry { + /** Human-readable source identifier, e.g. 'security-insights', 'pvr', 'security.md'. */ + source: string + sourceTier: SourceTier + /** Path or URL the value was read from, when applicable. */ + path?: string + /** When this worker fetched the source. ISO-8601. */ + fetchedAt: string + /** When the source itself declared the value (e.g. SECURITY-INSIGHTS last-updated). ISO-8601. */ + declaredAt?: string +} + +/** Extractor output, before reconciliation and scoring. */ +export interface RawContact { + channel: ContactChannel + value: string + role: ContactRole + name?: string + tier: SourceTier + provenance: ProvenanceEntry[] +} + +export interface ScoredContact extends RawContact { + score: number + confidence: ConfidenceBand +} + +export interface RepoPolicies { + securityPolicyUrl?: string + vulnerabilityReportingUrl?: string + bugBountyUrl?: string + securityTxtUrl?: string + pvrEnabled?: boolean +} + +export interface ExtractorResult { + contacts: RawContact[] + policies: Partial +} + +export interface RepoPackage { + purl: string + ecosystem: string +} + +export interface RepoTarget { + repoId: string + url: string + homepage: string | null + packages: RepoPackage[] +} diff --git a/services/apps/packages_worker/src/security-contacts/workflows.ts b/services/apps/packages_worker/src/security-contacts/workflows.ts new file mode 100644 index 0000000000..2b15660963 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/workflows.ts @@ -0,0 +1,21 @@ +import { continueAsNew, log, proxyActivities } from '@temporalio/workflow' + +import type * as activities from './activities' + +const acts = proxyActivities({ + startToCloseTimeout: '30 minutes', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 5, + }, +}) + +export async function ingestSecurityContacts(): Promise { + const result = await acts.processSecurityContactsBatch() + if (result.processed === 0) { + log.info('Security contacts ingestion complete — no more work, exiting.') + return + } + await continueAsNew() +} diff --git a/services/apps/packages_worker/src/workflows/index.ts b/services/apps/packages_worker/src/workflows/index.ts index 219aa742fc..22b899d0ec 100644 --- a/services/apps/packages_worker/src/workflows/index.ts +++ b/services/apps/packages_worker/src/workflows/index.ts @@ -21,3 +21,4 @@ export { rankPackagesWorkflow } from '../criticality/workflow' export { cargoSyncWorkflow } from '../cargo/workflows' export { enrichGoVersions, enrichGoStatus } from '../go/workflows' export { ingestNuGetPackages } from '../nuget/workflows' +export { ingestSecurityContacts } from '../security-contacts/workflows' From 7bc5c4fb02fa479cea4566855a2733466c8b27b2 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Mon, 29 Jun 2026 13:42:12 +0100 Subject: [PATCH 02/18] feat: security contacts worker scaffold, scoring and reconciliation (CM-1243) Signed-off-by: Mouad BANI --- .../src/security-contacts/reconcile.ts | 109 ++++++++++++++++++ .../src/security-contacts/score.ts | 107 +++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 services/apps/packages_worker/src/security-contacts/reconcile.ts create mode 100644 services/apps/packages_worker/src/security-contacts/score.ts diff --git a/services/apps/packages_worker/src/security-contacts/reconcile.ts b/services/apps/packages_worker/src/security-contacts/reconcile.ts new file mode 100644 index 0000000000..a3759619e5 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/reconcile.ts @@ -0,0 +1,109 @@ +import { scoreContact } from './score' +import { + ContactChannel, + ContactRole, + ProvenanceEntry, + RawContact, + ScoredContact, + SourceTier, +} from './types' + +const MAX_CONTACTS = 5 + +const ROLE_PRIORITY: Record = { + 'security-team': 5, + maintainer: 4, + admin: 3, + committer: 2, + 'org-owner': 1, +} + +const TIER_RANK: Record = { A: 4, B: 3, C: 2, D: 1 } + +function normalizeValue(channel: ContactChannel, value: string): string { + const v = value.trim() + return channel === 'email' || channel === 'github-handle' ? v.toLowerCase() : v +} + +function higherRole(a: ContactRole, b: ContactRole): ContactRole { + return ROLE_PRIORITY[a] >= ROLE_PRIORITY[b] ? a : b +} + +function higherTier(a: SourceTier, b: SourceTier): SourceTier { + return TIER_RANK[a] >= TIER_RANK[b] ? a : b +} + +function dedupeProvenance(entries: ProvenanceEntry[]): ProvenanceEntry[] { + const seen = new Set() + const out: ProvenanceEntry[] = [] + for (const e of entries) { + const key = `${e.source}|${e.path ?? ''}|${e.fetchedAt}` + if (seen.has(key)) continue + seen.add(key) + out.push(e) + } + return out +} + +function mergeInto(target: RawContact, src: RawContact): void { + target.provenance.push(...src.provenance) + target.role = higherRole(target.role, src.role) + target.tier = higherTier(target.tier, src.tier) + if (!target.name && src.name) target.name = src.name +} + +function exactMatchMerge(contacts: RawContact[]): RawContact[] { + const byKey = new Map() + for (const c of contacts) { + const key = `${c.channel}:${normalizeValue(c.channel, c.value)}` + const existing = byKey.get(key) + if (existing) { + mergeInto(existing, c) + } else { + byKey.set(key, { ...c, provenance: [...c.provenance] }) + } + } + return [...byKey.values()] +} + +// Collapse a resolved handle into its email: extractors set the email contact's +// `name` to the originating handle, so a github-handle whose value matches becomes +// redundant once the email is known. The email (higher-quality) channel is kept. +function identityLinkMerge(contacts: RawContact[]): RawContact[] { + const emailByHandle = new Map() + for (const c of contacts) { + if (c.channel === 'email' && c.name) emailByHandle.set(c.name.toLowerCase(), c) + } + + const out: RawContact[] = [] + for (const c of contacts) { + if (c.channel === 'github-handle') { + const email = emailByHandle.get(normalizeValue(c.channel, c.value)) + if (email) { + mergeInto(email, c) + continue + } + } + out.push(c) + } + return out +} + +export function reconcile(contacts: RawContact[], now: Date = new Date()): ScoredContact[] { + const merged = identityLinkMerge(exactMatchMerge(contacts)) + + const scored: ScoredContact[] = merged.map((c) => { + const contact = { ...c, provenance: dedupeProvenance(c.provenance) } + return { ...contact, ...scoreContact(contact, now) } + }) + + scored.sort( + (a, b) => + b.score - a.score || + ROLE_PRIORITY[b.role] - ROLE_PRIORITY[a.role] || + TIER_RANK[b.tier] - TIER_RANK[a.tier] || + a.value.localeCompare(b.value), + ) + + return scored.slice(0, MAX_CONTACTS) +} diff --git a/services/apps/packages_worker/src/security-contacts/score.ts b/services/apps/packages_worker/src/security-contacts/score.ts new file mode 100644 index 0000000000..1d14aa4578 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/score.ts @@ -0,0 +1,107 @@ +import { ConfidenceBand, ContactChannel, ProvenanceEntry, RawContact, SourceTier } from './types' + +const WEIGHTS = { tier: 0.55, channel: 0.2, freshness: 0.15, corroboration: 0.1 } + +const TIER_SCORE: Record = { A: 1.0, B: 0.7, C: 0.4, D: 0.2 } + +// github-handle contacts carry no resolved email; nudge them below an equivalent email +const HANDLE_ONLY_PENALTY = 0.05 + +const FRESH_DAYS = 90 +const STALE_DAYS = 730 +const DAY_MS = 24 * 60 * 60 * 1000 + +const SECURITY_LOCALPARTS = new Set([ + 'security', + 'secure', + 'psirt', + 'sirt', + 'cert', + 'cve', + 'abuse', + 'vuln', + 'vulnerability', + 'vulnerabilities', + 'disclosure', +]) + +const GENERIC_LOCALPARTS = new Set([ + 'info', + 'team', + 'contact', + 'hello', + 'hi', + 'support', + 'admin', + 'help', + 'maintainers', + 'dev', + 'devs', + 'opensource', + 'open-source', + 'office', + 'mail', +]) + +function emailQuality(value: string): number { + const localPart = value.split('@')[0]?.toLowerCase().trim() ?? '' + if (SECURITY_LOCALPARTS.has(localPart) || localPart.startsWith('security')) return 1.0 + if (GENERIC_LOCALPARTS.has(localPart)) return 0.7 + return 0.6 +} + +function channelQuality(channel: ContactChannel, value: string): number { + switch (channel) { + case 'email': + return emailQuality(value) + case 'github-pvr': + return 0.95 + case 'web-form': + case 'url': + return 0.5 + case 'github-handle': + return 0.4 + } +} + +function freshnessScore(provenance: ProvenanceEntry[], now: Date): number { + const declaredTimes = provenance + .map((p) => new Date(p.declaredAt ?? p.fetchedAt).getTime()) + .filter((t) => !Number.isNaN(t)) + if (declaredTimes.length === 0) return 0 + + const ageDays = (now.getTime() - Math.max(...declaredTimes)) / DAY_MS + if (ageDays <= FRESH_DAYS) return 1.0 + if (ageDays >= STALE_DAYS) return 0 + return 1 - (ageDays - FRESH_DAYS) / (STALE_DAYS - FRESH_DAYS) +} + +// Independent = distinct extractors, not the same file re-fetched +function corroborationScore(provenance: ProvenanceEntry[]): number { + const sources = new Set(provenance.map((p) => p.source)) + if (sources.size >= 3) return 1.0 + if (sources.size === 2) return 0.5 + return 0 +} + +export function confidenceBand(score: number): ConfidenceBand { + if (score >= 0.8) return 'PRIMARY' + if (score >= 0.55) return 'SECONDARY' + if (score >= 0.3) return 'FALLBACK' + return 'NONE' +} + +export function scoreContact( + contact: RawContact, + now: Date = new Date(), +): { score: number; confidence: ConfidenceBand } { + const raw = + WEIGHTS.tier * TIER_SCORE[contact.tier] + + WEIGHTS.channel * channelQuality(contact.channel, contact.value) + + WEIGHTS.freshness * freshnessScore(contact.provenance, now) + + WEIGHTS.corroboration * corroborationScore(contact.provenance) - + (contact.channel === 'github-handle' ? HANDLE_ONLY_PENALTY : 0) + + const score = Math.round(Math.min(1, Math.max(0, raw)) * 1000) / 1000 + return { score, confidence: confidenceBand(score) } +} From 55fee287f933f3b7e38ad271b28abe9e1cf55bc1 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Mon, 29 Jun 2026 15:42:10 +0100 Subject: [PATCH 03/18] feat: add security-insights extractor (A1) for security contacts (CM-1243) Signed-off-by: Mouad BANI --- pnpm-lock.yaml | 16 +- services/apps/packages_worker/package.json | 2 + .../src/security-contacts/extractors/http.ts | 60 +++++ .../src/security-contacts/extractors/pvr.ts | 55 +++++ .../extractors/securityInsights.ts | 223 ++++++++++++++++++ .../src/security-contacts/types.ts | 8 + 6 files changed, 355 insertions(+), 9 deletions(-) create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/http.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/pvr.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7541586a7a..452aa14b31 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1345,6 +1345,9 @@ importers: fast-xml-parser: specifier: ^5.8.0 version: 5.8.0 + js-yaml: + specifier: ^4.1.1 + version: 4.1.1 jsonwebtoken: specifier: ^9.0.0 version: 9.0.3 @@ -1367,6 +1370,9 @@ importers: specifier: ^0.12.3 version: 0.12.3 devDependencies: + '@types/js-yaml': + specifier: ^4.0.9 + version: 4.0.9 '@types/jsonwebtoken': specifier: ^9.0.0 version: 9.0.6 @@ -7625,10 +7631,6 @@ packages: resolution: {integrity: sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg==} hasBin: true - js-yaml@4.1.0: - resolution: {integrity: sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==} - hasBin: true - js-yaml@4.1.1: resolution: {integrity: sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==} hasBin: true @@ -16064,7 +16066,7 @@ snapshots: imurmurhash: 0.1.4 is-glob: 4.0.3 is-path-inside: 3.0.3 - js-yaml: 4.1.0 + js-yaml: 4.1.1 json-stable-stringify-without-jsonify: 1.0.1 levn: 0.4.1 lodash.merge: 4.6.2 @@ -17299,10 +17301,6 @@ snapshots: argparse: 1.0.10 esprima: 4.0.1 - js-yaml@4.1.0: - dependencies: - argparse: 2.0.1 - js-yaml@4.1.1: dependencies: argparse: 2.0.1 diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index d372b3465c..79600403dd 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -72,6 +72,7 @@ "@temporalio/activity": "~1.17.2", "@temporalio/client": "~1.17.2", "@temporalio/workflow": "~1.17.2", + "js-yaml": "^4.1.1", "jsonwebtoken": "^9.0.0", "semver": "^7.6.0", "axios": "^1.16.1", @@ -83,6 +84,7 @@ "unzipper": "^0.12.3" }, "devDependencies": { + "@types/js-yaml": "^4.0.9", "@types/jsonwebtoken": "^9.0.0", "@types/node": "^20.8.2", "@types/pg-copy-streams": "^1.2.5", diff --git a/services/apps/packages_worker/src/security-contacts/extractors/http.ts b/services/apps/packages_worker/src/security-contacts/extractors/http.ts new file mode 100644 index 0000000000..bf83bb2e3d --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/http.ts @@ -0,0 +1,60 @@ +export const RAW_BASE = 'https://raw.githubusercontent.com' +export const GITHUB_API = 'https://api.github.com' + +export interface FetchTextResult { + status: number + text: string | null +} + +export async function fetchText( + url: string, + timeoutMs: number, + headers: Record = {}, +): Promise { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), timeoutMs) + try { + const res = await fetch(url, { headers, signal: controller.signal }) + if (res.status !== 200) return { status: res.status, text: null } + return { status: 200, text: await res.text() } + } finally { + clearTimeout(timeoutId) + } +} + +export interface FetchJsonResult { + status: number + json: unknown | null +} + +export async function fetchJson( + url: string, + timeoutMs: number, + headers: Record = {}, +): Promise { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), timeoutMs) + try { + const res = await fetch(url, { headers, signal: controller.signal }) + if (res.status !== 200) return { status: res.status, json: null } + return { status: 200, json: await res.json() } + } finally { + clearTimeout(timeoutId) + } +} + +export function githubAuthHeaders(token: string): Record { + return { Authorization: `bearer ${token}`, Accept: 'application/vnd.github+json' } +} + +const EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/ + +export function isEmail(value: string): boolean { + return EMAIL_RE.test(value.trim()) +} + +/** Returns the login if the URL is a bare github profile (github.com/), else null. */ +export function githubHandleFromUrl(value: string): string | null { + const m = value.trim().match(/^https?:\/\/github\.com\/([^/\s]+)\/?$/i) + return m ? m[1] : null +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts new file mode 100644 index 0000000000..2d742c5ed6 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts @@ -0,0 +1,55 @@ +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { Extractor, ExtractorResult } from '../types' + +import { GITHUB_API, fetchJson, githubAuthHeaders } from './http' + +const SOURCE = 'pvr' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapPvr( + body: unknown, + owner: string, + name: string, + fetchedAt: string, +): ExtractorResult { + const enabled = (body as any)?.enabled + + if (typeof enabled !== 'boolean') return { contacts: [], policies: {} } + if (!enabled) return { contacts: [], policies: { pvrEnabled: false } } + + const url = `https://github.com/${owner}/${name}/security/advisories/new` + return { + contacts: [ + { + channel: 'github-pvr', + value: url, + role: 'security-team', + tier: 'A', + provenance: [{ source: SOURCE, sourceTier: 'A', fetchedAt }], + }, + ], + policies: { pvrEnabled: true, vulnerabilityReportingUrl: url }, + } +} + +export const extractPvr: Extractor = async (target, deps) => { + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + // The endpoint works unauthenticated, but we use the token pool for rate-limit budget. + const headers = deps.getToken ? githubAuthHeaders(await deps.getToken()) : {} + const { json } = await fetchJson( + `${GITHUB_API}/repos/${owner}/${name}/private-vulnerability-reporting`, + deps.fetchTimeoutMs, + headers, + ) + if (!json) return { contacts: [], policies: {} } + + return mapPvr(json, owner, name, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts new file mode 100644 index 0000000000..b4a9db1a7b --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts @@ -0,0 +1,223 @@ +import yaml from 'js-yaml' + +import { getServiceChildLogger } from '@crowd/logging' + +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { + ContactChannel, + ContactRole, + Extractor, + ExtractorResult, + ProvenanceEntry, + RawContact, + RepoPolicies, +} from '../types' + +import { RAW_BASE, fetchText, githubHandleFromUrl, isEmail } from './http' + +const log = getServiceChildLogger('security-contacts:security-insights') + +const SOURCE = 'security-insights' +const PATHS = [ + 'SECURITY-INSIGHTS.yml', + '.github/SECURITY-INSIGHTS.yml', + '.gitlab/SECURITY-INSIGHTS.yml', +] + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +function arr(x: unknown): any[] { + return Array.isArray(x) ? x : [] +} + +function classifyValue(raw: string): { channel: ContactChannel; value: string } | null { + const v = raw.trim() + if (!v) return null + if (v.toLowerCase().startsWith('github:')) { + const handle = v.slice('github:'.length).trim() + return handle ? { channel: 'github-handle', value: handle } : null + } + if (isEmail(v)) return { channel: 'email', value: v } + const handle = githubHandleFromUrl(v) + if (handle) return { channel: 'github-handle', value: handle } + if (/^https?:\/\//i.test(v)) return { channel: 'url', value: v } + if (v.startsWith('@')) return { channel: 'github-handle', value: v.slice(1) } + return { channel: 'github-handle', value: v } +} + +// MAINTAINERS.md / OWNERS.md links are documents, not contacts. +function isDocumentUrl(value: string): boolean { + const v = value.toLowerCase() + return v.includes('/blob/') || v.endsWith('.md') +} + +type Mapped = { channel: ContactChannel; value: string; name?: string } + +// security-contacts entry: typed object {type,value} or a bare string +function fromContactEntry(entry: unknown): Mapped | null { + if (typeof entry === 'string') return classifyValue(entry) + if (entry && typeof entry === 'object') { + const o = entry as any + const value = o.value ?? o.email ?? o.url + if (typeof value !== 'string') return null + const name = typeof o.name === 'string' ? o.name : undefined + if (o.type === 'email') return { channel: 'email', value, name } + if (o.type === 'url') return { channel: 'url', value, name } + const c = classifyValue(value) + return c ? { ...c, name } : null + } + return null +} + +// people objects (core-team / administrators / champions / v2 contact): prefer email, else social handle +function fromPerson(p: unknown): Mapped | null { + if (!p || typeof p !== 'object') return null + const o = p as any + const name = typeof o.name === 'string' ? o.name : undefined + if (typeof o.email === 'string' && isEmail(o.email)) + return { channel: 'email', value: o.email, name } + if (typeof o.social === 'string') { + const c = classifyValue(o.social) + if (c) return { ...c, name } + } + return null +} + +export function mapSecurityInsights( + doc: unknown, + provPath: string, + fetchedAt: string, +): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + if (doc == null || typeof doc !== 'object') return { contacts, policies } + const root = doc as any + + const declaredAt: string | undefined = + typeof root.header?.['last-updated'] === 'string' + ? root.header['last-updated'] + : typeof root.header?.['last-reviewed'] === 'string' + ? root.header['last-reviewed'] + : undefined + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'A', path: provPath, fetchedAt, declaredAt }, + ] + + const add = (m: Mapped | null, role: ContactRole): void => { + if (m) contacts.push({ ...m, role, tier: 'A', provenance: prov() }) + } + const setPolicy = (key: keyof RepoPolicies, value: unknown): void => { + if (!policies[key] && typeof value === 'string' && value.trim()) { + ;(policies as any)[key] = value.trim() + } + } + + // Scan flat (v1) and nested (v2 project/repository) layouts uniformly. + const sections = [root, root.project, root.repository].filter((s) => s && typeof s === 'object') + + for (const section of sections) { + for (const e of arr(section['security-contacts'])) add(fromContactEntry(e), 'security-team') + + const vr = section['vulnerability-reporting'] + if (vr && typeof vr === 'object') { + if (typeof vr['email-contact'] === 'string' && isEmail(vr['email-contact'])) { + add({ channel: 'email', value: vr['email-contact'] }, 'security-team') + } + add(fromPerson(vr['contact']), 'security-team') + for (const e of arr(vr['security-contacts'])) add(fromContactEntry(e), 'security-team') + setPolicy('securityPolicyUrl', vr['security-policy']) + const programUrl = arr(vr['bug-bounty-programs']).find((p) => typeof p?.url === 'string')?.url + setPolicy('bugBountyUrl', vr['bug-bounty-url'] ?? programUrl) + } + + for (const p of arr(section['core-team'])) add(fromPerson(p), 'maintainer') + for (const p of arr(section['administrators'])) add(fromPerson(p), 'maintainer') + + const security = section['security'] + if (security && typeof security === 'object') { + for (const p of arr(security['champions'])) add(fromPerson(p), 'security-team') + } + + const pl = section['project-lifecycle'] + if (pl && typeof pl === 'object') { + for (const m of arr(pl['core-maintainers'])) { + if (typeof m !== 'string') { + add(fromPerson(m), 'maintainer') + continue + } + const c = classifyValue(m) + if (!c) continue + // MAINTAINERS.md/OWNERS.md are non-standardized markdown — not safely parseable. + // TODO(CM-1243): follow + extract maintainers from these documents in a later pass. + if (c.channel === 'url' && isDocumentUrl(c.value)) { + log.info( + { provPath, document: c.value }, + 'TODO(security-contacts): unparsed core-maintainers document', + ) + continue + } + add(c, 'maintainer') + } + } + + const documentation = section['documentation'] + if (documentation && typeof documentation === 'object') { + setPolicy('securityPolicyUrl', documentation['security-policy']) + } + } + + return { contacts, policies } +} + +export function parseSecurityInsights( + text: string, + provPath: string, + fetchedAt: string, +): ExtractorResult { + let doc: unknown + try { + doc = yaml.load(text) + } catch (err) { + log.warn({ provPath, errMsg: (err as Error).message }, 'Failed to parse SECURITY-INSIGHTS.yml') + return { contacts: [], policies: {} } + } + return mapSecurityInsights(doc, provPath, fetchedAt) +} + +export const extractSecurityInsights: Extractor = async (target, deps) => { + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + const fetchedAt = new Date().toISOString() + + for (const path of PATHS) { + const { text } = await fetchText( + `${RAW_BASE}/${owner}/${name}/HEAD/${path}`, + deps.fetchTimeoutMs, + ) + if (!text) continue + + let doc: unknown + try { + doc = yaml.load(text) + } catch { + continue + } + + const redirect = (doc as any)?.header?.['project-si-source'] + if (typeof redirect === 'string' && /^https?:\/\//i.test(redirect)) { + const redirected = await fetchText(redirect, deps.fetchTimeoutMs) + if (redirected.text) return parseSecurityInsights(redirected.text, redirect, fetchedAt) + } + + return mapSecurityInsights(doc, path, fetchedAt) + } + + return { contacts: [], policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/types.ts b/services/apps/packages_worker/src/security-contacts/types.ts index 7df1a3a54c..2c3e81901d 100644 --- a/services/apps/packages_worker/src/security-contacts/types.ts +++ b/services/apps/packages_worker/src/security-contacts/types.ts @@ -58,3 +58,11 @@ export interface RepoTarget { homepage: string | null packages: RepoPackage[] } + +export interface ExtractorDeps { + fetchTimeoutMs: number + /** Mints a GitHub installation token; only the authed extractors (A2/A3) use it. */ + getToken?: () => Promise +} + +export type Extractor = (target: RepoTarget, deps: ExtractorDeps) => Promise From 9a851a685400ce425a524602b58b06731684f858 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Mon, 29 Jun 2026 15:49:25 +0100 Subject: [PATCH 04/18] feat: add SECURITY_CONTACTS extractor (A3) for security contacts Signed-off-by: Mouad BANI --- .../extractors/securityContactsFile.ts | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts new file mode 100644 index 0000000000..e58d55d484 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts @@ -0,0 +1,97 @@ +import { getServiceChildLogger } from '@crowd/logging' + +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { Extractor, ProvenanceEntry, RawContact } from '../types' + +import { GITHUB_API, RAW_BASE, fetchJson, fetchText, githubAuthHeaders, isEmail } from './http' + +const log = getServiceChildLogger('security-contacts:security_contacts-file') + +const SOURCE = 'security_contacts' +const PATH = 'SECURITY_CONTACTS' +const HANDLE_RE = /^[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?$/ + +export interface SecurityContactEntry { + handle: string + email?: string +} + +export function parseSecurityContacts(text: string): SecurityContactEntry[] { + const entries: SecurityContactEntry[] = [] + for (const rawLine of text.split('\n')) { + const line = rawLine.trim() + if (!line || line.startsWith('#')) continue + + const tokens = line.replace(/^-\s*/, '').split(/\s+/) + const handle = tokens[0].replace(/^@/, '') + if (!HANDLE_RE.test(handle)) continue + + const email = tokens.slice(1).find((t) => isEmail(t)) + entries.push(email ? { handle, email } : { handle }) + } + return entries +} + +async function resolvePublicEmail( + login: string, + timeoutMs: number, + token?: string, +): Promise { + try { + const { json } = await fetchJson( + `${GITHUB_API}/users/${login}`, + timeoutMs, + token ? githubAuthHeaders(token) : {}, + ) + const email = (json as { email?: unknown } | null)?.email + return typeof email === 'string' && isEmail(email) ? email : null + } catch (err) { + log.warn({ login, errMsg: (err as Error).message }, 'Handle email resolution failed') + return null + } +} + +export const extractSecurityContactsFile: Extractor = async (target, deps) => { + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + const { text } = await fetchText(`${RAW_BASE}/${owner}/${name}/HEAD/${PATH}`, deps.fetchTimeoutMs) + if (!text) return { contacts: [], policies: {} } + + const fetchedAt = new Date().toISOString() + const prov = (): ProvenanceEntry[] => [{ source: SOURCE, sourceTier: 'A', path: PATH, fetchedAt }] + + const token = deps.getToken ? await deps.getToken() : undefined + const contacts: RawContact[] = [] + + for (const entry of parseSecurityContacts(text)) { + const email = + entry.email ?? (await resolvePublicEmail(entry.handle, deps.fetchTimeoutMs, token)) + if (email) { + // name = handle so the reconciler can identity-link this to a bare handle from another source + contacts.push({ + channel: 'email', + value: email, + name: entry.handle, + role: 'security-team', + tier: 'A', + provenance: prov(), + }) + } else { + contacts.push({ + channel: 'github-handle', + value: entry.handle, + role: 'security-team', + tier: 'A', + provenance: prov(), + }) + } + } + + return { contacts, policies: {} } +} From 351a6c9e02de9972b6aba18328839c830eae9727 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Mon, 29 Jun 2026 15:53:02 +0100 Subject: [PATCH 05/18] feat: add security.txt extractor (A4) for security contacts Signed-off-by: Mouad BANI --- .../extractors/securityTxt.ts | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts new file mode 100644 index 0000000000..a6bc43dfd6 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts @@ -0,0 +1,80 @@ +import { Extractor, ExtractorResult, ProvenanceEntry, RawContact, RepoPolicies } from '../types' + +import { fetchText, isEmail } from './http' + +const SOURCE = 'security.txt' + +// Platform hosts serve their own security.txt, not the project's — never attribute it to a repo. +const PLATFORM_HOSTS = new Set(['github.com', 'www.github.com', 'gitlab.com', 'bitbucket.org']) + +const FIELD_RE = /^([A-Za-z-]+):\s*(.+?)\s*$/ + +export function parseSecurityTxt( + text: string, + sourceUrl: string, + fetchedAt: string, +): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + + // Reject HTML served for a missing file (e.g. SPA 200s). + if (/]/i.test(text)) return { contacts, policies } + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'A', path: sourceUrl, fetchedAt }, + ] + const add = (channel: RawContact['channel'], value: string): void => { + contacts.push({ channel, value, role: 'security-team', tier: 'A', provenance: prov() }) + } + + let sawField = false + for (const rawLine of text.split('\n')) { + const line = rawLine.trim() + if (!line || line.startsWith('#')) continue + const m = FIELD_RE.exec(line) + if (!m) continue + const field = m[1].toLowerCase() + const value = m[2].trim() + + if (field === 'contact') { + sawField = true + if (value.toLowerCase().startsWith('mailto:')) { + const email = value.slice('mailto:'.length).trim() + if (isEmail(email)) add('email', email) + } else if (isEmail(value)) { + add('email', value) + } else if (/^https?:\/\//i.test(value)) { + add('url', value) + } + // tel: and other schemes are not actionable channels — skip + } else if (field === 'policy') { + sawField = true + if (!policies.securityPolicyUrl && /^https?:\/\//i.test(value)) + policies.securityPolicyUrl = value + } + } + + if (sawField) policies.securityTxtUrl = sourceUrl + return { contacts, policies } +} + +export const extractSecurityTxt: Extractor = async (target, deps) => { + if (!target.homepage) return { contacts: [], policies: {} } + + let origin: string + let host: string + try { + const u = new URL(target.homepage) + origin = u.origin + host = u.hostname.toLowerCase() + } catch { + return { contacts: [], policies: {} } + } + if (PLATFORM_HOSTS.has(host)) return { contacts: [], policies: {} } + + const url = `${origin}/.well-known/security.txt` + const { text } = await fetchText(url, deps.fetchTimeoutMs) + if (!text) return { contacts: [], policies: {} } + + return parseSecurityTxt(text, url, new Date().toISOString()) +} From 6eb4137cea6e0363c0356f858b276981ad1a37e2 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Mon, 29 Jun 2026 17:34:50 +0100 Subject: [PATCH 06/18] feat: add SECURITY.md extractor (B1) for security contacts Signed-off-by: Mouad BANI --- .../extractors/securityMd.ts | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts new file mode 100644 index 0000000000..5491c36c0d --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts @@ -0,0 +1,121 @@ +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { + ContactChannel, + Extractor, + ExtractorResult, + ProvenanceEntry, + RawContact, + RepoPolicies, +} from '../types' + +import { RAW_BASE, fetchText, githubHandleFromUrl } from './http' + +const SOURCE = 'security.md' +const PATHS = ['SECURITY.md', '.github/SECURITY.md', 'docs/SECURITY.md'] + +const KEYWORD_RE = + /\b(report|security|vulnerabilit(?:y|ies)|disclosure|contact|advisor(?:y|ies))\b/i +const NEGATIVE_SECTION_RE = /acknowledg|hall of fame|thanks|credits|honou?r|researchers/i +const EMAIL_RE = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g +const URL_RE = /https?:\/\/[^\s)<>\]"']+/g +const HEADING_RE = /^#{1,6}\s+(.*)$/ + +const PVR_RE = /private vulnerability reporting|\/security\/advisories\/new/i + +function cleanUrl(url: string): string { + return url.replace(/[.,;:]+$/, '') +} + +export function parseSecurityMd( + text: string, + owner: string, + name: string, + provPath: string, + fetchedAt: string, +): ExtractorResult { + const policies: Partial = { + securityPolicyUrl: `https://github.com/${owner}/${name}/blob/HEAD/${provPath}`, + } + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: provPath, fetchedAt }, + ] + + const seen = new Set() + const contacts: RawContact[] = [] + const add = (channel: ContactChannel, value: string): boolean => { + const key = `${channel}:${value.toLowerCase()}` + if (seen.has(key)) return false + seen.add(key) + contacts.push({ channel, value, role: 'security-team', tier: 'B', provenance: prov() }) + return true + } + + // Verbose policies list many references/people; bound how many B1 promotes. + const MAX_PER_CHANNEL = 3 + let emailCount = 0 + let urlCount = 0 + + const lines = text.split('\n') + let heading = '' + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + const headingMatch = HEADING_RE.exec(line.trim()) + if (headingMatch) { + heading = headingMatch[1] + continue + } + + // Skip researcher credits / acknowledgements sections — those are not contacts. + if (NEGATIVE_SECTION_RE.test(heading)) continue + // Paragraph-local proximity: a keyword must appear within ±2 lines of the candidate. + const windowText = lines.slice(Math.max(0, i - 2), i + 3).join('\n') + if (!KEYWORD_RE.test(windowText)) continue + + if (emailCount < MAX_PER_CHANNEL) { + for (const email of line.match(EMAIL_RE) ?? []) { + if (emailCount >= MAX_PER_CHANNEL) break + if (add('email', email)) emailCount++ + } + } + if (urlCount < MAX_PER_CHANNEL) { + for (const rawUrl of line.match(URL_RE) ?? []) { + if (urlCount >= MAX_PER_CHANNEL) break + const url = cleanUrl(rawUrl) + // The canonical PVR advisory URL is emitted as a github-pvr contact below. + if (/\/security\/advisories/i.test(url)) continue + // Bare github.com/ profile links are people listings, not reporting channels. + if (githubHandleFromUrl(url)) continue + if (add('url', url)) urlCount++ + } + } + } + + // PVR redirect language corroborates A2. Emitted unconditionally here; the pipeline + // vetoes it when A2 reports PVR disabled. TODO(CM-1243): apply the A2 veto in Step 6. + if (PVR_RE.test(text)) { + add('github-pvr', `https://github.com/${owner}/${name}/security/advisories/new`) + } + + return { contacts, policies } +} + +export const extractSecurityMd: Extractor = async (target, deps) => { + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + const fetchedAt = new Date().toISOString() + for (const path of PATHS) { + const { text } = await fetchText( + `${RAW_BASE}/${owner}/${name}/HEAD/${path}`, + deps.fetchTimeoutMs, + ) + if (text) return parseSecurityMd(text, owner, name, path, fetchedAt) + } + + return { contacts: [], policies: {} } +} From 17ddd3d44a3c5775a2b9b85580dbbeeac0be7d6b Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Mon, 29 Jun 2026 17:48:44 +0100 Subject: [PATCH 07/18] feat: add npm registry contact extractor (B2/npm) Signed-off-by: Mouad BANI --- .../extractors/registry/index.ts | 41 +++++++++++++ .../extractors/registry/npm.ts | 60 +++++++++++++++++++ .../extractors/registry/purl.ts | 33 ++++++++++ 3 files changed, 134 insertions(+) create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/purl.ts diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts new file mode 100644 index 0000000000..4c66739330 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts @@ -0,0 +1,41 @@ +import { Extractor, RawContact, RepoPackage, RepoPolicies } from '../../types' + +import { fetchNpm } from './npm' +import { parsePurl } from './purl' + +type EcosystemFetcher = ( + parsed: ReturnType & object, + pkg: RepoPackage, + timeoutMs: number, +) => Promise + +// Keyed by the lowercased packages.ecosystem value. go has no manifest contacts. +const FETCHERS: Record = { + npm: fetchNpm, +} + +export const extractManifest: Extractor = async (target, deps) => { + const contacts: RawContact[] = [] + const policies: Partial = {} + const seenPurls = new Set() + + for (const pkg of target.packages) { + if (seenPurls.has(pkg.purl)) continue + seenPurls.add(pkg.purl) + + const fetcher = FETCHERS[pkg.ecosystem?.toLowerCase()] + if (!fetcher) continue + + const parsed = parsePurl(pkg.purl) + if (!parsed) continue + + try { + contacts.push(...(await fetcher(parsed, pkg, deps.fetchTimeoutMs))) + } catch { + // one bad package must not sink the rest + continue + } + } + + return { contacts, policies } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts new file mode 100644 index 0000000000..1a99996dda --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts @@ -0,0 +1,60 @@ +import { ProvenanceEntry, RawContact, RepoPackage } from '../../types' +import { fetchJson, isEmail } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'npm-registry' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +function npmPackagePath(parsed: ParsedPurl): string { + const full = parsed.namespace ? `${parsed.namespace}/${parsed.name}` : parsed.name + // Scoped packages must percent-encode the slash for the registry path. + return full.startsWith('@') ? full.replace('/', '%2F') : full +} + +export function mapNpm(doc: unknown, sourceUrl: string, fetchedAt: string): RawContact[] { + if (!doc || typeof doc !== 'object') return [] + const d = doc as any + + const declaredAt = typeof d.time?.modified === 'string' ? d.time.modified : undefined + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt, declaredAt }, + ] + + const contacts: RawContact[] = [] + const seen = new Set() + const addEmail = (email: string, name?: string): void => { + if (!isEmail(email)) return + const key = email.toLowerCase() + if (seen.has(key)) return + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + name, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + + if (typeof d.bugs?.email === 'string') addEmail(d.bugs.email) + for (const m of Array.isArray(d.maintainers) ? d.maintainers : []) { + if (typeof m?.email === 'string') + addEmail(m.email, typeof m.name === 'string' ? m.name : undefined) + } + + return contacts +} + +export async function fetchNpm( + parsed: ParsedPurl, + _pkg: RepoPackage, + timeoutMs: number, +): Promise { + const url = `https://registry.npmjs.org/${npmPackagePath(parsed)}` + const { json } = await fetchJson(url, timeoutMs) + if (!json) return [] + return mapNpm(json, url, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/purl.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/purl.ts new file mode 100644 index 0000000000..a0ea5c0454 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/purl.ts @@ -0,0 +1,33 @@ +export interface ParsedPurl { + type: string + namespace: string | null + name: string +} + +/** + * Minimal Package URL parser: pkg:TYPE/NAMESPACE/NAME@VERSION?QUALIFIERS#SUBPATH. + * Only type/namespace/name are needed to address a registry. + */ +export function parsePurl(purl: string): ParsedPurl | null { + if (!purl || !purl.startsWith('pkg:')) return null + + let rest = purl.slice('pkg:'.length) + rest = rest.split('#')[0].split('?')[0] + + const slash = rest.indexOf('/') + if (slash === -1) return null + + const type = rest.slice(0, slash).toLowerCase() + let path = rest.slice(slash + 1) + + // Version is the last @ segment (but not a leading @scope on the name). + const at = path.lastIndexOf('@') + if (at > 0) path = path.slice(0, at) + + const segments = path.split('/').filter(Boolean).map(decodeURIComponent) + if (segments.length === 0) return null + + const name = segments[segments.length - 1] + const namespace = segments.length > 1 ? segments.slice(0, -1).join('/') : null + return { type, namespace, name } +} From 271287390ee0b2441fed7e28c166de254a29e56a Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 30 Jun 2026 15:47:24 +0100 Subject: [PATCH 08/18] feat: wire security contacts pipeline + DB write Signed-off-by: Mouad BANI --- services/apps/packages_worker/src/config.ts | 2 + .../src/security-contacts/extractors/http.ts | 12 +++ .../src/security-contacts/extractors/pvr.ts | 3 +- .../extractors/registry/cargo.ts | 60 +++++++++++ .../extractors/registry/composer.ts | 72 +++++++++++++ .../extractors/registry/index.ts | 32 ++++-- .../extractors/registry/maven.ts | 90 ++++++++++++++++ .../extractors/registry/npm.ts | 14 +-- .../extractors/registry/nuget.ts | 77 ++++++++++++++ .../extractors/registry/pypi.ts | 67 ++++++++++++ .../extractors/registry/rubygems.ts | 46 ++++++++ .../extractors/securityContactsFile.ts | 2 +- .../src/security-contacts/githubToken.ts | 74 +++++++++++++ .../src/security-contacts/processBatch.ts | 100 +++++++++++------- .../src/security-contacts/types.ts | 6 +- .../src/security-contacts/writeContacts.ts | 57 ++++++++++ 16 files changed, 660 insertions(+), 54 deletions(-) create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/cargo.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/composer.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/maven.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/nuget.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/pypi.ts create mode 100644 services/apps/packages_worker/src/security-contacts/extractors/registry/rubygems.ts create mode 100644 services/apps/packages_worker/src/security-contacts/githubToken.ts create mode 100644 services/apps/packages_worker/src/security-contacts/writeContacts.ts diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 6f64e46372..8a9dfae0ea 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -50,6 +50,8 @@ export function getSecurityContactsConfig() { return { // A repo is re-evaluated once its contacts are older than this many hours. updateIntervalHours: requireEnvInt('SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS'), + // Sent on all registry calls; crates.io rejects requests without an identifying UA. + userAgent: requireEnv('SECURITY_CONTACTS_USER_AGENT'), // Lower than the enricher: each repo fans out to several HTTP calls across extractors concurrency: parseInt(process.env.SECURITY_CONTACTS_CONCURRENCY ?? '20', 10), fetchTimeoutMs: parseInt(process.env.SECURITY_CONTACTS_FETCH_TIMEOUT_MS ?? '10000', 10), diff --git a/services/apps/packages_worker/src/security-contacts/extractors/http.ts b/services/apps/packages_worker/src/security-contacts/extractors/http.ts index bf83bb2e3d..11ee6a5487 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/http.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/http.ts @@ -1,6 +1,12 @@ export const RAW_BASE = 'https://raw.githubusercontent.com' export const GITHUB_API = 'https://api.github.com' +// crates.io rejects requests without a descriptive User-Agent (HTTP 403); harmless elsewhere. +// https://crates.io/policies#crawlers +export function registryHeaders(userAgent: string): Record { + return { 'User-Agent': userAgent } +} + export interface FetchTextResult { status: number text: string | null @@ -48,11 +54,17 @@ export function githubAuthHeaders(token: string): Record { } const EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/ +const EMAIL_GLOBAL_RE = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g export function isEmail(value: string): boolean { return EMAIL_RE.test(value.trim()) } +// Pulls all email addresses out of free-form strings like "Name , Name2 ". +export function extractEmails(value: string): string[] { + return value.match(EMAIL_GLOBAL_RE) ?? [] +} + /** Returns the login if the URL is a bare github profile (github.com/), else null. */ export function githubHandleFromUrl(value: string): string | null { const m = value.trim().match(/^https?:\/\/github\.com\/([^/\s]+)\/?$/i) diff --git a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts index 2d742c5ed6..674c074d40 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts @@ -43,7 +43,8 @@ export const extractPvr: Extractor = async (target, deps) => { } // The endpoint works unauthenticated, but we use the token pool for rate-limit budget. - const headers = deps.getToken ? githubAuthHeaders(await deps.getToken()) : {} + const token = deps.getToken ? await deps.getToken() : null + const headers = token ? githubAuthHeaders(token) : {} const { json } = await fetchJson( `${GITHUB_API}/repos/${owner}/${name}/private-vulnerability-reporting`, deps.fetchTimeoutMs, diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/cargo.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/cargo.ts new file mode 100644 index 0000000000..080a383643 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/cargo.ts @@ -0,0 +1,60 @@ +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { fetchJson, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'crates.io' + +// crates.io policy: max 1 request/second. Serialize calls process-wide. +// https://crates.io/policies#crawlers +let nextAllowedAt = 0 +async function throttle(): Promise { + const now = Date.now() + const waitMs = Math.max(0, nextAllowedAt - now) + nextAllowedAt = Math.max(now, nextAllowedAt) + 1000 + if (waitMs > 0) await new Promise((r) => setTimeout(r, waitMs)) +} + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapCargoOwners(doc: unknown, sourceUrl: string, fetchedAt: string): RawContact[] { + if (!doc || typeof doc !== 'object') return [] + const users = (doc as any).users + if (!Array.isArray(users)) return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const u of users) { + const login = u?.login + // crates.io logins like "github:org:team" are teams, not personal handles. + if (typeof login !== 'string' || login.includes(':')) continue + const key = login.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'github-handle', + value: login, + name: typeof u.name === 'string' ? u.name : undefined, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + return contacts +} + +export async function fetchCargo( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + // crates.io does not expose author emails; owners give GitHub handles. + await throttle() + const url = `https://crates.io/api/v1/crates/${encodeURIComponent(parsed.name)}/owners` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return { contacts: mapCargoOwners(json, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/composer.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/composer.ts new file mode 100644 index 0000000000..3f5a72d506 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/composer.ts @@ -0,0 +1,72 @@ +import { ExtractorResult, ProvenanceEntry, RawContact, RepoPolicies } from '../../types' +import { fetchJson, isEmail, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'packagist' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapComposer( + doc: unknown, + fullName: string, + sourceUrl: string, + fetchedAt: string, +): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + if (!doc || typeof doc !== 'object') return { contacts, policies } + + const versions = (doc as any).packages?.[fullName] + const latest = Array.isArray(versions) ? versions[0] : undefined + if (!latest || typeof latest !== 'object') return { contacts, policies } + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const seen = new Set() + const add = ( + channel: RawContact['channel'], + value: string, + role: RawContact['role'], + name?: string, + ): void => { + const key = `${channel}:${value.toLowerCase()}` + if (seen.has(key)) return + seen.add(key) + contacts.push({ channel, value, name, role, tier: 'B', provenance: prov() }) + } + + for (const a of Array.isArray(latest.authors) ? latest.authors : []) { + if (typeof a?.email === 'string' && isEmail(a.email)) { + add('email', a.email, 'maintainer', typeof a.name === 'string' ? a.name : undefined) + } + } + + const support = latest.support + if (support && typeof support === 'object') { + // support.security is Composer's dedicated security channel. + if (typeof support.security === 'string' && /^https?:\/\//i.test(support.security)) { + policies.securityPolicyUrl = support.security + add('url', support.security, 'security-team') + } + if (typeof support.email === 'string' && isEmail(support.email)) { + add('email', support.email, 'security-team') + } + } + + return { contacts, policies } +} + +export async function fetchComposer( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + if (!parsed.namespace) return { contacts: [], policies: {} } + const fullName = `${parsed.namespace}/${parsed.name}` + const url = `https://repo.packagist.org/p2/${fullName}.json` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return mapComposer(json, fullName, url, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts index 4c66739330..42b83cf330 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts @@ -1,17 +1,29 @@ -import { Extractor, RawContact, RepoPackage, RepoPolicies } from '../../types' +import { Extractor, ExtractorResult, RawContact, RepoPolicies } from '../../types' +import { fetchCargo } from './cargo' +import { fetchComposer } from './composer' +import { fetchMaven } from './maven' import { fetchNpm } from './npm' -import { parsePurl } from './purl' +import { fetchNuget } from './nuget' +import { ParsedPurl, parsePurl } from './purl' +import { fetchPypi } from './pypi' +import { fetchRubygems } from './rubygems' type EcosystemFetcher = ( - parsed: ReturnType & object, - pkg: RepoPackage, + parsed: ParsedPurl, timeoutMs: number, -) => Promise + userAgent: string, +) => Promise -// Keyed by the lowercased packages.ecosystem value. go has no manifest contacts. +// Keyed by the lowercased packages.ecosystem value. go has no package-manifest contacts. const FETCHERS: Record = { npm: fetchNpm, + pypi: fetchPypi, + maven: fetchMaven, + cargo: fetchCargo, + nuget: fetchNuget, + rubygems: fetchRubygems, + composer: fetchComposer, } export const extractManifest: Extractor = async (target, deps) => { @@ -30,7 +42,13 @@ export const extractManifest: Extractor = async (target, deps) => { if (!parsed) continue try { - contacts.push(...(await fetcher(parsed, pkg, deps.fetchTimeoutMs))) + const result = await fetcher(parsed, deps.fetchTimeoutMs, deps.userAgent) + contacts.push(...result.contacts) + for (const [key, value] of Object.entries(result.policies)) { + if (!(policies as Record)[key] && value != null) { + ;(policies as Record)[key] = value + } + } } catch { // one bad package must not sink the rest continue diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/maven.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/maven.ts new file mode 100644 index 0000000000..2a268d326a --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/maven.ts @@ -0,0 +1,90 @@ +import { XMLParser } from 'fast-xml-parser' + +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { fetchText, isEmail, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'maven-pom' +const BASE = 'https://repo1.maven.org/maven2' +const parser = new XMLParser({ ignoreAttributes: true }) + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +function asArray(x: T | T[] | undefined): T[] { + if (x === undefined || x === null) return [] + return Array.isArray(x) ? x : [x] +} + +export function mapMavenPom(xml: string, sourceUrl: string, fetchedAt: string): RawContact[] { + let doc: any + try { + doc = parser.parse(xml) + } catch { + return [] + } + const project = doc?.project + if (!project) return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const dev of asArray(project.developers?.developer)) { + const email = dev?.email + if (typeof email !== 'string' || !isEmail(email)) continue + const key = email.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + name: typeof dev.name === 'string' ? dev.name : undefined, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + return contacts +} + +async function resolveVersion( + groupPath: string, + artifact: string, + timeoutMs: number, + userAgent: string, +): Promise { + const url = `${BASE}/${groupPath}/${artifact}/maven-metadata.xml` + const { text } = await fetchText(url, timeoutMs, registryHeaders(userAgent)) + if (!text) return null + let doc: any + try { + doc = parser.parse(text) + } catch { + return null + } + const versioning = doc?.metadata?.versioning + const release = versioning?.release ?? versioning?.latest + if (typeof release === 'string') return release + const versions = asArray(versioning?.versions?.version) + return versions.length ? String(versions[versions.length - 1]) : null +} + +export async function fetchMaven( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + if (!parsed.namespace) return { contacts: [], policies: {} } + const groupPath = parsed.namespace.replace(/\./g, '/') + const artifact = parsed.name + + const version = await resolveVersion(groupPath, artifact, timeoutMs, userAgent) + if (!version) return { contacts: [], policies: {} } + + const url = `${BASE}/${groupPath}/${artifact}/${version}/${artifact}-${version}.pom` + const { text } = await fetchText(url, timeoutMs, registryHeaders(userAgent)) + if (!text) return { contacts: [], policies: {} } + return { contacts: mapMavenPom(text, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts index 1a99996dda..87b90dd30a 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts @@ -1,5 +1,5 @@ -import { ProvenanceEntry, RawContact, RepoPackage } from '../../types' -import { fetchJson, isEmail } from '../http' +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { fetchJson, isEmail, registryHeaders } from '../http' import { ParsedPurl } from './purl' @@ -50,11 +50,11 @@ export function mapNpm(doc: unknown, sourceUrl: string, fetchedAt: string): RawC export async function fetchNpm( parsed: ParsedPurl, - _pkg: RepoPackage, timeoutMs: number, -): Promise { + userAgent: string, +): Promise { const url = `https://registry.npmjs.org/${npmPackagePath(parsed)}` - const { json } = await fetchJson(url, timeoutMs) - if (!json) return [] - return mapNpm(json, url, new Date().toISOString()) + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return { contacts: mapNpm(json, url, new Date().toISOString()), policies: {} } } diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/nuget.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/nuget.ts new file mode 100644 index 0000000000..908dfe461a --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/nuget.ts @@ -0,0 +1,77 @@ +import { XMLParser } from 'fast-xml-parser' + +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { extractEmails, fetchJson, fetchText, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'nuget-nuspec' +const BASE = 'https://api.nuget.org/v3-flatcontainer' +const parser = new XMLParser({ ignoreAttributes: true }) + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +// NuGet authors/owners are display names; emails appear only when an author string embeds one. +export function mapNuspec(xml: string, sourceUrl: string, fetchedAt: string): RawContact[] { + let doc: any + try { + doc = parser.parse(xml) + } catch { + return [] + } + const meta = doc?.package?.metadata + if (!meta) return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const field of ['authors', 'owners']) { + if (typeof meta[field] !== 'string') continue + for (const email of extractEmails(meta[field])) { + const key = email.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + } + return contacts +} + +async function latestStableVersion( + id: string, + timeoutMs: number, + userAgent: string, +): Promise { + const { json } = await fetchJson( + `${BASE}/${id}/index.json`, + timeoutMs, + registryHeaders(userAgent), + ) + const versions = (json as { versions?: unknown } | null)?.versions + if (!Array.isArray(versions) || versions.length === 0) return null + const stable = [...versions].reverse().find((v) => typeof v === 'string' && !v.includes('-')) + return (stable as string) ?? (versions[versions.length - 1] as string) +} + +export async function fetchNuget( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + const id = parsed.name.toLowerCase() + const version = await latestStableVersion(id, timeoutMs, userAgent) + if (!version) return { contacts: [], policies: {} } + + const url = `${BASE}/${id}/${version}/${id}.nuspec` + const { text } = await fetchText(url, timeoutMs, registryHeaders(userAgent)) + if (!text) return { contacts: [], policies: {} } + return { contacts: mapNuspec(text, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/pypi.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/pypi.ts new file mode 100644 index 0000000000..ca53b7b226 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/pypi.ts @@ -0,0 +1,67 @@ +import { ExtractorResult, ProvenanceEntry, RawContact, RepoPolicies } from '../../types' +import { extractEmails, fetchJson, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'pypi' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapPypi(doc: unknown, sourceUrl: string, fetchedAt: string): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + if (!doc || typeof doc !== 'object') return { contacts, policies } + const info = (doc as any).info + if (!info || typeof info !== 'object') return { contacts, policies } + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const seen = new Set() + const addEmail = (email: string): void => { + const key = email.toLowerCase() + if (seen.has(key)) return + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + + // author_email / maintainer_email are RFC-5322 lists: "Name , Name2 " + for (const field of ['author_email', 'maintainer_email']) { + if (typeof info[field] === 'string') for (const e of extractEmails(info[field])) addEmail(e) + } + + const projectUrls = info.project_urls + if (projectUrls && typeof projectUrls === 'object') { + for (const [key, value] of Object.entries(projectUrls)) { + if (/security/i.test(key) && typeof value === 'string' && /^https?:\/\//i.test(value)) { + if (!policies.securityPolicyUrl) policies.securityPolicyUrl = value + contacts.push({ + channel: 'url', + value, + role: 'security-team', + tier: 'B', + provenance: prov(), + }) + } + } + } + + return { contacts, policies } +} + +export async function fetchPypi( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + const url = `https://pypi.org/pypi/${encodeURIComponent(parsed.name)}/json` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return mapPypi(json, url, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/rubygems.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/rubygems.ts new file mode 100644 index 0000000000..63375d36e0 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/rubygems.ts @@ -0,0 +1,46 @@ +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { extractEmails, fetchJson, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'rubygems' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +// RubyGems hides author emails; we can only surface emails when an author string embeds one. +// bug_tracker_uri is an issue tracker, not a security contact, so it is intentionally skipped. +export function mapRubygems(doc: unknown, sourceUrl: string, fetchedAt: string): RawContact[] { + if (!doc || typeof doc !== 'object') return [] + const authors = (doc as any).authors + if (typeof authors !== 'string') return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const email of extractEmails(authors)) { + const key = email.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + return contacts +} + +export async function fetchRubygems( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + const url = `https://rubygems.org/api/v1/gems/${encodeURIComponent(parsed.name)}.json` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return { contacts: mapRubygems(json, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts index e58d55d484..7d13b975ae 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts @@ -66,7 +66,7 @@ export const extractSecurityContactsFile: Extractor = async (target, deps) => { const fetchedAt = new Date().toISOString() const prov = (): ProvenanceEntry[] => [{ source: SOURCE, sourceTier: 'A', path: PATH, fetchedAt }] - const token = deps.getToken ? await deps.getToken() : undefined + const token = (deps.getToken ? await deps.getToken() : null) ?? undefined const contacts: RawContact[] = [] for (const entry of parseSecurityContacts(text)) { diff --git a/services/apps/packages_worker/src/security-contacts/githubToken.ts b/services/apps/packages_worker/src/security-contacts/githubToken.ts new file mode 100644 index 0000000000..730f65cf8b --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/githubToken.ts @@ -0,0 +1,74 @@ +import { getServiceChildLogger } from '@crowd/logging' + +import { getGithubAppConfig } from '../config' +import { + GithubAppConfig, + fetchRateLimitDiagnostics, + getInstallationToken, + resolveInstallations, +} from '../enricher/githubAppAuth' +import { InstallationPool } from '../enricher/installationPool' + +const log = getServiceChildLogger('security-contacts:github-token') + +interface Pool { + pool: InstallationPool + appConfig: GithubAppConfig +} + +// Module-scoped so installations are resolved once and reused across activity invocations. +let cached: Pool | null = null +let initPromise: Promise | null = null + +async function ensurePool(): Promise { + if (cached) return cached + if (!initPromise) { + initPromise = (async () => { + try { + const appConfig = getGithubAppConfig() + const discovered = await resolveInstallations(appConfig) + if (discovered.length === 0) { + log.warn('No GitHub App installations — authed extractors will run unauthenticated') + return null + } + const healthy = await fetchRateLimitDiagnostics( + appConfig.appId, + appConfig.privateKeyPem, + discovered, + ) + cached = { pool: new InstallationPool(healthy.length ? healthy : discovered), appConfig } + return cached + } catch (err) { + log.warn( + { errMsg: (err as Error).message }, + 'GitHub token pool unavailable — running unauthenticated', + ) + return null + } + })() + } + return initPromise +} + +/** + * Returns an installation token (round-robined across the pool) or null when the + * GitHub App is unavailable, so the authed extractors degrade to unauthenticated calls. + */ +export async function getSecurityContactsToken(): Promise { + const resolved = await ensurePool() + if (!resolved) return null + try { + const { installationId } = resolved.pool.select() + return await getInstallationToken( + resolved.appConfig.appId, + resolved.appConfig.privateKeyPem, + installationId, + ) + } catch (err) { + log.warn( + { errMsg: (err as Error).message }, + 'Token mint failed — falling back to unauthenticated', + ) + return null + } +} diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index af57da8c8e..2e4916d5aa 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -3,7 +3,23 @@ import { getServiceChildLogger } from '@crowd/logging' import { getSecurityContactsConfig } from '../config' -import { RepoPackage, RepoTarget } from './types' +import { extractPvr } from './extractors/pvr' +import { extractManifest } from './extractors/registry' +import { extractSecurityContactsFile } from './extractors/securityContactsFile' +import { extractSecurityInsights } from './extractors/securityInsights' +import { extractSecurityMd } from './extractors/securityMd' +import { extractSecurityTxt } from './extractors/securityTxt' +import { getSecurityContactsToken } from './githubToken' +import { reconcile } from './reconcile' +import { + Extractor, + ExtractorDeps, + RawContact, + RepoPackage, + RepoPolicies, + RepoTarget, +} from './types' +import { writeContacts } from './writeContacts' const log = getServiceChildLogger('security-contacts') @@ -14,6 +30,15 @@ export interface BatchResult { processed: number } +const EXTRACTORS: Extractor[] = [ + extractSecurityInsights, // A1 + extractPvr, // A2 + extractSecurityContactsFile, // A3 + extractSecurityTxt, // A4 + extractSecurityMd, // B1 + extractManifest, // B2 +] + interface SweepRow { id: string url: string @@ -21,12 +46,6 @@ interface SweepRow { packages: RepoPackage[] | null } -/** - * One page of github repos linked to an is_critical package whose contacts are - * missing or stale. Progress is driven by repos.contacts_last_refreshed (bumped - * once a repo is evaluated), so each invocation naturally advances to the next - * unprocessed repos — no cross-invocation cursor needed. - */ async function fetchBatch(qx: QueryExecutor, config: Config): Promise { return qx.select( ` @@ -51,12 +70,7 @@ async function fetchBatch(qx: QueryExecutor, config: Config): Promise( @@ -75,37 +89,51 @@ async function runWithConcurrency( ) } -/** - * Step 2 placeholder pipeline: builds the RepoTarget and logs it. Extractors, - * reconcile, score and write are wired in later steps; until then we bump - * contacts_last_refreshed so the sweep makes progress and the workflow can drain. - */ -async function processRepo(target: RepoTarget): Promise { - log.trace( - { repoId: target.repoId, url: target.url, packages: target.packages.length }, - 'Evaluated repo (pipeline not yet wired)', - ) -} +async function processRepo( + target: RepoTarget, + deps: ExtractorDeps, + qx: QueryExecutor, +): Promise { + // One failing extractor must not sink the repo. + const results = await Promise.allSettled(EXTRACTORS.map((extract) => extract(target, deps))) -/** Marks repos as evaluated. In Step 6 this moves into the per-repo write tx. */ -async function markRefreshed(qx: QueryExecutor, repoIds: string[]): Promise { - if (repoIds.length === 0) return - await qx.result( - `UPDATE repos SET contacts_last_refreshed = NOW() WHERE id = ANY($(repoIds)::bigint[])`, - { repoIds }, - ) + let contacts: RawContact[] = [] + const policies: Partial = {} + for (const r of results) { + if (r.status !== 'fulfilled') { + log.warn({ repoId: target.repoId, errMsg: r.reason?.message }, 'Extractor failed') + continue + } + contacts.push(...r.value.contacts) + for (const [key, value] of Object.entries(r.value.policies)) { + if (!(policies as Record)[key] && value != null) { + ;(policies as Record)[key] = value + } + } + } + + // A2 veto: B1 may emit a github-pvr contact from redirect language; drop it when A2 + // authoritatively reports PVR disabled (Option C from the design discussion). + if (policies.pvrEnabled === false) { + contacts = contacts.filter((c) => c.channel !== 'github-pvr') + } + + const scored = reconcile(contacts) + await writeContacts(qx, target.repoId, scored, policies) } export async function processBatch(qx: QueryExecutor, config: Config): Promise { const batch = await fetchBatch(qx, config) if (batch.length === 0) return { processed: 0 } + const deps: ExtractorDeps = { + fetchTimeoutMs: config.fetchTimeoutMs, + userAgent: config.userAgent, + getToken: getSecurityContactsToken, + } + const targets = batch.map(toTarget) - await runWithConcurrency(targets, config.concurrency, processRepo) - await markRefreshed( - qx, - targets.map((t) => t.repoId), - ) + await runWithConcurrency(targets, config.concurrency, (target) => processRepo(target, deps, qx)) log.info({ processed: targets.length }, 'Security contacts batch complete') return { processed: targets.length } diff --git a/services/apps/packages_worker/src/security-contacts/types.ts b/services/apps/packages_worker/src/security-contacts/types.ts index 2c3e81901d..d52bcc0e8d 100644 --- a/services/apps/packages_worker/src/security-contacts/types.ts +++ b/services/apps/packages_worker/src/security-contacts/types.ts @@ -61,8 +61,10 @@ export interface RepoTarget { export interface ExtractorDeps { fetchTimeoutMs: number - /** Mints a GitHub installation token; only the authed extractors (A2/A3) use it. */ - getToken?: () => Promise + /** Sent on registry calls; required (crates.io rejects requests without an identifying UA). */ + userAgent: string + /** Mints a GitHub installation token (null if unavailable); authed extractors fall back to unauth. */ + getToken?: () => Promise } export type Extractor = (target: RepoTarget, deps: ExtractorDeps) => Promise diff --git a/services/apps/packages_worker/src/security-contacts/writeContacts.ts b/services/apps/packages_worker/src/security-contacts/writeContacts.ts new file mode 100644 index 0000000000..ca90a085b9 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/writeContacts.ts @@ -0,0 +1,57 @@ +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' + +import { RepoPolicies, ScoredContact } from './types' + +/** + * Idempotent per-repo recompute: replace the repo's security_contacts rows and refresh + * the policy columns in one transaction. pvr_enabled uses COALESCE so an "unknown" result + * (A2 endpoint failure) never clears a previously known value. + */ +export async function writeContacts( + qx: QueryExecutor, + repoId: string, + contacts: ScoredContact[], + policies: Partial, +): Promise { + await qx.tx(async (tx) => { + await tx.result('DELETE FROM security_contacts WHERE repo_id = $(repoId)', { repoId }) + + for (const c of contacts) { + await tx.result( + `INSERT INTO security_contacts + (repo_id, channel, value, role, name, score, confidence, provenance, last_refreshed) + VALUES + ($(repoId), $(channel), $(value), $(role), $(name), $(score), $(confidence), $(provenance)::jsonb, NOW())`, + { + repoId, + channel: c.channel, + value: c.value, + role: c.role, + name: c.name ?? null, + score: c.score, + confidence: c.confidence, + provenance: JSON.stringify(c.provenance), + }, + ) + } + + await tx.result( + `UPDATE repos SET + security_policy_url = $(securityPolicyUrl), + vulnerability_reporting_url = $(vulnerabilityReportingUrl), + bug_bounty_url = $(bugBountyUrl), + security_txt_url = $(securityTxtUrl), + pvr_enabled = COALESCE($(pvrEnabled), pvr_enabled), + contacts_last_refreshed = NOW() + WHERE id = $(repoId)`, + { + repoId, + securityPolicyUrl: policies.securityPolicyUrl ?? null, + vulnerabilityReportingUrl: policies.vulnerabilityReportingUrl ?? null, + bugBountyUrl: policies.bugBountyUrl ?? null, + securityTxtUrl: policies.securityTxtUrl ?? null, + pvrEnabled: policies.pvrEnabled ?? null, + }, + ) + }) +} From cd868972d27764169e7e99fa7cac7127b0f466e2 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 30 Jun 2026 15:55:32 +0100 Subject: [PATCH 09/18] chore: wire security-contacts-worker ops (CLI, compose, scripts, env) Signed-off-by: Mouad BANI --- backend/.env.dist.composed | 7 ++ backend/.env.dist.local | 7 ++ scripts/builders/packages.env | 2 +- .../services/security-contacts-worker.yaml | 66 +++++++++++++++++++ services/apps/packages_worker/package.json | 3 + 5 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 scripts/services/security-contacts-worker.yaml diff --git a/backend/.env.dist.composed b/backend/.env.dist.composed index 0bf0dd50d7..a20aaa3960 100644 --- a/backend/.env.dist.composed +++ b/backend/.env.dist.composed @@ -35,3 +35,10 @@ CROWD_PACKAGES_DB_PORT=5432 CROWD_PACKAGES_DB_USERNAME=postgres CROWD_PACKAGES_DB_PASSWORD=example CROWD_PACKAGES_DB_DATABASE=packages-db + +# security-contacts-worker +SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS=24 +SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker" +SECURITY_CONTACTS_CONCURRENCY=20 +SECURITY_CONTACTS_FETCH_TIMEOUT_MS=10000 +SECURITY_CONTACTS_BATCH_SIZE=500 diff --git a/backend/.env.dist.local b/backend/.env.dist.local index bcba7bc5d3..a3c4d22c97 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -185,6 +185,13 @@ ENRICHER_BATCH_SIZE=100 ENRICHER_REPO_UPDATE_INTERVAL_HOURS=24 ENRICHER_IDLE_SLEEP_SEC=60 +# security-contacts-worker +SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS=24 +SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker (security@linuxfoundation.org)" +SECURITY_CONTACTS_CONCURRENCY=20 +SECURITY_CONTACTS_FETCH_TIMEOUT_MS=10000 +SECURITY_CONTACTS_BATCH_SIZE=500 + OSSPCKGS_GCP_PROJECT=local-dev OSSPCKGS_GCS_BUCKET=local-dev OSSPCKGS_GCP_CREDENTIALS_B64=e30= diff --git a/scripts/builders/packages.env b/scripts/builders/packages.env index 191e44ea95..07536c5ef9 100644 --- a/scripts/builders/packages.env +++ b/scripts/builders/packages.env @@ -1,4 +1,4 @@ DOCKERFILE="./services/docker/Dockerfile.packages" CONTEXT="../" REPO="sjc.ocir.io/axbydjxa5zuh/packages" -SERVICES="github-repos-enricher bq-dataset-ingest npm-worker maven-worker osv-worker dockerhub-sync cargo-worker go-worker nuget-worker" +SERVICES="github-repos-enricher bq-dataset-ingest npm-worker maven-worker osv-worker dockerhub-sync cargo-worker go-worker nuget-worker security-contacts-worker" diff --git a/scripts/services/security-contacts-worker.yaml b/scripts/services/security-contacts-worker.yaml new file mode 100644 index 0000000000..060bbbf0d7 --- /dev/null +++ b/scripts/services/security-contacts-worker.yaml @@ -0,0 +1,66 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: security-contacts-worker + SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' + CROWD_TEMPORAL_TASKQUEUE: packages-worker + SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS: '24' + +services: + security-contacts-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages + command: 'pnpm run start:security-contacts-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + security-contacts-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages + command: 'pnpm run dev:security-contacts-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: security-contacts-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 79600403dd..9e7dd56e05 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -40,6 +40,9 @@ "start:go-worker": "CROWD_TEMPORAL_TASKQUEUE=go-worker SERVICE=go-worker tsx src/bin/go-worker.ts", "dev:go-worker": "CROWD_TEMPORAL_TASKQUEUE=go-worker SERVICE=go-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9241 src/bin/go-worker.ts", "dev:go-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=go-worker SERVICE=go-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9241 src/bin/go-worker.ts", + "start:security-contacts-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker SERVICE=security-contacts-worker tsx src/bin/security-contacts-worker.ts", + "dev:security-contacts-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker SERVICE=security-contacts-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9243 src/bin/security-contacts-worker.ts", + "dev:security-contacts-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker SERVICE=security-contacts-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9243 src/bin/security-contacts-worker.ts", "start:nuget-worker": "CROWD_TEMPORAL_TASKQUEUE=nuget-worker SERVICE=nuget-worker tsx src/bin/nuget-worker.ts", "dev:nuget-worker": "CROWD_TEMPORAL_TASKQUEUE=nuget-worker SERVICE=nuget-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/nuget-worker.ts", "dev:nuget-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=nuget-worker SERVICE=nuget-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/nuget-worker.ts", From 8cb71f6ec9b26cce3b08b899bb42c3414dc3da97 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 30 Jun 2026 19:03:16 +0100 Subject: [PATCH 10/18] chore: use unified github host Signed-off-by: Mouad BANI --- .../apps/packages_worker/src/security-contacts/processBatch.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index 2e4916d5aa..b81c68afdc 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -56,7 +56,7 @@ async function fetchBatch(qx: QueryExecutor, config: Config): Promise Date: Wed, 1 Jul 2026 11:44:01 +0100 Subject: [PATCH 11/18] fix: code review fixes Signed-off-by: Mouad BANI --- .../extractors/registry/npm.ts | 2 +- .../extractors/securityInsights.ts | 20 ++++++++++++++++++- .../extractors/securityMd.ts | 4 ++-- .../src/security-contacts/processBatch.ts | 9 ++++++++- .../src/security-contacts/writeContacts.ts | 14 +++++++------ 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts index 87b90dd30a..8bfe8a29fb 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts @@ -10,7 +10,7 @@ const SOURCE = 'npm-registry' function npmPackagePath(parsed: ParsedPurl): string { const full = parsed.namespace ? `${parsed.namespace}/${parsed.name}` : parsed.name // Scoped packages must percent-encode the slash for the registry path. - return full.startsWith('@') ? full.replace('/', '%2F') : full + return full.startsWith('@') ? full.replaceAll('/', '%2F') : full } export function mapNpm(doc: unknown, sourceUrl: string, fetchedAt: string): RawContact[] { diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts index b4a9db1a7b..9bda249c90 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts @@ -45,6 +45,22 @@ function classifyValue(raw: string): { channel: ContactChannel; value: string } return { channel: 'github-handle', value: v } } +// project-si-source may only point at trusted raw-content hosts (SSRF guard). +const REDIRECT_ALLOWED_HOSTS = new Set([ + 'raw.githubusercontent.com', + 'gist.githubusercontent.com', + 'gitlab.com', +]) + +function isAllowedRedirect(url: string): boolean { + try { + const u = new URL(url) + return u.protocol === 'https:' && REDIRECT_ALLOWED_HOSTS.has(u.hostname.toLowerCase()) + } catch { + return false + } +} + // MAINTAINERS.md / OWNERS.md links are documents, not contacts. function isDocumentUrl(value: string): boolean { const v = value.toLowerCase() @@ -211,7 +227,9 @@ export const extractSecurityInsights: Extractor = async (target, deps) => { } const redirect = (doc as any)?.header?.['project-si-source'] - if (typeof redirect === 'string' && /^https?:\/\//i.test(redirect)) { + // Only follow redirects to trusted raw-content hosts — a repo-controlled URL must not + // be able to point the worker at internal/metadata endpoints (SSRF). + if (typeof redirect === 'string' && isAllowedRedirect(redirect)) { const redirected = await fetchText(redirect, deps.fetchTimeoutMs) if (redirected.text) return parseSecurityInsights(redirected.text, redirect, fetchedAt) } diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts index 5491c36c0d..6ff4272254 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts @@ -90,8 +90,8 @@ export function parseSecurityMd( } } - // PVR redirect language corroborates A2. Emitted unconditionally here; the pipeline - // vetoes it when A2 reports PVR disabled. TODO(CM-1243): apply the A2 veto in Step 6. + // PVR redirect language corroborates A2. Emitted unconditionally; processBatch vetoes it + // when A2 authoritatively reports PVR disabled. if (PVR_RE.test(text)) { add('github-pvr', `https://github.com/${owner}/${name}/security/advisories/new`) } diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index b81c68afdc..ce210cf21a 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -56,7 +56,7 @@ async function fetchBatch(qx: QueryExecutor, config: Config): Promise extract(target, deps))) + // If every extractor failed (e.g. a transient network outage), skip the write entirely so we + // don't delete previously good contacts/policies; leaving contacts_last_refreshed retries next sweep. + if (results.every((r) => r.status === 'rejected')) { + log.warn({ repoId: target.repoId }, 'All extractors failed — skipping write to preserve data') + return + } + let contacts: RawContact[] = [] const policies: Partial = {} for (const r of results) { diff --git a/services/apps/packages_worker/src/security-contacts/writeContacts.ts b/services/apps/packages_worker/src/security-contacts/writeContacts.ts index ca90a085b9..7f92652344 100644 --- a/services/apps/packages_worker/src/security-contacts/writeContacts.ts +++ b/services/apps/packages_worker/src/security-contacts/writeContacts.ts @@ -4,8 +4,8 @@ import { RepoPolicies, ScoredContact } from './types' /** * Idempotent per-repo recompute: replace the repo's security_contacts rows and refresh - * the policy columns in one transaction. pvr_enabled uses COALESCE so an "unknown" result - * (A2 endpoint failure) never clears a previously known value. + * the policy columns in one transaction. Policy columns use COALESCE so a run that doesn't + * rediscover a field (partial/failed extractor pass) never clears a previously known value. */ export async function writeContacts( qx: QueryExecutor, @@ -36,11 +36,13 @@ export async function writeContacts( } await tx.result( + // COALESCE preserves previously stored values when a run doesn't (re)discover a field — + // a partial/failed extractor pass must not wipe still-valid policy URLs or the PVR flag. `UPDATE repos SET - security_policy_url = $(securityPolicyUrl), - vulnerability_reporting_url = $(vulnerabilityReportingUrl), - bug_bounty_url = $(bugBountyUrl), - security_txt_url = $(securityTxtUrl), + security_policy_url = COALESCE($(securityPolicyUrl), security_policy_url), + vulnerability_reporting_url = COALESCE($(vulnerabilityReportingUrl), vulnerability_reporting_url), + bug_bounty_url = COALESCE($(bugBountyUrl), bug_bounty_url), + security_txt_url = COALESCE($(securityTxtUrl), security_txt_url), pvr_enabled = COALESCE($(pvrEnabled), pvr_enabled), contacts_last_refreshed = NOW() WHERE id = $(repoId)`, From f295c3a7d5a011ec899ac81716a93cb36642773c Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 1 Jul 2026 12:46:16 +0100 Subject: [PATCH 12/18] chore: raise security contacts worker concurrency; hardcode tuning constants Signed-off-by: Mouad BANI --- backend/.env.dist.composed | 6 +-- backend/.env.dist.local | 4 -- .../services/security-contacts-worker.yaml | 1 - services/apps/packages_worker/src/config.ts | 6 --- .../src/security-contacts/processBatch.ts | 40 ++++++++++++++++--- 5 files changed, 35 insertions(+), 22 deletions(-) diff --git a/backend/.env.dist.composed b/backend/.env.dist.composed index a20aaa3960..ef791867ca 100644 --- a/backend/.env.dist.composed +++ b/backend/.env.dist.composed @@ -37,8 +37,4 @@ CROWD_PACKAGES_DB_PASSWORD=example CROWD_PACKAGES_DB_DATABASE=packages-db # security-contacts-worker -SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS=24 -SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker" -SECURITY_CONTACTS_CONCURRENCY=20 -SECURITY_CONTACTS_FETCH_TIMEOUT_MS=10000 -SECURITY_CONTACTS_BATCH_SIZE=500 +SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker (security@linuxfoundation.org)" diff --git a/backend/.env.dist.local b/backend/.env.dist.local index a3c4d22c97..f1afa32b5c 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -186,11 +186,7 @@ ENRICHER_REPO_UPDATE_INTERVAL_HOURS=24 ENRICHER_IDLE_SLEEP_SEC=60 # security-contacts-worker -SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS=24 SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker (security@linuxfoundation.org)" -SECURITY_CONTACTS_CONCURRENCY=20 -SECURITY_CONTACTS_FETCH_TIMEOUT_MS=10000 -SECURITY_CONTACTS_BATCH_SIZE=500 OSSPCKGS_GCP_PROJECT=local-dev OSSPCKGS_GCS_BUCKET=local-dev diff --git a/scripts/services/security-contacts-worker.yaml b/scripts/services/security-contacts-worker.yaml index 060bbbf0d7..934afe2c69 100644 --- a/scripts/services/security-contacts-worker.yaml +++ b/scripts/services/security-contacts-worker.yaml @@ -7,7 +7,6 @@ x-env-args: &env-args SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' CROWD_TEMPORAL_TASKQUEUE: packages-worker - SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS: '24' services: security-contacts-worker: diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 8a9dfae0ea..cc5307f161 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -48,14 +48,8 @@ export function getEnricherConfig() { export function getSecurityContactsConfig() { return { - // A repo is re-evaluated once its contacts are older than this many hours. - updateIntervalHours: requireEnvInt('SECURITY_CONTACTS_UPDATE_INTERVAL_HOURS'), // Sent on all registry calls; crates.io rejects requests without an identifying UA. userAgent: requireEnv('SECURITY_CONTACTS_USER_AGENT'), - // Lower than the enricher: each repo fans out to several HTTP calls across extractors - concurrency: parseInt(process.env.SECURITY_CONTACTS_CONCURRENCY ?? '20', 10), - fetchTimeoutMs: parseInt(process.env.SECURITY_CONTACTS_FETCH_TIMEOUT_MS ?? '10000', 10), - batchSize: parseInt(process.env.SECURITY_CONTACTS_BATCH_SIZE ?? '500', 10), } } diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index ce210cf21a..dbc14982a0 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -30,6 +30,20 @@ export interface BatchResult { processed: number } +// Two-tier refresh cadence (the worker runs on a daily cron). Just under 24h/168h so a repo +// processed at ~06:00 is eligible again at the next daily/weekly tick rather than slipping a day. +const DAILY_INTERVAL_HOURS = 20 // repos never evaluated or with no contacts yet +const WEEKLY_INTERVAL_HOURS = 156 // already-enriched repos (have contacts) + +// Tuned for throughput within the platform ceilings: +// - CONCURRENCY: parallel repos; safe against GitHub REST secondary limits given the token pool. +// - FETCH_TIMEOUT_MS: generous enough for slow registries (Maven metadata/POM) without hanging slots. +// - BATCH_SIZE: bounded by the 30-min activity timeout (worst case: an all-cargo batch throttled +// to crates.io's 1 req/s finishes ~8 min). +const CONCURRENCY = 100 +const FETCH_TIMEOUT_MS = 15000 +const BATCH_SIZE = 500 + const EXTRACTORS: Extractor[] = [ extractSecurityInsights, // A1 extractPvr, // A2 @@ -46,7 +60,7 @@ interface SweepRow { packages: RepoPackage[] | null } -async function fetchBatch(qx: QueryExecutor, config: Config): Promise { +async function fetchBatch(qx: QueryExecutor): Promise { return qx.select( ` SELECT r.id::text AS id, @@ -58,14 +72,28 @@ async function fetchBatch(qx: QueryExecutor, config: Config): Promise { - const batch = await fetchBatch(qx, config) + const batch = await fetchBatch(qx) if (batch.length === 0) return { processed: 0 } const deps: ExtractorDeps = { - fetchTimeoutMs: config.fetchTimeoutMs, + fetchTimeoutMs: FETCH_TIMEOUT_MS, userAgent: config.userAgent, getToken: getSecurityContactsToken, } const targets = batch.map(toTarget) - await runWithConcurrency(targets, config.concurrency, (target) => processRepo(target, deps, qx)) + await runWithConcurrency(targets, CONCURRENCY, (target) => processRepo(target, deps, qx)) log.info({ processed: targets.length }, 'Security contacts batch complete') return { processed: targets.length } From 95dcf42a2a50a712ba8866cd1aabae4da122878c Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 1 Jul 2026 13:20:38 +0100 Subject: [PATCH 13/18] fix: code review fixes Signed-off-by: Mouad BANI --- .../src/security-contacts/extractors/http.ts | 14 +++++++--- .../extractors/securityContactsFile.ts | 4 +-- .../extractors/securityTxt.ts | 9 ++++++- .../src/security-contacts/processBatch.ts | 27 +++++-------------- .../src/security-contacts/reconcile.ts | 8 +++--- .../src/security-contacts/types.ts | 2 ++ .../src/security-contacts/writeContacts.ts | 8 ++++++ 7 files changed, 40 insertions(+), 32 deletions(-) diff --git a/services/apps/packages_worker/src/security-contacts/extractors/http.ts b/services/apps/packages_worker/src/security-contacts/extractors/http.ts index 11ee6a5487..84a9054020 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/http.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/http.ts @@ -7,6 +7,10 @@ export function registryHeaders(userAgent: string): Record { return { 'User-Agent': userAgent } } +// Genuinely-absent → null body; every other non-200 throws so transient failures (429/5xx/...) +// are treated as failures and the pipeline preserves existing data instead of wiping it. +const ABSENT_STATUSES = new Set([404, 410]) + export interface FetchTextResult { status: number text: string | null @@ -21,8 +25,9 @@ export async function fetchText( const timeoutId = setTimeout(() => controller.abort(), timeoutMs) try { const res = await fetch(url, { headers, signal: controller.signal }) - if (res.status !== 200) return { status: res.status, text: null } - return { status: 200, text: await res.text() } + if (res.status === 200) return { status: 200, text: await res.text() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, text: null } + throw new Error(`fetchText ${url} failed: HTTP ${res.status}`) } finally { clearTimeout(timeoutId) } @@ -42,8 +47,9 @@ export async function fetchJson( const timeoutId = setTimeout(() => controller.abort(), timeoutMs) try { const res = await fetch(url, { headers, signal: controller.signal }) - if (res.status !== 200) return { status: res.status, json: null } - return { status: 200, json: await res.json() } + if (res.status === 200) return { status: 200, json: await res.json() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, json: null } + throw new Error(`fetchJson ${url} failed: HTTP ${res.status}`) } finally { clearTimeout(timeoutId) } diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts index 7d13b975ae..6752a8f968 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts @@ -73,11 +73,11 @@ export const extractSecurityContactsFile: Extractor = async (target, deps) => { const email = entry.email ?? (await resolvePublicEmail(entry.handle, deps.fetchTimeoutMs, token)) if (email) { - // name = handle so the reconciler can identity-link this to a bare handle from another source + // handle = the username this email was resolved from (used for identity-linking). contacts.push({ channel: 'email', value: email, - name: entry.handle, + handle: entry.handle, role: 'security-team', tier: 'A', provenance: prov(), diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts index a6bc43dfd6..cd596247ee 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts @@ -9,6 +9,12 @@ const PLATFORM_HOSTS = new Set(['github.com', 'www.github.com', 'gitlab.com', 'b const FIELD_RE = /^([A-Za-z-]+):\s*(.+?)\s*$/ +// target.homepage is externally-sourced. Requiring https already blocks the classic SSRF target +// (cloud-metadata IMDS is http-only); we also reject obvious loopback/localhost. +function isBlockedHost(h: string): boolean { + return h === 'localhost' || h === '::1' || h === '0.0.0.0' || h.startsWith('127.') +} + export function parseSecurityTxt( text: string, sourceUrl: string, @@ -65,12 +71,13 @@ export const extractSecurityTxt: Extractor = async (target, deps) => { let host: string try { const u = new URL(target.homepage) + if (u.protocol !== 'https:') return { contacts: [], policies: {} } origin = u.origin host = u.hostname.toLowerCase() } catch { return { contacts: [], policies: {} } } - if (PLATFORM_HOSTS.has(host)) return { contacts: [], policies: {} } + if (PLATFORM_HOSTS.has(host) || isBlockedHost(host)) return { contacts: [], policies: {} } const url = `${origin}/.well-known/security.txt` const { text } = await fetchText(url, deps.fetchTimeoutMs) diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index dbc14982a0..6286192341 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -2,6 +2,7 @@ import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceChildLogger } from '@crowd/logging' import { getSecurityContactsConfig } from '../config' +import { mapWithConcurrency } from '../utils/concurrency' import { extractPvr } from './extractors/pvr' import { extractManifest } from './extractors/registry' @@ -19,7 +20,7 @@ import { RepoPolicies, RepoTarget, } from './types' -import { writeContacts } from './writeContacts' +import { markRepoAttempted, writeContacts } from './writeContacts' const log = getServiceChildLogger('security-contacts') @@ -101,22 +102,6 @@ function toTarget(row: SweepRow): RepoTarget { return { repoId: row.id, url: row.url, homepage: row.homepage, packages: row.packages ?? [] } } -async function runWithConcurrency( - items: T[], - concurrency: number, - task: (item: T) => Promise, -): Promise { - let idx = 0 - await Promise.all( - Array.from({ length: Math.min(concurrency, items.length) }, async () => { - while (idx < items.length) { - const item = items[idx++] - await task(item) - } - }), - ) -} - async function processRepo( target: RepoTarget, deps: ExtractorDeps, @@ -125,10 +110,10 @@ async function processRepo( // One failing extractor must not sink the repo. const results = await Promise.allSettled(EXTRACTORS.map((extract) => extract(target, deps))) - // If every extractor failed (e.g. a transient network outage), skip the write entirely so we - // don't delete previously good contacts/policies; leaving contacts_last_refreshed retries next sweep. + // Every extractor failed (transient outage) — preserve existing data, just record the attempt. if (results.every((r) => r.status === 'rejected')) { - log.warn({ repoId: target.repoId }, 'All extractors failed — skipping write to preserve data') + log.warn({ repoId: target.repoId }, 'All extractors failed — preserving existing data') + await markRepoAttempted(qx, target.repoId) return } @@ -168,7 +153,7 @@ export async function processBatch(qx: QueryExecutor, config: Config): Promise processRepo(target, deps, qx)) + await mapWithConcurrency(targets, CONCURRENCY, (target) => processRepo(target, deps, qx)) log.info({ processed: targets.length }, 'Security contacts batch complete') return { processed: targets.length } diff --git a/services/apps/packages_worker/src/security-contacts/reconcile.ts b/services/apps/packages_worker/src/security-contacts/reconcile.ts index a3759619e5..545c343007 100644 --- a/services/apps/packages_worker/src/security-contacts/reconcile.ts +++ b/services/apps/packages_worker/src/security-contacts/reconcile.ts @@ -50,6 +50,7 @@ function mergeInto(target: RawContact, src: RawContact): void { target.role = higherRole(target.role, src.role) target.tier = higherTier(target.tier, src.tier) if (!target.name && src.name) target.name = src.name + if (!target.handle && src.handle) target.handle = src.handle } function exactMatchMerge(contacts: RawContact[]): RawContact[] { @@ -66,13 +67,12 @@ function exactMatchMerge(contacts: RawContact[]): RawContact[] { return [...byKey.values()] } -// Collapse a resolved handle into its email: extractors set the email contact's -// `name` to the originating handle, so a github-handle whose value matches becomes -// redundant once the email is known. The email (higher-quality) channel is kept. +// Collapse a bare github-handle into the email A3 resolved it from, matched via the explicit +// `handle` field (never the display name, to avoid merging unrelated people who share a name). function identityLinkMerge(contacts: RawContact[]): RawContact[] { const emailByHandle = new Map() for (const c of contacts) { - if (c.channel === 'email' && c.name) emailByHandle.set(c.name.toLowerCase(), c) + if (c.channel === 'email' && c.handle) emailByHandle.set(c.handle.toLowerCase(), c) } const out: RawContact[] = [] diff --git a/services/apps/packages_worker/src/security-contacts/types.ts b/services/apps/packages_worker/src/security-contacts/types.ts index d52bcc0e8d..d3e5c6471a 100644 --- a/services/apps/packages_worker/src/security-contacts/types.ts +++ b/services/apps/packages_worker/src/security-contacts/types.ts @@ -25,6 +25,8 @@ export interface RawContact { value: string role: ContactRole name?: string + /** Username an A3 email was resolved from — used only to identity-link a bare github-handle. */ + handle?: string tier: SourceTier provenance: ProvenanceEntry[] } diff --git a/services/apps/packages_worker/src/security-contacts/writeContacts.ts b/services/apps/packages_worker/src/security-contacts/writeContacts.ts index 7f92652344..a1f357dc22 100644 --- a/services/apps/packages_worker/src/security-contacts/writeContacts.ts +++ b/services/apps/packages_worker/src/security-contacts/writeContacts.ts @@ -7,6 +7,14 @@ import { RepoPolicies, ScoredContact } from './types' * the policy columns in one transaction. Policy columns use COALESCE so a run that doesn't * rediscover a field (partial/failed extractor pass) never clears a previously known value. */ +// Records the attempt without touching contacts/policies — preserves existing data on total +// failure while advancing contacts_last_refreshed so the repo isn't reprocessed this sweep. +export async function markRepoAttempted(qx: QueryExecutor, repoId: string): Promise { + await qx.result('UPDATE repos SET contacts_last_refreshed = NOW() WHERE id = $(repoId)', { + repoId, + }) +} + export async function writeContacts( qx: QueryExecutor, repoId: string, From c50d5e90d0a9ae8c202694535c597b04044df057 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 1 Jul 2026 13:34:14 +0100 Subject: [PATCH 14/18] fix: isolate per-repo failures in security contacts batch Signed-off-by: Mouad BANI --- .../src/security-contacts/processBatch.ts | 29 +++++++++++++------ .../src/security-contacts/writeContacts.ts | 10 +++---- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index 6286192341..5a92f16dc8 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -107,12 +107,17 @@ async function processRepo( deps: ExtractorDeps, qx: QueryExecutor, ): Promise { - // One failing extractor must not sink the repo. const results = await Promise.allSettled(EXTRACTORS.map((extract) => extract(target, deps))) - // Every extractor failed (transient outage) — preserve existing data, just record the attempt. - if (results.every((r) => r.status === 'rejected')) { - log.warn({ repoId: target.repoId }, 'All extractors failed — preserving existing data') + // Replace the repo's contacts only when every extractor succeeded. If any failed (transient + // error — non-200s throw), a destructive rewrite would drop contacts a failed tier-A/B extractor + // still has, so preserve existing data and just record the attempt; retried next cadence. + const failed = results.find((r) => r.status === 'rejected') as PromiseRejectedResult | undefined + if (failed) { + log.warn( + { repoId: target.repoId, errMsg: failed.reason?.message }, + 'Extractor failed — preserving existing data', + ) await markRepoAttempted(qx, target.repoId) return } @@ -120,10 +125,7 @@ async function processRepo( let contacts: RawContact[] = [] const policies: Partial = {} for (const r of results) { - if (r.status !== 'fulfilled') { - log.warn({ repoId: target.repoId, errMsg: r.reason?.message }, 'Extractor failed') - continue - } + if (r.status !== 'fulfilled') continue contacts.push(...r.value.contacts) for (const [key, value] of Object.entries(r.value.policies)) { if (!(policies as Record)[key] && value != null) { @@ -153,7 +155,16 @@ export async function processBatch(qx: QueryExecutor, config: Config): Promise processRepo(target, deps, qx)) + // Isolate per-repo failures: mapWithConcurrency is fail-fast, so a single repo's DB error must + // not abort the batch (and, across Temporal retries, halt the whole sweep). Unmarked repos on + // error are simply retried next sweep. + await mapWithConcurrency(targets, CONCURRENCY, async (target) => { + try { + await processRepo(target, deps, qx) + } catch (err) { + log.error({ repoId: target.repoId, errMsg: (err as Error).message }, 'Repo processing failed') + } + }) log.info({ processed: targets.length }, 'Security contacts batch complete') return { processed: targets.length } diff --git a/services/apps/packages_worker/src/security-contacts/writeContacts.ts b/services/apps/packages_worker/src/security-contacts/writeContacts.ts index a1f357dc22..2a5d513f56 100644 --- a/services/apps/packages_worker/src/security-contacts/writeContacts.ts +++ b/services/apps/packages_worker/src/security-contacts/writeContacts.ts @@ -2,11 +2,6 @@ import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' import { RepoPolicies, ScoredContact } from './types' -/** - * Idempotent per-repo recompute: replace the repo's security_contacts rows and refresh - * the policy columns in one transaction. Policy columns use COALESCE so a run that doesn't - * rediscover a field (partial/failed extractor pass) never clears a previously known value. - */ // Records the attempt without touching contacts/policies — preserves existing data on total // failure while advancing contacts_last_refreshed so the repo isn't reprocessed this sweep. export async function markRepoAttempted(qx: QueryExecutor, repoId: string): Promise { @@ -15,6 +10,11 @@ export async function markRepoAttempted(qx: QueryExecutor, repoId: string): Prom }) } +/** + * Idempotent per-repo recompute: replace the repo's security_contacts rows and refresh + * the policy columns in one transaction. Policy columns use COALESCE so a run that doesn't + * rediscover a field (partial/failed extractor pass) never clears a previously known value. + */ export async function writeContacts( qx: QueryExecutor, repoId: string, From 5f32ed1a37e9eaa24ea51db799c866a018fc5b22 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 1 Jul 2026 13:56:00 +0100 Subject: [PATCH 15/18] fix: clear PVR reporting URL on disable; drain poison repos Signed-off-by: Mouad BANI --- .../packages_worker/src/security-contacts/processBatch.ts | 3 +++ .../src/security-contacts/writeContacts.ts | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index 5a92f16dc8..769c5f3bc1 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -163,6 +163,9 @@ export async function processBatch(qx: QueryExecutor, config: Config): Promise 0 forever). + await markRepoAttempted(qx, target.repoId).catch(() => undefined) } }) diff --git a/services/apps/packages_worker/src/security-contacts/writeContacts.ts b/services/apps/packages_worker/src/security-contacts/writeContacts.ts index 2a5d513f56..6d2196fc53 100644 --- a/services/apps/packages_worker/src/security-contacts/writeContacts.ts +++ b/services/apps/packages_worker/src/security-contacts/writeContacts.ts @@ -46,9 +46,14 @@ export async function writeContacts( await tx.result( // COALESCE preserves previously stored values when a run doesn't (re)discover a field — // a partial/failed extractor pass must not wipe still-valid policy URLs or the PVR flag. + // vulnerability_reporting_url is PVR-derived, so when PVR is authoritatively resolved we + // overwrite it (clearing it once PVR is disabled); otherwise it is preserved. `UPDATE repos SET security_policy_url = COALESCE($(securityPolicyUrl), security_policy_url), - vulnerability_reporting_url = COALESCE($(vulnerabilityReportingUrl), vulnerability_reporting_url), + vulnerability_reporting_url = CASE WHEN $(pvrResolved) + THEN $(vulnerabilityReportingUrl) + ELSE COALESCE($(vulnerabilityReportingUrl), vulnerability_reporting_url) + END, bug_bounty_url = COALESCE($(bugBountyUrl), bug_bounty_url), security_txt_url = COALESCE($(securityTxtUrl), security_txt_url), pvr_enabled = COALESCE($(pvrEnabled), pvr_enabled), @@ -58,6 +63,7 @@ export async function writeContacts( repoId, securityPolicyUrl: policies.securityPolicyUrl ?? null, vulnerabilityReportingUrl: policies.vulnerabilityReportingUrl ?? null, + pvrResolved: policies.pvrEnabled !== undefined, bugBountyUrl: policies.bugBountyUrl ?? null, securityTxtUrl: policies.securityTxtUrl ?? null, pvrEnabled: policies.pvrEnabled ?? null, From 85cb09da8aad9704fb1896d873479d3cec4d6b52 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 1 Jul 2026 15:21:15 +0100 Subject: [PATCH 16/18] fix: incorrect SECURITY_CONTACTS_USER_AGENT value Signed-off-by: Mouad BANI --- backend/.env.dist.composed | 2 +- backend/.env.dist.local | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/.env.dist.composed b/backend/.env.dist.composed index ef791867ca..b0af8d7a05 100644 --- a/backend/.env.dist.composed +++ b/backend/.env.dist.composed @@ -37,4 +37,4 @@ CROWD_PACKAGES_DB_PASSWORD=example CROWD_PACKAGES_DB_DATABASE=packages-db # security-contacts-worker -SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker (security@linuxfoundation.org)" +SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker" diff --git a/backend/.env.dist.local b/backend/.env.dist.local index f1afa32b5c..adca4976c3 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -186,7 +186,7 @@ ENRICHER_REPO_UPDATE_INTERVAL_HOURS=24 ENRICHER_IDLE_SLEEP_SEC=60 # security-contacts-worker -SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker (security@linuxfoundation.org)" +SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker" OSSPCKGS_GCP_PROJECT=local-dev OSSPCKGS_GCS_BUCKET=local-dev From 082f6a5b08398069c863c793d83e82035a272eec Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 1 Jul 2026 15:50:23 +0100 Subject: [PATCH 17/18] fix: skip PVR for archived repos; treat PVR 422 as unknown Signed-off-by: Mouad BANI --- .../src/security-contacts/extractors/http.ts | 8 +++++--- .../src/security-contacts/extractors/pvr.ts | 4 ++++ .../src/security-contacts/processBatch.ts | 10 +++++++++- .../packages_worker/src/security-contacts/types.ts | 2 ++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/services/apps/packages_worker/src/security-contacts/extractors/http.ts b/services/apps/packages_worker/src/security-contacts/extractors/http.ts index 84a9054020..81f7a0f700 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/http.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/http.ts @@ -7,9 +7,11 @@ export function registryHeaders(userAgent: string): Record { return { 'User-Agent': userAgent } } -// Genuinely-absent → null body; every other non-200 throws so transient failures (429/5xx/...) -// are treated as failures and the pipeline preserves existing data instead of wiping it. -const ABSENT_STATUSES = new Set([404, 410]) +// Genuinely-absent / not-determinable → null body; every other non-200 throws so transient +// failures (429/5xx/...) are treated as failures and the pipeline preserves data instead of +// wiping it. 422 is included because GitHub's PVR endpoint returns it (per-repo, non-transient) +// when the flag can't be determined — that must read as "unknown", not block the whole repo. +const ABSENT_STATUSES = new Set([404, 410, 422]) export interface FetchTextResult { status: number diff --git a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts index 674c074d40..cf971fac3c 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts @@ -34,6 +34,10 @@ export function mapPvr( } export const extractPvr: Extractor = async (target, deps) => { + // GitHub's PVR endpoint rejects archived (and private) repos with 422; skip the call for + // known-archived repos. Non-archived-yet-unknown repos still get the 422→unknown safety net. + if (target.archived) return { contacts: [], policies: {} } + let owner: string let name: string try { diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index 769c5f3bc1..e8a0ecb64f 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -58,6 +58,7 @@ interface SweepRow { id: string url: string homepage: string | null + archived: boolean | null packages: RepoPackage[] | null } @@ -67,6 +68,7 @@ async function fetchBatch(qx: QueryExecutor): Promise { SELECT r.id::text AS id, r.url, r.homepage, + r.archived, json_agg(json_build_object('purl', p.purl, 'ecosystem', p.ecosystem)) AS packages FROM repos r JOIN package_repos pr ON pr.repo_id = r.id @@ -99,7 +101,13 @@ async function fetchBatch(qx: QueryExecutor): Promise { } function toTarget(row: SweepRow): RepoTarget { - return { repoId: row.id, url: row.url, homepage: row.homepage, packages: row.packages ?? [] } + return { + repoId: row.id, + url: row.url, + homepage: row.homepage, + archived: row.archived, + packages: row.packages ?? [], + } } async function processRepo( diff --git a/services/apps/packages_worker/src/security-contacts/types.ts b/services/apps/packages_worker/src/security-contacts/types.ts index d3e5c6471a..5b7454fff6 100644 --- a/services/apps/packages_worker/src/security-contacts/types.ts +++ b/services/apps/packages_worker/src/security-contacts/types.ts @@ -58,6 +58,8 @@ export interface RepoTarget { repoId: string url: string homepage: string | null + /** From the enricher; null = not yet enriched. Archived repos can't be queried for PVR. */ + archived: boolean | null packages: RepoPackage[] } From 2ece757bdbfc4037e51d317d394d940ff702fd74 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 1 Jul 2026 16:56:39 +0100 Subject: [PATCH 18/18] fix: rate-limit-safe GitHub access for security contacts Signed-off-by: Mouad BANI --- .../src/security-contacts/extractors/http.ts | 63 ++++--- .../src/security-contacts/extractors/pvr.ts | 15 +- .../extractors/securityContactsFile.ts | 21 +-- .../extractors/securityInsights.ts | 7 +- .../extractors/securityMd.ts | 7 +- .../src/security-contacts/githubToken.ts | 171 ++++++++++++++++-- .../src/security-contacts/processBatch.ts | 8 +- .../src/security-contacts/types.ts | 14 +- 8 files changed, 221 insertions(+), 85 deletions(-) diff --git a/services/apps/packages_worker/src/security-contacts/extractors/http.ts b/services/apps/packages_worker/src/security-contacts/extractors/http.ts index 81f7a0f700..8f1c625e20 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/http.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/http.ts @@ -1,6 +1,3 @@ -export const RAW_BASE = 'https://raw.githubusercontent.com' -export const GITHUB_API = 'https://api.github.com' - // crates.io rejects requests without a descriptive User-Agent (HTTP 403); harmless elsewhere. // https://crates.io/policies#crawlers export function registryHeaders(userAgent: string): Record { @@ -8,11 +5,37 @@ export function registryHeaders(userAgent: string): Record { } // Genuinely-absent / not-determinable → null body; every other non-200 throws so transient -// failures (429/5xx/...) are treated as failures and the pipeline preserves data instead of +// failures (5xx/...) are treated as failures and the pipeline preserves data instead of // wiping it. 422 is included because GitHub's PVR endpoint returns it (per-repo, non-transient) // when the flag can't be determined — that must read as "unknown", not block the whole repo. const ABSENT_STATUSES = new Set([404, 410, 422]) +// Registry rate-limit / overload responses: retried in-process (honoring Retry-After) so a brief +// throttle doesn't fail the extractor and cost the repo a whole refresh cadence. +const RATE_LIMIT_STATUSES = new Set([429, 503]) +const MAX_RATE_LIMIT_RETRIES = 3 + +async function fetchWithRetry( + url: string, + timeoutMs: number, + headers: Record, +): Promise { + for (let attempt = 0; ; attempt++) { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), timeoutMs) + let res: Response + try { + res = await fetch(url, { headers, signal: controller.signal }) + } finally { + clearTimeout(timeoutId) + } + if (!RATE_LIMIT_STATUSES.has(res.status) || attempt >= MAX_RATE_LIMIT_RETRIES) return res + const retryAfterSec = parseInt(res.headers.get('retry-after') ?? '0', 10) + const waitMs = retryAfterSec ? retryAfterSec * 1000 : Math.min(30_000, 1_000 * 2 ** attempt) + await new Promise((r) => setTimeout(r, waitMs)) + } +} + export interface FetchTextResult { status: number text: string | null @@ -23,16 +46,10 @@ export async function fetchText( timeoutMs: number, headers: Record = {}, ): Promise { - const controller = new AbortController() - const timeoutId = setTimeout(() => controller.abort(), timeoutMs) - try { - const res = await fetch(url, { headers, signal: controller.signal }) - if (res.status === 200) return { status: 200, text: await res.text() } - if (ABSENT_STATUSES.has(res.status)) return { status: res.status, text: null } - throw new Error(`fetchText ${url} failed: HTTP ${res.status}`) - } finally { - clearTimeout(timeoutId) - } + const res = await fetchWithRetry(url, timeoutMs, headers) + if (res.status === 200) return { status: 200, text: await res.text() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, text: null } + throw new Error(`fetchText ${url} failed: HTTP ${res.status}`) } export interface FetchJsonResult { @@ -45,20 +62,10 @@ export async function fetchJson( timeoutMs: number, headers: Record = {}, ): Promise { - const controller = new AbortController() - const timeoutId = setTimeout(() => controller.abort(), timeoutMs) - try { - const res = await fetch(url, { headers, signal: controller.signal }) - if (res.status === 200) return { status: 200, json: await res.json() } - if (ABSENT_STATUSES.has(res.status)) return { status: res.status, json: null } - throw new Error(`fetchJson ${url} failed: HTTP ${res.status}`) - } finally { - clearTimeout(timeoutId) - } -} - -export function githubAuthHeaders(token: string): Record { - return { Authorization: `bearer ${token}`, Accept: 'application/vnd.github+json' } + const res = await fetchWithRetry(url, timeoutMs, headers) + if (res.status === 200) return { status: 200, json: await res.json() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, json: null } + throw new Error(`fetchJson ${url} failed: HTTP ${res.status}`) } const EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/ diff --git a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts index cf971fac3c..b573a2cdf1 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts @@ -1,8 +1,6 @@ import { parseGithubUrl } from '../../enricher/fetchLightRepo' import { Extractor, ExtractorResult } from '../types' -import { GITHUB_API, fetchJson, githubAuthHeaders } from './http' - const SOURCE = 'pvr' /* eslint-disable @typescript-eslint/no-explicit-any */ @@ -46,15 +44,10 @@ export const extractPvr: Extractor = async (target, deps) => { return { contacts: [], policies: {} } } - // The endpoint works unauthenticated, but we use the token pool for rate-limit budget. - const token = deps.getToken ? await deps.getToken() : null - const headers = token ? githubAuthHeaders(token) : {} - const { json } = await fetchJson( - `${GITHUB_API}/repos/${owner}/${name}/private-vulnerability-reporting`, - deps.fetchTimeoutMs, - headers, + const { text } = await deps.githubGet( + `/repos/${owner}/${name}/private-vulnerability-reporting`, ) - if (!json) return { contacts: [], policies: {} } + if (!text) return { contacts: [], policies: {} } - return mapPvr(json, owner, name, new Date().toISOString()) + return mapPvr(JSON.parse(text), owner, name, new Date().toISOString()) } diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts index 6752a8f968..b54860f718 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts @@ -1,9 +1,9 @@ import { getServiceChildLogger } from '@crowd/logging' import { parseGithubUrl } from '../../enricher/fetchLightRepo' -import { Extractor, ProvenanceEntry, RawContact } from '../types' +import { ExtractorDeps, ProvenanceEntry, RawContact, Extractor } from '../types' -import { GITHUB_API, RAW_BASE, fetchJson, fetchText, githubAuthHeaders, isEmail } from './http' +import { isEmail } from './http' const log = getServiceChildLogger('security-contacts:security_contacts-file') @@ -34,16 +34,11 @@ export function parseSecurityContacts(text: string): SecurityContactEntry[] { async function resolvePublicEmail( login: string, - timeoutMs: number, - token?: string, + githubGet: ExtractorDeps['githubGet'], ): Promise { try { - const { json } = await fetchJson( - `${GITHUB_API}/users/${login}`, - timeoutMs, - token ? githubAuthHeaders(token) : {}, - ) - const email = (json as { email?: unknown } | null)?.email + const { text } = await githubGet(`/users/${login}`) + const email = (text ? (JSON.parse(text) as { email?: unknown }) : null)?.email return typeof email === 'string' && isEmail(email) ? email : null } catch (err) { log.warn({ login, errMsg: (err as Error).message }, 'Handle email resolution failed') @@ -60,18 +55,16 @@ export const extractSecurityContactsFile: Extractor = async (target, deps) => { return { contacts: [], policies: {} } } - const { text } = await fetchText(`${RAW_BASE}/${owner}/${name}/HEAD/${PATH}`, deps.fetchTimeoutMs) + const { text } = await deps.githubGet(`/repos/${owner}/${name}/contents/${PATH}`, { raw: true }) if (!text) return { contacts: [], policies: {} } const fetchedAt = new Date().toISOString() const prov = (): ProvenanceEntry[] => [{ source: SOURCE, sourceTier: 'A', path: PATH, fetchedAt }] - const token = (deps.getToken ? await deps.getToken() : null) ?? undefined const contacts: RawContact[] = [] for (const entry of parseSecurityContacts(text)) { - const email = - entry.email ?? (await resolvePublicEmail(entry.handle, deps.fetchTimeoutMs, token)) + const email = entry.email ?? (await resolvePublicEmail(entry.handle, deps.githubGet)) if (email) { // handle = the username this email was resolved from (used for identity-linking). contacts.push({ diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts index 9bda249c90..86ebf4abb0 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts @@ -13,7 +13,7 @@ import { RepoPolicies, } from '../types' -import { RAW_BASE, fetchText, githubHandleFromUrl, isEmail } from './http' +import { fetchText, githubHandleFromUrl, isEmail } from './http' const log = getServiceChildLogger('security-contacts:security-insights') @@ -213,10 +213,7 @@ export const extractSecurityInsights: Extractor = async (target, deps) => { const fetchedAt = new Date().toISOString() for (const path of PATHS) { - const { text } = await fetchText( - `${RAW_BASE}/${owner}/${name}/HEAD/${path}`, - deps.fetchTimeoutMs, - ) + const { text } = await deps.githubGet(`/repos/${owner}/${name}/contents/${path}`, { raw: true }) if (!text) continue let doc: unknown diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts index 6ff4272254..89d7e7d3ef 100644 --- a/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts @@ -8,7 +8,7 @@ import { RepoPolicies, } from '../types' -import { RAW_BASE, fetchText, githubHandleFromUrl } from './http' +import { githubHandleFromUrl } from './http' const SOURCE = 'security.md' const PATHS = ['SECURITY.md', '.github/SECURITY.md', 'docs/SECURITY.md'] @@ -110,10 +110,7 @@ export const extractSecurityMd: Extractor = async (target, deps) => { const fetchedAt = new Date().toISOString() for (const path of PATHS) { - const { text } = await fetchText( - `${RAW_BASE}/${owner}/${name}/HEAD/${path}`, - deps.fetchTimeoutMs, - ) + const { text } = await deps.githubGet(`/repos/${owner}/${name}/contents/${path}`, { raw: true }) if (text) return parseSecurityMd(text, owner, name, path, fetchedAt) } diff --git a/services/apps/packages_worker/src/security-contacts/githubToken.ts b/services/apps/packages_worker/src/security-contacts/githubToken.ts index 730f65cf8b..7c5cbc38da 100644 --- a/services/apps/packages_worker/src/security-contacts/githubToken.ts +++ b/services/apps/packages_worker/src/security-contacts/githubToken.ts @@ -9,8 +9,49 @@ import { } from '../enricher/githubAppAuth' import { InstallationPool } from '../enricher/installationPool' +import { GithubGetResult } from './types' + const log = getServiceChildLogger('security-contacts:github-token') +const GITHUB_API = 'https://api.github.com' + +// Genuinely-absent / not-determinable → null body (see http.ts for the same set). 422 covers the +// PVR endpoint returning "can't determine" per-repo; it must read as unknown, not a hard failure. +const ABSENT_STATUSES = new Set([404, 410, 422]) + +// App-wide ceiling on concurrent GitHub requests. GitHub's secondary limit rejects bursts of +// >100 concurrent requests from one app; staying under that (across all installations) is the +// single most effective guard against secondary-limit 429s at high repo concurrency. +const MAX_CONCURRENT_GITHUB_REQUESTS = 50 + +// Bound the park/switch/backoff retry loop so a persistently-limited request eventually surfaces +// as a failure (which the pipeline treats as transient and preserves existing data). +const MAX_RATE_LIMIT_RETRIES = 6 + +/** Minimal async semaphore with fair FIFO hand-off, used to cap concurrent GitHub requests. */ +class Semaphore { + private active = 0 + private readonly waiters: Array<() => void> = [] + + constructor(private readonly max: number) {} + + async acquire(): Promise { + if (this.active < this.max) { + this.active++ + return + } + await new Promise((resolve) => this.waiters.push(resolve)) + } + + release(): void { + const next = this.waiters.shift() + if (next) next() + else this.active-- + } +} + +const gate = new Semaphore(MAX_CONCURRENT_GITHUB_REQUESTS) + interface Pool { pool: InstallationPool appConfig: GithubAppConfig @@ -50,25 +91,121 @@ async function ensurePool(): Promise { return initPromise } +const sleep = (ms: number): Promise => new Promise((r) => setTimeout(r, ms)) + +function numOrNull(v: string | null): number | null { + if (v == null) return null + const n = parseInt(v, 10) + return Number.isFinite(n) ? n : null +} + +function resetIso(v: string | null): string | null { + const sec = numOrNull(v) + return sec == null ? null : new Date(sec * 1000).toISOString() +} + +function isRateLimited(status: number, body: string): boolean { + // Primary limit → 403/429 with x-ratelimit-remaining: 0; secondary → 403/429 mentioning it. + return status === 429 || (status === 403 && /rate limit|secondary/i.test(body)) +} + +async function fetchOnce( + url: string, + timeoutMs: number, + headers: Record, +): Promise { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), timeoutMs) + await gate.acquire() + try { + return await fetch(url, { headers, signal: controller.signal }) + } finally { + gate.release() + clearTimeout(timeoutId) + } +} + /** - * Returns an installation token (round-robined across the pool) or null when the - * GitHub App is unavailable, so the authed extractors degrade to unauthenticated calls. + * Rate-limit-safe GitHub API GET. Selects an installation from the pool, sleeps if all are parked, + * feeds response budget headers back so exhausted installations get parked before they 403, and on + * a rate-limit response parks (primary) or waits out Retry-After (secondary, app-wide) then retries + * on another installation. Falls back to a single unauthenticated request when no App is configured. + * + * Returns text on 200; null body for absent resources (404/410/422); throws on other non-200s and + * once the retry budget is exhausted (callers treat throws as transient and preserve existing data). */ -export async function getSecurityContactsToken(): Promise { +export async function githubApiGet( + path: string, + timeoutMs: number, + opts: { raw?: boolean } = {}, +): Promise { + const accept = opts.raw ? 'application/vnd.github.raw' : 'application/vnd.github+json' + const url = `${GITHUB_API}${path}` const resolved = await ensurePool() - if (!resolved) return null - try { - const { installationId } = resolved.pool.select() - return await getInstallationToken( - resolved.appConfig.appId, - resolved.appConfig.privateKeyPem, - installationId, - ) - } catch (err) { - log.warn( - { errMsg: (err as Error).message }, - 'Token mint failed — falling back to unauthenticated', - ) - return null + + if (!resolved) { + const res = await fetchOnce(url, timeoutMs, { Accept: accept }) + if (res.status === 200) return { status: 200, text: await res.text() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, text: null } + throw new Error(`githubApiGet ${path} failed: HTTP ${res.status}`) + } + + const { pool, appConfig } = resolved + + for (let attempt = 0; attempt <= MAX_RATE_LIMIT_RETRIES; attempt++) { + const { installationId, waitMs } = pool.select() + if (waitMs > 0) { + log.warn({ waitMs: Math.round(waitMs / 1000) }, 'All installations parked — waiting') + await sleep(waitMs) + } + + let token: string + try { + token = await getInstallationToken(appConfig.appId, appConfig.privateKeyPem, installationId) + } catch (err) { + // Mint failure (rate-limited or auth) — park this installation and try another. + pool.park(installationId, Date.now() + 60_000) + if (attempt === MAX_RATE_LIMIT_RETRIES) throw err + continue + } + + const res = await fetchOnce(url, timeoutMs, { + Authorization: `bearer ${token}`, + Accept: accept, + }) + + if (res.status === 200 || ABSENT_STATUSES.has(res.status)) { + pool.parkIfBudgetLow( + installationId, + numOrNull(res.headers.get('x-ratelimit-remaining')), + resetIso(res.headers.get('x-ratelimit-reset')), + ) + return res.status === 200 + ? { status: 200, text: await res.text() } + : { status: res.status, text: null } + } + + const body = await res.text().catch(() => '') + if (isRateLimited(res.status, body)) { + const retryAfterSec = numOrNull(res.headers.get('retry-after')) + if (retryAfterSec) { + // Secondary limits are app-wide, so switching installations won't help — wait it out. + log.warn({ retryAfterSec }, 'GitHub secondary rate limit — backing off') + await sleep(retryAfterSec * 1000 + 1_000) + } else { + // Primary limit — park this installation until its reset and switch to another. + const resetSec = numOrNull(res.headers.get('x-ratelimit-reset')) + pool.park(installationId, resetSec ? resetSec * 1000 + 5_000 : Date.now() + 60_000) + } + if (attempt === MAX_RATE_LIMIT_RETRIES) { + throw new Error(`githubApiGet ${path} rate limited after ${attempt + 1} attempts`) + } + continue + } + + throw new Error(`githubApiGet ${path} failed: HTTP ${res.status}`) } + + // Unreachable: the loop either returns or throws on the final attempt. + throw new Error(`githubApiGet ${path} exhausted retries`) } diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts index e8a0ecb64f..b01d84a381 100644 --- a/services/apps/packages_worker/src/security-contacts/processBatch.ts +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -10,7 +10,7 @@ import { extractSecurityContactsFile } from './extractors/securityContactsFile' import { extractSecurityInsights } from './extractors/securityInsights' import { extractSecurityMd } from './extractors/securityMd' import { extractSecurityTxt } from './extractors/securityTxt' -import { getSecurityContactsToken } from './githubToken' +import { githubApiGet } from './githubToken' import { reconcile } from './reconcile' import { Extractor, @@ -37,7 +37,9 @@ const DAILY_INTERVAL_HOURS = 20 // repos never evaluated or with no contacts yet const WEEKLY_INTERVAL_HOURS = 156 // already-enriched repos (have contacts) // Tuned for throughput within the platform ceilings: -// - CONCURRENCY: parallel repos; safe against GitHub REST secondary limits given the token pool. +// - CONCURRENCY: parallel repos. GitHub calls go through the authed Contents API via a +// rate-limit-aware pool (per-installation budget parking + app-wide concurrency gate + +// Retry-After backoff in githubToken), so high repo concurrency won't trip GitHub limits. // - FETCH_TIMEOUT_MS: generous enough for slow registries (Maven metadata/POM) without hanging slots. // - BATCH_SIZE: bounded by the 30-min activity timeout (worst case: an all-cargo batch throttled // to crates.io's 1 req/s finishes ~8 min). @@ -159,7 +161,7 @@ export async function processBatch(qx: QueryExecutor, config: Config): Promise githubApiGet(path, FETCH_TIMEOUT_MS, opts), } const targets = batch.map(toTarget) diff --git a/services/apps/packages_worker/src/security-contacts/types.ts b/services/apps/packages_worker/src/security-contacts/types.ts index 5b7454fff6..b3cd480732 100644 --- a/services/apps/packages_worker/src/security-contacts/types.ts +++ b/services/apps/packages_worker/src/security-contacts/types.ts @@ -63,12 +63,22 @@ export interface RepoTarget { packages: RepoPackage[] } +/** Result of a GitHub API GET routed through the rate-limit-aware pool. */ +export interface GithubGetResult { + status: number + /** Response body (raw file text or JSON string); null for absent resources (404/410/422). */ + text: string | null +} + export interface ExtractorDeps { fetchTimeoutMs: number /** Sent on registry calls; required (crates.io rejects requests without an identifying UA). */ userAgent: string - /** Mints a GitHub installation token (null if unavailable); authed extractors fall back to unauth. */ - getToken?: () => Promise + /** + * Pool-aware, rate-limit-safe GitHub API GET. Handles installation selection, budget parking, + * and 429/secondary-limit backoff internally. `raw` selects the raw media type (file contents). + */ + githubGet: (path: string, opts?: { raw?: boolean }) => Promise } export type Extractor = (target: RepoTarget, deps: ExtractorDeps) => Promise