Skip to content
Merged
6 changes: 5 additions & 1 deletion services/apps/git_integration/src/crowdgit/database/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,11 +405,15 @@ async def update_maintainer_run(repo_id: str, maintainer_file: str):


async def get_maintainers_for_repo(repo_id: str):
# Read filter mirrors the write path: no platform/type/verified narrowing
# (otherwise email-linked rows like platform='git' are hidden from the diff).
# endDate IS NULL keeps only active rows so reappearing maintainers hit the
# "new" branch and get reactivated by upsert_maintainer's ON CONFLICT clause.
maintainers_sql_query = """
SELECT mi.role, mi."originalRole", mi."repoUrl", mi."repoId", mi."identityId", mem.value as github_username
FROM "maintainersInternal" mi
JOIN "memberIdentities" mem ON mi."identityId" = mem.id
WHERE mi."repoId" = $1 AND mem.platform = 'github' AND mem.type = 'username' and mem.verified = True AND mem."deletedAt" is null
WHERE mi."repoId" = $1 AND mi."endDate" IS NULL AND mem."deletedAt" is null
Comment thread
joanagmaia marked this conversation as resolved.
Outdated
Comment thread
joanagmaia marked this conversation as resolved.
Outdated
"""
return await query(
maintainers_sql_query,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,43 +153,58 @@ def make_role(self, title: str):
)
return slugify(title)

async def _resolve_identity(
self, github_username: str | None, email: str | None
) -> str | None:
# Fall back to email when github_username is missing/"unknown" — the AI
# extractor emits "unknown" for ~4k entries on the linux MAINTAINERS file.
if github_username and github_username != "unknown":
identity_id = await find_github_identity(github_username)
if identity_id:
return identity_id
if email and email != "unknown":
return await find_maintainer_identity_by_email(email)
return None

async def _resolve_maintainers(
self, maintainers: list[MaintainerInfoItem]
) -> list[tuple[MaintainerInfoItem, str]]:
# Shared by first-run and incremental paths so lookup semantics stay identical.
semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_CHUNKS)

async def resolve(m: MaintainerInfoItem) -> tuple[MaintainerInfoItem, str | None]:
async with semaphore:
identity_id = await self._resolve_identity(m.github_username, m.email)
return m, identity_id

results = await asyncio.gather(*[resolve(m) for m in maintainers])

resolved: list[tuple[MaintainerInfoItem, str]] = []
for m, identity_id in results:
if identity_id is None:
self.logger.warning(f"Identity not found for maintainer: {m}")
continue
Comment thread
joanagmaia marked this conversation as resolved.
resolved.append((m, identity_id))
return resolved

async def insert_new_maintainers(
self, repo_url: str, repo_id: str, maintainers: list[MaintainerInfoItem]
):
async def process_maintainer(maintainer: MaintainerInfoItem):
self.logger.info(f"Processing maintainer: {maintainer.github_username}")
role = maintainer.normalized_title
original_role = self.make_role(maintainer.title)
# Find the identity in the database
github_username = maintainer.github_username
email = maintainer.email

if github_username == "unknown" and email == "unknown":
self.logger.warning("username & email with value 'unknown' aborting")
return
identity_id = (
await find_github_identity(github_username)
if github_username != "unknown"
else await find_maintainer_identity_by_email(email)
)
self.logger.debug(
f"Found identity_id for {github_username}: {identity_id} (type: {type(identity_id)})"
)
if identity_id:
resolved = await self._resolve_maintainers(maintainers)
# Concurrent upserts: large MAINTAINERS files carry thousands of entries.
semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_CHUNKS)

async def upsert(maintainer: MaintainerInfoItem, identity_id: str) -> None:
async with semaphore:
role = maintainer.normalized_title
original_role = self.make_role(maintainer.title)
await upsert_maintainer(repo_id, identity_id, repo_url, role, original_role)
self.logger.info(
f"Successfully upserted maintainer {github_username} with identity_id {identity_id}"
f"Successfully upserted maintainer {maintainer.github_username} "
f"with identity_id {identity_id}"
)
else:
self.logger.warning(f"Identity not found for GitHub user: {maintainer}")

semaphore = asyncio.Semaphore(3)

async def process_with_semaphore(maintainer: MaintainerInfoItem):
async with semaphore:
await process_maintainer(maintainer)

await asyncio.gather(*[process_with_semaphore(maintainer) for maintainer in maintainers])
await asyncio.gather(*[upsert(m, identity_id) for m, identity_id in resolved])

async def compare_and_update_maintainers(
self,
Expand All @@ -200,63 +215,57 @@ async def compare_and_update_maintainers(
):
self.logger.info(f"Comparing and updating maintainers for repo: {repo_id}")
current_maintainers = await get_maintainers_for_repo(repo_id)
current_maintainers_dict = {m["github_username"]: m for m in current_maintainers}
new_maintainers_dict = {m.github_username: m for m in maintainers}

for github_username, maintainer in new_maintainers_dict.items():
role = maintainer.normalized_title
# Key by (identityId, role) — keying by github_username collapsed every
# "unknown" extraction into one slot, silently dropping most email-only
# maintainers (~4k of 4216 entries on the linux MAINTAINERS file).
current_by_key: dict[tuple[str, str], dict] = {
(m["identityId"], m["role"]): m for m in current_maintainers
}

# Resolve before keying so the comparison is identity-based: the same
# person may extract with different github_username values across runs.
resolved = await self._resolve_maintainers(maintainers)
new_by_key: dict[tuple[str, str], MaintainerInfoItem] = {
(identity_id, m.normalized_title): m for m, identity_id in resolved
}

for (identity_id, role), maintainer in new_by_key.items():
if (identity_id, role) in current_by_key:
continue
original_role = self.make_role(maintainer.title)
if github_username == "unknown" and maintainer.email in ("unknown", None):
await upsert_maintainer(
repo_id, identity_id, repo_url, role, original_role, start_date=change_date
)
self.logger.info(
f"Inserted new maintainer {maintainer.github_username} "
f"with identity_id {identity_id} role {role}"
)

# Safety guard: a maintainer whose identity resolution fails (transient DB
# issue, stale extraction value) is absent from new_by_key but may still be
# in the source. Don't end-date if their identifying value is still present.
mentioned_values = set()
for m in maintainers:
for v in (m.github_username, m.email):
if v and v != "unknown":
mentioned_values.add(v.lower())

for (identity_id, role), current in current_by_key.items():
if (identity_id, role) in new_by_key:
continue
current_value = (current.get("github_username") or "").lower()
if current_value and current_value in mentioned_values:
self.logger.warning(
f"Skipping unknown github_username & email with title {maintainer.title}"
f"Maintainer with identity {identity_id} role {role} could not be "
f"re-resolved but is still mentioned in the source; skipping end-date"
)
continue
Comment thread
joanagmaia marked this conversation as resolved.
Outdated
elif github_username not in current_maintainers_dict:
# New maintainer
identity_id = (
await find_github_identity(github_username)
if github_username != "unknown"
else await find_maintainer_identity_by_email(maintainer.email)
)
self.logger.info(f"Found new maintainer {github_username} to be inserted")
if identity_id:
await upsert_maintainer(
repo_id, identity_id, repo_url, role, original_role, start_date=change_date
)
self.logger.info(
f"Successfully inserted new maintainer {github_username} with identity_id {identity_id}"
)
else:
# will happen for new users if their identity isn't created yet but should be fixed on the next iteration
self.logger.warning(f"Identity not found for username: {github_username}")
else:
# Existing maintainer
current_maintainer = current_maintainers_dict[github_username]
if current_maintainer["role"] != role:
# Role has changed: we update maintainer
self.logger.info(
f"Role changed from {current_maintainer['role']} to {role} for maintainer {current_maintainer['identityId']}"
)
await upsert_maintainer(
repo_id,
current_maintainer["identityId"],
repo_url,
role,
original_role,
change_date,
)

for github_username, current_maintainer in current_maintainers_dict.items():
if github_username not in new_maintainers_dict:
self.logger.info(
f"Maintainer {github_username} with identity {current_maintainer['identityId']} no longer exists, updating its endDate..."
)
await set_maintainer_end_date(
repo_id,
current_maintainer["identityId"],
current_maintainer["role"],
change_date,
)
self.logger.info(
f"Maintainer with identity {identity_id} role {role} no longer exists, "
f"updating its endDate..."
)
await set_maintainer_end_date(repo_id, identity_id, role, change_date)

async def save_maintainers(
self,
Expand Down
Loading