Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit d6956e7

Browse files
bobmatnyc and claude committed
fix: populate github_username from noreply emails and config manual_mappings (#44)
Three gaps caused github_username to be NULL on most developer_identities rows, so resolve_by_github_username() could not bridge ticketing/PR-review actors to canonical IDs and ticketing_score was always 0.0:

1. noreply-on-existing-identity: when a noreply email matched an existing identity, the early-return branch in resolve_developer() never wrote the github_username column. Added _set_github_username_if_missing() and call it after both the username_alias and username_identity match branches.

2. bare noreply form: the guard required "+" in the local-part, so username@users.noreply.github.com (no numeric prefix) was ignored. Extraction now handles both forms.

3. manual_mappings had no github_username field. Added support in DeveloperAlias, AliasesManager (load/save/merge/to_manual_mappings), and _apply_manual_mappings() (both DB and in-memory paths).

Tests cover all three gaps. No DB migration needed — the column already exists as nullable.

Co-Authored-By: Claude Sonnet 4.6 <[email protected]>
1 parent 096de5f commit d6956e7

4 files changed

Lines changed: 204 additions & 12 deletions

File tree

src/gitflow_analytics/config/aliases.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class DeveloperAlias:
2626
primary_email: str
2727
aliases: list[str] = field(default_factory=list)
2828
name: Optional[str] = None
29+
github_username: str | None = None
2930
confidence: float = 1.0
3031
reasoning: str = ""
3132

@@ -43,6 +44,9 @@ def to_dict(self) -> dict[str, Any]:
4344
if self.name:
4445
result["name"] = self.name
4546

47+
if self.github_username:
48+
result["github_username"] = self.github_username
49+
4650
# Only include confidence and reasoning for LLM-generated aliases
4751
if self.confidence < 1.0:
4852
result["confidence"] = round(self.confidence, 2)
@@ -122,6 +126,7 @@ def load(self) -> None:
122126
primary_email=primary_email,
123127
aliases=alias_data.get("aliases", []),
124128
name=alias_data.get("name"),
129+
github_username=alias_data.get("github_username"),
125130
confidence=alias_data.get("confidence", 1.0),
126131
reasoning=alias_data.get("reasoning", ""),
127132
)
@@ -278,6 +283,9 @@ def to_manual_mappings(self) -> list[dict[str, Any]]:
278283
if alias.name:
279284
mapping["name"] = alias.name
280285

286+
if alias.github_username:
287+
mapping["github_username"] = alias.github_username
288+
281289
mapping["aliases"] = alias.aliases
282290

283291
# Include confidence and reasoning for LLM-generated mappings
@@ -312,6 +320,7 @@ def merge_from_mappings(self, mappings: list[dict[str, Any]]) -> None:
312320
primary_email=primary_email,
313321
aliases=mapping.get("aliases", []),
314322
name=mapping.get("name"),
323+
github_username=mapping.get("github_username"),
315324
confidence=mapping.get("confidence", 1.0),
316325
reasoning=mapping.get("reasoning", ""),
317326
)

src/gitflow_analytics/core/identity.py

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,12 @@ def _apply_manual_mappings(self, manual_mappings: list[dict[str, Any]]) -> None:
222222
)
223223
aliases = mapping.get("aliases", [])
224224
preferred_name = mapping.get("name") # Optional display name
225+
# Bug #44: Allow operators to supply github_username manually
226+
# for developers who never commit via the GitHub web UI (so the
227+
# noreply heuristic never fires for them).
228+
mapping_github_username: str | None = mapping.get("github_username")
229+
if mapping_github_username:
230+
mapping_github_username = mapping_github_username.strip().lower() or None
225231

226232
# Bug #29 fix: canonical_email is required but aliases can be empty
227233
# when the mapping only intends to set/rename the primary_name.
@@ -255,6 +261,7 @@ def _apply_manual_mappings(self, manual_mappings: list[dict[str, Any]]) -> None:
255261
canonical_id=canonical_id,
256262
primary_name=preferred_name or canonical_email.split("@")[0],
257263
primary_email=canonical_email,
264+
github_username=mapping_github_username,
258265
first_seen=datetime.now(timezone.utc),
259266
last_seen=datetime.now(timezone.utc),
260267
total_commits=0,
@@ -273,6 +280,12 @@ def _apply_manual_mappings(self, manual_mappings: list[dict[str, Any]]) -> None:
273280
)
274281
canonical_identity.primary_name = preferred_name
275282

283+
# Bug #44: Backfill github_username on an already-existing identity
284+
# if the manual mapping provides one. This is deferred until after
285+
# any merge() so it always applies to the surviving canonical row.
286+
if mapping_github_username and not canonical_identity.github_username:
287+
canonical_identity.github_username = mapping_github_username
288+
276289
# Process each alias
277290
for alias_email in aliases:
278291
alias_email = alias_email.lower().strip()
@@ -407,11 +420,14 @@ def resolve_developer(
407420
return canonical_id
408421

409422
# Fix 2: Detect GitHub noreply emails and resolve via username.
410-
# Pattern: {numeric_id}+{username}@users.noreply.github.com
423+
# Supports both forms:
424+
# - {numeric_id}+{username}@users.noreply.github.com (with ID prefix)
425+
# - {username}@users.noreply.github.com (bare form)
411426
# Extract the username portion and try to match it against existing aliases.
412-
if email.endswith("@users.noreply.github.com") and "+" in email:
427+
if email.endswith("@users.noreply.github.com"):
413428
local_part = email.split("@")[0]
414-
github_username = local_part.split("+", 1)[1] # part after the numeric ID
429+
# Handle both forms: "id+username" and bare "username"
430+
github_username = local_part.split("+", 1)[1] if "+" in local_part else local_part
415431
# Look for an existing alias or identity with this username as email/alias
416432
with self.get_session() as session:
417433
# Search aliases where email equals the plain username (common pattern
@@ -426,6 +442,13 @@ def resolve_developer(
426442
# Register the noreply address under the same identity so future
427443
# lookups hit the cache without another DB round-trip.
428444
self._add_alias(username_alias.canonical_id, name, email)
445+
# Bug #44: Backfill github_username on existing identity if missing.
446+
# Without this, identities created from corporate emails before a
447+
# noreply commit ever lands never get their github_username set,
448+
# breaking resolve_by_github_username() for PR review/ticketing.
449+
self._set_github_username_if_missing(
450+
username_alias.canonical_id, github_username
451+
)
429452
self._cache[cache_key] = username_alias.canonical_id
430453
logger.debug(
431454
f"Matched GitHub noreply email {email!r} to username "
@@ -441,6 +464,10 @@ def resolve_developer(
441464
)
442465
if username_identity:
443466
self._add_alias(username_identity.canonical_id, name, email)
467+
# Bug #44: Backfill github_username on existing identity if missing.
468+
self._set_github_username_if_missing(
469+
username_identity.canonical_id, github_username
470+
)
444471
self._cache[cache_key] = username_identity.canonical_id
445472
logger.debug(
446473
f"Matched GitHub noreply email {email!r} to primary identity "
@@ -615,6 +642,51 @@ def _create_identity(self, name: str, email: str, github_username: Optional[str]
615642

616643
return canonical_id
617644

645+
def _set_github_username_if_missing(self, canonical_id: str, github_username: str) -> None:
    """Populate ``github_username`` on an existing identity when it is empty.

    WHY (#44): an identity that was first created from a corporate email
    has no ``github_username``. When a later noreply email (or a manual
    mapping) reveals the username, it has to be written back onto the
    existing identity; otherwise ``resolve_by_github_username`` cannot
    bridge ticketing/PR-review actors to the canonical ID.

    The username is normalised (stripped and lower-cased) exactly once, so
    the database path and the in-memory fallback path store the same value.
    A value that is already set is never overwritten.

    Args:
        canonical_id: Canonical ID of the identity to update.
        github_username: Username to store. Falsy or whitespace-only
            values are ignored.
    """
    # Normalise before branching: previously the in-memory path only
    # lower-cased, so it could store padded or whitespace-only usernames
    # that the database path would have stripped or rejected.
    username_normalized = github_username.strip().lower() if github_username else ""
    if not username_normalized:
        return

    if not self._database_available:
        # Fallback path: identities are plain dicts keyed by canonical ID.
        identity = self._in_memory_identities.get(canonical_id)
        if identity and not identity.get("github_username"):
            identity["github_username"] = username_normalized
        return

    with self.get_session() as session:
        identity = (
            session.query(DeveloperIdentity)
            .filter(DeveloperIdentity.canonical_id == canonical_id)
            .first()
        )
        if identity and not identity.github_username:
            identity.github_username = username_normalized
            logger.debug(
                "Backfilled github_username=%r on canonical_id=%s",
                username_normalized,
                canonical_id,
            )
            # Refresh a cached identity record, if one exists, so downstream
            # lookups see the new value without waiting for a full reload.
            # NOTE(review): the resolve_developer() call sites store
            # canonical-ID strings in self._cache under a lookup key, so
            # this dict branch may never fire -- confirm before relying on it.
            cached = self._cache.get(canonical_id)
            if isinstance(cached, dict):
                cached["github_username"] = username_normalized
689+
618690
def _add_alias(self, canonical_id: str, name: str, email: str):
619691
"""Add alias for existing developer."""
620692
with self.get_session() as session:

src/gitflow_analytics/core/identity_stats.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,11 @@ def _apply_manual_mappings_to_memory(self) -> None:
267267
)
268268
aliases = mapping.get("aliases", [])
269269
preferred_name = mapping.get("name") # Optional display name
270+
# Bug #44: Honour github_username from manual mappings so
271+
# resolve_by_github_username() works in the in-memory fallback too.
272+
mapping_github_username = mapping.get("github_username")
273+
if mapping_github_username:
274+
mapping_github_username = mapping_github_username.strip().lower() or None
270275

271276
if not canonical_email or not aliases:
272277
continue
@@ -276,7 +281,7 @@ def _apply_manual_mappings_to_memory(self) -> None:
276281
self._in_memory_identities[canonical_id] = {
277282
"primary_name": preferred_name or canonical_email.split("@")[0],
278283
"primary_email": canonical_email,
279-
"github_username": None,
284+
"github_username": mapping_github_username,
280285
"total_commits": 0,
281286
"total_story_points": 0,
282287
}

tests/core/test_identity.py

Lines changed: 114 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -407,20 +407,25 @@ def test_noreply_email_matches_primary_identity(self, temp_dir):
407407
"is the extracted username"
408408
)
409409

410-
def test_noreply_email_without_plus_creates_new_identity(self, temp_dir):
411-
"""Noreply-style address without the numeric+username format is treated as a new identity."""
410+
def test_noreply_email_bare_form_matches_username_identity(self, temp_dir):
411+
"""Bug #44 Gap 2: bare ``[email protected]`` resolves to the username identity.
412+
413+
Prior to the fix, the guard required a ``+`` in the local-part, so the
414+
bare form silently fell through to "create new identity". After the fix
415+
the bare form is treated the same as the numeric+username form.
416+
"""
412417
db_path = temp_dir / "identities.db"
413418
resolver = DeveloperIdentityResolver(db_path)
414419

415-
resolver.resolve_developer("Alice", "alice")
420+
id_user = resolver.resolve_developer("Alice", "alice")
416421

417-
# An address that ends with noreply domain but lacks "+" should NOT trigger the
418-
# username-extraction path and should therefore create a distinct identity.
422+
# Bare noreply form must now match the existing username identity.
419423
id_plain = resolver.resolve_developer("Alice", "[email protected]")
420-
id_user = resolver.resolve_developer("Alice", "alice")
421424

422-
# These are separate identities; the noreply path only fires when "+" is present.
423-
assert id_plain != id_user
425+
assert id_plain == id_user, (
426+
"Bare '[email protected]' should resolve to the same identity "
427+
"whose primary_email is the extracted username (Bug #44 Gap 2)."
428+
)
424429

425430
def test_noreply_match_is_cached(self, temp_dir):
426431
"""Resolving the same noreply address twice uses the cache on the second call."""
@@ -873,3 +878,104 @@ def test_co_author_counts_not_accumulated(self, temp_dir):
873878
assert (
874879
helper["total_commits"] == 1
875880
), f"Co-author Helper should have 1 commit, got {helper['total_commits']}"
881+
882+
883+
class TestGithubUsernameBackfill:
884+
"""Bug #44: ensure github_username gets populated on existing identities.
885+
886+
Without these backfills, resolve_by_github_username() can't bridge ticketing
887+
actors to canonical identities, and ticketing_score is always 0.0 for
888+
developers whose identity was created before a noreply commit landed.
889+
"""
890+
891+
def test_resolve_developer_sets_github_username_on_existing_identity(self, temp_dir):
892+
"""Bug #44: noreply commit backfills github_username on a pre-existing identity.
893+
894+
Scenario: identity was created with the corporate email as primary and the
895+
GitHub username listed as an alias (typical ``manual_mappings`` output),
896+
but ``github_username`` is still NULL because no noreply commit has been
897+
seen yet. A later noreply commit must backfill the column on the
898+
existing identity so ticketing lookups can bridge to the canonical ID.
899+
"""
900+
db_path = temp_dir / "identities.db"
901+
902+
# Set up an existing identity with the username as a known alias but
903+
# no github_username column populated — mimics the state of every
904+
# developer in the production DB before the fix.
905+
manual_mappings = [
906+
{
907+
"primary_email": "[email protected]",
908+
"name": "Alice Smith",
909+
"aliases": ["alice-gh"], # note: no github_username here
910+
}
911+
]
912+
resolver = DeveloperIdentityResolver(db_path, manual_mappings=manual_mappings)
913+
914+
canonical_id = resolver.resolve_developer("Alice Smith", "[email protected]")
915+
916+
# Sanity: github_username is NOT populated yet.
917+
assert (
918+
resolver.resolve_by_github_username("alice-gh") is None
919+
), "Pre-condition: no github_username set before the noreply commit lands"
920+
921+
# Now a noreply commit with the same person's username arrives.
922+
resolved = resolver.resolve_developer(
923+
"Alice Smith",
924+
925+
)
926+
assert resolved == canonical_id
927+
928+
# The key assertion: resolve_by_github_username now finds the identity.
929+
found = resolver.resolve_by_github_username("alice-gh")
930+
assert (
931+
found == canonical_id
932+
), "github_username should have been backfilled on the existing identity"
933+
934+
def test_resolve_developer_bare_noreply_sets_github_username(self, temp_dir):
935+
"""Bug #44 Gap 2: bare [email protected] form is supported."""
936+
db_path = temp_dir / "identities.db"
937+
resolver = DeveloperIdentityResolver(db_path)
938+
939+
# Pre-create an identity whose primary_email is the bare username,
940+
# so the noreply path's "username_identity" branch fires.
941+
canonical_id = resolver.resolve_developer("Octocat", "octocat")
942+
943+
# Bare noreply form (no numeric ID prefix).
944+
resolved = resolver.resolve_developer(
945+
"Octocat",
946+
947+
)
948+
assert resolved == canonical_id
949+
950+
# github_username must be populated so ticketing lookups succeed.
951+
found = resolver.resolve_by_github_username("octocat")
952+
assert found == canonical_id, (
953+
"Bare '[email protected]' form must backfill github_username "
954+
"so resolve_by_github_username() can bridge ticketing actors."
955+
)
956+
957+
def test_manual_mapping_populates_github_username(self, temp_dir):
958+
"""Bug #44 Gap 3: manual_mappings can supply github_username for existing identities."""
959+
db_path = temp_dir / "identities.db"
960+
961+
# Step 1: create an identity via a commit (no github_username).
962+
resolver_first = DeveloperIdentityResolver(db_path)
963+
canonical_id = resolver_first.resolve_developer("Bob Builder", "[email protected]")
964+
assert resolver_first.resolve_by_github_username("bob-builder") is None
965+
966+
# Step 2: reload with a manual mapping that supplies github_username.
967+
manual_mappings = [
968+
{
969+
"primary_email": "[email protected]",
970+
"name": "Bob Builder",
971+
"github_username": "bob-builder",
972+
"aliases": [],
973+
}
974+
]
975+
resolver = DeveloperIdentityResolver(db_path, manual_mappings=manual_mappings)
976+
977+
found = resolver.resolve_by_github_username("bob-builder")
978+
assert found == canonical_id, (
979+
"Manual mapping with github_username must backfill the field on an "
980+
"already-existing identity so ticketing bridges to the canonical ID."
981+
)

0 commit comments

Comments
 (0)