chore: switch from sonnet to haiku in maintainers processing [CM-1049] (#3915)

mbani01 · web-flow · commit 94324f28658d · 2026-03-12T21:11:26.000Z
Signed-off-by: Mouad BANI <mouad-mb@outlook.com>
diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/bedrock.py b/services/apps/git_integration/src/crowdgit/services/maintainer/bedrock.py
@@ -78,7 +78,7 @@ async def invoke_bedrock(
             }
         )
 
-        modelId = "us.anthropic.claude-sonnet-4-20250514-v1:0"
+        modelId = "us.anthropic.claude-haiku-4-5-20251001-v1:0"
         accept = "application/json"
         contentType = "application/json"
 
@@ -107,14 +107,20 @@ async def invoke_bedrock(
             response_body = json.loads(body_bytes.decode("utf-8"))
             raw_text = response_body["content"][0]["text"].replace('"""', "").strip()
 
-            # Expect pure JSON - no markdown handling
+            # Strip markdown code fences if present (Haiku sometimes ignores the system prompt)
+            if raw_text.startswith("```"):
+                raw_text = raw_text.split("\n", 1)[-1]
+                if raw_text.endswith("```"):
+                    raw_text = raw_text.rsplit("```", 1)[0]
+                raw_text = raw_text.strip()
+
             output = json.loads(raw_text)
 
-            # Calculate cost
+            # Calculate cost (Claude Haiku 4.5 on AWS Bedrock: $1.00/$5.00 per 1M tokens)
             input_tokens = response_body["usage"]["input_tokens"]
             output_tokens = response_body["usage"]["output_tokens"]
-            input_cost = (input_tokens / 1000) * 0.003
-            output_cost = (output_tokens / 1000) * 0.015
+            input_cost = (input_tokens / 1_000_000) * 1.00
+            output_cost = (output_tokens / 1_000_000) * 5.00
             total_cost = input_cost + output_cost
 
             # Validate output with the provided model if it exists
diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py
@@ -218,12 +218,16 @@ def get_extraction_prompt(self, filename: str, content_to_analyze: str) -> str:
             - The person's role, with a maximum of two words (e.g., "Lead Reviewer", "Core Maintainer").
             - The role must be about project governance, not a generic job title like "Software Engineer".
             - Do not include filler words like "repository", "project", or "active".
+            - **If the content does not assign an explicit individual role to each person** (e.g. a flat list with no per-person labels), set the title to the capitalized form of `normalized_title` (i.e. "Maintainer" or "Contributor"). Every person in the same response MUST receive the same derived title.
         4.  `normalized_title`:
-            - Must be exactly "maintainer" or "contributor". If the role is ambiguous, use the `<filename>` as the primary hint. For example, a file named `MAINTAINERS` or `CODEOWNERS` implies "maintainer", while `CONTRIBUTORS` implies "contributor".
+            - Must be exactly "maintainer" or "contributor". If the role is ambiguous, use the `{filename}` as the primary hint:
+              - Filenames containing `MAINTAINERS`, `CODEOWNERS`, `OWNERS`, or `REVIEWERS` → "maintainer"
+              - All other filenames (AUTHORS, CONTRIBUTORS, CREDITS, COMMITTERS, etc.) → "contributor"
         5.  `email`:
             - Extract the person's email address from the content. Look for patterns like `FullName <email@domain>`, `email@domain`, or email addresses in various formats.
             - The email must be a valid email address format (containing @ and a domain).
             - If no valid email can be found for the individual, use the string "unknown".
+            - **You MUST include every person found in the content regardless of whether their email is known. Never omit a person because their email is missing.**
 
         ---
         Filename: {filename}