From 1dff596f01f72697255c3c3094c8b0ac9fc05360 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Tue, 3 Feb 2026 15:43:00 -0800 Subject: [PATCH 01/21] Added Similarity scorers. --- pyproject.toml | 4 + src/google/adk/scope/utils/similarity.py | 116 ++++++++++++++++++++ test/adk/scope/utils/test_similarity.py | 130 +++++++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 src/google/adk/scope/utils/similarity.py create mode 100644 test/adk/scope/utils/test_similarity.py diff --git a/pyproject.toml b/pyproject.toml index 8063fb5..a18b27d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,9 @@ dependencies = [ "pytest", "pytest-cov", "pyyaml", + "scipy", + "numpy", + "jellyfish", ] @@ -30,6 +33,7 @@ testpaths = [ "test" ] + [tool.ruff] exclude = ["features_pb2.py"] line-length = 80 diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py new file mode 100644 index 0000000..37df549 --- /dev/null +++ b/src/google/adk/scope/utils/similarity.py @@ -0,0 +1,116 @@ +"""Computes a similarity score between two ADK Features.""" + +"""Computes a similarity score between two ADK Features.""" + +from typing import Optional, Tuple + +from jellyfish import jaro_winkler_similarity +import numpy as np +from scipy.optimize import linear_sum_assignment + +from google.adk.scope import features_pb2 as features_pb + +# Default weights for the similarity calculation. +DEFAULT_SIMILARITY_WEIGHTS = { + 'name': 0.30, + 'member_of': 0.25, + 'namespace': 0.15, + 'parameters': 0.20, + 'return_type': 0.10, +} + +# If the preliminary score is below this, we skip expensive calculations. 
+EARLY_EXIT_THRESHOLD = 0.4 + +class SimilarityScorer: + """Calculates a similarity score between two features.""" + + def __init__(self, weights: Optional[dict[str, float]] = None): + self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS + + def _calculate_param_similarity( + self, param1: features_pb.Param, param2: features_pb.Param + ) -> float: + """Calculates the similarity score between two individual parameters.""" + s_p_name = jaro_winkler_similarity( + param1.normalized_name, param2.normalized_name + ) + s_p_type = ( + 1.0 + if param1.normalized_types == param2.normalized_types + else 0.0 + ) + s_p_opt = 1.0 if param1.is_optional == param2.is_optional else 0.0 + + # Weights for parameter components + return (0.5 * s_p_name) + (0.4 * s_p_type) + (0.1 * s_p_opt) + + def _calculate_parameters_score( + self, params1: list[features_pb.Param], params2: list[features_pb.Param] + ) -> float: + """Calculates the aggregated similarity score for two lists of parameters.""" + if not params1 and not params2: + return 1.0 + if not params1 or not params2: + return 0.0 + + similarity_matrix = np.zeros((len(params1), len(params2))) + for i, p1 in enumerate(params1): + for j, p2 in enumerate(params2): + similarity_matrix[i, j] = self._calculate_param_similarity(p1, p2) + + row_ind, col_ind = linear_sum_assignment(similarity_matrix, maximize=True) + total_match_score = similarity_matrix[row_ind, col_ind].sum() + total_params = len(params1) + len(params2) + + if total_params == 0: + return 1.0 + + return (2 * total_match_score) / total_params + + def _calculate_return_type_score( + self, f1: features_pb.Feature, f2: features_pb.Feature + ) -> float: + """Calculates the similarity score for the return types.""" + s_type_match = ( + 1.0 + if f1.normalized_return_types == f2.normalized_return_types + else 0.0 + ) + s_async_match = 1.0 if getattr(f1, 'async') == getattr(f2, 'async') else 0.0 + return (0.7 * s_type_match) + (0.3 * s_async_match) + + def score( + self, feature1: 
features_pb.Feature, feature2: features_pb.Feature + ) -> Tuple[bool, float]: + """Computes the overall similarity score between two features.""" + scores = { + 'name': jaro_winkler_similarity( + feature1.normalized_name, feature2.normalized_name + ), + 'member_of': jaro_winkler_similarity( + feature1.normalized_member_of, feature2.normalized_member_of + ), + 'namespace': jaro_winkler_similarity( + feature1.normalized_namespace, feature2.normalized_namespace + ), + } + + preliminary_score = ( + scores['name'] * self.weights['name'] + + scores['member_of'] * self.weights['member_of'] + + scores['namespace'] * self.weights['namespace'] + ) + + if preliminary_score < EARLY_EXIT_THRESHOLD: + return preliminary_score, False + + scores['parameters'] = self._calculate_parameters_score( + feature1.parameters, feature2.parameters + ) + scores['return_type'] = self._calculate_return_type_score(feature1, feature2) + + final_score = sum( + scores[key] * self.weights[key] for key in self.weights + ) + return final_score, True diff --git a/test/adk/scope/utils/test_similarity.py b/test/adk/scope/utils/test_similarity.py new file mode 100644 index 0000000..2164e9f --- /dev/null +++ b/test/adk/scope/utils/test_similarity.py @@ -0,0 +1,130 @@ +"""Unit tests for the SimilarityScorer class.""" + +import unittest + +from google.adk.scope import features_pb2 as features_pb +from google.adk.scope.utils.similarity import SimilarityScorer + + +class TestSimilarityScorer(unittest.TestCase): + """Test suite for the SimilarityScorer.""" + + def setUp(self): + """Set up a default scorer and a sample feature for the tests.""" + self.scorer = SimilarityScorer() + feature_args = { + 'normalized_name': 'my_func', + 'normalized_member_of': 'my_class', + 'normalized_namespace': 'my_module', + 'parameters': [ + features_pb.Param(normalized_name='p1', normalized_types=[features_pb.ParamType.STRING]), + features_pb.Param(normalized_name='p2', normalized_types=[features_pb.ParamType.NUMBER], 
is_optional=True), + ], + 'normalized_return_types': ['STRING'], + 'async': True + } + self.feature1 = features_pb.Feature(**feature_args) + + def test_initialization(self): + """Test that the scorer initializes with default and custom weights.""" + self.assertIsNotNone(self.scorer.weights) + custom_weights = {'name': 1.0, 'member_of': 0.0, 'namespace': 0.0, 'parameters': 0.0, 'return_type': 0.0} + custom_scorer = SimilarityScorer(weights=custom_weights) + self.assertEqual(custom_scorer.weights, custom_weights) + + def test_identical_features(self): + """Test that identical features yield a score of 1.0.""" + score, full_calc = self.scorer.score(self.feature1, self.feature1) + self.assertTrue(full_calc) + self.assertAlmostEqual(score, 1.0) + + def test_completely_dissimilar_features(self): + """Test that dissimilar features trigger an early exit.""" + feature2 = features_pb.Feature( + normalized_name='completely_different_function', + normalized_member_of='another_world', + normalized_namespace='a_galaxy_far_away' + ) + score, full_calc = self.scorer.score(self.feature1, feature2) + self.assertFalse(full_calc, f"Early exit failed; score was {score}") + + def test_partial_similarity(self): + """Test a scenario with partially similar features.""" + feature2_args = { + 'normalized_name': 'my_func', # Same name + 'normalized_member_of': 'my_class', # Same class + 'normalized_namespace': 'a_different_module', # Diff namespace + 'parameters': [ + features_pb.Param(normalized_name='p1', normalized_types=[features_pb.ParamType.STRING]), + ], + 'normalized_return_types': ['NUMBER'], # Diff return + 'async': False # Diff async + } + feature2 = features_pb.Feature(**feature2_args) + score, full_calc = self.scorer.score(self.feature1, feature2) + self.assertTrue(full_calc) + self.assertTrue(0 < score < 1.0, f"Score {score} was not in the expected range (0, 1)") + + def test_parameter_edge_cases(self): + """Test scoring with different parameter list configurations.""" + 
feature_no_params = features_pb.Feature(normalized_name='func', normalized_member_of='class', normalized_namespace='ns') + feature_one_param = features_pb.Feature( + normalized_name='func', normalized_member_of='class', normalized_namespace='ns', + parameters=[features_pb.Param(normalized_name='p1')] + ) + + # One empty, one not - should be an imperfect match + score, _ = self.scorer.score(feature_no_params, feature_one_param) + self.assertLess(score, 1.0, "Score should be less than 1.0 when one param list is empty") + + def test_return_type_edge_cases(self): + """Test scoring with different return type configurations.""" + feature_no_return = features_pb.Feature(normalized_name='my_func', normalized_member_of='my_class', normalized_namespace='my_module') + + score, _ = self.scorer.score(self.feature1, feature_no_return) + self.assertLess(score, 1.0, "Score should be less than 1.0 when return types differ") + + + def test_run_async_integration(self): + """Test the similarity of the TypeScript and Python 'run_async' features.""" + ts_feature_args = { + 'normalized_name': 'run_async', + 'normalized_member_of': 'runner', + 'normalized_namespace': 'runner', + 'parameters': [ + features_pb.Param(normalized_name='user_id', normalized_types=['STRING']), + features_pb.Param(normalized_name='session_id', normalized_types=['STRING']), + features_pb.Param(normalized_name='new_message', normalized_types=['OBJECT']), + features_pb.Param(normalized_name='state_delta', normalized_types=['OBJECT'], is_optional=True), + features_pb.Param(normalized_name='run_config', normalized_types=['OBJECT'], is_optional=True), + ], + 'normalized_return_types': ['OBJECT'], + 'async': True, + } + ts_feature = features_pb.Feature(**ts_feature_args) + + py_feature_args = { + 'normalized_name': 'run_async', + 'normalized_member_of': 'runner', + 'normalized_namespace': 'runners', + 'parameters': [ + features_pb.Param(normalized_name='user_id', normalized_types=['STRING']), + 
features_pb.Param(normalized_name='session_id', normalized_types=['STRING']), + features_pb.Param(normalized_name='invocation_id', normalized_types=['STRING'], is_optional=True), + features_pb.Param(normalized_name='new_message', normalized_types=['OBJECT'], is_optional=True), + features_pb.Param(normalized_name='state_delta', normalized_types=['MAP'], is_optional=True), + features_pb.Param(normalized_name='run_config', normalized_types=['OBJECT'], is_optional=True), + ], + 'normalized_return_types': ['OBJECT', 'null'], + 'async': True, + } + py_feature = features_pb.Feature(**py_feature_args) + + score, full_calc = self.scorer.score(ts_feature, py_feature) + + self.assertTrue(full_calc) + self.assertTrue(0.5 < score < 1.0, f"Score {score} was not in the expected range (0.5, 1.0)") + + +if __name__ == '__main__': + unittest.main() From d57463e0d5e80992052ceccf4f1fabbef53a38f8 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Tue, 3 Feb 2026 16:00:40 -0800 Subject: [PATCH 02/21] Added a function to read a proto file. --- src/google/adk/scope/matcher/__init__.py | 0 src/google/adk/scope/matcher/matcher.py | 16 +++++++++++ test/adk/scope/matcher/test_matcher.py | 35 ++++++++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 src/google/adk/scope/matcher/__init__.py create mode 100644 src/google/adk/scope/matcher/matcher.py create mode 100644 test/adk/scope/matcher/test_matcher.py diff --git a/src/google/adk/scope/matcher/__init__.py b/src/google/adk/scope/matcher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py new file mode 100644 index 0000000..baaacf8 --- /dev/null +++ b/src/google/adk/scope/matcher/matcher.py @@ -0,0 +1,16 @@ +from google.protobuf import text_format +from google.adk.scope import features_pb2 + +def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: + """Reads a FeatureRegistry from a text proto file. 
+ + Args: + file_path: Path to the .txtpb file. + + Returns: + A FeatureRegistry instance. + """ + registry = features_pb2.FeatureRegistry() + with open(file_path, 'rb') as f: + text_format.Parse(f.read(), registry) + return registry \ No newline at end of file diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py new file mode 100644 index 0000000..9bb879c --- /dev/null +++ b/test/adk/scope/matcher/test_matcher.py @@ -0,0 +1,35 @@ + +import os +import tempfile +import unittest +from google.adk.scope import features_pb2 +from google.adk.scope.matcher import matcher + +class TestMatcher(unittest.TestCase): + + def test_read_feature_registry(self): + content = """ + language: "PYTHON" + version: "1.0.0" + features { + original_name: "test_feature" + normalized_name: "test_feature" + type: FUNCTION + } + """ + with tempfile.NamedTemporaryFile(mode='w', suffix='.txtpb', delete=False) as f: + f.write(content) + temp_path = f.name + + try: + registry = matcher.read_feature_registry(temp_path) + self.assertEqual(registry.language, "PYTHON") + self.assertEqual(registry.version, "1.0.0") + self.assertEqual(len(registry.features), 1) + self.assertEqual(registry.features[0].original_name, "test_feature") + self.assertEqual(registry.features[0].type, features_pb2.Feature.Type.FUNCTION) + finally: + os.remove(temp_path) + +if __name__ == '__main__': + unittest.main() From edf7a0dd83d31c7a3c9532b2df78d5b3f16c1615 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 12:03:57 -0800 Subject: [PATCH 03/21] Improved matcher. 
--- match.sh | 14 + .../adk/scope/extractors/converter_py.py | 1 - src/google/adk/scope/extractors/extract.py | 4 +- src/google/adk/scope/matcher/matcher.py | 324 +++++++++++++++++- src/google/adk/scope/utils/similarity.py | 105 ++++-- test/adk/scope/matcher/test_matcher.py | 196 +++++++++-- test/adk/scope/utils/test_similarity.py | 258 ++++++++++---- 7 files changed, 767 insertions(+), 135 deletions(-) create mode 100755 match.sh diff --git a/match.sh b/match.sh new file mode 100755 index 0000000..099c16e --- /dev/null +++ b/match.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# match.sh - Wrapper script to match features between two FeatureRegistries. + +set -e + +# Determine the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Add 'src' to PYTHONPATH so the python script can find modules +export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" + +# Run the python matcher +# Pass all arguments to the python script +python3 -m google.adk.scope.matcher.matcher "$@" diff --git a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index 2c24b6f..43cda23 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -310,7 +310,6 @@ def _extract_params( if p: # Filter 'self' and 'cls' if p.original_name in ("self", "cls"): - continue # Add description if available diff --git a/src/google/adk/scope/extractors/extract.py b/src/google/adk/scope/extractors/extract.py index 01c3b38..0bfe5d2 100644 --- a/src/google/adk/scope/extractors/extract.py +++ b/src/google/adk/scope/extractors/extract.py @@ -16,8 +16,8 @@ logger = logging.getLogger(__name__) _JSON_INDENT = 2 -_JSON_OUTPUT = True -_YAML_OUTPUT = True +_JSON_OUTPUT = False +_YAML_OUTPUT = False _PROTO_OUTPUT = True diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index baaacf8..053c176 100644 --- a/src/google/adk/scope/matcher/matcher.py 
+++ b/src/google/adk/scope/matcher/matcher.py @@ -1,16 +1,316 @@ +import argparse +import sys +from typing import List, Tuple +import numpy as np +from scipy.optimize import linear_sum_assignment from google.protobuf import text_format from google.adk.scope import features_pb2 +from google.adk.scope.utils.similarity import SimilarityScorer + def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: - """Reads a FeatureRegistry from a text proto file. - - Args: - file_path: Path to the .txtpb file. - - Returns: - A FeatureRegistry instance. - """ - registry = features_pb2.FeatureRegistry() - with open(file_path, 'rb') as f: - text_format.Parse(f.read(), registry) - return registry \ No newline at end of file + """Reads a FeatureRegistry from a text proto file. + + Args: + file_path: Path to the .txtpb file. + + Returns: + A FeatureRegistry instance. + """ + registry = features_pb2.FeatureRegistry() + with open(file_path, "rb") as f: + text_format.Parse(f.read(), registry) + return registry + + +def match_features( + base_features: List[features_pb2.Feature], + target_features: List[features_pb2.Feature], + alpha: float, +) -> List[Tuple[features_pb2.Feature, features_pb2.Feature, float]]: + """Matches features between two lists based on a similarity threshold. + + Features that score higher than `alpha` are considered matches, added to + the result list, and removed from both input lists to avoid duplicate + matching. Uses the Hungarian algorithm for global optimization. + + Args: + base_features: The first list of features. Modified in-place. + target_features: The second list of features. Modified in-place. + alpha: The similarity threshold (0.0 to 1.0) for a match. + + Returns: + A list of tuples (feature_from_base, feature_from_target, + similarity_score). 
+ """ + if not base_features or not target_features: + return [] + + scorer = SimilarityScorer(alpha=alpha) + matches = [] + + # Build Cost Matrix (Rows=Base, Cols=Target) + n_base = len(base_features) + n_target = len(target_features) + similarity_matrix = np.zeros((n_base, n_target)) + + for i, f1 in enumerate(base_features): + for j, f2 in enumerate(target_features): + similarity_matrix[i, j] = scorer.get_similarity_score(f1, f2) + + # Run Hungarian Algorithm (Global Optimization) + row_ind, col_ind = linear_sum_assignment(similarity_matrix, maximize=True) + + matched_base_indices = set() + matched_target_indices = set() + + # Filter Optimal Assignments by Alpha Threshold + for r, c in zip(row_ind, col_ind): + score = similarity_matrix[r, c] + if score > alpha: + matches.append((base_features[r], target_features[c], score)) + matched_base_indices.add(r) + matched_target_indices.add(c) + + # Update the input lists in-place (Remove matched items) + base_features[:] = [ + f for i, f in enumerate(base_features) + if i not in matched_base_indices + ] + target_features[:] = [ + f for i, f in enumerate(target_features) + if i not in matched_target_indices + ] + + return matches + + return matches + + +def match_registries( + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + alpha: float, + report_type: str = "symmetric", +) -> str: + """Matches features between two FeatureRegistries and generates a report. + + This delegates to `match_features` and constructs a human-readable + Markdown string listing the matched original feature names. + + Args: + base_registry: The first FeatureRegistry. + target_registry: The second FeatureRegistry. + alpha: The similarity threshold (0.0 to 1.0) for a match. + report_type: 'symmetric' or 'directional' reporting style. + + Returns: + A Markdown string documenting the matched features. 
+ """ + from collections import defaultdict + + base_features = list(base_registry.features) + target_features = list(target_registry.features) + + total_base = len(base_features) + total_target = len(target_features) + + # Pass 1: Solid Matches (mutates lists) + solid_matches = match_features(base_features, target_features, alpha) + + # Pass 2: Near Misses (mutates lists) + beta = max(0.0, alpha - 0.15) + near_misses = match_features(base_features, target_features, beta) + + # Leftovers + unmatched_base = base_features + unmatched_target = target_features + + if report_type == "symmetric": + union_size = total_base + total_target - len(solid_matches) + parity_score = len(solid_matches) / union_size if union_size > 0 else 1.0 + score_name = "Jaccard Index" + else: # directional + precision = len(solid_matches) / total_target if total_target > 0 else 1.0 + recall = len(solid_matches) / total_base if total_base > 0 else 1.0 + if precision + recall == 0: + parity_score = 0.0 + else: + parity_score = 2 * (precision * recall) / (precision + recall) + score_name = "F1 Score" + + lines = [ + "# Cross-Language Feature Parity Report", + f"**Base:** {base_registry.language} ({base_registry.version})", + f"**Target:** {target_registry.language} ({target_registry.version})", + f"**Feature Parity Score ({score_name}):** {parity_score:.1%}", + "", + ] + + modules = defaultdict(lambda: { + 'solid': [], + 'near': [], + 'unmatched_base': [], + 'unmatched_target': [] + }) + + for f_base, f_target, score in solid_matches: + ns = f_base.namespace or "Unknown Module" + modules[ns]['solid'].append((f_base, f_target, score)) + + for f_base, f_target, score in near_misses: + ns = f_base.namespace or "Unknown Module" + modules[ns]['near'].append((f_base, f_target, score)) + + for f_base in unmatched_base: + ns = f_base.namespace or "Unknown Module" + modules[ns]['unmatched_base'].append(f_base) + + for f_target in unmatched_target: + ns = f_target.namespace or "Unknown Module" + 
modules[ns]['unmatched_target'].append(f_target) + + def format_feature(f: features_pb2.Feature) -> str: + name = f.original_name or f.normalized_name + member = f.member_of + if member and member.lower() != "null": + return f"{member}.{name}" + return name + + def get_type_display_name(f: features_pb2.Feature) -> str: + """Map Feature Type enum to a human-readable Type string.""" + if f.type == features_pb2.Feature.Type.CONSTRUCTOR: + return "Constructor" + elif f.type in (features_pb2.Feature.Type.FUNCTION, features_pb2.Feature.Type.CLASS_METHOD): + return "Function" + elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: + return "Method" + else: + return "Unknown" + + for ns in sorted(modules.keys()): + lines.append(f"## Module '{ns}'") + lines.append("") + mod_data = modules[ns] + + if report_type == "symmetric": + if mod_data['solid']: + lines.append("### ✅ Solid Matches") + lines.append("| Type | Base Feature | Target Feature | Similarity Score |") + lines.append("|---|---|---|---|") + for f_base, f_target, score in mod_data['solid']: + f_type = get_type_display_name(f_base) + lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append("") + + if mod_data['near']: + lines.append("### \u26A0\uFE0F Near Misses") + lines.append("| Type | Base Feature | Closest Target Candidate | Similarity |") + lines.append("|---|---|---|---|") + for f_base, f_target, score in mod_data['near']: + f_type = get_type_display_name(f_base) + lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append("") + + if mod_data['unmatched_base'] or mod_data['unmatched_target']: + lines.append("### \u274C Unmatched Features") + lines.append("| Missing Feature | Missing In |") + lines.append("|---|---|") + for f_base in mod_data['unmatched_base']: + lines.append(f"| `{format_feature(f_base)}` | Target |") + for f_target in mod_data['unmatched_target']: + lines.append(f"| 
`{format_feature(f_target)}` | Base |") + lines.append("") + else: + if mod_data['solid']: + lines.append("### ✅ Matched Features") + lines.append("| Type | Base Feature | Target Feature | Similarity Score |") + lines.append("|---|---|---|---|") + for f_base, f_target, score in mod_data['solid']: + f_type = get_type_display_name(f_base) + lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append("") + + if mod_data['near']: + lines.append("### \u26A0\uFE0F Inconsistencies (Near Misses)") + lines.append("| Type | Base Feature | Closest Target Candidate | Similarity |") + lines.append("|---|---|---|---|") + for f_base, f_target, score in mod_data['near']: + f_type = get_type_display_name(f_base) + lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append("") + + if mod_data['unmatched_base']: + lines.append("### \u274C Missing in Target (Base Exclusive)") + lines.append("| Missing Feature |") + lines.append("|---|") + for f_base in mod_data['unmatched_base']: + lines.append(f"| `{format_feature(f_base)}` |") + lines.append("") + + if mod_data['unmatched_target']: + lines.append("### \u274C Target Exclusives") + lines.append("| Extra Target Feature |") + lines.append("|---|") + for f_target in mod_data['unmatched_target']: + lines.append(f"| `{format_feature(f_target)}` |") + lines.append("") + + return "\n".join(lines).strip() + + +def main(): + parser = argparse.ArgumentParser( + description="Match ADK features between two languages." 
+ ) + parser.add_argument( + "--base", + required=True, + help="Path to the base FeatureRegistry .txtpb file.", + ) + parser.add_argument( + "--target", + required=True, + help="Path to the target FeatureRegistry .txtpb file.", + ) + parser.add_argument( + "--output", + required=True, + help="Path to save the Markdown report.", + ) + parser.add_argument( + "--alpha", + type=float, + default=0.7, + help="Similarity threshold (0.0 to 1.0) defaults to 0.7.", + ) + parser.add_argument( + "--report-type", + choices=["symmetric", "directional"], + default="symmetric", + help="Type of gap report to generate (symmetric or directional).", + ) + args = parser.parse_args() + + try: + base_registry = read_feature_registry(args.base) + target_registry = read_feature_registry(args.target) + except Exception as e: + print(f"Error reading feature registries: {e}", file=sys.stderr) + sys.exit(1) + + report = match_registries( + base_registry, target_registry, args.alpha, args.report_type + ) + + try: + with open(args.output, "w") as f: + f.write(report) + print(f"Successfully wrote match report to {args.output}") + except Exception as e: + print(f"Error writing report to {args.output}: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index 37df549..b0615e1 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -1,8 +1,4 @@ -"""Computes a similarity score between two ADK Features.""" - -"""Computes a similarity score between two ADK Features.""" - -from typing import Optional, Tuple +from typing import Optional from jellyfish import jaro_winkler_similarity import numpy as np @@ -12,21 +8,30 @@ # Default weights for the similarity calculation. 
DEFAULT_SIMILARITY_WEIGHTS = { - 'name': 0.30, - 'member_of': 0.25, - 'namespace': 0.15, - 'parameters': 0.20, - 'return_type': 0.10, + "name": 0.30, + "member_of": 0.30, + "namespace": 0.15, + "parameters": 0.15, + "return_type": 0.10, } -# If the preliminary score is below this, we skip expensive calculations. -EARLY_EXIT_THRESHOLD = 0.4 - class SimilarityScorer: """Calculates a similarity score between two features.""" - def __init__(self, weights: Optional[dict[str, float]] = None): + def __init__( + self, weights: Optional[dict[str, float]] = None, alpha: float = 0.8 + ): self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS + assert "name" in self.weights + assert "member_of" in self.weights + assert "namespace" in self.weights + assert "parameters" in self.weights + assert "return_type" in self.weights + self._early_exit_threshold = alpha * ( + self.weights["name"] + + self.weights["member_of"] + + self.weights["namespace"] + ) def _calculate_param_similarity( self, param1: features_pb.Param, param2: features_pb.Param @@ -36,9 +41,7 @@ def _calculate_param_similarity( param1.normalized_name, param2.normalized_name ) s_p_type = ( - 1.0 - if param1.normalized_types == param2.normalized_types - else 0.0 + 1.0 if param1.normalized_types == param2.normalized_types else 0.0 ) s_p_opt = 1.0 if param1.is_optional == param2.is_optional else 0.0 @@ -48,7 +51,7 @@ def _calculate_param_similarity( def _calculate_parameters_score( self, params1: list[features_pb.Param], params2: list[features_pb.Param] ) -> float: - """Calculates the aggregated similarity score for two lists of parameters.""" + """Calculates aggregated similarity for two lists of parameters.""" if not params1 and not params2: return 1.0 if not params1 or not params2: @@ -57,9 +60,13 @@ def _calculate_parameters_score( similarity_matrix = np.zeros((len(params1), len(params2))) for i, p1 in enumerate(params1): for j, p2 in enumerate(params2): - similarity_matrix[i, j] = self._calculate_param_similarity(p1, 
p2) + similarity_matrix[i, j] = self._calculate_param_similarity( + p1, p2 + ) - row_ind, col_ind = linear_sum_assignment(similarity_matrix, maximize=True) + row_ind, col_ind = linear_sum_assignment( + similarity_matrix, maximize=True + ) total_match_score = similarity_matrix[row_ind, col_ind].sum() total_params = len(params1) + len(params2) @@ -77,40 +84,68 @@ def _calculate_return_type_score( if f1.normalized_return_types == f2.normalized_return_types else 0.0 ) - s_async_match = 1.0 if getattr(f1, 'async') == getattr(f2, 'async') else 0.0 + s_async_match = ( + 1.0 if getattr(f1, "async") == getattr(f2, "async") else 0.0 + ) return (0.7 * s_type_match) + (0.3 * s_async_match) - def score( + def get_similarity_score( self, feature1: features_pb.Feature, feature2: features_pb.Feature - ) -> Tuple[bool, float]: + ) -> float: """Computes the overall similarity score between two features.""" + # 1. Type Compatibility and Dynamic Weights + t1, t2 = feature1.type, feature2.type + current_weights = self.weights.copy() + + if t1 == features_pb.Feature.Type.CONSTRUCTOR and t2 == features_pb.Feature.Type.CONSTRUCTOR: + current_weights["member_of"] += current_weights["name"] + current_weights["name"] = 0.0 + elif t1 in (features_pb.Feature.Type.FUNCTION, features_pb.Feature.Type.CLASS_METHOD) and \ + t2 in (features_pb.Feature.Type.FUNCTION, features_pb.Feature.Type.CLASS_METHOD): + current_weights["member_of"] /= 2.0 + current_weights["name"] += current_weights["member_of"] + elif t1 == features_pb.Feature.Type.INSTANCE_METHOD and t2 == features_pb.Feature.Type.INSTANCE_METHOD: + pass # Keep default weights + else: + return 0.0 # Fast out for incompatible types + + # 2. 
Similarity Calculations scores = { - 'name': jaro_winkler_similarity( + "name": jaro_winkler_similarity( feature1.normalized_name, feature2.normalized_name ), - 'member_of': jaro_winkler_similarity( + "member_of": jaro_winkler_similarity( feature1.normalized_member_of, feature2.normalized_member_of ), - 'namespace': jaro_winkler_similarity( + "namespace": jaro_winkler_similarity( feature1.normalized_namespace, feature2.normalized_namespace ), } + # 3. Early Exit Check (using dynamic weights) preliminary_score = ( - scores['name'] * self.weights['name'] + - scores['member_of'] * self.weights['member_of'] + - scores['namespace'] * self.weights['namespace'] + scores["name"] * current_weights["name"] + + scores["member_of"] * current_weights["member_of"] + + scores["namespace"] * current_weights["namespace"] ) - if preliminary_score < EARLY_EXIT_THRESHOLD: - return preliminary_score, False + early_exit_threshold = 0.8 * ( + current_weights["name"] + + current_weights["member_of"] + + current_weights["namespace"] + ) + + if preliminary_score < early_exit_threshold: + return preliminary_score - scores['parameters'] = self._calculate_parameters_score( + scores["parameters"] = self._calculate_parameters_score( feature1.parameters, feature2.parameters ) - scores['return_type'] = self._calculate_return_type_score(feature1, feature2) + scores["return_type"] = self._calculate_return_type_score( + feature1, feature2 + ) final_score = sum( - scores[key] * self.weights[key] for key in self.weights + scores[key] * current_weights[key] for key in current_weights ) - return final_score, True + return final_score diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index 9bb879c..238df61 100644 --- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -1,14 +1,13 @@ - import os import tempfile import unittest from google.adk.scope import features_pb2 from google.adk.scope.matcher import matcher -class 
TestMatcher(unittest.TestCase): - def test_read_feature_registry(self): - content = """ +class TestMatcher(unittest.TestCase): + def test_read_feature_registry(self): + content = """ language: "PYTHON" version: "1.0.0" features { @@ -17,19 +16,176 @@ def test_read_feature_registry(self): type: FUNCTION } """ - with tempfile.NamedTemporaryFile(mode='w', suffix='.txtpb', delete=False) as f: - f.write(content) - temp_path = f.name - - try: - registry = matcher.read_feature_registry(temp_path) - self.assertEqual(registry.language, "PYTHON") - self.assertEqual(registry.version, "1.0.0") - self.assertEqual(len(registry.features), 1) - self.assertEqual(registry.features[0].original_name, "test_feature") - self.assertEqual(registry.features[0].type, features_pb2.Feature.Type.FUNCTION) - finally: - os.remove(temp_path) - -if __name__ == '__main__': - unittest.main() + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txtpb", delete=False + ) as f: + f.write(content) + temp_path = f.name + + try: + registry = matcher.read_feature_registry(temp_path) + self.assertEqual(registry.language, "PYTHON") + self.assertEqual(registry.version, "1.0.0") + self.assertEqual(len(registry.features), 1) + self.assertEqual(registry.features[0].original_name, "test_feature") + self.assertEqual( + registry.features[0].type, features_pb2.Feature.Type.FUNCTION + ) + finally: + os.remove(temp_path) + + def test_match_features(self): + f1 = features_pb2.Feature( + normalized_name="f_same", + normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f2 = features_pb2.Feature( + normalized_name="f_same", + normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f3 = features_pb2.Feature( + normalized_name="totally", + normalized_member_of="different", + normalized_namespace="stuff", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f4 = features_pb2.Feature( + 
normalized_name="entirely", + normalized_member_of="unrelated", + normalized_namespace="things", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + base_features = [f1, f3] + target_features = [f4, f2] + + matches = matcher.match_features(base_features, target_features, 0.8) + + self.assertEqual(len(matches), 1) + + m_f1, m_f2, score = matches[0] + self.assertEqual(m_f1.normalized_name, "f_same") + self.assertEqual(m_f2.normalized_name, "f_same") + self.assertGreater(score, 0.8) + + # Assert lists were mutated and matched elements removed + self.assertEqual(len(base_features), 1) + self.assertEqual(base_features[0].normalized_name, "totally") + + self.assertEqual(len(target_features), 1) + self.assertEqual(target_features[0].normalized_name, "entirely") + + def test_match_registries(self): + # f1 & f2 are a solid match (score ~ 1.0) + f1 = features_pb2.Feature( + original_name="fSameBase", + normalized_name="f_same", + member_of="BaseClass", + namespace="google.adk.events", + normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f2 = features_pb2.Feature( + original_name="fSameTarget", + normalized_name="f_same", + member_of="TargetClass", + namespace="adk.events", + normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + # f_near_base & f_near_target are a near miss (different names, same structural namespace/class) + # Using different return types and different enough names to drop the score below 0.8 + f_near_base = features_pb2.Feature( + original_name="base_name", + normalized_name="base_name", + member_of="base_member", + namespace="google.adk.events", + normalized_member_of="base_member", + normalized_namespace="n_same", + original_return_types=["string"], + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f_near_target = features_pb2.Feature( + original_name="target_name", + normalized_name="targ_name", + 
member_of="target_member", + namespace="adk.events", + normalized_member_of="target_member", + normalized_namespace="n_same", + original_return_types=["int"], + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + # f3 is a complete gap (base-exclusive) + f3 = features_pb2.Feature( + original_name="totally_diff", + normalized_name="totally", + member_of="null", + namespace="google.adk.events", + normalized_member_of="different", + normalized_namespace="stuff", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + base_registry = features_pb2.FeatureRegistry( + language="Python", version="1.0.0" + ) + base_registry.features.extend([f1, f_near_base, f3]) + + target_registry = features_pb2.FeatureRegistry( + language="TypeScript", version="2.0.0" + ) + target_registry.features.extend([f2, f_near_target]) + + # Test Symmetric Report + report_sym = matcher.match_registries(base_registry, target_registry, 0.9, report_type="symmetric") + self.assertIn("# Cross-Language Feature Parity Report", report_sym) + self.assertIn("**Base:** Python (1.0.0)", report_sym) + self.assertIn("**Target:** TypeScript (2.0.0)", report_sym) + self.assertIn("**Feature Parity Score (Jaccard Index):** 25.0%", report_sym) + + self.assertIn("## Module 'google.adk.events'", report_sym) + + # Solid Matches + self.assertIn("### ✅ Solid Matches", report_sym) + self.assertIn("| Type | Base Feature | Target Feature | Similarity Score |", report_sym) + self.assertIn("| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", report_sym) + + # Near Misses + self.assertIn("### ⚠️ Near Misses", report_sym) + self.assertIn("| Type | Base Feature | Closest Target Candidate | Similarity |", report_sym) + self.assertIn("| Method | `base_member.base_name` | `target_member.target_name` |", report_sym) + + # Unmatched / Gaps + self.assertIn("### ❌ Unmatched Features", report_sym) + self.assertIn("| `totally_diff` | Target |", report_sym) + + # Test Directional Report + report_dir = 
matcher.match_registries(base_registry, target_registry, 0.9, report_type="directional") + self.assertIn("**Feature Parity Score (F1 Score):** 40.0%", report_dir) + + self.assertIn("## Module 'google.adk.events'", report_dir) + + # Solid Matches + self.assertIn("### ✅ Matched Features", report_dir) + self.assertIn("| Type | Base Feature | Target Feature | Similarity Score |", report_dir) + self.assertIn("| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", report_dir) + + # Near Misses + self.assertIn("### ⚠️ Inconsistencies (Near Misses)", report_dir) + self.assertIn("| Type | Base Feature | Closest Target Candidate | Similarity |", report_dir) + self.assertIn("| Method | `base_member.base_name` | `target_member.target_name` |", report_dir) + + # Unmatched / Gaps + self.assertIn("### ❌ Missing in Target (Base Exclusive)", report_dir) + self.assertIn("| `totally_diff` |", report_dir) + +if __name__ == "__main__": + unittest.main() diff --git a/test/adk/scope/utils/test_similarity.py b/test/adk/scope/utils/test_similarity.py index 2164e9f..bee8ada 100644 --- a/test/adk/scope/utils/test_similarity.py +++ b/test/adk/scope/utils/test_similarity.py @@ -13,118 +13,246 @@ def setUp(self): """Set up a default scorer and a sample feature for the tests.""" self.scorer = SimilarityScorer() feature_args = { - 'normalized_name': 'my_func', - 'normalized_member_of': 'my_class', - 'normalized_namespace': 'my_module', - 'parameters': [ - features_pb.Param(normalized_name='p1', normalized_types=[features_pb.ParamType.STRING]), - features_pb.Param(normalized_name='p2', normalized_types=[features_pb.ParamType.NUMBER], is_optional=True), + "normalized_name": "my_func", + "normalized_member_of": "my_class", + "normalized_namespace": "my_module", + "parameters": [ + features_pb.Param( + normalized_name="p1", + normalized_types=[features_pb.ParamType.STRING], + ), + features_pb.Param( + normalized_name="p2", + normalized_types=[features_pb.ParamType.NUMBER], + 
is_optional=True, + ), ], - 'normalized_return_types': ['STRING'], - 'async': True + "normalized_return_types": ["STRING"], + "async": True, + "type": features_pb.Feature.Type.INSTANCE_METHOD, } self.feature1 = features_pb.Feature(**feature_args) def test_initialization(self): """Test that the scorer initializes with default and custom weights.""" self.assertIsNotNone(self.scorer.weights) - custom_weights = {'name': 1.0, 'member_of': 0.0, 'namespace': 0.0, 'parameters': 0.0, 'return_type': 0.0} + custom_weights = { + "name": 1.0, + "member_of": 0.0, + "namespace": 0.0, + "parameters": 0.0, + "return_type": 0.0, + } custom_scorer = SimilarityScorer(weights=custom_weights) self.assertEqual(custom_scorer.weights, custom_weights) def test_identical_features(self): """Test that identical features yield a score of 1.0.""" - score, full_calc = self.scorer.score(self.feature1, self.feature1) - self.assertTrue(full_calc) + score = self.scorer.get_similarity_score(self.feature1, self.feature1) self.assertAlmostEqual(score, 1.0) def test_completely_dissimilar_features(self): """Test that dissimilar features trigger an early exit.""" feature2 = features_pb.Feature( - normalized_name='completely_different_function', - normalized_member_of='another_world', - normalized_namespace='a_galaxy_far_away' + normalized_name="completely_different_function", + normalized_member_of="another_world", + normalized_namespace="a_galaxy_far_away", + type=features_pb.Feature.Type.INSTANCE_METHOD, ) - score, full_calc = self.scorer.score(self.feature1, feature2) - self.assertFalse(full_calc, f"Early exit failed; score was {score}") + score = self.scorer.get_similarity_score(self.feature1, feature2) + self.assertLess(score, 0.4, f"Early exit failed; score was {score}") def test_partial_similarity(self): """Test a scenario with partially similar features.""" feature2_args = { - 'normalized_name': 'my_func', # Same name - 'normalized_member_of': 'my_class', # Same class - 'normalized_namespace': 
'a_different_module', # Diff namespace - 'parameters': [ - features_pb.Param(normalized_name='p1', normalized_types=[features_pb.ParamType.STRING]), + "normalized_name": "my_func", # Same name + "normalized_member_of": "my_class", # Same class + "normalized_namespace": "a_different_module", # Diff namespace + "parameters": [ + features_pb.Param( + normalized_name="p1", + normalized_types=[features_pb.ParamType.STRING], + ), ], - 'normalized_return_types': ['NUMBER'], # Diff return - 'async': False # Diff async + "normalized_return_types": ["NUMBER"], # Diff return + "async": False, # Diff async + "type": features_pb.Feature.Type.INSTANCE_METHOD, } feature2 = features_pb.Feature(**feature2_args) - score, full_calc = self.scorer.score(self.feature1, feature2) - self.assertTrue(full_calc) - self.assertTrue(0 < score < 1.0, f"Score {score} was not in the expected range (0, 1)") + score = self.scorer.get_similarity_score(self.feature1, feature2) + self.assertTrue( + 0 < score < 1.0, + f"Score {score} was not in the expected range (0, 1)", + ) def test_parameter_edge_cases(self): """Test scoring with different parameter list configurations.""" - feature_no_params = features_pb.Feature(normalized_name='func', normalized_member_of='class', normalized_namespace='ns') + feature_no_params = features_pb.Feature( + normalized_name="func", + normalized_member_of="class", + normalized_namespace="ns", + type=features_pb.Feature.Type.INSTANCE_METHOD, + ) feature_one_param = features_pb.Feature( - normalized_name='func', normalized_member_of='class', normalized_namespace='ns', - parameters=[features_pb.Param(normalized_name='p1')] + normalized_name="func", + normalized_member_of="class", + normalized_namespace="ns", + parameters=[features_pb.Param(normalized_name="p1")], + type=features_pb.Feature.Type.INSTANCE_METHOD, ) - + # One empty, one not - should be an imperfect match - score, _ = self.scorer.score(feature_no_params, feature_one_param) - self.assertLess(score, 1.0, "Score 
should be less than 1.0 when one param list is empty") - + score = self.scorer.get_similarity_score( + feature_no_params, feature_one_param + ) + self.assertLess( + score, + 1.0, + "Score should be less than 1.0 when one param list is empty", + ) + def test_return_type_edge_cases(self): """Test scoring with different return type configurations.""" - feature_no_return = features_pb.Feature(normalized_name='my_func', normalized_member_of='my_class', normalized_namespace='my_module') - - score, _ = self.scorer.score(self.feature1, feature_no_return) - self.assertLess(score, 1.0, "Score should be less than 1.0 when return types differ") + feature_no_return = features_pb.Feature( + normalized_name="my_func", + normalized_member_of="my_class", + normalized_namespace="my_module", + type=features_pb.Feature.Type.INSTANCE_METHOD, + ) + score = self.scorer.get_similarity_score( + self.feature1, feature_no_return + ) + self.assertLess( + score, 1.0, "Score should be less than 1.0 when return types differ" + ) def test_run_async_integration(self): - """Test the similarity of the TypeScript and Python 'run_async' features.""" + """Test similarity of TypeScript and Python 'run_async' features.""" ts_feature_args = { - 'normalized_name': 'run_async', - 'normalized_member_of': 'runner', - 'normalized_namespace': 'runner', - 'parameters': [ - features_pb.Param(normalized_name='user_id', normalized_types=['STRING']), - features_pb.Param(normalized_name='session_id', normalized_types=['STRING']), - features_pb.Param(normalized_name='new_message', normalized_types=['OBJECT']), - features_pb.Param(normalized_name='state_delta', normalized_types=['OBJECT'], is_optional=True), - features_pb.Param(normalized_name='run_config', normalized_types=['OBJECT'], is_optional=True), + "normalized_name": "run_async", + "normalized_member_of": "runner", + "normalized_namespace": "runner", + "parameters": [ + features_pb.Param( + normalized_name="user_id", normalized_types=["STRING"] + ), + 
features_pb.Param( + normalized_name="session_id", normalized_types=["STRING"] + ), + features_pb.Param( + normalized_name="new_message", normalized_types=["OBJECT"] + ), + features_pb.Param( + normalized_name="state_delta", + normalized_types=["OBJECT"], + is_optional=True, + ), + features_pb.Param( + normalized_name="run_config", + normalized_types=["OBJECT"], + is_optional=True, + ), ], - 'normalized_return_types': ['OBJECT'], - 'async': True, + "normalized_return_types": ["OBJECT"], + "async": True, + "type": features_pb.Feature.Type.INSTANCE_METHOD, } ts_feature = features_pb.Feature(**ts_feature_args) py_feature_args = { - 'normalized_name': 'run_async', - 'normalized_member_of': 'runner', - 'normalized_namespace': 'runners', - 'parameters': [ - features_pb.Param(normalized_name='user_id', normalized_types=['STRING']), - features_pb.Param(normalized_name='session_id', normalized_types=['STRING']), - features_pb.Param(normalized_name='invocation_id', normalized_types=['STRING'], is_optional=True), - features_pb.Param(normalized_name='new_message', normalized_types=['OBJECT'], is_optional=True), - features_pb.Param(normalized_name='state_delta', normalized_types=['MAP'], is_optional=True), - features_pb.Param(normalized_name='run_config', normalized_types=['OBJECT'], is_optional=True), + "normalized_name": "run_async", + "normalized_member_of": "runner", + "normalized_namespace": "runners", + "parameters": [ + features_pb.Param( + normalized_name="user_id", normalized_types=["STRING"] + ), + features_pb.Param( + normalized_name="session_id", normalized_types=["STRING"] + ), + features_pb.Param( + normalized_name="invocation_id", + normalized_types=["STRING"], + is_optional=True, + ), + features_pb.Param( + normalized_name="new_message", + normalized_types=["OBJECT"], + is_optional=True, + ), + features_pb.Param( + normalized_name="state_delta", + normalized_types=["MAP"], + is_optional=True, + ), + features_pb.Param( + normalized_name="run_config", + 
normalized_types=["OBJECT"], + is_optional=True, + ), ], - 'normalized_return_types': ['OBJECT', 'null'], - 'async': True, + "normalized_return_types": ["OBJECT", "null"], + "async": True, + "type": features_pb.Feature.Type.INSTANCE_METHOD, } py_feature = features_pb.Feature(**py_feature_args) - score, full_calc = self.scorer.score(ts_feature, py_feature) + score = self.scorer.get_similarity_score(ts_feature, py_feature) + + self.assertTrue( + 0.5 < score < 1.0, + f"Score {score} was not in the expected range (0.5, 1.0)", + ) + + def test_type_mismatch_early_exit(self): + """Test that comparing differing types yields 0.0 immediately.""" + feature_constructor = features_pb.Feature( + normalized_name="my_func", + normalized_member_of="my_class", + normalized_namespace="my_module", + type=features_pb.Feature.Type.CONSTRUCTOR, + ) + score = self.scorer.get_similarity_score(self.feature1, feature_constructor) + self.assertEqual(score, 0.0) + + def test_constructor_weights(self): + """Test that CONSTRUCTOR comparisons ignore name and prioritize member_of.""" + c1 = features_pb.Feature( + normalized_name="constructor1", # Completely diff names + normalized_member_of="MyClass", + normalized_namespace="my_module", + type=features_pb.Feature.Type.CONSTRUCTOR, + ) + c2 = features_pb.Feature( + normalized_name="constructor2", + normalized_member_of="MyClass", + normalized_namespace="my_module", + type=features_pb.Feature.Type.CONSTRUCTOR, + ) + + # Despite name difference, should score very highly due to weight shift + score = self.scorer.get_similarity_score(c1, c2) + self.assertGreater(score, 0.9) + + def test_function_weights(self): + """Test that FUNCTION comparisons ignore member_of and prioritize name.""" + f1 = features_pb.Feature( + normalized_name="myFunction", + normalized_member_of="SomeClass", # Completely diff member_of + normalized_namespace="my_module", + type=features_pb.Feature.Type.FUNCTION, + ) + f2 = features_pb.Feature( + normalized_name="myFunction", + 
normalized_member_of="DifferentClass", + normalized_namespace="my_module", + type=features_pb.Feature.Type.FUNCTION, + ) - self.assertTrue(full_calc) - self.assertTrue(0.5 < score < 1.0, f"Score {score} was not in the expected range (0.5, 1.0)") + score = self.scorer.get_similarity_score(f1, f2) + self.assertGreater(score, 0.9) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 38ed07ec5493ce5122001ce8d3dc9a91a84f74ad Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 12:16:14 -0800 Subject: [PATCH 04/21] Ruff fixes --- src/google/adk/scope/matcher/matcher.py | 53 ++++++++++++++++----- src/google/adk/scope/utils/similarity.py | 12 +++-- test/adk/scope/matcher/test_matcher.py | 60 +++++++++++++++++++----- test/adk/scope/utils/test_similarity.py | 14 ++++-- 4 files changed, 105 insertions(+), 34 deletions(-) diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 053c176..f6b1474 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -128,10 +128,14 @@ def match_registries( if report_type == "symmetric": union_size = total_base + total_target - len(solid_matches) - parity_score = len(solid_matches) / union_size if union_size > 0 else 1.0 + parity_score = ( + len(solid_matches) / union_size if union_size > 0 else 1.0 + ) score_name = "Jaccard Index" else: # directional - precision = len(solid_matches) / total_target if total_target > 0 else 1.0 + precision = ( + len(solid_matches) / total_target if total_target > 0 else 1.0 + ) recall = len(solid_matches) / total_base if total_base > 0 else 1.0 if precision + recall == 0: parity_score = 0.0 @@ -179,9 +183,10 @@ def format_feature(f: features_pb2.Feature) -> str: def get_type_display_name(f: features_pb2.Feature) -> str: """Map Feature Type enum to a human-readable Type string.""" - if f.type == features_pb2.Feature.Type.CONSTRUCTOR: + FeatureType = features_pb2.Feature.Type + if f.type 
== FeatureType.CONSTRUCTOR: return "Constructor" - elif f.type in (features_pb2.Feature.Type.FUNCTION, features_pb2.Feature.Type.CLASS_METHOD): + elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): return "Function" elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: return "Method" @@ -196,20 +201,32 @@ def get_type_display_name(f: features_pb2.Feature) -> str: if report_type == "symmetric": if mod_data['solid']: lines.append("### ✅ Solid Matches") - lines.append("| Type | Base Feature | Target Feature | Similarity Score |") + lines.append( + "| Type | Base Feature | Target Feature | " + "Similarity Score |" + ) lines.append("|---|---|---|---|") for f_base, f_target, score in mod_data['solid']: f_type = get_type_display_name(f_base) - lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append( + f"| {f_type} | `{format_feature(f_base)}` | " + f"`{format_feature(f_target)}` | {score:.2f} |" + ) lines.append("") if mod_data['near']: lines.append("### \u26A0\uFE0F Near Misses") - lines.append("| Type | Base Feature | Closest Target Candidate | Similarity |") + lines.append( + "| Type | Base Feature | Closest Target Candidate | " + "Similarity |" + ) lines.append("|---|---|---|---|") for f_base, f_target, score in mod_data['near']: f_type = get_type_display_name(f_base) - lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append( + f"| {f_type} | `{format_feature(f_base)}` | " + f"`{format_feature(f_target)}` | {score:.2f} |" + ) lines.append("") if mod_data['unmatched_base'] or mod_data['unmatched_target']: @@ -224,20 +241,32 @@ def get_type_display_name(f: features_pb2.Feature) -> str: else: if mod_data['solid']: lines.append("### ✅ Matched Features") - lines.append("| Type | Base Feature | Target Feature | Similarity Score |") + lines.append( + "| Type | Base Feature | Target Feature | " + "Similarity Score |" + ) 
lines.append("|---|---|---|---|") for f_base, f_target, score in mod_data['solid']: f_type = get_type_display_name(f_base) - lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append( + f"| {f_type} | `{format_feature(f_base)}` | " + f"`{format_feature(f_target)}` | {score:.2f} |" + ) lines.append("") if mod_data['near']: lines.append("### \u26A0\uFE0F Inconsistencies (Near Misses)") - lines.append("| Type | Base Feature | Closest Target Candidate | Similarity |") + lines.append( + "| Type | Base Feature | Closest Target Candidate | " + "Similarity |" + ) lines.append("|---|---|---|---|") for f_base, f_target, score in mod_data['near']: f_type = get_type_display_name(f_base) - lines.append(f"| {f_type} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |") + lines.append( + f"| {f_type} | `{format_feature(f_base)}` | " + f"`{format_feature(f_target)}` | {score:.2f} |" + ) lines.append("") if mod_data['unmatched_base']: diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index b0615e1..e145b1f 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -97,15 +97,17 @@ def get_similarity_score( t1, t2 = feature1.type, feature2.type current_weights = self.weights.copy() - if t1 == features_pb.Feature.Type.CONSTRUCTOR and t2 == features_pb.Feature.Type.CONSTRUCTOR: + FeatureType = features_pb.Feature.Type + if t1 == FeatureType.CONSTRUCTOR and t2 == FeatureType.CONSTRUCTOR: current_weights["member_of"] += current_weights["name"] current_weights["name"] = 0.0 - elif t1 in (features_pb.Feature.Type.FUNCTION, features_pb.Feature.Type.CLASS_METHOD) and \ - t2 in (features_pb.Feature.Type.FUNCTION, features_pb.Feature.Type.CLASS_METHOD): + elif t1 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD) and \ + t2 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): current_weights["member_of"] /= 2.0 
current_weights["name"] += current_weights["member_of"] - elif t1 == features_pb.Feature.Type.INSTANCE_METHOD and t2 == features_pb.Feature.Type.INSTANCE_METHOD: - pass # Keep default weights + elif t1 == FeatureType.INSTANCE_METHOD and \ + t2 == FeatureType.INSTANCE_METHOD: + pass # Keep default weights else: return 0.0 # Fast out for incompatible types diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index 238df61..f21d544 100644 --- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -100,8 +100,10 @@ def test_match_registries(self): type=features_pb2.Feature.Type.INSTANCE_METHOD, ) - # f_near_base & f_near_target are a near miss (different names, same structural namespace/class) - # Using different return types and different enough names to drop the score below 0.8 + # f_near_base & f_near_target are a near miss + # (different names, same structural namespace/class) + # Using different return types and different enough names to + # drop the score below 0.8 f_near_base = features_pb2.Feature( original_name="base_name", normalized_name="base_name", @@ -145,43 +147,75 @@ def test_match_registries(self): target_registry.features.extend([f2, f_near_target]) # Test Symmetric Report - report_sym = matcher.match_registries(base_registry, target_registry, 0.9, report_type="symmetric") + report_sym = matcher.match_registries( + base_registry, target_registry, 0.9, report_type="symmetric" + ) self.assertIn("# Cross-Language Feature Parity Report", report_sym) self.assertIn("**Base:** Python (1.0.0)", report_sym) self.assertIn("**Target:** TypeScript (2.0.0)", report_sym) - self.assertIn("**Feature Parity Score (Jaccard Index):** 25.0%", report_sym) + self.assertIn( + "**Feature Parity Score (Jaccard Index):** 25.0%", report_sym + ) self.assertIn("## Module 'google.adk.events'", report_sym) # Solid Matches self.assertIn("### ✅ Solid Matches", report_sym) - self.assertIn("| Type | Base Feature 
| Target Feature | Similarity Score |", report_sym) - self.assertIn("| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", report_sym) + self.assertIn( + "| Type | Base Feature | Target Feature | Similarity Score |", + report_sym + ) + self.assertIn( + "| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", + report_sym + ) # Near Misses self.assertIn("### ⚠️ Near Misses", report_sym) - self.assertIn("| Type | Base Feature | Closest Target Candidate | Similarity |", report_sym) - self.assertIn("| Method | `base_member.base_name` | `target_member.target_name` |", report_sym) + self.assertIn( + "| Type | Base Feature | Closest Target Candidate | Similarity |", + report_sym + ) + self.assertIn( + "| Method | `base_member.base_name` | " + "`target_member.target_name` |", + report_sym, + ) # Unmatched / Gaps self.assertIn("### ❌ Unmatched Features", report_sym) self.assertIn("| `totally_diff` | Target |", report_sym) # Test Directional Report - report_dir = matcher.match_registries(base_registry, target_registry, 0.9, report_type="directional") + report_dir = matcher.match_registries( + base_registry, target_registry, 0.9, report_type="directional" + ) self.assertIn("**Feature Parity Score (F1 Score):** 40.0%", report_dir) self.assertIn("## Module 'google.adk.events'", report_dir) # Solid Matches self.assertIn("### ✅ Matched Features", report_dir) - self.assertIn("| Type | Base Feature | Target Feature | Similarity Score |", report_dir) - self.assertIn("| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", report_dir) + self.assertIn( + "| Type | Base Feature | Target Feature | Similarity Score |", + report_dir + ) + self.assertIn( + "| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", + report_dir + ) # Near Misses self.assertIn("### ⚠️ Inconsistencies (Near Misses)", report_dir) - self.assertIn("| Type | Base Feature | Closest Target Candidate | Similarity |", report_dir) - self.assertIn("| Method | `base_member.base_name` 
| `target_member.target_name` |", report_dir) + self.assertIn( + "| Type | Base Feature | Closest Target Candidate | Similarity |", + report_dir + ) + self.assertIn( + "| Method | `base_member.base_name` | " + "`target_member.target_name` |", + report_dir, + ) # Unmatched / Gaps self.assertIn("### ❌ Missing in Target (Base Exclusive)", report_dir) diff --git a/test/adk/scope/utils/test_similarity.py b/test/adk/scope/utils/test_similarity.py index bee8ada..719e72c 100644 --- a/test/adk/scope/utils/test_similarity.py +++ b/test/adk/scope/utils/test_similarity.py @@ -213,13 +213,17 @@ def test_type_mismatch_early_exit(self): normalized_namespace="my_module", type=features_pb.Feature.Type.CONSTRUCTOR, ) - score = self.scorer.get_similarity_score(self.feature1, feature_constructor) + score = self.scorer.get_similarity_score( + self.feature1, feature_constructor + ) self.assertEqual(score, 0.0) def test_constructor_weights(self): - """Test that CONSTRUCTOR comparisons ignore name and prioritize member_of.""" + """Test that CONSTRUCTOR comparisons ignore name and prioritize + member_of. + """ c1 = features_pb.Feature( - normalized_name="constructor1", # Completely diff names + normalized_name="constructor1", # Completely diff names normalized_member_of="MyClass", normalized_namespace="my_module", type=features_pb.Feature.Type.CONSTRUCTOR, @@ -236,7 +240,9 @@ def test_constructor_weights(self): self.assertGreater(score, 0.9) def test_function_weights(self): - """Test that FUNCTION comparisons ignore member_of and prioritize name.""" + """Test that FUNCTION comparisons ignore member_of and prioritize + name. 
+ """ f1 = features_pb.Feature( normalized_name="myFunction", normalized_member_of="SomeClass", # Completely diff member_of From da675a555487ee77ccf123ac4351a33a25058ab1 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 12:19:13 -0800 Subject: [PATCH 05/21] fixed unit test --- test/adk/scope/extractors/test_extract.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/adk/scope/extractors/test_extract.py b/test/adk/scope/extractors/test_extract.py index d8dd013..2744bad 100644 --- a/test/adk/scope/extractors/test_extract.py +++ b/test/adk/scope/extractors/test_extract.py @@ -85,11 +85,24 @@ def setUp(self): self.mock_extractors["python"] = self.mock_py_extractor self.mock_extractors["typescript"] = self.mock_ts_extractor + # Enforce Output Generation + self.mock_json_patcher = patch( + "google.adk.scope.extractors.extract._JSON_OUTPUT", True + ) + self.mock_json_patcher.start() + + self.mock_yaml_patcher = patch( + "google.adk.scope.extractors.extract._YAML_OUTPUT", True + ) + self.mock_yaml_patcher.start() + # FeatureRegistry mock/patch ? # Actual FeatureRegistry is fine if we mock return of extract_features # But we write to file using MessageToJson def tearDown(self): + self.mock_yaml_patcher.stop() + self.mock_json_patcher.stop() self.mock_args_patcher.stop() self.mock_extractors_patcher.stop() shutil.rmtree(self.test_dir) From c2ccb4e8f7f0991874fe40ce37527a14e208c360 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 14:31:52 -0800 Subject: [PATCH 06/21] improved the reports. 
--- config.yaml | 37 +- match.sh | 2 +- src/google/adk/scope/matcher/matcher.py | 586 ++++++++++++++++-------- src/google/adk/scope/utils/stats.py | 19 + test/adk/scope/matcher/test_matcher.py | 117 +++-- test/adk/scope/utils/test_stats.py | 25 + 6 files changed, 521 insertions(+), 265 deletions(-) create mode 100644 src/google/adk/scope/utils/stats.py create mode 100644 test/adk/scope/utils/test_stats.py diff --git a/config.yaml b/config.yaml index 826c332..5148c9a 100644 --- a/config.yaml +++ b/config.yaml @@ -1,42 +1,9 @@ python: exclude: - examples - - a2a - # - agents - - apps - - artifacts - - auth - - cli - - code_executors - - dependencies - - errors - - evaluation - - events - - features - - flows - - memory - - models - - planners - - platform - - plugins - - sessions - - telemetry - - tools - - utils + typescript: exclude: # - agents - - artifacts - - auth - - code_executors - - events - - examples - - memory - - models - - plugins - - runner - - sessions - - telemetry - - tools - - utils + diff --git a/match.sh b/match.sh index 099c16e..91c9acc 100755 --- a/match.sh +++ b/match.sh @@ -11,4 +11,4 @@ export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" # Run the python matcher # Pass all arguments to the python script -python3 -m google.adk.scope.matcher.matcher "$@" +python3 "${SCRIPT_DIR}/src/google/adk/scope/matcher/matcher.py" "$@" diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index f6b1474..6608c38 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -1,22 +1,60 @@ import argparse +import dataclasses import sys -from typing import List, Tuple +from pathlib import Path +from typing import List, Tuple, Dict +from collections import defaultdict import numpy as np from scipy.optimize import linear_sum_assignment from google.protobuf import text_format from google.adk.scope import features_pb2 from google.adk.scope.utils.similarity import SimilarityScorer 
+from google.adk.scope.utils import stats -def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: - """Reads a FeatureRegistry from a text proto file. +_NEAR_MISS_THRESHOLD = 0.15 + + +@dataclasses.dataclass +class MatchResult: + master_content: str + module_files: Dict[str, str] # filename -> content + + +def format_feature(f: features_pb2.Feature) -> str: + name = f.original_name or f.normalized_name + member = f.member_of + if member and member.lower() != "null": + return f"{member}.{name}" + return name + - Args: - file_path: Path to the .txtpb file. +def get_type_display_name(f: features_pb2.Feature) -> str: + FeatureType = features_pb2.Feature.Type + if f.type == FeatureType.CONSTRUCTOR: + return "constructor" + elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): + return "function" + elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: + return "method" + else: + return "unknown" - Returns: - A FeatureRegistry instance. - """ + +def get_type_priority(f: features_pb2.Feature) -> int: + """Returns priority for sorting: constructor < function < method < unknown.""" + type_name = get_type_display_name(f) + priorities = { + "constructor": 0, + "function": 1, + "method": 2, + "unknown": 3, + } + return priorities.get(type_name, 99) + + +def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: + """Reads a FeatureRegistry from a text proto file.""" registry = features_pb2.FeatureRegistry() with open(file_path, "rb") as f: text_format.Parse(f.read(), registry) @@ -28,21 +66,7 @@ def match_features( target_features: List[features_pb2.Feature], alpha: float, ) -> List[Tuple[features_pb2.Feature, features_pb2.Feature, float]]: - """Matches features between two lists based on a similarity threshold. - - Features that score higher than `alpha` are considered matches, added to - the result list, and removed from both input lists to avoid duplicate - matching. Uses the Hungarian algorithm for global optimization. 
- - Args: - base_features: The first list of features. Modified in-place. - target_features: The second list of features. Modified in-place. - alpha: The similarity threshold (0.0 to 1.0) for a match. - - Returns: - A list of tuples (feature_from_base, feature_from_target, - similarity_score). - """ + """Matches features between two lists using Hungarian algorithm.""" if not base_features or not target_features: return [] @@ -74,218 +98,360 @@ def match_features( # Update the input lists in-place (Remove matched items) base_features[:] = [ - f for i, f in enumerate(base_features) - if i not in matched_base_indices + f for i, f in enumerate(base_features) if i not in matched_base_indices ] target_features[:] = [ - f for i, f in enumerate(target_features) - if i not in matched_target_indices + f for i, f in enumerate(target_features) if i not in matched_target_indices ] return matches - return matches - def match_registries( base_registry: features_pb2.FeatureRegistry, target_registry: features_pb2.FeatureRegistry, alpha: float, report_type: str = "symmetric", -) -> str: - """Matches features between two FeatureRegistries and generates a report. - - This delegates to `match_features` and constructs a human-readable - Markdown string listing the matched original feature names. - - Args: - base_registry: The first FeatureRegistry. - target_registry: The second FeatureRegistry. - alpha: The similarity threshold (0.0 to 1.0) for a match. - report_type: 'symmetric' or 'directional' reporting style. - - Returns: - A Markdown string documenting the matched features. 
- """ - from collections import defaultdict - - base_features = list(base_registry.features) - target_features = list(target_registry.features) - - total_base = len(base_features) - total_target = len(target_features) - - # Pass 1: Solid Matches (mutates lists) - solid_matches = match_features(base_features, target_features, alpha) - - # Pass 2: Near Misses (mutates lists) - beta = max(0.0, alpha - 0.15) - near_misses = match_features(base_features, target_features, beta) - - # Leftovers - unmatched_base = base_features - unmatched_target = target_features +) -> MatchResult: + """Matches features and generates a master report + module sub-reports.""" + + # 1. Group by Module (Normalized Namespace) + features_base = defaultdict(list) + for f in base_registry.features: + key = f.normalized_namespace or f.namespace or "Unknown Module" + features_base[key].append(f) + + features_target = defaultdict(list) + for f in target_registry.features: + key = f.normalized_namespace or f.namespace or "Unknown Module" + features_target[key].append(f) + + all_modules = sorted(set(features_base.keys()) | set(features_target.keys())) + + # Global Stats using Set logic for Jaccard/F1 + # We will accumulate counts as we process modules + total_solid_matches = 0 + total_base_features = len(base_registry.features) + total_target_features = len(target_registry.features) + + # Master Report Header + from datetime import datetime + + master_lines = [] + + if report_type == "raw": + # Raw CSV Report + # Columns: base_namespace,base_member_of,base_name,target_namespace,target_member_of,target_name,type,score + csv_header = "base_namespace,base_member_of,base_name,target_namespace,target_member_of,target_name,type,score" + csv_lines = [csv_header] + + def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: + ns = f.namespace or "" + if not ns and f.normalized_namespace: + ns = f.normalized_namespace + + # member_of + mem = f.member_of or "" + if not mem and 
f.normalized_member_of: + mem = f.normalized_member_of + if mem.lower() == "null": + mem = "" + + # name + name = f.original_name or f.normalized_name or "" + return ns, mem, name + + def escape_csv(s): + if s is None: + return "" + if ',' in s or '"' in s or '\n' in s: + return f'"{s.replace("\"", "\"\"")}"' + return s + + for module in all_modules: + base_list = features_base[module] + target_list = features_target[module] + + # Pass 1: Solid Matches + solid_matches = match_features(base_list, target_list, alpha) + + # Pass 2: Potential Matches (formerly Near Misses) + beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) + potential_matches = match_features(base_list, target_list, beta) + + # Leftovers + unmatched_base = base_list + unmatched_target = target_list + + for f_base, f_target, score in solid_matches: + b_ns, b_mem, b_name = get_feature_cols(f_base) + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_base) + + line = ( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},{score:.4f}" + ) + csv_lines.append(line) + + for f_base, f_target, score in potential_matches: + b_ns, b_mem, b_name = get_feature_cols(f_base) + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_base) + + line = ( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},{score:.4f}" + ) + csv_lines.append(line) + + for f_base in unmatched_base: + b_ns, b_mem, b_name = get_feature_cols(f_base) + f_type = get_type_display_name(f_base) + + line = ( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f",,," + f"{escape_csv(f_type)},0.0000" + ) + csv_lines.append(line) + + for f_target in unmatched_target: + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_target) + + line = ( + 
f",,," + f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},0.0000" + ) + csv_lines.append(line) - if report_type == "symmetric": - union_size = total_base + total_target - len(solid_matches) - parity_score = ( - len(solid_matches) / union_size if union_size > 0 else 1.0 + return MatchResult( + master_content="\n".join(csv_lines), + module_files={} ) - score_name = "Jaccard Index" - else: # directional - precision = ( - len(solid_matches) / total_target if total_target > 0 else 1.0 + + title_suffix = "Symmetric" if report_type == "symmetric" else "Directional" + master_lines.append(f"# Feature Matching Report: {title_suffix}") + master_lines.append(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + master_lines.append("") + master_lines.append(f"**Base:** {base_registry.language} ({base_registry.version})") + master_lines.append(f"**Target:** {target_registry.language} ({target_registry.version})") + + # Placeholder for Global Score (calculated at end) + global_score_idx = len(master_lines) + master_lines.append("GLOBAL_SCORE_PLACEHOLDER") + master_lines.append("") + + master_lines.append("## Module Summary") + master_lines.append("| Module | Features (Base) | Score | Status | Details |") + master_lines.append("|---|---|---|---|---|") + + module_files = {} + + for module in all_modules: + base_list = features_base[module] + target_list = features_target[module] + + mod_base_count = len(base_list) + mod_target_count = len(target_list) + + # Pass 1: Solid Matches + solid_matches = match_features(base_list, target_list, alpha) + mod_solid_count = len(solid_matches) + total_solid_matches += mod_solid_count + + # Pass 2: Potential Matches (formerly Near Misses) + beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) + potential_matches = match_features(base_list, target_list, beta) + + # Leftovers + unmatched_base = base_list + unmatched_target = target_list + + # Calculate Module Score + if report_type == "symmetric": + union_size 
= mod_base_count + mod_target_count - mod_solid_count + mod_score = ( + mod_solid_count / union_size if union_size > 0 else 1.0 + ) + else: # directional + precision = stats.calculate_precision(mod_solid_count, mod_target_count) + recall = stats.calculate_recall(mod_solid_count, mod_base_count) + mod_score = stats.calculate_f1(precision, recall) + + status_icon = "❌" + if mod_score == 1.0: + status_icon = "✅" + elif mod_score >= 0.8: + status_icon = "⚠️" + + # Safe filename + module_safe_name = module.replace(".", "_") + module_filename = f"{module_safe_name}.md" + + # Add to Master + # Note: We assume the caller places module files in a subdirectory named 'modules' or similar. + # Let's standardize on `{stem}_modules/` where stem is output filename. + # But here we don't know the stem. + # We will use a placeholder `{modules_dir}` and replace it in main. + master_lines.append( + f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | " + f"[View Details]({{modules_dir}}/{module_filename}) |" ) - recall = len(solid_matches) / total_base if total_base > 0 else 1.0 - if precision + recall == 0: - parity_score = 0.0 - else: - parity_score = 2 * (precision * recall) / (precision + recall) - score_name = "F1 Score" - - lines = [ - "# Cross-Language Feature Parity Report", - f"**Base:** {base_registry.language} ({base_registry.version})", - f"**Target:** {target_registry.language} ({target_registry.version})", - f"**Feature Parity Score ({score_name}):** {parity_score:.1%}", - "", - ] - modules = defaultdict(lambda: { - 'solid': [], - 'near': [], - 'unmatched_base': [], - 'unmatched_target': [] - }) - - for f_base, f_target, score in solid_matches: - ns = f_base.namespace or "Unknown Module" - modules[ns]['solid'].append((f_base, f_target, score)) - - for f_base, f_target, score in near_misses: - ns = f_base.namespace or "Unknown Module" - modules[ns]['near'].append((f_base, f_target, score)) - - for f_base in unmatched_base: - ns = f_base.namespace or 
"Unknown Module" - modules[ns]['unmatched_base'].append(f_base) - - for f_target in unmatched_target: - ns = f_target.namespace or "Unknown Module" - modules[ns]['unmatched_target'].append(f_target) - - def format_feature(f: features_pb2.Feature) -> str: - name = f.original_name or f.normalized_name - member = f.member_of - if member and member.lower() != "null": - return f"{member}.{name}" - return name - - def get_type_display_name(f: features_pb2.Feature) -> str: - """Map Feature Type enum to a human-readable Type string.""" - FeatureType = features_pb2.Feature.Type - if f.type == FeatureType.CONSTRUCTOR: - return "Constructor" - elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): - return "Function" - elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: - return "Method" + mod_total_features = mod_base_count + mod_target_count - mod_solid_count if report_type == "symmetric" else mod_base_count + + # Generate Module Content + mod_lines = [] + mod_lines.append(f"# Module: `{module}`") + # Back link usually works if we know the relative path structure. + # Use placeholder {master_report} which will be replaced in main. + # It should link to the master report file. + mod_lines.append("[⬅️ Back to Master Report](../{master_report})") + mod_lines.append("") + mod_lines.append(f"**Score:** {mod_score:.2%} ({status_icon})") + + if report_type == "directional": + mod_lines.append( + "| Metric | Score |\n" + "|---|---|\n" + f"| **Precision** | {precision:.2%} |\n" + f"| **Recall** | {recall:.2%} |" + ) else: - return "Unknown" - - for ns in sorted(modules.keys()): - lines.append(f"## Module '{ns}'") - lines.append("") - mod_data = modules[ns] + # For symmetric, we usually just have the score (Jaccard). + # We can make it a table too for consistency if desired, but user asked "Similar to the main markdown... do that for the module markdown files too". + # Main markdown has "Global Jaccard Index: X%" for symmetric (not table). 
+ # Wait, main markdown ONLY uses table for directional in my previous edit. + # "Global Jaccard Index: 25.00%" (one line) vs Table for F1/Precision/Recall. + # User said: "Similar to the main markdown... do that for the module markdown files too". + # So for directional module reports, I should use a table. + pass + + mod_lines.append("") + mod_lines.append(f"**Features:** {mod_total_features}") + mod_lines.append("") + + # Sort matches by type + solid_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) + potential_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) if report_type == "symmetric": - if mod_data['solid']: - lines.append("### ✅ Solid Matches") - lines.append( + if solid_matches: + mod_lines.append("### ✅ Solid Matches") + mod_lines.append( "| Type | Base Feature | Target Feature | " "Similarity Score |" ) - lines.append("|---|---|---|---|") - for f_base, f_target, score in mod_data['solid']: + mod_lines.append("|---|---|---|---|") + for f_base, f_target, score in solid_matches: f_type = get_type_display_name(f_base) - lines.append( + mod_lines.append( f"| {f_type} | `{format_feature(f_base)}` | " f"`{format_feature(f_target)}` | {score:.2f} |" ) - lines.append("") + mod_lines.append("") - if mod_data['near']: - lines.append("### \u26A0\uFE0F Near Misses") - lines.append( + if potential_matches: + mod_lines.append("### ⚠️ Potential Matches") + mod_lines.append( "| Type | Base Feature | Closest Target Candidate | " "Similarity |" ) - lines.append("|---|---|---|---|") - for f_base, f_target, score in mod_data['near']: + mod_lines.append("|---|---|---|---|") + for f_base, f_target, score in potential_matches: f_type = get_type_display_name(f_base) - lines.append( + mod_lines.append( f"| {f_type} | `{format_feature(f_base)}` | " f"`{format_feature(f_target)}` | {score:.2f} |" ) - lines.append("") - - if mod_data['unmatched_base'] or mod_data['unmatched_target']: - lines.append("### \u274C Unmatched 
Features") - lines.append("| Missing Feature | Missing In |") - lines.append("|---|---|") - for f_base in mod_data['unmatched_base']: - lines.append(f"| `{format_feature(f_base)}` | Target |") - for f_target in mod_data['unmatched_target']: - lines.append(f"| `{format_feature(f_target)}` | Base |") - lines.append("") - else: - if mod_data['solid']: - lines.append("### ✅ Matched Features") - lines.append( + mod_lines.append("") + + if unmatched_base or unmatched_target: + mod_lines.append("### ❌ Unmatched Features") + mod_lines.append("| Missing Feature | Missing In |") + mod_lines.append("|---|---|") + for f_base in unmatched_base: + mod_lines.append(f"| `{format_feature(f_base)}` | Target |") + for f_target in unmatched_target: + mod_lines.append(f"| `{format_feature(f_target)}` | Base |") + mod_lines.append("") + else: # directional + if solid_matches: + mod_lines.append("### ✅ Matched Features") + mod_lines.append( "| Type | Base Feature | Target Feature | " "Similarity Score |" ) - lines.append("|---|---|---|---|") - for f_base, f_target, score in mod_data['solid']: + mod_lines.append("|---|---|---|---|") + for f_base, f_target, score in solid_matches: f_type = get_type_display_name(f_base) - lines.append( + mod_lines.append( f"| {f_type} | `{format_feature(f_base)}` | " f"`{format_feature(f_target)}` | {score:.2f} |" ) - lines.append("") + mod_lines.append("") - if mod_data['near']: - lines.append("### \u26A0\uFE0F Inconsistencies (Near Misses)") - lines.append( + if potential_matches: + mod_lines.append("### ⚠️ Potential Matches") + mod_lines.append( "| Type | Base Feature | Closest Target Candidate | " "Similarity |" ) - lines.append("|---|---|---|---|") - for f_base, f_target, score in mod_data['near']: + mod_lines.append("|---|---|---|---|") + for f_base, f_target, score in potential_matches: f_type = get_type_display_name(f_base) - lines.append( + mod_lines.append( f"| {f_type} | `{format_feature(f_base)}` | " f"`{format_feature(f_target)}` | {score:.2f} 
|" ) - lines.append("") - - if mod_data['unmatched_base']: - lines.append("### \u274C Missing in Target (Base Exclusive)") - lines.append("| Missing Feature |") - lines.append("|---|") - for f_base in mod_data['unmatched_base']: - lines.append(f"| `{format_feature(f_base)}` |") - lines.append("") + mod_lines.append("") + + if unmatched_base: + mod_lines.append("### ❌ Missing in Target") + mod_lines.append("| Missing Feature |") + mod_lines.append("|---|") + for f_base in unmatched_base: + mod_lines.append(f"| `{format_feature(f_base)}` |") + mod_lines.append("") + + # Directional usually ignores target exclusives, but we can list them if needed. + # For strict directional (Base -> Target), we flag missing in target. + + module_files[module_filename] = "\n".join(mod_lines).strip() + + # Calculate Global Score + if report_type == "symmetric": + union_size = total_base_features + total_target_features - total_solid_matches + parity_score = ( + total_solid_matches / union_size if union_size > 0 else 1.0 + ) + global_stats = f"**Global Jaccard Index:** {parity_score:.2%}" + else: + precision = stats.calculate_precision(total_solid_matches, total_target_features) + recall = stats.calculate_recall(total_solid_matches, total_base_features) + parity_score = stats.calculate_f1(precision, recall) + + global_stats = ( + "| Metric | Score |\n" + "|---|---|\n" + f"| **Precision** | {precision:.2%} |\n" + f"| **Recall** | {recall:.2%} |\n" + f"| **Global F1 Score** | {parity_score:.2%} |" + ) - if mod_data['unmatched_target']: - lines.append("### \u274C Target Exclusives") - lines.append("| Extra Target Feature |") - lines.append("|---|") - for f_target in mod_data['unmatched_target']: - lines.append(f"| `{format_feature(f_target)}` |") - lines.append("") + master_lines[ + global_score_idx + ] = global_stats - return "\n".join(lines).strip() + return MatchResult( + master_content="\n".join(master_lines).strip(), + module_files=module_files + ) def main(): @@ -310,14 +476,14 @@ 
def main(): parser.add_argument( "--alpha", type=float, - default=0.7, - help="Similarity threshold (0.0 to 1.0) defaults to 0.7.", + default=0.8, + help="Similarity threshold (0.0 to 1.0) defaults to 0.8.", ) parser.add_argument( "--report-type", - choices=["symmetric", "directional"], + choices=["symmetric", "directional", "raw"], default="symmetric", - help="Type of gap report to generate (symmetric or directional).", + help="Type of gap report to generate (symmetric, directional, or raw).", ) args = parser.parse_args() @@ -328,16 +494,50 @@ def main(): print(f"Error reading feature registries: {e}", file=sys.stderr) sys.exit(1) - report = match_registries( + result = match_registries( base_registry, target_registry, args.alpha, args.report_type ) + output_path = Path(args.output) + + if args.report_type == "raw": + # Raw report is a single file, no modules directory needed + try: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(result.master_content) + print(f"Successfully wrote raw match report to {output_path}") + except Exception as e: + print(f"Error writing raw report to {output_path}: {e}", file=sys.stderr) + sys.exit(1) + return + + # Create module directory + if result.module_files: + modules_dir_name = f"{output_path.stem}_modules" + modules_dir = output_path.parent / modules_dir_name + modules_dir.mkdir(parents=True, exist_ok=True) + + # Write module files + for filename, content in result.module_files.items(): + # Replace placeholder for master report link + # The link is relative from module dir (subplot) to master report (parent dir) + # So name is enough. 
+ final_content = content.replace("{master_report}", output_path.name) + (modules_dir / filename).write_text(final_content) + + # Replace placeholder in Master Report + # We assume master report is in parent of modules_dir + # modules_dir relative to master report is just the dir name + master_report = result.master_content.replace("{modules_dir}", modules_dir_name) + else: + master_report = result.master_content.replace("{modules_dir}", ".") + try: - with open(args.output, "w") as f: - f.write(report) - print(f"Successfully wrote match report to {args.output}") + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(master_report) + print(f"Successfully wrote match report to {output_path}") except Exception as e: - print(f"Error writing report to {args.output}: {e}", file=sys.stderr) + print(f"Error writing report to {output_path}: {e}", file=sys.stderr) sys.exit(1) diff --git a/src/google/adk/scope/utils/stats.py b/src/google/adk/scope/utils/stats.py new file mode 100644 index 0000000..f98479c --- /dev/null +++ b/src/google/adk/scope/utils/stats.py @@ -0,0 +1,19 @@ +def calculate_precision(matches: int, total_target: int) -> float: + """Calculates precision: matches / total_target.""" + if total_target > 0: + return matches / total_target + return 1.0 + + +def calculate_recall(matches: int, total_base: int) -> float: + """Calculates recall: matches / total_base.""" + if total_base > 0: + return matches / total_base + return 1.0 + + +def calculate_f1(precision: float, recall: float) -> float: + """Calculates F1 score: 2 * (P * R) / (P + R).""" + if precision + recall > 0: + return 2 * (precision * recall) / (precision + recall) + return 0.0 diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index f21d544..c44245e 100644 --- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -147,79 +147,124 @@ def test_match_registries(self): 
target_registry.features.extend([f2, f_near_target]) # Test Symmetric Report - report_sym = matcher.match_registries( + result_sym = matcher.match_registries( base_registry, target_registry, 0.9, report_type="symmetric" ) - self.assertIn("# Cross-Language Feature Parity Report", report_sym) - self.assertIn("**Base:** Python (1.0.0)", report_sym) - self.assertIn("**Target:** TypeScript (2.0.0)", report_sym) - self.assertIn( - "**Feature Parity Score (Jaccard Index):** 25.0%", report_sym - ) + report_sym = result_sym.master_content + + # 1. Verify Master Report Structure + self.assertIn("# Feature Matching Report: Symmetric", report_sym) + self.assertIn("**Global Jaccard Index:** 25.00%", report_sym) + self.assertIn("## Module Summary", report_sym) + + # Check for module entry in master summary + self.assertIn("| `n_same` |", report_sym) + self.assertIn("[View Details]({modules_dir}/n_same.md)", report_sym) + + # 2. Verify Module Content + self.assertIn("n_same.md", result_sym.module_files) + module_content = result_sym.module_files["n_same.md"] - self.assertIn("## Module 'google.adk.events'", report_sym) + self.assertIn("# Module: `n_same`", module_content) + self.assertIn("**Features:** 3", module_content) # Solid Matches - self.assertIn("### ✅ Solid Matches", report_sym) + self.assertIn("### ✅ Solid Matches", module_content) self.assertIn( "| Type | Base Feature | Target Feature | Similarity Score |", - report_sym + module_content ) self.assertIn( - "| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - report_sym + "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", + module_content ) - # Near Misses - self.assertIn("### ⚠️ Near Misses", report_sym) + # Potential Matches (formerly Near Misses) + self.assertIn("### ⚠️ Potential Matches", module_content) self.assertIn( "| Type | Base Feature | Closest Target Candidate | Similarity |", - report_sym + module_content ) self.assertIn( - "| Method | `base_member.base_name` | " + "| method | 
`base_member.base_name` | " "`target_member.target_name` |", - report_sym, + module_content, ) - # Unmatched / Gaps - self.assertIn("### ❌ Unmatched Features", report_sym) - self.assertIn("| `totally_diff` | Target |", report_sym) + # Unmatched / Gaps (in 'stuff' module) + self.assertIn("stuff.md", result_sym.module_files) + stuff_content = result_sym.module_files["stuff.md"] + self.assertIn("### ❌ Unmatched Features", stuff_content) + self.assertIn("| `totally_diff` | Target |", stuff_content) + self.assertIn("**Features:** 1", stuff_content) # Test Directional Report - report_dir = matcher.match_registries( + result_dir = matcher.match_registries( base_registry, target_registry, 0.9, report_type="directional" ) - self.assertIn("**Feature Parity Score (F1 Score):** 40.0%", report_dir) - - self.assertIn("## Module 'google.adk.events'", report_dir) + report_dir = result_dir.master_content + + self.assertIn("| **Global F1 Score** | 40.00% |", report_dir) + self.assertIn("n_same.md", result_dir.module_files) + + mod_dir_content = result_dir.module_files["n_same.md"] # Solid Matches - self.assertIn("### ✅ Matched Features", report_dir) + self.assertIn("### ✅ Matched Features", mod_dir_content) self.assertIn( "| Type | Base Feature | Target Feature | Similarity Score |", - report_dir + mod_dir_content ) self.assertIn( - "| Method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - report_dir + "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", + mod_dir_content ) - # Near Misses - self.assertIn("### ⚠️ Inconsistencies (Near Misses)", report_dir) + # Potential Matches + self.assertIn("### ⚠️ Potential Matches", mod_dir_content) self.assertIn( "| Type | Base Feature | Closest Target Candidate | Similarity |", - report_dir + mod_dir_content ) self.assertIn( - "| Method | `base_member.base_name` | " + "| method | `base_member.base_name` | " "`target_member.target_name` |", - report_dir, + mod_dir_content, ) + + + # Unmatched / Gaps (in 'stuff' module) + 
self.assertIn("stuff.md", result_dir.module_files) + stuff_dir_content = result_dir.module_files["stuff.md"] + self.assertIn("### ❌ Missing in Target", stuff_dir_content) + self.assertIn("| `totally_diff` |", stuff_dir_content) + + def test_match_registries_raw(self): + f1 = features_pb2.Feature( + original_name="f_same", + normalized_name="f_same", + normalized_namespace="pkg", + member_of="MyClass", + normalized_member_of="myclass", + type=features_pb2.Feature.Type.FUNCTION, + ) + base = features_pb2.FeatureRegistry(language="Python", version="1") + base.features.append(f1) + target = features_pb2.FeatureRegistry(language="TS", version="2") + target.features.append(f1) + + result = matcher.match_registries(base, target, 0.9, report_type="raw") + csv_content = result.master_content + + expected_header = "base_namespace,base_member_of,base_name,target_namespace,target_member_of,target_name,type,score" + self.assertIn(expected_header, csv_content) - # Unmatched / Gaps - self.assertIn("### ❌ Missing in Target (Base Exclusive)", report_dir) - self.assertIn("| `totally_diff` |", report_dir) + # Check for solid match line + # f1 has: ns=pkg, mem=MyClass, name=f_same + # Match should have same values for base and target + expected_line = "pkg,MyClass,f_same,pkg,MyClass,f_same,function,1.0000" + self.assertIn(expected_line, csv_content) + self.assertFalse(result.module_files) if __name__ == "__main__": unittest.main() diff --git a/test/adk/scope/utils/test_stats.py b/test/adk/scope/utils/test_stats.py new file mode 100644 index 0000000..62dbef1 --- /dev/null +++ b/test/adk/scope/utils/test_stats.py @@ -0,0 +1,25 @@ +import unittest +from google.adk.scope.utils import stats + + +class TestStats(unittest.TestCase): + def test_precision(self): + self.assertEqual(stats.calculate_precision(10, 20), 0.5) + self.assertEqual(stats.calculate_precision(0, 20), 0.0) + self.assertEqual(stats.calculate_precision(10, 0), 1.0) # Edge case + + def test_recall(self): + 
self.assertEqual(stats.calculate_recall(10, 20), 0.5) + self.assertEqual(stats.calculate_recall(0, 20), 0.0) + self.assertEqual(stats.calculate_recall(10, 0), 1.0) # Edge case + + def test_f1(self): + self.assertAlmostEqual(stats.calculate_f1(0.5, 0.5), 0.5) + self.assertAlmostEqual(stats.calculate_f1(1.0, 1.0), 1.0) + self.assertAlmostEqual(stats.calculate_f1(0.0, 1.0), 0.0) + self.assertAlmostEqual(stats.calculate_f1(0.0, 0.0), 0.0) + self.assertAlmostEqual(stats.calculate_f1(0.75, 0.5), 0.6) # 2*(0.375)/1.25 = 0.75/1.25 = 0.6 + + +if __name__ == "__main__": + unittest.main() From 6b9d713ba83f449dc37c7c745ba12fba3364fc22 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 14:41:24 -0800 Subject: [PATCH 07/21] Ruff updates and README update --- README.md | 44 ++++++++++ src/google/adk/scope/matcher/matcher.py | 111 ++++++++++++++++-------- test/adk/scope/matcher/test_matcher.py | 5 +- test/adk/scope/utils/test_stats.py | 2 +- 4 files changed, 124 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 3ea906c..420e0c7 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,50 @@ python3 -m google.adk.scope.extractors.python.extractor \ output_dir ``` + + +### Feature Matching + +Once you have extracted features from two languages (e.g., Python and TypeScript), you can compare them using the `match.sh` script. + +```bash +./match.sh \ + --base output/py.txtpb \ + --target output/ts.txtpb \ + --output report.md \ + --report-type directional +``` + +| Argument | Description | +| :--- | :--- | +| `--base ` | **Required.** Path to the "source of truth" feature registry (e.g., Python). | +| `--target ` | **Required.** Path to the comparison registry (e.g., TypeScript). | +| `--output ` | **Required.** Path to the output Markdown report. | +| `--report-type ` | `symmetric` (default) for Jaccard Index, `directional` for F1/Precision/Recall, or `raw` for CSV. | +| `--alpha ` | Similarity threshold (0.0 - 1.0). Default is `0.8`. 
| + +#### How Matching Works + +The matcher uses the **Hungarian Algorithm** to find the optimal assignment between features in the Base and Target registries. +- **Cost Function**: Based on a similarity score derived from: + - Feature Name (normalized) + - Namespace / Module + - Feature Type (Function, Method, Class, etc.) +- **Thresholding**: Pairs with a similarity score below `--alpha` are not counted as matches; leftover pairs scoring within 0.15 of `--alpha` are reported separately as potential matches. + +#### Scoring Metrics + +**Symmetric Report (Jaccard Index)** +- Best for measuring general parity between two equal implementations. +- **Score**: $J(A, B) = \frac{|A \cap B|}{|A \cup B|}$ +- Penalizes both missing features and extra features. + +**Directional Report (F1 Score)** +- Best when checking if a Target implementation covers the Base implementation (e.g., "Is the TS SDK feature-complete vs Python?"). +- **Precision**: $\frac{\text{Matches}}{\text{Total Target Features}}$ (How accurate is the target?) +- **Recall**: $\frac{\text{Matches}}{\text{Total Base Features}}$ (How complete is the target?) +- **F1 Score**: Harmonic mean of Precision and Recall. 
$F1 = 2 \cdot \frac{P \cdot R}{P + R}$ + ## Development ### Running Tests diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 6608c38..9844a11 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -42,7 +42,7 @@ def get_type_display_name(f: features_pb2.Feature) -> str: def get_type_priority(f: features_pb2.Feature) -> int: - """Returns priority for sorting: constructor < function < method < unknown.""" + """Returns priority: constructor < function < method < unknown.""" type_name = get_type_display_name(f) priorities = { "constructor": 0, @@ -101,7 +101,9 @@ def match_features( f for i, f in enumerate(base_features) if i not in matched_base_indices ] target_features[:] = [ - f for i, f in enumerate(target_features) if i not in matched_target_indices + f + for i, f in enumerate(target_features) + if i not in matched_target_indices ] return matches @@ -126,7 +128,9 @@ def match_registries( key = f.normalized_namespace or f.namespace or "Unknown Module" features_target[key].append(f) - all_modules = sorted(set(features_base.keys()) | set(features_target.keys())) + all_modules = sorted( + set(features_base.keys()) | set(features_target.keys()) + ) # Global Stats using Set logic for Jaccard/F1 # We will accumulate counts as we process modules @@ -141,8 +145,12 @@ def match_registries( if report_type == "raw": # Raw CSV Report - # Columns: base_namespace,base_member_of,base_name,target_namespace,target_member_of,target_name,type,score - csv_header = "base_namespace,base_member_of,base_name,target_namespace,target_member_of,target_name,type,score" + # Columns: base_namespace,base_member_of,base_name,target_namespace, + # target_member_of,target_name,type,score + csv_header = ( + "base_namespace,base_member_of,base_name,target_namespace," + "target_member_of,target_name,type,score" + ) csv_lines = [csv_header] def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, 
str]: @@ -165,7 +173,8 @@ def escape_csv(s): if s is None: return "" if ',' in s or '"' in s or '\n' in s: - return f'"{s.replace("\"", "\"\"")}"' + escaped = s.replace('"', '""') + return f'"{escaped}"' return s for module in all_modules: @@ -189,8 +198,10 @@ def escape_csv(s): f_type = get_type_display_name(f_base) line = ( - f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," - f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(b_ns)},{escape_csv(b_mem)}," + f"{escape_csv(b_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)}," + f"{escape_csv(t_name)}," f"{escape_csv(f_type)},{score:.4f}" ) csv_lines.append(line) @@ -201,8 +212,10 @@ def escape_csv(s): f_type = get_type_display_name(f_base) line = ( - f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," - f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(b_ns)},{escape_csv(b_mem)}," + f"{escape_csv(b_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)}," + f"{escape_csv(t_name)}," f"{escape_csv(f_type)},{score:.4f}" ) csv_lines.append(line) @@ -212,7 +225,8 @@ def escape_csv(s): f_type = get_type_display_name(f_base) line = ( - f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f"{escape_csv(b_ns)},{escape_csv(b_mem)}," + f"{escape_csv(b_name)}," f",,," f"{escape_csv(f_type)},0.0000" ) @@ -224,7 +238,8 @@ def escape_csv(s): line = ( f",,," - f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)}," + f"{escape_csv(t_name)}," f"{escape_csv(f_type)},0.0000" ) csv_lines.append(line) @@ -238,8 +253,12 @@ def escape_csv(s): master_lines.append(f"# Feature Matching Report: {title_suffix}") master_lines.append(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") master_lines.append("") - master_lines.append(f"**Base:** {base_registry.language} ({base_registry.version})") - master_lines.append(f"**Target:** {target_registry.language} ({target_registry.version})") + 
master_lines.append( + f"**Base:** {base_registry.language} ({base_registry.version})" + ) + master_lines.append( + f"**Target:** {target_registry.language} ({target_registry.version})" + ) # Placeholder for Global Score (calculated at end) global_score_idx = len(master_lines) @@ -247,7 +266,9 @@ def escape_csv(s): master_lines.append("") master_lines.append("## Module Summary") - master_lines.append("| Module | Features (Base) | Score | Status | Details |") + master_lines.append( + "| Module | Features (Base) | Score | Status | Details |" + ) master_lines.append("|---|---|---|---|---|") module_files = {} @@ -279,7 +300,9 @@ def escape_csv(s): mod_solid_count / union_size if union_size > 0 else 1.0 ) else: # directional - precision = stats.calculate_precision(mod_solid_count, mod_target_count) + precision = stats.calculate_precision( + mod_solid_count, mod_target_count + ) recall = stats.calculate_recall(mod_solid_count, mod_base_count) mod_score = stats.calculate_f1(precision, recall) @@ -294,16 +317,23 @@ def escape_csv(s): module_filename = f"{module_safe_name}.md" # Add to Master - # Note: We assume the caller places module files in a subdirectory named 'modules' or similar. + # Note: We assume the caller places module files in a subdirectory + # named 'modules' or similar. # Let's standardize on `{stem}_modules/` where stem is output filename. # But here we don't know the stem. # We will use a placeholder `{modules_dir}` and replace it in main. 
master_lines.append( - f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | " + f"| `{module}` | {mod_base_count} | {mod_score:.2%} | " + f"{status_icon} | " f"[View Details]({{modules_dir}}/{module_filename}) |" ) - mod_total_features = mod_base_count + mod_target_count - mod_solid_count if report_type == "symmetric" else mod_base_count + if report_type == "symmetric": + mod_total_features = ( + mod_base_count + mod_target_count - mod_solid_count + ) + else: + mod_total_features = mod_base_count # Generate Module Content mod_lines = [] @@ -324,12 +354,7 @@ def escape_csv(s): ) else: # For symmetric, we usually just have the score (Jaccard). - # We can make it a table too for consistency if desired, but user asked "Similar to the main markdown... do that for the module markdown files too". - # Main markdown has "Global Jaccard Index: X%" for symmetric (not table). - # Wait, main markdown ONLY uses table for directional in my previous edit. - # "Global Jaccard Index: 25.00%" (one line) vs Table for F1/Precision/Recall. - # User said: "Similar to the main markdown... do that for the module markdown files too". - # So for directional module reports, I should use a table. + # We can make it a table too for consistency if desired. pass mod_lines.append("") @@ -337,8 +362,12 @@ def escape_csv(s): mod_lines.append("") # Sort matches by type - solid_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) - potential_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) + solid_matches.sort( + key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) + ) + potential_matches.sort( + key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) + ) if report_type == "symmetric": if solid_matches: @@ -419,21 +448,26 @@ def escape_csv(s): mod_lines.append(f"| `{format_feature(f_base)}` |") mod_lines.append("") - # Directional usually ignores target exclusives, but we can list them if needed. 
- # For strict directional (Base -> Target), we flag missing in target. - + # Directional reports usually ignore target exclusives. + # We flag missing-in-target features only. module_files[module_filename] = "\n".join(mod_lines).strip() # Calculate Global Score if report_type == "symmetric": - union_size = total_base_features + total_target_features - total_solid_matches + union_size = ( + total_base_features + total_target_features - total_solid_matches + ) parity_score = ( total_solid_matches / union_size if union_size > 0 else 1.0 ) global_stats = f"**Global Jaccard Index:** {parity_score:.2%}" else: - precision = stats.calculate_precision(total_solid_matches, total_target_features) - recall = stats.calculate_recall(total_solid_matches, total_base_features) + precision = stats.calculate_precision( + total_solid_matches, total_target_features + ) + recall = stats.calculate_recall( + total_solid_matches, total_base_features + ) parity_score = stats.calculate_f1(precision, recall) global_stats = ( @@ -507,7 +541,10 @@ def main(): output_path.write_text(result.master_content) print(f"Successfully wrote raw match report to {output_path}") except Exception as e: - print(f"Error writing raw report to {output_path}: {e}", file=sys.stderr) + print( + f"Error writing raw report to {output_path}: {e}", + file=sys.stderr, + ) sys.exit(1) return @@ -520,7 +557,7 @@ def main(): # Write module files for filename, content in result.module_files.items(): # Replace placeholder for master report link - # The link is relative from module dir (subplot) to master report (parent dir) + # The link is relative from module dir to master report # So name is enough. 
final_content = content.replace("{master_report}", output_path.name) (modules_dir / filename).write_text(final_content) @@ -528,7 +565,9 @@ def main(): # Replace placeholder in Master Report # We assume master report is in parent of modules_dir # modules_dir relative to master report is just the dir name - master_report = result.master_content.replace("{modules_dir}", modules_dir_name) + master_report = result.master_content.replace( + "{modules_dir}", modules_dir_name + ) else: master_report = result.master_content.replace("{modules_dir}", ".") diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index c44245e..1f78ed2 100644 --- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -256,7 +256,10 @@ def test_match_registries_raw(self): result = matcher.match_registries(base, target, 0.9, report_type="raw") csv_content = result.master_content - expected_header = "base_namespace,base_member_of,base_name,target_namespace,target_member_of,target_name,type,score" + expected_header = ( + "base_namespace,base_member_of,base_name,target_namespace," + "target_member_of,target_name,type,score" + ) self.assertIn(expected_header, csv_content) # Check for solid match line diff --git a/test/adk/scope/utils/test_stats.py b/test/adk/scope/utils/test_stats.py index 62dbef1..36ea0b0 100644 --- a/test/adk/scope/utils/test_stats.py +++ b/test/adk/scope/utils/test_stats.py @@ -18,7 +18,7 @@ def test_f1(self): self.assertAlmostEqual(stats.calculate_f1(1.0, 1.0), 1.0) self.assertAlmostEqual(stats.calculate_f1(0.0, 1.0), 0.0) self.assertAlmostEqual(stats.calculate_f1(0.0, 0.0), 0.0) - self.assertAlmostEqual(stats.calculate_f1(0.75, 0.5), 0.6) # 2*(0.375)/1.25 = 0.75/1.25 = 0.6 + self.assertAlmostEqual(stats.calculate_f1(0.75, 0.5), 0.6) if __name__ == "__main__": From a70516c121bc6e3369944a3a7dac73ce5de33a4b Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 14:57:04 -0800 Subject: [PATCH 08/21] 
Fixed exclude list. --- config.yaml | 38 ++++++++++++++++++++-- src/google/adk/scope/extractors/extract.py | 8 +++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/config.yaml b/config.yaml index 5148c9a..37320d0 100644 --- a/config.yaml +++ b/config.yaml @@ -1,9 +1,43 @@ python: exclude: - examples - + - a2a + # - agents + - apps + - artifacts + - auth + - cli + - code_executors + - dependencies + - errors + - evaluation + - events + - features + - flows + - memory + - models + - optimization + - planners + - platform + - plugins + - runners + - sessions + - telemetry + - tools + - utils typescript: exclude: # - agents - + - artifacts + - auth + - code_executors + - events + - examples + - memory + - models + - plugins + - sessions + - telemetry + - tools + - utils diff --git a/src/google/adk/scope/extractors/extract.py b/src/google/adk/scope/extractors/extract.py index 0bfe5d2..b479171 100644 --- a/src/google/adk/scope/extractors/extract.py +++ b/src/google/adk/scope/extractors/extract.py @@ -147,7 +147,10 @@ def main(): logger.info("Mode: Repo extraction: %s", input_path) - config = get_config(input_path) + # Priority: Config in CWD > Config in Input Repo + config = get_config(Path(".")) + if not config: + config = get_config(input_path) exclude_list = set(config.get(args.language, {}).get("exclude", [])) search_dir = get_search_dir(input_path, args.language) @@ -195,9 +198,8 @@ def main(): logger.error("Failed to create output directory %s: %s", output_dir, e) sys.exit(1) - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") prefix = "py" if args.language in {"python", "py"} else "ts" - base_filename = f"{prefix}_{timestamp}" + base_filename = f"{prefix}" if _JSON_OUTPUT: # 1. 
JSON Output From 6665c49e845dc92d8033f667fd64cdf99c702c7f Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 15:53:53 -0800 Subject: [PATCH 09/21] matching updates --- config.yaml | 6 +- src/google/adk/scope/matcher/matcher.py | 78 +++++++++++++++++++------ 2 files changed, 63 insertions(+), 21 deletions(-) diff --git a/config.yaml b/config.yaml index 37320d0..ce9ca6e 100644 --- a/config.yaml +++ b/config.yaml @@ -2,7 +2,7 @@ python: exclude: - examples - a2a - # - agents + - agents - apps - artifacts - auth @@ -16,11 +16,9 @@ python: - flows - memory - models - - optimization - planners - platform - plugins - - runners - sessions - telemetry - tools @@ -28,7 +26,7 @@ python: typescript: exclude: - # - agents + - agents - artifacts - auth - code_executors diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 9844a11..75b6fb3 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -109,6 +109,21 @@ def match_features( return matches +def get_language_code(language_name: str) -> str: + """Returns a short code for the language.""" + name = language_name.upper() + if name == "PYTHON": + return "py" + elif name == "TYPESCRIPT": + return "ts" + elif name == "JAVA": + return "java" + elif name == "GOLANG": + return "go" + else: + return name[:2].lower() + + def match_registries( base_registry: features_pb2.FeatureRegistry, target_registry: features_pb2.FeatureRegistry, @@ -128,9 +143,12 @@ def match_registries( key = f.normalized_namespace or f.namespace or "Unknown Module" features_target[key].append(f) - all_modules = sorted( - set(features_base.keys()) | set(features_target.keys()) - ) + if report_type == "directional": + all_modules = sorted(features_base.keys()) + else: + all_modules = sorted( + set(features_base.keys()) | set(features_target.keys()) + ) # Global Stats using Set logic for Jaccard/F1 # We will accumulate counts as we process modules @@ -266,12 
+284,20 @@ def escape_csv(s): master_lines.append("") master_lines.append("## Module Summary") - master_lines.append( - "| Module | Features (Base) | Score | Status | Details |" - ) - master_lines.append("|---|---|---|---|---|") + header = "| Module | Features (Base) | Score | Status | Details |" + divider = "|---|---|---|---|---|" + if report_type == "symmetric": + header = "| ADK | Module | Features (Base) | Score | Status | Details |" + divider = "|---|---|---|---|---|---|" + + master_lines.append(header) + master_lines.append(divider) module_files = {} + module_rows = [] + + base_code = get_language_code(base_registry.language) + target_code = get_language_code(target_registry.language) for module in all_modules: base_list = features_base[module] @@ -316,17 +342,30 @@ def escape_csv(s): module_safe_name = module.replace(".", "_") module_filename = f"{module_safe_name}.md" + # Determine ADK Value (Symmetric Only) + row_content = "" + if report_type == "symmetric": + adk_parts = [] + if mod_base_count > 0: + adk_parts.append(base_code) + if mod_target_count > 0: + adk_parts.append(target_code) + adk_value = ", ".join(adk_parts) + + row_content = ( + f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} | " + f"{status_icon} | " + f"[View Details]({{modules_dir}}/{module_filename}) |" + ) + else: + row_content = ( + f"| `{module}` | {mod_base_count} | {mod_score:.2%} | " + f"{status_icon} | " + f"[View Details]({{modules_dir}}/{module_filename}) |" + ) + # Add to Master - # Note: We assume the caller places module files in a subdirectory - # named 'modules' or similar. - # Let's standardize on `{stem}_modules/` where stem is output filename. - # But here we don't know the stem. - # We will use a placeholder `{modules_dir}` and replace it in main. 
- master_lines.append( - f"| `{module}` | {mod_base_count} | {mod_score:.2%} | " - f"{status_icon} | " - f"[View Details]({{modules_dir}}/{module_filename}) |" - ) + module_rows.append((mod_score, row_content)) if report_type == "symmetric": mod_total_features = ( @@ -452,6 +491,11 @@ def escape_csv(s): # We flag missing-in-target features only. module_files[module_filename] = "\n".join(mod_lines).strip() + # Sort modules by score descending + module_rows.sort(key=lambda x: x[0], reverse=True) + for _, row in module_rows: + master_lines.append(row) + # Calculate Global Score if report_type == "symmetric": union_size = ( From 2ab268be798d3286d45dd72fa7b8507d3949db94 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Wed, 4 Feb 2026 16:19:39 -0800 Subject: [PATCH 10/21] Fixed extractor_ts and its parameter extraction --- .../adk/scope/extractors/converter_ts.py | 122 +++++++++++------- src/google/adk/scope/matcher/matcher.py | 28 ++++ 2 files changed, 106 insertions(+), 44 deletions(-) diff --git a/src/google/adk/scope/extractors/converter_ts.py b/src/google/adk/scope/extractors/converter_ts.py index 8dac2b5..8c20f45 100644 --- a/src/google/adk/scope/extractors/converter_ts.py +++ b/src/google/adk/scope/extractors/converter_ts.py @@ -360,18 +360,23 @@ def _extract_params( "optional_parameter", "rest_parameter", ): - xml_params = self._process_param_node(child) + xml_params = self._process_param_node(child, param_docs) for p in xml_params: if ( - p.original_name in param_docs + not p.description + and p.original_name in param_docs and param_docs[p.original_name] ): p.description = param_docs[p.original_name] params.append(p) return params - def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: + def _process_param_node( + self, node: Node, param_docs: dict = None + ) -> List[feature_pb2.Param]: # returns a LIST of Params to handle destructuring + if param_docs is None: + param_docs = {} # 1. 
Name extraction pattern_node = node.child_by_field_name("pattern") @@ -384,6 +389,9 @@ def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: # 2. Type extraction type_node = node.child_by_field_name("type") + # Parse type map if available + type_map = self._extract_type_map(type_node) + if pattern_node and pattern_node.type == "object_pattern": # Handle destructuring: { a, b }: { a: string, b: number } # If type is NOT an inline object literal, we might still want @@ -427,47 +435,7 @@ def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: extracted_params = [] # Parse type map if available - type_map = {} # name -> (type_str, optional_bool) - if type_node: - # Check for object_type node inside type_node - # type_annotation -> object_type - object_type_node = None - for child in type_node.children: - if child.type == "object_type": - object_type_node = child - break - - if object_type_node: - for child in object_type_node.children: - if child.type == "property_signature": - # name: property_identifier - # type: type_annotation - # optional? - prop_name_node = child.child_by_field_name("name") - prop_type_node = child.child_by_field_name("type") - - if prop_name_node: - p_name = prop_name_node.text.decode("utf-8") - p_type = "" - if prop_type_node: - p_type = prop_type_node.text.decode("utf-8") - if p_type.startswith(":"): - p_type = p_type[1:].strip() - - # Optionality check: check for '?' node or - # if literal text has ? - # child text might be "hint?:" - # or checking for optional node - p_optional = False - for sub in child.children: - if ( - sub.type == "?" - or sub.text.decode("utf-8") == "?" 
- ): - p_optional = True - break - - type_map[p_name] = (p_type, p_optional) + # type_map is already extracted above # Iterate pattern properties for child in pattern_node.children: @@ -550,6 +518,28 @@ def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: if not name: return [] + # Check if type is an object literal AND we have type_map populated + # This means it's `param: { a: string }` style + is_literal_type = raw_type and raw_type.strip().startswith("{") + if is_literal_type and type_map: + # We want to explode this into multiple parameters + # defined by type_map. + # The original param name is `name` (e.g. "params") + # We look for descriptions in param_docs using "params.fieldName" + exploded_params = [] + for prop_name, (p_type, p_opt) in type_map.items(): + p = self._create_single_param( + prop_name, + [p_type], + p_opt or node.type == "optional_parameter", + ) + # Try to find description: "params.prop_name" + doc_key = f"{name}.{prop_name}" + if doc_key in param_docs: + p.description = param_docs[doc_key] + exploded_params.append(p) + return exploded_params + return [ self._create_single_param( name, @@ -715,3 +705,47 @@ def _extract_maturity(self, node: Node) -> feature_pb2.Feature.Maturity: if "beta" in decorators: return feature_pb2.Feature.BETA return None + + def _extract_type_map(self, type_node: Node) -> dict: + type_map = {} # name -> (type_str, optional_bool) + if type_node: + # Check for object_type node inside type_node + # type_annotation -> object_type + object_type_node = None + for child in type_node.children: + if child.type == "object_type": + object_type_node = child + break + + if object_type_node: + for child in object_type_node.children: + if child.type == "property_signature": + # name: property_identifier + # type: type_annotation + # optional? 
+ prop_name_node = child.child_by_field_name("name") + prop_type_node = child.child_by_field_name("type") + + if prop_name_node: + p_name = prop_name_node.text.decode("utf-8") + p_type = "" + if prop_type_node: + p_type = prop_type_node.text.decode("utf-8") + if p_type.startswith(":"): + p_type = p_type[1:].strip() + + # Optionality check: check for '?' node or + # if literal text has ? + # child text might be "hint?:" + # or checking for optional node + p_optional = False + for sub in child.children: + if ( + sub.type == "?" + or sub.text.decode("utf-8") == "?" + ): + p_optional = True + break + + type_map[p_name] = (p_type, p_optional) + return type_map diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 75b6fb3..762b30b 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -143,6 +143,34 @@ def match_registries( key = f.normalized_namespace or f.namespace or "Unknown Module" features_target[key].append(f) + # Fuzzy Remapping Logic + from jellyfish import jaro_winkler_similarity + + base_namespaces = set(features_base.keys()) + target_remap = {} # old_key -> new_key + + # Identify remappings + for t_ns in list(features_target.keys()): + if t_ns in base_namespaces: + target_remap[t_ns] = t_ns + continue + + # Find best fuzzy match + best_match = None + best_score = 0.0 + for b_ns in base_namespaces: + score = jaro_winkler_similarity(t_ns, b_ns) + if score > best_score: + best_score = score + best_match = b_ns + + if best_score > 0.8 and best_match: + target_remap[t_ns] = best_match + # Move features to the best match + features_target[best_match].extend(features_target.pop(t_ns)) + else: + target_remap[t_ns] = t_ns + if report_type == "directional": all_modules = sorted(features_base.keys()) else: From 0fd9341af42ce3a26d1956aa6cd5fe03049f3a9c Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Thu, 5 Feb 2026 11:14:03 -0800 Subject: [PATCH 11/21] Tweaked scoring 
algorithm. --- match.sh | 7 +- playground.ipynb | 385 ++++++++++++++++++ .../adk/scope/extractors/converter_ts.py | 2 +- src/google/adk/scope/matcher/matcher.py | 18 +- src/google/adk/scope/utils/args.py | 14 + src/google/adk/scope/utils/similarity.py | 121 +++++- src/google/adk/scope/utils/strings.py | 11 +- test/google/adk/scope/utils/test_args.py | 53 +++ 8 files changed, 583 insertions(+), 28 deletions(-) create mode 100644 playground.ipynb create mode 100644 test/google/adk/scope/utils/test_args.py diff --git a/match.sh b/match.sh index 91c9acc..f56de67 100755 --- a/match.sh +++ b/match.sh @@ -11,4 +11,9 @@ export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" # Run the python matcher # Pass all arguments to the python script -python3 "${SCRIPT_DIR}/src/google/adk/scope/matcher/matcher.py" "$@" +VERBOSE="" +if [[ "$1" == "-v" ]]; then + VERBOSE="--verbose" + shift +fi +python3 "${SCRIPT_DIR}/src/google/adk/scope/matcher/matcher.py" $VERBOSE "$@" diff --git a/playground.ipynb b/playground.ipynb new file mode 100644 index 0000000..9381a7b --- /dev/null +++ b/playground.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "id": "8f748152", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# 1. Add 'src' to your Python path so we can resolve the 'google.adk.scope' imports\n", + "sys.path.insert(0, os.path.abspath('src'))\n", + "\n", + "# 2. (Optional but recommended) Auto-reload modules when you change the .py files\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# 3. 
Import the required files\n", + "from google.adk.scope import features_pb2\n", + "from google.adk.scope.utils import similarity\n", + "from google.protobuf import text_format\n", + "import logging\n", + "\n", + "logging.basicConfig(level=logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "bb16a48a-5cda-4ec6-a10e-c8a2f2828700", + "metadata": {}, + "outputs": [], + "source": [ + "def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry:\n", + " \"\"\"Reads a FeatureRegistry from a text proto file.\"\"\"\n", + " registry = features_pb2.FeatureRegistry()\n", + " with open(file_path, \"rb\") as f:\n", + " text_format.Parse(f.read(), registry)\n", + " return registry" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "f3361359-a887-4537-b22b-fe8fec24e13a", + "metadata": {}, + "outputs": [], + "source": [ + "py_features = read_feature_registry('output/py.txtpb')\n", + "ts_features = read_feature_registry('output/ts.txtpb')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "2208e1dd-6647-4796-b81c-58ab7406e4c9", + "metadata": {}, + "outputs": [], + "source": [ + "py_run_async = py_features.features[0]\n", + "py_run = py_features.features[9]\n", + "ts_run_async = ts_features.features[3]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "3358d489-92bb-411a-9ec8-0f596be08c9f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:google.adk.scope.utils.similarity:Initializing SimilarityScorer with alpha=0.8 and weights={'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n" + ] + } + ], + "source": [ + "scorer = similarity.SimilarityScorer()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "a1b5b03b-8a0c-4614-ab4f-89140fa872d0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"DEBUG:google.adk.scope.utils.similarity:Comparing 'run_async' and 'run_async'\n", + "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.7457, Early exit threshold: 0.6000\n", + "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 6 vs 5 parameters\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'state_delta': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'user_id': 0.6033 (name:0.41, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' 
vs 'session_id': 0.6880 (name:0.58, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'new_message': 0.2226 (name:0.45, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'state_delta': 0.2950 (name:0.39, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'run_config': 0.3474 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.2446 (name:0.49, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.2523 (name:0.50, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'new_message': 0.9000 (name:1.00, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.7879 (name:0.58, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.7621 (name:0.52, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'user_id': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'session_id': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'new_message': 0.2879 (name:0.58, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'state_delta': 0.6000 (name:1.00, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'run_config': 0.2985 (name:0.40, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 
'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.5000, final parameter score: 0.8182\n", + "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.3000 (type match: 0.0, async match: 1.0)\n", + "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8181818181818182), 'return_type': 0.3}\n", + "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.8984\n" + ] + }, + { + "data": { + "text/plain": [ + "np.float64(0.8984415584415584)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scorer.get_similarity_score(py_run_async, ts_run_async)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "341ad037-4bdb-481a-8a45-b3e1a48f54bc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:google.adk.scope.utils.similarity:Comparing 'run' and 'run_async'\n", + "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. 
Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.6990, Early exit threshold: 0.6000\n", + "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 4 vs 5 parameters\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'state_delta': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 
'new_message': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.6879 (name:0.58, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.0000, final parameter score: 0.8889\n", + "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.7000 (type match: 1.0, async match: 0.0)\n", + "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8888888888888888), 'return_type': 0.7}\n", + "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.9024\n" + ] + }, + { + "data": { + "text/plain": [ + "np.float64(0.9023809523809523)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scorer.get_similarity_score(py_run, ts_run_async)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "783ee277-0893-46e3-ae9c-72a8df82cb8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "original_name: \"run_async\"\n", + "normalized_name: \"run_async\"\n", + "description: \"Main entry method to run 
the agent in this runner.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. The async generator will only finish iterating after event\\n compaction is complete. However, this does not block new `run_async`\\n calls for subsequent user queries, which can be started concurrently.\"\n", + "member_of: \"Runner\"\n", + "normalized_member_of: \"runner\"\n", + "type: INSTANCE_METHOD\n", + "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", + "namespace: \"runners\"\n", + "normalized_namespace: \"runners\"\n", + "parameters {\n", + " original_name: \"user_id\"\n", + " normalized_name: \"user_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The user ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"session_id\"\n", + " normalized_name: \"session_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The session ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"invocation_id\"\n", + " normalized_name: \"invocation_id\"\n", + " original_types: \"Optional[str]\"\n", + " normalized_types: STRING\n", + " description: \"The invocation ID of the session, set this to resume an interrupted invocation.\"\n", + " is_optional: true\n", + "}\n", + "parameters {\n", + " original_name: \"new_message\"\n", + " normalized_name: \"new_message\"\n", + " original_types: \"Optional[types.Content]\"\n", + " normalized_types: OBJECT\n", + " description: \"A new message to append to the session.\"\n", + " is_optional: true\n", + "}\n", + "parameters {\n", + " original_name: \"state_delta\"\n", + " normalized_name: \"state_delta\"\n", + " original_types: \"Optional[dict[str, Any]]\"\n", + " normalized_types: MAP\n", + " description: \"Optional state changes to apply to the session.\"\n", + " is_optional: true\n", + "}\n", + 
"parameters {\n", + " original_name: \"run_config\"\n", + " normalized_name: \"run_config\"\n", + " original_types: \"Optional[RunConfig]\"\n", + " normalized_types: OBJECT\n", + " description: \"The run config for the agent.\"\n", + " is_optional: true\n", + "}\n", + "original_return_types: \"AsyncGenerator[Event, None]\"\n", + "normalized_return_types: \"OBJECT\"\n", + "normalized_return_types: \"null\"\n", + "async: true" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "py_run_async" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "861b3583-49d8-4b29-9f39-144aabe82948", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "original_name: \"run\"\n", + "normalized_name: \"run\"\n", + "description: \"Runs the agent.\\n\\n NOTE:\\n This sync interface is only for local testing and convenience purpose.\\n Consider using `run_async` for production usage.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. 
The generator will only finish iterating after event\\n compaction is complete.\"\n", + "member_of: \"Runner\"\n", + "normalized_member_of: \"runner\"\n", + "type: INSTANCE_METHOD\n", + "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", + "namespace: \"runners\"\n", + "normalized_namespace: \"runners\"\n", + "parameters {\n", + " original_name: \"user_id\"\n", + " normalized_name: \"user_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The user ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"session_id\"\n", + " normalized_name: \"session_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The session ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"new_message\"\n", + " normalized_name: \"new_message\"\n", + " original_types: \"types.Content\"\n", + " normalized_types: OBJECT\n", + " description: \"A new message to append to the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"run_config\"\n", + " normalized_name: \"run_config\"\n", + " original_types: \"Optional[RunConfig]\"\n", + " normalized_types: OBJECT\n", + " description: \"The run config for the agent.\"\n", + " is_optional: true\n", + "}\n", + "original_return_types: \"Generator[Event, None, None]\"\n", + "normalized_return_types: \"OBJECT\"" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "py_run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "941ec275-7405-404b-a31c-d91d4d3671bd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/google/adk/scope/extractors/converter_ts.py b/src/google/adk/scope/extractors/converter_ts.py index 8c20f45..291f09d 100644 --- a/src/google/adk/scope/extractors/converter_ts.py +++ b/src/google/adk/scope/extractors/converter_ts.py @@ -619,7 +619,7 @@ def _normalize_ts_type(self, t: str) -> List[str]: if base == "Promise": return self._normalize_ts_type(inner) - if base in ("Array", "ReadonlyArray"): + if base in ("Array", "ReadonlyArray", "Generator", "AsyncGenerator", "Iterable", "Iterator", "AsyncIterable", "AsyncIterator"): return ["LIST"] if base == "Map": return ["MAP"] diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 762b30b..d2cf45f 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -1,4 +1,7 @@ import argparse +import logging + +from google.adk.scope.utils import args as adk_args import dataclasses import sys from pathlib import Path @@ -591,13 +594,15 @@ def main(): default="symmetric", help="Type of gap report to generate (symmetric, directional, or raw).", ) + adk_args.add_verbose_argument(parser) args = parser.parse_args() + adk_args.configure_logging(args) try: base_registry = read_feature_registry(args.base) target_registry = read_feature_registry(args.target) except Exception as e: - print(f"Error reading feature registries: {e}", file=sys.stderr) + logging.error(f"Error reading feature registries: {e}") sys.exit(1) result = match_registries( @@ -611,12 +616,9 @@ def main(): try: output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(result.master_content) - print(f"Successfully wrote raw match report to {output_path}") + logging.info(f"Successfully wrote raw match report to {output_path}") except Exception as e: - print( - f"Error writing raw report to {output_path}: {e}", - 
file=sys.stderr, - ) + logging.error(f"Error writing raw report to {output_path}: {e}") sys.exit(1) return @@ -646,9 +648,9 @@ def main(): try: output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(master_report) - print(f"Successfully wrote match report to {output_path}") + logging.info(f"Successfully wrote match report to {output_path}") except Exception as e: - print(f"Error writing report to {output_path}: {e}", file=sys.stderr) + logging.error(f"Error writing report to {output_path}: {e}") sys.exit(1) diff --git a/src/google/adk/scope/utils/args.py b/src/google/adk/scope/utils/args.py index d544e0d..745c99e 100644 --- a/src/google/adk/scope/utils/args.py +++ b/src/google/adk/scope/utils/args.py @@ -3,9 +3,23 @@ """ import argparse +import logging from pathlib import Path +def add_verbose_argument(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "-v", "--verbose", help="show all logs", action="store_true" + ) + + +def configure_logging(args: argparse.Namespace) -> None: + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + def parse_args() -> argparse.Namespace: """Parse command line arguments for the Python extractor. diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index e145b1f..0bbdaaa 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -1,4 +1,5 @@ from typing import Optional +import logging from jellyfish import jaro_winkler_similarity import numpy as np @@ -6,6 +7,8 @@ from google.adk.scope import features_pb2 as features_pb +logger = logging.getLogger(__name__) + # Default weights for the similarity calculation. 
DEFAULT_SIMILARITY_WEIGHTS = { "name": 0.30, @@ -22,6 +25,10 @@ def __init__( self, weights: Optional[dict[str, float]] = None, alpha: float = 0.8 ): self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS + logger.debug( + f"Initializing SimilarityScorer with alpha={alpha} and " + f"weights={self.weights}" + ) assert "name" in self.weights assert "member_of" in self.weights assert "namespace" in self.weights @@ -33,6 +40,52 @@ def __init__( + self.weights["namespace"] ) + def _fuzzy_type_match(self, types1: list, types2: list) -> float: + """Calculates a fuzzy similarity score between two lists of types.""" + def _to_str_set(type_list): + res = set() + for t in type_list: + if isinstance(t, int): + # It's a ParamType enum + try: + res.add(features_pb.ParamType.Name(t)) + except ValueError: + res.add(str(t)) + else: + res.add(str(t).upper()) + return res + + set1 = _to_str_set(types1) + set2 = _to_str_set(types2) + + if not set1 and not set2: + return 1.0 + if not set1 or not set2: + return 0.0 + + if set1 == set2: + return 1.0 + + # Check the best match between any pair of types + best_score = 0.0 + for t1 in set1: + for t2 in set2: + if t1 == t2: + score = 1.0 + elif {t1, t2} == {"MAP", "OBJECT"} or {t1, t2} == {"MAP", "ANY"}: + score = 0.4 + elif t1 in ("UNKNOWN", "ANY") or t2 in ("UNKNOWN", "ANY"): + score = 0.3 + elif t1 == "OBJECT" or t2 == "OBJECT": + score = 0.2 + else: + score = 0.0 + + if score > best_score: + best_score = score + + return best_score + def _calculate_param_similarity( self, param1: features_pb.Param, param2: features_pb.Param ) -> float: @@ -40,21 +93,35 @@ def _calculate_param_similarity( s_p_name = jaro_winkler_similarity( param1.normalized_name, param2.normalized_name ) - s_p_type = ( - 1.0 if param1.normalized_types == param2.normalized_types else 0.0 + s_p_type = self._fuzzy_type_match( + list(param1.normalized_types), list(param2.normalized_types) ) s_p_opt = 1.0 if param1.is_optional == param2.is_optional else 0.0 # Weights for 
parameter components - return (0.5 * s_p_name) + (0.4 * s_p_type) + (0.1 * s_p_opt) + score = (0.5 * s_p_name) + (0.4 * s_p_type) + (0.1 * s_p_opt) + logger.debug( + f"Param '{param1.normalized_name}' vs '{param2.normalized_name}': " + f"{score:.4f} (name:{s_p_name:.2f}, type:{s_p_type:.2f}, " + f"opt:{s_p_opt:.2f})" + ) + return score def _calculate_parameters_score( self, params1: list[features_pb.Param], params2: list[features_pb.Param] ) -> float: """Calculates aggregated similarity for two lists of parameters.""" + logger.debug( + f"Calculating parameter score for {len(params1)} " + f"vs {len(params2)} parameters" + ) if not params1 and not params2: + logger.debug("Both parameter lists empty. Returning 1.0") return 1.0 if not params1 or not params2: + logger.debug( + "One parameter list empty while other is not. Returning 0.0" + ) return 0.0 similarity_matrix = np.zeros((len(params1), len(params2))) @@ -73,26 +140,38 @@ def _calculate_parameters_score( if total_params == 0: return 1.0 - return (2 * total_match_score) / total_params + score = (2 * total_match_score) / total_params + logger.debug( + f"Matrix matched total score: {total_match_score:.4f}, " + f"final parameter score: {score:.4f}" + ) + return score def _calculate_return_type_score( self, f1: features_pb.Feature, f2: features_pb.Feature ) -> float: """Calculates the similarity score for the return types.""" - s_type_match = ( - 1.0 - if f1.normalized_return_types == f2.normalized_return_types - else 0.0 + s_type_match = self._fuzzy_type_match( + list(f1.normalized_return_types), list(f2.normalized_return_types) ) s_async_match = ( 1.0 if getattr(f1, "async") == getattr(f2, "async") else 0.0 ) - return (0.7 * s_type_match) + (0.3 * s_async_match) + score = (0.7 * s_type_match) + (0.3 * s_async_match) + logger.debug( + f"Return type score: {score:.4f} (type match: " + f"{s_type_match}, async match: {s_async_match})" + ) + return score def get_similarity_score( self, feature1: features_pb.Feature, 
feature2: features_pb.Feature ) -> float: """Computes the overall similarity score between two features.""" + logger.debug( + f"Comparing '{feature1.normalized_name}' and " + f"'{feature2.normalized_name}'" + ) # 1. Type Compatibility and Dynamic Weights t1, t2 = feature1.type, feature2.type current_weights = self.weights.copy() @@ -101,14 +180,27 @@ def get_similarity_score( if t1 == FeatureType.CONSTRUCTOR and t2 == FeatureType.CONSTRUCTOR: current_weights["member_of"] += current_weights["name"] current_weights["name"] = 0.0 + logger.debug( + "Both CONSTRUCTOR. " + f"Adjusted weights: {current_weights}" + ) elif t1 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD) and \ t2 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): current_weights["member_of"] /= 2.0 current_weights["name"] += current_weights["member_of"] + logger.debug( + "Both FUNCTION/CLASS_METHOD. " + f"Adjusted weights: {current_weights}" + ) elif t1 == FeatureType.INSTANCE_METHOD and \ t2 == FeatureType.INSTANCE_METHOD: + logger.debug( + "Both INSTANCE_METHOD. " + f"Using default weights: {current_weights}" + ) pass # Keep default weights else: + logger.debug(f"Incompatible types: {t1} vs {t2}. Returning 0.0") return 0.0 # Fast out for incompatible types # 2. Similarity Calculations @@ -123,6 +215,7 @@ def get_similarity_score( feature1.normalized_namespace, feature2.normalized_namespace ), } + logger.debug(f"Preliminary scores: {scores}") # 3. 
Early Exit Check (using dynamic weights) preliminary_score = ( @@ -136,8 +229,16 @@ def get_similarity_score( + current_weights["member_of"] + current_weights["namespace"] ) + logger.debug( + f"Preliminary score: {preliminary_score:.4f}, " + f"Early exit threshold: {early_exit_threshold:.4f}" + ) if preliminary_score < early_exit_threshold: + logger.debug( + f"Early exit triggered ({preliminary_score:.4f} < " + f"{early_exit_threshold:.4f})" + ) return preliminary_score scores["parameters"] = self._calculate_parameters_score( @@ -150,4 +251,6 @@ def get_similarity_score( final_score = sum( scores[key] * current_weights[key] for key in current_weights ) + logger.debug(f"Final scores including params & return: {scores}") + logger.debug(f"Final weighted similarity score: {final_score:.4f}") return final_score diff --git a/src/google/adk/scope/utils/strings.py b/src/google/adk/scope/utils/strings.py index e9290ea..ae881fd 100644 --- a/src/google/adk/scope/utils/strings.py +++ b/src/google/adk/scope/utils/strings.py @@ -48,14 +48,7 @@ def normalize_type_complex(type_name: str) -> list[str]: result.append("null") return result - # Handle AsyncGenerator[A, B] -> A | B - if type_name.startswith("AsyncGenerator[") and type_name.endswith("]"): - inner = type_name[15:-1] - parts = _split_generics(inner) - result = [] - for p in parts: - result.extend(normalize_type_complex(p)) - return _unique(result) + # AsyncGenerator is handled by base normalizer returning LIST # Handle tuple[A, B] -> [A, B] if type_name.lower().startswith("tuple[") and type_name.endswith("]"): @@ -79,7 +72,7 @@ def _simple_normalize(t: str) -> str: t = t.lower().strip() if t == "none": return "null" - if t in ("list", "array", "slice", "vector"): + if t in ("list", "array", "slice", "vector", "generator", "asyncgenerator", "iterable", "iterator", "asynciterable", "asynciterator"): return "LIST" if t in ("set",): return "SET" diff --git a/test/google/adk/scope/utils/test_args.py 
b/test/google/adk/scope/utils/test_args.py new file mode 100644 index 0000000..1aaf315 --- /dev/null +++ b/test/google/adk/scope/utils/test_args.py @@ -0,0 +1,53 @@ +import argparse +import logging +import unittest +from unittest import mock + +from google.adk.scope.utils import args as adk_args + + +class ArgsTest(unittest.TestCase): + + def test_add_verbose_argument(self): + parser = argparse.ArgumentParser() + adk_args.add_verbose_argument(parser) + args = parser.parse_args(['--verbose']) + self.assertTrue(args.verbose) + + @mock.patch('logging.basicConfig') + def test_configure_logging_verbose(self, mock_basic_config): + args = argparse.Namespace(verbose=True) + adk_args.configure_logging(args) + mock_basic_config.assert_called_once_with(level=logging.DEBUG) + + @mock.patch('logging.basicConfig') + def test_configure_logging_default(self, mock_basic_config): + args = argparse.Namespace(verbose=False) + adk_args.configure_logging(args) + mock_basic_config.assert_called_once_with(level=logging.INFO) + + @mock.patch('argparse.ArgumentParser.parse_args') + def test_parse_args_python(self, mock_parse_args): + mock_parse_args.return_value = argparse.Namespace( + language='py', + input_file='test.py', + output='out', + verbose=False + ) + parsed_args = adk_args.parse_args() + self.assertEqual(parsed_args.language, 'python') + + @mock.patch('argparse.ArgumentParser.parse_args') + def test_parse_args_typescript(self, mock_parse_args): + mock_parse_args.return_value = argparse.Namespace( + language='ts', + input_file='test.ts', + output='out', + verbose=False + ) + parsed_args = adk_args.parse_args() + self.assertEqual(parsed_args.language, 'typescript') + + +if __name__ == '__main__': + unittest.main() From a21de59e2d6ab90dbd32882ff72e3f1c21daed28 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Thu, 5 Feb 2026 11:58:59 -0800 Subject: [PATCH 12/21] refactored normalizers. 
--- .../adk/scope/extractors/converter_py.py | 12 +- .../adk/scope/extractors/converter_ts.py | 11 +- src/google/adk/scope/extractors/extract.py | 1 - src/google/adk/scope/utils/normalizer.py | 175 ++++++++++++++++++ src/google/adk/scope/utils/strings.py | 123 ------------ .../adk/scope/extractors/test_converter_py.py | 2 +- .../adk/scope/extractors/test_converter_ts.py | 43 +---- test/adk/scope/extractors/test_extract.py | 26 +-- test/adk/scope/utils/test_args.py | 78 +++++--- test/adk/scope/utils/test_normalizer.py | 75 ++++++++ test/adk/scope/utils/test_strings.py | 124 ------------- test/google/adk/scope/utils/test_args.py | 53 ------ 12 files changed, 328 insertions(+), 395 deletions(-) create mode 100644 src/google/adk/scope/utils/normalizer.py delete mode 100644 src/google/adk/scope/utils/strings.py create mode 100644 test/adk/scope/utils/test_normalizer.py delete mode 100644 test/adk/scope/utils/test_strings.py delete mode 100644 test/google/adk/scope/utils/test_args.py diff --git a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index 43cda23..a26efa3 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -8,16 +8,16 @@ from tree_sitter import Node -from google.adk.scope.utils.strings import ( - normalize_name, - normalize_type_complex, -) +from google.adk.scope.utils.normalizer import normalize_name, TypeNormalizer + from google.adk.scope import features_pb2 as feature_pb2 logger = logging.getLogger(__name__) class NodeProcessor: + def __init__(self): + self.normalizer = TypeNormalizer() """Process Tree-sitter nodes into Feature objects.""" def process( @@ -360,7 +360,7 @@ def _process_param_node(self, node: Node) -> Optional[feature_pb2.Param]: normalized_strings = [] for t in types: - normalized_strings.extend(normalize_type_complex(t)) + normalized_strings.extend(self.normalizer.normalize(t, 'python')) # Unique normalized_strings = 
sorted(list(set(normalized_strings))) if not normalized_strings: @@ -407,7 +407,7 @@ def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: return_type_node = node.child_by_field_name("return_type") if return_type_node: raw = return_type_node.text.decode("utf-8") - normalized = normalize_type_complex(raw) + normalized = self.normalizer.normalize(raw, 'python') return [raw], normalized return [], [] diff --git a/src/google/adk/scope/extractors/converter_ts.py b/src/google/adk/scope/extractors/converter_ts.py index 291f09d..52a5703 100644 --- a/src/google/adk/scope/extractors/converter_ts.py +++ b/src/google/adk/scope/extractors/converter_ts.py @@ -9,13 +9,16 @@ from tree_sitter import Node -from google.adk.scope.utils.strings import normalize_name +from google.adk.scope.utils.normalizer import normalize_name +from google.adk.scope.utils.normalizer import TypeNormalizer from google.adk.scope import features_pb2 as feature_pb2 logger = logging.getLogger(__name__) class NodeProcessor: + def __init__(self): + self.normalizer = TypeNormalizer() """Process Tree-sitter nodes into Feature objects for TypeScript.""" def process( @@ -553,7 +556,7 @@ def _create_single_param( ) -> feature_pb2.Param: normalized_strings = [] for t in types: - normalized_strings.extend(self._normalize_ts_type(t)) + normalized_strings.extend(self.normalizer.normalize(t, 'typescript')) normalized_strings = sorted(list(set(normalized_strings))) if not normalized_strings: @@ -597,7 +600,7 @@ def _derive_name_from_type(self, type_name: str) -> str: return "obj" - def _normalize_ts_type(self, t: str) -> List[str]: + # Handle fundamental TS types t = t.strip() if not t: @@ -666,7 +669,7 @@ def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: # logically T for async? # Schema says "original_return_types". # normalized usually unwrap? 
- return [raw], self._normalize_ts_type(raw) + return [raw], self.normalizer.normalize(raw, 'typescript') return [], [] def _is_blocking(self, node: Node, return_types: List[str]) -> bool: diff --git a/src/google/adk/scope/extractors/extract.py b/src/google/adk/scope/extractors/extract.py index b479171..a064ba3 100644 --- a/src/google/adk/scope/extractors/extract.py +++ b/src/google/adk/scope/extractors/extract.py @@ -2,7 +2,6 @@ import sys from pathlib import Path -import datetime import yaml from google.protobuf import text_format from google.protobuf.json_format import MessageToJson, MessageToDict diff --git a/src/google/adk/scope/utils/normalizer.py b/src/google/adk/scope/utils/normalizer.py new file mode 100644 index 0000000..1599520 --- /dev/null +++ b/src/google/adk/scope/utils/normalizer.py @@ -0,0 +1,175 @@ +""" +Unified type normalization for ADK Scope. +""" + +import re +from typing import List + +def normalize_name(name: str) -> str: + """Convert name to snake_case (e.g. CamelCase -> camel_case).""" + name = name.replace("-", "_") + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() + + +class TypeNormalizer: + """A language-aware type normalizer.""" + + def normalize(self, type_name: str, language: str) -> List[str]: + """Normalize a type name for a given language.""" + if language == 'python': + return self._normalize_py_type(type_name) + elif language == 'typescript': + return self._normalize_ts_type(type_name) + # Add placeholders for future languages like 'java' and 'go' + # Fallback for unknown languages: only normalize if it's a known simple type, otherwise OBJECT + normalized = self._simple_normalize(type_name) + return [normalized] + + def _normalize_py_type(self, type_name: str) -> list[str]: + """Map raw type to canonical types, handling generics.""" + type_name = type_name.strip() + if not type_name: + return ["OBJECT"] + + # Handle Union[A, B] + if type_name.startswith("Union[") and 
type_name.endswith("]"): + inner = type_name[6:-1] + parts = self._split_generics(inner) + result = [] + for p in parts: + result.extend(self._normalize_py_type(p)) + return self._unique(result) + + # Handle Optional[T] -> T | None + if type_name.startswith("Optional[") and type_name.endswith("]"): + inner = type_name[9:-1] + result = self._normalize_py_type(inner) + if "null" not in result: + result.append("null") + return result + + # Handle tuple[A, B] -> [A, B] + if type_name.lower().startswith("tuple[") and type_name.endswith("]"): + inner = type_name[6:-1] + parts = self._split_generics(inner) + result = [] + for p in parts: + result.extend(self._normalize_py_type(p)) + return self._unique(result) + + # Handle other generics like List[int] -> LIST + if "[" in type_name and type_name.endswith("]"): + base = type_name.split("[", 1)[0] + return [self._simple_normalize(base)] + + return [self._simple_normalize(type_name)] + + def _normalize_ts_type(self, t: str) -> List[str]: + # Handle fundamental TS types + t = t.strip() + if not t: + return ["OBJECT"] + + # A | B + if "|" in t: + parts = t.split("|") + res = [] + for p in parts: + res.extend(self._normalize_ts_type(p)) + return res + + # Generics: Promise, Array + if "<" in t and t.endswith(">"): + base = t.split("<", 1)[0].strip() + # Find matching closing bracket or assumue last + inner = t[t.find("<") + 1 : -1].strip() + + if base == "Promise": + return self._normalize_ts_type(inner) + if base in ("Array", "ReadonlyArray", "Generator", "AsyncGenerator", "Iterable", "Iterator", "AsyncIterable", "AsyncIterator"): + return ["LIST"] + if base == "Map": + return ["MAP"] + if base == "Set": + return ["SET"] + # Fallback for others + return ["OBJECT"] + + t_lower = t.lower() + if t_lower in ("string", "formattedstring", "path"): + return ["STRING"] + if t_lower in ("number", "int", "float", "integer", "double"): + return ["NUMBER"] + if t_lower in ("boolean", "bool"): + return ["BOOLEAN"] + if t_lower == 
"unknown": + return ["UNKNOWN"] + if t_lower in ("any", "object"): + return ["OBJECT"] + if t_lower.endswith("[]"): + return ["LIST"] + if ( + t_lower.startswith("map") + or t_lower.startswith("record") + or "{" in t + ): + return ["MAP"] + if t_lower.startswith("set"): + return ["SET"] + if t_lower == "void": + return [] + + return ["OBJECT"] + + def _simple_normalize(self, t: str) -> str: + t = t.lower().strip() + if t == "none": + return "null" + if t in ("list", "array", "slice", "vector", "generator", "asyncgenerator", "iterable", "iterator", "asynciterable", "asynciterator"): + return "LIST" + if t in ("set",): + return "SET" + if t in ("map", "dictionary", "dict", "record", "hash"): + return "MAP" + if t in ("int", "integer", "long", "int64", "float", "double"): + return "NUMBER" + if t in ("str", "string"): + return "STRING" + if t in ("bool", "boolean"): + return "BOOLEAN" + if t == "any": + return "OBJECT" + if not t: + return "OBJECT" + return "OBJECT" + + def _split_generics(self, s: str) -> list[str]: + """Split string by comma, ignoring nested brackets.""" + parts = [] + balance = 0 + current = [] + for char in s: + if char == "[": + balance += 1 + current.append(char) + elif char == "]": + balance -= 1 + current.append(char) + elif char == "," and balance == 0: + parts.append("".join(current).strip()) + current = [] + else: + current.append(char) + if current: + parts.append("".join(current).strip()) + return parts + + def _unique(self, lst: list[str]) -> list[str]: + seen = set() + out = [] + for x in lst: + if x not in seen: + seen.add(x) + out.append(x) + return out diff --git a/src/google/adk/scope/utils/strings.py b/src/google/adk/scope/utils/strings.py deleted file mode 100644 index ae881fd..0000000 --- a/src/google/adk/scope/utils/strings.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -String utility functions for ADK Scope. -""" - -import re - - -def normalize_name(name: str) -> str: - """Convert name to snake_case (e.g. 
CamelCase -> camel_case).""" - name = name.replace("-", "_") - s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) - return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() - - -def normalize_type(type_name: str) -> str: - """Legacy wrapper for simple normalization (returns single string).""" - # For backward compatibility if needed, else we can migrate all callers - # But signature in Features expects lists now for normalized_types if we - # change converter - # Let's keep this as simple for now or deprecate? - # The new converter will use normalize_type_complex which returns List[str] - # But wait, existing calls might expect single str? - # Actually, converter.py calls: [normalize_type(t) for t in types] - # We should change converter to use new function. - return _simple_normalize(type_name) - - -def normalize_type_complex(type_name: str) -> list[str]: - """Map raw type to canonical types, handling generics.""" - type_name = type_name.strip() - if not type_name: - return ["OBJECT"] - - # Handle Union[A, B] - if type_name.startswith("Union[") and type_name.endswith("]"): - inner = type_name[6:-1] - parts = _split_generics(inner) - result = [] - for p in parts: - result.extend(normalize_type_complex(p)) - return _unique(result) - - # Handle Optional[T] -> T | None - if type_name.startswith("Optional[") and type_name.endswith("]"): - inner = type_name[9:-1] - result = normalize_type_complex(inner) - if "null" not in result: - result.append("null") - return result - - # AsyncGenerator is handled by base normalizer returning LIST - - # Handle tuple[A, B] -> [A, B] - if type_name.lower().startswith("tuple[") and type_name.endswith("]"): - # tuple[...] or Tuple[...] 
- inner = type_name[6:-1] - parts = _split_generics(inner) - result = [] - for p in parts: - result.extend(normalize_type_complex(p)) - return _unique(result) - - # Handle other generics like List[int] -> LIST - if "[" in type_name and type_name.endswith("]"): - base = type_name.split("[", 1)[0] - return [_simple_normalize(base)] - - return [_simple_normalize(type_name)] - - -def _simple_normalize(t: str) -> str: - t = t.lower().strip() - if t == "none": - return "null" - if t in ("list", "array", "slice", "vector", "generator", "asyncgenerator", "iterable", "iterator", "asynciterable", "asynciterator"): - return "LIST" - if t in ("set",): - return "SET" - if t in ("map", "dictionary", "dict", "record", "hash"): - return "MAP" - if t in ("int", "integer", "long", "int64", "float", "double"): - return "NUMBER" - if t in ("str", "string"): - return "STRING" - if t in ("bool", "boolean"): - return "BOOLEAN" - if t == "any": - return "OBJECT" - if not t: - return "OBJECT" - return "OBJECT" - - -def _split_generics(s: str) -> list[str]: - """Split string by comma, ignoring nested brackets.""" - parts = [] - balance = 0 - current = [] - for char in s: - if char == "[": - balance += 1 - current.append(char) - elif char == "]": - balance -= 1 - current.append(char) - elif char == "," and balance == 0: - parts.append("".join(current).strip()) - current = [] - else: - current.append(char) - if current: - parts.append("".join(current).strip()) - return parts - - -def _unique(lst: list[str]) -> list[str]: - seen = set() - out = [] - for x in lst: - if x not in seen: - seen.add(x) - out.append(x) - return out diff --git a/test/adk/scope/extractors/test_converter_py.py b/test/adk/scope/extractors/test_converter_py.py index e9fcb6c..107b45e 100644 --- a/test/adk/scope/extractors/test_converter_py.py +++ b/test/adk/scope/extractors/test_converter_py.py @@ -489,7 +489,7 @@ def node_child(name): result = self.processor.process(node, self.file_path, self.repo_root) 
self.assertEqual(len(result.parameters), 0) - @patch("google.adk.scope.extractors.converter_py.normalize_type_complex") + @patch("google.adk.scope.extractors.converter_py.TypeNormalizer.normalize") def test_param_enum_attribute_error(self, mock_normalize): # Force normalize to return a value not in keys mock_normalize.return_value = ["INVALID_TYPE_NAME"] diff --git a/test/adk/scope/extractors/test_converter_ts.py b/test/adk/scope/extractors/test_converter_ts.py index 6bee5cd..cf67f81 100644 --- a/test/adk/scope/extractors/test_converter_ts.py +++ b/test/adk/scope/extractors/test_converter_ts.py @@ -605,48 +605,7 @@ def test_parameter_modes_rest(self): self.assertEqual(len(result.parameters), 1) self.assertEqual(result.parameters[0].original_name, "args") - def test_type_normalization_extended(self): - # Array -> LIST - # ReadonlyArray -> LIST - # Map -> MAP - # Set -> SET - # string[] -> LIST - # void -> [] - - def test_type(t_str, expected_norm): - norm = self.processor._normalize_ts_type(t_str) - # norm is list of strings (enum names) - # Map enum names to values? Or just check processor internal output? - # _normalize_ts_type returns list of STRINGS like ['LIST'] - self.assertEqual(norm, expected_norm) - - test_type("Array", ["LIST"]) - test_type("ReadonlyArray", ["LIST"]) - test_type("Map", ["MAP"]) - test_type("Set", ["SET"]) - test_type("string[]", ["LIST"]) - test_type("void", []) - test_type("path", ["STRING"]) - test_type("formattedstring", ["STRING"]) - test_type("boolean", ["BOOLEAN"]) - - def test_recursive_types(self): - # Promise> - # Array> - def test_type(t_str, expected_norm): - norm = self.processor._normalize_ts_type(t_str) - self.assertEqual(norm, expected_norm) - - # Promise unwrapping is recursive in logic? 
- # _normalize_ts_type(Promise) -> _normalize_ts_type(T) - # So Promise> -> Promise -> string -> [STRING] - test_type("Promise>", ["STRING"]) - - # Array> -> LIST - test_type("Array>", ["LIST"]) - - # Map> -> MAP - test_type("Map>", ["MAP"]) + def test_abstract_and_interfaces(self): # abstract class method diff --git a/test/adk/scope/extractors/test_extract.py b/test/adk/scope/extractors/test_extract.py index 2744bad..0f89f2d 100644 --- a/test/adk/scope/extractors/test_extract.py +++ b/test/adk/scope/extractors/test_extract.py @@ -5,6 +5,7 @@ import shutil from google.adk.scope.extractors import extract +from google.adk.scope import features_pb2 as feature_pb2 class TestExtractHelpers(unittest.TestCase): @@ -140,16 +141,16 @@ def test_input_file_mode(self): f.touch() self.configure_args(lang="python", input_file=str(f)) - self.mock_py_extractor.extract_features.return_value = [] + self.mock_py_extractor.extract_features.return_value = [feature_pb2.Feature(original_name='dummy')] self.mock_py_extractor.get_version.return_value = "1.0" extract.main() self.mock_py_extractor.extract_features.assert_called() self.assertTrue(self.output_dir.exists()) - self.assertEqual(len(list(self.output_dir.glob("py_*.json"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.yaml"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.txtpb"))), 1) + self.assertTrue((self.output_dir / "py.json").exists()) + self.assertTrue((self.output_dir / "py.yaml").exists()) + self.assertTrue((self.output_dir / "py.txtpb").exists()) def test_input_file_not_found(self): self.configure_args(lang="python", input_file="/non/existent.py") @@ -163,23 +164,24 @@ def test_input_dir_mode(self): self.configure_args(lang="python", input_dir=str(d)) self.mock_py_extractor.find_files.return_value = [d / "a.py"] - self.mock_py_extractor.extract_features.return_value = [] + self.mock_py_extractor.extract_features.return_value = [feature_pb2.Feature(original_name='dummy')] 
self.mock_py_extractor.get_version.return_value = "1.0" extract.main() self.mock_py_extractor.find_files.assert_called_with(d, recursive=False) self.assertTrue(self.output_dir.exists()) - self.assertEqual(len(list(self.output_dir.glob("py_*.json"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.yaml"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.txtpb"))), 1) + self.assertTrue((self.output_dir / "py.json").exists()) + self.assertTrue((self.output_dir / "py.yaml").exists()) + self.assertTrue((self.output_dir / "py.txtpb").exists()) def test_input_repo_mode(self): r = self.root (r / "src").mkdir() self.configure_args(lang="python", input_repo=str(r)) - self.mock_py_extractor.find_files.return_value = [] + self.mock_py_extractor.find_files.return_value = [r / "src" / "a.py"] + self.mock_py_extractor.extract_features.return_value = [feature_pb2.Feature(original_name='dummy')] self.mock_py_extractor.get_version.return_value = "1.0" extract.main() @@ -189,9 +191,9 @@ def test_input_repo_mode(self): r / "src", recursive=True ) self.assertTrue(self.output_dir.exists()) - self.assertEqual(len(list(self.output_dir.glob("py_*.json"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.yaml"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.txtpb"))), 1) + self.assertTrue((self.output_dir / "py.json").exists()) + self.assertTrue((self.output_dir / "py.yaml").exists()) + self.assertTrue((self.output_dir / "py.txtpb").exists()) if __name__ == "__main__": diff --git a/test/adk/scope/utils/test_args.py b/test/adk/scope/utils/test_args.py index 0eef2a7..3db3b4c 100644 --- a/test/adk/scope/utils/test_args.py +++ b/test/adk/scope/utils/test_args.py @@ -1,42 +1,63 @@ -import unittest import argparse -from unittest.mock import patch -from pathlib import Path -from google.adk.scope.utils.args import parse_args - - -class TestArgs(unittest.TestCase): - @patch("argparse.ArgumentParser.parse_args") - def test_parse_args(self, mock_parse): - 
# Setup mock return value - mock_args = argparse.Namespace( - language="py", - input_repo=Path("/tmp/repo"), - output=Path("/tmp/out_dir"), - input_file=None, - input_dir=None, +import logging +import unittest +from unittest import mock + +from google.adk.scope.utils import args as adk_args + + +class ArgsTest(unittest.TestCase): + + def test_add_verbose_argument(self): + parser = argparse.ArgumentParser() + adk_args.add_verbose_argument(parser) + args = parser.parse_args(['--verbose']) + self.assertTrue(args.verbose) + + @mock.patch('logging.basicConfig') + def test_configure_logging_verbose(self, mock_basic_config): + args = argparse.Namespace(verbose=True) + adk_args.configure_logging(args) + mock_basic_config.assert_called_once_with(level=logging.DEBUG) + + @mock.patch('logging.basicConfig') + def test_configure_logging_default(self, mock_basic_config): + args = argparse.Namespace(verbose=False) + adk_args.configure_logging(args) + mock_basic_config.assert_called_once_with(level=logging.INFO) + + @mock.patch('argparse.ArgumentParser.parse_args') + def test_parse_args_python(self, mock_parse_args): + mock_parse_args.return_value = argparse.Namespace( + language='py', + input_file='test.py', + output='out', + verbose=False ) - mock_parse.return_value = mock_args + parsed_args = adk_args.parse_args() + self.assertEqual(parsed_args.language, 'python') - # Call the function - # (arguments are parsed from sys.argv by default, - # but we mocked parse_args) - args = parse_args() + @mock.patch('argparse.ArgumentParser.parse_args') + def test_parse_args_typescript(self, mock_parse_args): + mock_parse_args.return_value = argparse.Namespace( + language='ts', + input_file='test.ts', + output='out', + verbose=False + ) + parsed_args = adk_args.parse_args() + self.assertEqual(parsed_args.language, 'typescript') - self.assertEqual(args.input_repo, Path("/tmp/repo")) - self.assertEqual(args.output, Path("/tmp/out_dir")) - # Should be normalized - self.assertEqual(args.language, 
"python") def test_arg_definitions(self): # Verify that the parser is set up with correct arguments - with patch("argparse.ArgumentParser") as mock_parser_cls: + with mock.patch("argparse.ArgumentParser") as mock_parser_cls: mock_parser = mock_parser_cls.return_value # We also need to mock the group returned by # add_mutually_exclusive_group mock_group = mock_parser.add_mutually_exclusive_group.return_value - parse_args() + adk_args.parse_args() # Verify mutual exclusive group creation mock_parser.add_mutually_exclusive_group.assert_called_once_with( @@ -71,6 +92,5 @@ def test_arg_definitions(self): args, _ = parser_calls[2] self.assertEqual(args[0], "--verbose") - -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/test/adk/scope/utils/test_normalizer.py b/test/adk/scope/utils/test_normalizer.py new file mode 100644 index 0000000..c7594c5 --- /dev/null +++ b/test/adk/scope/utils/test_normalizer.py @@ -0,0 +1,75 @@ +import unittest +from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name + +class TestStrings(unittest.TestCase): + def test_normalize_name(self): + # CamelCase + self.assertEqual(normalize_name("ThisName"), "this_name") + self.assertEqual(normalize_name("CamelCase"), "camel_case") + + # camelCase + self.assertEqual(normalize_name("thisName"), "this_name") + self.assertEqual(normalize_name("runAsync"), "run_async") + + # snake_case + self.assertEqual(normalize_name("this_name"), "this_name") + self.assertEqual(normalize_name("already_snake"), "already_snake") + + # PascalCase with acronyms + self.assertEqual(normalize_name("HTTPResponse"), "http_response") + self.assertEqual(normalize_name("XMLParser"), "xml_parser") + + # Kebab-case + self.assertEqual(normalize_name("kebab-case"), "kebab_case") + + # Mixed + self.assertEqual( + normalize_name("JSONReaderWorker"), "json_reader_worker" + ) + + +class TestTypeNormalizer(unittest.TestCase): + def setUp(self): + self.normalizer = TypeNormalizer() + + def 
test_python_normalization(self): + self.assertEqual(self.normalizer.normalize("str", "python"), ["STRING"]) + self.assertEqual(self.normalizer.normalize("int", "python"), ["NUMBER"]) + self.assertEqual(self.normalizer.normalize("bool", "python"), ["BOOLEAN"]) + self.assertEqual(self.normalizer.normalize("list", "python"), ["LIST"]) + self.assertEqual(self.normalizer.normalize("dict", "python"), ["MAP"]) + self.assertEqual(self.normalizer.normalize("set", "python"), ["SET"]) + self.assertEqual(self.normalizer.normalize("None", "python"), ["null"]) + self.assertEqual(self.normalizer.normalize("any", "python"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize("Optional[str]", "python"), ["STRING", "null"]) + self.assertEqual(self.normalizer.normalize("Union[str, int]", "python"), ["STRING", "NUMBER"]) + self.assertEqual(self.normalizer.normalize("List[int]", "python"), ["LIST"]) + self.assertEqual(self.normalizer.normalize("Tuple[str, int]", "python"), ["STRING", "NUMBER"]) + + def test_typescript_normalization(self): + self.assertEqual(self.normalizer.normalize("string", "typescript"), ["STRING"]) + self.assertEqual(self.normalizer.normalize("number", "typescript"), ["NUMBER"]) + self.assertEqual(self.normalizer.normalize("boolean", "typescript"), ["BOOLEAN"]) + self.assertEqual(self.normalizer.normalize("string[]", "typescript"), ["LIST"]) + self.assertEqual(self.normalizer.normalize("Array", "typescript"), ["LIST"]) + self.assertEqual(self.normalizer.normalize("Map", "typescript"), ["MAP"]) + self.assertEqual(self.normalizer.normalize("Set", "typescript"), ["SET"]) + self.assertEqual(self.normalizer.normalize("void", "typescript"), []) + self.assertEqual(self.normalizer.normalize("any", "typescript"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize("unknown", "typescript"), ["UNKNOWN"]) + self.assertEqual(self.normalizer.normalize("Promise", "typescript"), ["STRING"]) + self.assertEqual(self.normalizer.normalize("string | number", "typescript"), 
["STRING", "NUMBER"]) + + def test_edge_cases(self): + self.assertEqual(self.normalizer.normalize("", "python"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize(" ", "python"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize("", "typescript"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize(" ", "typescript"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize("unsupported_type", "python"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize("unsupported_type", "typescript"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize("str", "unsupported_language"), ["STRING"]) + self.assertEqual(self.normalizer.normalize("MyCustomType", "unsupported_language"), ["OBJECT"]) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/adk/scope/utils/test_strings.py b/test/adk/scope/utils/test_strings.py deleted file mode 100644 index 956c2ee..0000000 --- a/test/adk/scope/utils/test_strings.py +++ /dev/null @@ -1,124 +0,0 @@ -import unittest -from google.adk.scope.utils.strings import normalize_name, normalize_type - - -class TestStrings(unittest.TestCase): - def test_normalize_name(self): - # CamelCase - self.assertEqual(normalize_name("ThisName"), "this_name") - self.assertEqual(normalize_name("CamelCase"), "camel_case") - - # camelCase - self.assertEqual(normalize_name("thisName"), "this_name") - self.assertEqual(normalize_name("runAsync"), "run_async") - - # snake_case - self.assertEqual(normalize_name("this_name"), "this_name") - self.assertEqual(normalize_name("already_snake"), "already_snake") - - # PascalCase with acronyms - self.assertEqual(normalize_name("HTTPResponse"), "http_response") - self.assertEqual(normalize_name("XMLParser"), "xml_parser") - - # Kebab-case - self.assertEqual(normalize_name("kebab-case"), "kebab_case") - - # Mixed - self.assertEqual( - normalize_name("JSONReaderWorker"), "json_reader_worker" - ) - - def test_normalize_type(self): - # List variants - self.assertEqual(normalize_type("list"), 
"LIST") - self.assertEqual(normalize_type("List"), "LIST") - self.assertEqual(normalize_type("Array"), "LIST") - self.assertEqual(normalize_type("Vector"), "LIST") - - # Set - self.assertEqual(normalize_type("set"), "SET") - - # Map variants - self.assertEqual(normalize_type("dict"), "MAP") - self.assertEqual(normalize_type("Dictionary"), "MAP") - self.assertEqual(normalize_type("Map"), "MAP") - - # Primitives - self.assertEqual(normalize_type("int"), "NUMBER") - self.assertEqual(normalize_type("Integer"), "NUMBER") - self.assertEqual(normalize_type("float"), "NUMBER") - self.assertEqual(normalize_type("str"), "STRING") - self.assertEqual(normalize_type("String"), "STRING") - self.assertEqual(normalize_type("bool"), "BOOLEAN") - - # Fallback - self.assertEqual(normalize_type("CustomType"), "OBJECT") - self.assertEqual(normalize_type("Any"), "OBJECT") - self.assertEqual(normalize_type(""), "OBJECT") - - def test_normalize_type_complex(self): - from google.adk.scope.utils.strings import normalize_type_complex - - # Simple types - self.assertEqual(normalize_type_complex("int"), ["NUMBER"]) - self.assertEqual(normalize_type_complex("str"), ["STRING"]) - self.assertEqual(normalize_type_complex(""), ["OBJECT"]) - - # Generics - List - self.assertEqual(normalize_type_complex("List[int]"), ["LIST"]) - self.assertEqual(normalize_type_complex("List[str]"), ["LIST"]) - self.assertEqual( - normalize_type_complex("list[int]"), ["LIST"] - ) # lowercase - - # Generics - Union - self.assertEqual( - set(normalize_type_complex("Union[int, str]")), {"NUMBER", "STRING"} - ) - - # check explicit behavior for None/none in simple_normalize - # _simple_normalize('none') -> 'null' - self.assertEqual( - set(normalize_type_complex("Union[int, None]")), {"NUMBER", "null"} - ) - - # Generics - Optional - self.assertEqual( - set(normalize_type_complex("Optional[int]")), {"NUMBER", "null"} - ) - self.assertEqual( - set(normalize_type_complex("Optional[List[str]]")), {"LIST", "null"} - ) - - # 
Generics - AsyncGenerator - self.assertEqual( - set(normalize_type_complex("AsyncGenerator[int, str]")), - {"NUMBER", "STRING"}, - ) - - # Generics - Tuple - self.assertEqual( - set(normalize_type_complex("tuple[int, str]")), {"NUMBER", "STRING"} - ) - self.assertEqual( - set(normalize_type_complex("Tuple[int, str]")), {"NUMBER", "STRING"} - ) - - # Nested generics - self.assertEqual( - set(normalize_type_complex("Union[List[int], Optional[str]]")), - {"LIST", "STRING", "null"}, - ) - - def test_split_generics(self): - from google.adk.scope.utils.strings import _split_generics - - self.assertEqual(_split_generics("a, b"), ["a", "b"]) - self.assertEqual(_split_generics("List[a,b], c"), ["List[a,b]", "c"]) - self.assertEqual( - _split_generics("Union[A, B[C, D]], E"), ["Union[A, B[C, D]]", "E"] - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/google/adk/scope/utils/test_args.py b/test/google/adk/scope/utils/test_args.py deleted file mode 100644 index 1aaf315..0000000 --- a/test/google/adk/scope/utils/test_args.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import logging -import unittest -from unittest import mock - -from google.adk.scope.utils import args as adk_args - - -class ArgsTest(unittest.TestCase): - - def test_add_verbose_argument(self): - parser = argparse.ArgumentParser() - adk_args.add_verbose_argument(parser) - args = parser.parse_args(['--verbose']) - self.assertTrue(args.verbose) - - @mock.patch('logging.basicConfig') - def test_configure_logging_verbose(self, mock_basic_config): - args = argparse.Namespace(verbose=True) - adk_args.configure_logging(args) - mock_basic_config.assert_called_once_with(level=logging.DEBUG) - - @mock.patch('logging.basicConfig') - def test_configure_logging_default(self, mock_basic_config): - args = argparse.Namespace(verbose=False) - adk_args.configure_logging(args) - mock_basic_config.assert_called_once_with(level=logging.INFO) - - @mock.patch('argparse.ArgumentParser.parse_args') - def 
test_parse_args_python(self, mock_parse_args): - mock_parse_args.return_value = argparse.Namespace( - language='py', - input_file='test.py', - output='out', - verbose=False - ) - parsed_args = adk_args.parse_args() - self.assertEqual(parsed_args.language, 'python') - - @mock.patch('argparse.ArgumentParser.parse_args') - def test_parse_args_typescript(self, mock_parse_args): - mock_parse_args.return_value = argparse.Namespace( - language='ts', - input_file='test.ts', - output='out', - verbose=False - ) - parsed_args = adk_args.parse_args() - self.assertEqual(parsed_args.language, 'typescript') - - -if __name__ == '__main__': - unittest.main() From f25d20f2665d454c3d8e6c1ddafdf37b5cb571bc Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Thu, 5 Feb 2026 13:31:01 -0800 Subject: [PATCH 13/21] Ruff adjustments. --- .../adk/scope/extractors/converter_ts.py | 73 ++------------- src/google/adk/scope/matcher/matcher.py | 18 ++-- src/google/adk/scope/utils/normalizer.py | 13 ++- src/google/adk/scope/utils/similarity.py | 3 +- test/adk/scope/extractors/test_extract.py | 12 ++- test/adk/scope/utils/test_normalizer.py | 88 ++++++++++++++----- 6 files changed, 102 insertions(+), 105 deletions(-) diff --git a/src/google/adk/scope/extractors/converter_ts.py b/src/google/adk/scope/extractors/converter_ts.py index 52a5703..493d634 100644 --- a/src/google/adk/scope/extractors/converter_ts.py +++ b/src/google/adk/scope/extractors/converter_ts.py @@ -17,9 +17,10 @@ class NodeProcessor: + """Process Tree-sitter nodes into Feature objects for TypeScript.""" + def __init__(self): self.normalizer = TypeNormalizer() - """Process Tree-sitter nodes into Feature objects for TypeScript.""" def process( self, node: Node, file_path: Path, repo_root: Path @@ -93,7 +94,7 @@ def process( parameters = self._extract_params(node, param_docs) original_returns, normalized_returns = self._extract_return_types(node) - original_returns, normalized_returns = self._extract_return_types(node) + is_async = 
not self._is_blocking(node, original_returns) maturity = self._extract_maturity(node) @@ -556,7 +557,8 @@ def _create_single_param( ) -> feature_pb2.Param: normalized_strings = [] for t in types: - normalized_strings.extend(self.normalizer.normalize(t, 'typescript')) + normalized_types = self.normalizer.normalize(t, "typescript") + normalized_strings.extend(normalized_types) normalized_strings = sorted(list(set(normalized_strings))) if not normalized_strings: @@ -600,62 +602,7 @@ def _derive_name_from_type(self, type_name: str) -> str: return "obj" - - # Handle fundamental TS types - t = t.strip() - if not t: - return ["OBJECT"] - - # A | B - if "|" in t: - parts = t.split("|") - res = [] - for p in parts: - res.extend(self._normalize_ts_type(p)) - return res - - # Generics: Promise, Array - if "<" in t and t.endswith(">"): - base = t.split("<", 1)[0].strip() - # Find matching closing bracket or assumue last - inner = t[t.find("<") + 1 : -1].strip() - - if base == "Promise": - return self._normalize_ts_type(inner) - if base in ("Array", "ReadonlyArray", "Generator", "AsyncGenerator", "Iterable", "Iterator", "AsyncIterable", "AsyncIterator"): - return ["LIST"] - if base == "Map": - return ["MAP"] - if base == "Set": - return ["SET"] - # Fallback for others - return ["OBJECT"] - - t_lower = t.lower() - if t_lower in ("string", "formattedstring", "path"): - return ["STRING"] - if t_lower in ("number", "int", "float", "integer", "double"): - return ["NUMBER"] - if t_lower in ("boolean", "bool"): - return ["BOOLEAN"] - if t_lower == "unknown": - return ["UNKNOWN"] - if t_lower in ("any", "object"): - return ["OBJECT"] - if t_lower.endswith("[]"): - return ["LIST"] - if ( - t_lower.startswith("map") - or t_lower.startswith("record") - or "{" in t - ): - return ["MAP"] - if t_lower.startswith("set"): - return ["SET"] - if t_lower == "void": - return [] - return ["OBJECT"] def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: return_type_node = 
node.child_by_field_name("return_type") @@ -685,16 +632,6 @@ def _is_blocking(self, node: Node, return_types: List[str]) -> bool: return False return True - # Check for 'async' modifier or keyword - for child in node.children: - text = child.text.decode("utf-8") - if text == "async": - return False - # Sometimes modifiers are wrapped? - # But usually async is a direct child in TS grammar for - # method_definition - - return True def _extract_maturity(self, node: Node) -> feature_pb2.Feature.Maturity: decorators = self._get_decorators(node) diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index d2cf45f..11fc02b 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -384,10 +384,10 @@ def escape_csv(s): adk_value = ", ".join(adk_parts) row_content = ( - f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} | " - f"{status_icon} | " - f"[View Details]({{modules_dir}}/{module_filename}) |" - ) + f"| {adk_value} | `{module}` | {mod_base_count} | " + f"{mod_score:.2%} | {status_icon} | " + f"[View Details]({{modules_dir}}/{module_filename}) |" + ) else: row_content = ( f"| `{module}` | {mod_base_count} | {mod_score:.2%} | " @@ -610,13 +610,15 @@ def main(): ) output_path = Path(args.output) - + if args.report_type == "raw": # Raw report is a single file, no modules directory needed try: output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(result.master_content) - logging.info(f"Successfully wrote raw match report to {output_path}") + logging.info( + f"Successfully wrote raw match report to {output_path}" + ) except Exception as e: logging.error(f"Error writing raw report to {output_path}: {e}") sys.exit(1) @@ -627,7 +629,7 @@ def main(): modules_dir_name = f"{output_path.stem}_modules" modules_dir = output_path.parent / modules_dir_name modules_dir.mkdir(parents=True, exist_ok=True) - + # Write module files for filename, content in 
result.module_files.items(): # Replace placeholder for master report link @@ -635,7 +637,7 @@ def main(): # So name is enough. final_content = content.replace("{master_report}", output_path.name) (modules_dir / filename).write_text(final_content) - + # Replace placeholder in Master Report # We assume master report is in parent of modules_dir # modules_dir relative to master report is just the dir name diff --git a/src/google/adk/scope/utils/normalizer.py b/src/google/adk/scope/utils/normalizer.py index 1599520..824936e 100644 --- a/src/google/adk/scope/utils/normalizer.py +++ b/src/google/adk/scope/utils/normalizer.py @@ -22,7 +22,8 @@ def normalize(self, type_name: str, language: str) -> List[str]: elif language == 'typescript': return self._normalize_ts_type(type_name) # Add placeholders for future languages like 'java' and 'go' - # Fallback for unknown languages: only normalize if it's a known simple type, otherwise OBJECT + # Fallback for unknown languages: only normalize if it's a known simple + # type, otherwise OBJECT normalized = self._simple_normalize(type_name) return [normalized] @@ -87,7 +88,10 @@ def _normalize_ts_type(self, t: str) -> List[str]: if base == "Promise": return self._normalize_ts_type(inner) - if base in ("Array", "ReadonlyArray", "Generator", "AsyncGenerator", "Iterable", "Iterator", "AsyncIterable", "AsyncIterator"): + if base in ( + "Array", "ReadonlyArray", "Generator", "AsyncGenerator", + "Iterable", "Iterator", "AsyncIterable", "AsyncIterator" + ): return ["LIST"] if base == "Map": return ["MAP"] @@ -126,7 +130,10 @@ def _simple_normalize(self, t: str) -> str: t = t.lower().strip() if t == "none": return "null" - if t in ("list", "array", "slice", "vector", "generator", "asyncgenerator", "iterable", "iterator", "asynciterable", "asynciterator"): + if t in ( + "list", "array", "slice", "vector", "generator", "asyncgenerator", + "iterable", "iterator", "asynciterable", "asynciterator" + ): return "LIST" if t in ("set",): return "SET" 
diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index 0bbdaaa..7ee4288 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -72,7 +72,8 @@ def _to_str_set(type_list): for t2 in set2: if t1 == t2: score = 1.0 - elif {t1, t2} == {"MAP", "OBJECT"} or {t1, t2} == {"MAP", "ANY"}: + elif ({t1, t2} == {"MAP", "OBJECT"} or + {t1, t2} == {"MAP", "ANY"}): score = 0.4 elif t1 in ("UNKNOWN", "ANY") or t2 in ("UNKNOWN", "ANY"): score = 0.3 diff --git a/test/adk/scope/extractors/test_extract.py b/test/adk/scope/extractors/test_extract.py index 0f89f2d..c3009ac 100644 --- a/test/adk/scope/extractors/test_extract.py +++ b/test/adk/scope/extractors/test_extract.py @@ -141,7 +141,9 @@ def test_input_file_mode(self): f.touch() self.configure_args(lang="python", input_file=str(f)) - self.mock_py_extractor.extract_features.return_value = [feature_pb2.Feature(original_name='dummy')] + self.mock_py_extractor.extract_features.return_value = [ + feature_pb2.Feature(original_name='dummy') + ] self.mock_py_extractor.get_version.return_value = "1.0" extract.main() @@ -164,7 +166,9 @@ def test_input_dir_mode(self): self.configure_args(lang="python", input_dir=str(d)) self.mock_py_extractor.find_files.return_value = [d / "a.py"] - self.mock_py_extractor.extract_features.return_value = [feature_pb2.Feature(original_name='dummy')] + self.mock_py_extractor.extract_features.return_value = [ + feature_pb2.Feature(original_name='dummy') + ] self.mock_py_extractor.get_version.return_value = "1.0" extract.main() @@ -181,7 +185,9 @@ def test_input_repo_mode(self): self.configure_args(lang="python", input_repo=str(r)) self.mock_py_extractor.find_files.return_value = [r / "src" / "a.py"] - self.mock_py_extractor.extract_features.return_value = [feature_pb2.Feature(original_name='dummy')] + self.mock_py_extractor.extract_features.return_value = [ + feature_pb2.Feature(original_name='dummy') + ] 
self.mock_py_extractor.get_version.return_value = "1.0" extract.main() diff --git a/test/adk/scope/utils/test_normalizer.py b/test/adk/scope/utils/test_normalizer.py index c7594c5..1ecf766 100644 --- a/test/adk/scope/utils/test_normalizer.py +++ b/test/adk/scope/utils/test_normalizer.py @@ -35,40 +35,84 @@ def setUp(self): def test_python_normalization(self): self.assertEqual(self.normalizer.normalize("str", "python"), ["STRING"]) self.assertEqual(self.normalizer.normalize("int", "python"), ["NUMBER"]) - self.assertEqual(self.normalizer.normalize("bool", "python"), ["BOOLEAN"]) + self.assertEqual( + self.normalizer.normalize("bool", "python"), ["BOOLEAN"] + ) self.assertEqual(self.normalizer.normalize("list", "python"), ["LIST"]) self.assertEqual(self.normalizer.normalize("dict", "python"), ["MAP"]) self.assertEqual(self.normalizer.normalize("set", "python"), ["SET"]) self.assertEqual(self.normalizer.normalize("None", "python"), ["null"]) self.assertEqual(self.normalizer.normalize("any", "python"), ["OBJECT"]) - self.assertEqual(self.normalizer.normalize("Optional[str]", "python"), ["STRING", "null"]) - self.assertEqual(self.normalizer.normalize("Union[str, int]", "python"), ["STRING", "NUMBER"]) - self.assertEqual(self.normalizer.normalize("List[int]", "python"), ["LIST"]) - self.assertEqual(self.normalizer.normalize("Tuple[str, int]", "python"), ["STRING", "NUMBER"]) + self.assertEqual(self.normalizer.normalize( + "Optional[str]", "python"), ["STRING", "null"]) + self.assertEqual( + self.normalizer.normalize("Union[str, int]", "python"), + ["STRING", "NUMBER"], + ) + self.assertEqual( + self.normalizer.normalize("List[int]", "python"), ["LIST"] + ) + self.assertEqual( + self.normalizer.normalize("Tuple[str, int]", "python"), + ["STRING", "NUMBER"], + ) def test_typescript_normalization(self): - self.assertEqual(self.normalizer.normalize("string", "typescript"), ["STRING"]) - self.assertEqual(self.normalizer.normalize("number", "typescript"), ["NUMBER"]) - 
self.assertEqual(self.normalizer.normalize("boolean", "typescript"), ["BOOLEAN"]) - self.assertEqual(self.normalizer.normalize("string[]", "typescript"), ["LIST"]) - self.assertEqual(self.normalizer.normalize("Array", "typescript"), ["LIST"]) - self.assertEqual(self.normalizer.normalize("Map", "typescript"), ["MAP"]) - self.assertEqual(self.normalizer.normalize("Set", "typescript"), ["SET"]) + self.assertEqual( + self.normalizer.normalize("string", "typescript"), ["STRING"] + ) + self.assertEqual( + self.normalizer.normalize("number", "typescript"), ["NUMBER"] + ) + self.assertEqual( + self.normalizer.normalize("boolean", "typescript"), ["BOOLEAN"] + ) + self.assertEqual( + self.normalizer.normalize("string[]", "typescript"), ["LIST"] + ) + self.assertEqual( + self.normalizer.normalize("Array", "typescript"), ["LIST"] + ) + self.assertEqual(self.normalizer.normalize( + "Map", "typescript"), ["MAP"]) + self.assertEqual( + self.normalizer.normalize("Set", "typescript"), ["SET"] + ) self.assertEqual(self.normalizer.normalize("void", "typescript"), []) - self.assertEqual(self.normalizer.normalize("any", "typescript"), ["OBJECT"]) - self.assertEqual(self.normalizer.normalize("unknown", "typescript"), ["UNKNOWN"]) - self.assertEqual(self.normalizer.normalize("Promise", "typescript"), ["STRING"]) - self.assertEqual(self.normalizer.normalize("string | number", "typescript"), ["STRING", "NUMBER"]) + self.assertEqual( + self.normalizer.normalize("any", "typescript"), ["OBJECT"] + ) + self.assertEqual( + self.normalizer.normalize("unknown", "typescript"), ["UNKNOWN"] + ) + normalized = self.normalizer.normalize("Promise", "typescript") + self.assertEqual(normalized, ["STRING"]) + self.assertEqual( + self.normalizer.normalize("string | number", "typescript"), + ["STRING", "NUMBER"], + ) def test_edge_cases(self): self.assertEqual(self.normalizer.normalize("", "python"), ["OBJECT"]) self.assertEqual(self.normalizer.normalize(" ", "python"), ["OBJECT"]) - 
self.assertEqual(self.normalizer.normalize("", "typescript"), ["OBJECT"]) - self.assertEqual(self.normalizer.normalize(" ", "typescript"), ["OBJECT"]) - self.assertEqual(self.normalizer.normalize("unsupported_type", "python"), ["OBJECT"]) - self.assertEqual(self.normalizer.normalize("unsupported_type", "typescript"), ["OBJECT"]) - self.assertEqual(self.normalizer.normalize("str", "unsupported_language"), ["STRING"]) - self.assertEqual(self.normalizer.normalize("MyCustomType", "unsupported_language"), ["OBJECT"]) + self.assertEqual( + self.normalizer.normalize("", "typescript"), ["OBJECT"] + ) + self.assertEqual( + self.normalizer.normalize(" ", "typescript"), ["OBJECT"] + ) + self.assertEqual( + self.normalizer.normalize("unsupported_type", "python"), ["OBJECT"] + ) + self.assertEqual(self.normalizer.normalize( + "unsupported_type", "typescript"), ["OBJECT"]) + self.assertEqual( + self.normalizer.normalize("str", "unsupported_language"), ["STRING"] + ) + self.assertEqual( + self.normalizer.normalize("MyCustomType", "unsupported_language"), + ["OBJECT"], + ) if __name__ == '__main__': From 0d07828f3ade9ef89045fb1f1f1781e7c4a521b2 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Thu, 5 Feb 2026 13:46:30 -0800 Subject: [PATCH 14/21] updated match.sh --- README.md | 4 +- match.sh | 75 ++++++++++++++++++++++--- src/google/adk/scope/matcher/matcher.py | 1 + 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 420e0c7..138774f 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Once you have extracted features from two languages (e.g., Python and TypeScript ./match.sh \ --base output/py.txtpb \ --target output/ts.txtpb \ - --output report.md \ + --output output/ \ --report-type directional ``` @@ -84,7 +84,7 @@ Once you have extracted features from two languages (e.g., Python and TypeScript | :--- | :--- | | `--base ` | **Required.** Path to the "source of truth" feature registry (e.g., Python). 
| | `--target ` | **Required.** Path to the comparison registry (e.g., TypeScript). | -| `--output ` | **Required.** Path to the output Markdown report. | +| `--output ` | **Required.** Path for the output directory. The report filename is auto-generated. | | `--report-type ` | `symmetric` (default) for Jaccard Index, `directional` for F1/Precision/Recall, or `raw` for CSV. | | `--alpha ` | Similarity threshold (0.0 - 1.0). Default is `0.8`. | diff --git a/match.sh b/match.sh index f56de67..428d699 100755 --- a/match.sh +++ b/match.sh @@ -3,6 +3,67 @@ set -e +# Default values +REPORT_TYPE="symmetric" +ALPHA="0.8" +VERBOSE="" + +# Parse arguments +while [[ "$#" -gt 0 ]]; do + case "$1" in + --base) + BASE_FILE="$2" + shift 2 + ;; + --target) + TARGET_FILE="$2" + shift 2 + ;; + --output) + OUTPUT_DIR="$2" + shift 2 + ;; + --report-type) + REPORT_TYPE="$2" + shift 2 + ;; + --alpha) + ALPHA="$2" + shift 2 + ;; + -v|--verbose) + VERBOSE="--verbose" + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Extract languages +BASE_LANG_RAW=$(head -n 1 "${BASE_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') +TARGET_LANG_RAW=$(head -n 1 "${TARGET_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') + +# Function to map language to short code +get_lang_code() { + case "$1" in + PYTHON) echo "py" ;; + TYPESCRIPT) echo "ts" ;; + JAVA) echo "java" ;; + GOLANG) echo "go" ;; + *) echo "" ;; + esac +} + +BASE_LANG=$(get_lang_code "$BASE_LANG_RAW") +TARGET_LANG=$(get_lang_code "$TARGET_LANG_RAW") + +# Construct filename +OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}_${REPORT_TYPE}.md" +FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" + # Determine the directory where this script is located SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -10,10 +71,10 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" # Run the python matcher -# Pass all 
arguments to the python script -VERBOSE="" -if [[ "$1" == "-v" ]]; then - VERBOSE="--verbose" - shift -fi -python3 "${SCRIPT_DIR}/src/google/adk/scope/matcher/matcher.py" $VERBOSE "$@" +python3 "${SCRIPT_DIR}/src/google/adk/scope/matcher/matcher.py" \ + --base "${BASE_FILE}" \ + --target "${TARGET_FILE}" \ + --output "${FULL_OUTPUT_PATH}" \ + --report-type "${REPORT_TYPE}" \ + --alpha "${ALPHA}" \ + ${VERBOSE} diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 11fc02b..416961b 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -610,6 +610,7 @@ def main(): ) output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) if args.report_type == "raw": # Raw report is a single file, no modules directory needed From 6472706cc70779c1cbfb9664e59e28e1519757ac Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Thu, 5 Feb 2026 15:03:24 -0800 Subject: [PATCH 15/21] Added a run.sh --- config.yaml | 36 +- run.sh | 14 + src/google/adk/scope/matcher/matcher.py | 629 +++++++++++------------- test/adk/scope/matcher/test_matcher.py | 142 +++++- 4 files changed, 430 insertions(+), 391 deletions(-) create mode 100755 run.sh diff --git a/config.yaml b/config.yaml index ce9ca6e..bc70cc3 100644 --- a/config.yaml +++ b/config.yaml @@ -1,41 +1,7 @@ python: exclude: - examples - - a2a - - agents - - apps - - artifacts - - auth - - cli - - code_executors - - dependencies - - errors - - evaluation - - events - - features - - flows - - memory - - models - - planners - - platform - - plugins - - sessions - - telemetry - - tools - - utils typescript: exclude: - - agents - - artifacts - - auth - - code_executors - - events - - examples - - memory - - models - - plugins - - sessions - - telemetry - - tools - - utils + - examples \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..27e4b81 --- /dev/null +++ b/run.sh @@ -0,0 +1,14 @@ 
+echo "Extracting Python features..." +./extract.sh --language py --input-repo ../adk-python ./output + +echo "Extracting TypeScript features..." +./extract.sh --language typescript --input-repo ../adk-js ./output + +echo "Generating symmetric reports..." +./match.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type symmetric + +echo "Generating directional reports.. ." +./match.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type directional + +echo "Generating raw reports..." +./match.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type raw \ No newline at end of file diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 416961b..a0b98f4 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -127,52 +127,59 @@ def get_language_code(language_name: str) -> str: return name[:2].lower() -def match_registries( - base_registry: features_pb2.FeatureRegistry, - target_registry: features_pb2.FeatureRegistry, - alpha: float, - report_type: str = "symmetric", -) -> MatchResult: - """Matches features and generates a master report + module sub-reports.""" - - # 1. 
Group by Module (Normalized Namespace) - features_base = defaultdict(list) - for f in base_registry.features: +def _group_features_by_module( + registry: features_pb2.FeatureRegistry, +) -> Dict[str, List[features_pb2.Feature]]: + """Groups features by their module.""" + features = defaultdict(list) + for f in registry.features: key = f.normalized_namespace or f.namespace or "Unknown Module" - features_base[key].append(f) + features[key].append(f) + return features - features_target = defaultdict(list) - for f in target_registry.features: - key = f.normalized_namespace or f.namespace or "Unknown Module" - features_target[key].append(f) - # Fuzzy Remapping Logic +def _fuzzy_match_namespaces( + features_base: Dict[str, List[features_pb2.Feature]], + features_target: Dict[str, List[features_pb2.Feature]], +) -> None: + """Remaps target namespaces to base namespaces using fuzzy matching.""" from jellyfish import jaro_winkler_similarity - + base_namespaces = set(features_base.keys()) - target_remap = {} # old_key -> new_key - - # Identify remappings - for t_ns in list(features_target.keys()): + target_remap = {} + remapped_features = defaultdict(list) + + for t_ns, features in features_target.items(): if t_ns in base_namespaces: - target_remap[t_ns] = t_ns + remapped_features[t_ns].extend(features) continue - - # Find best fuzzy match - best_match = None - best_score = 0.0 - for b_ns in base_namespaces: - score = jaro_winkler_similarity(t_ns, b_ns) - if score > best_score: - best_score = score - best_match = b_ns - + + best_match, best_score = max( + ((b_ns, jaro_winkler_similarity(t_ns, b_ns)) for b_ns in base_namespaces), + key=lambda item: item[1], + default=(None, 0.0), + ) + if best_score > 0.8 and best_match: - target_remap[t_ns] = best_match - # Move features to the best match - features_target[best_match].extend(features_target.pop(t_ns)) + remapped_features[best_match].extend(features) else: - target_remap[t_ns] = t_ns + remapped_features[t_ns].extend(features) 
+ + features_target.clear() + features_target.update(remapped_features) + + +def match_registries( + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + alpha: float, + report_type: str = "symmetric", +) -> MatchResult: + """Matches features and generates a master report + module sub-reports.""" + + features_base = _group_features_by_module(base_registry) + features_target = _group_features_by_module(target_registry) + _fuzzy_match_namespaces(features_base, features_target) if report_type == "directional": all_modules = sorted(features_base.keys()) @@ -187,347 +194,167 @@ def match_registries( total_base_features = len(base_registry.features) total_target_features = len(target_registry.features) - # Master Report Header - from datetime import datetime - - master_lines = [] - if report_type == "raw": - # Raw CSV Report - # Columns: base_namespace,base_member_of,base_name,target_namespace, - # target_member_of,target_name,type,score - csv_header = ( - "base_namespace,base_member_of,base_name,target_namespace," - "target_member_of,target_name,type,score" - ) - csv_lines = [csv_header] + return _generate_raw_report(all_modules, features_base, features_target, alpha) + + return _generate_markdown_report( + base_registry, + target_registry, + all_modules, + features_base, + features_target, + alpha, + report_type, + ) + + +def _generate_raw_report( + all_modules: List[str], + features_base: Dict[str, List[features_pb2.Feature]], + features_target: Dict[str, List[features_pb2.Feature]], + alpha: float, +) -> MatchResult: + """Generates a raw CSV report.""" + csv_header = ( + "base_namespace,base_member_of,base_name,target_namespace," + "target_member_of,target_name,type,score" + ) + csv_lines = [csv_header] + + def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: + ns = f.namespace or "" + if not ns and f.normalized_namespace: + ns = f.normalized_namespace - def get_feature_cols(f: features_pb2.Feature) -> 
tuple[str, str, str]: - ns = f.namespace or "" - if not ns and f.normalized_namespace: - ns = f.normalized_namespace - - # member_of - mem = f.member_of or "" - if not mem and f.normalized_member_of: - mem = f.normalized_member_of - if mem.lower() == "null": - mem = "" - - # name - name = f.original_name or f.normalized_name or "" - return ns, mem, name - - def escape_csv(s): - if s is None: - return "" - if ',' in s or '"' in s or '\n' in s: - escaped = s.replace('"', '""') - return f'"{escaped}"' - return s - - for module in all_modules: - base_list = features_base[module] - target_list = features_target[module] - - # Pass 1: Solid Matches - solid_matches = match_features(base_list, target_list, alpha) - - # Pass 2: Potential Matches (formerly Near Misses) - beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) - potential_matches = match_features(base_list, target_list, beta) + mem = f.member_of or "" + if not mem and f.normalized_member_of: + mem = f.normalized_member_of + if mem.lower() == "null": + mem = "" - # Leftovers - unmatched_base = base_list - unmatched_target = target_list - - for f_base, f_target, score in solid_matches: - b_ns, b_mem, b_name = get_feature_cols(f_base) - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = get_type_display_name(f_base) - - line = ( - f"{escape_csv(b_ns)},{escape_csv(b_mem)}," - f"{escape_csv(b_name)}," - f"{escape_csv(t_ns)},{escape_csv(t_mem)}," - f"{escape_csv(t_name)}," - f"{escape_csv(f_type)},{score:.4f}" - ) - csv_lines.append(line) - - for f_base, f_target, score in potential_matches: - b_ns, b_mem, b_name = get_feature_cols(f_base) - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = get_type_display_name(f_base) - - line = ( - f"{escape_csv(b_ns)},{escape_csv(b_mem)}," - f"{escape_csv(b_name)}," - f"{escape_csv(t_ns)},{escape_csv(t_mem)}," - f"{escape_csv(t_name)}," - f"{escape_csv(f_type)},{score:.4f}" - ) - csv_lines.append(line) - - for f_base in unmatched_base: - b_ns, b_mem, b_name = 
get_feature_cols(f_base) - f_type = get_type_display_name(f_base) - - line = ( - f"{escape_csv(b_ns)},{escape_csv(b_mem)}," - f"{escape_csv(b_name)}," - f",,," - f"{escape_csv(f_type)},0.0000" - ) - csv_lines.append(line) - - for f_target in unmatched_target: - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = get_type_display_name(f_target) - - line = ( - f",,," - f"{escape_csv(t_ns)},{escape_csv(t_mem)}," - f"{escape_csv(t_name)}," - f"{escape_csv(f_type)},0.0000" - ) - csv_lines.append(line) - - return MatchResult( - master_content="\n".join(csv_lines), - module_files={} - ) + name = f.original_name or f.normalized_name or "" + return ns, mem, name + + def escape_csv(s): + if s is None: + return "" + if ',' in s or '"' in s or '\n' in s: + return f'"{s.replace("\"", "\"\"")}"' + return s + + for module in all_modules: + base_list = features_base.get(module, []) + target_list = features_target.get(module, []) + + solid_matches = match_features(base_list, target_list, alpha) + beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) + potential_matches = match_features(base_list, target_list, beta) + + unmatched_base = base_list + unmatched_target = target_list + + for f_base, f_target, score in solid_matches: + b_ns, b_mem, b_name = get_feature_cols(f_base) + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_base) + csv_lines.append( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},{score:.4f}" + ) + + for f_base, f_target, score in potential_matches: + b_ns, b_mem, b_name = get_feature_cols(f_base) + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_base) + csv_lines.append( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},{score:.4f}" + ) + + for f_base in unmatched_base: + b_ns, 
b_mem, b_name = get_feature_cols(f_base) + f_type = get_type_display_name(f_base) + csv_lines.append( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)},,,," + f"{escape_csv(f_type)},0.0000" + ) + + for f_target in unmatched_target: + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_target) + csv_lines.append( + f",,,{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},0.0000" + ) + + return MatchResult(master_content="\n".join(csv_lines), module_files={}) + + +def _generate_markdown_report( + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + all_modules: List[str], + features_base: Dict[str, List[features_pb2.Feature]], + features_target: Dict[str, List[features_pb2.Feature]], + alpha: float, + report_type: str, +) -> MatchResult: + """Generates a markdown report.""" + from datetime import datetime + master_lines = [] title_suffix = "Symmetric" if report_type == "symmetric" else "Directional" - master_lines.append(f"# Feature Matching Report: {title_suffix}") - master_lines.append(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - master_lines.append("") - master_lines.append( - f"**Base:** {base_registry.language} ({base_registry.version})" - ) - master_lines.append( - f"**Target:** {target_registry.language} ({target_registry.version})" - ) + master_lines.extend([ + f"# Feature Matching Report: {title_suffix}", + f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + f"**Base:** {base_registry.language} ({base_registry.version})", + f"**Target:** {target_registry.language} ({target_registry.version})", + ]) - # Placeholder for Global Score (calculated at end) global_score_idx = len(master_lines) - master_lines.append("GLOBAL_SCORE_PLACEHOLDER") + master_lines.append("GLOBAL_SCORE_PLACEHOLDER") master_lines.append("") - master_lines.append("## Module Summary") header = "| Module | Features (Base) | Score | Status | 
Details |" divider = "|---|---|---|---|---|" if report_type == "symmetric": header = "| ADK | Module | Features (Base) | Score | Status | Details |" divider = "|---|---|---|---|---|---|" - - master_lines.append(header) - master_lines.append(divider) + master_lines.extend(["## Module Summary", header, divider]) module_files = {} module_rows = [] + total_solid_matches = 0 base_code = get_language_code(base_registry.language) target_code = get_language_code(target_registry.language) for module in all_modules: - base_list = features_base[module] - target_list = features_target[module] - - mod_base_count = len(base_list) - mod_target_count = len(target_list) - - # Pass 1: Solid Matches - solid_matches = match_features(base_list, target_list, alpha) - mod_solid_count = len(solid_matches) - total_solid_matches += mod_solid_count - - # Pass 2: Potential Matches (formerly Near Misses) - beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) - potential_matches = match_features(base_list, target_list, beta) - - # Leftovers - unmatched_base = base_list - unmatched_target = target_list - - # Calculate Module Score - if report_type == "symmetric": - union_size = mod_base_count + mod_target_count - mod_solid_count - mod_score = ( - mod_solid_count / union_size if union_size > 0 else 1.0 - ) - else: # directional - precision = stats.calculate_precision( - mod_solid_count, mod_target_count - ) - recall = stats.calculate_recall(mod_solid_count, mod_base_count) - mod_score = stats.calculate_f1(precision, recall) - - status_icon = "❌" - if mod_score == 1.0: - status_icon = "✅" - elif mod_score >= 0.8: - status_icon = "⚠️" - - # Safe filename - module_safe_name = module.replace(".", "_") - module_filename = f"{module_safe_name}.md" - - # Determine ADK Value (Symmetric Only) - row_content = "" - if report_type == "symmetric": - adk_parts = [] - if mod_base_count > 0: - adk_parts.append(base_code) - if mod_target_count > 0: - adk_parts.append(target_code) - adk_value = ", ".join(adk_parts) - - 
row_content = ( - f"| {adk_value} | `{module}` | {mod_base_count} | " - f"{mod_score:.2%} | {status_icon} | " - f"[View Details]({{modules_dir}}/{module_filename}) |" - ) - else: - row_content = ( - f"| `{module}` | {mod_base_count} | {mod_score:.2%} | " - f"{status_icon} | " - f"[View Details]({{modules_dir}}/{module_filename}) |" - ) - - # Add to Master - module_rows.append((mod_score, row_content)) - - if report_type == "symmetric": - mod_total_features = ( - mod_base_count + mod_target_count - mod_solid_count - ) - else: - mod_total_features = mod_base_count - - # Generate Module Content - mod_lines = [] - mod_lines.append(f"# Module: `{module}`") - # Back link usually works if we know the relative path structure. - # Use placeholder {master_report} which will be replaced in main. - # It should link to the master report file. - mod_lines.append("[⬅️ Back to Master Report](../{master_report})") - mod_lines.append("") - mod_lines.append(f"**Score:** {mod_score:.2%} ({status_icon})") - - if report_type == "directional": - mod_lines.append( - "| Metric | Score |\n" - "|---|---|\n" - f"| **Precision** | {precision:.2%} |\n" - f"| **Recall** | {recall:.2%} |" - ) - else: - # For symmetric, we usually just have the score (Jaccard). - # We can make it a table too for consistency if desired. 
- pass - - mod_lines.append("") - mod_lines.append(f"**Features:** {mod_total_features}") - mod_lines.append("") - - # Sort matches by type - solid_matches.sort( - key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) - ) - potential_matches.sort( - key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) + mod_base_list = features_base.get(module, []) + mod_target_list = features_target.get(module, []) + + results = _process_module( + module, + mod_base_list, + mod_target_list, + alpha, + report_type, + base_code, + target_code, ) + total_solid_matches += results["solid_matches_count"] + module_rows.append((results["score"], results["row_content"])) + if results.get("module_filename"): + module_files[results["module_filename"]] = results["module_content"] - if report_type == "symmetric": - if solid_matches: - mod_lines.append("### ✅ Solid Matches") - mod_lines.append( - "| Type | Base Feature | Target Feature | " - "Similarity Score |" - ) - mod_lines.append("|---|---|---|---|") - for f_base, f_target, score in solid_matches: - f_type = get_type_display_name(f_base) - mod_lines.append( - f"| {f_type} | `{format_feature(f_base)}` | " - f"`{format_feature(f_target)}` | {score:.2f} |" - ) - mod_lines.append("") - - if potential_matches: - mod_lines.append("### ⚠️ Potential Matches") - mod_lines.append( - "| Type | Base Feature | Closest Target Candidate | " - "Similarity |" - ) - mod_lines.append("|---|---|---|---|") - for f_base, f_target, score in potential_matches: - f_type = get_type_display_name(f_base) - mod_lines.append( - f"| {f_type} | `{format_feature(f_base)}` | " - f"`{format_feature(f_target)}` | {score:.2f} |" - ) - mod_lines.append("") - - if unmatched_base or unmatched_target: - mod_lines.append("### ❌ Unmatched Features") - mod_lines.append("| Missing Feature | Missing In |") - mod_lines.append("|---|---|") - for f_base in unmatched_base: - mod_lines.append(f"| `{format_feature(f_base)}` | Target |") - for f_target in unmatched_target: - 
mod_lines.append(f"| `{format_feature(f_target)}` | Base |") - mod_lines.append("") - else: # directional - if solid_matches: - mod_lines.append("### ✅ Matched Features") - mod_lines.append( - "| Type | Base Feature | Target Feature | " - "Similarity Score |" - ) - mod_lines.append("|---|---|---|---|") - for f_base, f_target, score in solid_matches: - f_type = get_type_display_name(f_base) - mod_lines.append( - f"| {f_type} | `{format_feature(f_base)}` | " - f"`{format_feature(f_target)}` | {score:.2f} |" - ) - mod_lines.append("") - - if potential_matches: - mod_lines.append("### ⚠️ Potential Matches") - mod_lines.append( - "| Type | Base Feature | Closest Target Candidate | " - "Similarity |" - ) - mod_lines.append("|---|---|---|---|") - for f_base, f_target, score in potential_matches: - f_type = get_type_display_name(f_base) - mod_lines.append( - f"| {f_type} | `{format_feature(f_base)}` | " - f"`{format_feature(f_target)}` | {score:.2f} |" - ) - mod_lines.append("") - - if unmatched_base: - mod_lines.append("### ❌ Missing in Target") - mod_lines.append("| Missing Feature |") - mod_lines.append("|---|") - for f_base in unmatched_base: - mod_lines.append(f"| `{format_feature(f_base)}` |") - mod_lines.append("") - - # Directional reports usually ignore target exclusives. - # We flag missing-in-target features only. 
- module_files[module_filename] = "\n".join(mod_lines).strip() - - # Sort modules by score descending module_rows.sort(key=lambda x: x[0], reverse=True) - for _, row in module_rows: - master_lines.append(row) + master_lines.extend([row for _, row in module_rows]) - # Calculate Global Score + total_base_features = len(base_registry.features) + total_target_features = len(target_registry.features) if report_type == "symmetric": union_size = ( total_base_features + total_target_features - total_solid_matches @@ -553,9 +380,7 @@ def escape_csv(s): f"| **Global F1 Score** | {parity_score:.2%} |" ) - master_lines[ - global_score_idx - ] = global_stats + master_lines[global_score_idx] = global_stats return MatchResult( master_content="\n".join(master_lines).strip(), @@ -563,6 +388,100 @@ def escape_csv(s): ) +def _process_module( + module: str, + base_list: List[features_pb2.Feature], + target_list: List[features_pb2.Feature], + alpha: float, + report_type: str, + base_lang_code: str, + target_lang_code: str, +) -> Dict: + """Analyzes a single module and generates its report content.""" + mod_base_count = len(base_list) + mod_target_count = len(target_list) + + solid_matches = match_features(base_list, target_list, alpha) + mod_solid_count = len(solid_matches) + + beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) + potential_matches = match_features(base_list, target_list, beta) + + unmatched_base = base_list + unmatched_target = target_list + + if report_type == "symmetric": + union_size = mod_base_count + mod_target_count - mod_solid_count + mod_score = mod_solid_count / union_size if union_size > 0 else 1.0 + else: # directional + precision = stats.calculate_precision(mod_solid_count, mod_target_count) + recall = stats.calculate_recall(mod_solid_count, mod_base_count) + mod_score = stats.calculate_f1(precision, recall) + + status_icon = "✅" if mod_score == 1.0 else "⚠️" if mod_score >= 0.8 else "❌" + module_safe_name = module.replace(".", "_") + module_filename = 
f"{module_safe_name}.md" + + row_content = f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | [View Details]({{modules_dir}}/{module_filename}) |" + if report_type == "symmetric": + adk_parts = [] + if mod_base_count > 0: + adk_parts.append(base_lang_code) + if mod_target_count > 0: + adk_parts.append(target_lang_code) + adk_value = ", ".join(adk_parts) + row_content = f"| {adk_value} | {row_content}" + + # Module Content + mod_lines = [f"# Module: `{module}`", "[⬅️ Back to Master Report](../{master_report})", "", f"**Score:** {mod_score:.2%} ({status_icon})"] + if report_type == "directional": + mod_lines.extend([ + "| Metric | Score |", + "|---|---|", + f"| **Precision** | {precision:.2%} |", + f"| **Recall** | {recall:.2%} |", + ]) + + mod_total_features = (mod_base_count + mod_target_count - mod_solid_count) if report_type == "symmetric" else mod_base_count + mod_lines.extend(["", f"**Features:** {mod_total_features}", ""]) + + solid_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) + potential_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) + + if solid_matches: + mod_lines.append(f"### ✅ {'Solid' if report_type == 'symmetric' else 'Matched'} Features") + mod_lines.extend(["| Type | Base Feature | Target Feature | Similarity Score |", "|---|---|---|---|"]) + mod_lines.extend([f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" for f_base, f_target, score in solid_matches]) + mod_lines.append("") + + if potential_matches: + mod_lines.extend([ + "### ⚠️ Potential Matches", + "| Type | Base Feature | Closest Target Candidate | Similarity |", + "|---|---|---|---|", + ]) + mod_lines.extend([f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" for f_base, f_target, score in potential_matches]) + mod_lines.append("") + + if report_type == "symmetric" and (unmatched_base or 
unmatched_target): + mod_lines.extend(["### ❌ Unmatched Features", "| Missing Feature | Missing In |", "|---|---|"]) + mod_lines.extend([f"| `{format_feature(f)}` | Target |" for f in unmatched_base]) + mod_lines.extend([f"| `{format_feature(f)}` | Base |" for f in unmatched_target]) + mod_lines.append("") + elif report_type == "directional" and unmatched_base: + mod_lines.extend(["### ❌ Missing in Target", "| Missing Feature |", "|---|"]) + mod_lines.extend([f"| `{format_feature(f)}` |" for f in unmatched_base]) + mod_lines.append("") + + return { + "solid_matches_count": mod_solid_count, + "score": mod_score, + "row_content": row_content, + "module_filename": module_filename, + "module_content": "\n".join(mod_lines).strip(), + } + + def main(): parser = argparse.ArgumentParser( description="Match ADK features between two languages." diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index 1f78ed2..d78b207 100644 --- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -1,6 +1,7 @@ import os import tempfile import unittest +from unittest.mock import patch from google.adk.scope import features_pb2 from google.adk.scope.matcher import matcher @@ -169,7 +170,7 @@ def test_match_registries(self): self.assertIn("**Features:** 3", module_content) # Solid Matches - self.assertIn("### ✅ Solid Matches", module_content) + self.assertIn("### ✅ Solid Features", module_content) self.assertIn( "| Type | Base Feature | Target Feature | Similarity Score |", module_content @@ -269,5 +270,144 @@ def test_match_registries_raw(self): self.assertIn(expected_line, csv_content) self.assertFalse(result.module_files) + def test_group_features_by_module(self): + registry = features_pb2.FeatureRegistry() + f1 = registry.features.add() + f1.namespace = "module.one" + f2 = registry.features.add() + f2.namespace = "module.two" + f3 = registry.features.add() + f3.namespace = "module.one" + + result = 
matcher._group_features_by_module(registry) + + self.assertIn("module.one", result) + self.assertIn("module.two", result) + self.assertEqual(len(result["module.one"]), 2) + self.assertEqual(len(result["module.two"]), 1) + + + def test_fuzzy_match_namespaces(self): + features_base = { + "module.one": [], + "module.two": [] + } + features_target = { + "module.one": [features_pb2.Feature(original_name="f1_target")], + "module.ones": [features_pb2.Feature(original_name="f4")], + "module.three": [features_pb2.Feature(original_name="f5")] + } + + matcher._fuzzy_match_namespaces(features_base, features_target) + + self.assertIn("module.one", features_target) + self.assertIn("module.two", features_target) + self.assertNotIn("module.ones", features_target) + self.assertNotIn("module.three", features_target) + self.assertEqual(len(features_target["module.one"]), 2) + self.assertEqual(len(features_target["module.two"]), 1) + + + def test_process_module(self): + """Tests the end-to-end processing of a single module.""" + f_base = features_pb2.Feature( + original_name="f1_base", + normalized_name="f1_base", + normalized_namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, + ) + f_target = features_pb2.Feature( + original_name="f1_target", + normalized_name="f1_target", + normalized_namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, + ) + + with patch( + "google.adk.scope.matcher.matcher.match_features" + ) as mock_match: + # Let's assume one solid match and no potential matches + mock_match.side_effect = [ + [(f_base, f_target, 0.95)], # Solid matches + [], # Potential matches + ] + + result = matcher._process_module( + module="n1", + base_list=[f_base], + target_list=[f_target], + alpha=0.9, + report_type="symmetric", + base_lang_code="py", + target_lang_code="ts", + ) + + self.assertEqual(result["solid_matches_count"], 1) + self.assertEqual(result["score"], 1.0) + self.assertIn("| py, ts |", result["row_content"]) + self.assertIn("# Module: `n1`", 
result["module_content"]) + self.assertIn("### ✅ Solid Features", result["module_content"]) + + + def test_generate_raw_report(self): + """Tests the raw CSV report generation.""" + f_base = features_pb2.Feature( + original_name="f1_base", + normalized_name="f1_base", + namespace="n1", + member_of="c1", + type=features_pb2.Feature.Type.FUNCTION, + ) + + with patch( + "google.adk.scope.matcher.matcher.match_features" + ) as mock_match: + mock_match.return_value = [] # No matches for simplicity + + result = matcher._generate_raw_report( + all_modules=["n1"], + features_base={"n1": [f_base]}, + features_target={"n1": []}, + alpha=0.9, + ) + + self.assertIn("base_namespace,base_member_of,base_name", result.master_content) + self.assertIn("n1,c1,f1_base", result.master_content) + + + def test_generate_markdown_report(self): + """Tests the markdown report generation.""" + base_registry = features_pb2.FeatureRegistry(language="Python", version="1.0.0") + target_registry = features_pb2.FeatureRegistry( + language="TypeScript", version="2.0.0" + ) + + with patch( + "google.adk.scope.matcher.matcher._process_module" + ) as mock_process: + mock_process.return_value = { + "solid_matches_count": 1, + "score": 1.0, + "row_content": "| py, ts | `n1` | 1 | 100.00% | ✅ | [View Details]({modules_dir}/n1.md) |", + "module_filename": "n1.md", + "module_content": "# Module: `n1`", + } + + result = matcher._generate_markdown_report( + base_registry=base_registry, + target_registry=target_registry, + all_modules=["n1"], + features_base={"n1": []}, # Dummy data + features_target={"n1": []}, # Dummy data + alpha=0.9, + report_type="symmetric", + ) + + self.assertIn("# Feature Matching Report: Symmetric", result.master_content) + self.assertIn("## Module Summary", result.master_content) + self.assertIn("| `n1` |", result.master_content) + self.assertIn("n1.md", result.module_files) + + if __name__ == "__main__": unittest.main() From 11ed87aff6f7dc23ab769e75bea9b387e7e66059 Mon Sep 17 
00:00:00 2001 From: Shahin Saadati Date: Thu, 5 Feb 2026 15:47:36 -0800 Subject: [PATCH 16/21] Fixed symmetric report. --- src/google/adk/scope/matcher/matcher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index a0b98f4..2f8c2ea 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -422,7 +422,7 @@ def _process_module( module_safe_name = module.replace(".", "_") module_filename = f"{module_safe_name}.md" - row_content = f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | [View Details]({{modules_dir}}/{module_filename}) |" + details_link = f"[View Details]({{modules_dir}}/{module_filename})" if report_type == "symmetric": adk_parts = [] if mod_base_count > 0: @@ -430,7 +430,9 @@ def _process_module( if mod_target_count > 0: adk_parts.append(target_lang_code) adk_value = ", ".join(adk_parts) - row_content = f"| {adk_value} | {row_content}" + row_content = f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | {details_link} |" + else: + row_content = f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | {details_link} |" # Module Content mod_lines = [f"# Module: `{module}`", "[⬅️ Back to Master Report](../{master_report})", "", f"**Score:** {mod_score:.2%} ({status_icon})"] From 359a36d827bd9e0b6a612aa699c528a6853cdc95 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Fri, 6 Feb 2026 08:54:42 -0800 Subject: [PATCH 17/21] Added score descriptions --- README.md | 34 ++++++++++++++++--------- match.sh | 7 ++++- src/google/adk/scope/matcher/matcher.py | 24 +++++++++++++---- 3 files changed, 47 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 138774f..5d56875 100644 --- a/README.md +++ b/README.md @@ -97,18 +97,28 @@ The matcher uses the **Hungarian Algorithm** to find the optimal assignment betw - Feature Type (Function, 
Method, Class, etc.) - **Thresholding**: Pairs with a similarity score below `--alpha` are discarded. -#### Scoring Metrics - -**Symmetric Report (Jaccard Index)** -- Best for measuring general parity between two equal implementations. -- **Score**: $J(A, B) = \frac{|A \cap B|}{|A \cup B|}$ -- Penalizes both missing features and extra features. - -**Directional Report (F1 Score)** -- Best when checking if a Target implementation covers the Base implementation (e.g., "Is the TS SDK feature-complete vs Python?"). -- **Precision**: $\frac{\text{Matches}}{\text{Total Target Features}}$ (How accurate is the target?) -- **Recall**: $\frac{\text{Matches}}{\text{Total Base Features}}$ (How complete is the target?) -- **F1 Score**: Harmonic mean of Precision and Recall. $F1 = 2 \cdot \frac{P \cdot R}{P + R}$ +#### Understanding the Reports + +`adk-scope` can generate three types of reports to help you understand the feature overlap between two languages. + +##### Symmetric Report (`--report-type symmetric`) + +This report is best for measuring the general similarity between two feature sets, where neither is considered the "source of truth". It uses the **Jaccard Index** to calculate a global similarity score. + +- **What it measures**: The Jaccard Index measures the similarity between two sets by dividing the size of their intersection by the size of their union. The score ranges from 0% (no similarity) to 100% (identical sets). +- **What it means**: A high Jaccard Index indicates that both languages have a very similar set of features, with few features unique to either one. It penalizes both missing and extra features equally. + +##### Directional Report (`--report-type directional`) + +This report is ideal when you have a "base" or "source of truth" language and you want to measure how well a "target" language conforms to it. It uses **Precision**, **Recall**, and **F1-Score**. 
+ +- **Precision**: Answers the question: *"Of all the features implemented in the target language, how many of them are correct matches to features in the base language?"* A low score indicates the target has many extra features not present in the base. +- **Recall**: Answers the question: *"Of all the features that should be in the target language (i.e., all features in the base), how many were actually found?"* A low score indicates the target is missing many features from the base. +- **F1-Score**: The harmonic mean of Precision and Recall, providing a single score that balances both. A high F1-Score indicates the target is a close match to the base, having most of the required features and not too many extra ones. + +##### Raw Report (`--report-type raw`) + +This report provides a simple CSV output of all features (matched and unmatched) from both the base and target registries. It is useful for programmatic analysis or for importing the data into other tools. ## Development diff --git a/match.sh b/match.sh index 428d699..970f5b1 100755 --- a/match.sh +++ b/match.sh @@ -61,7 +61,12 @@ BASE_LANG=$(get_lang_code "$BASE_LANG_RAW") TARGET_LANG=$(get_lang_code "$TARGET_LANG_RAW") # Construct filename -OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}_${REPORT_TYPE}.md" +if [ "$REPORT_TYPE" == "raw" ]; then + EXTENSION="csv" +else + EXTENSION="md" +fi +OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}_${REPORT_TYPE}.${EXTENSION}" FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" # Determine the directory where this script is located diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 2f8c2ea..61277e9 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -195,7 +195,9 @@ def match_registries( total_target_features = len(target_registry.features) if report_type == "raw": - return _generate_raw_report(all_modules, features_base, features_target, alpha) + return
_generate_raw_report( + base_registry, target_registry, all_modules, features_base, features_target, alpha + ) return _generate_markdown_report( base_registry, @@ -209,15 +211,20 @@ def match_registries( def _generate_raw_report( + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, all_modules: List[str], features_base: Dict[str, List[features_pb2.Feature]], features_target: Dict[str, List[features_pb2.Feature]], alpha: float, ) -> MatchResult: """Generates a raw CSV report.""" + base_code = get_language_code(base_registry.language) + target_code = get_language_code(target_registry.language) csv_header = ( - "base_namespace,base_member_of,base_name,target_namespace," - "target_member_of,target_name,type,score" + f"{base_code}_namespace,{base_code}_member_of,{base_code}_name," + f"{target_code}_namespace,{target_code}_member_of,{target_code}_name," + "type,score" ) csv_lines = [csv_header] @@ -362,7 +369,11 @@ def _generate_markdown_report( parity_score = ( total_solid_matches / union_size if union_size > 0 else 1.0 ) - global_stats = f"**Global Jaccard Index:** {parity_score:.2%}" + global_stats = ( + f"**Jaccard Index:** {parity_score:.2%}\n\n" + "> The Jaccard Index measures the similarity between the two feature sets. " + "A score of 100% indicates that both languages have identical features." + ) else: precision = stats.calculate_precision( total_solid_matches, total_target_features @@ -377,7 +388,10 @@ def _generate_markdown_report( "|---|---|\n" f"| **Precision** | {precision:.2%} |\n" f"| **Recall** | {recall:.2%} |\n" - f"| **Global F1 Score** | {parity_score:.2%} |" + f"| **F1 Score** | {parity_score:.2%} |\n\n" + "> **Precision**: Of all features in the target, how many are correct matches to the base? (High score = low number of extra features in target)\n\n" + "> **Recall**: Of all features in the base, how many were found in the target? 
(High score = low number of missing features in target)\n\n" + "> **F1 Score**: A weighted average of Precision and Recall, providing a single measure of how well the target feature set matches the base." ) master_lines[global_score_idx] = global_stats From 749d3d1c2f89280c57d290f51ad6206895bc1265 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Fri, 6 Feb 2026 09:26:28 -0800 Subject: [PATCH 18/21] black fixes --- playground.ipynb | 21 +-- .../adk/scope/extractors/converter_py.py | 5 +- .../adk/scope/extractors/converter_ts.py | 8 +- src/google/adk/scope/features_pb2.py | 42 +++-- src/google/adk/scope/matcher/matcher.py | 156 ++++++++++++------ src/google/adk/scope/utils/normalizer.py | 27 ++- src/google/adk/scope/utils/similarity.py | 37 +++-- .../adk/scope/extractors/test_converter_py.py | 4 +- .../adk/scope/extractors/test_converter_ts.py | 32 ++-- test/adk/scope/extractors/test_extract.py | 6 +- test/adk/scope/matcher/test_matcher.py | 90 ++++++---- test/adk/scope/utils/test_args.py | 28 ++-- test/adk/scope/utils/test_normalizer.py | 21 ++- test/adk/scope/utils/test_similarity.py | 4 +- 14 files changed, 282 insertions(+), 199 deletions(-) diff --git a/playground.ipynb b/playground.ipynb index 9381a7b..02fb739 100644 --- a/playground.ipynb +++ b/playground.ipynb @@ -2,25 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 42, + "execution_count": 1, "id": "8f748152", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "import os\n", "\n", "# 1. Add 'src' to your Python path so we can resolve the 'google.adk.scope' imports\n", - "sys.path.insert(0, os.path.abspath('src'))\n", + "sys.path.insert(0, os.path.abspath(\"src\"))\n", "\n", "# 2. 
(Optional but recommended) Auto-reload modules when you change the .py files\n", "%load_ext autoreload\n", @@ -57,8 +48,8 @@ "metadata": {}, "outputs": [], "source": [ - "py_features = read_feature_registry('output/py.txtpb')\n", - "ts_features = read_feature_registry('output/ts.txtpb')" + "py_features = read_feature_registry(\"output/py.txtpb\")\n", + "ts_features = read_feature_registry(\"output/ts.txtpb\")" ] }, { @@ -363,7 +354,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "main", "language": "python", "name": "python3" }, diff --git a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index a26efa3..eb98694 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -18,6 +18,7 @@ class NodeProcessor: def __init__(self): self.normalizer = TypeNormalizer() + """Process Tree-sitter nodes into Feature objects.""" def process( @@ -360,7 +361,7 @@ def _process_param_node(self, node: Node) -> Optional[feature_pb2.Param]: normalized_strings = [] for t in types: - normalized_strings.extend(self.normalizer.normalize(t, 'python')) + normalized_strings.extend(self.normalizer.normalize(t, "python")) # Unique normalized_strings = sorted(list(set(normalized_strings))) if not normalized_strings: @@ -407,7 +408,7 @@ def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: return_type_node = node.child_by_field_name("return_type") if return_type_node: raw = return_type_node.text.decode("utf-8") - normalized = self.normalizer.normalize(raw, 'python') + normalized = self.normalizer.normalize(raw, "python") return [raw], normalized return [], [] diff --git a/src/google/adk/scope/extractors/converter_ts.py b/src/google/adk/scope/extractors/converter_ts.py index 493d634..24508ac 100644 --- a/src/google/adk/scope/extractors/converter_ts.py +++ b/src/google/adk/scope/extractors/converter_ts.py @@ -557,8 +557,8 
@@ def _create_single_param( ) -> feature_pb2.Param: normalized_strings = [] for t in types: - normalized_types = self.normalizer.normalize(t, "typescript") - normalized_strings.extend(normalized_types) + normalized_types = self.normalizer.normalize(t, "typescript") + normalized_strings.extend(normalized_types) normalized_strings = sorted(list(set(normalized_strings))) if not normalized_strings: @@ -602,8 +602,6 @@ def _derive_name_from_type(self, type_name: str) -> str: return "obj" - - def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: return_type_node = node.child_by_field_name("return_type") if return_type_node: @@ -616,7 +614,7 @@ def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: # logically T for async? # Schema says "original_return_types". # normalized usually unwrap? - return [raw], self.normalizer.normalize(raw, 'typescript') + return [raw], self.normalizer.normalize(raw, "typescript") return [], [] def _is_blocking(self, node: Node, return_types: List[str]) -> bool: diff --git a/src/google/adk/scope/features_pb2.py b/src/google/adk/scope/features_pb2.py index 9314da5..16b51e1 100644 --- a/src/google/adk/scope/features_pb2.py +++ b/src/google/adk/scope/features_pb2.py @@ -9,38 +9,34 @@ from google.protobuf import runtime_version as _runtime_version from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + _runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, - 6, - 33, - 4, - '', - 'features.proto' + _runtime_version.Domain.PUBLIC, 6, 33, 4, "", "features.proto" ) # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x66\x65\x61tures.proto\x12\x0fgoogle.adk.meta\"\xc4\x01\n\x05Param\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 \x01(\t\x12\x16\n\x0eoriginal_types\x18\x03 
\x03(\t\x12\x34\n\x10normalized_types\x18\x04 \x03(\x0e\x32\x1a.google.adk.meta.ParamType\x12\x18\n\x0b\x64\x65scription\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bis_optional\x18\x06 \x01(\x08\x42\x0e\n\x0c_description\"\xdc\x04\n\x07\x46\x65\x61ture\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 \x01(\t\x12\x18\n\x0b\x64\x65scription\x18\x03 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tmember_of\x18\x04 \x01(\t\x12\x1c\n\x14normalized_member_of\x18\x05 \x01(\t\x12\x38\n\x08maturity\x18\x06 \x01(\x0e\x32!.google.adk.meta.Feature.MaturityH\x01\x88\x01\x01\x12+\n\x04type\x18\x07 \x01(\x0e\x32\x1d.google.adk.meta.Feature.Type\x12\x11\n\tfile_path\x18\x08 \x01(\t\x12\x11\n\tnamespace\x18\t \x01(\t\x12\x1c\n\x14normalized_namespace\x18\n \x01(\t\x12*\n\nparameters\x18\x0b \x03(\x0b\x32\x16.google.adk.meta.Param\x12\x1d\n\x15original_return_types\x18\x0c \x03(\t\x12\x1f\n\x17normalized_return_types\x18\r \x03(\t\x12\x12\n\x05\x61sync\x18\x0e \x01(\x08H\x02\x88\x01\x01\"6\n\x08Maturity\x12\x10\n\x0c\x45XPERIMENTAL\x10\x00\x12\x08\n\x04\x42\x45TA\x10\x01\x12\x0e\n\nDEPRECATED\x10\x02\"L\n\x04Type\x12\x0c\n\x08\x46UNCTION\x10\x00\x12\x13\n\x0fINSTANCE_METHOD\x10\x01\x12\x10\n\x0c\x43LASS_METHOD\x10\x02\x12\x0f\n\x0b\x43ONSTRUCTOR\x10\x03\x42\x0e\n\x0c_descriptionB\x0b\n\t_maturityB\x08\n\x06_async\"`\n\x0f\x46\x65\x61tureRegistry\x12\x10\n\x08language\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12*\n\x08\x66\x65\x61tures\x18\x03 \x03(\x0b\x32\x18.google.adk.meta.Feature*e\n\tParamType\x12\n\n\x06OBJECT\x10\x00\x12\n\n\x06STRING\x10\x01\x12\n\n\x06NUMBER\x10\x02\x12\x0b\n\x07\x42OOLEAN\x10\x03\x12\x08\n\x04LIST\x10\x04\x12\x07\n\x03MAP\x10\x05\x12\x07\n\x03SET\x10\x06\x12\x0b\n\x07UNKNOWN\x10\x07\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0e\x66\x65\x61tures.proto\x12\x0fgoogle.adk.meta"\xc4\x01\n\x05Param\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 
\x01(\t\x12\x16\n\x0eoriginal_types\x18\x03 \x03(\t\x12\x34\n\x10normalized_types\x18\x04 \x03(\x0e\x32\x1a.google.adk.meta.ParamType\x12\x18\n\x0b\x64\x65scription\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bis_optional\x18\x06 \x01(\x08\x42\x0e\n\x0c_description"\xdc\x04\n\x07\x46\x65\x61ture\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 \x01(\t\x12\x18\n\x0b\x64\x65scription\x18\x03 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tmember_of\x18\x04 \x01(\t\x12\x1c\n\x14normalized_member_of\x18\x05 \x01(\t\x12\x38\n\x08maturity\x18\x06 \x01(\x0e\x32!.google.adk.meta.Feature.MaturityH\x01\x88\x01\x01\x12+\n\x04type\x18\x07 \x01(\x0e\x32\x1d.google.adk.meta.Feature.Type\x12\x11\n\tfile_path\x18\x08 \x01(\t\x12\x11\n\tnamespace\x18\t \x01(\t\x12\x1c\n\x14normalized_namespace\x18\n \x01(\t\x12*\n\nparameters\x18\x0b \x03(\x0b\x32\x16.google.adk.meta.Param\x12\x1d\n\x15original_return_types\x18\x0c \x03(\t\x12\x1f\n\x17normalized_return_types\x18\r \x03(\t\x12\x12\n\x05\x61sync\x18\x0e \x01(\x08H\x02\x88\x01\x01"6\n\x08Maturity\x12\x10\n\x0c\x45XPERIMENTAL\x10\x00\x12\x08\n\x04\x42\x45TA\x10\x01\x12\x0e\n\nDEPRECATED\x10\x02"L\n\x04Type\x12\x0c\n\x08\x46UNCTION\x10\x00\x12\x13\n\x0fINSTANCE_METHOD\x10\x01\x12\x10\n\x0c\x43LASS_METHOD\x10\x02\x12\x0f\n\x0b\x43ONSTRUCTOR\x10\x03\x42\x0e\n\x0c_descriptionB\x0b\n\t_maturityB\x08\n\x06_async"`\n\x0f\x46\x65\x61tureRegistry\x12\x10\n\x08language\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12*\n\x08\x66\x65\x61tures\x18\x03 \x03(\x0b\x32\x18.google.adk.meta.Feature*e\n\tParamType\x12\n\n\x06OBJECT\x10\x00\x12\n\n\x06STRING\x10\x01\x12\n\n\x06NUMBER\x10\x02\x12\x0b\n\x07\x42OOLEAN\x10\x03\x12\x08\n\x04LIST\x10\x04\x12\x07\n\x03MAP\x10\x05\x12\x07\n\x03SET\x10\x06\x12\x0b\n\x07UNKNOWN\x10\x07\x62\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'features_pb2', _globals) 
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "features_pb2", _globals) if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals['_PARAMTYPE']._serialized_start=939 - _globals['_PARAMTYPE']._serialized_end=1040 - _globals['_PARAM']._serialized_start=36 - _globals['_PARAM']._serialized_end=232 - _globals['_FEATURE']._serialized_start=235 - _globals['_FEATURE']._serialized_end=839 - _globals['_FEATURE_MATURITY']._serialized_start=668 - _globals['_FEATURE_MATURITY']._serialized_end=722 - _globals['_FEATURE_TYPE']._serialized_start=724 - _globals['_FEATURE_TYPE']._serialized_end=800 - _globals['_FEATUREREGISTRY']._serialized_start=841 - _globals['_FEATUREREGISTRY']._serialized_end=937 + DESCRIPTOR._loaded_options = None + _globals["_PARAMTYPE"]._serialized_start = 939 + _globals["_PARAMTYPE"]._serialized_end = 1040 + _globals["_PARAM"]._serialized_start = 36 + _globals["_PARAM"]._serialized_end = 232 + _globals["_FEATURE"]._serialized_start = 235 + _globals["_FEATURE"]._serialized_end = 839 + _globals["_FEATURE_MATURITY"]._serialized_start = 668 + _globals["_FEATURE_MATURITY"]._serialized_end = 722 + _globals["_FEATURE_TYPE"]._serialized_start = 724 + _globals["_FEATURE_TYPE"]._serialized_end = 800 + _globals["_FEATUREREGISTRY"]._serialized_start = 841 + _globals["_FEATUREREGISTRY"]._serialized_end = 937 # @@protoc_insertion_point(module_scope) diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 61277e9..08bd401 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -146,24 +146,29 @@ def _fuzzy_match_namespaces( from jellyfish import jaro_winkler_similarity base_namespaces = set(features_base.keys()) - target_remap = {} - remapped_features = defaultdict(list) + remapped_features = defaultdict(list, {k: [] for k in features_base}) for t_ns, features in features_target.items(): if t_ns in base_namespaces: 
remapped_features[t_ns].extend(features) continue + if not base_namespaces: + # No base to match against, so keep original target namespace + remapped_features[t_ns].extend(features) + continue + best_match, best_score = max( - ((b_ns, jaro_winkler_similarity(t_ns, b_ns)) for b_ns in base_namespaces), + ( + (b_ns, jaro_winkler_similarity(t_ns, b_ns)) + for b_ns in base_namespaces + ), key=lambda item: item[1], default=(None, 0.0), ) if best_score > 0.8 and best_match: remapped_features[best_match].extend(features) - else: - remapped_features[t_ns].extend(features) features_target.clear() features_target.update(remapped_features) @@ -176,7 +181,7 @@ def match_registries( report_type: str = "symmetric", ) -> MatchResult: """Matches features and generates a master report + module sub-reports.""" - + features_base = _group_features_by_module(base_registry) features_target = _group_features_by_module(target_registry) _fuzzy_match_namespaces(features_base, features_target) @@ -196,7 +201,12 @@ def match_registries( if report_type == "raw": return _generate_raw_report( - base_registry, target_registry, all_modules, features_base, features_target, alpha + base_registry, + target_registry, + all_modules, + features_base, + features_target, + alpha, ) return _generate_markdown_report( @@ -232,34 +242,34 @@ def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: ns = f.namespace or "" if not ns and f.normalized_namespace: ns = f.normalized_namespace - + mem = f.member_of or "" if not mem and f.normalized_member_of: mem = f.normalized_member_of if mem.lower() == "null": mem = "" - + name = f.original_name or f.normalized_name or "" return ns, mem, name def escape_csv(s): if s is None: return "" - if ',' in s or '"' in s or '\n' in s: + if "," in s or '"' in s or "\n" in s: return f'"{s.replace("\"", "\"\"")}"' return s for module in all_modules: base_list = features_base.get(module, []) target_list = features_target.get(module, []) - + solid_matches = 
match_features(base_list, target_list, alpha) beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) potential_matches = match_features(base_list, target_list, beta) - + unmatched_base = base_list unmatched_target = target_list - + for f_base, f_target, score in solid_matches: b_ns, b_mem, b_name = get_feature_cols(f_base) t_ns, t_mem, t_name = get_feature_cols(f_target) @@ -313,14 +323,16 @@ def _generate_markdown_report( master_lines = [] title_suffix = "Symmetric" if report_type == "symmetric" else "Directional" - master_lines.extend([ - f"# Feature Matching Report: {title_suffix}", - f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", - "", - f"**Base:** {base_registry.language} ({base_registry.version})", - f"**Target:** {target_registry.language} ({target_registry.version})", - ]) - + master_lines.extend( + [ + f"# Feature Matching Report: {title_suffix}", + f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + f"**Base:** {base_registry.language} ({base_registry.version})", + f"**Target:** {target_registry.language} ({target_registry.version})", + ] + ) + global_score_idx = len(master_lines) master_lines.append("GLOBAL_SCORE_PLACEHOLDER") master_lines.append("") @@ -382,7 +394,7 @@ def _generate_markdown_report( total_solid_matches, total_base_features ) parity_score = stats.calculate_f1(precision, recall) - + global_stats = ( "| Metric | Score |\n" "|---|---|\n" @@ -398,7 +410,7 @@ def _generate_markdown_report( return MatchResult( master_content="\n".join(master_lines).strip(), - module_files=module_files + module_files=module_files, ) @@ -432,7 +444,9 @@ def _process_module( recall = stats.calculate_recall(mod_solid_count, mod_base_count) mod_score = stats.calculate_f1(precision, recall) - status_icon = "✅" if mod_score == 1.0 else "⚠️" if mod_score >= 0.8 else "❌" + status_icon = ( + "✅" if mod_score == 1.0 else "⚠️" if mod_score >= 0.8 else "❌" + ) module_safe_name = module.replace(".", "_") module_filename = f"{module_safe_name}.md" @@ -449,43 
+463,89 @@ def _process_module( row_content = f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | {details_link} |" # Module Content - mod_lines = [f"# Module: `{module}`", "[⬅️ Back to Master Report](../{master_report})", "", f"**Score:** {mod_score:.2%} ({status_icon})"] + mod_lines = [ + f"# Module: `{module}`", + "[⬅️ Back to Master Report](../{master_report})", + "", + f"**Score:** {mod_score:.2%} ({status_icon})", + ] if report_type == "directional": - mod_lines.extend([ - "| Metric | Score |", - "|---|---|", - f"| **Precision** | {precision:.2%} |", - f"| **Recall** | {recall:.2%} |", - ]) - - mod_total_features = (mod_base_count + mod_target_count - mod_solid_count) if report_type == "symmetric" else mod_base_count + mod_lines.extend( + [ + "| Metric | Score |", + "|---|---|", + f"| **Precision** | {precision:.2%} |", + f"| **Recall** | {recall:.2%} |", + ] + ) + + mod_total_features = ( + (mod_base_count + mod_target_count - mod_solid_count) + if report_type == "symmetric" + else mod_base_count + ) mod_lines.extend(["", f"**Features:** {mod_total_features}", ""]) - solid_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) - potential_matches.sort(key=lambda x: (get_type_priority(x[0]), x[0].normalized_name)) + solid_matches.sort( + key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) + ) + potential_matches.sort( + key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) + ) if solid_matches: - mod_lines.append(f"### ✅ {'Solid' if report_type == 'symmetric' else 'Matched'} Features") - mod_lines.extend(["| Type | Base Feature | Target Feature | Similarity Score |", "|---|---|---|---|"]) - mod_lines.extend([f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" for f_base, f_target, score in solid_matches]) + mod_lines.append( + f"### ✅ {'Solid' if report_type == 'symmetric' else 'Matched'} Features" + ) + mod_lines.extend( + [ + "| Type | Base 
Feature | Target Feature | Similarity Score |", + "|---|---|---|---|", + ] + ) + mod_lines.extend( + [ + f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" + for f_base, f_target, score in solid_matches + ] + ) mod_lines.append("") if potential_matches: - mod_lines.extend([ - "### ⚠️ Potential Matches", - "| Type | Base Feature | Closest Target Candidate | Similarity |", - "|---|---|---|---|", - ]) - mod_lines.extend([f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" for f_base, f_target, score in potential_matches]) + mod_lines.extend( + [ + "### ⚠️ Potential Matches", + "| Type | Base Feature | Closest Target Candidate | Similarity |", + "|---|---|---|---|", + ] + ) + mod_lines.extend( + [ + f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" + for f_base, f_target, score in potential_matches + ] + ) mod_lines.append("") if report_type == "symmetric" and (unmatched_base or unmatched_target): - mod_lines.extend(["### ❌ Unmatched Features", "| Missing Feature | Missing In |", "|---|---|"]) - mod_lines.extend([f"| `{format_feature(f)}` | Target |" for f in unmatched_base]) - mod_lines.extend([f"| `{format_feature(f)}` | Base |" for f in unmatched_target]) + mod_lines.extend( + [ + "### ❌ Unmatched Features", + "| Missing Feature | Missing In |", + "|---|---|", + ] + ) + mod_lines.extend( + [f"| `{format_feature(f)}` | Target |" for f in unmatched_base] + ) + mod_lines.extend( + [f"| `{format_feature(f)}` | Base |" for f in unmatched_target] + ) mod_lines.append("") elif report_type == "directional" and unmatched_base: - mod_lines.extend(["### ❌ Missing in Target", "| Missing Feature |", "|---|"]) + mod_lines.extend( + ["### ❌ Missing in Target", "| Missing Feature |", "|---|"] + ) mod_lines.extend([f"| `{format_feature(f)}` |" for f in unmatched_base]) mod_lines.append("") diff --git 
a/src/google/adk/scope/utils/normalizer.py b/src/google/adk/scope/utils/normalizer.py index 824936e..ab974a1 100644 --- a/src/google/adk/scope/utils/normalizer.py +++ b/src/google/adk/scope/utils/normalizer.py @@ -5,6 +5,7 @@ import re from typing import List + def normalize_name(name: str) -> str: """Convert name to snake_case (e.g. CamelCase -> camel_case).""" name = name.replace("-", "_") @@ -17,9 +18,9 @@ class TypeNormalizer: def normalize(self, type_name: str, language: str) -> List[str]: """Normalize a type name for a given language.""" - if language == 'python': + if language == "python": return self._normalize_py_type(type_name) - elif language == 'typescript': + elif language == "typescript": return self._normalize_ts_type(type_name) # Add placeholders for future languages like 'java' and 'go' # Fallback for unknown languages: only normalize if it's a known simple @@ -89,8 +90,14 @@ def _normalize_ts_type(self, t: str) -> List[str]: if base == "Promise": return self._normalize_ts_type(inner) if base in ( - "Array", "ReadonlyArray", "Generator", "AsyncGenerator", - "Iterable", "Iterator", "AsyncIterable", "AsyncIterator" + "Array", + "ReadonlyArray", + "Generator", + "AsyncGenerator", + "Iterable", + "Iterator", + "AsyncIterable", + "AsyncIterator", ): return ["LIST"] if base == "Map": @@ -131,8 +138,16 @@ def _simple_normalize(self, t: str) -> str: if t == "none": return "null" if t in ( - "list", "array", "slice", "vector", "generator", "asyncgenerator", - "iterable", "iterator", "asynciterable", "asynciterator" + "list", + "array", + "slice", + "vector", + "generator", + "asyncgenerator", + "iterable", + "iterator", + "asynciterable", + "asynciterator", ): return "LIST" if t in ("set",): diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index 7ee4288..ae5bd81 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -18,6 +18,7 @@ "return_type": 0.10, } + class 
SimilarityScorer: """Calculates a similarity score between two features.""" @@ -42,6 +43,7 @@ def __init__( def _fuzzy_type_match(self, types1: list, types2: list) -> float: """Calculates a fuzzy similarity score between two lists of types.""" + def _to_str_set(type_list): res = set() for t in type_list: @@ -54,26 +56,28 @@ def _to_str_set(type_list): else: res.add(str(t).upper()) return res - + set1 = _to_str_set(types1) set2 = _to_str_set(types2) - + if not set1 and not set2: return 1.0 if not set1 or not set2: return 0.0 - + if set1 == set2: return 1.0 - + # Check the best match between any pair of types best_score = 0.0 for t1 in set1: for t2 in set2: if t1 == t2: score = 1.0 - elif ({t1, t2} == {"MAP", "OBJECT"} or - {t1, t2} == {"MAP", "ANY"}): + elif {t1, t2} == {"MAP", "OBJECT"} or {t1, t2} == { + "MAP", + "ANY", + }: score = 0.4 elif t1 in ("UNKNOWN", "ANY") or t2 in ("UNKNOWN", "ANY"): score = 0.3 @@ -81,10 +85,10 @@ def _to_str_set(type_list): score = 0.2 else: score = 0.0 - + if score > best_score: best_score = score - + return best_score def _calculate_param_similarity( @@ -182,19 +186,22 @@ def get_similarity_score( current_weights["member_of"] += current_weights["name"] current_weights["name"] = 0.0 logger.debug( - "Both CONSTRUCTOR. " - f"Adjusted weights: {current_weights}" + "Both CONSTRUCTOR. " f"Adjusted weights: {current_weights}" ) - elif t1 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD) and \ - t2 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): + elif t1 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD) and t2 in ( + FeatureType.FUNCTION, + FeatureType.CLASS_METHOD, + ): current_weights["member_of"] /= 2.0 current_weights["name"] += current_weights["member_of"] logger.debug( "Both FUNCTION/CLASS_METHOD. 
" f"Adjusted weights: {current_weights}" ) - elif t1 == FeatureType.INSTANCE_METHOD and \ - t2 == FeatureType.INSTANCE_METHOD: + elif ( + t1 == FeatureType.INSTANCE_METHOD + and t2 == FeatureType.INSTANCE_METHOD + ): logger.debug( "Both INSTANCE_METHOD. " f"Using default weights: {current_weights}" @@ -202,7 +209,7 @@ def get_similarity_score( pass # Keep default weights else: logger.debug(f"Incompatible types: {t1} vs {t2}. Returning 0.0") - return 0.0 # Fast out for incompatible types + return 0.0 # Fast out for incompatible types # 2. Similarity Calculations scores = { diff --git a/test/adk/scope/extractors/test_converter_py.py b/test/adk/scope/extractors/test_converter_py.py index 107b45e..d1447ec 100644 --- a/test/adk/scope/extractors/test_converter_py.py +++ b/test/adk/scope/extractors/test_converter_py.py @@ -821,8 +821,8 @@ def test_docstring_extended(self): node = self.create_mock_node( "function_definition", children=[name, body] ) - node.child_by_field_name.side_effect = ( - lambda n: name if n == "name" else (body if n == "body" else None) + node.child_by_field_name.side_effect = lambda n: ( + name if n == "name" else (body if n == "body" else None) ) result = self.processor.process(node, self.file_path, self.repo_root) diff --git a/test/adk/scope/extractors/test_converter_ts.py b/test/adk/scope/extractors/test_converter_ts.py index cf67f81..e83f4d6 100644 --- a/test/adk/scope/extractors/test_converter_ts.py +++ b/test/adk/scope/extractors/test_converter_ts.py @@ -566,8 +566,8 @@ def child(n): node_s = self.create_mock_node( "method_definition", children=[set_kw, name_s] ) - node_s.child_by_field_name.side_effect = ( - lambda n: name_s if n == "name" else None + node_s.child_by_field_name.side_effect = lambda n: ( + name_s if n == "name" else None ) self.assertIsNone( @@ -595,18 +595,14 @@ def test_parameter_modes_rest(self): node = self.create_mock_node( "function_declaration", children=[name, params] ) - node.child_by_field_name.side_effect = ( - 
lambda n: name - if n == "name" - else (params if n == "parameters" else None) + node.child_by_field_name.side_effect = lambda n: ( + name if n == "name" else (params if n == "parameters" else None) ) result = self.processor.process(node, self.file_path, self.repo_root) self.assertEqual(len(result.parameters), 1) self.assertEqual(result.parameters[0].original_name, "args") - - def test_abstract_and_interfaces(self): # abstract class method # interface method @@ -616,8 +612,8 @@ def test_abstract_and_interfaces(self): abs_name = self.create_mock_node("identifier", text="Abs") abs_name.field_name = "name" abs_class.children = [abs_name] - abs_class.child_by_field_name.side_effect = ( - lambda n: abs_name if n == "name" else None + abs_class.child_by_field_name.side_effect = lambda n: ( + abs_name if n == "name" else None ) method_name = self.create_mock_node( @@ -627,8 +623,8 @@ def test_abstract_and_interfaces(self): method_node = self.create_mock_node( "method_definition", children=[method_name], parent=abs_class ) - method_node.child_by_field_name.side_effect = ( - lambda n: method_name if n == "name" else None + method_node.child_by_field_name.side_effect = lambda n: ( + method_name if n == "name" else None ) result = self.processor.process( @@ -641,8 +637,8 @@ def test_abstract_and_interfaces(self): iface_name = self.create_mock_node("identifier", text="IFace") iface_name.field_name = "name" iface.children = [iface_name] - iface.child_by_field_name.side_effect = ( - lambda n: iface_name if n == "name" else None + iface.child_by_field_name.side_effect = lambda n: ( + iface_name if n == "name" else None ) # Interface method might be method_signature in TS, @@ -662,8 +658,8 @@ def test_abstract_and_interfaces(self): method_node_i = self.create_mock_node( "method_definition", children=[method_name], parent=iface ) - method_node_i.child_by_field_name.side_effect = ( - lambda n: method_name if n == "name" else None + method_node_i.child_by_field_name.side_effect = 
lambda n: ( + method_name if n == "name" else None ) result_i = self.processor.process( @@ -735,8 +731,8 @@ def test_jsdoc_with_decorator_interleaved(self): func = self.create_mock_node( "function_declaration", children=[name], prev_sibling=deco ) - func.child_by_field_name.side_effect = ( - lambda n: name if n == "name" else None + func.child_by_field_name.side_effect = lambda n: ( + name if n == "name" else None ) result = self.processor.process(func, self.file_path, self.repo_root) diff --git a/test/adk/scope/extractors/test_extract.py b/test/adk/scope/extractors/test_extract.py index c3009ac..d299e69 100644 --- a/test/adk/scope/extractors/test_extract.py +++ b/test/adk/scope/extractors/test_extract.py @@ -142,7 +142,7 @@ def test_input_file_mode(self): self.configure_args(lang="python", input_file=str(f)) self.mock_py_extractor.extract_features.return_value = [ - feature_pb2.Feature(original_name='dummy') + feature_pb2.Feature(original_name="dummy") ] self.mock_py_extractor.get_version.return_value = "1.0" @@ -167,7 +167,7 @@ def test_input_dir_mode(self): self.mock_py_extractor.find_files.return_value = [d / "a.py"] self.mock_py_extractor.extract_features.return_value = [ - feature_pb2.Feature(original_name='dummy') + feature_pb2.Feature(original_name="dummy") ] self.mock_py_extractor.get_version.return_value = "1.0" @@ -186,7 +186,7 @@ def test_input_repo_mode(self): self.mock_py_extractor.find_files.return_value = [r / "src" / "a.py"] self.mock_py_extractor.extract_features.return_value = [ - feature_pb2.Feature(original_name='dummy') + feature_pb2.Feature(original_name="dummy") ] self.mock_py_extractor.get_version.return_value = "1.0" diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index d78b207..4322942 100644 --- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -100,7 +100,7 @@ def test_match_registries(self): normalized_namespace="n_same", 
type=features_pb2.Feature.Type.INSTANCE_METHOD, ) - + # f_near_base & f_near_target are a near miss # (different names, same structural namespace/class) # Using different return types and different enough names to @@ -152,12 +152,12 @@ def test_match_registries(self): base_registry, target_registry, 0.9, report_type="symmetric" ) report_sym = result_sym.master_content - + # 1. Verify Master Report Structure self.assertIn("# Feature Matching Report: Symmetric", report_sym) - self.assertIn("**Global Jaccard Index:** 25.00%", report_sym) + self.assertIn("**Jaccard Index:** 25.00%", report_sym) self.assertIn("## Module Summary", report_sym) - + # Check for module entry in master summary self.assertIn("| `n_same` |", report_sym) self.assertIn("[View Details]({modules_dir}/n_same.md)", report_sym) @@ -165,26 +165,26 @@ def test_match_registries(self): # 2. Verify Module Content self.assertIn("n_same.md", result_sym.module_files) module_content = result_sym.module_files["n_same.md"] - + self.assertIn("# Module: `n_same`", module_content) self.assertIn("**Features:** 3", module_content) - + # Solid Matches self.assertIn("### ✅ Solid Features", module_content) self.assertIn( "| Type | Base Feature | Target Feature | Similarity Score |", - module_content + module_content, ) self.assertIn( "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - module_content + module_content, ) - + # Potential Matches (formerly Near Misses) self.assertIn("### ⚠️ Potential Matches", module_content) self.assertIn( "| Type | Base Feature | Closest Target Candidate | Similarity |", - module_content + module_content, ) self.assertIn( "| method | `base_member.base_name` | " @@ -204,28 +204,28 @@ def test_match_registries(self): base_registry, target_registry, 0.9, report_type="directional" ) report_dir = result_dir.master_content - - self.assertIn("| **Global F1 Score** | 40.00% |", report_dir) + + self.assertIn("| **F1 Score** | 40.00% |", report_dir) self.assertIn("n_same.md", 
result_dir.module_files) - + mod_dir_content = result_dir.module_files["n_same.md"] # Solid Matches self.assertIn("### ✅ Matched Features", mod_dir_content) self.assertIn( "| Type | Base Feature | Target Feature | Similarity Score |", - mod_dir_content + mod_dir_content, ) self.assertIn( "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - mod_dir_content + mod_dir_content, ) # Potential Matches self.assertIn("### ⚠️ Potential Matches", mod_dir_content) self.assertIn( "| Type | Base Feature | Closest Target Candidate | Similarity |", - mod_dir_content + mod_dir_content, ) self.assertIn( "| method | `base_member.base_name` | " @@ -233,7 +233,6 @@ def test_match_registries(self): mod_dir_content, ) - # Unmatched / Gaps (in 'stuff' module) self.assertIn("stuff.md", result_dir.module_files) stuff_dir_content = result_dir.module_files["stuff.md"] @@ -253,16 +252,16 @@ def test_match_registries_raw(self): base.features.append(f1) target = features_pb2.FeatureRegistry(language="TS", version="2") target.features.append(f1) - + result = matcher.match_registries(base, target, 0.9, report_type="raw") csv_content = result.master_content - + expected_header = ( - "base_namespace,base_member_of,base_name,target_namespace," - "target_member_of,target_name,type,score" + "py_namespace,py_member_of,py_name,ts_namespace," + "ts_member_of,ts_name,type,score" ) self.assertIn(expected_header, csv_content) - + # Check for solid match line # f1 has: ns=pkg, mem=MyClass, name=f_same # Match should have same values for base and target @@ -286,16 +285,12 @@ def test_group_features_by_module(self): self.assertEqual(len(result["module.one"]), 2) self.assertEqual(len(result["module.two"]), 1) - def test_fuzzy_match_namespaces(self): - features_base = { - "module.one": [], - "module.two": [] - } + features_base = {"module.one": [], "module.two": []} features_target = { "module.one": [features_pb2.Feature(original_name="f1_target")], "module.ones": 
[features_pb2.Feature(original_name="f4")], - "module.three": [features_pb2.Feature(original_name="f5")] + "module.three": [features_pb2.Feature(original_name="f5")], } matcher._fuzzy_match_namespaces(features_base, features_target) @@ -304,9 +299,8 @@ def test_fuzzy_match_namespaces(self): self.assertIn("module.two", features_target) self.assertNotIn("module.ones", features_target) self.assertNotIn("module.three", features_target) - self.assertEqual(len(features_target["module.one"]), 2) - self.assertEqual(len(features_target["module.two"]), 1) - + self.assertEqual(len(features_target["module.one"]), 3) + self.assertEqual(len(features_target["module.two"]), 0) def test_process_module(self): """Tests the end-to-end processing of a single module.""" @@ -348,7 +342,6 @@ def test_process_module(self): self.assertIn("# Module: `n1`", result["module_content"]) self.assertIn("### ✅ Solid Features", result["module_content"]) - def test_generate_raw_report(self): """Tests the raw CSV report generation.""" f_base = features_pb2.Feature( @@ -359,25 +352,37 @@ def test_generate_raw_report(self): type=features_pb2.Feature.Type.FUNCTION, ) + base_registry = features_pb2.FeatureRegistry( + language="Python", version="1.0.0" + ) + target_registry = features_pb2.FeatureRegistry( + language="TypeScript", version="2.0.0" + ) + with patch( "google.adk.scope.matcher.matcher.match_features" ) as mock_match: mock_match.return_value = [] # No matches for simplicity result = matcher._generate_raw_report( + base_registry=base_registry, + target_registry=target_registry, all_modules=["n1"], features_base={"n1": [f_base]}, features_target={"n1": []}, alpha=0.9, ) - self.assertIn("base_namespace,base_member_of,base_name", result.master_content) + self.assertIn( + "py_namespace,py_member_of,py_name", result.master_content + ) self.assertIn("n1,c1,f1_base", result.master_content) - def test_generate_markdown_report(self): """Tests the markdown report generation.""" - base_registry = 
features_pb2.FeatureRegistry(language="Python", version="1.0.0") + base_registry = features_pb2.FeatureRegistry( + language="Python", version="1.0.0" + ) target_registry = features_pb2.FeatureRegistry( language="TypeScript", version="2.0.0" ) @@ -403,11 +408,24 @@ def test_generate_markdown_report(self): report_type="symmetric", ) - self.assertIn("# Feature Matching Report: Symmetric", result.master_content) + self.assertIn( + "# Feature Matching Report: Symmetric", result.master_content + ) self.assertIn("## Module Summary", result.master_content) self.assertIn("| `n1` |", result.master_content) self.assertIn("n1.md", result.module_files) + def test_fuzzy_match_namespaces_empty_base(self): + features_base = {} + features_target = { + "module.one": [features_pb2.Feature(original_name="f1")] + } + + matcher._fuzzy_match_namespaces(features_base, features_target) + + self.assertIn("module.one", features_target) + self.assertEqual(len(features_target["module.one"]), 1) + if __name__ == "__main__": unittest.main() diff --git a/test/adk/scope/utils/test_args.py b/test/adk/scope/utils/test_args.py index 3db3b4c..d1ac3aa 100644 --- a/test/adk/scope/utils/test_args.py +++ b/test/adk/scope/utils/test_args.py @@ -11,43 +11,36 @@ class ArgsTest(unittest.TestCase): def test_add_verbose_argument(self): parser = argparse.ArgumentParser() adk_args.add_verbose_argument(parser) - args = parser.parse_args(['--verbose']) + args = parser.parse_args(["--verbose"]) self.assertTrue(args.verbose) - @mock.patch('logging.basicConfig') + @mock.patch("logging.basicConfig") def test_configure_logging_verbose(self, mock_basic_config): args = argparse.Namespace(verbose=True) adk_args.configure_logging(args) mock_basic_config.assert_called_once_with(level=logging.DEBUG) - @mock.patch('logging.basicConfig') + @mock.patch("logging.basicConfig") def test_configure_logging_default(self, mock_basic_config): args = argparse.Namespace(verbose=False) adk_args.configure_logging(args) 
mock_basic_config.assert_called_once_with(level=logging.INFO) - @mock.patch('argparse.ArgumentParser.parse_args') + @mock.patch("argparse.ArgumentParser.parse_args") def test_parse_args_python(self, mock_parse_args): mock_parse_args.return_value = argparse.Namespace( - language='py', - input_file='test.py', - output='out', - verbose=False + language="py", input_file="test.py", output="out", verbose=False ) parsed_args = adk_args.parse_args() - self.assertEqual(parsed_args.language, 'python') + self.assertEqual(parsed_args.language, "python") - @mock.patch('argparse.ArgumentParser.parse_args') + @mock.patch("argparse.ArgumentParser.parse_args") def test_parse_args_typescript(self, mock_parse_args): mock_parse_args.return_value = argparse.Namespace( - language='ts', - input_file='test.ts', - output='out', - verbose=False + language="ts", input_file="test.ts", output="out", verbose=False ) parsed_args = adk_args.parse_args() - self.assertEqual(parsed_args.language, 'typescript') - + self.assertEqual(parsed_args.language, "typescript") def test_arg_definitions(self): # Verify that the parser is set up with correct arguments @@ -92,5 +85,6 @@ def test_arg_definitions(self): args, _ = parser_calls[2] self.assertEqual(args[0], "--verbose") -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/test/adk/scope/utils/test_normalizer.py b/test/adk/scope/utils/test_normalizer.py index 1ecf766..c1510e8 100644 --- a/test/adk/scope/utils/test_normalizer.py +++ b/test/adk/scope/utils/test_normalizer.py @@ -1,6 +1,7 @@ import unittest from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name + class TestStrings(unittest.TestCase): def test_normalize_name(self): # CamelCase @@ -43,8 +44,10 @@ def test_python_normalization(self): self.assertEqual(self.normalizer.normalize("set", "python"), ["SET"]) self.assertEqual(self.normalizer.normalize("None", "python"), ["null"]) self.assertEqual(self.normalizer.normalize("any", "python"), 
["OBJECT"]) - self.assertEqual(self.normalizer.normalize( - "Optional[str]", "python"), ["STRING", "null"]) + self.assertEqual( + self.normalizer.normalize("Optional[str]", "python"), + ["STRING", "null"], + ) self.assertEqual( self.normalizer.normalize("Union[str, int]", "python"), ["STRING", "NUMBER"], @@ -73,8 +76,10 @@ def test_typescript_normalization(self): self.assertEqual( self.normalizer.normalize("Array", "typescript"), ["LIST"] ) - self.assertEqual(self.normalizer.normalize( - "Map", "typescript"), ["MAP"]) + self.assertEqual( + self.normalizer.normalize("Map", "typescript"), + ["MAP"], + ) self.assertEqual( self.normalizer.normalize("Set", "typescript"), ["SET"] ) @@ -104,8 +109,10 @@ def test_edge_cases(self): self.assertEqual( self.normalizer.normalize("unsupported_type", "python"), ["OBJECT"] ) - self.assertEqual(self.normalizer.normalize( - "unsupported_type", "typescript"), ["OBJECT"]) + self.assertEqual( + self.normalizer.normalize("unsupported_type", "typescript"), + ["OBJECT"], + ) self.assertEqual( self.normalizer.normalize("str", "unsupported_language"), ["STRING"] ) @@ -115,5 +122,5 @@ def test_edge_cases(self): ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test/adk/scope/utils/test_similarity.py b/test/adk/scope/utils/test_similarity.py index 719e72c..4cb0473 100644 --- a/test/adk/scope/utils/test_similarity.py +++ b/test/adk/scope/utils/test_similarity.py @@ -234,7 +234,7 @@ def test_constructor_weights(self): normalized_namespace="my_module", type=features_pb.Feature.Type.CONSTRUCTOR, ) - + # Despite name difference, should score very highly due to weight shift score = self.scorer.get_similarity_score(c1, c2) self.assertGreater(score, 0.9) @@ -245,7 +245,7 @@ def test_function_weights(self): """ f1 = features_pb.Feature( normalized_name="myFunction", - normalized_member_of="SomeClass", # Completely diff member_of + normalized_member_of="SomeClass", # Completely diff member_of 
normalized_namespace="my_module", type=features_pb.Feature.Type.FUNCTION, ) From 1de70e35b1edbc6b0288b5b1b117bda7c6bd12fe Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Fri, 6 Feb 2026 09:27:05 -0800 Subject: [PATCH 19/21] isort fixes --- src/google/adk/scope/extractors/converter_py.py | 5 ++--- src/google/adk/scope/extractors/converter_ts.py | 5 ++--- src/google/adk/scope/extractors/extract.py | 5 +++-- src/google/adk/scope/extractors/extractor_py.py | 6 +++--- src/google/adk/scope/extractors/extractor_ts.py | 8 ++++---- src/google/adk/scope/matcher/matcher.py | 16 ++++++++-------- src/google/adk/scope/utils/similarity.py | 4 ++-- test/adk/scope/extractors/test_converter_py.py | 5 +++-- test/adk/scope/extractors/test_converter_ts.py | 5 +++-- test/adk/scope/extractors/test_extract.py | 8 ++++---- test/adk/scope/extractors/test_extractor_py.py | 11 +++++------ test/adk/scope/extractors/test_extractor_ts.py | 8 ++++---- test/adk/scope/matcher/test_matcher.py | 1 + test/adk/scope/utils/test_normalizer.py | 1 + test/adk/scope/utils/test_stats.py | 1 + 15 files changed, 46 insertions(+), 43 deletions(-) diff --git a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index eb98694..16bc07b 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -4,13 +4,12 @@ import logging from pathlib import Path -from typing import List, Optional, Tuple, Set +from typing import List, Optional, Set, Tuple from tree_sitter import Node -from google.adk.scope.utils.normalizer import normalize_name, TypeNormalizer - from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name logger = logging.getLogger(__name__) diff --git a/src/google/adk/scope/extractors/converter_ts.py b/src/google/adk/scope/extractors/converter_ts.py index 24508ac..26d956c 100644 --- a/src/google/adk/scope/extractors/converter_ts.py 
+++ b/src/google/adk/scope/extractors/converter_ts.py @@ -5,13 +5,12 @@ import logging import re from pathlib import Path -from typing import List, Optional, Tuple, Set +from typing import List, Optional, Set, Tuple from tree_sitter import Node -from google.adk.scope.utils.normalizer import normalize_name -from google.adk.scope.utils.normalizer import TypeNormalizer from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name logger = logging.getLogger(__name__) diff --git a/src/google/adk/scope/extractors/extract.py b/src/google/adk/scope/extractors/extract.py index a064ba3..809c738 100644 --- a/src/google/adk/scope/extractors/extract.py +++ b/src/google/adk/scope/extractors/extract.py @@ -4,10 +4,11 @@ import yaml from google.protobuf import text_format -from google.protobuf.json_format import MessageToJson, MessageToDict +from google.protobuf.json_format import MessageToDict, MessageToJson + +from google.adk.scope.extractors import extractor_py, extractor_ts from google.adk.scope.features_pb2 import FeatureRegistry from google.adk.scope.utils.args import parse_args -from google.adk.scope.extractors import extractor_py, extractor_ts logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" diff --git a/src/google/adk/scope/extractors/extractor_py.py b/src/google/adk/scope/extractors/extractor_py.py index 2ac14a5..97313c1 100644 --- a/src/google/adk/scope/extractors/extractor_py.py +++ b/src/google/adk/scope/extractors/extractor_py.py @@ -2,11 +2,11 @@ import pathlib from typing import Iterator, List -from google.adk.scope.extractors.converter_py import NodeProcessor -from google.adk.scope.features_pb2 import Feature -from tree_sitter import Language, Parser, Query, QueryCursor import tree_sitter_python as tspy +from tree_sitter import Language, Parser, Query, QueryCursor +from google.adk.scope.extractors.converter_py import NodeProcessor +from 
google.adk.scope.features_pb2 import Feature SRC_DIR = "src" diff --git a/src/google/adk/scope/extractors/extractor_ts.py b/src/google/adk/scope/extractors/extractor_ts.py index c84b887..eca4251 100644 --- a/src/google/adk/scope/extractors/extractor_ts.py +++ b/src/google/adk/scope/extractors/extractor_ts.py @@ -2,11 +2,11 @@ import pathlib from typing import Iterator, List -from google.adk.scope.features_pb2 import Feature -from google.adk.scope.extractors.converter_ts import NodeProcessor - -from tree_sitter import Language, Parser, Query, QueryCursor import tree_sitter_typescript as tsts +from tree_sitter import Language, Parser, Query, QueryCursor + +from google.adk.scope.extractors.converter_ts import NodeProcessor +from google.adk.scope.features_pb2 import Feature # Initialize Tree-sitter try: diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 08bd401..3348156 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -1,19 +1,19 @@ import argparse -import logging - -from google.adk.scope.utils import args as adk_args import dataclasses +import logging import sys -from pathlib import Path -from typing import List, Tuple, Dict from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Tuple + import numpy as np -from scipy.optimize import linear_sum_assignment from google.protobuf import text_format +from scipy.optimize import linear_sum_assignment + from google.adk.scope import features_pb2 -from google.adk.scope.utils.similarity import SimilarityScorer +from google.adk.scope.utils import args as adk_args from google.adk.scope.utils import stats - +from google.adk.scope.utils.similarity import SimilarityScorer _NEAR_MISS_THRESHOLD = 0.15 diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index ae5bd81..5fd7296 100644 --- a/src/google/adk/scope/utils/similarity.py +++ 
b/src/google/adk/scope/utils/similarity.py @@ -1,8 +1,8 @@ -from typing import Optional import logging +from typing import Optional -from jellyfish import jaro_winkler_similarity import numpy as np +from jellyfish import jaro_winkler_similarity from scipy.optimize import linear_sum_assignment from google.adk.scope import features_pb2 as features_pb diff --git a/test/adk/scope/extractors/test_converter_py.py b/test/adk/scope/extractors/test_converter_py.py index d1447ec..3fb096d 100644 --- a/test/adk/scope/extractors/test_converter_py.py +++ b/test/adk/scope/extractors/test_converter_py.py @@ -1,8 +1,9 @@ import unittest -from unittest.mock import Mock, patch from pathlib import Path -from google.adk.scope.extractors.converter_py import NodeProcessor +from unittest.mock import Mock, patch + from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.extractors.converter_py import NodeProcessor from google.adk.scope.features_pb2 import Feature diff --git a/test/adk/scope/extractors/test_converter_ts.py b/test/adk/scope/extractors/test_converter_ts.py index e83f4d6..f2a579d 100644 --- a/test/adk/scope/extractors/test_converter_ts.py +++ b/test/adk/scope/extractors/test_converter_ts.py @@ -1,8 +1,9 @@ import unittest -from unittest.mock import Mock from pathlib import Path -from google.adk.scope.extractors.converter_ts import NodeProcessor +from unittest.mock import Mock + from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.extractors.converter_ts import NodeProcessor class TestNodeProcessor(unittest.TestCase): diff --git a/test/adk/scope/extractors/test_extract.py b/test/adk/scope/extractors/test_extract.py index d299e69..660662e 100644 --- a/test/adk/scope/extractors/test_extract.py +++ b/test/adk/scope/extractors/test_extract.py @@ -1,11 +1,11 @@ +import shutil +import tempfile import unittest -from unittest.mock import patch, MagicMock from pathlib import Path -import tempfile -import shutil +from unittest.mock import 
MagicMock, patch -from google.adk.scope.extractors import extract from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.extractors import extract class TestExtractHelpers(unittest.TestCase): diff --git a/test/adk/scope/extractors/test_extractor_py.py b/test/adk/scope/extractors/test_extractor_py.py index a457068..bd28bf5 100644 --- a/test/adk/scope/extractors/test_extractor_py.py +++ b/test/adk/scope/extractors/test_extractor_py.py @@ -1,11 +1,10 @@ -import unittest -from unittest.mock import patch, MagicMock import sys +import unittest from pathlib import Path -from google.adk.scope.extractors.extractor_py import ( - find_files, - extract_features, -) +from unittest.mock import MagicMock, patch + +from google.adk.scope.extractors.extractor_py import (extract_features, + find_files) from google.adk.scope.features_pb2 import Feature # Mock tree_sitter modules BEFORE importing extractor diff --git a/test/adk/scope/extractors/test_extractor_ts.py b/test/adk/scope/extractors/test_extractor_ts.py index cf750fe..f68c49a 100644 --- a/test/adk/scope/extractors/test_extractor_ts.py +++ b/test/adk/scope/extractors/test_extractor_ts.py @@ -1,14 +1,14 @@ +import shutil +import tempfile import unittest -from unittest.mock import Mock, patch from pathlib import Path -import tempfile -import shutil +from unittest.mock import Mock, patch +from google.adk.scope import features_pb2 # Import the module under test # We need to make sure the src path is in PYTHONPATH which is handled # by test runner usually from google.adk.scope.extractors import extractor_ts as extractor -from google.adk.scope import features_pb2 class TestExtractor(unittest.TestCase): diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index 4322942..bcdb96a 100644 --- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -2,6 +2,7 @@ import tempfile import unittest from unittest.mock import patch + from 
google.adk.scope import features_pb2 from google.adk.scope.matcher import matcher diff --git a/test/adk/scope/utils/test_normalizer.py b/test/adk/scope/utils/test_normalizer.py index c1510e8..e6538c6 100644 --- a/test/adk/scope/utils/test_normalizer.py +++ b/test/adk/scope/utils/test_normalizer.py @@ -1,4 +1,5 @@ import unittest + from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name diff --git a/test/adk/scope/utils/test_stats.py b/test/adk/scope/utils/test_stats.py index 36ea0b0..a98b0ab 100644 --- a/test/adk/scope/utils/test_stats.py +++ b/test/adk/scope/utils/test_stats.py @@ -1,4 +1,5 @@ import unittest + from google.adk.scope.utils import stats From 1b2809731f73a6ff6d02c018676e3b63ef08460b Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Fri, 6 Feb 2026 11:22:24 -0800 Subject: [PATCH 20/21] Fixed ruff --- .github/workflows/ci.yml | 2 +- src/google/adk/scope/matcher/matcher.py | 60 +++++++++++++++---------- test/adk/scope/matcher/test_matcher.py | 3 +- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5dea4f..f42fc16 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: # Stop the build if there are Python syntax errors or undefined names ruff check . --output-format=github --select=E9,F63,F7,F82 --target-version=py312 # Run default ruff checks - ruff check . --line-length 80 --output-format=github --exclude=.venv,features_pb2.py + ruff check . 
--line-length 80 --output-format=github --exclude=*.ipynb,features_pb2.py - name: Run Tests # Reset PYTHONPATH ensures src is in path if needed, but installed package should be found diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 3348156..91ac297 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -193,12 +193,6 @@ def match_registries( set(features_base.keys()) | set(features_target.keys()) ) - # Global Stats using Set logic for Jaccard/F1 - # We will accumulate counts as we process modules - total_solid_matches = 0 - total_base_features = len(base_registry.features) - total_target_features = len(target_registry.features) - if report_type == "raw": return _generate_raw_report( base_registry, @@ -256,7 +250,7 @@ def escape_csv(s): if s is None: return "" if "," in s or '"' in s or "\n" in s: - return f'"{s.replace("\"", "\"\"")}"' + return '"{}"'.format(s.replace('"', '""')) return s for module in all_modules: @@ -294,16 +288,16 @@ def escape_csv(s): b_ns, b_mem, b_name = get_feature_cols(f_base) f_type = get_type_display_name(f_base) csv_lines.append( - f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)},,,," - f"{escape_csv(f_type)},0.0000" + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f",,,{escape_csv(f_type)},0.0000" ) for f_target in unmatched_target: t_ns, t_mem, t_name = get_feature_cols(f_target) f_type = get_type_display_name(f_target) csv_lines.append( - f",,,{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," - f"{escape_csv(f_type)},0.0000" + f",,,{escape_csv(t_ns)},{escape_csv(t_mem)}," + f"{escape_csv(t_name)},{escape_csv(f_type)},0.0000" ) return MatchResult(master_content="\n".join(csv_lines), module_files={}) @@ -329,7 +323,8 @@ def _generate_markdown_report( f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", "", f"**Base:** {base_registry.language} ({base_registry.version})", - f"**Target:** 
{target_registry.language} ({target_registry.version})", + f"**Target:** {target_registry.language}" + f" ({target_registry.version})", ] ) @@ -383,8 +378,9 @@ def _generate_markdown_report( ) global_stats = ( f"**Jaccard Index:** {parity_score:.2%}\n\n" - "> The Jaccard Index measures the similarity between the two feature sets. " - "A score of 100% indicates that both languages have identical features." + "> The Jaccard Index measures the similarity between the " + "two feature sets. A score of 100% indicates that both languages " + "have identical features." ) else: precision = stats.calculate_precision( @@ -401,9 +397,15 @@ def _generate_markdown_report( f"| **Precision** | {precision:.2%} |\n" f"| **Recall** | {recall:.2%} |\n" f"| **F1 Score** | {parity_score:.2%} |\n\n" - "> **Precision**: Of all features in the target, how many are correct matches to the base? (High score = low number of extra features in target)\n\n" - "> **Recall**: Of all features in the base, how many were found in the target? (High score = low number of missing features in target)\n\n" - "> **F1 Score**: A weighted average of Precision and Recall, providing a single measure of how well the target feature set matches the base." + "> **Precision**: Of all features in the target, how many are " + "correct matches to the base? (High score = low number of extra " + "features in target)\n\n" + "> **Recall**: Of all features in the base, how many were found in " + "the target? (High score = low number of missing features in " + "target)\n\n" + "> **F1 Score**: A weighted average of Precision and Recall, " + "providing a single measure of how well the target feature set " + "matches the base." 
) master_lines[global_score_idx] = global_stats @@ -458,9 +460,15 @@ def _process_module( if mod_target_count > 0: adk_parts.append(target_lang_code) adk_value = ", ".join(adk_parts) - row_content = f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | {details_link} |" + row_content = ( + f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} |" + f" {status_icon} | {details_link} |" + ) else: - row_content = f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon} | {details_link} |" + row_content = ( + f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon}" + f" | {details_link} |" + ) # Module Content mod_lines = [ @@ -495,7 +503,8 @@ def _process_module( if solid_matches: mod_lines.append( - f"### ✅ {'Solid' if report_type == 'symmetric' else 'Matched'} Features" + f"### ✅ {'Solid' if report_type == 'symmetric' else 'Matched'}" + " Features" ) mod_lines.extend( [ @@ -505,7 +514,9 @@ def _process_module( ) mod_lines.extend( [ - f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" + f"| {get_type_display_name(f_base)} |" + f" `{format_feature(f_base)}`" + f" | `{format_feature(f_target)}` | {score:.2f} |" for f_base, f_target, score in solid_matches ] ) @@ -515,13 +526,16 @@ def _process_module( mod_lines.extend( [ "### ⚠️ Potential Matches", - "| Type | Base Feature | Closest Target Candidate | Similarity |", + "| Type | Base Feature | Closest Target Candidate" + " | Similarity |", "|---|---|---|---|", ] ) mod_lines.extend( [ - f"| {get_type_display_name(f_base)} | `{format_feature(f_base)}` | `{format_feature(f_target)}` | {score:.2f} |" + f"| {get_type_display_name(f_base)} |" + f" `{format_feature(f_base)}`" + f" | `{format_feature(f_target)}` | {score:.2f} |" for f_base, f_target, score in potential_matches ] ) diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py index bcdb96a..d0aae2f 100644 
--- a/test/adk/scope/matcher/test_matcher.py +++ b/test/adk/scope/matcher/test_matcher.py @@ -2,7 +2,6 @@ import tempfile import unittest from unittest.mock import patch - from google.adk.scope import features_pb2 from google.adk.scope.matcher import matcher @@ -394,7 +393,7 @@ def test_generate_markdown_report(self): mock_process.return_value = { "solid_matches_count": 1, "score": 1.0, - "row_content": "| py, ts | `n1` | 1 | 100.00% | ✅ | [View Details]({modules_dir}/n1.md) |", + "row_content": "| py, ts | `n1` | 1 | 100.00% | ✅ | n1.md |", "module_filename": "n1.md", "module_content": "# Module: `n1`", } From ef7632b8f3c5c1fc0890c56bc5cf7d8ef6b6f3f4 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Fri, 6 Feb 2026 11:32:19 -0800 Subject: [PATCH 21/21] fix flaky test --- src/google/adk/scope/matcher/matcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py index 91ac297..83f1906 100644 --- a/src/google/adk/scope/matcher/matcher.py +++ b/src/google/adk/scope/matcher/matcher.py @@ -145,7 +145,7 @@ def _fuzzy_match_namespaces( """Remaps target namespaces to base namespaces using fuzzy matching.""" from jellyfish import jaro_winkler_similarity - base_namespaces = set(features_base.keys()) + base_namespaces = sorted(list(features_base.keys())) remapped_features = defaultdict(list, {k: [] for k in features_base}) for t_ns, features in features_target.items():