diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5dea4f..f42fc16 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: # Stop the build if there are Python syntax errors or undefined names ruff check . --output-format=github --select=E9,F63,F7,F82 --target-version=py312 # Run default ruff checks - ruff check . --line-length 80 --output-format=github --exclude=.venv,features_pb2.py + ruff check . --line-length 80 --output-format=github --exclude=*.ipynb,features_pb2.py - name: Run Tests # Reset PYTHONPATH ensures src is in path if needed, but installed package should be found diff --git a/README.md b/README.md index 3ea906c..5d56875 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,60 @@ python3 -m google.adk.scope.extractors.python.extractor \ output_dir ``` +``` + +### Feature Matching + +Once you have extracted features from two languages (e.g., Python and TypeScript), you can compare them using the `match.sh` script. + +```bash +./match.sh \ + --base output/py.txtpb \ + --target output/ts.txtpb \ + --output output/ \ + --report-type directional +``` + +| Argument | Description | +| :--- | :--- | +| `--base ` | **Required.** Path to the "source of truth" feature registry (e.g., Python). | +| `--target ` | **Required.** Path to the comparison registry (e.g., TypeScript). | +| `--output ` | **Required.** Path for the output directory. The report filename is auto-generated. | +| `--report-type ` | `symmetric` (default) for Jaccard Index, `directional` for F1/Precision/Recall, or `raw` for CSV. | +| `--alpha ` | Similarity threshold (0.0 - 1.0). Default is `0.8`. | + +#### How Matching Works + +The matcher uses the **Hungarian Algorithm** to find the optimal assignment between features in the Base and Target registries. +- **Cost Function**: Based on a similarity score derived from: + - Feature Name (normalized) + - Namespace / Module + - Feature Type (Function, Method, Class, etc.) 
+- **Thresholding**: Pairs with a similarity score below `--alpha` are discarded. + +#### Understanding the Reports + +`adk-scope` can generate three types of reports to help you understand the feature overlap between two languages. + +##### Symmetric Report (`--report-type symmetric`) + +This report is best for measuring the general similarity between two feature sets, where neither is considered the "source of truth". It uses the **Jaccard Index** to calculate a global similarity score. + +- **What it measures**: The Jaccard Index measures the similarity between two sets by dividing the size of their intersection by the size of their union. The score ranges from 0% (no similarity) to 100% (identical sets). +- **What it means**: A high Jaccard Index indicates that both languages have a very similar set of features, with few features unique to either one. It penalizes both missing and extra features equally. + +##### Directional Report (`--report-type directional`) + +This report is ideal when you have a "base" or "source of truth" language and you want to measure how well a "target" language conforms to it. It uses **Precision**, **Recall**, and **F1-Score**. + +- **Precision**: Answers the question: *"Of all the features implemented in the target language, how many of them are correct matches to features in the base language?"* A low score indicates the target has many extra features not present in the base. +- **Recall**: Answers the question: *"Of all the features that should be in the target language (i.e., all features in the base), how many were actually found?"* A low score indicates the target is missing many features from the base. +- **F1-Score**: The harmonic mean of Precision and Recall, providing a single score that balances both. A high F1-Score indicates the target is a close match to the base, having most of the required features and not too many extra ones. 
+ +##### Raw Report (`--report-type raw`) + +This report provides a simple CSV output of all features (matched and unmatched) from both the base and target registries. It is useful for programmatic analysis or for importing the data into other tools. + ## Development ### Running Tests diff --git a/config.yaml b/config.yaml index 826c332..bc70cc3 100644 --- a/config.yaml +++ b/config.yaml @@ -1,42 +1,7 @@ python: exclude: - examples - - a2a - # - agents - - apps - - artifacts - - auth - - cli - - code_executors - - dependencies - - errors - - evaluation - - events - - features - - flows - - memory - - models - - planners - - platform - - plugins - - sessions - - telemetry - - tools - - utils typescript: exclude: - # - agents - - artifacts - - auth - - code_executors - - events - - examples - - memory - - models - - plugins - - runner - - sessions - - telemetry - - tools - - utils + - examples \ No newline at end of file diff --git a/match.sh b/match.sh new file mode 100755 index 0000000..970f5b1 --- /dev/null +++ b/match.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# match.sh - Wrapper script to match features between two FeatureRegistries. 
+ +set -e + +# Default values +REPORT_TYPE="symmetric" +ALPHA="0.8" +VERBOSE="" + +# Parse arguments +while [[ "$#" -gt 0 ]]; do + case "$1" in + --base) + BASE_FILE="$2" + shift 2 + ;; + --target) + TARGET_FILE="$2" + shift 2 + ;; + --output) + OUTPUT_DIR="$2" + shift 2 + ;; + --report-type) + REPORT_TYPE="$2" + shift 2 + ;; + --alpha) + ALPHA="$2" + shift 2 + ;; + -v|--verbose) + VERBOSE="--verbose" + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Extract languages +BASE_LANG_RAW=$(head -n 1 "${BASE_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') +TARGET_LANG_RAW=$(head -n 1 "${TARGET_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') + +# Function to map language to short code +get_lang_code() { + case "$1" in + PYTHON) echo "py" ;; + TYPESCRIPT) echo "ts" ;; + JAVA) echo "java" ;; + GOLANG) echo "go" ;; + *) echo "" ;; + esac +} + +BASE_LANG=$(get_lang_code "$BASE_LANG_RAW") +TARGET_LANG=$(get_lang_code "$TARGET_LANG_RAW") + +# Construct filename +if [ "$REPORT_TYPE" == "raw" ]; then + EXTENSION="csv" +else + EXTENSION="md" +fi +OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}_${REPORT_TYPE}.${EXTENSION}" +FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" + +# Determine the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Add 'src' to PYTHONPATH so the python script can find modules +export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" + +# Run the python matcher +python3 "${SCRIPT_DIR}/src/google/adk/scope/matcher/matcher.py" \ + --base "${BASE_FILE}" \ + --target "${TARGET_FILE}" \ + --output "${FULL_OUTPUT_PATH}" \ + --report-type "${REPORT_TYPE}" \ + --alpha "${ALPHA}" \ + ${VERBOSE} diff --git a/playground.ipynb b/playground.ipynb new file mode 100644 index 0000000..02fb739 --- /dev/null +++ b/playground.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8f748152", + "metadata": {}, + 
"outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# 1. Add 'src' to your Python path so we can resolve the 'google.adk.scope' imports\n", + "sys.path.insert(0, os.path.abspath(\"src\"))\n", + "\n", + "# 2. (Optional but recommended) Auto-reload modules when you change the .py files\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# 3. Import the required files\n", + "from google.adk.scope import features_pb2\n", + "from google.adk.scope.utils import similarity\n", + "from google.protobuf import text_format\n", + "import logging\n", + "\n", + "logging.basicConfig(level=logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "bb16a48a-5cda-4ec6-a10e-c8a2f2828700", + "metadata": {}, + "outputs": [], + "source": [ + "def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry:\n", + " \"\"\"Reads a FeatureRegistry from a text proto file.\"\"\"\n", + " registry = features_pb2.FeatureRegistry()\n", + " with open(file_path, \"rb\") as f:\n", + " text_format.Parse(f.read(), registry)\n", + " return registry" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "f3361359-a887-4537-b22b-fe8fec24e13a", + "metadata": {}, + "outputs": [], + "source": [ + "py_features = read_feature_registry(\"output/py.txtpb\")\n", + "ts_features = read_feature_registry(\"output/ts.txtpb\")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "2208e1dd-6647-4796-b81c-58ab7406e4c9", + "metadata": {}, + "outputs": [], + "source": [ + "py_run_async = py_features.features[0]\n", + "py_run = py_features.features[9]\n", + "ts_run_async = ts_features.features[3]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "3358d489-92bb-411a-9ec8-0f596be08c9f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:google.adk.scope.utils.similarity:Initializing SimilarityScorer with alpha=0.8 and weights={'name': 0.3, 
'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n" + ] + } + ], + "source": [ + "scorer = similarity.SimilarityScorer()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "a1b5b03b-8a0c-4614-ab4f-89140fa872d0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:google.adk.scope.utils.similarity:Comparing 'run_async' and 'run_async'\n", + "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.7457, Early exit threshold: 0.6000\n", + "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 6 vs 5 parameters\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 
'state_delta': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'user_id': 0.6033 (name:0.41, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'session_id': 0.6880 (name:0.58, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'new_message': 0.2226 (name:0.45, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'state_delta': 0.2950 (name:0.39, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'run_config': 0.3474 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.2446 (name:0.49, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.2523 (name:0.50, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'new_message': 0.9000 (name:1.00, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.7879 (name:0.58, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.7621 (name:0.52, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'user_id': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'session_id': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'new_message': 0.2879 (name:0.58, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'state_delta': 0.6000 (name:1.00, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 
'run_config': 0.2985 (name:0.40, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.5000, final parameter score: 0.8182\n", + "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.3000 (type match: 0.0, async match: 1.0)\n", + "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8181818181818182), 'return_type': 0.3}\n", + "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.8984\n" + ] + }, + { + "data": { + "text/plain": [ + "np.float64(0.8984415584415584)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scorer.get_similarity_score(py_run_async, ts_run_async)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "341ad037-4bdb-481a-8a45-b3e1a48f54bc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:google.adk.scope.utils.similarity:Comparing 'run' and 'run_async'\n", + "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. 
Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", + "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.6990, Early exit threshold: 0.6000\n", + "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 4 vs 5 parameters\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'state_delta': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 
'new_message': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.6879 (name:0.58, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", + "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.0000, final parameter score: 0.8889\n", + "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.7000 (type match: 1.0, async match: 0.0)\n", + "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8888888888888888), 'return_type': 0.7}\n", + "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.9024\n" + ] + }, + { + "data": { + "text/plain": [ + "np.float64(0.9023809523809523)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scorer.get_similarity_score(py_run, ts_run_async)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "783ee277-0893-46e3-ae9c-72a8df82cb8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "original_name: \"run_async\"\n", + "normalized_name: \"run_async\"\n", + "description: \"Main entry method to run 
the agent in this runner.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. The async generator will only finish iterating after event\\n compaction is complete. However, this does not block new `run_async`\\n calls for subsequent user queries, which can be started concurrently.\"\n", + "member_of: \"Runner\"\n", + "normalized_member_of: \"runner\"\n", + "type: INSTANCE_METHOD\n", + "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", + "namespace: \"runners\"\n", + "normalized_namespace: \"runners\"\n", + "parameters {\n", + " original_name: \"user_id\"\n", + " normalized_name: \"user_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The user ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"session_id\"\n", + " normalized_name: \"session_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The session ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"invocation_id\"\n", + " normalized_name: \"invocation_id\"\n", + " original_types: \"Optional[str]\"\n", + " normalized_types: STRING\n", + " description: \"The invocation ID of the session, set this to resume an interrupted invocation.\"\n", + " is_optional: true\n", + "}\n", + "parameters {\n", + " original_name: \"new_message\"\n", + " normalized_name: \"new_message\"\n", + " original_types: \"Optional[types.Content]\"\n", + " normalized_types: OBJECT\n", + " description: \"A new message to append to the session.\"\n", + " is_optional: true\n", + "}\n", + "parameters {\n", + " original_name: \"state_delta\"\n", + " normalized_name: \"state_delta\"\n", + " original_types: \"Optional[dict[str, Any]]\"\n", + " normalized_types: MAP\n", + " description: \"Optional state changes to apply to the session.\"\n", + " is_optional: true\n", + "}\n", + 
"parameters {\n", + " original_name: \"run_config\"\n", + " normalized_name: \"run_config\"\n", + " original_types: \"Optional[RunConfig]\"\n", + " normalized_types: OBJECT\n", + " description: \"The run config for the agent.\"\n", + " is_optional: true\n", + "}\n", + "original_return_types: \"AsyncGenerator[Event, None]\"\n", + "normalized_return_types: \"OBJECT\"\n", + "normalized_return_types: \"null\"\n", + "async: true" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "py_run_async" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "861b3583-49d8-4b29-9f39-144aabe82948", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "original_name: \"run\"\n", + "normalized_name: \"run\"\n", + "description: \"Runs the agent.\\n\\n NOTE:\\n This sync interface is only for local testing and convenience purpose.\\n Consider using `run_async` for production usage.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. 
The generator will only finish iterating after event\\n compaction is complete.\"\n", + "member_of: \"Runner\"\n", + "normalized_member_of: \"runner\"\n", + "type: INSTANCE_METHOD\n", + "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", + "namespace: \"runners\"\n", + "normalized_namespace: \"runners\"\n", + "parameters {\n", + " original_name: \"user_id\"\n", + " normalized_name: \"user_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The user ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"session_id\"\n", + " normalized_name: \"session_id\"\n", + " original_types: \"str\"\n", + " normalized_types: STRING\n", + " description: \"The session ID of the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"new_message\"\n", + " normalized_name: \"new_message\"\n", + " original_types: \"types.Content\"\n", + " normalized_types: OBJECT\n", + " description: \"A new message to append to the session.\"\n", + "}\n", + "parameters {\n", + " original_name: \"run_config\"\n", + " normalized_name: \"run_config\"\n", + " original_types: \"Optional[RunConfig]\"\n", + " normalized_types: OBJECT\n", + " description: \"The run config for the agent.\"\n", + " is_optional: true\n", + "}\n", + "original_return_types: \"Generator[Event, None, None]\"\n", + "normalized_return_types: \"OBJECT\"" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "py_run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "941ec275-7405-404b-a31c-d91d4d3671bd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "main", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 8063fb5..a18b27d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,9 @@ dependencies = [ "pytest", "pytest-cov", "pyyaml", + "scipy", + "numpy", + "jellyfish", ] @@ -30,6 +33,7 @@ testpaths = [ "test" ] + [tool.ruff] exclude = ["features_pb2.py"] line-length = 80 diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..27e4b81 --- /dev/null +++ b/run.sh @@ -0,0 +1,14 @@ +echo "Extracting Python features..." +./extract.sh --language py --input-repo ../adk-python ./output + +echo "Extracting TypeScript features..." +./extract.sh --language typescript --input-repo ../adk-js ./output + +echo "Generating symmetric reports..." +./match.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type symmetric + +echo "Generating directional reports.. ." +./match.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type directional + +echo "Generating raw reports..." 
+./match.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type raw \ No newline at end of file diff --git a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index 2c24b6f..16bc07b 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -4,20 +4,20 @@ import logging from pathlib import Path -from typing import List, Optional, Tuple, Set +from typing import List, Optional, Set, Tuple from tree_sitter import Node -from google.adk.scope.utils.strings import ( - normalize_name, - normalize_type_complex, -) from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name logger = logging.getLogger(__name__) class NodeProcessor: + def __init__(self): + self.normalizer = TypeNormalizer() + """Process Tree-sitter nodes into Feature objects.""" def process( @@ -310,7 +310,6 @@ def _extract_params( if p: # Filter 'self' and 'cls' if p.original_name in ("self", "cls"): - continue # Add description if available @@ -361,7 +360,7 @@ def _process_param_node(self, node: Node) -> Optional[feature_pb2.Param]: normalized_strings = [] for t in types: - normalized_strings.extend(normalize_type_complex(t)) + normalized_strings.extend(self.normalizer.normalize(t, "python")) # Unique normalized_strings = sorted(list(set(normalized_strings))) if not normalized_strings: @@ -408,7 +407,7 @@ def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: return_type_node = node.child_by_field_name("return_type") if return_type_node: raw = return_type_node.text.decode("utf-8") - normalized = normalize_type_complex(raw) + normalized = self.normalizer.normalize(raw, "python") return [raw], normalized return [], [] diff --git a/src/google/adk/scope/extractors/converter_ts.py b/src/google/adk/scope/extractors/converter_ts.py index 8dac2b5..26d956c 100644 --- 
a/src/google/adk/scope/extractors/converter_ts.py +++ b/src/google/adk/scope/extractors/converter_ts.py @@ -5,12 +5,12 @@ import logging import re from pathlib import Path -from typing import List, Optional, Tuple, Set +from typing import List, Optional, Set, Tuple from tree_sitter import Node -from google.adk.scope.utils.strings import normalize_name from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name logger = logging.getLogger(__name__) @@ -18,6 +18,9 @@ class NodeProcessor: """Process Tree-sitter nodes into Feature objects for TypeScript.""" + def __init__(self): + self.normalizer = TypeNormalizer() + def process( self, node: Node, file_path: Path, repo_root: Path ) -> Optional[feature_pb2.Feature]: @@ -90,7 +93,7 @@ def process( parameters = self._extract_params(node, param_docs) original_returns, normalized_returns = self._extract_return_types(node) - original_returns, normalized_returns = self._extract_return_types(node) + is_async = not self._is_blocking(node, original_returns) maturity = self._extract_maturity(node) @@ -360,18 +363,23 @@ def _extract_params( "optional_parameter", "rest_parameter", ): - xml_params = self._process_param_node(child) + xml_params = self._process_param_node(child, param_docs) for p in xml_params: if ( - p.original_name in param_docs + not p.description + and p.original_name in param_docs and param_docs[p.original_name] ): p.description = param_docs[p.original_name] params.append(p) return params - def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: + def _process_param_node( + self, node: Node, param_docs: dict = None + ) -> List[feature_pb2.Param]: # returns a LIST of Params to handle destructuring + if param_docs is None: + param_docs = {} # 1. Name extraction pattern_node = node.child_by_field_name("pattern") @@ -384,6 +392,9 @@ def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: # 2. 
Type extraction type_node = node.child_by_field_name("type") + # Parse type map if available + type_map = self._extract_type_map(type_node) + if pattern_node and pattern_node.type == "object_pattern": # Handle destructuring: { a, b }: { a: string, b: number } # If type is NOT an inline object literal, we might still want @@ -427,47 +438,7 @@ def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: extracted_params = [] # Parse type map if available - type_map = {} # name -> (type_str, optional_bool) - if type_node: - # Check for object_type node inside type_node - # type_annotation -> object_type - object_type_node = None - for child in type_node.children: - if child.type == "object_type": - object_type_node = child - break - - if object_type_node: - for child in object_type_node.children: - if child.type == "property_signature": - # name: property_identifier - # type: type_annotation - # optional? - prop_name_node = child.child_by_field_name("name") - prop_type_node = child.child_by_field_name("type") - - if prop_name_node: - p_name = prop_name_node.text.decode("utf-8") - p_type = "" - if prop_type_node: - p_type = prop_type_node.text.decode("utf-8") - if p_type.startswith(":"): - p_type = p_type[1:].strip() - - # Optionality check: check for '?' node or - # if literal text has ? - # child text might be "hint?:" - # or checking for optional node - p_optional = False - for sub in child.children: - if ( - sub.type == "?" - or sub.text.decode("utf-8") == "?" 
- ): - p_optional = True - break - - type_map[p_name] = (p_type, p_optional) + # type_map is already extracted above # Iterate pattern properties for child in pattern_node.children: @@ -550,6 +521,28 @@ def _process_param_node(self, node: Node) -> List[feature_pb2.Param]: if not name: return [] + # Check if type is an object literal AND we have type_map populated + # This means it's `param: { a: string }` style + is_literal_type = raw_type and raw_type.strip().startswith("{") + if is_literal_type and type_map: + # We want to explode this into multiple parameters + # defined by type_map. + # The original param name is `name` (e.g. "params") + # We look for descriptions in param_docs using "params.fieldName" + exploded_params = [] + for prop_name, (p_type, p_opt) in type_map.items(): + p = self._create_single_param( + prop_name, + [p_type], + p_opt or node.type == "optional_parameter", + ) + # Try to find description: "params.prop_name" + doc_key = f"{name}.{prop_name}" + if doc_key in param_docs: + p.description = param_docs[doc_key] + exploded_params.append(p) + return exploded_params + return [ self._create_single_param( name, @@ -563,7 +556,8 @@ def _create_single_param( ) -> feature_pb2.Param: normalized_strings = [] for t in types: - normalized_strings.extend(self._normalize_ts_type(t)) + normalized_types = self.normalizer.normalize(t, "typescript") + normalized_strings.extend(normalized_types) normalized_strings = sorted(list(set(normalized_strings))) if not normalized_strings: @@ -607,63 +601,6 @@ def _derive_name_from_type(self, type_name: str) -> str: return "obj" - def _normalize_ts_type(self, t: str) -> List[str]: - # Handle fundamental TS types - t = t.strip() - if not t: - return ["OBJECT"] - - # A | B - if "|" in t: - parts = t.split("|") - res = [] - for p in parts: - res.extend(self._normalize_ts_type(p)) - return res - - # Generics: Promise, Array - if "<" in t and t.endswith(">"): - base = t.split("<", 1)[0].strip() - # Find matching closing 
bracket or assumue last - inner = t[t.find("<") + 1 : -1].strip() - - if base == "Promise": - return self._normalize_ts_type(inner) - if base in ("Array", "ReadonlyArray"): - return ["LIST"] - if base == "Map": - return ["MAP"] - if base == "Set": - return ["SET"] - # Fallback for others - return ["OBJECT"] - - t_lower = t.lower() - if t_lower in ("string", "formattedstring", "path"): - return ["STRING"] - if t_lower in ("number", "int", "float", "integer", "double"): - return ["NUMBER"] - if t_lower in ("boolean", "bool"): - return ["BOOLEAN"] - if t_lower == "unknown": - return ["UNKNOWN"] - if t_lower in ("any", "object"): - return ["OBJECT"] - if t_lower.endswith("[]"): - return ["LIST"] - if ( - t_lower.startswith("map") - or t_lower.startswith("record") - or "{" in t - ): - return ["MAP"] - if t_lower.startswith("set"): - return ["SET"] - if t_lower == "void": - return [] - - return ["OBJECT"] - def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: return_type_node = node.child_by_field_name("return_type") if return_type_node: @@ -676,7 +613,7 @@ def _extract_return_types(self, node: Node) -> Tuple[List[str], List[str]]: # logically T for async? # Schema says "original_return_types". # normalized usually unwrap? - return [raw], self._normalize_ts_type(raw) + return [raw], self.normalizer.normalize(raw, "typescript") return [], [] def _is_blocking(self, node: Node, return_types: List[str]) -> bool: @@ -692,16 +629,6 @@ def _is_blocking(self, node: Node, return_types: List[str]) -> bool: return False return True - # Check for 'async' modifier or keyword - for child in node.children: - text = child.text.decode("utf-8") - if text == "async": - return False - # Sometimes modifiers are wrapped? 
- # But usually async is a direct child in TS grammar for - # method_definition - - return True def _extract_maturity(self, node: Node) -> feature_pb2.Feature.Maturity: decorators = self._get_decorators(node) @@ -715,3 +642,47 @@ def _extract_maturity(self, node: Node) -> feature_pb2.Feature.Maturity: if "beta" in decorators: return feature_pb2.Feature.BETA return None + + def _extract_type_map(self, type_node: Node) -> dict: + type_map = {} # name -> (type_str, optional_bool) + if type_node: + # Check for object_type node inside type_node + # type_annotation -> object_type + object_type_node = None + for child in type_node.children: + if child.type == "object_type": + object_type_node = child + break + + if object_type_node: + for child in object_type_node.children: + if child.type == "property_signature": + # name: property_identifier + # type: type_annotation + # optional? + prop_name_node = child.child_by_field_name("name") + prop_type_node = child.child_by_field_name("type") + + if prop_name_node: + p_name = prop_name_node.text.decode("utf-8") + p_type = "" + if prop_type_node: + p_type = prop_type_node.text.decode("utf-8") + if p_type.startswith(":"): + p_type = p_type[1:].strip() + + # Optionality check: check for '?' node or + # if literal text has ? + # child text might be "hint?:" + # or checking for optional node + p_optional = False + for sub in child.children: + if ( + sub.type == "?" + or sub.text.decode("utf-8") == "?" 
+ ): + p_optional = True + break + + type_map[p_name] = (p_type, p_optional) + return type_map diff --git a/src/google/adk/scope/extractors/extract.py b/src/google/adk/scope/extractors/extract.py index 01c3b38..809c738 100644 --- a/src/google/adk/scope/extractors/extract.py +++ b/src/google/adk/scope/extractors/extract.py @@ -2,13 +2,13 @@ import sys from pathlib import Path -import datetime import yaml from google.protobuf import text_format -from google.protobuf.json_format import MessageToJson, MessageToDict +from google.protobuf.json_format import MessageToDict, MessageToJson + +from google.adk.scope.extractors import extractor_py, extractor_ts from google.adk.scope.features_pb2 import FeatureRegistry from google.adk.scope.utils.args import parse_args -from google.adk.scope.extractors import extractor_py, extractor_ts logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" @@ -16,8 +16,8 @@ logger = logging.getLogger(__name__) _JSON_INDENT = 2 -_JSON_OUTPUT = True -_YAML_OUTPUT = True +_JSON_OUTPUT = False +_YAML_OUTPUT = False _PROTO_OUTPUT = True @@ -147,7 +147,10 @@ def main(): logger.info("Mode: Repo extraction: %s", input_path) - config = get_config(input_path) + # Priority: Config in CWD > Config in Input Repo + config = get_config(Path(".")) + if not config: + config = get_config(input_path) exclude_list = set(config.get(args.language, {}).get("exclude", [])) search_dir = get_search_dir(input_path, args.language) @@ -195,9 +198,8 @@ def main(): logger.error("Failed to create output directory %s: %s", output_dir, e) sys.exit(1) - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") prefix = "py" if args.language in {"python", "py"} else "ts" - base_filename = f"{prefix}_{timestamp}" + base_filename = f"{prefix}" if _JSON_OUTPUT: # 1. 
JSON Output diff --git a/src/google/adk/scope/extractors/extractor_py.py b/src/google/adk/scope/extractors/extractor_py.py index 2ac14a5..97313c1 100644 --- a/src/google/adk/scope/extractors/extractor_py.py +++ b/src/google/adk/scope/extractors/extractor_py.py @@ -2,11 +2,11 @@ import pathlib from typing import Iterator, List -from google.adk.scope.extractors.converter_py import NodeProcessor -from google.adk.scope.features_pb2 import Feature -from tree_sitter import Language, Parser, Query, QueryCursor import tree_sitter_python as tspy +from tree_sitter import Language, Parser, Query, QueryCursor +from google.adk.scope.extractors.converter_py import NodeProcessor +from google.adk.scope.features_pb2 import Feature SRC_DIR = "src" diff --git a/src/google/adk/scope/extractors/extractor_ts.py b/src/google/adk/scope/extractors/extractor_ts.py index c84b887..eca4251 100644 --- a/src/google/adk/scope/extractors/extractor_ts.py +++ b/src/google/adk/scope/extractors/extractor_ts.py @@ -2,11 +2,11 @@ import pathlib from typing import Iterator, List -from google.adk.scope.features_pb2 import Feature -from google.adk.scope.extractors.converter_ts import NodeProcessor - -from tree_sitter import Language, Parser, Query, QueryCursor import tree_sitter_typescript as tsts +from tree_sitter import Language, Parser, Query, QueryCursor + +from google.adk.scope.extractors.converter_ts import NodeProcessor +from google.adk.scope.features_pb2 import Feature # Initialize Tree-sitter try: diff --git a/src/google/adk/scope/features_pb2.py b/src/google/adk/scope/features_pb2.py index 9314da5..16b51e1 100644 --- a/src/google/adk/scope/features_pb2.py +++ b/src/google/adk/scope/features_pb2.py @@ -9,38 +9,34 @@ from google.protobuf import runtime_version as _runtime_version from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder + _runtime_version.ValidateProtobufRuntimeVersion( - _runtime_version.Domain.PUBLIC, - 6, - 33, - 4, 
- '', - 'features.proto' + _runtime_version.Domain.PUBLIC, 6, 33, 4, "", "features.proto" ) # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x66\x65\x61tures.proto\x12\x0fgoogle.adk.meta\"\xc4\x01\n\x05Param\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 \x01(\t\x12\x16\n\x0eoriginal_types\x18\x03 \x03(\t\x12\x34\n\x10normalized_types\x18\x04 \x03(\x0e\x32\x1a.google.adk.meta.ParamType\x12\x18\n\x0b\x64\x65scription\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bis_optional\x18\x06 \x01(\x08\x42\x0e\n\x0c_description\"\xdc\x04\n\x07\x46\x65\x61ture\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 \x01(\t\x12\x18\n\x0b\x64\x65scription\x18\x03 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tmember_of\x18\x04 \x01(\t\x12\x1c\n\x14normalized_member_of\x18\x05 \x01(\t\x12\x38\n\x08maturity\x18\x06 \x01(\x0e\x32!.google.adk.meta.Feature.MaturityH\x01\x88\x01\x01\x12+\n\x04type\x18\x07 \x01(\x0e\x32\x1d.google.adk.meta.Feature.Type\x12\x11\n\tfile_path\x18\x08 \x01(\t\x12\x11\n\tnamespace\x18\t \x01(\t\x12\x1c\n\x14normalized_namespace\x18\n \x01(\t\x12*\n\nparameters\x18\x0b \x03(\x0b\x32\x16.google.adk.meta.Param\x12\x1d\n\x15original_return_types\x18\x0c \x03(\t\x12\x1f\n\x17normalized_return_types\x18\r \x03(\t\x12\x12\n\x05\x61sync\x18\x0e \x01(\x08H\x02\x88\x01\x01\"6\n\x08Maturity\x12\x10\n\x0c\x45XPERIMENTAL\x10\x00\x12\x08\n\x04\x42\x45TA\x10\x01\x12\x0e\n\nDEPRECATED\x10\x02\"L\n\x04Type\x12\x0c\n\x08\x46UNCTION\x10\x00\x12\x13\n\x0fINSTANCE_METHOD\x10\x01\x12\x10\n\x0c\x43LASS_METHOD\x10\x02\x12\x0f\n\x0b\x43ONSTRUCTOR\x10\x03\x42\x0e\n\x0c_descriptionB\x0b\n\t_maturityB\x08\n\x06_async\"`\n\x0f\x46\x65\x61tureRegistry\x12\x10\n\x08language\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12*\n\x08\x66\x65\x61tures\x18\x03 
\x03(\x0b\x32\x18.google.adk.meta.Feature*e\n\tParamType\x12\n\n\x06OBJECT\x10\x00\x12\n\n\x06STRING\x10\x01\x12\n\n\x06NUMBER\x10\x02\x12\x0b\n\x07\x42OOLEAN\x10\x03\x12\x08\n\x04LIST\x10\x04\x12\x07\n\x03MAP\x10\x05\x12\x07\n\x03SET\x10\x06\x12\x0b\n\x07UNKNOWN\x10\x07\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0e\x66\x65\x61tures.proto\x12\x0fgoogle.adk.meta"\xc4\x01\n\x05Param\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 \x01(\t\x12\x16\n\x0eoriginal_types\x18\x03 \x03(\t\x12\x34\n\x10normalized_types\x18\x04 \x03(\x0e\x32\x1a.google.adk.meta.ParamType\x12\x18\n\x0b\x64\x65scription\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x0bis_optional\x18\x06 \x01(\x08\x42\x0e\n\x0c_description"\xdc\x04\n\x07\x46\x65\x61ture\x12\x15\n\roriginal_name\x18\x01 \x01(\t\x12\x17\n\x0fnormalized_name\x18\x02 \x01(\t\x12\x18\n\x0b\x64\x65scription\x18\x03 \x01(\tH\x00\x88\x01\x01\x12\x11\n\tmember_of\x18\x04 \x01(\t\x12\x1c\n\x14normalized_member_of\x18\x05 \x01(\t\x12\x38\n\x08maturity\x18\x06 \x01(\x0e\x32!.google.adk.meta.Feature.MaturityH\x01\x88\x01\x01\x12+\n\x04type\x18\x07 \x01(\x0e\x32\x1d.google.adk.meta.Feature.Type\x12\x11\n\tfile_path\x18\x08 \x01(\t\x12\x11\n\tnamespace\x18\t \x01(\t\x12\x1c\n\x14normalized_namespace\x18\n \x01(\t\x12*\n\nparameters\x18\x0b \x03(\x0b\x32\x16.google.adk.meta.Param\x12\x1d\n\x15original_return_types\x18\x0c \x03(\t\x12\x1f\n\x17normalized_return_types\x18\r \x03(\t\x12\x12\n\x05\x61sync\x18\x0e \x01(\x08H\x02\x88\x01\x01"6\n\x08Maturity\x12\x10\n\x0c\x45XPERIMENTAL\x10\x00\x12\x08\n\x04\x42\x45TA\x10\x01\x12\x0e\n\nDEPRECATED\x10\x02"L\n\x04Type\x12\x0c\n\x08\x46UNCTION\x10\x00\x12\x13\n\x0fINSTANCE_METHOD\x10\x01\x12\x10\n\x0c\x43LASS_METHOD\x10\x02\x12\x0f\n\x0b\x43ONSTRUCTOR\x10\x03\x42\x0e\n\x0c_descriptionB\x0b\n\t_maturityB\x08\n\x06_async"`\n\x0f\x46\x65\x61tureRegistry\x12\x10\n\x08language\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 
\x01(\t\x12*\n\x08\x66\x65\x61tures\x18\x03 \x03(\x0b\x32\x18.google.adk.meta.Feature*e\n\tParamType\x12\n\n\x06OBJECT\x10\x00\x12\n\n\x06STRING\x10\x01\x12\n\n\x06NUMBER\x10\x02\x12\x0b\n\x07\x42OOLEAN\x10\x03\x12\x08\n\x04LIST\x10\x04\x12\x07\n\x03MAP\x10\x05\x12\x07\n\x03SET\x10\x06\x12\x0b\n\x07UNKNOWN\x10\x07\x62\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'features_pb2', _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "features_pb2", _globals) if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals['_PARAMTYPE']._serialized_start=939 - _globals['_PARAMTYPE']._serialized_end=1040 - _globals['_PARAM']._serialized_start=36 - _globals['_PARAM']._serialized_end=232 - _globals['_FEATURE']._serialized_start=235 - _globals['_FEATURE']._serialized_end=839 - _globals['_FEATURE_MATURITY']._serialized_start=668 - _globals['_FEATURE_MATURITY']._serialized_end=722 - _globals['_FEATURE_TYPE']._serialized_start=724 - _globals['_FEATURE_TYPE']._serialized_end=800 - _globals['_FEATUREREGISTRY']._serialized_start=841 - _globals['_FEATUREREGISTRY']._serialized_end=937 + DESCRIPTOR._loaded_options = None + _globals["_PARAMTYPE"]._serialized_start = 939 + _globals["_PARAMTYPE"]._serialized_end = 1040 + _globals["_PARAM"]._serialized_start = 36 + _globals["_PARAM"]._serialized_end = 232 + _globals["_FEATURE"]._serialized_start = 235 + _globals["_FEATURE"]._serialized_end = 839 + _globals["_FEATURE_MATURITY"]._serialized_start = 668 + _globals["_FEATURE_MATURITY"]._serialized_end = 722 + _globals["_FEATURE_TYPE"]._serialized_start = 724 + _globals["_FEATURE_TYPE"]._serialized_end = 800 + _globals["_FEATUREREGISTRY"]._serialized_start = 841 + _globals["_FEATUREREGISTRY"]._serialized_end = 937 # @@protoc_insertion_point(module_scope) diff --git a/src/google/adk/scope/matcher/__init__.py b/src/google/adk/scope/matcher/__init__.py new 
file mode 100644 index 0000000..e69de29 diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py new file mode 100644 index 0000000..83f1906 --- /dev/null +++ b/src/google/adk/scope/matcher/matcher.py @@ -0,0 +1,670 @@ +import argparse +import dataclasses +import logging +import sys +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Tuple + +import numpy as np +from google.protobuf import text_format +from scipy.optimize import linear_sum_assignment + +from google.adk.scope import features_pb2 +from google.adk.scope.utils import args as adk_args +from google.adk.scope.utils import stats +from google.adk.scope.utils.similarity import SimilarityScorer + +_NEAR_MISS_THRESHOLD = 0.15 + + +@dataclasses.dataclass +class MatchResult: + master_content: str + module_files: Dict[str, str] # filename -> content + + +def format_feature(f: features_pb2.Feature) -> str: + name = f.original_name or f.normalized_name + member = f.member_of + if member and member.lower() != "null": + return f"{member}.{name}" + return name + + +def get_type_display_name(f: features_pb2.Feature) -> str: + FeatureType = features_pb2.Feature.Type + if f.type == FeatureType.CONSTRUCTOR: + return "constructor" + elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): + return "function" + elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: + return "method" + else: + return "unknown" + + +def get_type_priority(f: features_pb2.Feature) -> int: + """Returns priority: constructor < function < method < unknown.""" + type_name = get_type_display_name(f) + priorities = { + "constructor": 0, + "function": 1, + "method": 2, + "unknown": 3, + } + return priorities.get(type_name, 99) + + +def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: + """Reads a FeatureRegistry from a text proto file.""" + registry = features_pb2.FeatureRegistry() + with open(file_path, "rb") as f: + 
text_format.Parse(f.read(), registry) + return registry + + +def match_features( + base_features: List[features_pb2.Feature], + target_features: List[features_pb2.Feature], + alpha: float, +) -> List[Tuple[features_pb2.Feature, features_pb2.Feature, float]]: + """Matches features between two lists using Hungarian algorithm.""" + if not base_features or not target_features: + return [] + + scorer = SimilarityScorer(alpha=alpha) + matches = [] + + # Build Cost Matrix (Rows=Base, Cols=Target) + n_base = len(base_features) + n_target = len(target_features) + similarity_matrix = np.zeros((n_base, n_target)) + + for i, f1 in enumerate(base_features): + for j, f2 in enumerate(target_features): + similarity_matrix[i, j] = scorer.get_similarity_score(f1, f2) + + # Run Hungarian Algorithm (Global Optimization) + row_ind, col_ind = linear_sum_assignment(similarity_matrix, maximize=True) + + matched_base_indices = set() + matched_target_indices = set() + + # Filter Optimal Assignments by Alpha Threshold + for r, c in zip(row_ind, col_ind): + score = similarity_matrix[r, c] + if score > alpha: + matches.append((base_features[r], target_features[c], score)) + matched_base_indices.add(r) + matched_target_indices.add(c) + + # Update the input lists in-place (Remove matched items) + base_features[:] = [ + f for i, f in enumerate(base_features) if i not in matched_base_indices + ] + target_features[:] = [ + f + for i, f in enumerate(target_features) + if i not in matched_target_indices + ] + + return matches + + +def get_language_code(language_name: str) -> str: + """Returns a short code for the language.""" + name = language_name.upper() + if name == "PYTHON": + return "py" + elif name == "TYPESCRIPT": + return "ts" + elif name == "JAVA": + return "java" + elif name == "GOLANG": + return "go" + else: + return name[:2].lower() + + +def _group_features_by_module( + registry: features_pb2.FeatureRegistry, +) -> Dict[str, List[features_pb2.Feature]]: + """Groups features by their 
module.""" + features = defaultdict(list) + for f in registry.features: + key = f.normalized_namespace or f.namespace or "Unknown Module" + features[key].append(f) + return features + + +def _fuzzy_match_namespaces( + features_base: Dict[str, List[features_pb2.Feature]], + features_target: Dict[str, List[features_pb2.Feature]], +) -> None: + """Remaps target namespaces to base namespaces using fuzzy matching.""" + from jellyfish import jaro_winkler_similarity + + base_namespaces = sorted(list(features_base.keys())) + remapped_features = defaultdict(list, {k: [] for k in features_base}) + + for t_ns, features in features_target.items(): + if t_ns in base_namespaces: + remapped_features[t_ns].extend(features) + continue + + if not base_namespaces: + # No base to match against, so keep original target namespace + remapped_features[t_ns].extend(features) + continue + + best_match, best_score = max( + ( + (b_ns, jaro_winkler_similarity(t_ns, b_ns)) + for b_ns in base_namespaces + ), + key=lambda item: item[1], + default=(None, 0.0), + ) + + if best_score > 0.8 and best_match: + remapped_features[best_match].extend(features) + + features_target.clear() + features_target.update(remapped_features) + + +def match_registries( + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + alpha: float, + report_type: str = "symmetric", +) -> MatchResult: + """Matches features and generates a master report + module sub-reports.""" + + features_base = _group_features_by_module(base_registry) + features_target = _group_features_by_module(target_registry) + _fuzzy_match_namespaces(features_base, features_target) + + if report_type == "directional": + all_modules = sorted(features_base.keys()) + else: + all_modules = sorted( + set(features_base.keys()) | set(features_target.keys()) + ) + + if report_type == "raw": + return _generate_raw_report( + base_registry, + target_registry, + all_modules, + features_base, + features_target, + alpha, + ) + + 
return _generate_markdown_report( + base_registry, + target_registry, + all_modules, + features_base, + features_target, + alpha, + report_type, + ) + + +def _generate_raw_report( + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + all_modules: List[str], + features_base: Dict[str, List[features_pb2.Feature]], + features_target: Dict[str, List[features_pb2.Feature]], + alpha: float, +) -> MatchResult: + """Generates a raw CSV report.""" + base_code = get_language_code(base_registry.language) + target_code = get_language_code(target_registry.language) + csv_header = ( + f"{base_code}_namespace,{base_code}_member_of,{base_code}_name," + f"{target_code}_namespace,{target_code}_member_of,{target_code}_name," + "type,score" + ) + csv_lines = [csv_header] + + def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: + ns = f.namespace or "" + if not ns and f.normalized_namespace: + ns = f.normalized_namespace + + mem = f.member_of or "" + if not mem and f.normalized_member_of: + mem = f.normalized_member_of + if mem.lower() == "null": + mem = "" + + name = f.original_name or f.normalized_name or "" + return ns, mem, name + + def escape_csv(s): + if s is None: + return "" + if "," in s or '"' in s or "\n" in s: + return '"{}"'.format(s.replace('"', '""')) + return s + + for module in all_modules: + base_list = features_base.get(module, []) + target_list = features_target.get(module, []) + + solid_matches = match_features(base_list, target_list, alpha) + beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) + potential_matches = match_features(base_list, target_list, beta) + + unmatched_base = base_list + unmatched_target = target_list + + for f_base, f_target, score in solid_matches: + b_ns, b_mem, b_name = get_feature_cols(f_base) + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_base) + csv_lines.append( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + 
f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},{score:.4f}" + ) + + for f_base, f_target, score in potential_matches: + b_ns, b_mem, b_name = get_feature_cols(f_base) + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_base) + csv_lines.append( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f"{escape_csv(t_ns)},{escape_csv(t_mem)},{escape_csv(t_name)}," + f"{escape_csv(f_type)},{score:.4f}" + ) + + for f_base in unmatched_base: + b_ns, b_mem, b_name = get_feature_cols(f_base) + f_type = get_type_display_name(f_base) + csv_lines.append( + f"{escape_csv(b_ns)},{escape_csv(b_mem)},{escape_csv(b_name)}," + f",,,{escape_csv(f_type)},0.0000" + ) + + for f_target in unmatched_target: + t_ns, t_mem, t_name = get_feature_cols(f_target) + f_type = get_type_display_name(f_target) + csv_lines.append( + f",,,{escape_csv(t_ns)},{escape_csv(t_mem)}," + f"{escape_csv(t_name)},{escape_csv(f_type)},0.0000" + ) + + return MatchResult(master_content="\n".join(csv_lines), module_files={}) + + +def _generate_markdown_report( + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + all_modules: List[str], + features_base: Dict[str, List[features_pb2.Feature]], + features_target: Dict[str, List[features_pb2.Feature]], + alpha: float, + report_type: str, +) -> MatchResult: + """Generates a markdown report.""" + from datetime import datetime + + master_lines = [] + title_suffix = "Symmetric" if report_type == "symmetric" else "Directional" + master_lines.extend( + [ + f"# Feature Matching Report: {title_suffix}", + f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + f"**Base:** {base_registry.language} ({base_registry.version})", + f"**Target:** {target_registry.language}" + f" ({target_registry.version})", + ] + ) + + global_score_idx = len(master_lines) + master_lines.append("GLOBAL_SCORE_PLACEHOLDER") + master_lines.append("") + + header = 
"| Module | Features (Base) | Score | Status | Details |" + divider = "|---|---|---|---|---|" + if report_type == "symmetric": + header = "| ADK | Module | Features (Base) | Score | Status | Details |" + divider = "|---|---|---|---|---|---|" + master_lines.extend(["## Module Summary", header, divider]) + + module_files = {} + module_rows = [] + total_solid_matches = 0 + + base_code = get_language_code(base_registry.language) + target_code = get_language_code(target_registry.language) + + for module in all_modules: + mod_base_list = features_base.get(module, []) + mod_target_list = features_target.get(module, []) + + results = _process_module( + module, + mod_base_list, + mod_target_list, + alpha, + report_type, + base_code, + target_code, + ) + total_solid_matches += results["solid_matches_count"] + module_rows.append((results["score"], results["row_content"])) + if results.get("module_filename"): + module_files[results["module_filename"]] = results["module_content"] + + module_rows.sort(key=lambda x: x[0], reverse=True) + master_lines.extend([row for _, row in module_rows]) + + total_base_features = len(base_registry.features) + total_target_features = len(target_registry.features) + if report_type == "symmetric": + union_size = ( + total_base_features + total_target_features - total_solid_matches + ) + parity_score = ( + total_solid_matches / union_size if union_size > 0 else 1.0 + ) + global_stats = ( + f"**Jaccard Index:** {parity_score:.2%}\n\n" + "> The Jaccard Index measures the similarity between the " + "two feature sets. A score of 100% indicates that both languages " + "have identical features." 
+ ) + else: + precision = stats.calculate_precision( + total_solid_matches, total_target_features + ) + recall = stats.calculate_recall( + total_solid_matches, total_base_features + ) + parity_score = stats.calculate_f1(precision, recall) + + global_stats = ( + "| Metric | Score |\n" + "|---|---|\n" + f"| **Precision** | {precision:.2%} |\n" + f"| **Recall** | {recall:.2%} |\n" + f"| **F1 Score** | {parity_score:.2%} |\n\n" + "> **Precision**: Of all features in the target, how many are " + "correct matches to the base? (High score = low number of extra " + "features in target)\n\n" + "> **Recall**: Of all features in the base, how many were found in " + "the target? (High score = low number of missing features in " + "target)\n\n" + "> **F1 Score**: A weighted average of Precision and Recall, " + "providing a single measure of how well the target feature set " + "matches the base." + ) + + master_lines[global_score_idx] = global_stats + + return MatchResult( + master_content="\n".join(master_lines).strip(), + module_files=module_files, + ) + + +def _process_module( + module: str, + base_list: List[features_pb2.Feature], + target_list: List[features_pb2.Feature], + alpha: float, + report_type: str, + base_lang_code: str, + target_lang_code: str, +) -> Dict: + """Analyzes a single module and generates its report content.""" + mod_base_count = len(base_list) + mod_target_count = len(target_list) + + solid_matches = match_features(base_list, target_list, alpha) + mod_solid_count = len(solid_matches) + + beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) + potential_matches = match_features(base_list, target_list, beta) + + unmatched_base = base_list + unmatched_target = target_list + + if report_type == "symmetric": + union_size = mod_base_count + mod_target_count - mod_solid_count + mod_score = mod_solid_count / union_size if union_size > 0 else 1.0 + else: # directional + precision = stats.calculate_precision(mod_solid_count, mod_target_count) + recall = 
stats.calculate_recall(mod_solid_count, mod_base_count) + mod_score = stats.calculate_f1(precision, recall) + + status_icon = ( + "✅" if mod_score == 1.0 else "⚠️" if mod_score >= 0.8 else "❌" + ) + module_safe_name = module.replace(".", "_") + module_filename = f"{module_safe_name}.md" + + details_link = f"[View Details]({{modules_dir}}/{module_filename})" + if report_type == "symmetric": + adk_parts = [] + if mod_base_count > 0: + adk_parts.append(base_lang_code) + if mod_target_count > 0: + adk_parts.append(target_lang_code) + adk_value = ", ".join(adk_parts) + row_content = ( + f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} |" + f" {status_icon} | {details_link} |" + ) + else: + row_content = ( + f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon}" + f" | {details_link} |" + ) + + # Module Content + mod_lines = [ + f"# Module: `{module}`", + "[⬅️ Back to Master Report](../{master_report})", + "", + f"**Score:** {mod_score:.2%} ({status_icon})", + ] + if report_type == "directional": + mod_lines.extend( + [ + "| Metric | Score |", + "|---|---|", + f"| **Precision** | {precision:.2%} |", + f"| **Recall** | {recall:.2%} |", + ] + ) + + mod_total_features = ( + (mod_base_count + mod_target_count - mod_solid_count) + if report_type == "symmetric" + else mod_base_count + ) + mod_lines.extend(["", f"**Features:** {mod_total_features}", ""]) + + solid_matches.sort( + key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) + ) + potential_matches.sort( + key=lambda x: (get_type_priority(x[0]), x[0].normalized_name) + ) + + if solid_matches: + mod_lines.append( + f"### ✅ {'Solid' if report_type == 'symmetric' else 'Matched'}" + " Features" + ) + mod_lines.extend( + [ + "| Type | Base Feature | Target Feature | Similarity Score |", + "|---|---|---|---|", + ] + ) + mod_lines.extend( + [ + f"| {get_type_display_name(f_base)} |" + f" `{format_feature(f_base)}`" + f" | `{format_feature(f_target)}` | {score:.2f} |" + for f_base, 
f_target, score in solid_matches + ] + ) + mod_lines.append("") + + if potential_matches: + mod_lines.extend( + [ + "### ⚠️ Potential Matches", + "| Type | Base Feature | Closest Target Candidate" + " | Similarity |", + "|---|---|---|---|", + ] + ) + mod_lines.extend( + [ + f"| {get_type_display_name(f_base)} |" + f" `{format_feature(f_base)}`" + f" | `{format_feature(f_target)}` | {score:.2f} |" + for f_base, f_target, score in potential_matches + ] + ) + mod_lines.append("") + + if report_type == "symmetric" and (unmatched_base or unmatched_target): + mod_lines.extend( + [ + "### ❌ Unmatched Features", + "| Missing Feature | Missing In |", + "|---|---|", + ] + ) + mod_lines.extend( + [f"| `{format_feature(f)}` | Target |" for f in unmatched_base] + ) + mod_lines.extend( + [f"| `{format_feature(f)}` | Base |" for f in unmatched_target] + ) + mod_lines.append("") + elif report_type == "directional" and unmatched_base: + mod_lines.extend( + ["### ❌ Missing in Target", "| Missing Feature |", "|---|"] + ) + mod_lines.extend([f"| `{format_feature(f)}` |" for f in unmatched_base]) + mod_lines.append("") + + return { + "solid_matches_count": mod_solid_count, + "score": mod_score, + "row_content": row_content, + "module_filename": module_filename, + "module_content": "\n".join(mod_lines).strip(), + } + + +def main(): + parser = argparse.ArgumentParser( + description="Match ADK features between two languages." 
+ ) + parser.add_argument( + "--base", + required=True, + help="Path to the base FeatureRegistry .txtpb file.", + ) + parser.add_argument( + "--target", + required=True, + help="Path to the target FeatureRegistry .txtpb file.", + ) + parser.add_argument( + "--output", + required=True, + help="Path to save the Markdown report.", + ) + parser.add_argument( + "--alpha", + type=float, + default=0.8, + help="Similarity threshold (0.0 to 1.0) defaults to 0.8.", + ) + parser.add_argument( + "--report-type", + choices=["symmetric", "directional", "raw"], + default="symmetric", + help="Type of gap report to generate (symmetric, directional, or raw).", + ) + adk_args.add_verbose_argument(parser) + args = parser.parse_args() + adk_args.configure_logging(args) + + try: + base_registry = read_feature_registry(args.base) + target_registry = read_feature_registry(args.target) + except Exception as e: + logging.error(f"Error reading feature registries: {e}") + sys.exit(1) + + result = match_registries( + base_registry, target_registry, args.alpha, args.report_type + ) + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + if args.report_type == "raw": + # Raw report is a single file, no modules directory needed + try: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(result.master_content) + logging.info( + f"Successfully wrote raw match report to {output_path}" + ) + except Exception as e: + logging.error(f"Error writing raw report to {output_path}: {e}") + sys.exit(1) + return + + # Create module directory + if result.module_files: + modules_dir_name = f"{output_path.stem}_modules" + modules_dir = output_path.parent / modules_dir_name + modules_dir.mkdir(parents=True, exist_ok=True) + + # Write module files + for filename, content in result.module_files.items(): + # Replace placeholder for master report link + # The link is relative from module dir to master report + # So name is enough. 
+ final_content = content.replace("{master_report}", output_path.name) + (modules_dir / filename).write_text(final_content) + + # Replace placeholder in Master Report + # We assume master report is in parent of modules_dir + # modules_dir relative to master report is just the dir name + master_report = result.master_content.replace( + "{modules_dir}", modules_dir_name + ) + else: + master_report = result.master_content.replace("{modules_dir}", ".") + + try: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(master_report) + logging.info(f"Successfully wrote match report to {output_path}") + except Exception as e: + logging.error(f"Error writing report to {output_path}: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/google/adk/scope/utils/args.py b/src/google/adk/scope/utils/args.py index d544e0d..745c99e 100644 --- a/src/google/adk/scope/utils/args.py +++ b/src/google/adk/scope/utils/args.py @@ -3,9 +3,23 @@ """ import argparse +import logging from pathlib import Path +def add_verbose_argument(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "-v", "--verbose", help="show all logs", action="store_true" + ) + + +def configure_logging(args: argparse.Namespace) -> None: + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + def parse_args() -> argparse.Namespace: """Parse command line arguments for the Python extractor. diff --git a/src/google/adk/scope/utils/normalizer.py b/src/google/adk/scope/utils/normalizer.py new file mode 100644 index 0000000..ab974a1 --- /dev/null +++ b/src/google/adk/scope/utils/normalizer.py @@ -0,0 +1,197 @@ +""" +Unified type normalization for ADK Scope. +""" + +import re +from typing import List + + +def normalize_name(name: str) -> str: + """Convert name to snake_case (e.g. 
def normalize_name(name: str) -> str:
    """Convert name to snake_case (e.g. CamelCase -> camel_case).

    Hyphens are replaced with underscores first; underscores are then
    inserted at lower-to-upper word boundaries and the result is lower-cased.
    """
    name = name.replace("-", "_")
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


class TypeNormalizer:
    """A language-aware type normalizer.

    Raw type strings are mapped onto a small canonical vocabulary:
    ``LIST``, ``SET``, ``MAP``, ``NUMBER``, ``STRING``, ``BOOLEAN``,
    ``OBJECT``, ``UNKNOWN`` and ``null``.
    """

    def normalize(self, type_name: str, language: str) -> List[str]:
        """Normalize a type name for a given language.

        Args:
            type_name: The raw type string as written in the source.
            language: ``"python"`` or ``"typescript"``; any other value uses
                the simple single-name lookup.

        Returns:
            A list of canonical type names (may be empty, e.g. TS ``void``).
        """
        if language == "python":
            return self._normalize_py_type(type_name)
        if language == "typescript":
            return self._normalize_ts_type(type_name)
        # Unknown languages: known simple names are mapped, everything else
        # becomes OBJECT.
        return [self._simple_normalize(type_name)]

    def _normalize_py_type(self, type_name: str) -> list[str]:
        """Map a raw Python type to canonical types, handling generics.

        ``Union[...]``, ``tuple[...]`` and top-level ``A | B`` unions are
        flattened into their de-duplicated members, ``Optional[T]`` becomes
        the members of ``T`` plus ``"null"``, and any other generic is reduced
        to its base name.
        """
        type_name = type_name.strip()
        if not type_name:
            return ["OBJECT"]

        # Handle PEP 604 unions (A | B). Only pipes outside brackets split,
        # so Optional[int | str] is left for the branches below.
        if "|" in type_name:
            members = [
                m
                for m in self._split_top_level(type_name, "|", "[(", "])")
                if m
            ]
            if len(members) > 1:
                return self._normalize_py_members(members)

        # Handle Union[A, B]
        if type_name.startswith("Union[") and type_name.endswith("]"):
            return self._normalize_py_members(
                self._split_generics(type_name[6:-1])
            )

        # Handle Optional[T] -> T | None
        if type_name.startswith("Optional[") and type_name.endswith("]"):
            result = self._normalize_py_type(type_name[9:-1])
            if "null" not in result:
                result.append("null")
            return result

        # Handle tuple[A, B] / Tuple[A, B] -> [A, B]
        if type_name.lower().startswith("tuple[") and type_name.endswith("]"):
            return self._normalize_py_members(
                self._split_generics(type_name[6:-1])
            )

        # Handle other generics like List[int] -> LIST
        if "[" in type_name and type_name.endswith("]"):
            base = type_name.split("[", 1)[0]
            return [self._simple_normalize(base)]

        return [self._simple_normalize(type_name)]

    def _normalize_py_members(self, members: list[str]) -> list[str]:
        """Normalize each Python member type and merge without duplicates."""
        merged = []
        for member in members:
            merged.extend(self._normalize_py_type(member))
        return self._unique(merged)

    def _normalize_ts_type(self, t: str) -> List[str]:
        """Map a raw TypeScript type to canonical types.

        Top-level unions are flattened and de-duplicated, ``Promise<T>`` is
        unwrapped to ``T``, and ``void`` yields an empty list.
        """
        # Handle fundamental TS types
        t = t.strip()
        if not t:
            return ["OBJECT"]

        # A | B -- split only on pipes that are not nested inside <>, (), []
        # or {}, so that Promise<A | B> reaches the generic branch intact.
        if "|" in t:
            members = [
                m for m in self._split_top_level(t, "|", "<([{", ">)]}") if m
            ]
            if not members:
                return ["OBJECT"]
            # members[0] != t covers a leading pipe such as "| A".
            if len(members) > 1 or members[0] != t:
                res = []
                for m in members:
                    res.extend(self._normalize_ts_type(m))
                return self._unique(res)

        # Generics: Promise<T>, Array<T>, ...
        if "<" in t and t.endswith(">"):
            base = t.split("<", 1)[0].strip()
            # Inner text runs from the first "<" to the final ">".
            inner = t[t.find("<") + 1 : -1].strip()

            if base == "Promise":
                return self._normalize_ts_type(inner)
            if base in (
                "Array",
                "ReadonlyArray",
                "Generator",
                "AsyncGenerator",
                "Iterable",
                "Iterator",
                "AsyncIterable",
                "AsyncIterator",
            ):
                return ["LIST"]
            if base == "Map":
                return ["MAP"]
            if base == "Set":
                return ["SET"]
            # Fallback for others
            return ["OBJECT"]

        t_lower = t.lower()
        if t_lower in ("string", "formattedstring", "path"):
            return ["STRING"]
        if t_lower in ("number", "int", "float", "integer", "double"):
            return ["NUMBER"]
        if t_lower in ("boolean", "bool"):
            return ["BOOLEAN"]
        if t_lower == "unknown":
            return ["UNKNOWN"]
        if t_lower in ("any", "object"):
            return ["OBJECT"]
        if t_lower.endswith("[]"):
            return ["LIST"]
        if t_lower.startswith(("map", "record")) or "{" in t:
            return ["MAP"]
        if t_lower.startswith("set"):
            return ["SET"]
        if t_lower == "void":
            return []

        return ["OBJECT"]

    def _simple_normalize(self, t: str) -> str:
        """Map a single bare type name to its canonical name.

        Matching is case-insensitive; unrecognised (or empty) names map to
        ``"OBJECT"`` and ``none`` maps to ``"null"``.
        """
        t = t.lower().strip()
        if t == "none":
            return "null"
        if t in (
            "list",
            "array",
            "slice",
            "vector",
            "generator",
            "asyncgenerator",
            "iterable",
            "iterator",
            "asynciterable",
            "asynciterator",
        ):
            return "LIST"
        if t in ("set",):
            return "SET"
        if t in ("map", "dictionary", "dict", "record", "hash"):
            return "MAP"
        if t in ("int", "integer", "long", "int64", "float", "double"):
            return "NUMBER"
        if t in ("str", "string"):
            return "STRING"
        if t in ("bool", "boolean"):
            return "BOOLEAN"
        # "any", the empty string and every unknown name share the fallback.
        return "OBJECT"

    def _split_generics(self, s: str) -> list[str]:
        """Split string by comma, ignoring nested brackets."""
        return self._split_top_level(s, ",", "[", "]")

    def _split_top_level(
        self, s: str, sep: str, openers: str, closers: str
    ) -> list[str]:
        """Split ``s`` on ``sep`` wherever the bracket depth is zero.

        Args:
            s: Text to split.
            sep: Single-character separator.
            openers: Characters that increase the nesting depth.
            closers: Characters that decrease the nesting depth.

        Returns:
            The stripped pieces. A trailing empty piece is omitted; empty
            pieces between two separators are kept.
        """
        parts = []
        depth = 0
        current = []
        prev = ""
        for char in s:
            if char in openers:
                depth += 1
                current.append(char)
            elif char in closers and not (char == ">" and prev == "="):
                # The ">" of a TS arrow ("=>") is not a closing bracket.
                depth -= 1
                current.append(char)
            elif char == sep and depth == 0:
                parts.append("".join(current).strip())
                current = []
            else:
                current.append(char)
            prev = char
        if current:
            parts.append("".join(current).strip())
        return parts

    def _unique(self, lst: list[str]) -> list[str]:
        """Return ``lst`` without duplicates, keeping first-seen order."""
        seen = set()
        out = []
        for x in lst:
            if x not in seen:
                seen.add(x)
                out.append(x)
        return out
DEFAULT_SIMILARITY_WEIGHTS = {
    "name": 0.30,
    "member_of": 0.30,
    "namespace": 0.15,
    "parameters": 0.15,
    "return_type": 0.10,
}


class SimilarityScorer:
    """Calculates a similarity score between two features.

    The score is a weighted sum of five components: ``name``, ``member_of``,
    ``namespace``, ``parameters`` and ``return_type``.
    """

    def __init__(
        self, weights: Optional[dict[str, float]] = None, alpha: float = 0.8
    ):
        """Store the component weights and the similarity threshold.

        Args:
            weights: Mapping containing the keys ``name``, ``member_of``,
                ``namespace``, ``parameters`` and ``return_type``. When
                omitted or empty, ``DEFAULT_SIMILARITY_WEIGHTS`` is used.
            alpha: Similarity threshold; it scales the early-exit cut-off
                applied in ``get_similarity_score``.
        """
        self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS
        logger.debug(
            f"Initializing SimilarityScorer with alpha={alpha} and "
            f"weights={self.weights}"
        )
        assert "name" in self.weights
        assert "member_of" in self.weights
        assert "namespace" in self.weights
        assert "parameters" in self.weights
        assert "return_type" in self.weights
        # Kept so the per-comparison early-exit check (which uses the
        # dynamically adjusted weights) honours the caller's threshold.
        self._alpha = alpha
        # Cut-off for the unadjusted weights.
        self._early_exit_threshold = alpha * (
            self.weights["name"]
            + self.weights["member_of"]
            + self.weights["namespace"]
        )

    def _fuzzy_type_match(self, types1: list, types2: list) -> float:
        """Calculates a fuzzy similarity score between two lists of types.

        Returns 1.0 when both lists are empty or contain the same set of
        types, 0.0 when exactly one is empty, and otherwise the best score
        found over all cross pairs (1.0 equal, 0.4 MAP vs OBJECT/ANY, 0.3 when
        either side is UNKNOWN/ANY, 0.2 when either side is OBJECT, else 0.0).
        """

        def _to_str_set(type_list):
            res = set()
            for t in type_list:
                if isinstance(t, int):
                    # It's a ParamType enum
                    try:
                        res.add(features_pb.ParamType.Name(t))
                    except ValueError:
                        res.add(str(t))
                else:
                    res.add(str(t).upper())
            return res

        set1 = _to_str_set(types1)
        set2 = _to_str_set(types2)

        if not set1 and not set2:
            return 1.0
        if not set1 or not set2:
            return 0.0

        if set1 == set2:
            return 1.0

        # Check the best match between any pair of types
        best_score = 0.0
        for t1 in set1:
            for t2 in set2:
                if t1 == t2:
                    score = 1.0
                elif {t1, t2} == {"MAP", "OBJECT"} or {t1, t2} == {
                    "MAP",
                    "ANY",
                }:
                    score = 0.4
                elif t1 in ("UNKNOWN", "ANY") or t2 in ("UNKNOWN", "ANY"):
                    score = 0.3
                elif t1 == "OBJECT" or t2 == "OBJECT":
                    score = 0.2
                else:
                    score = 0.0

                if score > best_score:
                    best_score = score

        return best_score

    def _calculate_param_similarity(
        self, param1: features_pb.Param, param2: features_pb.Param
    ) -> float:
        """Calculates the similarity score between two individual parameters.

        The result is ``0.5 * name + 0.4 * type + 0.1 * optional``, where
        name is the Jaro-Winkler similarity of the normalized names, type is
        the fuzzy type match and optional is 1.0 when ``is_optional`` agrees.
        """
        s_p_name = jaro_winkler_similarity(
            param1.normalized_name, param2.normalized_name
        )
        s_p_type = self._fuzzy_type_match(
            list(param1.normalized_types), list(param2.normalized_types)
        )
        s_p_opt = 1.0 if param1.is_optional == param2.is_optional else 0.0

        # Weights for parameter components
        score = (0.5 * s_p_name) + (0.4 * s_p_type) + (0.1 * s_p_opt)
        logger.debug(
            f"Param '{param1.normalized_name}' vs '{param2.normalized_name}': "
            f"{score:.4f} (name:{s_p_name:.2f}, type:{s_p_type:.2f}, "
            f"opt:{s_p_opt:.2f})"
        )
        return score

    def _calculate_parameters_score(
        self, params1: list[features_pb.Param], params2: list[features_pb.Param]
    ) -> float:
        """Calculates aggregated similarity for two lists of parameters.

        Returns 1.0 when both lists are empty and 0.0 when exactly one is.
        Otherwise parameters are paired one-to-one so that the summed
        pairwise similarity is maximal, and the result is
        ``2 * matched_total / (len(params1) + len(params2))``.
        """
        logger.debug(
            f"Calculating parameter score for {len(params1)} "
            f"vs {len(params2)} parameters"
        )
        if not params1 and not params2:
            logger.debug("Both parameter lists empty. Returning 1.0")
            return 1.0
        if not params1 or not params2:
            logger.debug(
                "One parameter list empty while other is not. Returning 0.0"
            )
            return 0.0

        similarity_matrix = np.zeros((len(params1), len(params2)))
        for i, p1 in enumerate(params1):
            for j, p2 in enumerate(params2):
                similarity_matrix[i, j] = self._calculate_param_similarity(
                    p1, p2
                )

        row_ind, col_ind = linear_sum_assignment(
            similarity_matrix, maximize=True
        )
        total_match_score = similarity_matrix[row_ind, col_ind].sum()
        # Both lists are non-empty here, so the denominator is at least 2.
        total_params = len(params1) + len(params2)

        score = (2 * total_match_score) / total_params
        logger.debug(
            f"Matrix matched total score: {total_match_score:.4f}, "
            f"final parameter score: {score:.4f}"
        )
        return score

    def _calculate_return_type_score(
        self, f1: features_pb.Feature, f2: features_pb.Feature
    ) -> float:
        """Calculates the similarity score for the return types.

        The result is ``0.7 * type_match + 0.3 * async_match``.
        """
        s_type_match = self._fuzzy_type_match(
            list(f1.normalized_return_types), list(f2.normalized_return_types)
        )
        # "async" is a reserved word, so the field is read via getattr.
        s_async_match = (
            1.0 if getattr(f1, "async") == getattr(f2, "async") else 0.0
        )
        score = (0.7 * s_type_match) + (0.3 * s_async_match)
        logger.debug(
            f"Return type score: {score:.4f} (type match: "
            f"{s_type_match}, async match: {s_async_match})"
        )
        return score

    def get_similarity_score(
        self, feature1: features_pb.Feature, feature2: features_pb.Feature
    ) -> float:
        """Computes the overall similarity score between two features.

        Returns 0.0 for incompatible feature types. When the weighted
        name/member_of/namespace score falls below ``alpha`` times the sum of
        those three weights, that partial score is returned without looking
        at parameters or return types. Otherwise the full weighted sum over
        all five components is returned.
        """
        logger.debug(
            f"Comparing '{feature1.normalized_name}' and "
            f"'{feature2.normalized_name}'"
        )
        # 1. Type Compatibility and Dynamic Weights
        t1, t2 = feature1.type, feature2.type
        current_weights = self.weights.copy()

        FeatureType = features_pb.Feature.Type
        if t1 == FeatureType.CONSTRUCTOR and t2 == FeatureType.CONSTRUCTOR:
            # The whole name weight is moved onto member_of.
            current_weights["member_of"] += current_weights["name"]
            current_weights["name"] = 0.0
            logger.debug(
                "Both CONSTRUCTOR. " f"Adjusted weights: {current_weights}"
            )
        elif t1 in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD) and t2 in (
            FeatureType.FUNCTION,
            FeatureType.CLASS_METHOD,
        ):
            # Half of the member_of weight is moved onto name.
            current_weights["member_of"] /= 2.0
            current_weights["name"] += current_weights["member_of"]
            logger.debug(
                "Both FUNCTION/CLASS_METHOD. "
                f"Adjusted weights: {current_weights}"
            )
        elif (
            t1 == FeatureType.INSTANCE_METHOD
            and t2 == FeatureType.INSTANCE_METHOD
        ):
            # Keep default weights
            logger.debug(
                "Both INSTANCE_METHOD. "
                f"Using default weights: {current_weights}"
            )
        else:
            logger.debug(f"Incompatible types: {t1} vs {t2}. Returning 0.0")
            return 0.0  # Fast out for incompatible types

        # 2. Similarity Calculations
        scores = {
            "name": jaro_winkler_similarity(
                feature1.normalized_name, feature2.normalized_name
            ),
            "member_of": jaro_winkler_similarity(
                feature1.normalized_member_of, feature2.normalized_member_of
            ),
            "namespace": jaro_winkler_similarity(
                feature1.normalized_namespace, feature2.normalized_namespace
            ),
        }
        logger.debug(f"Preliminary scores: {scores}")

        # 3. Early Exit Check (using dynamic weights)
        preliminary_score = (
            scores["name"] * current_weights["name"]
            + scores["member_of"] * current_weights["member_of"]
            + scores["namespace"] * current_weights["namespace"]
        )

        # Scale by the configured alpha rather than a fixed 0.8 so that a
        # non-default threshold also governs the early exit.
        early_exit_threshold = self._alpha * (
            current_weights["name"]
            + current_weights["member_of"]
            + current_weights["namespace"]
        )
        logger.debug(
            f"Preliminary score: {preliminary_score:.4f}, "
            f"Early exit threshold: {early_exit_threshold:.4f}"
        )

        if preliminary_score < early_exit_threshold:
            logger.debug(
                f"Early exit triggered ({preliminary_score:.4f} < "
                f"{early_exit_threshold:.4f})"
            )
            return preliminary_score

        scores["parameters"] = self._calculate_parameters_score(
            feature1.parameters, feature2.parameters
        )
        scores["return_type"] = self._calculate_return_type_score(
            feature1, feature2
        )

        final_score = sum(
            scores[key] * current_weights[key] for key in current_weights
        )
        logger.debug(f"Final scores including params & return: {scores}")
        logger.debug(f"Final weighted similarity score: {final_score:.4f}")
        return final_score
mode 100644 index e9290ea..0000000 --- a/src/google/adk/scope/utils/strings.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -String utility functions for ADK Scope. -""" - -import re - - -def normalize_name(name: str) -> str: - """Convert name to snake_case (e.g. CamelCase -> camel_case).""" - name = name.replace("-", "_") - s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) - return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() - - -def normalize_type(type_name: str) -> str: - """Legacy wrapper for simple normalization (returns single string).""" - # For backward compatibility if needed, else we can migrate all callers - # But signature in Features expects lists now for normalized_types if we - # change converter - # Let's keep this as simple for now or deprecate? - # The new converter will use normalize_type_complex which returns List[str] - # But wait, existing calls might expect single str? - # Actually, converter.py calls: [normalize_type(t) for t in types] - # We should change converter to use new function. 
- return _simple_normalize(type_name) - - -def normalize_type_complex(type_name: str) -> list[str]: - """Map raw type to canonical types, handling generics.""" - type_name = type_name.strip() - if not type_name: - return ["OBJECT"] - - # Handle Union[A, B] - if type_name.startswith("Union[") and type_name.endswith("]"): - inner = type_name[6:-1] - parts = _split_generics(inner) - result = [] - for p in parts: - result.extend(normalize_type_complex(p)) - return _unique(result) - - # Handle Optional[T] -> T | None - if type_name.startswith("Optional[") and type_name.endswith("]"): - inner = type_name[9:-1] - result = normalize_type_complex(inner) - if "null" not in result: - result.append("null") - return result - - # Handle AsyncGenerator[A, B] -> A | B - if type_name.startswith("AsyncGenerator[") and type_name.endswith("]"): - inner = type_name[15:-1] - parts = _split_generics(inner) - result = [] - for p in parts: - result.extend(normalize_type_complex(p)) - return _unique(result) - - # Handle tuple[A, B] -> [A, B] - if type_name.lower().startswith("tuple[") and type_name.endswith("]"): - # tuple[...] or Tuple[...] 
- inner = type_name[6:-1] - parts = _split_generics(inner) - result = [] - for p in parts: - result.extend(normalize_type_complex(p)) - return _unique(result) - - # Handle other generics like List[int] -> LIST - if "[" in type_name and type_name.endswith("]"): - base = type_name.split("[", 1)[0] - return [_simple_normalize(base)] - - return [_simple_normalize(type_name)] - - -def _simple_normalize(t: str) -> str: - t = t.lower().strip() - if t == "none": - return "null" - if t in ("list", "array", "slice", "vector"): - return "LIST" - if t in ("set",): - return "SET" - if t in ("map", "dictionary", "dict", "record", "hash"): - return "MAP" - if t in ("int", "integer", "long", "int64", "float", "double"): - return "NUMBER" - if t in ("str", "string"): - return "STRING" - if t in ("bool", "boolean"): - return "BOOLEAN" - if t == "any": - return "OBJECT" - if not t: - return "OBJECT" - return "OBJECT" - - -def _split_generics(s: str) -> list[str]: - """Split string by comma, ignoring nested brackets.""" - parts = [] - balance = 0 - current = [] - for char in s: - if char == "[": - balance += 1 - current.append(char) - elif char == "]": - balance -= 1 - current.append(char) - elif char == "," and balance == 0: - parts.append("".join(current).strip()) - current = [] - else: - current.append(char) - if current: - parts.append("".join(current).strip()) - return parts - - -def _unique(lst: list[str]) -> list[str]: - seen = set() - out = [] - for x in lst: - if x not in seen: - seen.add(x) - out.append(x) - return out diff --git a/test/adk/scope/extractors/test_converter_py.py b/test/adk/scope/extractors/test_converter_py.py index e9fcb6c..3fb096d 100644 --- a/test/adk/scope/extractors/test_converter_py.py +++ b/test/adk/scope/extractors/test_converter_py.py @@ -1,8 +1,9 @@ import unittest -from unittest.mock import Mock, patch from pathlib import Path -from google.adk.scope.extractors.converter_py import NodeProcessor +from unittest.mock import Mock, patch + from 
google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.extractors.converter_py import NodeProcessor from google.adk.scope.features_pb2 import Feature @@ -489,7 +490,7 @@ def node_child(name): result = self.processor.process(node, self.file_path, self.repo_root) self.assertEqual(len(result.parameters), 0) - @patch("google.adk.scope.extractors.converter_py.normalize_type_complex") + @patch("google.adk.scope.extractors.converter_py.TypeNormalizer.normalize") def test_param_enum_attribute_error(self, mock_normalize): # Force normalize to return a value not in keys mock_normalize.return_value = ["INVALID_TYPE_NAME"] @@ -821,8 +822,8 @@ def test_docstring_extended(self): node = self.create_mock_node( "function_definition", children=[name, body] ) - node.child_by_field_name.side_effect = ( - lambda n: name if n == "name" else (body if n == "body" else None) + node.child_by_field_name.side_effect = lambda n: ( + name if n == "name" else (body if n == "body" else None) ) result = self.processor.process(node, self.file_path, self.repo_root) diff --git a/test/adk/scope/extractors/test_converter_ts.py b/test/adk/scope/extractors/test_converter_ts.py index 6bee5cd..f2a579d 100644 --- a/test/adk/scope/extractors/test_converter_ts.py +++ b/test/adk/scope/extractors/test_converter_ts.py @@ -1,8 +1,9 @@ import unittest -from unittest.mock import Mock from pathlib import Path -from google.adk.scope.extractors.converter_ts import NodeProcessor +from unittest.mock import Mock + from google.adk.scope import features_pb2 as feature_pb2 +from google.adk.scope.extractors.converter_ts import NodeProcessor class TestNodeProcessor(unittest.TestCase): @@ -566,8 +567,8 @@ def child(n): node_s = self.create_mock_node( "method_definition", children=[set_kw, name_s] ) - node_s.child_by_field_name.side_effect = ( - lambda n: name_s if n == "name" else None + node_s.child_by_field_name.side_effect = lambda n: ( + name_s if n == "name" else None ) self.assertIsNone( @@ -595,59 
+596,14 @@ def test_parameter_modes_rest(self): node = self.create_mock_node( "function_declaration", children=[name, params] ) - node.child_by_field_name.side_effect = ( - lambda n: name - if n == "name" - else (params if n == "parameters" else None) + node.child_by_field_name.side_effect = lambda n: ( + name if n == "name" else (params if n == "parameters" else None) ) result = self.processor.process(node, self.file_path, self.repo_root) self.assertEqual(len(result.parameters), 1) self.assertEqual(result.parameters[0].original_name, "args") - def test_type_normalization_extended(self): - # Array -> LIST - # ReadonlyArray -> LIST - # Map -> MAP - # Set -> SET - # string[] -> LIST - # void -> [] - - def test_type(t_str, expected_norm): - norm = self.processor._normalize_ts_type(t_str) - # norm is list of strings (enum names) - # Map enum names to values? Or just check processor internal output? - # _normalize_ts_type returns list of STRINGS like ['LIST'] - self.assertEqual(norm, expected_norm) - - test_type("Array", ["LIST"]) - test_type("ReadonlyArray", ["LIST"]) - test_type("Map", ["MAP"]) - test_type("Set", ["SET"]) - test_type("string[]", ["LIST"]) - test_type("void", []) - test_type("path", ["STRING"]) - test_type("formattedstring", ["STRING"]) - test_type("boolean", ["BOOLEAN"]) - - def test_recursive_types(self): - # Promise> - # Array> - def test_type(t_str, expected_norm): - norm = self.processor._normalize_ts_type(t_str) - self.assertEqual(norm, expected_norm) - - # Promise unwrapping is recursive in logic? 
- # _normalize_ts_type(Promise) -> _normalize_ts_type(T) - # So Promise> -> Promise -> string -> [STRING] - test_type("Promise>", ["STRING"]) - - # Array> -> LIST - test_type("Array>", ["LIST"]) - - # Map> -> MAP - test_type("Map>", ["MAP"]) - def test_abstract_and_interfaces(self): # abstract class method # interface method @@ -657,8 +613,8 @@ def test_abstract_and_interfaces(self): abs_name = self.create_mock_node("identifier", text="Abs") abs_name.field_name = "name" abs_class.children = [abs_name] - abs_class.child_by_field_name.side_effect = ( - lambda n: abs_name if n == "name" else None + abs_class.child_by_field_name.side_effect = lambda n: ( + abs_name if n == "name" else None ) method_name = self.create_mock_node( @@ -668,8 +624,8 @@ def test_abstract_and_interfaces(self): method_node = self.create_mock_node( "method_definition", children=[method_name], parent=abs_class ) - method_node.child_by_field_name.side_effect = ( - lambda n: method_name if n == "name" else None + method_node.child_by_field_name.side_effect = lambda n: ( + method_name if n == "name" else None ) result = self.processor.process( @@ -682,8 +638,8 @@ def test_abstract_and_interfaces(self): iface_name = self.create_mock_node("identifier", text="IFace") iface_name.field_name = "name" iface.children = [iface_name] - iface.child_by_field_name.side_effect = ( - lambda n: iface_name if n == "name" else None + iface.child_by_field_name.side_effect = lambda n: ( + iface_name if n == "name" else None ) # Interface method might be method_signature in TS, @@ -703,8 +659,8 @@ def test_abstract_and_interfaces(self): method_node_i = self.create_mock_node( "method_definition", children=[method_name], parent=iface ) - method_node_i.child_by_field_name.side_effect = ( - lambda n: method_name if n == "name" else None + method_node_i.child_by_field_name.side_effect = lambda n: ( + method_name if n == "name" else None ) result_i = self.processor.process( @@ -776,8 +732,8 @@ def 
test_jsdoc_with_decorator_interleaved(self): func = self.create_mock_node( "function_declaration", children=[name], prev_sibling=deco ) - func.child_by_field_name.side_effect = ( - lambda n: name if n == "name" else None + func.child_by_field_name.side_effect = lambda n: ( + name if n == "name" else None ) result = self.processor.process(func, self.file_path, self.repo_root) diff --git a/test/adk/scope/extractors/test_extract.py b/test/adk/scope/extractors/test_extract.py index d8dd013..660662e 100644 --- a/test/adk/scope/extractors/test_extract.py +++ b/test/adk/scope/extractors/test_extract.py @@ -1,9 +1,10 @@ +import shutil +import tempfile import unittest -from unittest.mock import patch, MagicMock from pathlib import Path -import tempfile -import shutil +from unittest.mock import MagicMock, patch +from google.adk.scope import features_pb2 as feature_pb2 from google.adk.scope.extractors import extract @@ -85,11 +86,24 @@ def setUp(self): self.mock_extractors["python"] = self.mock_py_extractor self.mock_extractors["typescript"] = self.mock_ts_extractor + # Enforce Output Generation + self.mock_json_patcher = patch( + "google.adk.scope.extractors.extract._JSON_OUTPUT", True + ) + self.mock_json_patcher.start() + + self.mock_yaml_patcher = patch( + "google.adk.scope.extractors.extract._YAML_OUTPUT", True + ) + self.mock_yaml_patcher.start() + # FeatureRegistry mock/patch ? 
# Actual FeatureRegistry is fine if we mock return of extract_features # But we write to file using MessageToJson def tearDown(self): + self.mock_yaml_patcher.stop() + self.mock_json_patcher.stop() self.mock_args_patcher.stop() self.mock_extractors_patcher.stop() shutil.rmtree(self.test_dir) @@ -127,16 +141,18 @@ def test_input_file_mode(self): f.touch() self.configure_args(lang="python", input_file=str(f)) - self.mock_py_extractor.extract_features.return_value = [] + self.mock_py_extractor.extract_features.return_value = [ + feature_pb2.Feature(original_name="dummy") + ] self.mock_py_extractor.get_version.return_value = "1.0" extract.main() self.mock_py_extractor.extract_features.assert_called() self.assertTrue(self.output_dir.exists()) - self.assertEqual(len(list(self.output_dir.glob("py_*.json"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.yaml"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.txtpb"))), 1) + self.assertTrue((self.output_dir / "py.json").exists()) + self.assertTrue((self.output_dir / "py.yaml").exists()) + self.assertTrue((self.output_dir / "py.txtpb").exists()) def test_input_file_not_found(self): self.configure_args(lang="python", input_file="/non/existent.py") @@ -150,23 +166,28 @@ def test_input_dir_mode(self): self.configure_args(lang="python", input_dir=str(d)) self.mock_py_extractor.find_files.return_value = [d / "a.py"] - self.mock_py_extractor.extract_features.return_value = [] + self.mock_py_extractor.extract_features.return_value = [ + feature_pb2.Feature(original_name="dummy") + ] self.mock_py_extractor.get_version.return_value = "1.0" extract.main() self.mock_py_extractor.find_files.assert_called_with(d, recursive=False) self.assertTrue(self.output_dir.exists()) - self.assertEqual(len(list(self.output_dir.glob("py_*.json"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.yaml"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.txtpb"))), 1) + self.assertTrue((self.output_dir / 
"py.json").exists()) + self.assertTrue((self.output_dir / "py.yaml").exists()) + self.assertTrue((self.output_dir / "py.txtpb").exists()) def test_input_repo_mode(self): r = self.root (r / "src").mkdir() self.configure_args(lang="python", input_repo=str(r)) - self.mock_py_extractor.find_files.return_value = [] + self.mock_py_extractor.find_files.return_value = [r / "src" / "a.py"] + self.mock_py_extractor.extract_features.return_value = [ + feature_pb2.Feature(original_name="dummy") + ] self.mock_py_extractor.get_version.return_value = "1.0" extract.main() @@ -176,9 +197,9 @@ def test_input_repo_mode(self): r / "src", recursive=True ) self.assertTrue(self.output_dir.exists()) - self.assertEqual(len(list(self.output_dir.glob("py_*.json"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.yaml"))), 1) - self.assertEqual(len(list(self.output_dir.glob("py_*.txtpb"))), 1) + self.assertTrue((self.output_dir / "py.json").exists()) + self.assertTrue((self.output_dir / "py.yaml").exists()) + self.assertTrue((self.output_dir / "py.txtpb").exists()) if __name__ == "__main__": diff --git a/test/adk/scope/extractors/test_extractor_py.py b/test/adk/scope/extractors/test_extractor_py.py index a457068..bd28bf5 100644 --- a/test/adk/scope/extractors/test_extractor_py.py +++ b/test/adk/scope/extractors/test_extractor_py.py @@ -1,11 +1,10 @@ -import unittest -from unittest.mock import patch, MagicMock import sys +import unittest from pathlib import Path -from google.adk.scope.extractors.extractor_py import ( - find_files, - extract_features, -) +from unittest.mock import MagicMock, patch + +from google.adk.scope.extractors.extractor_py import (extract_features, + find_files) from google.adk.scope.features_pb2 import Feature # Mock tree_sitter modules BEFORE importing extractor diff --git a/test/adk/scope/extractors/test_extractor_ts.py b/test/adk/scope/extractors/test_extractor_ts.py index cf750fe..f68c49a 100644 --- a/test/adk/scope/extractors/test_extractor_ts.py +++ 
b/test/adk/scope/extractors/test_extractor_ts.py @@ -1,14 +1,14 @@ +import shutil +import tempfile import unittest -from unittest.mock import Mock, patch from pathlib import Path -import tempfile -import shutil +from unittest.mock import Mock, patch +from google.adk.scope import features_pb2 # Import the module under test # We need to make sure the src path is in PYTHONPATH which is handled # by test runner usually from google.adk.scope.extractors import extractor_ts as extractor -from google.adk.scope import features_pb2 class TestExtractor(unittest.TestCase): diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py new file mode 100644 index 0000000..d0aae2f --- /dev/null +++ b/test/adk/scope/matcher/test_matcher.py @@ -0,0 +1,431 @@ +import os +import tempfile +import unittest +from unittest.mock import patch +from google.adk.scope import features_pb2 +from google.adk.scope.matcher import matcher + + +class TestMatcher(unittest.TestCase): + def test_read_feature_registry(self): + content = """ + language: "PYTHON" + version: "1.0.0" + features { + original_name: "test_feature" + normalized_name: "test_feature" + type: FUNCTION + } + """ + with tempfile.NamedTemporaryFile( + mode="w", suffix=".txtpb", delete=False + ) as f: + f.write(content) + temp_path = f.name + + try: + registry = matcher.read_feature_registry(temp_path) + self.assertEqual(registry.language, "PYTHON") + self.assertEqual(registry.version, "1.0.0") + self.assertEqual(len(registry.features), 1) + self.assertEqual(registry.features[0].original_name, "test_feature") + self.assertEqual( + registry.features[0].type, features_pb2.Feature.Type.FUNCTION + ) + finally: + os.remove(temp_path) + + def test_match_features(self): + f1 = features_pb2.Feature( + normalized_name="f_same", + normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f2 = features_pb2.Feature( + normalized_name="f_same", + 
normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f3 = features_pb2.Feature( + normalized_name="totally", + normalized_member_of="different", + normalized_namespace="stuff", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f4 = features_pb2.Feature( + normalized_name="entirely", + normalized_member_of="unrelated", + normalized_namespace="things", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + base_features = [f1, f3] + target_features = [f4, f2] + + matches = matcher.match_features(base_features, target_features, 0.8) + + self.assertEqual(len(matches), 1) + + m_f1, m_f2, score = matches[0] + self.assertEqual(m_f1.normalized_name, "f_same") + self.assertEqual(m_f2.normalized_name, "f_same") + self.assertGreater(score, 0.8) + + # Assert lists were mutated and matched elements removed + self.assertEqual(len(base_features), 1) + self.assertEqual(base_features[0].normalized_name, "totally") + + self.assertEqual(len(target_features), 1) + self.assertEqual(target_features[0].normalized_name, "entirely") + + def test_match_registries(self): + # f1 & f2 are a solid match (score ~ 1.0) + f1 = features_pb2.Feature( + original_name="fSameBase", + normalized_name="f_same", + member_of="BaseClass", + namespace="google.adk.events", + normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f2 = features_pb2.Feature( + original_name="fSameTarget", + normalized_name="f_same", + member_of="TargetClass", + namespace="adk.events", + normalized_member_of="c_same", + normalized_namespace="n_same", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + # f_near_base & f_near_target are a near miss + # (different names, same structural namespace/class) + # Using different return types and different enough names to + # drop the score below 0.8 + f_near_base = features_pb2.Feature( + original_name="base_name", + 
normalized_name="base_name", + member_of="base_member", + namespace="google.adk.events", + normalized_member_of="base_member", + normalized_namespace="n_same", + original_return_types=["string"], + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + f_near_target = features_pb2.Feature( + original_name="target_name", + normalized_name="targ_name", + member_of="target_member", + namespace="adk.events", + normalized_member_of="target_member", + normalized_namespace="n_same", + original_return_types=["int"], + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + # f3 is a complete gap (base-exclusive) + f3 = features_pb2.Feature( + original_name="totally_diff", + normalized_name="totally", + member_of="null", + namespace="google.adk.events", + normalized_member_of="different", + normalized_namespace="stuff", + type=features_pb2.Feature.Type.INSTANCE_METHOD, + ) + + base_registry = features_pb2.FeatureRegistry( + language="Python", version="1.0.0" + ) + base_registry.features.extend([f1, f_near_base, f3]) + + target_registry = features_pb2.FeatureRegistry( + language="TypeScript", version="2.0.0" + ) + target_registry.features.extend([f2, f_near_target]) + + # Test Symmetric Report + result_sym = matcher.match_registries( + base_registry, target_registry, 0.9, report_type="symmetric" + ) + report_sym = result_sym.master_content + + # 1. Verify Master Report Structure + self.assertIn("# Feature Matching Report: Symmetric", report_sym) + self.assertIn("**Jaccard Index:** 25.00%", report_sym) + self.assertIn("## Module Summary", report_sym) + + # Check for module entry in master summary + self.assertIn("| `n_same` |", report_sym) + self.assertIn("[View Details]({modules_dir}/n_same.md)", report_sym) + + # 2. 
Verify Module Content + self.assertIn("n_same.md", result_sym.module_files) + module_content = result_sym.module_files["n_same.md"] + + self.assertIn("# Module: `n_same`", module_content) + self.assertIn("**Features:** 3", module_content) + + # Solid Matches + self.assertIn("### ✅ Solid Features", module_content) + self.assertIn( + "| Type | Base Feature | Target Feature | Similarity Score |", + module_content, + ) + self.assertIn( + "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", + module_content, + ) + + # Potential Matches (formerly Near Misses) + self.assertIn("### ⚠️ Potential Matches", module_content) + self.assertIn( + "| Type | Base Feature | Closest Target Candidate | Similarity |", + module_content, + ) + self.assertIn( + "| method | `base_member.base_name` | " + "`target_member.target_name` |", + module_content, + ) + + # Unmatched / Gaps (in 'stuff' module) + self.assertIn("stuff.md", result_sym.module_files) + stuff_content = result_sym.module_files["stuff.md"] + self.assertIn("### ❌ Unmatched Features", stuff_content) + self.assertIn("| `totally_diff` | Target |", stuff_content) + self.assertIn("**Features:** 1", stuff_content) + + # Test Directional Report + result_dir = matcher.match_registries( + base_registry, target_registry, 0.9, report_type="directional" + ) + report_dir = result_dir.master_content + + self.assertIn("| **F1 Score** | 40.00% |", report_dir) + self.assertIn("n_same.md", result_dir.module_files) + + mod_dir_content = result_dir.module_files["n_same.md"] + + # Solid Matches + self.assertIn("### ✅ Matched Features", mod_dir_content) + self.assertIn( + "| Type | Base Feature | Target Feature | Similarity Score |", + mod_dir_content, + ) + self.assertIn( + "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", + mod_dir_content, + ) + + # Potential Matches + self.assertIn("### ⚠️ Potential Matches", mod_dir_content) + self.assertIn( + "| Type | Base Feature | Closest Target Candidate | Similarity |", + 
mod_dir_content, + ) + self.assertIn( + "| method | `base_member.base_name` | " + "`target_member.target_name` |", + mod_dir_content, + ) + + # Unmatched / Gaps (in 'stuff' module) + self.assertIn("stuff.md", result_dir.module_files) + stuff_dir_content = result_dir.module_files["stuff.md"] + self.assertIn("### ❌ Missing in Target", stuff_dir_content) + self.assertIn("| `totally_diff` |", stuff_dir_content) + + def test_match_registries_raw(self): + f1 = features_pb2.Feature( + original_name="f_same", + normalized_name="f_same", + normalized_namespace="pkg", + member_of="MyClass", + normalized_member_of="myclass", + type=features_pb2.Feature.Type.FUNCTION, + ) + base = features_pb2.FeatureRegistry(language="Python", version="1") + base.features.append(f1) + target = features_pb2.FeatureRegistry(language="TS", version="2") + target.features.append(f1) + + result = matcher.match_registries(base, target, 0.9, report_type="raw") + csv_content = result.master_content + + expected_header = ( + "py_namespace,py_member_of,py_name,ts_namespace," + "ts_member_of,ts_name,type,score" + ) + self.assertIn(expected_header, csv_content) + + # Check for solid match line + # f1 has: ns=pkg, mem=MyClass, name=f_same + # Match should have same values for base and target + expected_line = "pkg,MyClass,f_same,pkg,MyClass,f_same,function,1.0000" + self.assertIn(expected_line, csv_content) + self.assertFalse(result.module_files) + + def test_group_features_by_module(self): + registry = features_pb2.FeatureRegistry() + f1 = registry.features.add() + f1.namespace = "module.one" + f2 = registry.features.add() + f2.namespace = "module.two" + f3 = registry.features.add() + f3.namespace = "module.one" + + result = matcher._group_features_by_module(registry) + + self.assertIn("module.one", result) + self.assertIn("module.two", result) + self.assertEqual(len(result["module.one"]), 2) + self.assertEqual(len(result["module.two"]), 1) + + def test_fuzzy_match_namespaces(self): + features_base = 
{"module.one": [], "module.two": []} + features_target = { + "module.one": [features_pb2.Feature(original_name="f1_target")], + "module.ones": [features_pb2.Feature(original_name="f4")], + "module.three": [features_pb2.Feature(original_name="f5")], + } + + matcher._fuzzy_match_namespaces(features_base, features_target) + + self.assertIn("module.one", features_target) + self.assertIn("module.two", features_target) + self.assertNotIn("module.ones", features_target) + self.assertNotIn("module.three", features_target) + self.assertEqual(len(features_target["module.one"]), 3) + self.assertEqual(len(features_target["module.two"]), 0) + + def test_process_module(self): + """Tests the end-to-end processing of a single module.""" + f_base = features_pb2.Feature( + original_name="f1_base", + normalized_name="f1_base", + normalized_namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, + ) + f_target = features_pb2.Feature( + original_name="f1_target", + normalized_name="f1_target", + normalized_namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, + ) + + with patch( + "google.adk.scope.matcher.matcher.match_features" + ) as mock_match: + # Let's assume one solid match and no potential matches + mock_match.side_effect = [ + [(f_base, f_target, 0.95)], # Solid matches + [], # Potential matches + ] + + result = matcher._process_module( + module="n1", + base_list=[f_base], + target_list=[f_target], + alpha=0.9, + report_type="symmetric", + base_lang_code="py", + target_lang_code="ts", + ) + + self.assertEqual(result["solid_matches_count"], 1) + self.assertEqual(result["score"], 1.0) + self.assertIn("| py, ts |", result["row_content"]) + self.assertIn("# Module: `n1`", result["module_content"]) + self.assertIn("### ✅ Solid Features", result["module_content"]) + + def test_generate_raw_report(self): + """Tests the raw CSV report generation.""" + f_base = features_pb2.Feature( + original_name="f1_base", + normalized_name="f1_base", + namespace="n1", + member_of="c1", + 
type=features_pb2.Feature.Type.FUNCTION, + ) + + base_registry = features_pb2.FeatureRegistry( + language="Python", version="1.0.0" + ) + target_registry = features_pb2.FeatureRegistry( + language="TypeScript", version="2.0.0" + ) + + with patch( + "google.adk.scope.matcher.matcher.match_features" + ) as mock_match: + mock_match.return_value = [] # No matches for simplicity + + result = matcher._generate_raw_report( + base_registry=base_registry, + target_registry=target_registry, + all_modules=["n1"], + features_base={"n1": [f_base]}, + features_target={"n1": []}, + alpha=0.9, + ) + + self.assertIn( + "py_namespace,py_member_of,py_name", result.master_content + ) + self.assertIn("n1,c1,f1_base", result.master_content) + + def test_generate_markdown_report(self): + """Tests the markdown report generation.""" + base_registry = features_pb2.FeatureRegistry( + language="Python", version="1.0.0" + ) + target_registry = features_pb2.FeatureRegistry( + language="TypeScript", version="2.0.0" + ) + + with patch( + "google.adk.scope.matcher.matcher._process_module" + ) as mock_process: + mock_process.return_value = { + "solid_matches_count": 1, + "score": 1.0, + "row_content": "| py, ts | `n1` | 1 | 100.00% | ✅ | n1.md |", + "module_filename": "n1.md", + "module_content": "# Module: `n1`", + } + + result = matcher._generate_markdown_report( + base_registry=base_registry, + target_registry=target_registry, + all_modules=["n1"], + features_base={"n1": []}, # Dummy data + features_target={"n1": []}, # Dummy data + alpha=0.9, + report_type="symmetric", + ) + + self.assertIn( + "# Feature Matching Report: Symmetric", result.master_content + ) + self.assertIn("## Module Summary", result.master_content) + self.assertIn("| `n1` |", result.master_content) + self.assertIn("n1.md", result.module_files) + + def test_fuzzy_match_namespaces_empty_base(self): + features_base = {} + features_target = { + "module.one": [features_pb2.Feature(original_name="f1")] + } + + 
matcher._fuzzy_match_namespaces(features_base, features_target) + + self.assertIn("module.one", features_target) + self.assertEqual(len(features_target["module.one"]), 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/adk/scope/utils/test_args.py b/test/adk/scope/utils/test_args.py index 0eef2a7..d1ac3aa 100644 --- a/test/adk/scope/utils/test_args.py +++ b/test/adk/scope/utils/test_args.py @@ -1,42 +1,56 @@ -import unittest import argparse -from unittest.mock import patch -from pathlib import Path -from google.adk.scope.utils.args import parse_args - - -class TestArgs(unittest.TestCase): - @patch("argparse.ArgumentParser.parse_args") - def test_parse_args(self, mock_parse): - # Setup mock return value - mock_args = argparse.Namespace( - language="py", - input_repo=Path("/tmp/repo"), - output=Path("/tmp/out_dir"), - input_file=None, - input_dir=None, - ) - mock_parse.return_value = mock_args +import logging +import unittest +from unittest import mock + +from google.adk.scope.utils import args as adk_args + + +class ArgsTest(unittest.TestCase): - # Call the function - # (arguments are parsed from sys.argv by default, - # but we mocked parse_args) - args = parse_args() + def test_add_verbose_argument(self): + parser = argparse.ArgumentParser() + adk_args.add_verbose_argument(parser) + args = parser.parse_args(["--verbose"]) + self.assertTrue(args.verbose) - self.assertEqual(args.input_repo, Path("/tmp/repo")) - self.assertEqual(args.output, Path("/tmp/out_dir")) - # Should be normalized - self.assertEqual(args.language, "python") + @mock.patch("logging.basicConfig") + def test_configure_logging_verbose(self, mock_basic_config): + args = argparse.Namespace(verbose=True) + adk_args.configure_logging(args) + mock_basic_config.assert_called_once_with(level=logging.DEBUG) + + @mock.patch("logging.basicConfig") + def test_configure_logging_default(self, mock_basic_config): + args = argparse.Namespace(verbose=False) + adk_args.configure_logging(args) + 
mock_basic_config.assert_called_once_with(level=logging.INFO) + + @mock.patch("argparse.ArgumentParser.parse_args") + def test_parse_args_python(self, mock_parse_args): + mock_parse_args.return_value = argparse.Namespace( + language="py", input_file="test.py", output="out", verbose=False + ) + parsed_args = adk_args.parse_args() + self.assertEqual(parsed_args.language, "python") + + @mock.patch("argparse.ArgumentParser.parse_args") + def test_parse_args_typescript(self, mock_parse_args): + mock_parse_args.return_value = argparse.Namespace( + language="ts", input_file="test.ts", output="out", verbose=False + ) + parsed_args = adk_args.parse_args() + self.assertEqual(parsed_args.language, "typescript") def test_arg_definitions(self): # Verify that the parser is set up with correct arguments - with patch("argparse.ArgumentParser") as mock_parser_cls: + with mock.patch("argparse.ArgumentParser") as mock_parser_cls: mock_parser = mock_parser_cls.return_value # We also need to mock the group returned by # add_mutually_exclusive_group mock_group = mock_parser.add_mutually_exclusive_group.return_value - parse_args() + adk_args.parse_args() # Verify mutual exclusive group creation mock_parser.add_mutually_exclusive_group.assert_called_once_with( diff --git a/test/adk/scope/utils/test_normalizer.py b/test/adk/scope/utils/test_normalizer.py new file mode 100644 index 0000000..e6538c6 --- /dev/null +++ b/test/adk/scope/utils/test_normalizer.py @@ -0,0 +1,127 @@ +import unittest + +from google.adk.scope.utils.normalizer import TypeNormalizer, normalize_name + + +class TestStrings(unittest.TestCase): + def test_normalize_name(self): + # CamelCase + self.assertEqual(normalize_name("ThisName"), "this_name") + self.assertEqual(normalize_name("CamelCase"), "camel_case") + + # camelCase + self.assertEqual(normalize_name("thisName"), "this_name") + self.assertEqual(normalize_name("runAsync"), "run_async") + + # snake_case + self.assertEqual(normalize_name("this_name"), "this_name") + 
self.assertEqual(normalize_name("already_snake"), "already_snake") + + # PascalCase with acronyms + self.assertEqual(normalize_name("HTTPResponse"), "http_response") + self.assertEqual(normalize_name("XMLParser"), "xml_parser") + + # Kebab-case + self.assertEqual(normalize_name("kebab-case"), "kebab_case") + + # Mixed + self.assertEqual( + normalize_name("JSONReaderWorker"), "json_reader_worker" + ) + + +class TestTypeNormalizer(unittest.TestCase): + def setUp(self): + self.normalizer = TypeNormalizer() + + def test_python_normalization(self): + self.assertEqual(self.normalizer.normalize("str", "python"), ["STRING"]) + self.assertEqual(self.normalizer.normalize("int", "python"), ["NUMBER"]) + self.assertEqual( + self.normalizer.normalize("bool", "python"), ["BOOLEAN"] + ) + self.assertEqual(self.normalizer.normalize("list", "python"), ["LIST"]) + self.assertEqual(self.normalizer.normalize("dict", "python"), ["MAP"]) + self.assertEqual(self.normalizer.normalize("set", "python"), ["SET"]) + self.assertEqual(self.normalizer.normalize("None", "python"), ["null"]) + self.assertEqual(self.normalizer.normalize("any", "python"), ["OBJECT"]) + self.assertEqual( + self.normalizer.normalize("Optional[str]", "python"), + ["STRING", "null"], + ) + self.assertEqual( + self.normalizer.normalize("Union[str, int]", "python"), + ["STRING", "NUMBER"], + ) + self.assertEqual( + self.normalizer.normalize("List[int]", "python"), ["LIST"] + ) + self.assertEqual( + self.normalizer.normalize("Tuple[str, int]", "python"), + ["STRING", "NUMBER"], + ) + + def test_typescript_normalization(self): + self.assertEqual( + self.normalizer.normalize("string", "typescript"), ["STRING"] + ) + self.assertEqual( + self.normalizer.normalize("number", "typescript"), ["NUMBER"] + ) + self.assertEqual( + self.normalizer.normalize("boolean", "typescript"), ["BOOLEAN"] + ) + self.assertEqual( + self.normalizer.normalize("string[]", "typescript"), ["LIST"] + ) + self.assertEqual( + 
self.normalizer.normalize("Array", "typescript"), ["LIST"] + ) + self.assertEqual( + self.normalizer.normalize("Map", "typescript"), + ["MAP"], + ) + self.assertEqual( + self.normalizer.normalize("Set", "typescript"), ["SET"] + ) + self.assertEqual(self.normalizer.normalize("void", "typescript"), []) + self.assertEqual( + self.normalizer.normalize("any", "typescript"), ["OBJECT"] + ) + self.assertEqual( + self.normalizer.normalize("unknown", "typescript"), ["UNKNOWN"] + ) + normalized = self.normalizer.normalize("Promise", "typescript") + self.assertEqual(normalized, ["STRING"]) + self.assertEqual( + self.normalizer.normalize("string | number", "typescript"), + ["STRING", "NUMBER"], + ) + + def test_edge_cases(self): + self.assertEqual(self.normalizer.normalize("", "python"), ["OBJECT"]) + self.assertEqual(self.normalizer.normalize(" ", "python"), ["OBJECT"]) + self.assertEqual( + self.normalizer.normalize("", "typescript"), ["OBJECT"] + ) + self.assertEqual( + self.normalizer.normalize(" ", "typescript"), ["OBJECT"] + ) + self.assertEqual( + self.normalizer.normalize("unsupported_type", "python"), ["OBJECT"] + ) + self.assertEqual( + self.normalizer.normalize("unsupported_type", "typescript"), + ["OBJECT"], + ) + self.assertEqual( + self.normalizer.normalize("str", "unsupported_language"), ["STRING"] + ) + self.assertEqual( + self.normalizer.normalize("MyCustomType", "unsupported_language"), + ["OBJECT"], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/adk/scope/utils/test_similarity.py b/test/adk/scope/utils/test_similarity.py new file mode 100644 index 0000000..4cb0473 --- /dev/null +++ b/test/adk/scope/utils/test_similarity.py @@ -0,0 +1,264 @@ +"""Unit tests for the SimilarityScorer class.""" + +import unittest + +from google.adk.scope import features_pb2 as features_pb +from google.adk.scope.utils.similarity import SimilarityScorer + + +class TestSimilarityScorer(unittest.TestCase): + """Test suite for the SimilarityScorer.""" + + def 
setUp(self): + """Set up a default scorer and a sample feature for the tests.""" + self.scorer = SimilarityScorer() + feature_args = { + "normalized_name": "my_func", + "normalized_member_of": "my_class", + "normalized_namespace": "my_module", + "parameters": [ + features_pb.Param( + normalized_name="p1", + normalized_types=[features_pb.ParamType.STRING], + ), + features_pb.Param( + normalized_name="p2", + normalized_types=[features_pb.ParamType.NUMBER], + is_optional=True, + ), + ], + "normalized_return_types": ["STRING"], + "async": True, + "type": features_pb.Feature.Type.INSTANCE_METHOD, + } + self.feature1 = features_pb.Feature(**feature_args) + + def test_initialization(self): + """Test that the scorer initializes with default and custom weights.""" + self.assertIsNotNone(self.scorer.weights) + custom_weights = { + "name": 1.0, + "member_of": 0.0, + "namespace": 0.0, + "parameters": 0.0, + "return_type": 0.0, + } + custom_scorer = SimilarityScorer(weights=custom_weights) + self.assertEqual(custom_scorer.weights, custom_weights) + + def test_identical_features(self): + """Test that identical features yield a score of 1.0.""" + score = self.scorer.get_similarity_score(self.feature1, self.feature1) + self.assertAlmostEqual(score, 1.0) + + def test_completely_dissimilar_features(self): + """Test that dissimilar features trigger an early exit.""" + feature2 = features_pb.Feature( + normalized_name="completely_different_function", + normalized_member_of="another_world", + normalized_namespace="a_galaxy_far_away", + type=features_pb.Feature.Type.INSTANCE_METHOD, + ) + score = self.scorer.get_similarity_score(self.feature1, feature2) + self.assertLess(score, 0.4, f"Early exit failed; score was {score}") + + def test_partial_similarity(self): + """Test a scenario with partially similar features.""" + feature2_args = { + "normalized_name": "my_func", # Same name + "normalized_member_of": "my_class", # Same class + "normalized_namespace": "a_different_module", # Diff 
namespace + "parameters": [ + features_pb.Param( + normalized_name="p1", + normalized_types=[features_pb.ParamType.STRING], + ), + ], + "normalized_return_types": ["NUMBER"], # Diff return + "async": False, # Diff async + "type": features_pb.Feature.Type.INSTANCE_METHOD, + } + feature2 = features_pb.Feature(**feature2_args) + score = self.scorer.get_similarity_score(self.feature1, feature2) + self.assertTrue( + 0 < score < 1.0, + f"Score {score} was not in the expected range (0, 1)", + ) + + def test_parameter_edge_cases(self): + """Test scoring with different parameter list configurations.""" + feature_no_params = features_pb.Feature( + normalized_name="func", + normalized_member_of="class", + normalized_namespace="ns", + type=features_pb.Feature.Type.INSTANCE_METHOD, + ) + feature_one_param = features_pb.Feature( + normalized_name="func", + normalized_member_of="class", + normalized_namespace="ns", + parameters=[features_pb.Param(normalized_name="p1")], + type=features_pb.Feature.Type.INSTANCE_METHOD, + ) + + # One empty, one not - should be an imperfect match + score = self.scorer.get_similarity_score( + feature_no_params, feature_one_param + ) + self.assertLess( + score, + 1.0, + "Score should be less than 1.0 when one param list is empty", + ) + + def test_return_type_edge_cases(self): + """Test scoring with different return type configurations.""" + feature_no_return = features_pb.Feature( + normalized_name="my_func", + normalized_member_of="my_class", + normalized_namespace="my_module", + type=features_pb.Feature.Type.INSTANCE_METHOD, + ) + + score = self.scorer.get_similarity_score( + self.feature1, feature_no_return + ) + self.assertLess( + score, 1.0, "Score should be less than 1.0 when return types differ" + ) + + def test_run_async_integration(self): + """Test similarity of TypeScript and Python 'run_async' features.""" + ts_feature_args = { + "normalized_name": "run_async", + "normalized_member_of": "runner", + "normalized_namespace": "runner", + 
"parameters": [ + features_pb.Param( + normalized_name="user_id", normalized_types=["STRING"] + ), + features_pb.Param( + normalized_name="session_id", normalized_types=["STRING"] + ), + features_pb.Param( + normalized_name="new_message", normalized_types=["OBJECT"] + ), + features_pb.Param( + normalized_name="state_delta", + normalized_types=["OBJECT"], + is_optional=True, + ), + features_pb.Param( + normalized_name="run_config", + normalized_types=["OBJECT"], + is_optional=True, + ), + ], + "normalized_return_types": ["OBJECT"], + "async": True, + "type": features_pb.Feature.Type.INSTANCE_METHOD, + } + ts_feature = features_pb.Feature(**ts_feature_args) + + py_feature_args = { + "normalized_name": "run_async", + "normalized_member_of": "runner", + "normalized_namespace": "runners", + "parameters": [ + features_pb.Param( + normalized_name="user_id", normalized_types=["STRING"] + ), + features_pb.Param( + normalized_name="session_id", normalized_types=["STRING"] + ), + features_pb.Param( + normalized_name="invocation_id", + normalized_types=["STRING"], + is_optional=True, + ), + features_pb.Param( + normalized_name="new_message", + normalized_types=["OBJECT"], + is_optional=True, + ), + features_pb.Param( + normalized_name="state_delta", + normalized_types=["MAP"], + is_optional=True, + ), + features_pb.Param( + normalized_name="run_config", + normalized_types=["OBJECT"], + is_optional=True, + ), + ], + "normalized_return_types": ["OBJECT", "null"], + "async": True, + "type": features_pb.Feature.Type.INSTANCE_METHOD, + } + py_feature = features_pb.Feature(**py_feature_args) + + score = self.scorer.get_similarity_score(ts_feature, py_feature) + + self.assertTrue( + 0.5 < score < 1.0, + f"Score {score} was not in the expected range (0.5, 1.0)", + ) + + def test_type_mismatch_early_exit(self): + """Test that comparing differing types yields 0.0 immediately.""" + feature_constructor = features_pb.Feature( + normalized_name="my_func", + normalized_member_of="my_class", 
+ normalized_namespace="my_module", + type=features_pb.Feature.Type.CONSTRUCTOR, + ) + score = self.scorer.get_similarity_score( + self.feature1, feature_constructor + ) + self.assertEqual(score, 0.0) + + def test_constructor_weights(self): + """Test that CONSTRUCTOR comparisons ignore name and prioritize + member_of. + """ + c1 = features_pb.Feature( + normalized_name="constructor1", # Completely diff names + normalized_member_of="MyClass", + normalized_namespace="my_module", + type=features_pb.Feature.Type.CONSTRUCTOR, + ) + c2 = features_pb.Feature( + normalized_name="constructor2", + normalized_member_of="MyClass", + normalized_namespace="my_module", + type=features_pb.Feature.Type.CONSTRUCTOR, + ) + + # Despite name difference, should score very highly due to weight shift + score = self.scorer.get_similarity_score(c1, c2) + self.assertGreater(score, 0.9) + + def test_function_weights(self): + """Test that FUNCTION comparisons ignore member_of and prioritize + name. + """ + f1 = features_pb.Feature( + normalized_name="myFunction", + normalized_member_of="SomeClass", # Completely diff member_of + normalized_namespace="my_module", + type=features_pb.Feature.Type.FUNCTION, + ) + f2 = features_pb.Feature( + normalized_name="myFunction", + normalized_member_of="DifferentClass", + normalized_namespace="my_module", + type=features_pb.Feature.Type.FUNCTION, + ) + + score = self.scorer.get_similarity_score(f1, f2) + self.assertGreater(score, 0.9) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/adk/scope/utils/test_stats.py b/test/adk/scope/utils/test_stats.py new file mode 100644 index 0000000..a98b0ab --- /dev/null +++ b/test/adk/scope/utils/test_stats.py @@ -0,0 +1,26 @@ +import unittest + +from google.adk.scope.utils import stats + + +class TestStats(unittest.TestCase): + def test_precision(self): + self.assertEqual(stats.calculate_precision(10, 20), 0.5) + self.assertEqual(stats.calculate_precision(0, 20), 0.0) + 
self.assertEqual(stats.calculate_precision(10, 0), 1.0) # Edge case + + def test_recall(self): + self.assertEqual(stats.calculate_recall(10, 20), 0.5) + self.assertEqual(stats.calculate_recall(0, 20), 0.0) + self.assertEqual(stats.calculate_recall(10, 0), 1.0) # Edge case + + def test_f1(self): + self.assertAlmostEqual(stats.calculate_f1(0.5, 0.5), 0.5) + self.assertAlmostEqual(stats.calculate_f1(1.0, 1.0), 1.0) + self.assertAlmostEqual(stats.calculate_f1(0.0, 1.0), 0.0) + self.assertAlmostEqual(stats.calculate_f1(0.0, 0.0), 0.0) + self.assertAlmostEqual(stats.calculate_f1(0.75, 0.5), 0.6) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/adk/scope/utils/test_strings.py b/test/adk/scope/utils/test_strings.py deleted file mode 100644 index 956c2ee..0000000 --- a/test/adk/scope/utils/test_strings.py +++ /dev/null @@ -1,124 +0,0 @@ -import unittest -from google.adk.scope.utils.strings import normalize_name, normalize_type - - -class TestStrings(unittest.TestCase): - def test_normalize_name(self): - # CamelCase - self.assertEqual(normalize_name("ThisName"), "this_name") - self.assertEqual(normalize_name("CamelCase"), "camel_case") - - # camelCase - self.assertEqual(normalize_name("thisName"), "this_name") - self.assertEqual(normalize_name("runAsync"), "run_async") - - # snake_case - self.assertEqual(normalize_name("this_name"), "this_name") - self.assertEqual(normalize_name("already_snake"), "already_snake") - - # PascalCase with acronyms - self.assertEqual(normalize_name("HTTPResponse"), "http_response") - self.assertEqual(normalize_name("XMLParser"), "xml_parser") - - # Kebab-case - self.assertEqual(normalize_name("kebab-case"), "kebab_case") - - # Mixed - self.assertEqual( - normalize_name("JSONReaderWorker"), "json_reader_worker" - ) - - def test_normalize_type(self): - # List variants - self.assertEqual(normalize_type("list"), "LIST") - self.assertEqual(normalize_type("List"), "LIST") - self.assertEqual(normalize_type("Array"), "LIST") - 
self.assertEqual(normalize_type("Vector"), "LIST") - - # Set - self.assertEqual(normalize_type("set"), "SET") - - # Map variants - self.assertEqual(normalize_type("dict"), "MAP") - self.assertEqual(normalize_type("Dictionary"), "MAP") - self.assertEqual(normalize_type("Map"), "MAP") - - # Primitives - self.assertEqual(normalize_type("int"), "NUMBER") - self.assertEqual(normalize_type("Integer"), "NUMBER") - self.assertEqual(normalize_type("float"), "NUMBER") - self.assertEqual(normalize_type("str"), "STRING") - self.assertEqual(normalize_type("String"), "STRING") - self.assertEqual(normalize_type("bool"), "BOOLEAN") - - # Fallback - self.assertEqual(normalize_type("CustomType"), "OBJECT") - self.assertEqual(normalize_type("Any"), "OBJECT") - self.assertEqual(normalize_type(""), "OBJECT") - - def test_normalize_type_complex(self): - from google.adk.scope.utils.strings import normalize_type_complex - - # Simple types - self.assertEqual(normalize_type_complex("int"), ["NUMBER"]) - self.assertEqual(normalize_type_complex("str"), ["STRING"]) - self.assertEqual(normalize_type_complex(""), ["OBJECT"]) - - # Generics - List - self.assertEqual(normalize_type_complex("List[int]"), ["LIST"]) - self.assertEqual(normalize_type_complex("List[str]"), ["LIST"]) - self.assertEqual( - normalize_type_complex("list[int]"), ["LIST"] - ) # lowercase - - # Generics - Union - self.assertEqual( - set(normalize_type_complex("Union[int, str]")), {"NUMBER", "STRING"} - ) - - # check explicit behavior for None/none in simple_normalize - # _simple_normalize('none') -> 'null' - self.assertEqual( - set(normalize_type_complex("Union[int, None]")), {"NUMBER", "null"} - ) - - # Generics - Optional - self.assertEqual( - set(normalize_type_complex("Optional[int]")), {"NUMBER", "null"} - ) - self.assertEqual( - set(normalize_type_complex("Optional[List[str]]")), {"LIST", "null"} - ) - - # Generics - AsyncGenerator - self.assertEqual( - set(normalize_type_complex("AsyncGenerator[int, str]")), - 
{"NUMBER", "STRING"}, - ) - - # Generics - Tuple - self.assertEqual( - set(normalize_type_complex("tuple[int, str]")), {"NUMBER", "STRING"} - ) - self.assertEqual( - set(normalize_type_complex("Tuple[int, str]")), {"NUMBER", "STRING"} - ) - - # Nested generics - self.assertEqual( - set(normalize_type_complex("Union[List[int], Optional[str]]")), - {"LIST", "STRING", "null"}, - ) - - def test_split_generics(self): - from google.adk.scope.utils.strings import _split_generics - - self.assertEqual(_split_generics("a, b"), ["a", "b"]) - self.assertEqual(_split_generics("List[a,b], c"), ["List[a,b]", "c"]) - self.assertEqual( - _split_generics("Union[A, B[C, D]], E"), ["Union[A, B[C, D]]", "E"] - ) - - -if __name__ == "__main__": - unittest.main()