diff --git a/README.md b/README.md index b586719..462f927 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Once you have extracted features from two languages (e.g., Python and TypeScript --base output/py.txtpb \ --target output/ts.txtpb \ --output output/ \ - --report-type directional + --report-type md ``` | Argument | Description | @@ -85,40 +85,27 @@ Once you have extracted features from two languages (e.g., Python and TypeScript | `--base ` | **Required.** Path to the "source of truth" feature registry (e.g., Python). | | `--target ` | **Required.** Path to the comparison registry (e.g., TypeScript). | | `--output ` | **Required.** Path for the output directory. The report filename is auto-generated. | -| `--report-type ` | `symmetric` (default) for Jaccard Index, `directional` for F1/Precision/Recall, or `raw` for CSV. | -| `--alpha ` | Similarity threshold (0.0 - 1.0). Default is `0.8`. | +| `--report-type ` | `md` (default) for Markdown Parity Report, or `raw` for CSV. | #### How Matching Works -The matcher uses the **Hungarian Algorithm** to find the optimal assignment between features in the Base and Target registries. -- **Cost Function**: Based on a similarity score derived from: - - Feature Name (normalized) - - Namespace / Module - - Feature Type (Function, Method, Class, etc.) -- **Thresholding**: Pairs with a similarity score below `--alpha` are discarded. +TODO: This needs updating #### Understanding the Reports -`adk-scope` can generate three types of reports to help you understand the feature overlap between two languages. +`adk-scope` generates two types of reports to help you understand the feature overlap between two languages. -##### Symmetric Report (`--report-type symmetric`) +##### Markdown Parity Report (`--report-type md`) -This report is best for measuring the general similarity between two feature sets, where neither is considered the "source of truth". It uses the **Jaccard Index** to calculate a global similarity score. 
+This report type produces a human-readable Markdown file detailing the feature parity between two SDKs. -- **What it measures**: The Jaccard Index measures the similarity between two sets by dividing the size of their intersection by the size of their union. The score ranges from 0% (no similarity) to 100% (identical sets). -- **What it means**: A high Jaccard Index indicates that both languages have a very similar set of features, with few features unique to either one. It penalizes both missing and extra features equally. - -##### Directional Report (`--report-type directional`) - -This report is ideal when you have a "base" or "source of truth" language and you want to measure how well a "target" language conforms to it. It uses **Precision**, **Recall**, and **F1-Score**. - -- **Precision**: Answers the question: *"Of all the features implemented in the target language, how many of them are correct matches to features in the base language?"* A low score indicates the target has many extra features not present in the base. -- **Recall**: Answers the question: *"Of all the features that should be in the target language (i.e., all features in the base), how many were actually found?"* A low score indicates the target is missing many features from the base. -- **F1-Score**: The harmonic mean of Precision and Recall, providing a single score that balances both. A high F1-Score indicates the target is a close match to the base, having most of the required features and not too many extra ones. +- **Gap Analysis List**: A summary table that breaks down features into "Common Shared", "Exclusive to [Base Language]", and "Exclusive to [Target Language]". +- **Jaccard Score**: An overall similarity score computed using the Jaccard Index (Intersection over Union), providing a global metric of feature parity. +- **Module Breakdown**: Per-module score details and status links, highlighting exact matches, potential near-matches, and missing features. 
##### Raw Report (`--report-type raw`) -This report provides a simple CSV output of all features (matched and unmatched) from both the base and target registries. It is useful for programmatic analysis or for importing the data into other tools.$ +This report provides a simple CSV output of all features (matched and unmatched) from both the base and target registries. It is useful for programmatic analysis or for importing the data into other tools. ## Development diff --git a/playground.ipynb b/playground.ipynb index 02fb739..9fe9688 100644 --- a/playground.ipynb +++ b/playground.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, - "id": "8f748152", + "execution_count": 2, + "id": "02bd4661-609e-40c1-95e7-3f4fc946c69b", "metadata": {}, "outputs": [], "source": [ @@ -21,6 +21,8 @@ "from google.adk.scope import features_pb2\n", "from google.adk.scope.utils import similarity\n", "from google.protobuf import text_format\n", + "import pandas as pd\n", + "import numpy as np\n", "import logging\n", "\n", "logging.basicConfig(level=logging.DEBUG)" @@ -28,325 +30,765 @@ }, { "cell_type": "code", - "execution_count": 43, - "id": "bb16a48a-5cda-4ec6-a10e-c8a2f2828700", + "execution_count": 3, + "id": "96c6f1de-6442-4d9b-b21a-06dcacebff69", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((1426, 10), (1426, 10), (1426, 10))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry:\n", - " \"\"\"Reads a FeatureRegistry from a text proto file.\"\"\"\n", - " registry = features_pb2.FeatureRegistry()\n", - " with open(file_path, \"rb\") as f:\n", - " text_format.Parse(f.read(), registry)\n", - " return registry" + "pg = pd.read_csv('./output/py_go.csv').sort_values(by = ['score', 'type'], ascending=False).copy()\n", + "pj = pd.read_csv('./output/py_java.csv').sort_values(by = ['score', 
'type'], ascending=False).copy()\n", + "pt = pd.read_csv('./output/py_ts.csv').sort_values(by = ['score', 'type'], ascending=False).copy()\n", + "\n", + "pg.shape, pj.shape, pt.shape" ] }, { "cell_type": "code", - "execution_count": 44, - "id": "f3361359-a887-4537-b22b-fe8fec24e13a", + "execution_count": 4, + "id": "093a3f6c-131a-42be-aa61-0c1b8209c009", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 1426.000000\n", + "mean 0.409128\n", + "std 0.152272\n", + "min 0.000000\n", + "25% 0.349225\n", + "50% 0.407950\n", + "75% 0.476100\n", + "max 0.992100\n", + "Name: score, dtype: float64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "py_features = read_feature_registry(\"output/py.txtpb\")\n", - "ts_features = read_feature_registry(\"output/ts.txtpb\")" + "pg['score'].describe()" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "2208e1dd-6647-4796-b81c-58ab7406e4c9", + "execution_count": 5, + "id": "b95b8dff-dc50-4505-91a8-562fa6f2d5b9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 1426.000000\n", + "mean 0.528424\n", + "std 0.284260\n", + "min 0.000000\n", + "25% 0.391550\n", + "50% 0.486150\n", + "75% 0.797150\n", + "max 1.000000\n", + "Name: score, dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "py_run_async = py_features.features[0]\n", - "py_run = py_features.features[9]\n", - "ts_run_async = ts_features.features[3]" + "pj['score'].describe()" ] }, { "cell_type": "code", - "execution_count": 46, - "id": "3358d489-92bb-411a-9ec8-0f596be08c9f", + "execution_count": 6, + "id": "9b1f6e3d-3912-4b97-ba50-0ff3894b17c5", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBUG:google.adk.scope.utils.similarity:Initializing SimilarityScorer with alpha=0.8 and weights={'name': 0.3, 'member_of': 
0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n" - ] + "data": { + "text/plain": [ + "count 1426.000000\n", + "mean 0.532307\n", + "std 0.224664\n", + "min 0.170800\n", + "25% 0.373025\n", + "50% 0.452750\n", + "75% 0.589400\n", + "max 1.000000\n", + "Name: score, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "scorer = similarity.SimilarityScorer()" + "pt['score'].describe()" ] }, { "cell_type": "code", - "execution_count": 47, - "id": "a1b5b03b-8a0c-4614-ab4f-89140fa872d0", + "execution_count": 14, + "id": "4a520159-6ef0-4def-b1fb-d96c3412d8ca", "metadata": {}, "outputs": [ { - "name": "stderr", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAJA5JREFUeJzt3Q9wFdX5//En/0gIksRAQ0gJf8QiICAWBCNoEYEIDIowUy0W0aFQFZiRtIgoYAJiaIaf2joBxhbBzoBaHNECERKgSJEgkpZRQFNBES0kVC0ESQn5s7855zs3ckMAE3aTZ5P3a2a9uffu3ex9crj349lzdkMcx3EEAABAkdDG3gEAAICaCCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1AkXH6qqqpJjx45J69atJSQkpLF3BwAA/ADm3LCnT5+WpKQkCQ0NbXoBxYST5OTkxt4NAABQD19++aV06NCh6QUU03MSeIMxMTGubru8vFxyc3NlxIgREhER4eq2Qa0bA22aWjdFtGt/1rqkpMR2MAS+x5tcQAkc1jHhxIuAEh0dbbdLQPEWtW4Y1LnhUGtq3RSVe/C9+EOGZzBIFgAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6oQ39g4AEOn8xEbPyvDpwhGUGIDv0IMCAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAfweUZcuWSZ8+fSQmJsYuKSkp8s4771Q/f/bsWZk2bZq0adNGrrrqKhk/frwUFxcHbePo0aMyevRoiY6OloSEBJk1a5ZUVFS4944AAEDzCigdOnSQxYsXS0FBgezdu1eGDh0qd999txw4cMA+P3PmTFm/fr2sXbtW3n33XTl27JiMGzeu+vWVlZU2nJw7d0527dolr7zyiqxatUrmz5/v/jsDAAC+FV6XlceMGRN0f9GiRbZXZffu3Ta8rFixQtasWWODi7Fy5Urp0aOHff7mm2+W3NxcOXjwoGzZskXatWsnffv2lYULF8rs2bMlPT1dWrRo4e67AwAATT+gnM/0hpiekjNnzthDPaZXpby8
XIYNG1a9Tvfu3aVjx46Sn59vA4q57d27tw0nAampqfLII4/YXpgbb7yx1t9VVlZml4CSkhJ7a36fWdwU2J7b2wW1vpTIMMezJkKbbjjUmlo3ReUufi/WZRt1DigfffSRDSRmvIkZZ7Ju3Trp2bOn7Nu3z/aAxMXFBa1vwkhRUZH92dyeH04Czweeu5jMzEzJyMi44HHTI2PGsnghLy/Pk+2CWtcma4B3LSPQlmnTDYdaU+umKM+F78XS0lLvAsp1111nw8ipU6fkjTfekEmTJtnxJl6aM2eOpKWlBfWgJCcny4gRI+xgXTeZdGf+CMOHD5eIiAhXtw1qfTG90jd71jz++dRQ2nQD4fOj4VBrf9Y6cATEk4BiekmuvfZa+3O/fv3kgw8+kN///vdy77332sGvJ0+eDOpFMbN4EhMT7c/mds+ePUHbC8zyCaxTm8jISLvUZArlVYjwctug1jWVVYZ41iwC7Zg23XCoNbVuiiJc+F6sy+uv+DwoVVVVdnyICSvmF2/durX6ucLCQjut2BwSMsytOUR04sSJ6nVMKjO9IOYwEQAAQJ17UMyhlpEjR9qBr6dPn7YzdrZv3y6bN2+W2NhYmTx5sj0UEx8fb0PHjBkzbCgxA2QNc0jGBJGJEydKVlaWHXcyd+5ce+6U2npIAABA81SngGJ6Ph544AE5fvy4DSTmpG0mnJjjUsbzzz8voaGh9gRtplfFzNBZunRp9evDwsJkw4YNdtaOCS6tWrWyY1gWLFjg/jsDAADNI6CY85xcSlRUlGRnZ9vlYjp16iQ5OTl1+bUAAKCZ4Vo8AABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAnfDG3gEA/tX5iY2ebPfI4tGebBeAf9CDAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAAD8HVAyMzPlpptuktatW0tCQoKMHTtWCgsLg9YZMmSIhISEBC0PP/xw0DpHjx6V0aNHS3R0tN3OrFmzpKKiwp13BAAAfC+8Liu/++67Mm3aNBtSTKB48sknZcSIEXLw4EFp1apV9XpTpkyRBQsWVN83QSSgsrLShpPExETZtWuXHD9+XB544AGJiIiQZ5991q33BQAAmktA2bRpU9D9VatW2R6QgoICue2224ICiQkgtcnNzbWBZsuWLdKuXTvp27evLFy4UGbPni3p6enSokWL+r4XAADQRFzRGJRTp07Z2/j4+KDHV69eLW3btpVevXrJnDlzpLS0tPq5/Px86d27tw0nAampqVJSUiIHDhy4kt0BAADNsQflfFVVVfLYY4/JoEGDbBAJmDBhgnTq1EmSkpLkww8/tD0jZpzKm2++aZ8vKioKCidG4L55rjZlZWV2CTBhxigvL7eLmwLbc3u7oNaXEhnmeNZEvGzTXu23X//98flBrZuichc/Q+qyjRDHcer1CfPII4/IO++8Izt37pQOHTpcdL1t27bJHXfcIYcOHZKuXbvK1KlT5YsvvpDNmzdXr2N6WMwYlpycHBk5cuQF2zCHfjIyMi54fM2aNUHjWwAAgF7m+950ZJgjMDExMe73oEyfPl02bNggO3bsuGQ4MQYOHGhvAwHFjE3Zs2dP0DrFxcX29mLjVsxhorS0tKAelOTkZDtA93JvsD7pLi8vT4YPH24H7sI71Pp7vdK/D+xu++dTQz1r017t9/70VPEj2jS1borKXfxeDBwB+SHqFFBMZ8uMGTNk3bp1sn37dunSpctlX7Nv3z572759e3ubkpIi
ixYtkhMnTtgBtoZ54yZo9OzZs9ZtREZG2qUmUyivQoSX2wa1rqmsMsSzZhFox160aa/22+//9vj8oNZNUYQLnyF1eX2dAoqZYmwOq7z99tv2XCiBMSOxsbHSsmVLOXz4sH1+1KhR0qZNGzsGZebMmXaGT58+fey6ptfDBJGJEydKVlaW3cbcuXPttmsLIQAAoPmp0yyeZcuW2eNG5mRspkcksLz++uv2eTNF2EwfNiGke/fu8pvf/EbGjx8v69evr95GWFiYPTxkbk1vyi9/+Ut7HpTzz5sCAACatzof4rkUMy7EnMztcswsHzMgFgAAoDZciwcAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOqEN/YOAPBWr/TNkjXg/27LKkMoNwBfoAcFAACoQ0ABAADqcIgHqIPOT2ykXgDQAOhBAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA/g4omZmZctNNN0nr1q0lISFBxo4dK4WFhUHrnD17VqZNmyZt2rSRq666SsaPHy/FxcVB6xw9elRGjx4t0dHRdjuzZs2SiooKd94RAABoXgHl3XffteFj9+7dkpeXJ+Xl5TJixAg5c+ZM9TozZ86U9evXy9q1a+36x44dk3HjxlU/X1lZacPJuXPnZNeuXfLKK6/IqlWrZP78+e6+MwAA4FvhdVl506ZNQfdNsDA9IAUFBXLbbbfJqVOnZMWKFbJmzRoZOnSoXWflypXSo0cPG2puvvlmyc3NlYMHD8qWLVukXbt20rdvX1m4cKHMnj1b0tPTpUWLFu6+QwAA0LzGoJhAYsTHx9tbE1RMr8qwYcOq1+nevbt07NhR8vPz7X1z27t3bxtOAlJTU6WkpEQOHDhwJbsDAACaYw/K+aqqquSxxx6TQYMGSa9evexjRUVFtgckLi4uaF0TRsxzgXXODyeB5wPP1aasrMwuASbMGCYMmcVNge25vV00jVpHhjniN5GhTtCtH/ipTfi9TfsVtfZnreuyjXoHFDMWZf/+/bJz5876bqJOg3MzMjIueNwcLjIDbb1gxtigYfip1lkDxLcW9q8Sv8jJyRE/81Ob9jtq7a9al5aWehtQpk+fLhs2bJAdO3ZIhw4dqh9PTEy0g19PnjwZ1ItiZvGY5wLr7NmzJ2h7gVk+gXVqmjNnjqSlpQX1oCQnJ9sBujExMeImk+7MH2H48OESERHh6rbh/1r3St8sfmN6Tkw4mbc3VMqqQsQP9qenih/5sU37FbX2Z60DR0BcDyiO48iMGTNk3bp1sn37dunSpUvQ8/369bM7v3XrVju92DDTkM204pSUFHvf3C5atEhOnDhhB9ga5o2boNGzZ89af29kZKRdajK/y6sPAS+3Df/WuqzSH1/wtTHhxC/775f20BTatN9Ra3/Vui6vD6/rYR0zQ+ftt9+250IJjBmJjY2Vli1b2tvJkyfb3g4zcNaEDhNoTCgxM3gM0+thgsjEiRMlKyvLbmPu3Ll227WFEAAA0PzUKaAsW7bM3g4ZMiTocTOV+MEHH7Q/P//88xIaGmp7UMzAVjNDZ+nSpdXrhoWF2cNDjzzyiA0urVq1kkmTJsmCBQvceUcAAMD36nyI53KioqIkOzvbLhfTqVMn3w+CAwAA3uFaPAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADU
IaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAD/B5QdO3bImDFjJCkpSUJCQuStt94Kev7BBx+0j5+/3HnnnUHrfPvtt3L//fdLTEyMxMXFyeTJk+W777678ncDAACaZ0A5c+aM3HDDDZKdnX3RdUwgOX78ePXy6quvBj1vwsmBAwckLy9PNmzYYEPP1KlT6/cOAABAkxNe1xeMHDnSLpcSGRkpiYmJtT738ccfy6ZNm+SDDz6Q/v3728defPFFGTVqlCxZssT2zAAAgOatzgHlh9i+fbskJCTI1VdfLUOHDpVnnnlG2rRpY5/Lz8+3h3UC4cQYNmyYhIaGyvvvvy/33HPPBdsrKyuzS0BJSYm9LS8vt4ubAttze7toGrWODHPEbyJDnaBbP/BTm/B7m/Yrau3PWtdlG64HFHN4Z9y4cdKlSxc5fPiwPPnkk7bHxQSTsLAwKSoqsuElaCfCwyU+Pt4+V5vMzEzJyMi44PHc3FyJjo4WL5jDT2gYfqp11gDxrYX9q8QvcnJyxM/81Kb9jlr7q9alpaWNF1Duu+++6p979+4tffr0ka5du9pelTvuuKNe25wzZ46kpaUF9aAkJyfLiBEj7EBbN5l0Z/4Iw4cPl4iICFe3Df/Xulf6ZvEb03Niwsm8vaFSVhUifrA/PVX8yI9t2q+otT9rHTgC0miHeM53zTXXSNu2beXQoUM2oJixKSdOnAhap6Kiws7sudi4FTOmxSw1mUJ59SHg5bbh31qXVfrjC742Jpz4Zf/90h6aQpv2O2rtr1rX5fWenwflq6++km+++Ubat29v76ekpMjJkyeloKCgep1t27ZJVVWVDBw40OvdAQAAPlDnHhRzvhLTGxLw+eefy759++wYErOYsSLjx4+3vSFmDMrjjz8u1157raSm/l+XbY8ePew4lSlTpsjy5ctt19H06dPtoSFm8AAAgHr1oOzdu1duvPFGuxhmbIj5ef78+XYQ7Icffih33XWXdOvWzZ6ArV+/fvL3v/896BDN6tWrpXv37vaQj5lePHjwYHnppZf4iwAAgPr1oAwZMkQc5+LTFTdvvvwgQtPTsmbNmrr+agAA0ExwLR4AAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAID/LxYIAF7r/MRGz7Z9ZPFoz7YNwD30oAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAADA/wFlx44dMmbMGElKSpKQkBB56623gp53HEfmz58v7du3l5YtW8qwYcPk008/DVrn22+/lfvvv19iYmIkLi5OJk+eLN99992VvxsAANA8A8qZM2fkhhtukOzs7Fqfz8rKkj/84Q+yfPlyef/996VVq1aSmpoqZ8+erV7HhJMDBw5IXl6ebNiwwYaeqVOnXtk7AQAATUZ4XV8wcuRIu9TG9J688MILMnfuXLn77rvtY3/+85+lXbt2tqflvvvuk48//lg2bdokH3zwgfTv39+u8+KLL8qoUaNkyZIltmcGAAA0b3UOKJfy+eefS1FRkT2sExAbGysDBw6U/Px8G1DMrTmsEwgnhlk/NDTU9rjcc889F2y3rKzMLgElJSX2try83C5uCmzP7e2iadQ6MswR
v4kMdYJumzsv25sf27RfUWt/1rou23A1oJhwYpgek/OZ+4HnzG1CQkLwToSHS3x8fPU6NWVmZkpGRsYFj+fm5kp0dLR4wRx+QsPwU62zBohvLexf1di7oEJOTo7nv8NPbdrvqLW/al1aWto4AcUrc+bMkbS0tKAelOTkZBkxYoQdaOsmk+7MH2H48OESERHh6rbh/1r3St8sfmN6Tkw4mbc3VMqqQqS525+e6tm2/dim/Ypa+7PWgSMgDR5QEhMT7W1xcbGdxRNg7vft27d6nRMnTgS9rqKiws7sCby+psjISLvUZArl1YeAl9uGf2tdVunfL3gTTvy8/25piLbmpzbtd9TaX7Wuy+tdPQ9Kly5dbMjYunVrUFoyY0tSUlLsfXN78uRJKSgoqF5n27ZtUlVVZceqAAAA1LkHxZyv5NChQ0EDY/ft22fHkHTs2FEee+wxeeaZZ+QnP/mJDSzz5s2zM3PGjh1r1+/Ro4fceeedMmXKFDsV2XQdTZ8+3Q6gZQYPAACoV0DZu3ev3H777dX3A2NDJk2aJKtWrZLHH3/cnivFnNfE9JQMHjzYTiuOioqqfs3q1attKLnjjjvs7J3x48fbc6cAAADUK6AMGTLEnu/kYszZZRcsWGCXizG9LWvWrOEvAAAAasW1eAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADq+OJqxo111Vq3L6x2ZPFoV7cHAEBTRQ8KAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAAmn5ASU9Pl5CQkKCle/fu1c+fPXtWpk2bJm3atJGrrrpKxo8fL8XFxW7vBgAA8LFwLzZ6/fXXy5YtW77/JeHf/5qZM2fKxo0bZe3atRIbGyvTp0+XcePGyXvvvefFrgBAkM5PbPSkIkcWj6bSgPaAYgJJYmLiBY+fOnVKVqxYIWvWrJGhQ4fax1auXCk9evSQ3bt3y8033+zF7gAAAJ/xJKB8+umnkpSUJFFRUZKSkiKZmZnSsWNHKSgokPLychk2bFj1uubwj3kuPz//ogGlrKzMLgElJSX21mzLLG4KbC8y1HF1u+dvG8H18FNdIsPcbxdeC7RlL9o0vnf+55Gf2rRfUWt/1rou2whxHMfVT6133nlHvvvuO7nuuuvk+PHjkpGRIf/+979l//79sn79ennooYeCwoYxYMAAuf322+V3v/vdRce1mO3UZHpioqOj3dx9AADgkdLSUpkwYYI9ohITE9OwAaWmkydPSqdOneS5556Tli1b1iug1NaDkpycLF9//fVl32B90l1eXp7M2xsqZVUhrm57f3qqq9vzu0Cthw8fLhEREeIHvdI3i9+YnpOF/as8adMI/vftxzbtV9Tan7U2399t27b9QQHFk0M854uLi5Nu3brJoUOH7Js7d+6cDS3m8QAzi6e2MSsBkZGRdqnJFMqrDwHzQV5W6e6HOR9YF6+LX2rjdptoSF60aXzv/Dbspzbtd9TaX7Wuy+s9Pw+KOdxz+PBhad++vfTr18/u3NatW6ufLywslKNHj9qxKgAAAJ70oPz2t7+VMWPG2MM6x44dk6efflrCwsLkF7/4hZ1WPHnyZElLS5P4+HjbvTNjxgwbTpjBAwAAPAsoX331lQ0j33zzjfzoRz+SwYMH2ynE5mfj+eefl9DQUHuCNjOuJDU1VZYuXer2bgAAAB9zPaC89tprl3zeTD3Ozs62CwAAQG24Fg8AAFCHgAIAANQhoAAAAHUIKAAAQB0CCgAAUIeAAgAA
1CGgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCHgAIAANQJb+wdANzW+YmNFBUAfI4eFAAAoA4BBQAAqENAAQAA6hBQAACAOgQUAACgDgEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDpcLBAAAA8vNHpk8WjqWw/0oAAAAHUIKAAAQB0CCgAAUIcxKAAA340TiQxzJGuASK/0zVJWGdLYuwUP0IMCAADUIaAAAAB1OMQDAPDNlF00H/SgAAAAdQgoAABAHQIKAABQh4ACAADUIaAAAAB1CCgAAEAdAgoAAFCH86AAgEvn/fDq9OtHFo8WL3Cukobh1zof8ajd+aIHJTs7Wzp37ixRUVEycOBA2bNnT2PuDgAAaO49KK+//rqkpaXJ8uXLbTh54YUXJDU1VQoLCyUhIaGxdgsA1PHr/4EDvuxBee6552TKlCny0EMPSc+ePW1QiY6OlpdffrmxdgkAADTnHpRz585JQUGBzJkzp/qx0NBQGTZsmOTn51+wfllZmV0CTp06ZW+//fZbKS8vd3XfzPZKS0slvDxUKqvcvYT3N9984+r2/C5Qa1OXiIgI17YbXnHGtW01BeFVjpSWVnnSpkGtGwvt2nuB7yw3P6tPnz5tbx3H0RlQvv76a6msrJR27doFPW7uf/LJJxesn5mZKRkZGRc83qVLF/GTtv+vsfcAzdWExt6BZoRaU+umoq2H31kmqMTGxvp/Fo/paTHjVQKqqqps70mbNm0kJMTd/yMsKSmR5ORk+fLLLyUmJsbVbYNaNwbaNLVuimjX/qy16Tkx4SQpKemy6zZKQGnbtq2EhYVJcXFx0OPmfmJi4gXrR0ZG2uV8cXFxnu6j+SMQUBoGtabOTQ1tmlo3RTEufS9eruekUQfJtmjRQvr16ydbt24N6hUx91NSUhpjlwAAgCKNdojHHLKZNGmS9O/fXwYMGGCnGZ85c8bO6gEAAM1bowWUe++9V/7zn//I/PnzpaioSPr27SubNm26YOBsQzOHkp5++ukLDimBWvsVbZpaN0W066Zf6xDnh8z1AQAAaEBcLBAAAKhDQAEAAOoQUAAAgDoEFAAAoE6zDCjZ2dnSuXNniYqKsldS3rNnzyXXX7t2rXTv3t2u37t3b8nJyWmwfW0udf7jH/8ot956q1x99dV2MddlutzfBfWr9flee+01ezbmsWPHUk6Pan3y5EmZNm2atG/f3s6C6NatG58hHtXanK7iuuuuk5YtW9ozn86cOVPOnj1L276MHTt2yJgxY+zZXc3nwVtvvXW5l8j27dvlpz/9qW3T1157raxatUpc5zQzr732mtOiRQvn5Zdfdg4cOOBMmTLFiYuLc4qLi2td/7333nPCwsKcrKws5+DBg87cuXOdiIgI56OPPmrwfW/KdZ4wYYKTnZ3t/POf/3Q+/vhj58EHH3RiY2Odr776qsH3vanXOuDzzz93fvzjHzu33nqrc/fddzfY/janWpeVlTn9+/d3Ro0a5ezcudPWfPv27c6+ffsafN+beq1Xr17tREZG2ltT582bNzvt27d3Zs6c2eD77jc5OTnOU0895bz55ptmVq+zbt26S67/2WefOdHR0U5aWpr9XnzxxRft9+SmTZtc3a9mF1AGDBjgTJs2rfp+ZWWlk5SU5GRmZta6/s9//nNn9OjRQY8NHDjQ+fWvf+35vjanOtdUUVHhtG7d2nnllVc83MvmW2tT31tuucX505/+5EyaNImA4lGtly1b5lxzzTXOuXPnfvgfFPWqtVl36NChQY+ZL9BBgwZR0Tr4IQHl8ccfd66//vqgx+69914nNTXVcVOzOsRz7tw5KSgosIcPAkJDQ+39/Pz8Wl9jHj9/fSM1NfWi66N+da7JXNrbXOI7Pj6ekrrcpo0FCxZIQkKCTJ48mfp6WOu//vWv9vId5hCPOQllr1695Nlnn7VXc4e7tb7lllvsawKHgT777DN7
KG3UqFGU2mUN9b3oi6sZu+Xrr7+2Hww1z1Zr7n/yySe1vsac5ba29c3jcK/ONc2ePdseD635jwBXXuudO3fKihUrZN++fZTT41qbL8lt27bJ/fffb78sDx06JI8++qgN3+bMnHCv1hMmTLCvGzx4sL1ibkVFhTz88MPy5JNPUmaXXex70Vz1+H//+58dA+SGZtWDAn9YvHixHby5bt06OzgO7jGXOZ84caIdlGyuKg5vmYugmp6ql156yV4g1Vzi46mnnpLly5dTepeZQZumd2rp0qXyj3/8Q958803ZuHGjLFy4kFr7VLPqQTEfyGFhYVJcXBz0uLmfmJhY62vM43VZH/Wrc8CSJUtsQNmyZYv06dOHcrrcpg8fPixHjhyxI/bP/xI1wsPDpbCwULp27UrdXai1YWbuRERE2NcF9OjRw/4fqDmMYa7sDndqPW/ePBu+f/WrX9n7ZsaluQDt1KlTbSg0h4jgjot9L8bExLjWe2I0q7+Y+TAw/xezdevWoA9nc98cJ66Nefz89Y28vLyLro/61dnIysqy/7djLhpprnIN99u0mS7/0Ucf2cM7geWuu+6S22+/3f5spmbCnVobgwYNsod1AiHQ+Ne//mWDC+HEvXYdGLdWM4QEgiGXnHNXg30vOs1w6pqZirZq1So7PWrq1Kl26lpRUZF9fuLEic4TTzwRNM04PDzcWbJkiZ3++vTTTzPN2IM6L1682E4pfOONN5zjx49XL6dPn3a/ETTzWtfELB7van306FE7G2369OlOYWGhs2HDBichIcF55plnruAv3jzUtdbms9nU+tVXX7XTYHNzc52uXbvamZi4NPM5a07xYBYTC5577jn78xdffGGfN3U29a45zXjWrFn2e9GcIoJpxi4xc7Y7duxovxDNVLbdu3dXP/ezn/3MfmCf7y9/+YvTrVs3u76ZWrVx40a3dqVJq0udO3XqZP9h1FzMhw7crXVNBBTv2rWxa9cue2oC82VrphwvWrTITvOGu7UuLy930tPTbSiJiopykpOTnUcffdT573//S6kv429/+1utn7+B+ppbU++ar+nbt6/925h2vXLlSsdtIeY/7vbJAAAAXJlmNQYFAAD4AwEFAACoQ0ABAADqEFAAAIA6BBQAAKAOAQUAAKhDQAEAAOoQUAAAgDoEFAAAoA4BBQAAqENAAQAA6hBQAACAaPP/AaCPs5RuguPIAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pg['score'].hist(bins=20);" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cdce096a-6081-48d6-861a-a0b39cda5544", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAJ4FJREFUeJzt3Qt0FOX5x/Fnc2EhSoCAMaQGAlguyq2CRIQqtxCBA6JpvUQpUApegHNMThVRwAS0UEqRUxrl2HKxRyKWHkAFBAIISAko2BwFNTUIIuVWtBBJyhKS+Z/37X8jmwuQMJO8k/1+zhnXnZm8zD55d/aXmXdmPZZlWQIAAGCQkLreAAAAgPIIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA44SJC5WWlsqxY8ekcePG4vF46npzAADAVVD3hv3+++8lNjZWQkJC6l9AUeEkLi6urjcDAADUwDfffCM33XRT/Qso6siJ/wVGRkba2nZxcbFs2rRJBg8eLOHh4ba2Depc2+jP1Lk+oT+7v9YFBQX6AIP/c7zeBRT/aR0VTpwIKBEREbpdAopzqHPtoM7UuT6hP9efWl/N8AwGyQIAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAA7g4os2fPlttvv13fAS46OlpGjhwpeXl5AeucP39eJk6cKM2bN5frr79ekpOT5eTJkwHrHDlyRIYNG6ZvAqPaefrpp+XixYv2vCIAABBcAWX79u06fOzevVuys7P1nebUbXALCwvL1klNTZV3331XVq5cqddX35tz//33ly0vKSnR4eTChQuya9cuef3112XZsmUyY8YMe18ZAABwrWrd6n7Dhg0Bz1WwUEdA9u3bJ3fddZecPXtWFi9eLFlZWTJgwAC9ztKlS6VTp0461Nxxxx363v6fffaZbN68WW688Ubp3r27zJo1S6ZMmSLp6enSoEEDe18hAAAIrjEoKpAoUVFR+lEFFXVUZdCgQWXrdOzYUVq1aiU5OTn6uXrs0qWLDid+SUlJ+guEDhw4cC2bAwAA6okaf1lgaWmpPPXUU9KnTx/p3LmznnfixAl9BKRp06YB66owopb517k0nPiX+5dVxufz6clPhRlFhSE12cnfnt3tgjrXBfozda5P6M/ur3V12qtxQFFjUfbv3y87d+4Up6nBuRkZGRXmq9NFaqCtE9QYGziPOtcO6kyd6xP6s3trXVRU5GxAmTRpkqxdu1Z27NghN910U9n8mJgYPfj1zJkzAUdR1FU8apl/nQ8//DCgPf9VPv51yps6daqkpaUFHEGJi4vTA3TVV0Hbne7ULyQxMdGRr5gGda5M5/SNjnQNb4gls3qW0p8dxn6jdlBn99fafwbE9oBiWZZMnjxZVq9eLdu2bZM2bdoELO/Ro4d+IVu2bNGXFyvqMmR1WXHv3r31c/X40ksvyalTp/QAW0UVQQWNW265pdJ/1+v16qk89W85FSKcbBvUuTxficfRbkF/rh3UmTrXN+E2fxZWp62w6p7WUVfovP322/peKP4xI02aNJFGjRrpx3HjxumjHWrgrAodKtCoUKKu4FHUUQ8VREaNGiVz587VbUybNk23XVkIAQAAwadaAeXVV1/Vj/369QuYry4lHjNmjP7/l19+WUJCQvQRFDWwVV2h88orr5StGxoaqk8P
PfHEEzq4XHfddTJ69GiZOXOmPa8IAAC4XrVP8VxJw4YNJTMzU09Vad26taxfv746/zQAAAgifBcPAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAHd/mzEAd+qcvlF8JR7b2z08Z5jtbQKAwhEUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAA3B9QduzYIcOHD5fY2FjxeDyyZs2agOVqXmXT7373u7J14uPjKyyfM2eOPa8IAAAEX0ApLCyUbt26SWZmZqXLjx8/HjAtWbJEB5Dk5OSA9WbOnBmw3uTJk2v+KgAAQL0SVt0fGDJkiJ6qEhMTE/D87bfflv79+0vbtm0D5jdu3LjCugAAADUKKNVx8uRJWbdunbz++usVlqlTOrNmzZJWrVpJSkqKpKamSlhY5Zvj8/n05FdQUKAfi4uL9WQnf3t2twvqfDneUMuRLuINsQIe7cb7JLAO1MNZ1Nn9ta5Oex7Lsmq851KnblavXi0jR46sdPncuXN1EDl27Jg0bNiwbP78+fPltttuk6ioKNm1a5dMnTpVxo4dq+dXJj09XTIyMirMz8rKkoiIiJpuPgAAqEVFRUX6oMTZs2clMjKy7gJKx44dJTExURYuXHjZdtQ4lccee0zOnTsnXq/3qo6gxMXFyenTp6/4AmuS7rKzs/V2h4eH29o2qHNVOqdvdKR7qCMns3qWyvS9IeIr9dje/v70JNvbdCP2G9S5vil26LNQfX63aNHiqgKKY6d4PvjgA8nLy5O33nrriusmJCTIxYsX5fDhw9KhQ4cKy1VoqSy4qKI5FSKcbBvUuTxficfZ9ks9jvwbvEcq1oOaOI86u7fW1WnLsYCyePFi6dGjh77i50pyc3MlJCREoqOjxaS/aO3eoR+eM8zW9gAAqK+qHVDUaZj8/Pyy54cOHdIBQ40nUQNe/YdwVq5cKb///e8r/HxOTo7s2bNHX9mjruRRz9UA2UcffVSaNWt2ra8HAAAEY0DZu3evDhd+aWlp+nH06NGybNky/f8rVqwQNbTl4YcfrvDz6lSNWq4GvqpxJW3atNEBxd8OAABAtQNKv379dPi4nAkTJuipMurqnd27d1N5AABQJb6LBwAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHMdudQ+g/ot/dp0j7fK1EAA4ggIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAANwfUHbs2CHDhw+X2NhY8Xg8smbNmoDlY8aM0fMvne65556Adb777jt55JFHJDIyUpo2bSrjxo2Tc+fOXfurAQAAwRlQCgsLpVu3bpKZmVnlOiqQHD9+vGx68803A5arcHLgwAHJzs6WtWvX6tAzYcKEmr0CAABQ74RV9weGDBmip8vxer0SExNT6bLPP/9cNmzYIB999JH07NlTz1u4cKEMHTpU5s2bp4/MAACA4FbtgHI1tm3bJtHR0dKsWTMZMGCAvPjii9K8eXO9LCcnR5/W8YcTZdCgQRISEiJ79uyR++67r0J7Pp9PT34FBQX6sbi4WE928rfnDbFsbffStvFDLajJ/3hDLWfa/f9+7ER/dpLb+gX9mTrXN8UO7aOr057tAUWd3rn//vulTZs2cvDgQXnuuef0ERcVTEJDQ+XEiRM6vARsRFiYREVF
6WWVmT17tmRkZFSYv2nTJomIiBAnzOpZanub69evt71Nt1On+SAyt5ezVXCiPzvJre8V+jN1rm+ybd5HFxUV1V1Aeeihh8r+v0uXLtK1a1dp166dPqoycODAGrU5depUSUtLCziCEhcXJ4MHD9YDbe1Od+oXMn1viPhKPba2vT89ydb23Mxf58TERAkPD5dg1zl9oyPtqiMnKpw40Z+d5Lb3Cv2ZOtc3xQ7to/1nQOrsFM+l2rZtKy1atJD8/HwdUNTYlFOnTgWsc/HiRX1lT1XjVtSYFjWVp4rm1Ieb2pn7SuzdofNBXHlNqIvY3tdqoz87ya19gv5MneubcJv30dVpy/GAcvToUfn222+lZcuW+nnv3r3lzJkzsm/fPunRo4eet3XrViktLZWEhASnNweAC8Q/u86xtg/PGeZY2wDsU+2Aou5Xoo6G+B06dEhyc3P1GBI1qbEiycnJ+miIGoPyzDPPyM033yxJSf87ZNupUyc9TmX8+PGyaNEifRhp0qRJ+tQQV/AAAIAa3Qdl79698pOf/ERPihobov5/xowZehDsJ598IiNGjJD27dvrG7CpoyQffPBBwCma5cuXS8eOHfUpH3V5cd++feW1117jNwIAAGp2BKVfv35iWVVfsrhx45UH+6kjLVlZWdX9pwEAQJDgu3gAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOOE1fUGAACAmol/dp0jpfOGWjK3l7jrCMqOHTtk+PDhEhsbKx6PR9asWVO2rLi4WKZMmSJdunSR6667Tq/zi1/8Qo4dOxbQRnx8vP7ZS6c5c+bY84oAAIDrVTugFBYWSrdu3SQzM7PCsqKiIvn4449l+vTp+nHVqlWSl5cnI0aMqLDuzJkz5fjx42XT5MmTa/4qAABAcJ/iGTJkiJ4q06RJE8nOzg6Y98c//lF69eolR44ckVatWpXNb9y4scTExNRkmwEAQD3n+BiUs2fP6lM4TZs2DZivTunMmjVLh5aUlBRJTU2VsLDKN8fn8+nJr6CgoOyUkprs5G/PG2LZ2u6lbeOHWlCTH873OsHfj53oz27lRJ+jP9cO6lz7+w6nPmOvhseyrBq/OhU8Vq9eLSNHjqx0+fnz56VPnz7SsWNHWb58edn8+fPny2233SZRUVGya9cumTp1qowdO1bPr0x6erpkZGRUmJ+VlSURERE13XwAAFCL1FAQdVBCHbyIjIysm4CiUlJycrIcPXpUtm3bdtkNWbJkiTz22GNy7tw58Xq9V3UEJS4uTk6fPn3FF1hdarvVaarpe0PEV+qxte396Um2tudm/jonJiZKeHi4BLvO6Rsd+ytoVs9SR/qzWznxPqQ/1w7qXPv7Drv30erzu0WLFlcVUMKc6kQPPPCAfP3117J169YrbkRCQoJcvHhRDh8+LB06dKiwXIWWyoKLKppTH25qZ+4rsXeHzgdx5TWhLmJ7X6uN/uxWTvY3+nPtoM4/cPp9bXetq9NWmFPh5Msvv5T3339fmjdvfsWfyc3NlZCQEImOjrZ7cwAAgAtVO6Co0zD5+fllzw8dOqQDhhpP0rJlS/nZz36mLzFeu3atlJSUyIkTJ/R6anmDBg0kJydH9uzZI/3799dX8qjnaoDso48+Ks2aNbP31QEAgOAIKHv37tXhwi8tLU0/jh49Wg9mfeedd/Tz7t27B/ycOprSr18/fapmxYoVel01rqRNmzY6oPjbAQAAqHZAUSHjcuNqrzTmVl29s3v3bioPAACqxJcFAgAA4xBQAACAcQgoAADAOAQUAABg
HAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOGF1vQEAUJvin11ne5veUEvm9rK9WSCocQQFAAAYh4ACAACMQ0ABAADGIaAAAADjMEgWAACXDcwOBtU+grJjxw4ZPny4xMbGisfjkTVr1gQstyxLZsyYIS1btpRGjRrJoEGD5MsvvwxY57vvvpNHHnlEIiMjpWnTpjJu3Dg5d+7ctb8aAAAQnAGlsLBQunXrJpmZmZUunzt3rvzhD3+QRYsWyZ49e+S6666TpKQkOX/+fNk6KpwcOHBAsrOzZe3atTr0TJgw4dpeCQAACN5TPEOGDNFTZdTRkwULFsi0adPk3nvv1fP+8pe/yI033qiPtDz00EPy+eefy4YNG+Sjjz6Snj176nUWLlwoQ4cOlXnz5ukjMwAAILjZOgbl0KFDcuLECX1ax69JkyaSkJAgOTk5OqCoR3Vaxx9OFLV+SEiIPuJy3333VWjX5/Ppya+goEA/FhcX68lO/va8IZat7V7aNn6oBTX54UZfTvD3Yyf6MyrWmf7sLLfuN5x6f7uxT1enPVsDigonijpicin13L9MPUZHRwduRFiYREVFla1T3uzZsyUjI6PC/E2bNklERIQ4YVbPUtvbXL9+ve1tup06zQdx/C6kTvRnVER/rh1uq7Ob7zKcbXOti4qK6tdVPFOnTpW0tLSAIyhxcXEyePBgPdDW7nSnfiHT94aIr9Rja9v705Nsbc/N/HVOTEyU8PBwCXad0zc69leQCidO9GdUrDP92Vlu3W849f52Y5/2nwGp9YASExOjH0+ePKmv4vFTz7t37162zqlTpwJ+7uLFi/rKHv/Pl+f1evVUniqaU51U7cx9Jfbu0N30hqotTv4O3cTuvlYb/RkV0Z9rh9vq7Ob3XrjNta5OW7beqK1NmzY6ZGzZsiUgLamxJb1799bP1eOZM2dk3759Zets3bpVSktL9VgVAACAah9BUfcryc/PDxgYm5ubq8eQtGrVSp566il58cUX5cc//rEOLNOnT9dX5owcOVKv36lTJ7nnnntk/Pjx+lJkdchu0qRJegAtV/AAAIAaBZS9e/dK//79y577x4aMHj1ali1bJs8884y+V4q6r4k6UtK3b199WXHDhg3Lfmb58uU6lAwcOFBfvZOcnKzvnQIAAFCjgNKvXz99v5OqqLvLzpw5U09VUUdbsrKy+A0AAIBK8WWBAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAKD+B5T4+HjxeDwVpokTJ+rl/fr1q7Ds8ccft3szAACAi4XZ3eBHH30kJSUlZc/3798viYmJ8vOf/7xs3vjx42XmzJllzyMiIuzeDAAA4GK2B5Qbbrgh4PmcOXOkXbt2cvfddwcEkpiYGLv/aQAAUE84OgblwoUL8sYbb8gvf/lLfSrHb/ny5dKiRQvp3LmzTJ06VYqKipzcDAAAEOxHUC61Zs0aOXPmjIwZM6ZsXkpKirRu3VpiY2Plk08+kSlTpkheXp6sWrWqynZ8Pp+e/AoKCvRjcXGxnuzkb88bYtna7qVt44daUJP/8YZazrT7//3Yif6MinWmPzvLrfsNp97fbuzT1WnPY1mWY5VLSkqSBg0ayLvvvlvlOlu3bpWBAwdKfn6+PhVU
mfT0dMnIyKgwPysri/ErAAC4hDpjog5UnD17ViIjI+smoHz99dfStm1bfWTk3nvvrXK9wsJCuf7662XDhg060FztEZS4uDg5ffr0FV9gTdJddna2TN8bIr7SH05L2WF/euWvLxj566wGUIeHh0uw65y+0bG/gmb1LHWkP6NinenP7t5vOPU+dCOvQ31afX6rIR5XE1AcO8WzdOlSiY6OlmHDhl12vdzcXP3YsmXLKtfxer16Kk8VzakPN7Uz95XYu0Png7jymlAXsb2v1UZ/RkX0Z3fXmfeI87WuTluOBJTS0lIdUEaPHi1hYT/8EwcPHtSnZYYOHSrNmzfXY1BSU1Plrrvukq5duzqxKQAAwIUcCSibN2+WI0eO6Kt3LqXGo6hlCxYs0Kd21Gma5ORkmTZtmhObAQAAXMqRgDJ48GCpbGiLCiTbt2934p8EAAD1CN/FAwAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAAD1P6Ckp6eLx+MJmDp27Fi2/Pz58zJx4kRp3ry5XH/99ZKcnCwnT560ezMAAICLOXIE5dZbb5Xjx4+XTTt37ixblpqaKu+++66sXLlStm/fLseOHZP777/fic0AAAAuFeZIo2FhEhMTU2H+2bNnZfHixZKVlSUDBgzQ85YuXSqdOnWS3bt3yx133OHE5gAAAJdxJKB8+eWXEhsbKw0bNpTevXvL7NmzpVWrVrJv3z4pLi6WQYMGla2rTv+oZTk5OVUGFJ/Ppye/goIC/ajaUpOd/O15Qyxb2720bfxQC2ryP95Qy5l2/78fO9GfUbHO9Gd37zeceh+6kdehPl2d9jyWZdn6G3nvvffk3Llz0qFDB316JyMjQ/71r3/J/v379amdsWPHBoQNpVevXtK/f3/57W9/W+W4FtVOeepITEREhJ2bDwAAHFJUVCQpKSn6jEpkZGTtBpTyzpw5I61bt5b58+dLo0aNahRQKjuCEhcXJ6dPn77iC6xJusvOzpbpe0PEV+qxte396Um2tudm/jonJiZKeHi4BLvO6Rsd+ytoVs9SR/ozKtaZ/uzu/YZT70M38jrUp9Xnd4sWLa4qoDhyiudSTZs2lfbt20t+fr5+oRcuXNChRc33U1fxVDZmxc/r9eqpPFU0pz7c1M7cV2LvDp0P4sprQl3E9r5WG/0ZFdGf3V1n3iPO17o6bTkeUNTpnoMHD8qoUaOkR48eeuO2bNmiLy9W8vLy5MiRI3qsCgAAV3OkgzBR/9keUH7961/L8OHD9WkddQnxCy+8IKGhofLwww9LkyZNZNy4cZKWliZRUVH68M7kyZN1OOEKHgAA4FhAOXr0qA4j3377rdxwww3St29ffQmx+n/l5ZdflpCQEH0ERY0rSUpKkldeecXuzQAAAC5me0BZsWLFZZerS48zMzP1BAAAUBm+iwcAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAMA4BBQAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOGF1vQEAgPon/tl1trfpDbVkbi/bm4WhOIICAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIxDQAEAAMYhoAAAAOMQUAAAgHEIKAAAwDh8mzFQx9/QCgCoiCMoAADAOAQUAABgHAIKAAAwDgEFAADU/4Aye/Zsuf3226Vx48YSHR0tI0eOlLy8vIB1+vXrJx6PJ2B6/PHH7d4UAADgUrYH
lO3bt8vEiRNl9+7dkp2dLcXFxTJ48GApLCwMWG/8+PFy/Pjxsmnu3Ll2bwoAAHAp2y8z3rBhQ8DzZcuW6SMp+/btk7vuuqtsfkREhMTExNj9zwMAgHrA8fugnD17Vj9GRUUFzF++fLm88cYbOqQMHz5cpk+frkNLZXw+n578CgoK9KM6OqMmO/nb84ZYtrZ7adv4oRZuq4k31P5+4SR/P3aiP6Nind3Wn932XqE/u79PV6c9j2VZju25SktLZcSIEXLmzBnZuXNn2fzXXntNWrduLbGxsfLJJ5/IlClTpFevXrJq1apK20lPT5eMjIwK87OysqoMNQAAwCxFRUWSkpKiD15ERkbWXUB54okn5L333tPh5Kabbqpyva1bt8rAgQMlPz9f2rVrd1VHUOLi4uT06dNXfIE1SXdq7Mz0vSHiK/XY2vb+9CRb23Mzf50TExMlPDxc3KJz+kZx219Bs3qWOtKfUbHObuvPbnuv0J/d36fV53eLFi2uKqA4dopn0qRJsnbtWtmxY8dlw4mSkJCgH6sKKF6vV0/lqaI5tTNQO3Nfib07dHZcldfETXWxu0/UFif6M9zfn53kZH+jP7u3T1enLdsDijogM3nyZFm9erVs27ZN2rRpc8Wfyc3N1Y8tW7a0e3MAAIAL2R5Q1CXGamzI22+/re+FcuLECT2/SZMm0qhRIzl48KBePnToUGnevLkeg5Kamqqv8OnatavdmwMAAFzI9oDy6quvlt2M7VJLly6VMWPGSIMGDWTz5s2yYMECfW8UNZYkOTlZpk2bZvemAAAAl3LkFM/lqECibuYGAABQFb6LBwAAGIeAAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYx7FvMwYAmC3+2XV1vQlAlTiCAgAAjENAAQAAxiGgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAMA4BBQAAGAcAgoAADAOAQUAABiHgAIAAIwTVtcbAAC4vPhn11EiBB2OoAAAAOMQUAAAgHEIKAAAwDgEFAAAYBwCCgAAME6dBpTMzEyJj4+Xhg0bSkJCgnz44Yd1uTkAACDYA8pbb70laWlp8sILL8jHH38s3bp1k6SkJDl16lRdbRIAAAj2gDJ//nwZP368jB07Vm655RZZtGiRREREyJIlS+pqkwAAQDDfqO3ChQuyb98+mTp1atm8kJAQGTRokOTk5FRY3+fz6cnv7Nmz+vG7776T4uJiW7dNtVdUVCRhxSFSUuqxte1vv/3W1vbczF9nVZPw8HBxi7CLheImYaWWFBWVOtKfUbHO3Z9fJT4H6swdNQPrTH+uvVrbvY/+/vvv9aNlWVfeBqkDp0+flpKSErnxxhsD5qvnX3zxRYX1Z8+eLRkZGRXmt2nTRtykxe/regsQjFLqegOCBHWmzvVNioNtq6DSpEkT9wdzdaRFjVfxKy0t1UdPmjdvLh6PvX+tFBQUSFxcnHzzzTcSGRlpa9ugzrWN/kyd6xP6s/trrY6cqHASGxt7xXXrJKC0aNFCQkND5eTJkwHz1fOYmJgK63u9Xj1dqmnTpo5uo/qFEFCcR51rB3WmzvUJ/dndtb7SkZM6HSTboEED6dGjh2zZsiXgqIh63rt377rYJAAAYJA6O8WjTtmMHj1aevbsKb169ZIFCxZIYWGhvqoHAAAEtzoLKA8++KD8+9//lhkzZsiJEyeke/fusmHDhgoDZ2ubOpWk7s1S/pQSqLMb0Z+pc31Cfw6uWnusq7nWBwAAoBbxXTwAAMA4BBQAAGAcAgoAADAOAQUAABgnKANKZmamxMfHS8OGDSUhIUE+/PDDy66/cuVK6dixo16/S5cusn79+lrb1mCp85/+9Cf56U9/Ks2aNdOT+l6mK/1eUP06X2rFihX6TswjR46klDb3Z+XMmTMyceJEadmypb4Son379uw7
HKizukVFhw4dpFGjRvrOp6mpqXL+/Hn69GXs2LFDhg8fru/mqvYBa9askSvZtm2b3Hbbbbov33zzzbJs2TJxnBVkVqxYYTVo0MBasmSJdeDAAWv8+PFW06ZNrZMnT1a6/t///ncrNDTUmjt3rvXZZ59Z06ZNs8LDw61PP/201re9Ptc5JSXFyszMtP7xj39Yn3/+uTVmzBirSZMm1tGjR2t92+tznf0OHTpk/ehHP7J++tOfWvfee2+tbW+w1Nnn81k9e/a0hg4dau3cuVPXe9u2bVZubm6tb3t9rvPy5cstr9erH1WNN27caLVs2dJKTU2t9W13k/Xr11vPP/+8tWrVKnUVr7V69erLrv/VV19ZERERVlpamv4cXLhwof5c3LBhg6PbGXQBpVevXtbEiRPLnpeUlFixsbHW7NmzK13/gQcesIYNGxYwLyEhwXrssccc39ZgqnN5Fy9etBo3bmy9/vrrDm5lcNZZ1fbOO++0/vznP1ujR48moDhQ51dffdVq27atdeHCher9QoNcdeus1h0wYEDAPPUh2qdPH8e3tb6QqwgozzzzjHXrrbcGzHvwwQetpKQkR7ctqE7xXLhwQfbt26dPH/iFhITo5zk5OZX+jJp/6fpKUlJSleujZnUur6ioSIqLiyUqKoqS2tiflZkzZ0p0dLSMGzeO2jpU53feeUd/bYc6xaNuPtm5c2f5zW9+o7/FHfbV+c4779Q/4z8N9NVXX+nTaEOHDqXMNqqrz0FXfJuxXU6fPq13EOXvVquef/HFF5X+jLrLbWXrq/mwr87lTZkyRZ8fLf+mwLXVeefOnbJ48WLJzc2llA7WWX1Qbt26VR555BH9gZmfny9PPvmkDt3q7pywp84pKSn65/r27au/JffixYvy+OOPy3PPPUeJbVTV56D6xuP//ve/evyPE4LqCArcYc6cOXoA5+rVq/VAOdhDfcX5qFGj9IBk9Y3icI768lN1lOq1117TX4yqvtrj+eefl0WLFlF2G6mBm+rI1CuvvCIff/yxrFq1StatWyezZs2izvVAUB1BUTvl0NBQOXnyZMB89TwmJqbSn1Hzq7M+alZnv3nz5umAsnnzZunatSvltLE/Hzx4UA4fPqxH71/6QaqEhYVJXl6etGvXjppfY50VdeVOeHi4/jm/Tp066b9E1akM9Y3uuPY6T58+XYfuX/3qV/q5uspSfenshAkTdCBUp4hw7ar6HIyMjHTs6IkSVL89tVNQf81s2bIlYAetnqvzxZVR8y9dX8nOzq5yfdSszsrcuXP1Xz7qSyPVt1zD3v6sLpX/9NNP9ekd/zRixAjp37+//n91iSauvc5Knz599GkdfwBU/vnPf+rgQjixpz/7x6qVDyH+UMjXzNmnzj4HrSC8jE1dlrZs2TJ9udSECRP0ZWwnTpzQy0eNGmU9++yzAZcZh4WFWfPmzdOXv77wwgtcZuxAnefMmaMvL/zb3/5mHT9+vGz6/vvv7e8EQVzn8riKx5k6HzlyRF+FNmnSJCsvL89au3atFR0dbb344ovX+Buv36pbZ7U/VnV+88039aWwmzZtstq1a6evvkTV1H5V3dJBTSoGzJ8/X///119/rZerGqtal7/M+Omnn9afg+qWEFxm7BB1DXerVq30B6K6rG337t1ly+6++269077UX//6V6t9+/Z6fXWp1bp165zatKCtc+vWrfUbpfykdkCwr87lEVCc6c/Krl279C0J1AeuuuT4pZde0pd4w746FxcXW+np6TqUNGzY0IqLi7OefPJJ6z//+Q9lvoz333+/0v2tv7bqUdW6/M90795d/15Uf166dKnlNI/6j7PHaAAAAKonqMagAAAAdyCgAAAA4xBQAACAcQgoAADAOAQUAABgHAIKAAAwDgEFAAAYh4ACAACMQ0ABAADGIaAAAADjEFAAAIBxCCgAAEBM83/rYGLijZs3IwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pj['score'].hist(bins=20);" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c101439b-f429-4529-9831-d182922d95d6", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAHSFJREFUeJzt3QuMVOX5P/B3d8FFVECwCFQU1KooKhYKovirIrKKwRtJtRqrhkrqLRFS7zdQK5QYNTWosVVpEy+NjdoWKIJYtVS8YU2912vVyqVqEZSyLOz8c86/u3GRyy7uMM+wn09yHGbmzNl353HmfPc973tORaFQKCQAgEAqS90AAIB1CSgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCE0y6Vofr6+vTxxx+nHXbYIVVUVJS6OQBAM2Tnhl2xYkXq1atXqqys3PoCShZOevfuXepmAACb4cMPP0y77LLL1hdQsp6Thl+wU6dOKZK6uro0Z86cNHLkyNS+fftSN4dNUK/yo2blR83KT12R9mXLly/POxga9uNbXUBpOKyThZOIAaVjx455uwSU+NSr/KhZ+VGz8lNX5H1Zc4ZnGCQLAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4bQrdQNou/pcOrMo231/yrFF2S4AW44eFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACA8g4okydPTt/73vfSDjvskLp3755OOOGE9OabbzZZZ9WqVem8885L3bp1S9tvv30aM2ZMWrJkSZN1Pvjgg3Tsscemjh075tu56KKL0po1a1rnNwIAyl67lqz85JNP5uEjCylZoLj88svTyJEj02uvvZa22267fJ3x48enmTNnpgcffDB17tw5nX/++emkk05Kf/3rX/Pn165dm4eTHj16pKeffjotWrQo/ehHP0rt27dPN9xwQ3F+S9qUPpfObPa61VWFNHVwSv0nPppq11Zscv33pxz7DVsHQKsHlNmzZze5P3369LwHZOHChen//u//0ueff57uuuuudN9996Xhw4fn69xzzz2pX79+6ZlnnkkHH3xwmjNnTh5oHnvssbTzzjunAQMGpOuuuy5dcsklaeLEiWmbbbZpSZMAgLYeUNaVBZJM165d89ssqNTV1aURI0Y0rrPPPvukXXfdNS1YsCAPKNnt/vvvn4eTBjU1Nemcc85Jr776ajrooIO+9nNqa2vzpcHy5cvz2+xnZUskDe2J1q6Ist6LUquuLDS53RR1LT2fsfKjZuWnrkj7spZsb7MDSn19fbrwwgvToYcemvr3758/tnjx4rwHpEuXLk3WzcJI9lzDOl8NJw3PNzy3obEvkyZN+trjWW9MNo4lorlz55a6CeFlh1aiuG5QfbPWmzVrVtHbQvP4jJUfNSs/c1t5X7Zy5cri
B5RsLMorr7yS5s+fn4rtsssuSxMmTGjSg9K7d+98/EunTp1SJFk6zAp61FFH5eNq2LBs3EepZT0nWTi56oXKVFu/6TEor0ys2SLtYsN8xsqPmpWfuiLtyxqOgBQtoGQDX2fMmJGeeuqptMsuuzQ+ng18Xb16dVq2bFmTXpRsFk/2XMM6zz33XJPtNczyaVhnXdXV1fmyruxNixoCIrctiuYMSt1SsnDSnPaoaRw+Y+VHzcpP+1bel7VkWy2aZlwoFPJw8vDDD6fHH3889e3bt8nzAwcOzH/4vHnzGh/LpiFn04qHDh2a389uX3755bR06dLGdbKUlvWE7Lvvvi1pDgCwlWrX0sM62Qyd3//+9/m5UBrGjGTTibfddtv8duzYsfnhmGzgbBY6LrjggjyUZANkM9lhmSyInH766Wnq1Kn5Nq688sp82+vrJQEA2p4WBZTbb789vz388MObPJ5NJT7zzDPzf998882psrIyP0FbNvMmm6Fz2223Na5bVVWVHx7KZu1kwSU7f8oZZ5yRrr322tb5jQCAthVQskM8m9KhQ4c0bdq0fNmQ3XbbzWwIAGCDXIsHAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAo/4Dy1FNPpdGjR6devXqlioqK9MgjjzR5/swzz8wf/+py9NFHN1nns88+S6eddlrq1KlT6tKlSxo7dmz64osvvvlvAwC0zYDy5ZdfpgMPPDBNmzZtg+tkgWTRokWNy/3339/k+SycvPrqq2nu3LlpxowZeegZN27c5v0GAMBWp11LX3DMMcfky8ZUV1enHj16rPe5119/Pc2ePTs9//zzadCgQfljt956axo1alS68cYb854ZAKBta3FAaY4nnngide/ePe24445p+PDh6frrr0/dunXLn1uwYEF+WKchnGRGjBiRKisr07PPPptOPPHEr22vtrY2XxosX748v62rq8uXSBraE61dEVVXFUrdhFRdWWhyuynqWno+Y+VHzcpPXZH2ZS3ZXqsHlOzwzkknnZT69u2b3nnnnXT55ZfnPS5ZMKmqqkqLFy/Ow0uTRrRrl7p27Zo/tz6TJ09OkyZN+trjc+bMSR07dkwRZYev2Lipg+O8Q9cNqm/WerNmzSp6W2gen7Hyo2blZ24r78tWrlxZuoByyimnNP57//33TwcccEDaY4898l6VI488crO2edlll6UJEyY06UHp3bt3GjlyZD7QNpIsHWYFPeqoo1L79u1L3ZzQ+k98tNRNyHtOsnBy1QuVqba+YpPrvzKxZou0iw3zGSs/alZ+6oq0L2s4AlKyQzxftfvuu6eddtopvf3223lAycamLF26tMk6a9asyWf2bGjcSjamJVvWlb1pUUNA5LZFUbt204FgS8nCSXPao6Zx+IyVHzUrP+1beV/Wkm0V/TwoH330Ufr0009Tz5498/tDhw5Ny5YtSwsXLmxc5/HHH0/19fVpyJAhxW4OAFAGWtyDkp2vJOsNafDee++ll156KR9Dki3ZWJExY8bkvSHZGJSLL7447bnnnqmm5v93jffr1y8fp3L22WenO+64I+9GOv/88/NDQ2bwAACb1YPywgsvpIMOOihfMtnYkOzfV199dT4I9u9//3s67rjj
0l577ZWfgG3gwIHpL3/5S5NDNPfee2/aZ5998kM+2fTiYcOGpTvvvFNFAIDN60E5/PDDU6Gw4SmZjz666YGPWU/Lfffd19IfDQC0EUUfJAtbkz6Xzkzl5v0px5a6CQAt5mKBAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQTrtSNwAA2Dx9Lp1ZlLeuuqqQpg5OJaUHBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUAKD8A8pTTz2VRo8enXr16pUqKirSI4880uT5QqGQrr766tSzZ8+07bbbphEjRqS33nqryTqfffZZOu2001KnTp1Sly5d0tixY9MXX3zxzX8bAKBtBpQvv/wyHXjggWnatGnrfX7q1KnpF7/4RbrjjjvSs88+m7bbbrtUU1OTVq1a1bhOFk5effXVNHfu3DRjxow89IwbN+6b/SYAwFajXUtfcMwxx+TL+mS9J7fccku68sor0/HHH58/9pvf/CbtvPPOeU/LKaeckl5//fU0e/bs9Pzzz6dBgwbl69x6661p1KhR6cYbb8x7ZgCAtq3FAWVj3nvvvbR48eL8sE6Dzp07pyFDhqQFCxbkASW7zQ7rNISTTLZ+ZWVl3uNy4okntmaToM3rc+nMor0H7085ts2/v0AZBJQsnGSyHpOvyu43PJfddu/evWkj2rVLXbt2bVxnXbW1tfnSYPny5fltXV1dvkTS0J5o7YqouqpQ6iak6spCk1taphT/n/uMlR81K7/v0er/fSe29me8Jdtr1YBSLJMnT06TJk362uNz5sxJHTt2TBFl42vYuKmD47xD1w2qL3UTytKsWbNK9rN9xsqPmpXf9+jcVt6XrVy5sjQBpUePHvntkiVL8lk8DbL7AwYMaFxn6dKlTV63Zs2afGZPw+vXddlll6UJEyY06UHp3bt3GjlyZD4TKJIsHWYFPeqoo1L79u1L3ZzQ+k98tNRNyP9KyMLJVS9Uptr6ilI3p+y8MrFmi/9Mn7Hyo2bl9z1a/b/vxtbelzUcAdniAaVv3755yJg3b15jIMkak40tOeecc/L7Q4cOTcuWLUsLFy5MAwcOzB97/PHHU319fT5WZX2qq6vzZV3ZmxY1BERuWxS1a+MEgiycRGpPuSjl/+M+Y+VHzVpfbZG/t1q7Zi3ZVosDSna+krfffrvJwNiXXnopH0Oy6667pgsvvDBdf/316Tvf+U4eWK666qp8Zs4JJ5yQr9+vX7909NFHp7PPPjufipwl6/PPPz8fQGsGDwCwWQHlhRdeSEcccUTj/YZDL2eccUaaPn16uvjii/NzpWTnNcl6SoYNG5ZPK+7QoUPja+699948lBx55JH57J0xY8bk504BANisgHL44Yfn5zvZkOzsstdee22+bEjW23LfffepAACwXq7FAwCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjoACAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhtCt1A4Dy1efSmUXZ7vtTji3KdoHyoQcFAAhHQAEAwnGIh5J04QPAxuhBAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQA
IBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHDalboBALA163PpzFI3oSzpQQEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACMeJ2gDACdXC0YMCAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKADA1h9QJk6cmCoqKpos++yzT+Pzq1atSuedd17q1q1b2n777dOYMWPSkiVLWrsZAEAZK0oPyn777ZcWLVrUuMyfP7/xufHjx6c//vGP6cEHH0xPPvlk+vjjj9NJJ51UjGYAAGWqKGeSbdeuXerRo8fXHv/888/TXXfdle677740fPjw/LF77rkn9evXLz3zzDPp4IMPLkZzAIAyU5SA8tZbb6VevXqlDh06pKFDh6bJkyenXXfdNS1cuDDV1dWlESNGNK6bHf7JnluwYMEGA0ptbW2+NFi+fHl+m20rWyJpaE+0dm2u6qpC2ppVVxaa3BLDxj4/W9tnrC0ol5pt7d93LdHwndjaNWvJ9ioKhUKrVuRPf/pT+uKLL9Lee++dH96ZNGlS+te//pVeeeWV/NDOWWed1SRsZAYPHpyOOOKI9POf/3yD41qy7awr64np2LFjazYfACiSlStXplNPPTU/otKpU6ctG1DWtWzZsrTbbrulm266KW277babFVDW14PSu3fv9Mknn2zyF9zSsnQ4d+7cdNRRR6X27dunctd/4qNpa/8r4bpB9emqFypTbX1FqZvD/7wysabNfMbagnKp2db+fbc5342tXbNs/73TTjs1K6AU/WrGXbp0SXvttVd6++2381909erVeWjJHm+QzeJZ35iVBtXV1fmyruxNi/o/e+S2tUTt2rax087CSVv5XctBcz47W8tnrC2JXjPfAcWvWUu2VfTzoGSHe955553Us2fPNHDgwLxx8+bNa3z+zTffTB988EE+VgUAoCg9KD/96U/T6NGj88M62RTia665JlVVVaUf/vCHqXPnzmns2LFpwoQJqWvXrnn3zgUXXJCHEzN4AICiBZSPPvooDyOffvpp+ta3vpWGDRuWTyHO/p25+eabU2VlZX6CtmxcSU1NTbrttttauxkAQBlr9YDywAMPbPT5bOrxtGnT8gUAYH1ciwcACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBw2pW6AW1Jn0tnFm3b7085tmjbBoAtTQ8KABCOgAIAhOMQz1aimIePAGBL04MCAIQjoAAA4TjEA0DZcDi77dCDAgCEowcFKKu/kqurCmnq4JT6T3w01a6taPG2nTMIyoMeFAAgHAEFAAjHIR4AinqY7pselqNt0oMCAIQjoAAA4QgoAEA4AgoAEI6AAgCEI6AAAOEIKABAOAIKABCOgAIAhCOgAADhCCgAQDgCCgAQjosFArRRX72gH0SjBwUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwnAellc8NUF1VSFMHp9R/4qOpdm3FN6kNALRZelAAgHAEFAAgHId4AIKfNv79KccWbdsQlR4UACAcPSgAwbmoH22RHhQAIBwBBQAIR0ABAMIRUACAcAQUACAcAQUACMc0Y6BNMWUXyoMeFAAgnJIGlGnTpqU+ffqkDh06pCFDhqTnnnuulM0BANp6QPntb3+bJkyYkK655pr04osvpgMPPDDV1NSkpUuXlqpJAEBbDyg33XRTOvvss9NZZ52V9t1333THHXekjh07prvvvrtUTQIA2vIg2dWrV6eFCxemyy67rPGxysrKNGLEiLRgwYKvrV9bW5svDT7//PP89rPPPkt1dXWt3r52a77c/NfWF9LKlfWpXV1lWltf0artovWpV/lRs/KjZuVbs08//TS1b9++1ba7YsWK/LZQKGy6DakEPvnkk7R27dq08847N3k8u//G
G298bf3JkyenSZMmfe3xvn37pohOLXUDaBH1Kj9qVn7UrPycWsRtZ0Glc+fO5T/NOOtpycarNKivr897T7p165YqKmL1Uixfvjz17t07ffjhh6lTp06lbg6boF7lR83Kj5qVn+VF2pdlPSdZOOnVq9cm1y1JQNlpp51SVVVVWrJkSZPHs/s9evT42vrV1dX58lVdunRJkWUFFVDKh3qVHzUrP2pWfjoVYV+2qZ6Tkg6S3WabbdLAgQPTvHnzmvSKZPeHDh1aiiYBAIGU7BBPdsjmjDPOSIMGDUqDBw9Ot9xyS/ryyy/zWT0AQNtWsoBy8sknp3//+9/p6quvTosXL04DBgxIs2fP/trA2XKTHYrKzu2y7iEpYlKv8qNm5UfNyk91gH1ZRaE5c30AALYg1+IBAMIRUACAcAQUACAcAQUACEdAaaFp06alPn36pA4dOqQhQ4ak5557boPr/vKXv0yHHXZY2nHHHfMlu9bQxtan9DX7qgceeCA/U/EJJ5ygNMFrtmzZsnTeeeelnj175rMO9tprrzRr1qwt1l5aXrPs1BJ777132nbbbfMzlo4fPz6tWrXKW7kFPPXUU2n06NH52Vyz77hHHnlkk6954okn0ne/+93887Xnnnum6dOnF7+h2SwemueBBx4obLPNNoW777678OqrrxbOPvvsQpcuXQpLlixZ7/qnnnpqYdq0aYW//e1vhddff71w5plnFjp37lz46KOPvOVBa9bgvffeK3z7298uHHbYYYXjjz9evQLXrLa2tjBo0KDCqFGjCvPnz89r98QTTxReeukldQtas3vvvbdQXV2d32b1evTRRws9e/YsjB8/Xs22gFmzZhWuuOKKwkMPPZTN4i08/PDDG13/3XffLXTs2LEwYcKEwmuvvVa49dZbC1VVVYXZs2cXtZ0CSgsMHjy4cN555zXeX7t2baFXr16FyZMnN+v1a9asKeywww6FX//61y2vFFusZlmdDjnkkMKvfvWrwhlnnCGgBK/Z7bffXth9990Lq1ev3oKt5JvULFt3+PDhTR7Ldn6HHnqoN3YLS80IKBdffHFhv/32a/LYySefXKipqSlq2xziaabVq1enhQsX5odpGlRWVub3FyxY0KxtrFy5MtXV1aWuXbtuXncXW6Rm1157berevXsaO3asd7wMavaHP/whv0RGdognO9Fj//790w033JBfMZ2YNTvkkEPy1zQcBnr33XfzQ3KjRo1SsoAWLFjQpL6ZmpqaZu/7NldZXM04gk8++ST/wlv3TLfZ/TfeeKNZ27jkkkvyY37rFpo4NZs/f36666670ksvvaQsZVKzbOf2+OOPp9NOOy3fyb399tvp3HPPzf8YyM6ESbyanXrqqfnrhg0bll/dds2aNeknP/lJuvzyy5UroMWLF6+3vtkVj//73//m44iKQQ/KFjJlypR80OXDDz+cDyIjnuwS4Keffno+uDm74jblIbvQaNbjdeedd+YXIc0uo3HFFVekO+64o9RNYyMDLrNerttuuy29+OKL6aGHHkozZ85M1113nfeMRnpQminbYVVVVaUlS5Y0eTy736NHj42+9sYbb8wDymOPPZYOOOCA5v5ItnDN3nnnnfT+++/no9u/uvPLtGvXLr355ptpjz32UJdgn7Ns5k779u3z1zXo169f/ldfdvghu3o6sWp21VVX5X8M/PjHP87v77///vnFYseNG5eHy+wQEXFkdVxffTt16lS03pOM/wuaKfuSy/46mzdvXpOdV3Y/O/69IVOnTs3/KsguhJhduZm4Ndtnn33Syy+/nB/eaViOO+64dMQRR+T/zqZCEqtmmUMPPTQ/rNMQJjP/+Mc/8uAinMSsWTYeb90Q0hAwXR4unqFDhzapb2bu3Lkb3fe1iqIOwd0Kp9JlU+OmT5+eT7UaN25cPpVu8eLF+fOnn3564dJLL21cf8qUKfnUu9/97neFRYsWNS4rVqwo4W/RtrS0Zusyiyd+zT744IN8dtz5
559fePPNNwszZswodO/evXD99deXoPVtU0trds011+Q1u//++/MprHPmzCnssccehR/84Acl/C3ajhUrVuSnv8iWLAbcdNNN+b//+c9/5s9ntcpqtu4044suuig/ZUZ2+gzTjAPK5n/vuuuuefDIptY988wzjc99//vfz3doDXbbbbe8+Osu2YeTmDVbl4BSHjV7+umnC0OGDMl3ktmU45/97Gf5dHFi1qyurq4wceLEPJR06NCh0Lt378K5555b+M9//qNkW8Cf//zn9e6bGmqU3WY1W/c1AwYMyOubfcbuueeeorezIvtPcftoAABaxhgUACAcAQUACEdAAQDCEVAAgHAEFAAgHAEFAAhHQAEAwhFQAIBwBBQAIBwBBQAIR0ABAMIRUACAFM3/A299nSLZhZEWAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pt['score'].hist(bins=20);" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d5fda75a-93fc-4d0d-827b-ef57c9cc2993", + "metadata": {}, + "outputs": [ + { + "name": "stdout", "output_type": "stream", "text": [ - "DEBUG:google.adk.scope.utils.similarity:Comparing 'run_async' and 'run_async'\n", - "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.7457, Early exit threshold: 0.6000\n", - "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 6 vs 5 parameters\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'state_delta': 0.2939 (name:0.59, type:0.00, 
opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'user_id': 0.6033 (name:0.41, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'session_id': 0.6880 (name:0.58, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'new_message': 0.2226 (name:0.45, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'state_delta': 0.2950 (name:0.39, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'invocation_id' vs 'run_config': 0.3474 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.2446 (name:0.49, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.2523 (name:0.50, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'new_message': 0.9000 (name:1.00, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.7879 (name:0.58, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.7621 (name:0.52, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'user_id': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'session_id': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'new_message': 0.2879 (name:0.58, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'state_delta': 0.6000 (name:1.00, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'state_delta' vs 'run_config': 0.2985 (name:0.40, type:0.00, 
opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.5000, final parameter score: 0.8182\n", - "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.3000 (type match: 0.0, async match: 1.0)\n", - "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 1.0, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8181818181818182), 'return_type': 0.3}\n", - "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.8984\n" + "218\n" ] }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
py_namespacepy_member_ofpy_namego_namespacego_member_ofgo_nametypescorematchconfidence
1399flows.llm_flowsNaNmerge_parallel_function_response_eventsinternal.utilsNaNFunctionResponsesfunction0.4991Falsehigh
263pluginsBasePluginon_model_error_callbackinternal.plugininternalPluginManagerRunOnModelErrorCallbackmethod0.4989Falsehigh
1213tools.openapi_tool.authNaNdict_to_auth_schemeinternal.typeutilNaNConvertToWithJSONSchemafunction0.4989Falsehigh
1288cli.pluginsReplayPlugin__init__pluginPluginNewconstructor0.4989Falsehigh
219pluginsHybridContentParserparsecmd.launcher.consoleConsoleLauncherParsemethod0.4983Falsehigh
\n", + "
" + ], "text/plain": [ - "np.float64(0.8984415584415584)" + " py_namespace py_member_of \\\n", + "1399 flows.llm_flows NaN \n", + "263 plugins BasePlugin \n", + "1213 tools.openapi_tool.auth NaN \n", + "1288 cli.plugins ReplayPlugin \n", + "219 plugins HybridContentParser \n", + "\n", + " py_name go_namespace \\\n", + "1399 merge_parallel_function_response_events internal.utils \n", + "263 on_model_error_callback internal.plugininternal \n", + "1213 dict_to_auth_scheme internal.typeutil \n", + "1288 __init__ plugin \n", + "219 parse cmd.launcher.console \n", + "\n", + " go_member_of go_name type score match \\\n", + "1399 NaN FunctionResponses function 0.4991 False \n", + "263 PluginManager RunOnModelErrorCallback method 0.4989 False \n", + "1213 NaN ConvertToWithJSONSchema function 0.4989 False \n", + "1288 Plugin New constructor 0.4989 False \n", + "219 ConsoleLauncher Parse method 0.4983 False \n", + "\n", + " confidence \n", + "1399 high \n", + "263 high \n", + "1213 high \n", + "1288 high \n", + "219 high " ] }, - "execution_count": 47, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "scorer.get_similarity_score(py_run_async, ts_run_async)" + "# PYTHON <-> GO:\n", + "# 0.6 <= GOOD\n", + "# 0.5 < AVERAGE <= 0.6\n", + "# BAD <= 0.5 \n", + "LOWER, UPPER = 0.4599, 0.4999\n", + "print(len(pg[(LOWER < pg['score']) & (pg['score'] < UPPER)]))\n", + "pg[(LOWER < pg['score']) & (pg['score'] < UPPER)].head(5)" ] }, { "cell_type": "code", - "execution_count": 48, - "id": "341ad037-4bdb-481a-8a45-b3e1a48f54bc", + "execution_count": 11, + "id": "dace4311-5318-4196-a982-3dafa6e39747", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "DEBUG:google.adk.scope.utils.similarity:Comparing 'run' and 'run_async'\n", - "DEBUG:google.adk.scope.utils.similarity:Both INSTANCE_METHOD. 
Using default weights: {'name': 0.3, 'member_of': 0.3, 'namespace': 0.15, 'parameters': 0.15, 'return_type': 0.1}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary scores: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714}\n", - "DEBUG:google.adk.scope.utils.similarity:Preliminary score: 0.6990, Early exit threshold: 0.6000\n", - "DEBUG:google.adk.scope.utils.similarity:Calculating parameter score for 4 vs 5 parameters\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'user_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'session_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'new_message': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'state_delta': 0.3225 (name:0.65, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'user_id' vs 'run_config': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'user_id': 0.8357 (name:0.67, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'session_id': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'new_message': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'state_delta': 0.2939 (name:0.59, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'session_id' vs 'run_config': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'user_id': 0.3446 (name:0.49, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'session_id': 0.3523 (name:0.50, type:0.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 
'new_message': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'state_delta': 0.6879 (name:0.58, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'new_message' vs 'run_config': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'user_id': 0.2869 (name:0.57, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'session_id': 0.2167 (name:0.43, type:0.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'new_message': 0.6621 (name:0.52, type:1.00, opt:0.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'state_delta': 0.6985 (name:0.40, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Param 'run_config' vs 'run_config': 1.0000 (name:1.00, type:1.00, opt:1.00)\n", - "DEBUG:google.adk.scope.utils.similarity:Matrix matched total score: 4.0000, final parameter score: 0.8889\n", - "DEBUG:google.adk.scope.utils.similarity:Return type score: 0.7000 (type match: 1.0, async match: 0.0)\n", - "DEBUG:google.adk.scope.utils.similarity:Final scores including params & return: {'name': 0.8444444444444443, 'member_of': 1.0, 'namespace': 0.9714285714285714, 'parameters': np.float64(0.8888888888888888), 'return_type': 0.7}\n", - "DEBUG:google.adk.scope.utils.similarity:Final weighted similarity score: 0.9024\n" + "252\n" ] }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
py_namespacepy_member_ofpy_namejava_namespacejava_member_ofjava_nametypescorematchconfidence
1runnersRunnerrewind_asyncrunnerRunnerrunAsyncmethod0.5978Truelow
1071tools.computer_useComputerUseToolsetclosetools.mcpMcpAsyncToolsetclosemethod0.5975Truelow
1153tools.application_integration_tool.clientsConnectionsClientget_entity_schema_and_operationstools.applicationintegrationtoolsetConnectionsClientconvertJsonSchemaToOpenApiSchemamethod0.5970Truelow
594cliAdkWebServerget_session_tracewebAdkWebServersessionServicemethod0.5962Truelow
1416flows.llm_flowsBaseLlmFlowrun_asyncflows.llmflowsBaseLlmFlowrunmethod0.5959Truelow
\n", + "
" + ], "text/plain": [ - "np.float64(0.9023809523809523)" + " py_namespace py_member_of \\\n", + "1 runners Runner \n", + "1071 tools.computer_use ComputerUseToolset \n", + "1153 tools.application_integration_tool.clients ConnectionsClient \n", + "594 cli AdkWebServer \n", + "1416 flows.llm_flows BaseLlmFlow \n", + "\n", + " py_name java_namespace \\\n", + "1 rewind_async runner \n", + "1071 close tools.mcp \n", + "1153 get_entity_schema_and_operations tools.applicationintegrationtoolset \n", + "594 get_session_trace web \n", + "1416 run_async flows.llmflows \n", + "\n", + " java_member_of java_name type score \\\n", + "1 Runner runAsync method 0.5978 \n", + "1071 McpAsyncToolset close method 0.5975 \n", + "1153 ConnectionsClient convertJsonSchemaToOpenApiSchema method 0.5970 \n", + "594 AdkWebServer sessionService method 0.5962 \n", + "1416 BaseLlmFlow run method 0.5959 \n", + "\n", + " match confidence \n", + "1 True low \n", + "1071 True low \n", + "1153 True low \n", + "594 True low \n", + "1416 True low " ] }, - "execution_count": 48, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "scorer.get_similarity_score(py_run, ts_run_async)" + "# PYTHON <-> JAVA:\n", + "# 0.6 <= GOOD\n", + "# BAD <= 0.6\n", + "LOWER, UPPER = 0.4999, 0.5999\n", + "print(len(pj[(LOWER < pj['score']) & (pj['score'] < UPPER)]))\n", + "pj[(LOWER < pj['score']) & (pj['score'] < UPPER)].head(5)" ] }, { "cell_type": "code", - "execution_count": 49, - "id": "783ee277-0893-46e3-ae9c-72a8df82cb8d", + "execution_count": 12, + "id": "b71d2c93-6f89-4e1e-ad47-0cb0285d7182", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "102\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
py_namespacepy_member_ofpy_namets_namespacets_member_ofts_nametypescorematchconfidence
942tools.google_api_toolDocsToolset__init__tools.mcpMCPToolsetconstructorconstructor0.5564Truelow
762evaluationNaNget_all_tool_callstelemetryNaNtraceToolCallfunction0.5551Truelow
1280cli.pluginsRecordingsPlugin__init__pluginsLoggingPluginconstructorconstructor0.5527Truelow
1006tools.pubsubPubSubToolset__init__toolsBaseToolsetconstructorconstructor0.5524Truelow
943tools.google_api_toolGoogleApiTool__init__toolsGoogleSearchToolconstructorconstructor0.5521Truelow
\n", + "
" + ], "text/plain": [ - "original_name: \"run_async\"\n", - "normalized_name: \"run_async\"\n", - "description: \"Main entry method to run the agent in this runner.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. The async generator will only finish iterating after event\\n compaction is complete. However, this does not block new `run_async`\\n calls for subsequent user queries, which can be started concurrently.\"\n", - "member_of: \"Runner\"\n", - "normalized_member_of: \"runner\"\n", - "type: INSTANCE_METHOD\n", - "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", - "namespace: \"runners\"\n", - "normalized_namespace: \"runners\"\n", - "parameters {\n", - " original_name: \"user_id\"\n", - " normalized_name: \"user_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The user ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"session_id\"\n", - " normalized_name: \"session_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The session ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"invocation_id\"\n", - " normalized_name: \"invocation_id\"\n", - " original_types: \"Optional[str]\"\n", - " normalized_types: STRING\n", - " description: \"The invocation ID of the session, set this to resume an interrupted invocation.\"\n", - " is_optional: true\n", - "}\n", - "parameters {\n", - " original_name: \"new_message\"\n", - " normalized_name: \"new_message\"\n", - " original_types: \"Optional[types.Content]\"\n", - " normalized_types: OBJECT\n", - " description: \"A new message to append to the session.\"\n", - " is_optional: true\n", - "}\n", - "parameters {\n", - " original_name: \"state_delta\"\n", - " normalized_name: \"state_delta\"\n", - " original_types: \"Optional[dict[str, Any]]\"\n", - " 
normalized_types: MAP\n", - " description: \"Optional state changes to apply to the session.\"\n", - " is_optional: true\n", - "}\n", - "parameters {\n", - " original_name: \"run_config\"\n", - " normalized_name: \"run_config\"\n", - " original_types: \"Optional[RunConfig]\"\n", - " normalized_types: OBJECT\n", - " description: \"The run config for the agent.\"\n", - " is_optional: true\n", - "}\n", - "original_return_types: \"AsyncGenerator[Event, None]\"\n", - "normalized_return_types: \"OBJECT\"\n", - "normalized_return_types: \"null\"\n", - "async: true" + " py_namespace py_member_of py_name \\\n", + "942 tools.google_api_tool DocsToolset __init__ \n", + "762 evaluation NaN get_all_tool_calls \n", + "1280 cli.plugins RecordingsPlugin __init__ \n", + "1006 tools.pubsub PubSubToolset __init__ \n", + "943 tools.google_api_tool GoogleApiTool __init__ \n", + "\n", + " ts_namespace ts_member_of ts_name type score \\\n", + "942 tools.mcp MCPToolset constructor constructor 0.5564 \n", + "762 telemetry NaN traceToolCall function 0.5551 \n", + "1280 plugins LoggingPlugin constructor constructor 0.5527 \n", + "1006 tools BaseToolset constructor constructor 0.5524 \n", + "943 tools GoogleSearchTool constructor constructor 0.5521 \n", + "\n", + " match confidence \n", + "942 True low \n", + "762 True low \n", + "1280 True low \n", + "1006 True low \n", + "943 True low " ] }, - "execution_count": 49, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "py_run_async" + "# PYTHON <-> TS:\n", + "# 0.7 <= GOOD\n", + "# 0.55 < AVERAGE <= 0.7\n", + "# BAD <= 0.55\n", + "LOWER, UPPER = 0.55, 0.7\n", + "print(len(pt[(LOWER < pt['score']) & (pt['score'] < UPPER)]))\n", + "pt[(LOWER < pt['score']) & (pt['score'] < UPPER)].tail(5)" ] }, { "cell_type": "code", - "execution_count": 50, - "id": "861b3583-49d8-4b29-9f39-144aabe82948", + "execution_count": 13, + "id": "f1e5b76d-22d2-432f-b3c6-5090b847cfe1", "metadata": {}, "outputs": [ { "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score
matchconfidencetype
Falsehighconstructor377
function206
method604
Truehighconstructor10
function8
method46
lowconstructor19
function45
method111
\n", + "
" + ], "text/plain": [ - "original_name: \"run\"\n", - "normalized_name: \"run\"\n", - "description: \"Runs the agent.\\n\\n NOTE:\\n This sync interface is only for local testing and convenience purpose.\\n Consider using `run_async` for production usage.\\n\\n If event compaction is enabled in the App configuration, it will be\\n performed after all agent events for the current invocation have been\\n yielded. The generator will only finish iterating after event\\n compaction is complete.\"\n", - "member_of: \"Runner\"\n", - "normalized_member_of: \"runner\"\n", - "type: INSTANCE_METHOD\n", - "file_path: \"/Users/shahins/projects/adk/adk-python/src/google/adk/runners.py\"\n", - "namespace: \"runners\"\n", - "normalized_namespace: \"runners\"\n", - "parameters {\n", - " original_name: \"user_id\"\n", - " normalized_name: \"user_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The user ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"session_id\"\n", - " normalized_name: \"session_id\"\n", - " original_types: \"str\"\n", - " normalized_types: STRING\n", - " description: \"The session ID of the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"new_message\"\n", - " normalized_name: \"new_message\"\n", - " original_types: \"types.Content\"\n", - " normalized_types: OBJECT\n", - " description: \"A new message to append to the session.\"\n", - "}\n", - "parameters {\n", - " original_name: \"run_config\"\n", - " normalized_name: \"run_config\"\n", - " original_types: \"Optional[RunConfig]\"\n", - " normalized_types: OBJECT\n", - " description: \"The run config for the agent.\"\n", - " is_optional: true\n", - "}\n", - "original_return_types: \"Generator[Event, None, None]\"\n", - "normalized_return_types: \"OBJECT\"" + " score\n", + "match confidence type \n", + "False high constructor 377\n", + " function 206\n", + " method 604\n", + "True high constructor 10\n", + " function 8\n", + " 
method 46\n", + " low constructor 19\n", + " function 45\n", + " method 111" ] }, - "execution_count": 50, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "py_run" + "pg[['match', 'confidence', 'type', 'score']].groupby(['match', 'confidence', 'type']).count()" ] }, { "cell_type": "code", "execution_count": null, - "id": "941ec275-7405-404b-a31c-d91d4d3671bd", + "id": "5dc6f20d", "metadata": {}, "outputs": [], "source": [] diff --git a/pyproject.toml b/pyproject.toml index a18b27d..fe81589 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ dependencies = [ "scipy", "numpy", "jellyfish", + "RapidFuzz", + "pandas", ] diff --git a/report.sh b/report.sh index 2850338..9d2192a 100755 --- a/report.sh +++ b/report.sh @@ -4,11 +4,12 @@ set -e # Default values -REPORT_TYPE="symmetric" -ALPHA="0.8" +REPORT_TYPE="md" VERBOSE="" +COMMON="" # Parse arguments +REGISTRIES=() while [[ "$#" -gt 0 ]]; do case "$1" in --base) @@ -19,6 +20,13 @@ while [[ "$#" -gt 0 ]]; do TARGET_FILE="$2" shift 2 ;; + --registries) + shift + while [[ "$#" -gt 0 && ! 
"$1" =~ ^-- ]]; do + REGISTRIES+=("$1") + shift + done + ;; --output) OUTPUT_DIR="$2" shift 2 @@ -27,14 +35,14 @@ while [[ "$#" -gt 0 ]]; do REPORT_TYPE="$2" shift 2 ;; - --alpha) - ALPHA="$2" - shift 2 - ;; -v|--verbose) VERBOSE="--verbose" shift ;; + --common) + COMMON="--common" + shift + ;; *) echo "Unknown option: $1" exit 1 @@ -42,10 +50,6 @@ while [[ "$#" -gt 0 ]]; do esac done -# Extract languages -BASE_LANG_RAW=$(head -n 1 "${BASE_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') -TARGET_LANG_RAW=$(head -n 1 "${TARGET_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') - # Function to map language to short code get_lang_code() { case "$1" in @@ -57,16 +61,33 @@ get_lang_code() { esac } -BASE_LANG=$(get_lang_code "$BASE_LANG_RAW") -TARGET_LANG=$(get_lang_code "$TARGET_LANG_RAW") +if [[ ${#REGISTRIES[@]} -eq 0 && -n "$BASE_FILE" && -n "$TARGET_FILE" ]]; then + REGISTRIES+=("$BASE_FILE" "$TARGET_FILE") +fi + +if [[ ${#REGISTRIES[@]} -lt 2 ]]; then + echo "Error: Must provide at least two registries via --registries or --base/--target" + exit 1 +fi + +# Extract languages and construct filename +LANG_CODES=() +for REG_FILE in "${REGISTRIES[@]}"; do + LANG_RAW=$(head -n 1 "${REG_FILE}" | grep -o 'language: "[A-Z]*"' | grep -o '"[A-Z]*"' | tr -d '"') + LANG_CODES+=($(get_lang_code "$LANG_RAW")) +done # Construct filename +# Default to markdown extension. The python script will generate CSV alongside it. 
+EXTENSION="md" + +# Standard 2-way report +OUTPUT_FILENAME="${LANG_CODES[0]}_${LANG_CODES[1]}.${EXTENSION}" +# Ensure report type is 'md' for standard logic so unified generator runs if [ "$REPORT_TYPE" == "raw" ]; then - EXTENSION="csv" -else - EXTENSION="md" + REPORT_TYPE="md" fi -OUTPUT_FILENAME="${BASE_LANG}_${TARGET_LANG}_${REPORT_TYPE}.${EXTENSION}" + FULL_OUTPUT_PATH="${OUTPUT_DIR}/${OUTPUT_FILENAME}" # Determine the directory where this script is located @@ -75,11 +96,10 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Add 'src' to PYTHONPATH so the python script can find modules export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" -# Run the python matcher +# Run the python reporter python3 "${SCRIPT_DIR}/src/google/adk/scope/reporter/reporter.py" \ - --base "${BASE_FILE}" \ - --target "${TARGET_FILE}" \ + --registries "${REGISTRIES[@]}" \ --output "${FULL_OUTPUT_PATH}" \ --report-type "${REPORT_TYPE}" \ - --alpha "${ALPHA}" \ + ${COMMON} \ ${VERBOSE} diff --git a/run.sh b/run.sh index a3f5884..4ca47f4 100755 --- a/run.sh +++ b/run.sh @@ -12,28 +12,20 @@ echo "Extracting Go features..." # Py -> TS -echo "Generating symmetric reports..." -./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type symmetric - -echo "Generating directional reports.. ." -./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type directional - -echo "Generating raw reports..." -./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type raw +echo "Generating raw and markdown reports..." +./report.sh --base output/py.txtpb --target output/ts.txtpb --output ./output --report-type md # Py -> Java -echo "Generating symmetric reports..." -./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type symmetric - -echo "Generating directional reports (py->java)..." 
-./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type directional - +echo "Generating raw and markdown reports..." +./report.sh --base output/py.txtpb --target output/java.txtpb --output ./output --report-type md # Py -> Go -echo "Generating symmetric reports..." -./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type symmetric +echo "Generating raw and markdown reports..." +./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type md + +# Matrix reports -echo "Generating directional reports (py->go)..." -./report.sh --base output/py.txtpb --target output/go.txtpb --output ./output --report-type directional \ No newline at end of file +#echo "Generating matrix reports..." +#./report.sh --registries output/py.txtpb output/ts.txtpb output/java.txtpb output/go.txtpb --output ./output --report-type matrix --common \ No newline at end of file diff --git a/score.sh b/score.sh new file mode 100755 index 0000000..7a5f822 --- /dev/null +++ b/score.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -e + +# Resolve the project root +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +export PYTHONPATH="${SCRIPT_DIR}/src:${PYTHONPATH}" + +if [ "$#" -lt 2 ]; then + echo "Usage: $0 [options]" + exit 1 +fi + +python3 "${SCRIPT_DIR}/src/google/adk/scope/utils/score_features.py" "$@" diff --git a/src/google/adk/scope/extractors/converter_go.py b/src/google/adk/scope/extractors/converter_go.py index f7ed2d2..4b2cd8c 100644 --- a/src/google/adk/scope/extractors/converter_go.py +++ b/src/google/adk/scope/extractors/converter_go.py @@ -19,6 +19,9 @@ class NodeProcessor: def __init__(self): self.normalizer = TypeNormalizer() + # Mapping from struct name to list of + # (field_name, field_type, is_optional) + self._struct_definitions: dict[str, list[tuple[str, str, bool]]] = {} def process( self, @@ -29,7 +32,12 @@ def process( normalized_namespace: str, ) -> 
Optional[feature_pb2.Feature]: """Convert a Tree-sitter node into a Feature.""" - if node.type not in ("function_declaration", "method_declaration"): + valid_nodes = ( + "function_declaration", + "method_declaration", + "method_elem", + ) + if node.type not in valid_nodes: return None original_name = self._extract_name(node) @@ -56,11 +64,32 @@ def process( "New" ): feature_type = feature_pb2.Feature.Type.CONSTRUCTOR + # For constructors, try to infer member_of from the return type + # e.g. func NewAgent() *Agent -> member_of = Agent + original_returns, _ = self._extract_return_types(node) + if original_returns: + # Typically the first return value is the struct + ret_type = original_returns[0] + # access the struct name, e.g. *Agent -> Agent, + # mypkg.Agent -> Agent + # Similar logic to parameter flattening type extraction + clean_ret = ret_type.lstrip("*").split(".")[-1] + if clean_ret: + member_of = clean_ret + normalized_member_of = normalize_name(member_of) + elif node.type == "method_elem": + feature_type = feature_pb2.Feature.Type.INSTANCE_METHOD + member_of = self._extract_interface_name(node) + normalized_member_of = ( + normalize_name(member_of) if member_of else "" + ) - parameters = self._extract_params(node) + parameters, is_async = self._extract_params(node) original_returns, normalized_returns = self._extract_return_types(node) + docstring = self._extract_docstring(node) + feature = feature_pb2.Feature( original_name=original_name, normalized_name=normalize_name(original_name), @@ -74,9 +103,94 @@ def process( original_return_types=original_returns, normalized_return_types=normalized_returns, ) + if is_async: + setattr(feature, "async", True) + + if docstring: + feature.description = docstring + + if docstring: + feature.description = docstring return feature + def register_struct(self, node: Node) -> None: + """Register a struct definition to allow parameter flattening.""" + # Find struct name from parent type_spec + parent = node.parent + # The 
query capture is on: + # (type_spec name: ... type: (struct_type) @struct_body) + # So node is the struct_type node. Parent should be type_spec. + if not parent or parent.type != "type_spec": + return + + name_node = parent.child_by_field_name("name") + if not name_node: + return + + struct_name = name_node.text.decode("utf-8") + + # Parse fields + fields = [] + + # Iterating children to find field_declaration_list because + # child_by_field_name might be failing or the field name is different + # in this version of tree-sitter-go + field_list = None + for child in node.children: + if child.type == "field_declaration_list": + field_list = child + break + + if field_list: + for child in field_list.children: + if child.type == "field_declaration": + # Handle multiple names for same type e.g. A, B int + type_node = child.child_by_field_name("type") + if not type_node: + continue + + type_str = type_node.text.decode("utf-8") + + # Determine if optional + is_optional = False + if type_node.type == "pointer_type": + is_optional = True + + # field_declaration children names + # Loop through children to find all field_identifier nodes + field_names = [] + for subchild in child.children: + if subchild.type == "field_identifier": + field_names.append(subchild.text.decode("utf-8")) + + for fname in field_names: + fields.append((fname, type_str, is_optional)) + + self._struct_definitions[struct_name] = fields + + def _extract_docstring(self, node: Node) -> str: + """Extract comments immediately preceding the declaration.""" + comments = [] + prev = node.prev_sibling + while prev and prev.type == "comment": + clean_comment = prev.text.decode("utf-8").lstrip("//").strip() + comments.insert(0, clean_comment) + prev = prev.prev_sibling + return "\n".join(comments) + + def _extract_interface_name(self, node: Node) -> str: + """Walk up the AST from a method_spec to find the interface type name. 
+ """ + parent = node.parent + while parent: + if parent.type == "type_spec": + name_node = parent.child_by_field_name("name") + if name_node: + return name_node.text.decode("utf-8") + parent = parent.parent + return "" + def _extract_receiver_type(self, node: Node) -> str: """Extract the receiver type from a method_declaration.""" receiver_node = node.child_by_field_name("receiver") @@ -131,13 +245,16 @@ def _extract_return_types( return original_returns, normalized_returns - def _extract_params(self, node: Node) -> list[feature_pb2.Param]: + def _extract_params( + self, node: Node + ) -> tuple[list[feature_pb2.Param], bool]: """Extract parameters from a function_declaration node.""" params = [] params_node = node.child_by_field_name("parameters") if not params_node: - return [] + return [], False + is_async = False for child in params_node.children: if child.type == "parameter_declaration": name_node = child.child_by_field_name("name") @@ -150,19 +267,54 @@ def _extract_params(self, node: Node) -> list[feature_pb2.Param]: # Skip Go context.Context parameters to align with other # languages if param_type == "context.Context": + is_async = True continue - norm_types = self.normalizer.normalize(param_type, "go") - norm_enums = [getattr(feature_pb2, nt) for nt in norm_types] - - p = feature_pb2.Param( - original_name=param_name, - normalized_name=normalize_name(param_name), - original_types=[param_type], - normalized_types=norm_enums, - ) - params.append(p) - return params + # Check if this parameter type should be flattened + # We strip pointer and module prefix to find the struct name + # e.g. 
*Config -> Config, mypkg.Config -> Config + # Simple heuristic: take the last part after dot, strip * + clean_type_name = param_type.lstrip("*").split(".")[-1] + + if clean_type_name in self._struct_definitions: + # FLATTEN: Add all fields of the struct as parameters + for ( + field_name, + field_type, + is_optional, + ) in self._struct_definitions[clean_type_name]: + # Recursively normalize the field type + norm_types = self.normalizer.normalize( + field_type, "go" + ) + norm_enums = [ + getattr(feature_pb2, nt) for nt in norm_types + ] + + p = feature_pb2.Param( + original_name=field_name, + normalized_name=normalize_name(field_name), + original_types=[field_type], + normalized_types=norm_enums, + ) + if is_optional: + p.is_optional = True + params.append(p) + else: + # Normal processing + norm_types = self.normalizer.normalize(param_type, "go") + norm_enums = [ + getattr(feature_pb2, nt) for nt in norm_types + ] + + p = feature_pb2.Param( + original_name=param_name, + normalized_name=normalize_name(param_name), + original_types=[param_type], + normalized_types=norm_enums, + ) + params.append(p) + return params, is_async def _extract_name(self, node: Node) -> str: """Extract the name from a function_declaration node.""" diff --git a/src/google/adk/scope/extractors/converter_py.py b/src/google/adk/scope/extractors/converter_py.py index cc3c540..9c1ba9b 100644 --- a/src/google/adk/scope/extractors/converter_py.py +++ b/src/google/adk/scope/extractors/converter_py.py @@ -55,6 +55,15 @@ def process( # 2. 
Context member_of, normalized_member_of = self._extract_member_of(node) + # If the member belongs to a private class, skip it + if member_of and member_of.startswith("_"): + logger.debug( + "Skipping method %s of private class %s", + original_name, + member_of, + ) + return None + feature_type = self._determine_type( node, original_name, bool(member_of) ) @@ -142,6 +151,10 @@ def _extract_name(self, node: Node) -> str: def _process_dataclass( self, node: Node, file_path: Path, repo_root: Path ) -> Optional[feature_pb2.Feature]: + original_name = self._extract_name(node) + if not original_name or original_name.startswith("_"): + return None + body_node = node.child_by_field_name("body") if not body_node: return None @@ -201,7 +214,6 @@ def _process_dataclass( if has_init or not params: return None - original_name = self._extract_name(node) normalized_name = normalize_name(original_name) namespace, normalized_namespace = self._extract_namespace( file_path, repo_root diff --git a/src/google/adk/scope/extractors/extractor_go.py b/src/google/adk/scope/extractors/extractor_go.py index 14848e0..57d2377 100644 --- a/src/google/adk/scope/extractors/extractor_go.py +++ b/src/google/adk/scope/extractors/extractor_go.py @@ -28,9 +28,13 @@ def find_files( iterator = root.rglob("*.go") if recursive else root.glob("*.go") for path in iterator: - # Check if any part of the path starts with '.' (excluding '.' and '..') + if path.name.endswith("_test.go"): + continue + + # Exclude hidden directories, files, and common testing directories if any( - part.startswith(".") and part not in (".", "..") + (part.startswith(".") and part not in (".", "..")) + or part in ("tests", "testutil", "testing", "testdata") for part in path.parts ): continue @@ -66,6 +70,11 @@ def extract_features( return [] processor = NodeProcessor() + + # Pre-process structs to build the definition map + # We need to re-query or process struct nodes specifically. 
+ # To keep it simple, let's just use the query we have. + pass features = [] # REVISED QUERY: Matches the declaration nodes. @@ -73,43 +82,91 @@ def extract_features( query_text = """ (function_declaration) @func (method_declaration) @method + (type_declaration + (type_spec + name: (type_identifier) @interface_name + type: (interface_type + (method_elem) @interface_method + ) + ) + ) + (type_declaration + (type_spec + name: (type_identifier) @struct_name + type: (struct_type) @struct_body + ) + ) """ query = Query(GO_LANGUAGE, query_text) cursor = QueryCursor(query) captures = cursor.captures(root_node) all_nodes = [] - for node_list in captures.values(): - all_nodes.extend(node_list) + struct_nodes = [] + # We only want to process the actual function/method nodes, not the + # interface names which are captured just for context by the processor + # (via tree traversal). + for capture_name, node_list in captures.items(): + if capture_name in ("func", "method", "interface_method"): + all_nodes.extend(node_list) + elif capture_name == "struct_body": + # We need to associate the struct body with its name. + # The query captures @struct_name and @struct_body separately but + # in order. + # However, 'captures' is a dict of lists, so order might be tricky + # if we rely on index alignment across lists. + # Better strategy: Capture the parent type_spec and process it? + # Or iterate the captures list (which we can't easily do with the + # dict output). + # Let's rely on NodeProcessor to find the name from the struct_body + # node's parent. + struct_nodes.extend(node_list) # Log results for debugging logger.debug("Found %d potential nodes in %s", len(all_nodes), file_path) + # Build struct definitions map first + for node in struct_nodes: + processor.register_struct(node) + for node in all_nodes: - # Filter out simple functions (e.g., getters, setters) by checking - # the body. Note: In Go AST, the function 'body' is a 'block' which - # contains a 'statement_list'. 
We need to check the size of the - # 'statement_list' to know the actual number of statements. - body_node = node.child_by_field_name("body") - if body_node: - stmt_list = next( - ( - child - for child in body_node.children - if child.type == "statement_list" - ), - None, - ) - # If there is no statement list, or it has 1 or fewer statements, - # consider it simple. - if stmt_list is None or stmt_list.named_child_count <= 1: - function_name_node = node.child_by_field_name("name") - if function_name_node: - logger.debug( - "Skipping simple function: %s", - function_name_node.text.decode("utf8"), - ) - continue + # Prevent filtering out abstract interface methods which have no body + if node.type == "method_elem": + pass + else: + # Filter out simple functions (e.g., getters, setters) by checking + # the body. Note: In Go AST, the function 'body' is a 'block' which + # contains a 'statement_list'. We need to check the size of the + # 'statement_list' to know the actual number of statements. + body_node = node.child_by_field_name("body") + if body_node: + stmt_list = next( + ( + child + for child in body_node.children + if child.type == "statement_list" + ), + None, + ) + # If there is no statement list, or it has 1 or fewer + # statements, consider it simple. + if stmt_list is None or stmt_list.named_child_count <= 1: + # Also check physical line span to prevent skipping large + # single-statement functions (e.g. methods returning a large + # anonymous function). 
+ start_row = body_node.start_point[0] + end_row = body_node.end_point[0] + line_span = end_row - start_row + 1 + + if line_span <= 4: + function_name_node = node.child_by_field_name("name") + if function_name_node: + logger.debug( + "Skipping simple function: %s (span: %d lines)", + function_name_node.text.decode("utf8"), + line_span, + ) + continue # Prepare namespace and normalized namespace try: @@ -146,7 +203,21 @@ def extract_features( def get_version(repo_root: pathlib.Path) -> str: - """Get the module path from a go.mod file.""" + """Get the version of the ADK from internal/version/version.go.""" + version_path = repo_root / "internal" / "version" / "version.go" + if version_path.exists(): + try: + content = version_path.read_text() + for line in content.splitlines(): + if "const Version string =" in line: + # e.g., const Version string = "0.3.0" + parts = line.split('"') + if len(parts) >= 3: + return parts[1] + except Exception as e: + logger.warning("Failed to read version.go file: %s", e) + + # Fallback to reading go.mod module path if version isn't found go_mod_path = repo_root / "go.mod" if go_mod_path.exists(): try: @@ -156,4 +227,5 @@ def get_version(repo_root: pathlib.Path) -> str: return line.split()[1] except Exception as e: logger.warning("Failed to read go.mod file: %s", e) + return "" diff --git a/src/google/adk/scope/matcher/__init__.py b/src/google/adk/scope/matcher/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/google/adk/scope/matcher/matcher.py b/src/google/adk/scope/matcher/matcher.py deleted file mode 100644 index 40ba4c1..0000000 --- a/src/google/adk/scope/matcher/matcher.py +++ /dev/null @@ -1,285 +0,0 @@ -from collections import defaultdict -from typing import Dict, List, Tuple - -import numpy as np -from jellyfish import jaro_winkler_similarity -from scipy.optimize import linear_sum_assignment - -from google.adk.scope import features_pb2 -from google.adk.scope.utils import stats -from 
google.adk.scope.utils.similarity import SimilarityScorer - -_NEAR_MISS_THRESHOLD = 0.15 - - -def _format_feature(f: features_pb2.Feature) -> str: - name = f.original_name or f.normalized_name - member = f.member_of - if member and member.lower() != "null": - return f"{member}.{name}" - return name - - -def get_type_display_name(f: features_pb2.Feature) -> str: - FeatureType = features_pb2.Feature.Type - if f.type == FeatureType.CONSTRUCTOR: - return "constructor" - elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): - return "function" - elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: - return "method" - else: - return "unknown" - - -def _get_type_priority(f: features_pb2.Feature) -> int: - """Returns priority: constructor < function < method < unknown.""" - type_name = get_type_display_name(f) - priorities = { - "constructor": 0, - "function": 1, - "method": 2, - "unknown": 3, - } - return priorities.get(type_name, 99) - - -def match_features( - base_features: List[features_pb2.Feature], - target_features: List[features_pb2.Feature], - alpha: float, -) -> List[Tuple[features_pb2.Feature, features_pb2.Feature, float]]: - """Matches features between two lists using Hungarian algorithm.""" - if not base_features or not target_features: - return [] - - scorer = SimilarityScorer(alpha=alpha) - matches = [] - - # Build Cost Matrix (Rows=Base, Cols=Target) - n_base = len(base_features) - n_target = len(target_features) - similarity_matrix = np.zeros((n_base, n_target)) - - for i, f1 in enumerate(base_features): - for j, f2 in enumerate(target_features): - similarity_matrix[i, j] = scorer.get_similarity_score(f1, f2) - - # Run Hungarian Algorithm (Global Optimization) - row_ind, col_ind = linear_sum_assignment(similarity_matrix, maximize=True) - - matched_base_indices = set() - matched_target_indices = set() - - # Filter Optimal Assignments by Alpha Threshold - for r, c in zip(row_ind, col_ind): - score = similarity_matrix[r, c] - if score > alpha: 
- matches.append((base_features[r], target_features[c], score)) - matched_base_indices.add(r) - matched_target_indices.add(c) - - # Update the input lists in-place (Remove matched items) - base_features[:] = [ - f for i, f in enumerate(base_features) if i not in matched_base_indices - ] - target_features[:] = [ - f - for i, f in enumerate(target_features) - if i not in matched_target_indices - ] - - return matches - - -def fuzzy_match_namespaces( - features_base: Dict[str, List[features_pb2.Feature]], - features_target: Dict[str, List[features_pb2.Feature]], -) -> None: - """Remaps target namespaces to base namespaces using fuzzy matching.""" - - base_namespaces = sorted(list(features_base.keys())) - remapped_features = defaultdict(list, {k: [] for k in features_base}) - - for t_ns, features in features_target.items(): - if t_ns in base_namespaces: - remapped_features[t_ns].extend(features) - continue - - if not base_namespaces: - # No base to match against, so keep original target namespace - remapped_features[t_ns].extend(features) - continue - - best_match, best_score = max( - ( - (b_ns, jaro_winkler_similarity(t_ns, b_ns)) - for b_ns in base_namespaces - ), - key=lambda item: item[1], - default=(None, 0.0), - ) - - if best_score > 0.8 and best_match: - remapped_features[best_match].extend(features) - - features_target.clear() - features_target.update(remapped_features) - - -def process_module( - module: str, - base_list: List[features_pb2.Feature], - target_list: List[features_pb2.Feature], - alpha: float, - report_type: str, - base_lang_code: str, - target_lang_code: str, -) -> Dict: - """Analyzes a single module and generates its report content.""" - mod_base_count = len(base_list) - mod_target_count = len(target_list) - - solid_matches = match_features(base_list, target_list, alpha) - mod_solid_count = len(solid_matches) - - beta = max(0.0, alpha - _NEAR_MISS_THRESHOLD) - potential_matches = match_features(base_list, target_list, beta) - - unmatched_base = 
base_list - unmatched_target = target_list - - if report_type == "symmetric": - union_size = mod_base_count + mod_target_count - mod_solid_count - mod_score = mod_solid_count / union_size if union_size > 0 else 1.0 - else: # directional - precision = stats.calculate_precision(mod_solid_count, mod_target_count) - recall = stats.calculate_recall(mod_solid_count, mod_base_count) - mod_score = stats.calculate_f1(precision, recall) - - status_icon = ( - "✅" if mod_score == 1.0 else "⚠️" if mod_score >= 0.8 else "❌" - ) - module_safe_name = module.replace(".", "_") - module_filename = f"{module_safe_name}.md" - - details_link = f"[View Details]({{modules_dir}}/{module_filename})" - if report_type == "symmetric": - adk_parts = [] - if mod_base_count > 0: - adk_parts.append(base_lang_code) - if mod_target_count > 0: - adk_parts.append(target_lang_code) - adk_value = ", ".join(adk_parts) - row_content = ( - f"| {adk_value} | `{module}` | {mod_base_count} | {mod_score:.2%} |" - f" {status_icon} | {details_link} |" - ) - else: - row_content = ( - f"| `{module}` | {mod_base_count} | {mod_score:.2%} | {status_icon}" - f" | {details_link} |" - ) - - # Module Content - mod_lines = [ - f"# Module: `{module}`", - "[⬅️ Back to Master Report](../{master_report})", - "", - f"**Score:** {mod_score:.2%} ({status_icon})", - ] - if report_type == "directional": - mod_lines.extend( - [ - "\n| Metric | Score |", - "|---|---|", - f"| **Precision** | {precision:.2%} |", - f"| **Recall** | {recall:.2%} |", - ] - ) - - mod_total_features = ( - (mod_base_count + mod_target_count - mod_solid_count) - if report_type == "symmetric" - else mod_base_count - ) - mod_lines.extend(["", f"**Features:** {mod_total_features}", ""]) - - solid_matches.sort( - key=lambda x: (_get_type_priority(x[0]), x[0].normalized_name) - ) - potential_matches.sort( - key=lambda x: (_get_type_priority(x[0]), x[0].normalized_name) - ) - - if solid_matches: - mod_lines.append( - f"### ✅ {'Solid' if report_type == 'symmetric' 
else 'Matched'}" - " Features" - ) - mod_lines.extend( - [ - "| Type | Base Feature | Target Feature | Similarity Score |", - "|---|---|---|---|", - ] - ) - mod_lines.extend( - [ - f"| {get_type_display_name(f_base)} |" - f" `{_format_feature(f_base)}`" - f" | `{_format_feature(f_target)}` | {score:.2f} |" - for f_base, f_target, score in solid_matches - ] - ) - mod_lines.append("") - - if potential_matches: - mod_lines.extend( - [ - "### ⚠️ Potential Matches", - "| Type | Base Feature | Closest Target Candidate" - " | Similarity |", - "|---|---|---|---|", - ] - ) - mod_lines.extend( - [ - f"| {get_type_display_name(f_base)} |" - f" `{_format_feature(f_base)}`" - f" | `{_format_feature(f_target)}` | {score:.2f} |" - for f_base, f_target, score in potential_matches - ] - ) - mod_lines.append("") - - if report_type == "symmetric" and (unmatched_base or unmatched_target): - mod_lines.extend( - [ - "### ❌ Unmatched Features", - "\n| Missing Feature | Missing In |", - "|---|---|", - ] - ) - mod_lines.extend( - [f"| `{_format_feature(f)}` | Target |" for f in unmatched_base] - ) - mod_lines.extend( - [f"| `{_format_feature(f)}` | Base |" for f in unmatched_target] - ) - mod_lines.append("") - elif report_type == "directional" and unmatched_base: - mod_lines.extend( - ["### ❌ Missing in Target", "| Missing Feature |", "|---|"] - ) - mod_lines.extend( - [f"| `{_format_feature(f)}` |" for f in unmatched_base] - ) - mod_lines.append("") - - return { - "solid_matches_count": mod_solid_count, - "score": mod_score, - "row_content": row_content, - "module_filename": module_filename, - "module_content": "\n".join(mod_lines).strip(), - } diff --git a/src/google/adk/scope/reporter/markdown.py b/src/google/adk/scope/reporter/markdown.py new file mode 100644 index 0000000..11f6cae --- /dev/null +++ b/src/google/adk/scope/reporter/markdown.py @@ -0,0 +1,282 @@ +import dataclasses +from datetime import datetime +from typing import Dict + +import pandas as pd + +from google.adk.scope 
import features_pb2 + + +@dataclasses.dataclass +class MarkdownReport: + main_report_content: str + module_reports: Dict[str, str] # filename -> content + + +def _get_language_code(language_name: str) -> str: + """Returns a short code for the language.""" + name = language_name.upper() + if name in {"PYTHON", "PY"}: + return "py" + elif name in {"TYPESCRIPT", "TS"}: + return "ts" + elif name == "JAVA": + return "java" + elif name in {"GOLANG", "GO"}: + return "go" + else: + return name.lower() + + +def _get_language_name(language_name: str) -> str: + """Returns a properly capitalized display name for the language.""" + name = language_name.upper() + if name in {"PYTHON", "PY"}: + return "Python" + elif name in {"TYPESCRIPT", "TS"}: + return "TypeScript" + elif name == "JAVA": + return "Java" + elif name in {"GOLANG", "GO"}: + return "Go" + else: + return language_name.title() + + +class MarkdownReportGenerator: + def __init__( + self, + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + df: pd.DataFrame, + ): + self.base_registry = base_registry + self.target_registry = target_registry + self.df = df + + self.base_code = _get_language_code(base_registry.language) + self.target_code = _get_language_code(target_registry.language) + self.base_name = _get_language_name(base_registry.language) + self.target_name = _get_language_name(target_registry.language) + + def generate(self) -> MarkdownReport: + """Generates a Markdown parity report from the DataFrame.""" + master_lines = [] + master_lines.extend( + [ + "# Feature Matching Parity Report", + f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + "", + "| Role | Language | Version |", + "| :--- | :--- | :--- |", + ( + f"| **Base** | {self.base_registry.language} |" + f" {self.base_registry.version} |" + ), + ( + f"| **Target** | {self.target_registry.language} |" + f" {self.target_registry.version} |" + ), + "", + ] + ) + + global_score_idx = len(master_lines) + 
master_lines.append("GLOBAL_SCORE_PLACEHOLDER") + master_lines.append("") + + header = ( + f"| Module | Features ({self.base_name}) | Score | Status | " + f"Details |" + ) + divider = "|---|---|---|---|---|" + + master_lines.extend(["## Module Summary", header, divider]) + + module_reports = {} + module_rows = [] + + # Determine cols based on language codes + col_ns = f"{self.base_code}_namespace" + + # Group by base namespace + # If namespace is empty, group under "Unknown Module" + self.df["_module_group"] = self.df[col_ns].replace("", "Unknown Module") + + grouped = self.df.groupby("_module_group") + + total_high = 0 + total_low = 0 + total_base_features = len(self.df) + + for module, group in grouped: + # Calculate module stats + mismatches = len(group[group["match"] == "false"]) + + # Actually, `high` and `low` confidence applies to matches usually + # But let's verify what `match` column says. + matches_high = len( + group[ + (group["match"] == "true") & (group["confidence"] == "high") + ] + ) + matches_low = len( + group[ + (group["match"] == "true") & (group["confidence"] == "low") + ] + ) + # Everything else is a mismatch or low confidence match? 
+ # Let's trust `match` column for parity score + solid_matches_count = len(group[group["match"] == "true"]) + + total_high += matches_high + total_low += matches_low + + module_total = len(group) + score = ( + solid_matches_count / module_total if module_total > 0 else 0.0 + ) + + # Generate Module File Content + module_filename = f"{module}.md" + module_content = self._generate_module_content( + module, + group, + module_total, + matches_high, + matches_low, + mismatches, + ) + module_reports[module_filename] = module_content + + # Add summary row + status_icon = "✅" if score == 1.0 else "⚠️" if score > 0.5 else "❌" + row_str = ( + f"| `{module}` | {module_total} | " + f"{score:.2%} | {status_icon} | " + f"[View Details]({{modules_dir}}/{module_filename}) |" + ) + module_rows.append((score, row_str)) + + module_rows.sort(key=lambda x: x[0], reverse=True) + master_lines.extend([row for _, row in module_rows]) + + # Summary Stats + total_matches = total_high + total_low + parity_score = ( + total_matches / total_base_features + if total_base_features > 0 + else 1.0 + ) + + base_exclusive = total_base_features - total_matches + + global_stats = ( + "## Summary\n\n" + "| Feature Category | Count | Details |\n" + "| :--- | :--- | :--- |\n" + f"| **✅ High Confidence Matches** | **{total_high}** | " + f"Strong matches found in `{self.target_name}` |\n" + f"| **⚠️ Low Confidence Matches** | **{total_low}** | " + f"Likely matches needing verification |\n" + f"| **❌ Mismatches** | **{base_exclusive}** | " + f"No suitable match found in `{self.target_name}` |\n" + f"| **📊 Coverage Score** | **{parity_score:.2%}** | " + f"Matches / Total Base Features ({total_matches} / " + f"{total_base_features}) |" + ) + + master_lines[global_score_idx] = global_stats + + return MarkdownReport( + main_report_content="\n".join(master_lines).strip(), + module_reports=module_reports, + ) + + def _generate_module_content( + self, + module: str, + group: pd.DataFrame, + total_features: int, + 
high_conf: int, + low_conf: int, + mismatches: int, + ) -> str: + + # Calculate scores for summary + total_matches = high_conf + low_conf + coverage = total_matches / total_features if total_features > 0 else 0.0 + + summary_table = ( + "## Summary\n\n" + "| Feature Category | Count | Details |\n" + "| :--- | :--- | :--- |\n" + f"| **✅ High Confidence Matches** | **{high_conf}** | " + f"Strong matches found in `{self.target_name}` |\n" + f"| **⚠️ Low Confidence Matches** | **{low_conf}** | " + f"Likely matches needing verification |\n" + f"| **❌ Mismatches** | **{mismatches}** | " + f"No suitable match found in `{self.target_name}` |\n" + f"| **📊 Coverage Score** | **{coverage:.2%}** | " + f"Matches / Total Base Features ({total_matches} / " + f"{total_features}) |\n" + ) + + lines = [ + f"# Module: `{module}`", + "", + "[← Back to Master Report]({master_report})", + "", + summary_table, + "## Feature Details", + "", + f"| Module ({self.base_name}) | Container ({self.base_name}) | " + f"Name ({self.base_name}) | Module ({self.target_name}) | " + f"Container ({self.target_name}) | Name ({self.target_name}) | " + "Score | Match | Confidence |", + "| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :---: | " + ":---: |", + ] + + # Sort by score desc, then name + group_sorted = group.sort_values( + by=["score", f"{self.base_code}_name"], ascending=[False, True] + ) + + for _, row in group_sorted.iterrows(): + # Base logic + b_ns = row[f"{self.base_code}_namespace"] + b_mem = row[f"{self.base_code}_member_of"] + b_name = row[f"{self.base_code}_name"] + + # Target logic + t_ns = row[f"{self.target_code}_namespace"] + t_mem = row[f"{self.target_code}_member_of"] + t_name = row[f"{self.target_code}_name"] + + if t_name == "" and t_mem == "" and t_ns == "": + t_name = "*(None)*" + + score = row["score"] + match_val = row["match"] + conf_val = row["confidence"] + + if match_val == "true": + if conf_val == "high": + match_icon = "✅" + else: + match_icon = "⚠️" + else: + 
match_icon = "❌" + + conf_display = conf_val.title() + if conf_display == "High": + conf_display = "**High**" + + lines.append( + f"| `{b_ns}` | `{b_mem}` | `{b_name}` | " + f"`{t_ns}` | `{t_mem}` | `{t_name}` | " + f"{score:.4f} | {match_icon} | {conf_display} |" + ) + + return "\n".join(lines) diff --git a/src/google/adk/scope/reporter/raw.py b/src/google/adk/scope/reporter/raw.py new file mode 100644 index 0000000..a32b47d --- /dev/null +++ b/src/google/adk/scope/reporter/raw.py @@ -0,0 +1,188 @@ +from collections import defaultdict +from typing import Any, Dict, List, Optional, Tuple + +import pandas as pd + +from google.adk.scope import features_pb2 +from google.adk.scope.utils.similarity import SimilarityScorer + +# Global thresholds for match confidence +SIMILARITY_THRESHOLDS = { + frozenset(["py", "go"]): {"high": 0.6, "avg": 0.5}, + frozenset(["py", "java"]): {"high": 0.6, "avg": 0.58}, + frozenset(["py", "ts"]): {"high": 0.7, "avg": 0.55}, +} + +# Fallback thresholds if language pair not explicitly defined +DEFAULT_THRESHOLDS = {"high": 0.8, "avg": 0.6} + + +def get_type_display_name(f: features_pb2.Feature) -> str: + FeatureType = features_pb2.Feature.Type + if f.type == FeatureType.CONSTRUCTOR: + return "constructor" + elif f.type in (FeatureType.FUNCTION, FeatureType.CLASS_METHOD): + return "function" + elif f.type == features_pb2.Feature.Type.INSTANCE_METHOD: + return "method" + else: + return "unknown" + + +def _get_lang_code(language: str) -> str: + """Returns a short code for the language (e.g. 
PYTHON -> py).""" + name = language.upper() + if name in {"PYTHON", "PY"}: + return "py" + elif name in {"TYPESCRIPT", "TS"}: + return "ts" + elif name == "JAVA": + return "java" + elif name in {"GOLANG", "GO"}: + return "go" + return name.lower() + + +class RawReportGenerator: + def __init__( + self, + base_registry: features_pb2.FeatureRegistry, + target_registry: features_pb2.FeatureRegistry, + ): + self.base_registry = base_registry + self.target_registry = target_registry + self.scorer = SimilarityScorer() + + # Pre-compute useful attributes + self.base_code = _get_lang_code(self.base_registry.language) + self.target_code = _get_lang_code(self.target_registry.language) + self.thresholds = SIMILARITY_THRESHOLDS.get( + frozenset([self.base_code, self.target_code]), + DEFAULT_THRESHOLDS, + ) + + # Index target features by type + self.target_by_type = defaultdict(list) + for f in self.target_registry.features: + self.target_by_type[f.type].append(f) + + def generate(self, output_path: Optional[str] = None) -> pd.DataFrame: + """Generates the raw report DataFrame and optionally saves it to CSV.""" + rows = [] + for f_base in self.base_registry.features: + best_match, best_score = self._find_best_match(f_base) + row = self._create_row_data(f_base, best_match, best_score) + rows.append(row) + + df = self._create_dataframe(rows) + + if output_path: + self._save_csv(df, output_path) + + return df + + def _find_best_match( + self, f_base: features_pb2.Feature + ) -> Tuple[Optional[features_pb2.Feature], float]: + """Finds the best matching feature in the target registry.""" + candidates = self.target_by_type.get(f_base.type, []) + if not candidates: + return None, 0.0 + + best_match = None + best_score = -1.0 + + for f_target in candidates: + score = self.scorer.get_similarity_score(f_base, f_target) + if score > best_score: + best_score = score + best_match = f_target + + return best_match, best_score + + def _create_row_data( + self, + f_base: features_pb2.Feature, + 
f_target: Optional[features_pb2.Feature], + score: float, + ) -> Dict[str, Any]: + """Constructs a dictionary representing a single row in the report.""" + row: Dict[str, Any] = {} + + # Base columns + self._fill_feature_cols(row, f_base, self.base_code) + + # Target columns + if f_target: + self._fill_feature_cols(row, f_target, self.target_code) + else: + self._fill_empty_cols(row, self.target_code) + + # Metadata + row["type"] = get_type_display_name(f_base) + row["score"] = score + + # Match status + match_str, confidence_str = self._determine_match_status(score) + row["match"] = match_str + row["confidence"] = confidence_str + + return row + + def _fill_feature_cols( + self, row: Dict[str, Any], f: features_pb2.Feature, prefix: str + ): + """Populates namespace, member_of, and name columns for a feature.""" + ns = f.namespace or f.normalized_namespace or "" + mem = f.member_of or f.normalized_member_of or "" + if str(mem).lower() == "null": + mem = "" + name = f.original_name or f.normalized_name or "" + + row[f"{prefix}_namespace"] = ns + row[f"{prefix}_member_of"] = mem + row[f"{prefix}_name"] = name + + def _fill_empty_cols(self, row: Dict[str, Any], prefix: str): + """Fills feature columns with empty strings.""" + row[f"{prefix}_namespace"] = "" + row[f"{prefix}_member_of"] = "" + row[f"{prefix}_name"] = "" + + def _determine_match_status(self, score: float) -> Tuple[str, str]: + """Determines match (true/false) and confidence (high/low).""" + if score > self.thresholds["high"]: + return "true", "high" + elif score >= self.thresholds["avg"]: + return "true", "low" + else: + return "false", "high" + + def _create_dataframe(self, rows: List[Dict[str, Any]]) -> pd.DataFrame: + """Creates and formats the pandas DataFrame.""" + cols_order = [ + f"{self.base_code}_namespace", + f"{self.base_code}_member_of", + f"{self.base_code}_name", + f"{self.target_code}_namespace", + f"{self.target_code}_member_of", + f"{self.target_code}_name", + "type", + "score", + 
"match", + "confidence", + ] + + if not rows: + return pd.DataFrame(columns=cols_order) + + df = pd.DataFrame(rows) + # Ensure correct column order and fill missing + return df.reindex(columns=cols_order, fill_value="") + + def _save_csv(self, df: pd.DataFrame, output_path: str): + """Saves DataFrame to CSV, creating directories if needed.""" + from pathlib import Path + + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + df.to_csv(output_path, index=False) diff --git a/src/google/adk/scope/reporter/reporter.py b/src/google/adk/scope/reporter/reporter.py index 4044a8b..b89864d 100644 --- a/src/google/adk/scope/reporter/reporter.py +++ b/src/google/adk/scope/reporter/reporter.py @@ -1,52 +1,14 @@ import argparse -import dataclasses import logging import sys -from collections import defaultdict -from datetime import datetime from pathlib import Path -from typing import Dict, List +from typing import List, Optional from google.protobuf import text_format from google.adk.scope import features_pb2 -from google.adk.scope.matcher import matcher +from google.adk.scope.reporter import markdown, raw from google.adk.scope.utils import args as adk_args -from google.adk.scope.utils import stats - -_NEAR_MISS_THRESHOLD = 0.15 - - -@dataclasses.dataclass -class MatchResult: - master_content: str - module_files: Dict[str, str] # filename -> content - - -def _group_features_by_module( - registry: features_pb2.FeatureRegistry, -) -> Dict[str, List[features_pb2.Feature]]: - """Groups features by their module.""" - features = defaultdict(list) - for f in registry.features: - key = f.normalized_namespace or f.namespace or "Unknown Module" - features[key].append(f) - return features - - -def _get_language_code(language_name: str) -> str: - """Returns a short code for the language.""" - name = language_name.upper() - if name == {"PYTHON", "PY"}: - return "py" - elif name in {"TYPESCRIPT", "TS"}: - return "ts" - elif name == "JAVA": - return "java" - elif name in {"GOLANG", 
"GO"}: - return "go" - else: - return name.lower() def _read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: @@ -57,320 +19,71 @@ def _read_feature_registry(file_path: str) -> features_pb2.FeatureRegistry: return registry -def match_registries( - base_registry: features_pb2.FeatureRegistry, - target_registry: features_pb2.FeatureRegistry, - alpha: float, - report_type: str = "symmetric", -) -> MatchResult: - """Matches features and generates a master report + module sub-reports.""" - reporter = ReportGenerator( - base_registry, - target_registry, - alpha, +def generate_markdown_raw_reports( + registries: List[features_pb2.FeatureRegistry], + report_type: str = "md", # Kept for backward compatibility + output_path: Optional[Path] = None, +) -> markdown.MarkdownReport: + """Matches features and generates reports.""" + # New unified flow for standard reports + generator = raw.RawReportGenerator(registries[0], registries[1]) + + # Generate DataFrame (and CSV if path provided) + csv_path = None + if output_path: + # If output is "report.md", csv will be "report.csv" + # If output is "report.csv", md will be "report.md" + stem = output_path.stem + parent = output_path.parent + csv_path = str(parent / f"{stem}.csv") + + df = generator.generate(output_path=csv_path) + + # Generate Markdown Report from DataFrame + reporter = markdown.MarkdownReportGenerator( + registries[0], registries[1], df ) + result = reporter.generate() + if result.module_reports: + modules_dir_name = f"{output_path.stem}_modules" + modules_dir = output_path.parent / modules_dir_name + modules_dir.mkdir(parents=True, exist_ok=True) - return reporter.generate_report(report_type) - - -class ReportGenerator: - def __init__( - self, - base_registry: features_pb2.FeatureRegistry, - target_registry: features_pb2.FeatureRegistry, - alpha: float, - ): - self.base_registry = base_registry - self.target_registry = target_registry - - self.features_base = _group_features_by_module(base_registry) 
- self.features_target = _group_features_by_module(target_registry) - matcher.fuzzy_match_namespaces(self.features_base, self.features_target) - self.alpha = alpha - - def generate_report(self, report_type) -> MatchResult: - """Generates report.""" - if report_type == "raw": - return self.generate_raw_report() - elif report_type == "directional": - return self.generate_directional_report() - elif report_type == "symmetric": - return self.generate_symmetric_report() - else: - raise ValueError(f"Unknown report type: {report_type}") - - def generate_raw_report(self) -> MatchResult: - """Generates a raw CSV report.""" - base_code = _get_language_code(self.base_registry.language) - target_code = _get_language_code(self.target_registry.language) - all_modules = sorted( - set(self.features_base.keys()) | set(self.features_target.keys()) - ) - csv_header = ( - f"{base_code}_namespace,{base_code}_member_of,{base_code}_name," - f"{target_code}_namespace,{target_code}_member_of,{target_code}_name," - "type,score" - ) - csv_lines = [csv_header] - - def get_feature_cols(f: features_pb2.Feature) -> tuple[str, str, str]: - ns = f.namespace or "" - if not ns and f.normalized_namespace: - ns = f.normalized_namespace - - mem = f.member_of or "" - if not mem and f.normalized_member_of: - mem = f.normalized_member_of - if mem.lower() == "null": - mem = "" - - name = f.original_name or f.normalized_name or "" - return ns, mem, name - - def esc_csv(s): - if s is None: - return "" - if "," in s or '"' in s or "\n" in s: - return '"{}"'.format(s.replace('"', '""')) - return s - - for module in all_modules: - base_list = self.features_base.get(module, []) - target_list = self.features_target.get(module, []) - - solid_matches = matcher.match_features( - base_list, target_list, self.alpha - ) - beta = max(0.0, self.alpha - _NEAR_MISS_THRESHOLD) - potential_matches = matcher.match_features( - base_list, target_list, beta - ) - - unmatched_base = list(base_list) - unmatched_target = 
list(target_list) - - for f_base, f_target, score in solid_matches: - b_ns, b_mem, b_name = get_feature_cols(f_base) - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = matcher.get_type_display_name(f_base) - csv_lines.append( - f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," - f"{esc_csv(t_ns)},{esc_csv(t_mem)},{esc_csv(t_name)}," - f"{esc_csv(f_type)},{score:.4f}" - ) - - for f_base, f_target, score in potential_matches: - b_ns, b_mem, b_name = get_feature_cols(f_base) - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = matcher.get_type_display_name(f_base) - csv_lines.append( - f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," - f"{esc_csv(t_ns)},{esc_csv(t_mem)},{esc_csv(t_name)}," - f"{esc_csv(f_type)},{score:.4f}" - ) - - for f_base in unmatched_base: - b_ns, b_mem, b_name = get_feature_cols(f_base) - f_type = matcher.get_type_display_name(f_base) - csv_lines.append( - f"{esc_csv(b_ns)},{esc_csv(b_mem)},{esc_csv(b_name)}," - f",,,{esc_csv(f_type)},0.0000" - ) - - for f_target in unmatched_target: - t_ns, t_mem, t_name = get_feature_cols(f_target) - f_type = matcher.get_type_display_name(f_target) - csv_lines.append( - f",,,{esc_csv(t_ns)},{esc_csv(t_mem)}," - f"{esc_csv(t_name)},{esc_csv(f_type)},0.0000" - ) - - return MatchResult( - master_content="\n".join(csv_lines), - module_files={}, - ) - - def generate_directional_report(self) -> MatchResult: - """Generates a directional report.""" - all_modules = sorted(self.features_base.keys()) - master_lines = [] - title_suffix = "Directional" - master_lines.extend( - [ - f"# Feature Matching Report: {title_suffix}", - f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", - "", - ( - f"**Base:** {self.base_registry.language}" - f" ({self.base_registry.version})" - ), - ( - f"**Target:** {self.target_registry.language}" - f" ({self.target_registry.version})" - ), - ] - ) - - global_score_idx = len(master_lines) - master_lines.append("GLOBAL_SCORE_PLACEHOLDER") - 
master_lines.append("") - - header = "| Module | Features (Base) | Score | Status | Details |" - divider = "|---|---|---|---|---|" - master_lines.extend(["## Module Summary", header, divider]) - - module_files = {} - module_rows = [] - total_solid_matches = 0 - - base_code = _get_language_code(self.base_registry.language) - target_code = _get_language_code(self.target_registry.language) - - for module in all_modules: - mod_base_list = self.features_base.get(module, []) - mod_target_list = self.features_target.get(module, []) - - results = matcher.process_module( - module, - mod_base_list, - mod_target_list, - self.alpha, - "directional", - base_code, - target_code, + # Write module files + for filename, content in result.module_reports.items(): + # Replace placeholder for master report link + # The link is relative from module dir to master report + # We are in {stem}_modules/, so we need to go up one level. + final_content = content.replace( + "{master_report}", f"../{output_path.name}" ) - total_solid_matches += results["solid_matches_count"] - module_rows.append((results["score"], results["row_content"])) - if results.get("module_filename"): - module_files[results["module_filename"]] = results[ - "module_content" - ] - - module_rows.sort(key=lambda x: x[0], reverse=True) - master_lines.extend([row for _, row in module_rows]) - - total_base_features = len(self.base_registry.features) - total_target_features = len(self.target_registry.features) - - precision = stats.calculate_precision( - total_solid_matches, total_target_features - ) - recall = stats.calculate_recall( - total_solid_matches, total_base_features - ) - parity_score = stats.calculate_f1(precision, recall) - - global_stats = ( - "\n| Metric | Score |\n" - "|---|---|\n" - f"| **Precision** | {precision:.2%} |\n" - f"| **Recall** | {recall:.2%} |\n" - f"| **F1 Score** | {parity_score:.2%} |\n\n" - "> **Precision**: Of all features in the target, how many are " - "correct matches to the base? 
(High score = low number of extra " - "features in target)\n\n" - "> **Recall**: Of all features in the base, how many were found in " - "the target? (High score = low number of missing features in " - "target)\n\n" - "> **F1 Score**: A weighted average of Precision and Recall, " - "providing a single measure of how well the target feature set " - "matches the base." - ) - - master_lines[global_score_idx] = global_stats - - return MatchResult( - master_content="\n".join(master_lines).strip(), - module_files=module_files, - ) + (modules_dir / filename).write_text(final_content) - def generate_symmetric_report(self) -> MatchResult: - """Generates a symmetric report.""" - all_modules = sorted( - set(self.features_base.keys()) | set(self.features_target.keys()) - ) - master_lines = [] - title_suffix = "Symmetric" - master_lines.extend( - [ - f"# Feature Matching Report: {title_suffix}", - f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", - "", - "| Registry | Language | Version |", - "| :--- | :--- | :--- |", - ( - f"| **Base** | {self.base_registry.language} |" - f" {self.base_registry.version} |" - ), - ( - f"| **Target** | {self.target_registry.language} |" - f" {self.target_registry.version} |" - ), - "", - ] + # Replace placeholder in Master Report + # We assume master report is in parent of modules_dir + # modules_dir relative to master report is just the dir name + master_report = result.main_report_content.replace( + "{modules_dir}", modules_dir_name ) + else: + master_report = result.main_report_content.replace("{modules_dir}", ".") - global_score_idx = len(master_lines) - master_lines.append("GLOBAL_SCORE_PLACEHOLDER") - master_lines.append("") - - header = "| ADK | Module | Features (Base) | Score | Status | Details |" - divider = "|---|---|---|---|---|---|" - - master_lines.extend(["## Module Summary", header, divider]) - - module_files = {} - module_rows = [] - total_solid_matches = 0 - - base_code = _get_language_code(self.base_registry.language) - 
target_code = _get_language_code(self.target_registry.language) - - for module in all_modules: - mod_base_list = self.features_base.get(module, []) - mod_target_list = self.features_target.get(module, []) - - results = matcher.process_module( - module, - mod_base_list, - mod_target_list, - self.alpha, - "symmetric", - base_code, - target_code, - ) - total_solid_matches += results["solid_matches_count"] - module_rows.append((results["score"], results["row_content"])) - if results.get("module_filename"): - module_files[results["module_filename"]] = results[ - "module_content" - ] - - module_rows.sort(key=lambda x: x[0], reverse=True) - master_lines.extend([row for _, row in module_rows]) - - total_base_features = len(self.base_registry.features) - total_target_features = len(self.target_registry.features) - - union_size = ( - total_base_features + total_target_features - total_solid_matches - ) - parity_score = ( - total_solid_matches / union_size if union_size > 0 else 1.0 - ) - global_stats = ( - f"**Jaccard Index:** {parity_score:.2%}\n\n" - "> The Jaccard Index measures the similarity between the " - "two feature sets. A score of 100% indicates that both languages " - "have identical features." - ) + try: + output_path.write_text(master_report) + logging.info(f"Successfully wrote match report to {output_path}") + # Note: CSV writing is logged inside RawReportGenerator or we should + # log it here. 
Actually RawReportGenerator doesn't log, so we might + # want to Add a log here if we knew it matched + stem = output_path.stem + csv_path = output_path.parent / f"{stem}.csv" + if csv_path.exists(): + logging.info(f"Successfully wrote raw match report to {csv_path}") - master_lines[global_score_idx] = global_stats + return result - return MatchResult( - master_content="\n".join(master_lines).strip(), - module_files=module_files, - ) + except Exception as e: + logging.error(f"Error writing report to {output_path}: {e}") + sys.exit(1) def main(): @@ -379,92 +92,65 @@ def main(): ) parser.add_argument( "--base", - required=True, + required=False, help="Path to the base FeatureRegistry .txtpb file.", ) parser.add_argument( "--target", - required=True, + required=False, help="Path to the target FeatureRegistry .txtpb file.", ) parser.add_argument( - "--output", - required=True, - help="Path to save the Markdown report.", + "--registries", + nargs="+", + required=False, + help="Paths to multiple FeatureRegistry .txtpb files.", ) parser.add_argument( - "--alpha", - type=float, - default=0.8, - help="Similarity threshold (0.0 to 1.0) defaults to 0.8.", + "--output", + required=True, + help=( + "Path to save the Markdown report. Corresponding CSV will be " + "saved with same stem." + ), ) parser.add_argument( "--report-type", - choices=["symmetric", "directional", "raw"], - default="symmetric", - help="Type of gap report to generate (symmetric, directional, or raw).", + choices=["md", "matrix"], + default="md", + help="Type of gap report. 
'md' or 'matrix' now produce both.", ) adk_args.add_verbose_argument(parser) args = parser.parse_args() adk_args.configure_logging(args) try: - base_registry = _read_feature_registry(args.base) - target_registry = _read_feature_registry(args.target) + registry_paths = [] + if args.registries: + registry_paths.extend(args.registries) + elif args.base and args.target: + registry_paths.extend([args.base, args.target]) + else: + logging.error( + "Must provide either --registries or both --base and --target" + ) + sys.exit(1) + + if len(registry_paths) < 2: + logging.error("Must provide at least 2 registries to compare.") + sys.exit(1) + + registries = [_read_feature_registry(p) for p in registry_paths] except Exception as e: logging.error(f"Error reading feature registries: {e}") sys.exit(1) - result = match_registries( - base_registry, target_registry, args.alpha, args.report_type - ) - output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) - if args.report_type == "raw": - # Raw report is a single file, no modules directory needed - try: - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(result.master_content) - logging.info( - f"Successfully wrote raw match report to {output_path}" - ) - except Exception as e: - logging.error(f"Error writing raw report to {output_path}: {e}") - sys.exit(1) - return - - # Create module directory - if result.module_files: - modules_dir_name = f"{output_path.stem}_modules" - modules_dir = output_path.parent / modules_dir_name - modules_dir.mkdir(parents=True, exist_ok=True) - - # Write module files - for filename, content in result.module_files.items(): - # Replace placeholder for master report link - # The link is relative from module dir to master report - # So name is enough. 
- final_content = content.replace("{master_report}", output_path.name) - (modules_dir / filename).write_text(final_content) - - # Replace placeholder in Master Report - # We assume master report is in parent of modules_dir - # modules_dir relative to master report is just the dir name - master_report = result.master_content.replace( - "{modules_dir}", modules_dir_name - ) - else: - master_report = result.master_content.replace("{modules_dir}", ".") - - try: - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(master_report) - logging.info(f"Successfully wrote match report to {output_path}") - except Exception as e: - logging.error(f"Error writing report to {output_path}: {e}") - sys.exit(1) + generate_markdown_raw_reports( + registries, args.report_type, output_path=output_path + ) if __name__ == "__main__": diff --git a/src/google/adk/scope/utils/score_features.py b/src/google/adk/scope/utils/score_features.py new file mode 100644 index 0000000..27c5c28 --- /dev/null +++ b/src/google/adk/scope/utils/score_features.py @@ -0,0 +1,51 @@ +import argparse +import logging +import sys +from pathlib import Path + +from google.protobuf import text_format + +from google.adk.scope import features_pb2 +from google.adk.scope.utils.similarity import SimilarityScorer + + +def main(): + parser = argparse.ArgumentParser( + description="Calculate similarity score between two features." + ) + parser.add_argument( + "feature1", type=Path, help="Path to first feature file (text proto)." + ) + parser.add_argument( + "feature2", type=Path, help="Path to second feature file (text proto)." 
+ ) + + args = parser.parse_args() + + # Configure logging to DEBUG + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + try: + f1_content = args.feature1.read_text() + f2_content = args.feature2.read_text() + + f1 = features_pb2.Feature() + text_format.Parse(f1_content, f1) + + f2 = features_pb2.Feature() + text_format.Parse(f2_content, f2) + + scorer = SimilarityScorer() + score = scorer.get_similarity_score(f1, f2) + + print("-" * 40) + print(f"Similarity Score: {score:.4f}") + print("-" * 40) + + except Exception as e: + logging.error(f"Error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/google/adk/scope/utils/similarity.py b/src/google/adk/scope/utils/similarity.py index 97fb226..57510f6 100644 --- a/src/google/adk/scope/utils/similarity.py +++ b/src/google/adk/scope/utils/similarity.py @@ -2,7 +2,7 @@ from typing import Optional import numpy as np -from jellyfish import jaro_winkler_similarity +from jellyfish import levenshtein_distance from scipy.optimize import linear_sum_assignment from google.adk.scope import features_pb2 as features_pb @@ -23,12 +23,12 @@ class SimilarityScorer: """Calculates a similarity score between two features.""" def __init__( - self, weights: Optional[dict[str, float]] = None, alpha: float = 0.8 + self, + weights: Optional[dict[str, float]] = None, ): self.weights = weights or DEFAULT_SIMILARITY_WEIGHTS logger.debug( - f"Initializing SimilarityScorer with alpha={alpha} and " - f"weights={self.weights}" + f"Initializing SimilarityScorer with " f"weights={self.weights}" ) assert "name" in self.weights assert "member_of" in self.weights @@ -36,6 +36,19 @@ def __init__( assert "parameters" in self.weights assert "return_type" in self.weights + def get_similarity(self, s1: str, s2: str) -> float: + """Calculates similarity between two strings using the selected + algorithm.""" + if not s1 and not s2: + return 1.0 + if not s1 or not s2: + return 0.0 + + # Default to 
Levenshtein + dist = levenshtein_distance(s1, s2) + max_len = max(len(s1), len(s2)) + return 1.0 - (dist / max_len) + def _fuzzy_type_match(self, types1: list, types2: list) -> float: """Calculates a fuzzy similarity score between two lists of types.""" @@ -61,10 +74,13 @@ def _to_str_set(type_list): return 0.0 if set1 == set2: + logger.debug(f"Exact type match: {set1}") return 1.0 # Check the best match between any pair of types best_score = 0.0 + + logger.debug(f"Fuzzy type match between {set1} and {set2}") for t1 in set1: for t2 in set2: if t1 == t2: @@ -90,7 +106,7 @@ def _calculate_param_similarity( self, param1: features_pb.Param, param2: features_pb.Param ) -> float: """Calculates the similarity score between two individual parameters.""" - s_p_name = jaro_winkler_similarity( + s_p_name = self.get_similarity( param1.normalized_name, param2.normalized_name ) s_p_type = self._fuzzy_type_match( @@ -145,6 +161,14 @@ def _calculate_parameters_score( f"Matrix matched total score: {total_match_score:.4f}, " f"final parameter score: {score:.4f}" ) + # Log parameter matches + for r, c in zip(row_ind, col_ind): + if similarity_matrix[r, c] > 0: + logger.debug( + f" Matched param '{params1[r].normalized_name}' with " + f"'{params2[c].normalized_name}': " + f"{similarity_matrix[r, c]:.4f}" + ) return score def _calculate_return_type_score( @@ -208,16 +232,25 @@ def get_similarity_score( # 2. 
Similarity Calculations scores = { - "name": jaro_winkler_similarity( + "name": self.get_similarity( feature1.normalized_name, feature2.normalized_name ), - "member_of": jaro_winkler_similarity( + "member_of": self.get_similarity( feature1.normalized_member_of, feature2.normalized_member_of ), - "namespace": jaro_winkler_similarity( + "namespace": self.get_similarity( feature1.normalized_namespace, feature2.normalized_namespace ), } + logger.debug( + f"Comparison Details:\n" + f" Name: '{feature1.normalized_name}' vs " + f"'{feature2.normalized_name}' -> {scores['name']:.4f}\n" + f" MemberOf: '{feature1.normalized_member_of}' vs " + f"'{feature2.normalized_member_of}' -> {scores['member_of']:.4f}\n" + f" Namespace: '{feature1.normalized_namespace}' vs " + f"'{feature2.normalized_namespace}' -> {scores['namespace']:.4f}" + ) logger.debug(f"Preliminary scores: {scores}") # 3. Early Exit Check (using dynamic weights) @@ -255,5 +288,15 @@ def get_similarity_score( scores[key] * current_weights[key] for key in current_weights ) logger.debug(f"Final scores including params & return: {scores}") + + # Log contributions + logger.debug("Score Contributions:") + for key in current_weights: + contribution = scores[key] * current_weights[key] + logger.debug( + f" {key}: {scores[key]:.4f} * {current_weights[key]:.4f} = " + f"{contribution:.4f}" + ) + logger.debug(f"Final weighted similarity score: {final_score:.4f}") return final_score diff --git a/src/google/adk/scope/utils/stats.py b/src/google/adk/scope/utils/stats.py deleted file mode 100644 index f98479c..0000000 --- a/src/google/adk/scope/utils/stats.py +++ /dev/null @@ -1,19 +0,0 @@ -def calculate_precision(matches: int, total_target: int) -> float: - """Calculates precision: matches / total_target.""" - if total_target > 0: - return matches / total_target - return 1.0 - - -def calculate_recall(matches: int, total_base: int) -> float: - """Calculates recall: matches / total_base.""" - if total_base > 0: - return matches / 
total_base - return 1.0 - - -def calculate_f1(precision: float, recall: float) -> float: - """Calculates F1 score: 2 * (P * R) / (P + R).""" - if precision + recall > 0: - return 2 * (precision * recall) / (precision + recall) - return 0.0 diff --git a/test/adk/scope/extractors/test_extractor_go.py b/test/adk/scope/extractors/test_extractor_go.py index c1b2f65..b2a0626 100644 --- a/test/adk/scope/extractors/test_extractor_go.py +++ b/test/adk/scope/extractors/test_extractor_go.py @@ -36,6 +36,8 @@ def test_extract_features( mock_func_node = MagicMock() mock_func_body = MagicMock() + mock_func_body.start_point = (1, 0) + mock_func_body.end_point = (10, 0) # span = 10 lines mock_func_stmt_list = MagicMock() mock_func_stmt_list.type = "statement_list" mock_func_stmt_list.named_child_count = 2 @@ -44,6 +46,8 @@ def test_extract_features( mock_method_node = MagicMock() mock_method_body = MagicMock() + mock_method_body.start_point = (12, 0) + mock_method_body.end_point = (20, 0) # span = 9 lines mock_method_stmt_list = MagicMock() mock_method_stmt_list.type = "statement_list" mock_method_stmt_list.named_child_count = 2 diff --git a/test/adk/scope/extractors/test_extractor_py.py b/test/adk/scope/extractors/test_extractor_py.py index 165d7f4..4d78a71 100644 --- a/test/adk/scope/extractors/test_extractor_py.py +++ b/test/adk/scope/extractors/test_extractor_py.py @@ -95,6 +95,74 @@ def test_extract_features_read_error(self): features = extract_features(mock_path, Path("/repo"), ".") self.assertEqual(features, []) + @patch("google.adk.scope.extractors.extractor_py.QueryCursor") + @patch("google.adk.scope.extractors.extractor_py.Query") + @patch("google.adk.scope.extractors.extractor_py.PARSER") + def test_private_classes_filtered( + self, mock_parser, mock_query_cls, mock_cursor_cls + ): + mock_path = MagicMock(spec=Path) + mock_path.read_bytes.return_value = b"class _PrivateClass: pass" + + mock_tree = MagicMock() + mock_parser.parse.return_value = mock_tree + 
mock_tree.root_node = MagicMock() + + mock_cursor_instance = mock_cursor_cls.return_value + + mock_node = MagicMock() + mock_node.type = "class_definition" + + # Simulate query returning the private class + mock_cursor_instance.captures.return_value = {"class": [mock_node]} + + with patch( + "google.adk.scope.extractors.extractor_py.NodeProcessor" + ) as MockProcessor: + processor_instance = MockProcessor.return_value + # The processor returns None for private classes + processor_instance.process.return_value = None + + features = extract_features(mock_path, Path("/repo"), ".") + + self.assertEqual(features, []) + processor_instance.process.assert_called_once() + + @patch("google.adk.scope.extractors.extractor_py.QueryCursor") + @patch("google.adk.scope.extractors.extractor_py.Query") + @patch("google.adk.scope.extractors.extractor_py.PARSER") + def test_private_class_methods_filtered( + self, mock_parser, mock_query_cls, mock_cursor_cls + ): + mock_path = MagicMock(spec=Path) + mock_path.read_bytes.return_value = ( + b"class _PrivateClass:\n def method(self): pass" + ) + + mock_tree = MagicMock() + mock_parser.parse.return_value = mock_tree + mock_tree.root_node = MagicMock() + + mock_cursor_instance = mock_cursor_cls.return_value + + mock_node = MagicMock() + mock_node.type = "function_definition" + + # Simulate query returning the method + mock_cursor_instance.captures.return_value = {"func": [mock_node]} + + with patch( + "google.adk.scope.extractors.extractor_py.NodeProcessor" + ) as MockProcessor: + processor_instance = MockProcessor.return_value + # The processor returns None for methods in private classes + processor_instance.process.return_value = None + + features = extract_features(mock_path, Path("/repo"), ".") + + self.assertEqual(features, []) + processor_instance.process.assert_called_once() + if __name__ == "__main__": unittest.main() diff --git a/test/adk/scope/extractors/test_extractor_ts.py b/test/adk/scope/extractors/test_extractor_ts.py index 
a30aa37..21f879c 100644 --- a/test/adk/scope/extractors/test_extractor_ts.py +++ b/test/adk/scope/extractors/test_extractor_ts.py @@ -62,7 +62,9 @@ def test_extract_features(self, mock_parser): # Mock Query and QueryCursor with ( - patch("google.adk.scope.extractors.extractor_ts._build_global_type_map"), + patch( + "google.adk.scope.extractors.extractor_ts._build_global_type_map" + ), patch("google.adk.scope.extractors.extractor_ts.Query"), patch( "google.adk.scope.extractors.extractor_ts.QueryCursor" diff --git a/test/adk/scope/matcher/test_matcher.py b/test/adk/scope/matcher/test_matcher.py deleted file mode 100644 index bfa5cb6..0000000 --- a/test/adk/scope/matcher/test_matcher.py +++ /dev/null @@ -1,84 +0,0 @@ -import unittest - -from google.adk.scope import features_pb2 -from google.adk.scope.matcher import matcher - - -class TestMatcher(unittest.TestCase): - - def test_match_features(self): - f1 = features_pb2.Feature( - normalized_name="f_same", - normalized_member_of="c_same", - normalized_namespace="n_same", - type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - f2 = features_pb2.Feature( - normalized_name="f_same", - normalized_member_of="c_same", - normalized_namespace="n_same", - type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - f3 = features_pb2.Feature( - normalized_name="totally", - normalized_member_of="different", - normalized_namespace="stuff", - type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - f4 = features_pb2.Feature( - normalized_name="entirely", - normalized_member_of="unrelated", - normalized_namespace="things", - type=features_pb2.Feature.Type.INSTANCE_METHOD, - ) - - base_features = [f1, f3] - target_features = [f4, f2] - - matches = matcher.match_features(base_features, target_features, 0.8) - - self.assertEqual(len(matches), 1) - - m_f1, m_f2, score = matches[0] - self.assertEqual(m_f1.normalized_name, "f_same") - self.assertEqual(m_f2.normalized_name, "f_same") - self.assertGreater(score, 0.8) - - # Assert lists were mutated 
and matched elements removed - self.assertEqual(len(base_features), 1) - self.assertEqual(base_features[0].normalized_name, "totally") - - self.assertEqual(len(target_features), 1) - self.assertEqual(target_features[0].normalized_name, "entirely") - - def test_fuzzy_match_namespaces(self): - features_base = {"module.one": [], "module.two": []} - features_target = { - "module.one": [features_pb2.Feature(original_name="f1_target")], - "module.ones": [features_pb2.Feature(original_name="f4")], - "module.three": [features_pb2.Feature(original_name="f5")], - } - - matcher.fuzzy_match_namespaces(features_base, features_target) - - self.assertIn("module.one", features_target) - self.assertIn("module.two", features_target) - self.assertNotIn("module.ones", features_target) - self.assertNotIn("module.three", features_target) - self.assertEqual(len(features_target["module.one"]), 3) - self.assertEqual(len(features_target["module.two"]), 0) - - def test_fuzzy_match_namespaces_empty_base(self): - features_base = {} - features_target = { - "module.one": [features_pb2.Feature(original_name="f1")] - } - - matcher.fuzzy_match_namespaces(features_base, features_target) - - self.assertIn("module.one", features_target) - self.assertEqual(len(features_target["module.one"]), 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/adk/scope/reporter/test_reporter.py b/test/adk/scope/reporter/test_reporter.py index f3f0223..287e73c 100644 --- a/test/adk/scope/reporter/test_reporter.py +++ b/test/adk/scope/reporter/test_reporter.py @@ -1,12 +1,12 @@ import os import tempfile import unittest +from pathlib import Path from unittest.mock import patch from google.protobuf import text_format from google.adk.scope import features_pb2 -from google.adk.scope.matcher import matcher from google.adk.scope.reporter import reporter @@ -90,7 +90,7 @@ def test_match_registries(self): original_name="totally_diff", normalized_name="totally", member_of="null", - 
namespace="google.adk.events", + namespace="stuff", normalized_member_of="different", normalized_namespace="stuff", type=features_pb2.Feature.Type.INSTANCE_METHOD, @@ -106,187 +106,76 @@ def test_match_registries(self): ) target_registry.features.extend([f2, f_near_target]) - # Test Symmetric Report - result_sym = reporter.match_registries( - base_registry, target_registry, 0.9, report_type="symmetric" - ) - report_sym = result_sym.master_content - - # 1. Verify Master Report Structure - self.assertIn("# Feature Matching Report: Symmetric", report_sym) - self.assertIn("**Jaccard Index:** 25.00%", report_sym) - self.assertIn("## Module Summary", report_sym) - - # Check for module entry in master summary - self.assertIn("| `n_same` |", report_sym) - self.assertIn("[View Details]({modules_dir}/n_same.md)", report_sym) - - # 2. Verify Module Content - self.assertIn("n_same.md", result_sym.module_files) - module_content = result_sym.module_files["n_same.md"] - - self.assertIn("# Module: `n_same`", module_content) - self.assertIn("**Features:** 3", module_content) - - # Solid Matches - self.assertIn("### ✅ Solid Features", module_content) - self.assertIn( - "| Type | Base Feature | Target Feature | Similarity Score |", - module_content, - ) - self.assertIn( - "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - module_content, - ) - - # Potential Matches (formerly Near Misses) - self.assertIn("### ⚠️ Potential Matches", module_content) - self.assertIn( - "| Type | Base Feature | Closest Target Candidate | Similarity |", - module_content, - ) - self.assertIn( - "| method | `base_member.base_name` | " - "`target_member.target_name` |", - module_content, - ) - - # Unmatched / Gaps (in 'stuff' module) - self.assertIn("stuff.md", result_sym.module_files) - stuff_content = result_sym.module_files["stuff.md"] - self.assertIn("### ❌ Unmatched Features", stuff_content) - self.assertIn("| `totally_diff` | Target |", stuff_content) - self.assertIn("**Features:** 1", 
stuff_content) - - # Test Directional Report - result_dir = reporter.match_registries( - base_registry, target_registry, 0.9, report_type="directional" - ) - report_dir = result_dir.master_content - - self.assertIn("| **F1 Score** | 40.00% |", report_dir) - self.assertIn("n_same.md", result_dir.module_files) - - mod_dir_content = result_dir.module_files["n_same.md"] - - # Solid Matches - self.assertIn("### ✅ Matched Features", mod_dir_content) - self.assertIn( - "| Type | Base Feature | Target Feature | Similarity Score |", - mod_dir_content, - ) - self.assertIn( - "| method | `BaseClass.fSameBase` | `TargetClass.fSameTarget` |", - mod_dir_content, - ) + # Test Markdown Report + with tempfile.TemporaryDirectory() as temp_dir: + output_path = Path(temp_dir) / "report.md" + result_md = reporter.generate_markdown_raw_reports( + [base_registry, target_registry], + report_type="md", + output_path=output_path, + ) + report_md = result_md.main_report_content - # Potential Matches - self.assertIn("### ⚠️ Potential Matches", mod_dir_content) - self.assertIn( - "| Type | Base Feature | Closest Target Candidate | Similarity |", - mod_dir_content, - ) - self.assertIn( - "| method | `base_member.base_name` | " - "`target_member.target_name` |", - mod_dir_content, - ) + # 1. 
Verify Master Report Structure + self.assertIn("# Feature Matching Parity Report", report_md) + self.assertIn("## Summary", report_md) + # Check for High/Low confidence summaries + self.assertIn( + "| **✅ High Confidence Matches** | **1** |", report_md + ) + self.assertIn("| **⚠️ Low Confidence Matches** | **1** |", report_md) + self.assertIn("| **❌ Mismatches** | **1** |", report_md) + self.assertIn("## Module Summary", report_md) - # Unmatched / Gaps (in 'stuff' module) - self.assertIn("stuff.md", result_dir.module_files) - stuff_dir_content = result_dir.module_files["stuff.md"] - self.assertIn("### ❌ Missing in Target", stuff_dir_content) - self.assertIn("| `totally_diff` |", stuff_dir_content) + # Check for module entry in master summary + self.assertIn( + "| Module | Features (Python) | Score | Status | Details |", + report_md, + ) + self.assertIn("| `google.adk.events` |", report_md) - def test_match_registries_raw(self): - f1 = features_pb2.Feature( - original_name="f_same", - normalized_name="f_same", - normalized_namespace="pkg", - member_of="MyClass", - normalized_member_of="myclass", - type=features_pb2.Feature.Type.FUNCTION, - ) - base = features_pb2.FeatureRegistry(language="Python", version="1") - base.features.append(f1) - target = features_pb2.FeatureRegistry(language="TS", version="2") - target.features.append(f1) + self.assertIn( + "[View Details]({modules_dir}/google.adk.events.md)", report_md + ) - result = reporter.match_registries(base, target, 0.9, report_type="raw") - csv_content = result.master_content + # 2. 
Verify Module Content + self.assertIn("google.adk.events.md", result_md.module_reports) + module_content = result_md.module_reports["google.adk.events.md"] + + self.assertIn("# Module: `google.adk.events`", module_content) + # New summary table in module + self.assertIn("## Summary", module_content) + self.assertIn("## Feature Details", module_content) + + # Solid Matches (High Confidence) + self.assertIn("✅", module_content) + self.assertIn("**High**", module_content) + self.assertIn("`fSameBase`", module_content) + self.assertIn("`fSameTarget`", module_content) + + # Potential Matches (Low Confidence) + self.assertIn("⚠️", module_content) + self.assertIn("Low", module_content) + self.assertIn("`base_name`", module_content) + self.assertIn("`target_name`", module_content) + + # Unmatched / Gaps (in 'stuff' module) + self.assertIn("stuff.md", result_md.module_reports) + stuff_content = result_md.module_reports["stuff.md"] + self.assertIn("❌", stuff_content) + self.assertIn("`totally_diff`", stuff_content) - expected_header = ( - "python_namespace,python_member_of,python_name,ts_namespace," - "ts_member_of,ts_name,type,score" - ) - self.assertIn(expected_header, csv_content) - - # Check for solid match line - # f1 has: ns=pkg, mem=MyClass, name=f_same - # Match should have same values for base and target - expected_line = "pkg,MyClass,f_same,pkg,MyClass,f_same,function,1.0000" - self.assertIn(expected_line, csv_content) - self.assertFalse(result.module_files) - - def test_group_features_by_module(self): - registry = features_pb2.FeatureRegistry() - f1 = registry.features.add() - f1.namespace = "module.one" - f2 = registry.features.add() - f2.namespace = "module.two" - f3 = registry.features.add() - f3.namespace = "module.one" - - result = reporter._group_features_by_module(registry) - - self.assertIn("module.one", result) - self.assertIn("module.two", result) - self.assertEqual(len(result["module.one"]), 2) - self.assertEqual(len(result["module.two"]), 1) - - def 
test_process_module(self): - """Tests the end-to-end processing of a single module.""" + def test_generate_raw_report(self): + """Tests the raw CSV report generation via RawReportGenerator.""" f_base = features_pb2.Feature( original_name="f1_base", normalized_name="f1_base", - normalized_namespace="n1", + namespace="n1", + member_of="c1", type=features_pb2.Feature.Type.FUNCTION, ) + # f_target is a perfect match f_target = features_pb2.Feature( - original_name="f1_target", - normalized_name="f1_target", - normalized_namespace="n1", - type=features_pb2.Feature.Type.FUNCTION, - ) - - with patch( - "google.adk.scope.reporter.reporter.matcher.match_features" - ) as mock_match: - # Let's assume one solid match and no potential matches - mock_match.side_effect = [ - [(f_base, f_target, 0.95)], # Solid matches - [], # Potential matches - ] - - result = matcher.process_module( - module="n1", - base_list=[f_base], - target_list=[f_target], - alpha=0.9, - report_type="symmetric", - base_lang_code="py", - target_lang_code="ts", - ) - - self.assertEqual(result["solid_matches_count"], 1) - self.assertEqual(result["score"], 1.0) - self.assertIn("| py, ts |", result["row_content"]) - self.assertIn("# Module: `n1`", result["module_content"]) - self.assertIn("### ✅ Solid Features", result["module_content"]) - - def test_generate_raw_report(self): - """Tests the raw CSV report generation.""" - f_base = features_pb2.Feature( original_name="f1_base", normalized_name="f1_base", namespace="n1", @@ -301,89 +190,70 @@ def test_generate_raw_report(self): target_registry = features_pb2.FeatureRegistry( language="TypeScript", version="2.0.0" ) + target_registry.features.extend([f_target]) - with patch( - "google.adk.scope.reporter.reporter.matcher.match_features" - ) as mock_match: - mock_match.return_value = [] # No matches for simplicity + # Use RawReportGenerator directly + generator = reporter.raw.RawReportGenerator( + base_registry, target_registry + ) + df = generator.generate() - 
result = reporter.ReportGenerator( - base_registry, target_registry, 0.9 - ).generate_raw_report() + # Check columns + self.assertIn("py_namespace", df.columns) + self.assertIn("score", df.columns) - self.assertIn( - "python_namespace,python_member_of,python_name", - result.master_content, - ) - self.assertIn("n1,c1,f1_base", result.master_content) + # Check content + row = df.iloc[0] + self.assertEqual(row["py_name"], "f1_base") + self.assertEqual(row["score"], 1.0) - def test_generate_symmetric_report(self): - """Tests the symmetric report generation.""" - base_registry = features_pb2.FeatureRegistry( - language="Python", version="1.0.0" - ) - f1 = base_registry.features.add() - f1.namespace = "n1" - target_registry = features_pb2.FeatureRegistry( - language="TypeScript", version="2.0.0" + def test_global_best_match(self): + """Tests that a feature matches best candidate globally, ignoring + namespace.""" + # Base feature in namespace 'n1' + f_base = features_pb2.Feature( + original_name="my_feature", + normalized_name="my_feature", + namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, ) - with patch( - "google.adk.scope.reporter.reporter.matcher.process_module" - ) as mock_process: - mock_process.return_value = { - "solid_matches_count": 1, - "score": 1.0, - "row_content": "| py, ts | `n1` | 1 | 100.00% | ✅ | n1.md |", - "module_filename": "n1.md", - "module_content": "# Module: `n1`", - } - - result = reporter.ReportGenerator( - base_registry, target_registry, 0.9 - ).generate_symmetric_report() + # Target feature 1: Same namespace, but different name (low score) + f_target_bad = features_pb2.Feature( + original_name="other_feature", + normalized_name="other_feature", + namespace="n1", + type=features_pb2.Feature.Type.FUNCTION, + ) - self.assertIn( - "# Feature Matching Report: Symmetric", result.master_content - ) - self.assertIn("**Jaccard Index:**", result.master_content) - self.assertIn("## Module Summary", result.master_content) - self.assertIn("| 
`n1` |", result.master_content) - self.assertIn("n1.md", result.module_files) + # Target feature 2: Different namespace, but same name (high score) + f_target_good = features_pb2.Feature( + original_name="my_feature", + normalized_name="my_feature", + namespace="n2", + type=features_pb2.Feature.Type.FUNCTION, + ) - def test_generate_directional_report(self): - """Tests the directional report generation.""" base_registry = features_pb2.FeatureRegistry( - language="Python", version="1.0.0" + language="Python", version="1" ) - f1 = base_registry.features.add() - f1.namespace = "n1" + base_registry.features.append(f_base) + target_registry = features_pb2.FeatureRegistry( - language="TypeScript", version="2.0.0" + language="Java", version="2" ) + target_registry.features.extend([f_target_bad, f_target_good]) - with patch( - "google.adk.scope.reporter.reporter.matcher.process_module" - ) as mock_process: - mock_process.return_value = { - "solid_matches_count": 1, - "score": 1.0, - "row_content": "| `n1` | 1 | 100.00% | ✅ | n1.md |", - "module_filename": "n1.md", - "module_content": "# Module: `n1`", - } - - result = reporter.ReportGenerator( - base_registry, target_registry, 0.9 - ).generate_directional_report() + # RawReportGenerator logic + generator = reporter.raw.RawReportGenerator( + base_registry, target_registry + ) + df = generator.generate() - self.assertIn( - "# Feature Matching Report: Directional", result.master_content - ) - self.assertIn("| **F1 Score** |", result.master_content) - self.assertIn("## Module Summary", result.master_content) - self.assertIn("| `n1` |", result.master_content) - self.assertIn("n1.md", result.module_files) + # Check that we found the match in n2 + row = df.iloc[0] + self.assertEqual(row["java_namespace"], "n2") + self.assertEqual(row["score"], 1.0) def test_raw_integration(self): """Tests the raw report generation end-to-end.""" @@ -455,18 +325,99 @@ def test_raw_integration(self): typescript_features_str, 
features_pb2.FeatureRegistry() ) - result = reporter.ReportGenerator( - py_registry, ts_registry, 0.8 - ).generate_raw_report() - - self.assertIn( - "python_namespace,python_member_of,python_name,ts_namespace,ts_member_of,ts_name,type,score", - result.master_content, + generator = reporter.raw.RawReportGenerator(py_registry, ts_registry) + df = generator.generate() + + # Verify solid match (high score) + row = df.iloc[0] + self.assertEqual(row["py_name"], "load_artifact") + self.assertEqual(row["ts_name"], "loadArtifact") + self.assertGreater(row["score"], 0.8) + + def test_raw_report_match_confidence(self): + """Tests match and confidence columns with various scores.""" + # 1. High match (score 0.9 > 0.6 for py/go) + f_high = features_pb2.Feature( + original_name="high", + normalized_name="high", + type=features_pb2.Feature.Type.FUNCTION, + ) + # 2. Avg match (score 0.55 between 0.5 and 0.6 for py/go) + f_avg = features_pb2.Feature( + original_name="high", + normalized_name="high_ish", + type=features_pb2.Feature.Type.FUNCTION, + ) + # 3. Low match (score 0.1 < 0.5 for py/go) + f_low = features_pb2.Feature( + original_name="high", + normalized_name="completely_different", + type=features_pb2.Feature.Type.FUNCTION, ) - print(result.master_content) - self.assertEqual(len(result.master_content.splitlines()), 2) - # A known match + base = features_pb2.FeatureRegistry(language="Python", version="1") + base.features.append(f_high) + + target = features_pb2.FeatureRegistry(language="Go", version="1") + # We need to craft targets that produce specific scores or mock the + # scorer. It's easier to mock SimilarityScorer to return fixed scores. 
+ target.features.extend([f_high, f_avg, f_low]) + + with patch( + "google.adk.scope.reporter.raw.SimilarityScorer" + ) as MockScorer: + instance = MockScorer.return_value + + # Case 1: High match + instance.get_similarity_score.return_value = 0.9 + gen = reporter.raw.RawReportGenerator(base, target) + df = gen.generate() + # Since generate iterates through base features, and we have 1 base + # feature, it will run once. We need to test behavior for different + # scores. But generate() does all at once. + + # Actually, `generate` iterates through base features. + # If we want to test different outcomes, we should perhaps just + # test the _get_confidence_level method or ensure our mock returns + # different values for different calls if possible, or just run 3 + # separate gens. + + # Test High + self.assertEqual(df.iloc[0]["match"], "true") + self.assertEqual(df.iloc[0]["confidence"], "high") + + # Test Avg (Low Confidence) + instance.get_similarity_score.return_value = 0.55 + gen = reporter.raw.RawReportGenerator(base, target) + df = gen.generate() + self.assertEqual(df.iloc[0]["match"], "true") + self.assertEqual(df.iloc[0]["confidence"], "low") + + # Test Low (Mismatch) + instance.get_similarity_score.return_value = 0.4 + gen = reporter.raw.RawReportGenerator(base, target) + df = gen.generate() + self.assertEqual(df.iloc[0]["match"], "false") + self.assertEqual( + df.iloc[0]["confidence"], "high" + ) # Mismatches are high confidence if very low score? + # Wait, raw.py logic: + # if score > high_thresh: true, high + # elif score > avg_thresh: true, low + # else: match=false + # if match=false, confidence depends on score? + # Actually raw.py says: + # if match: ... + # else: row["match"] = "false" + # And confidence is set to "high" by default for mismatches in + # raw.py? Let's check raw.py. + # "confidence": "high" is default init. + # If match found, it might be updated to "low". 
+ # If no match found (score < avg), it remains "high" (High + # confidence that it is NOT a match). + + self.assertEqual(df.iloc[0]["match"], "false") + self.assertEqual(df.iloc[0]["confidence"], "high") if __name__ == "__main__": diff --git a/test/adk/scope/utils/test_stats.py b/test/adk/scope/utils/test_stats.py deleted file mode 100644 index a98b0ab..0000000 --- a/test/adk/scope/utils/test_stats.py +++ /dev/null @@ -1,26 +0,0 @@ -import unittest - -from google.adk.scope.utils import stats - - -class TestStats(unittest.TestCase): - def test_precision(self): - self.assertEqual(stats.calculate_precision(10, 20), 0.5) - self.assertEqual(stats.calculate_precision(0, 20), 0.0) - self.assertEqual(stats.calculate_precision(10, 0), 1.0) # Edge case - - def test_recall(self): - self.assertEqual(stats.calculate_recall(10, 20), 0.5) - self.assertEqual(stats.calculate_recall(0, 20), 0.0) - self.assertEqual(stats.calculate_recall(10, 0), 1.0) # Edge case - - def test_f1(self): - self.assertAlmostEqual(stats.calculate_f1(0.5, 0.5), 0.5) - self.assertAlmostEqual(stats.calculate_f1(1.0, 1.0), 1.0) - self.assertAlmostEqual(stats.calculate_f1(0.0, 1.0), 0.0) - self.assertAlmostEqual(stats.calculate_f1(0.0, 0.0), 0.0) - self.assertAlmostEqual(stats.calculate_f1(0.75, 0.5), 0.6) - - -if __name__ == "__main__": - unittest.main()