From 17029f09269d60ab21b9e391596fcc2c6ef52a67 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 27 Nov 2024 11:29:16 +0100 Subject: [PATCH 01/34] November update WIP --- arcana/filters.py | 162 +++++++++++++++++++++++++--------- arcana/templates.py | 32 +++---- arcanalib/graph.py | 210 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 317 insertions(+), 87 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index 472e9b7..845a117 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -1,3 +1,4 @@ +from io import TextIOWrapper import json import os import re @@ -11,7 +12,7 @@ from tqdm.auto import tqdm from arcana import templates -from arcanalib.graph import Graph, Node, triplets, invert, lift +from arcanalib.graph import Edge, Graph, Node, triplets, invert, lift from arcanalib.pipefilter import Filter, Seeder @@ -153,22 +154,32 @@ def dependency_profile_category(inn: int, out: int) -> str: return data - -def build_hierarchy(data: Graph) -> dict: - """Build a hierarchical structure of packages, classes, and methods.""" - methods = sorted(triplets(data.edges['contains'], data.edges['hasScript'])) - classes = sorted({(pkg, clz) for pkg, clz, _ in methods}) +def build_triplets(edge_list1, edge_list2) -> dict: + + methods = sorted(triplets(edge_list1, edge_list2)) + + return methods + +def build_hierarchy(method_triplets) -> dict: + classes = sorted({(pkg, clz) for pkg, clz, _ in method_triplets}) packages = sorted({pkg for pkg, _ in classes}) hierarchy = { pkg_id: { - cls_id: [met_id for _, c, met_id in methods if c == cls_id] + cls_id: [met_id for _, c, met_id in method_triplets if c == cls_id] for p, cls_id in classes if p == pkg_id } for pkg_id in packages } return hierarchy +class CustomJSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + # Convert set to list + return list(obj) + # Call the default method for other types + return super().default(obj) class LLMFilter(Filter): def __init__(self, 
config: Dict[str, Dict[str, Any]]): @@ -188,15 +199,14 @@ def process(self, data: Graph) -> Graph: Graph: The processed data with generated descriptions. """ self.project_name, self.project_desc, self.openai_client_args, model, client = self.setup() - - hierarchy = build_hierarchy(data) timestr = time.strftime("%Y%m%d-%H%M%S") - with open(f'arcana-{timestr}.jsonl', 'a', encoding="utf-8") as file: - try: - self.process_hierarchy(data, hierarchy, client, model, file) - except StopIteration: - pass + with open(f'arcana-{timestr}.jsonl', 'a', encoding="utf-8") as jsonl_file: + with open(f'arcana-{timestr}.log', 'a', encoding="utf-8") as log_file: + try: + self.process_hierarchy(data, client, model, jsonl_file, log_file) + except StopIteration: + pass return data @@ -215,41 +225,88 @@ def setup(self): return project_name, project_desc, openai_client_args, model, client - def describe(self, node: dict) -> str: + def describe(self, node: dict, *keys) -> str: """Generate a description for a given node.""" - keys = ['description', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'roleStereotype', 'layer'] - return ' '.join(f"**{key}**: {str(node.properties[key])}. 
" for key in keys if key in node.properties) + sr, sn = '\r', '\n' + if not keys: + keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'roleStereotype', 'layer'] + return ' '.join(f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties) - def process_hierarchy(self, data: Graph, hierarchy: dict, client: OpenAI, model: str, file): + def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" - for pkg_id, pkg_data in tqdm(hierarchy.items(), desc="Processing packages"): + + # all_method_ids = [ node.id + # for node + # in (data.find_nodes(label="Operation") + data.find_nodes(label="Constructor")) ] + # independent_method_ids = [ node_id + # for node_id + # in all_method_ids + # if node_id not in [edge.source for edge in data.find_edges(label="invokes")] ] + st_contains_st = data.find_edges(label='contains',source_label='Structure',target_label='Structure') + ct_contains_st = data.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) + new_ct_sources = {edge.target:data.find_source(data.edges['contains'],data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} + ct_contains_st.extend([Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) + + triplets = build_triplets(ct_contains_st, data.edges['hasScript']) + met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in triplets} + # print(met_to_cls_pkg) + # print('######################################################################') + sorted_method_ids, method_deps = data.toposorted_nodes(data.edges['invokes']) + # print(sorted_method_ids) + + counter = 0 + + for met_id in tqdm(sorted_method_ids, 
desc='Processing methods', position=0, leave=False): + cls_id, pkg_id = met_to_cls_pkg[met_id] + clasz = data.nodes[cls_id] + + class_name = clasz.properties['qualifiedName'] + class_kind = clasz.properties['kind'] + class_kind = 'enum' if class_kind == 'enumeration' else 'abstract class' if class_kind == 'abstract' else class_kind + self.process_method(data, client, model, jsonl_file, log_file, met_id, class_name, class_kind, method_deps) + + if os.path.exists('stop'): + raise StopIteration + + counter = (counter+1)%5 + if counter==4: + log_file.flush() + jsonl_file.flush() + + + hierarchy = build_hierarchy(triplets) + + sorted_pkg_ids, pkg_deps = data.toposorted_nodes(data.edges['contains']) + + for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): + pkg_data = hierarchy.get(pkg_id, dict()) package = data.nodes[pkg_id] - for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=1, leave=False): + for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=2, leave=False): clasz = data.nodes[cls_id] class_name = clasz.properties['qualifiedName'] class_kind = clasz.properties['kind'] class_kind = 'enum' if class_kind == 'enumeration' else 'abstract class' if class_kind == 'abstract' else class_kind - for met_id in tqdm(cls_data, desc='Processing methods', position=2, leave=False): - self.process_method(data, client, model, file, met_id, class_name, class_kind) + # for met_id in tqdm(cls_data, desc='Processing methods', position=2, leave=False): + # self.process_method(data, client, model, file, met_id, class_name, class_kind) - if os.path.exists('stop'): - raise StopIteration + # if os.path.exists('stop'): + # raise StopIteration - self.process_class(data, client, model, file, cls_id, clasz, class_name, class_kind, cls_data) + self.process_class(data, client, model, jsonl_file, log_file, cls_id, clasz, class_name, class_kind, cls_data) if os.path.exists('stop'): raise StopIteration - 
self.process_package(data, client, model, file, pkg_id, package, pkg_data) + self.process_package(data, client, model, jsonl_file, log_file, pkg_id, package, pkg_data, pkg_deps) if os.path.exists('stop'): raise StopIteration - def process_method(self, data: Graph, client: OpenAI, model: str, file, met_id: str, class_name: str, - class_kind: str): + def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: TextIOWrapper, log_file: TextIOWrapper, met_id: str, class_name: str, + class_kind: str, node_deps: dict): """Process a single method and generate its description.""" method = data.nodes[met_id] @@ -262,23 +319,27 @@ def process_method(self, data: Graph, client: OpenAI, model: str, file, met_id: struct_kind=class_kind, struct_name=class_name, op_src=method_src, + other_ops="(none)" if not node_deps[met_id] else "\n".join(f"- `{data.nodes[node_id].properties['simpleName']}`: {self.describe(data.nodes[node_id], 'description', 'returns', 'docComment')}" for node_id in node_deps[met_id]), project_name=self.project_name, project_desc=self.project_desc ) + + log_file.write(prompt) + log_file.write('\n\n======\n\n') description = self.generate_description(client, model, prompt) self.update_method_properties(data, description, method) - file.write(json.dumps({ + jsonl_file.write(json.dumps({ 'data': { 'id': method.id, 'labels': method.labels, 'properties': description } - })) - file.write('\n') - - def process_class(self, data: Graph, client: OpenAI, model: str, file, cls_id: str, clasz: dict, class_name: str, + }, cls=CustomJSONEncoder)) + jsonl_file.write('\n') + + def process_class(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file, cls_id: str, clasz: dict, class_name: str, class_kind: str, cls_data: list): """Process a single class and generate its description.""" ancestors, fields = self.get_class_relations(data, cls_id) @@ -293,40 +354,48 @@ def process_class(self, data: Graph, client: OpenAI, model: str, file, cls_id: s 
project_name=self.project_name, project_desc=self.project_desc ) + + log_file.write(prompt) + log_file.write('\n\n======\n\n') description = self.generate_description(client, model, prompt) self.update_class_properties(data, description, clasz) - file.write(json.dumps({ + jsonl_file.write(json.dumps({ 'data': { 'id': clasz.id, 'labels': list(clasz.labels), 'properties': description } })) - file.write('\n') + jsonl_file.write('\n') - def process_package(self, data: Graph, client: OpenAI, model: str, file, pkg_id: str, package: dict, - pkg_data: dict): + def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file, pkg_id: str, package: dict, + pkg_data: dict, pkg_deps: dict): """Process a single package and generate its description.""" classes_descriptions = self.get_classes_descriptions(data, pkg_data) + package_descriptions = self.get_packages_descriptions(data, pkg_deps[pkg_id]) prompt = templates.component_analysis.format( pkg_name=package.properties['qualifiedName'], - classes="\n".join(classes_descriptions), + classes="(none)" if not classes_descriptions else "\n".join(classes_descriptions), + packages="(none)" if not package_descriptions else "\n".join(classes_descriptions), project_name=self.project_name, project_desc=self.project_desc ) + log_file.write(prompt) + log_file.write('\n\n======\n\n') + description = self.generate_description(client, model, prompt) self.update_package_properties(data, description, package) - file.write(json.dumps({ + jsonl_file.write(json.dumps({ 'data': { 'id': package.id, 'labels': list(package.labels), 'properties': description}})) - file.write('\n') + jsonl_file.write('\n') def generate_description(self, client: OpenAI, model: str, prompt: str) -> dict: """Generate a description using the OpenAI client.""" @@ -369,11 +438,11 @@ def update_method_properties(self, data: Graph, description: dict, method: dict) if node.properties['simpleName'] == param['name'] ] if matching_params: - param_node_id = 
matching_params[0]['id'] + param_node_id = matching_params[0].id if param_node_id in data.nodes: data.nodes[param_node_id].properties['description'] = param.get('description') - elif key_lower == 'returns': - method.properties['returns'] = value.get('description', None) if value else None + # elif key_lower == 'returns': + # method.properties['returns'] = value.get('description', None) if value and hasattr(value, 'get') else None else: method.properties[key_lower] = value @@ -421,6 +490,13 @@ def get_classes_descriptions(self, data: Graph, pkg_data: dict) -> list: for cls_id, _ in pkg_data.items() ] + def get_packages_descriptions(self, data: Graph, package_ids: list) -> list: + """Generate descriptions for packages.""" + return [ + f"- `{data.nodes[pkg_id].properties['qualifiedName']}`: {self.describe(data.nodes[pkg_id])}" + for pkg_id in package_ids + ] + def merge_node_properties(dict1: Dict[str, Node], dict2: Dict[str, Node], simplify_names=False): for id2, obj2 in dict2.items(): diff --git a/arcana/templates.py b/arcana/templates.py index c280737..add1056 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -4,14 +4,18 @@ {op_src} ``` +The method may use the following other methods: + +{other_ops} + Explain the above method on the following aspects: {{ description: "Describe the functionality of the method in one sentence.", parameters: [ {{ name:..., type:..., description:... }}, ... ], // empty list if there is no parameter - returns: {{ type:..., description: ... }}, // In case of a constructor, consider the constructed class as the return type. - reason: "Explain, in one sentence, the reason why the method is provided or the design rationale of the method.", - howToUse: "Describe the usage or the expected set-up of using the method in less than 3 sentences.", - howItWorks: "Describe the implementation details of the method in less than 5 sentences.", + returns: "Describe the returned object/value in one sentence. 
(In case of a constructor, consider the newly created instance as the return value.)" + reason: "Explain the reason why the method is provided or the design rationale of the method, in one sentence.", + howToUse: "Describe the usage or the expected set-up of using the method, in less than 3 sentences.", + howItWorks: "Describe the implementation details of the method, in less than 5 sentences.", assertions: {{ preConditions: ["pre-conditions of the method", ...], postConditions: ["pre-conditions of the method", ...] }}, layer:..., layerReason:... @@ -20,11 +24,8 @@ For the `layer`, fill the value with one of the following architectural layer which functionality is exhibited by the method source code: - **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. - - **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. - - **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. - - **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. In `layerReason`, explain why this method fits your layer of choice but not the other layers. @@ -56,19 +57,12 @@ For the `roleStereotype`, fill the value with one of the following role stereotypes which responsibility is exhibited by the {struct_type}: - **Information Holder** is responsible for knowing facts and providing information to other objects. POJOs and Java Beans are usually information holders. - - **Service Provider** is responsible for handling requests and performing specific services. It usually implements a specific interface with a small number of methods. Concrete strategies are service providers. 
- - **Structurer** is responsible for managing relationships and constraints among related things. It is usually a collection or mapping of some sort. - - **Controller** is responsible for making decisions, directing the work of others, and handling important events. It directs the flow of the application or business process. - - **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects. It delegates requests to other objects. Very abstract classes and interfaces might be coordinators as they delegate the work to subclasses. - - **User Interfacer** is responsible for transmitting user requests for action or display/render information that can be updated. It handles interactions with users. - - **External Interfacer** is responsible for loading and storing information from/to external services, including database systems, web services, filesystems, hardware, etc. - - **Internal Interfacer** is responsible for interfacing between two subsystems. It may bundle together information of requests from a group of objects to be sent to another object. Abstract adapters, bridges, facades, and proxies are internal interfacers. In `roleStereotypeReason`, explain why this {struct_type} fits your stereotype of choice but not the other stereotypes. @@ -76,11 +70,8 @@ For the `layer`, consider the functionalities of architectural layers below: - **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers. - - **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers. 
- - **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers. - - **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers. In `layerReason`, explain why this {struct_type} fits your layer of choice but not the other layers. @@ -91,6 +82,10 @@ {classes} +and the following subpackages: + +{packages} + Explain the above package on the following aspects: {{ description: "Describe the purpose of the package in one sentence.", @@ -100,11 +95,8 @@ For the `layer`, consider the functionalities of architectural layers below: - **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers. - - **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers. - - **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers. - - **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers. In `layerReason`, explain why this package fits your layer of choice but not the other layers. 
diff --git a/arcanalib/graph.py b/arcanalib/graph.py index 0d89981..4ae7b79 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -1,6 +1,7 @@ import json from collections.abc import Iterable -from typing import Optional, List, Dict, Union, Set, Tuple +from collections import defaultdict +from typing import DefaultDict, Optional, List, Dict, Union, Set, Tuple class Node: @@ -18,7 +19,7 @@ def to_dict(self): } } - def __str__(self): + def __repr__(self): return json.dumps(self.to_dict()) class Edge: @@ -40,7 +41,7 @@ def to_dict(self): } } - def __str__(self): + def __repr__(self): return json.dumps(self.to_dict()) def invert(edge_list: List[Edge], new_label: Optional[str] = None) -> List[Edge]: @@ -113,14 +114,19 @@ def lift(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = None def triplets(edge_list1: List[Edge], edge_list2: List[Edge]) -> Set[Tuple[str, str, str]]: - source_mapping = {edge.target: edge.source for edge in edge_list1} + source_mapping = DefaultDict(list) + + for edge in edge_list1: + source_mapping[edge.target].append(edge.source) + # {edge.target: edge.source for edge in edge_list1} paths = set() for edge in edge_list2: if edge.source in source_mapping: - source1 = source_mapping[edge.source] - triplet = (source1, edge.source, edge.target) - paths.add(triplet) + sources = source_mapping[edge.source] + for source1 in sources: + triplet = (source1, edge.source, edge.target) + paths.add(triplet) return paths @@ -134,24 +140,28 @@ class Graph: edges (dict): A dictionary of edges categorized by labels. """ - def __init__(self, graph_data: dict) -> None: + def __init__(self, graph_data: dict = None) -> None: """ Initializes the Graph with nodes and edges from the provided data. Args: graph_data (dict): A dictionary containing graph data with nodes and edges. 
""" - self.nodes: Dict[str, Node] = { - node['data']['id']: Node(node['data']['id'], *node['data']['labels'], **node['data']['properties']) - for node in graph_data['elements']['nodes'] - } - self.edges: Dict[str, List[Edge]] = {} - for edge in graph_data['elements']['edges']: - edge_data = edge['data'] - edge_obj = Edge(edge_data['source'], edge_data['target'], edge_data['label'], **edge_data['properties']) - if edge_obj.label not in self.edges: - self.edges[edge_obj.label] = [] - self.edges[edge_obj.label].append(edge_obj) + if not graph_data: + self.nodes = dict() + self.edges = dict() + else: + self.nodes: Dict[str, Node] = { + node['data']['id']: Node(node['data']['id'], *node['data']['labels'], **node['data']['properties']) + for node in graph_data['elements']['nodes'] + } + self.edges: Dict[str, List[Edge]] = {} + for edge in graph_data['elements']['edges']: + edge_data = edge['data'] + edge_obj = Edge(edge_data['source'], edge_data['target'], edge_data['label'], **edge_data['properties']) + if edge_obj.label not in self.edges: + self.edges[edge_obj.label] = [] + self.edges[edge_obj.label].append(edge_obj) def invert_edges(self, edge_label: str, new_label: Optional[str] = None) -> None: """ @@ -290,14 +300,18 @@ def get_source_and_target_labels(self, edge_label: str) -> Set[str]: def generate_ontology(self) -> Dict[str, Set[str]]: """ Generates an ontology from the graph's edges and nodes. - - Returns: - dict: A dictionary representing the ontology. 
""" - return { + ontology = { label: self.get_source_and_target_labels(label) for label in self.edges } + onto_graph = Graph() + onto_graph.edges = {label:[Edge(src,tgt,label) for src,tgt in ontology[label]] for label in ontology} + sources = {src for label in ontology for src,_ in ontology[label]} + targets = {tgt for label in ontology for _,tgt in ontology[label]} + node_ids = sources | targets + onto_graph.nodes = {id:Node(id,id) for id in node_ids} + return onto_graph def find_nodes(self, label=None, where=None) -> List[Node]: return [node for node in self.nodes.values() if (not label or label in node.labels) and (not where or where(node))] @@ -317,6 +331,154 @@ def find_edges(self, label=None, source_label=None, target_label=None, where_edg and (not where_target or where_target(self.nodes[edge.target])) ] + def find_source(self, edge_list: List[Edge], start_node: Node, predicate, default: Node = None): + """ + Optimized version to find the first source node for a given node `start_node` + that satisfies a predicate. + + Parameters: + - edges: List of tuples (source, target) representing the graph. + - start_node: The target node to trace back from. + - predicate: A function that takes a node and returns True if the node satisfies the condition. + + Returns: + - The first source node that satisfies the predicate, or None if no such node exists. 
+ """ + # Create an in-memory adjacency list for the graph + predecessors = defaultdict(list) + for edge in edge_list: + predecessors[edge.target].append(edge.source) + + # Perform DFS directly without building a reverse adjacency list + visited = set() + stack = [start_node.id] + + while stack: + current = stack.pop() + if current in visited: + continue + visited.add(current) + + # Check if the current node satisfies the predicate + if predicate(self.nodes[current]): + return self.nodes[current] + + # Add predecessors directly to the stack + stack.extend(predecessors[current]) + + return default + + def _adj_list(edge_list: List[Edge]): + """ + Build adjacency list and outdegree dictionary from a list of edges. + """ + adj_list = {} + outdegree = {} + + for edge in edge_list: + source_id = edge.source + target_id = edge.target + if source_id not in adj_list: + adj_list[source_id] = [] + if source_id not in outdegree: + outdegree[source_id] = 0 + if target_id not in adj_list: + adj_list[target_id] = [] + if target_id not in outdegree: + outdegree[target_id] = 0 + + adj_list[source_id].append(target_id) + outdegree[source_id] += 1 + + return adj_list, outdegree + + + def _outdeg_leaf_nodes(outdegree): + """ + Get nodes with an outdegree of 0. + """ + return [node for node, count in outdegree.items() if count == 0] + + + def process_nodes(self, edges, node_processor): + """ + Process nodes in a directed graph using a lazy approach to handle cyclic and self-dependencies. + + Parameters: + - edges: List of tuples (source, target) representing the graph. + - node_processor: A user-defined function that takes a node and its dependencies and returns a result. + + Returns: + - Dictionary with nodes as keys and results of the node_processor as values. 
+ """ + # Build the adj list + adj_list, outdegree = Graph._adj_list(edges) + results = {} + + # Start processing with leaf nodes + queue = Graph._outdeg_leaf_nodes(outdegree) + + while queue: + next_queue = [] + for node_id in queue: + # Apply the processing function to the current node + dependencies = adj_list.get(node_id, []) + resolved_dependencies = {dep: results[dep] for dep in dependencies if dep in results} + results[node_id] = node_processor(self.nodes[node_id], resolved_dependencies) + + # Update outdegree for nodes that call this node + for caller, targets in adj_list.items(): + if node_id in targets: + outdegree[caller] -= 1 + if outdegree[caller] == 0: + next_queue.append(caller) + + queue = next_queue + + # Handle unresolved nodes (due to cycles or recursion) + for node_id, degree in outdegree.items(): + if degree > 0: + dependencies = adj_list.get(node_id, []) + resolved_dependencies = {dep: results[dep] for dep in dependencies if dep in results} + results[node_id] = node_processor(self.nodes[node_id], resolved_dependencies) + + return results + + def toposorted_nodes(self, edges): + # Build the adj list + adj_list, outdegree = Graph._adj_list(edges) + sorted_nodes = [] + node_deps = {} + + # Start processing with leaf nodes + queue = Graph._outdeg_leaf_nodes(outdegree) + + while queue: + next_queue = [] + for node_id in queue: + dependencies = adj_list.get(node_id, []) + sorted_nodes.append(node_id) + node_deps[node_id] = dependencies + + # Update outdegree for nodes that call this node + for caller, targets in adj_list.items(): + if node_id in targets: + outdegree[caller] -= 1 + if outdegree[caller] == 0: + next_queue.append(caller) + + queue = next_queue + + # Handle unresolved nodes (due to cycles or recursion) + for node_id, degree in outdegree.items(): + if degree > 0: + dependencies = adj_list.get(node_id, []) + sorted_nodes.append(node_id) + node_deps[node_id] = dependencies + + return (sorted_nodes, node_deps) + + def clean_up(self): for 
edge_type in list(self.edges.keys()): self.edges[edge_type] = [ @@ -324,7 +486,7 @@ def clean_up(self): if edge.source in self.nodes and edge.target in self.nodes ] - def __str__(self): + def __repr__(self): return json.dumps(self.to_dict()) def to_dict(self, *args: str, node_labels: Optional[Union[str, Iterable[str]]] = None) -> dict: From 92edc8e845ab11f2ae455c7e7ef8e7e671028554 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 27 Nov 2024 18:14:22 +0100 Subject: [PATCH 02/34] Fix some algorithms --- arcana/filters.py | 37 +++++++++++++++++++++++++++++++------ arcana/templates.py | 5 ++++- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index 845a117..e62a9dd 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -16,6 +16,11 @@ from arcanalib.pipefilter import Filter, Seeder + +def remove_author(s): + return '\n'.join(t.strip() for t in s.split('\n') if not '@author' in t) + + def remove_java_comments(java_source: str) -> str: """ Remove single-line and multi-line comments from a given Java source code string. @@ -40,7 +45,11 @@ def sentence(s: str) -> str: Returns: str: The formatted string. """ + if not s: + return "" t = s.strip() + if not t: + return "" if t[-1] in '.?!…~–—': return f'{t[0].upper()}{t[1:]}' return f'{t[0].upper()}{t[1:]}.' 
@@ -230,7 +239,12 @@ def describe(self, node: dict, *keys) -> str: sr, sn = '\r', '\n' if not keys: keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'roleStereotype', 'layer'] - return ' '.join(f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties) + + lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} + if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: + lines['docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr,'').replace(sn,' '))} " + # print(lines) + return ' '.join(lines[key] for key in keys if key in lines) def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" @@ -275,8 +289,15 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, hierarchy = build_hierarchy(triplets) + # print(hierarchy) + # print('==========') - sorted_pkg_ids, pkg_deps = data.toposorted_nodes(data.edges['contains']) + sorted_pkg_ids, pkg_deps = data.toposorted_nodes(data.find_edges(label='contains',where_source=lambda node: 'Structure' not in node.labels,where_target=lambda node: 'Structure' not in node.labels)) + # print(sorted_pkg_ids) + # print('==========') + + # print(pkg_deps) + # print('==========') for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): pkg_data = hierarchy.get(pkg_id, dict()) @@ -302,6 +323,10 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, self.process_package(data, client, model, jsonl_file, log_file, pkg_id, package, pkg_data, pkg_deps) + + log_file.flush() + jsonl_file.flush() + if os.path.exists('stop'): raise StopIteration 
@@ -379,7 +404,7 @@ def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, l prompt = templates.component_analysis.format( pkg_name=package.properties['qualifiedName'], classes="(none)" if not classes_descriptions else "\n".join(classes_descriptions), - packages="(none)" if not package_descriptions else "\n".join(classes_descriptions), + packages="(none)" if not package_descriptions else "\n".join(package_descriptions), project_name=self.project_name, project_desc=self.project_desc ) @@ -418,7 +443,7 @@ def generate_description(self, client: OpenAI, model: str, prompt: str) -> dict: return description - def update_method_properties(self, data: Graph, description: dict, method: dict): + def update_method_properties(self, data: Graph, description: dict, method: Node): """Update method properties with the generated description.""" param_nodes = [ data.nodes[edge.target] @@ -446,13 +471,13 @@ def update_method_properties(self, data: Graph, description: dict, method: dict) else: method.properties[key_lower] = value - def update_class_properties(self, data: Graph, description: dict, clasz: dict): + def update_class_properties(self, data: Graph, description: dict, clasz: Node): """Update class properties with the generated description.""" for key in description: if not key.endswith('Reason'): clasz.properties[lower1(key)] = description[key] - def update_package_properties(self, data: Graph, description: dict, package: dict): + def update_package_properties(self, data: Graph, description: dict, package: Node): """Update package properties with the generated description.""" for key in description: if not key.endswith('Reason'): diff --git a/arcana/templates.py b/arcana/templates.py index add1056..e880312 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -49,6 +49,7 @@ Explain the above {struct_type} on the following aspects: {{ description: "Describe the responsibility of the {struct_type} in one sentence.", + keywords: ["list", "of", 
"keywords", "relevant", "to", "the", "{struct_type}"], roleStereotype:..., roleStereotypeReason:..., layer:..., @@ -88,7 +89,9 @@ Explain the above package on the following aspects: -{{ description: "Describe the purpose of the package in one sentence.", +{{ description: "Describe the purpose of the package in at most five sentences. Try to describe the package at a more abstract level of functionality rather than implementation detail.", + title: "A Noun Phrase that Describes the Package", + keywords: ["list", "of", "keywords", "relevant", "to", "the", "package"], layer:..., layerReason:... }} From 86be259d559412a89d617611366719ac024a59db Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Thu, 28 Nov 2024 17:45:39 +0100 Subject: [PATCH 03/34] Add error handling --- arcana/filters.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index e62a9dd..ab9413e 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -1,3 +1,4 @@ +from collections.abc import Iterable from io import TextIOWrapper import json import os @@ -282,10 +283,11 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, if os.path.exists('stop'): raise StopIteration - counter = (counter+1)%5 - if counter==4: + counter += 1 + if counter==10: log_file.flush() jsonl_file.flush() + counter %= 10 hierarchy = build_hierarchy(triplets) @@ -445,17 +447,18 @@ def generate_description(self, client: OpenAI, model: str, prompt: str) -> dict: def update_method_properties(self, data: Graph, description: dict, method: Node): """Update method properties with the generated description.""" - param_nodes = [ - data.nodes[edge.target] - for edge in data.edges['hasParameter'] - if edge.source == method.id - ] + for key, value in description.items(): if key.endswith('Reason'): continue key_lower = lower1(key) - if key_lower == 'parameters': + if key_lower == 'parameters' and isinstance(value, Iterable): + 
param_nodes = [ + data.nodes[edge.target] + for edge in data.edges['hasParameter'] + if edge.source == method.id + ] for param in value: matching_params = [ node From abcea4bfc2f9adbd5d18c795e7594a0f4a22d83d Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Mon, 16 Dec 2024 16:44:03 +0100 Subject: [PATCH 04/34] Describe package interactions --- arcana/filters.py | 142 +++++++++++++++++++++++++++++++++++------- arcana/templates.py | 53 +++++++++++----- arcanalib/__init__.py | 2 +- arcanalib/graph.py | 103 +++++++++++++++++++++--------- 4 files changed, 231 insertions(+), 69 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index ab9413e..0aa9414 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -6,8 +6,9 @@ import subprocess import sys import time -from collections import Counter -from typing import Dict, Any +from itertools import combinations +from collections import Counter, defaultdict +from typing import Dict, Any, Tuple, List from openai import OpenAI from tqdm.auto import tqdm @@ -169,7 +170,7 @@ def build_triplets(edge_list1, edge_list2) -> dict: methods = sorted(triplets(edge_list1, edge_list2)) return methods - + def build_hierarchy(method_triplets) -> dict: classes = sorted({(pkg, clz) for pkg, clz, _ in method_triplets}) packages = sorted({pkg for pkg, _ in classes}) @@ -184,12 +185,31 @@ def build_hierarchy(method_triplets) -> dict: return hierarchy class CustomJSONEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - # Convert set to list - return list(obj) - # Call the default method for other types - return super().default(obj) + def default(self, obj): + if isinstance(obj, set): + # Convert set to list + return list(obj) + # Call the default method for other types + return super().default(obj) + +def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], List[List[Edge]]]: + """ + Groups paths by the tuple of (first edge's source, last edge's target). 
+ + Args: + paths (List[List[Edge]]): The list of paths to group. + + Returns: + Dict[Tuple[str, str], List[List[Edge]]]: A dictionary where keys are + tuples (first edge's source, last edge's target), and values are lists of paths. + """ + grouped_paths = defaultdict(list) + for path in paths: + if path: # Ensure the path is not empty + start = path[0].source + end = path[-1].target + grouped_paths[(start, end)].append(path) + return grouped_paths class LLMFilter(Filter): def __init__(self, config: Dict[str, Dict[str, Any]]): @@ -239,7 +259,7 @@ def describe(self, node: dict, *keys) -> str: """Generate a description for a given node.""" sr, sn = '\r', '\n' if not keys: - keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'roleStereotype', 'layer'] + keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'stereotype', 'roleStereotype', 'layer'] lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: @@ -251,12 +271,12 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, """Process each package, class, and method in the hierarchy.""" # all_method_ids = [ node.id - # for node - # in (data.find_nodes(label="Operation") + data.find_nodes(label="Constructor")) ] + # for node + # in (data.find_nodes(label="Operation") + data.find_nodes(label="Constructor")) ] # independent_method_ids = [ node_id # for node_id - # in all_method_ids - # if node_id not in [edge.source for edge in data.find_edges(label="invokes")] ] + # in all_method_ids + # if node_id not in [edge.source for edge in data.find_edges(label="invokes")] ] st_contains_st = data.find_edges(label='contains',source_label='Structure',target_label='Structure') ct_contains_st 
= data.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) new_ct_sources = {edge.target:data.find_source(data.edges['contains'],data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} @@ -302,6 +322,7 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, # print('==========') for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): + break pkg_data = hierarchy.get(pkg_id, dict()) package = data.nodes[pkg_id] @@ -332,8 +353,72 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, if os.path.exists('stop'): raise StopIteration + paths = data.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") + path_groups = group_paths_by_endpoints(paths) + + # for k, v in path_groups.items(): + # print(k) + # for i in v: + # print("*", [(edge.source, edge.target) for edge in i]) + + pkg_pairs = list(combinations(sorted_pkg_ids,2)) + for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): + pkg1 = data.nodes[pkg1_id] + pkg2 = data.nodes[pkg2_id] + if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): + if path_groups[(pkg1_id,pkg2_id)]: + self.process_interactions(data, client, model, pkg1, pkg2, path_groups[(pkg1_id,pkg2_id)], hierarchy, jsonl_file, log_file) + if path_groups[(pkg2_id,pkg1_id)]: + self.process_interactions(data, client, model, pkg2, pkg1, path_groups[(pkg2_id,pkg1_id)], hierarchy, jsonl_file, log_file) + + def process_interactions(self, data: Graph, client: OpenAI, model: str, pkg1: Node, pkg2: Node, path_groups: List[Edge], hierarchy, jsonl_file: TextIOWrapper, log_file: TextIOWrapper): + pkg1_name = pkg1.properties["qualifiedName"] + pkg2_name = pkg2.properties["qualifiedName"] + pkg1_desc = pkg1.properties["description"] + pkg2_desc = 
pkg2.properties["description"] + + pkg1_data = hierarchy.get(pkg1.id, dict()) + pkg2_data = hierarchy.get(pkg2.id, dict()) + + cls1_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg1_data.items()) + cls2_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg2_data.items()) + + def describe_path(path): + src_cls = data.nodes[path[1].source] + src_mth = data.nodes[path[1].target] + tgt_mth = data.nodes[path[-2].source] + tgt_cls = data.nodes[path[-2].target] + return f"Method `{src_mth.properties['simpleName']}` ({src_mth.properties['description']}) of class `{src_cls.properties['qualifiedName']}` invokes method `{tgt_mth.properties['simpleName']}` ({tgt_mth.properties['description']}) of class `{tgt_cls.properties['qualifiedName']}`." + + dep_info = f" - Dependencies from `{pkg1_name}` to `{pkg2_name}`:\n" + "\n".join(f" - {describe_path(path)}" for path in path_groups) if path_groups else "" + + prompt = templates.interaction_analysis.format( + project_name=self.project_name, + project_desc=self.project_desc, + pkg1_name=pkg1_name, + pkg2_name=pkg2_name, + pkg1_desc=pkg1_desc, + pkg2_desc=pkg2_desc, + cls1_info=cls1_info, + cls2_info=cls2_info, + dep_info=dep_info + ) + + log_file.write(prompt) + log_file.write('\n\n======\n\n') + + description = self.generate_text_description(client, model, prompt) + pkg1_edge = Edge(source=pkg1.id, target=pkg2.id, label="dependsOn", description=description) if dep_info else None + + if pkg1_edge: + if "dependsOn" not in data.edges: + data.edges["dependsOn"] = [] + data.edges["dependsOn"].append(pkg1_edge) + jsonl_file.write(json.dumps(pkg1_edge.to_dict(), cls=CustomJSONEncoder)) + jsonl_file.write('\n') + def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: TextIOWrapper, log_file: TextIOWrapper, met_id: str, class_name: str, - class_kind: str, 
node_deps: dict): + class_kind: str, node_deps: dict): """Process a single method and generate its description.""" method = data.nodes[met_id] @@ -354,7 +439,7 @@ def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: Te log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_description(client, model, prompt) + description = self.generate_json_description(client, model, prompt) self.update_method_properties(data, description, method) jsonl_file.write(json.dumps({ @@ -367,7 +452,7 @@ def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: Te jsonl_file.write('\n') def process_class(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file, cls_id: str, clasz: dict, class_name: str, - class_kind: str, cls_data: list): + class_kind: str, cls_data: list): """Process a single class and generate its description.""" ancestors, fields = self.get_class_relations(data, cls_id) methods_descriptions = self.get_methods_descriptions(data, cls_data) @@ -385,7 +470,7 @@ def process_class(self, data: Graph, client: OpenAI, model: str, jsonl_file, log log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_description(client, model, prompt) + description = self.generate_json_description(client, model, prompt) self.update_class_properties(data, description, clasz) jsonl_file.write(json.dumps({ @@ -398,7 +483,7 @@ def process_class(self, data: Graph, client: OpenAI, model: str, jsonl_file, log jsonl_file.write('\n') def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file, pkg_id: str, package: dict, - pkg_data: dict, pkg_deps: dict): + pkg_data: dict, pkg_deps: dict): """Process a single package and generate its description.""" classes_descriptions = self.get_classes_descriptions(data, pkg_data) package_descriptions = self.get_packages_descriptions(data, pkg_deps[pkg_id]) @@ -414,7 +499,7 @@ def process_package(self, data: Graph, client: 
OpenAI, model: str, jsonl_file, l log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_description(client, model, prompt) + description = self.generate_json_description(client, model, prompt) self.update_package_properties(data, description, package) jsonl_file.write(json.dumps({ @@ -424,7 +509,7 @@ def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, l 'properties': description}})) jsonl_file.write('\n') - def generate_description(self, client: OpenAI, model: str, prompt: str) -> dict: + def generate_json_description(self, client: OpenAI, model: str, prompt: str) -> dict: """Generate a description using the OpenAI client.""" try: response = client.chat.completions.create( @@ -445,6 +530,21 @@ def generate_description(self, client: OpenAI, model: str, prompt: str) -> dict: return description + def generate_text_description(self, client: OpenAI, model: str, prompt: str) -> dict: + """Generate a description using the OpenAI client.""" + try: + response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096, + temperature=0 + ) + description = response.choices[0].message.content + except: + description = "" + + return description + def update_method_properties(self, data: Graph, description: dict, method: Node): """Update method properties with the generated description.""" diff --git a/arcana/templates.py b/arcana/templates.py index e880312..c3396a3 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -17,6 +17,8 @@ howToUse: "Describe the usage or the expected set-up of using the method, in less than 3 sentences.", howItWorks: "Describe the implementation details of the method, in less than 5 sentences.", assertions: {{ preConditions: ["pre-conditions of the method", ...], postConditions: ["pre-conditions of the method", ...] 
}}, + stereotype: one of "Accessor", "Mutator", "Creational", "Collaborational", or "Other", + stereotypeReason: "Explain the rationale of the stereotype choice", layer:..., layerReason:... }} @@ -48,12 +50,12 @@ Explain the above {struct_type} on the following aspects: -{{ description: "Describe the responsibility of the {struct_type} in one sentence.", - keywords: ["list", "of", "keywords", "relevant", "to", "the", "{struct_type}"], +{{ description: "Describe the key responsibilities of the {struct_type} in up to three sentences.", + keywords: ["list", "of", "keywords", "relevant", "to", "the", "{struct_type}"], // try to have nouns as well as verb keywords roleStereotype:..., - roleStereotypeReason:..., - layer:..., - layerReason:... }} + roleStereotypeReason:... }} + +When describing the responsibilities, consider that a responsibility can be fulfilled by a group of methods within the {struct_type}. In other words, an intermediate step for describing the {struct_type} is to cluster its methods into a few method responsibility-type. For the `roleStereotype`, fill the value with one of the following role stereotypes which responsibility is exhibited by the {struct_type}: @@ -68,15 +70,6 @@ In `roleStereotypeReason`, explain why this {struct_type} fits your stereotype of choice but not the other stereotypes. -For the `layer`, consider the functionalities of architectural layers below: - -- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers. -- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers. 
-- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers. -- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers. - -In `layerReason`, explain why this {struct_type} fits your layer of choice but not the other layers. - Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values. In the `description`, do not mention the name of the role stereotype or layer.''' component_analysis = '''Consider a project {project_name}, {project_desc}. Given a package `{pkg_name}` containing the following classes: @@ -89,9 +82,9 @@ Explain the above package on the following aspects: -{{ description: "Describe the purpose of the package in at most five sentences. Try to describe the package at a more abstract level of functionality rather than implementation detail.", +{{ description: "Describe the functionality of the package in up to five sentences.", title: "A Noun Phrase that Describes the Package", - keywords: ["list", "of", "keywords", "relevant", "to", "the", "package"], + keywords: ["list", "of", "keywords", "relevant", "to", "the", "package"], // try to have nouns as well as verb keywords layer:..., layerReason:... }} @@ -105,3 +98,31 @@ In `layerReason`, explain why this package fits your layer of choice but not the other layers. Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values. In the `description`, do not mention the name of the layer.''' + +interaction_analysis = '''## Input: + +Consider a project {project_name}, {project_desc}. 
+ +- Package Information: + - `{pkg1_name}`: {pkg1_desc} + - `{pkg2_name}`: {pkg2_desc} + +- Class Information: + - `{pkg1_name}`: +{cls1_info} + - `{pkg2_name}`: +{cls2_info} + +- Inter-Package Dependencies: +{dep_info} + +## Task: + +Using the provided information, describe the interaction between the {pkg1_name} and {pkg2_name} packages, focusing on: + +- The purpose and nature of their dependency in terms of design. +- An abstract, high-level description of the relationship without referencing specific classes or methods. + +## Output: + +Provide a cohesive explanation of the interaction in one to two sentences. Keep the response plain text.''' diff --git a/arcanalib/__init__.py b/arcanalib/__init__.py index ead43bc..698659a 100644 --- a/arcanalib/__init__.py +++ b/arcanalib/__init__.py @@ -1,2 +1,2 @@ -from .graph import Graph, invert, compose, lift, triplets +from .graph import Graph, Node, Edge, invert, compose, lift, triplets from .pipefilter import Pipeline, Filter diff --git a/arcanalib/graph.py b/arcanalib/graph.py index 4ae7b79..84dc33f 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -65,37 +65,40 @@ def invert(edge_list: List[Edge], new_label: Optional[str] = None) -> List[Edge] ] def compose(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = None) -> List[Edge]: - """ - Composes two lists of edges. - - Args: - edges1 (list): The first list of edges. - edges2 (list): The second list of edges. - new_label (str, optional): A new label for the composed edges. Defaults to None. - - Returns: - list: A list of composed edges. 
- """ - mapping = { - edge.source: { - 'target': edge.target, - 'label': edge.label, - 'weight': edge.properties.get('weight', 1) - } - for edge in edges2 - } - composed_edges = [] - for edge in edges1: - if edge.target in mapping: - new_weight = mapping[edge.target]['weight'] * edge.properties.get('weight', 1) - composed_edge = Edge( - source=edge.source, - target=mapping[edge.target]['target'], - label=new_label if new_label else f"{edge.label},{mapping[edge.target]['label']}", - weight=new_weight - ) - composed_edges.append(composed_edge) - return composed_edges + """ + Composes two lists of edges. + + Args: + edges1 (list): The first list of edges. + edges2 (list): The second list of edges. + new_label (str, optional): A new label for the composed edges. Defaults to None. + + Returns: + list: A list of composed edges. + """ + # Create a mapping that allows multiple targets for each source + mapping = defaultdict(list) + for edge in edges2: + mapping[edge.source].append({ + 'target': edge.target, + 'label': edge.label, + 'weight': edge.properties.get('weight', 1) + }) + + composed_edges = [] + for edge in edges1: + if edge.target in mapping: + for target_mapping in mapping[edge.target]: + new_weight = target_mapping['weight'] * edge.properties.get('weight', 1) + composed_edge = Edge( + source=edge.source, + target=target_mapping['target'], + label=new_label if new_label else f"{edge.label},{target_mapping['label']}", + properties={"weight": new_weight} + ) + composed_edges.append(composed_edge) + + return composed_edges def lift(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = None) -> List[Edge]: @@ -530,3 +533,41 @@ def to_dict(self, *args: str, node_labels: Optional[Union[str, Iterable[str]]] = "edges": [{"data": edge.to_dict()['data']} for edge in sum(included_edges.values(), [])] } } + + def find_paths(self, *edge_sequence: List[str]) -> List[List[Edge]]: + """ + Finds paths in the graph that match a given sequence of edge labels. 
+ + Args: + graph (Graph): The graph to search for paths. + edge_sequence (List[str]): The sequence of edge labels to match. + + Returns: + List[List[Edge]]: A list of paths, where each path is a list of edges. + """ + def get_edges(label: str) -> List[Edge]: + """Retrieve edges matching a label, considering `-` for inverse edges.""" + if label.startswith("-"): + base_label = label[1:] + if base_label in self.edges: + return invert(self.edges[base_label]) + return [] + return self.edges.get(label, []) + + def find_next(paths: List[List[Edge]], label: str) -> List[List[Edge]]: + """Extend existing paths by one step based on the label.""" + next_paths = [] + for path in paths: + last_node = path[-1].target if path else None + next_edges = get_edges(label) + for edge in next_edges: + if not path or edge.source == last_node: + next_paths.append(path + [edge]) + return next_paths + + # Start generating paths + paths = [[]] # Begin with an empty path + for label in edge_sequence: + paths = find_next(paths, label) + + return paths From 21bc9aa019e087e91d5274c5cf7d914a3dd3797a Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Mon, 16 Dec 2024 16:51:59 +0100 Subject: [PATCH 05/34] Remove leftover break --- arcana/filters.py | 1 - 1 file changed, 1 deletion(-) diff --git a/arcana/filters.py b/arcana/filters.py index 0aa9414..afce42b 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -322,7 +322,6 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, # print('==========') for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): - break pkg_data = hierarchy.get(pkg_id, dict()) package = data.nodes[pkg_id] From 4310148060d30105efba20157ba9d58621c06dd4 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 15 Jan 2025 17:36:25 +0100 Subject: [PATCH 06/34] Update graph lib --- arcana/filters.py | 38 +-- arcanalib/graph.py | 721 ++++++++++++++++++++------------------------- 2 files changed, 332 insertions(+), 427 
deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index afce42b..9d11145 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -1,22 +1,20 @@ -from collections.abc import Iterable -from io import TextIOWrapper import json import os import re import subprocess import sys import time -from itertools import combinations -from collections import Counter, defaultdict -from typing import Dict, Any, Tuple, List - -from openai import OpenAI -from tqdm.auto import tqdm from arcana import templates from arcanalib.graph import Edge, Graph, Node, triplets, invert, lift from arcanalib.pipefilter import Filter, Seeder - +from collections import Counter, defaultdict +from collections.abc import Iterable +from io import TextIOWrapper +from itertools import combinations +from openai import OpenAI +from tqdm.auto import tqdm +from typing import Dict, Any, Tuple, List def remove_author(s): @@ -93,8 +91,6 @@ def __init__(self, command) -> None: """ self.command = command - # sys.stderr.write(f"Command: {self.command}\n") - def generate(self) -> Graph: """ Execute the command, parse the JSON output into a dict, and pass the dict to the Graph constructor. 
@@ -264,19 +260,12 @@ def describe(self, node: dict, *keys) -> str: lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: lines['docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr,'').replace(sn,' '))} " - # print(lines) + return ' '.join(lines[key] for key in keys if key in lines) def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" - # all_method_ids = [ node.id - # for node - # in (data.find_nodes(label="Operation") + data.find_nodes(label="Constructor")) ] - # independent_method_ids = [ node_id - # for node_id - # in all_method_ids - # if node_id not in [edge.source for edge in data.find_edges(label="invokes")] ] st_contains_st = data.find_edges(label='contains',source_label='Structure',target_label='Structure') ct_contains_st = data.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) new_ct_sources = {edge.target:data.find_source(data.edges['contains'],data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} @@ -332,12 +321,6 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, class_kind = clasz.properties['kind'] class_kind = 'enum' if class_kind == 'enumeration' else 'abstract class' if class_kind == 'abstract' else class_kind - # for met_id in tqdm(cls_data, desc='Processing methods', position=2, leave=False): - # self.process_method(data, client, model, file, met_id, class_name, class_kind) - - # if os.path.exists('stop'): - # raise StopIteration - self.process_class(data, client, model, 
jsonl_file, log_file, cls_id, clasz, class_name, class_kind, cls_data) if os.path.exists('stop'): @@ -355,11 +338,6 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, paths = data.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") path_groups = group_paths_by_endpoints(paths) - # for k, v in path_groups.items(): - # print(k) - # for i in v: - # print("*", [(edge.source, edge.target) for edge in i]) - pkg_pairs = list(combinations(sorted_pkg_ids,2)) for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): pkg1 = data.nodes[pkg1_id] diff --git a/arcanalib/graph.py b/arcanalib/graph.py index 84dc33f..8b9ed77 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -1,15 +1,76 @@ import json -from collections.abc import Iterable from collections import defaultdict -from typing import DefaultDict, Optional, List, Dict, Union, Set, Tuple - +from collections.abc import Iterable +from typing import Optional, List, Dict, Union, Set, Tuple class Node: def __init__(self, _id, *labels, **properties): self.id = _id self.labels = set(labels) self.properties = properties - + + # Meta/cache references + self._graph = None # The parent Graph, for on-demand lookups + self._sources_cache = {} # edge_label -> List[Node] + self._targets_cache = {} # edge_label -> List[Node] + + def set_graph(self, graph): + self._graph = graph + self._invalidate_cache() + + def _invalidate_cache(self): + self._sources_cache.clear() + self._targets_cache.clear() + + def has_label(self, label: str) -> bool: + return label in self.labels + + def add_label(self, label: str): + self.labels.add(label) + return self + + def remove_label(self, label: str): + self.labels.discard(label) + return self + + def replace_label(self, old_label: str, new_label: str): + if old_label in self.labels: + self.labels.remove(old_label) + self.labels.add(new_label) + return self + + def has_property(self, key: str) -> 
bool: + return key in self.properties + + def property(self, key: str, value=None): + if value is None and value is not False: + return self.properties.get(key) + elif value is None: # allow explicit removal if value is None + self.properties.pop(key, None) + else: + self.properties[key] = value + return self + + def sources(self, edge_label: str): + if edge_label not in self._sources_cache: + if not self._graph: + return [] + es = self._graph.edges.get(edge_label, []) + self._sources_cache[edge_label] = [ + self._graph.nodes[e.source] for e in es if e.target == self.id + ] + return self._sources_cache[edge_label] + + def targets(self, edge_label: str): + if edge_label not in self._targets_cache: + if not self._graph: + return [] + es = self._graph.edges.get(edge_label, []) + self._targets_cache[edge_label] = [ + self._graph.nodes[e.target] for e in es if e.source == self.id + ] + return self._targets_cache[edge_label] + def to_dict(self): return { 'data': { @@ -18,7 +79,7 @@ def to_dict(self): 'properties': self.properties } } - + def __repr__(self): return json.dumps(self.to_dict()) @@ -27,16 +88,53 @@ def __init__(self, source, target, label, **properties): self.id = f'{source}-{label}-{target}' self.source = source self.target = target - self.label = label + self.label_val = label self.properties = properties - + + # Meta/cache references + self._graph = None + self._cached_source_node = None + self._cached_target_node = None + + def set_graph(self, graph): + self._graph = graph + self._cached_source_node = None + self._cached_target_node = None + + def label(self, new_label=None): + if new_label is None: + return self.label_val + else: + self.label_val = new_label + self.id = f'{self.source}-{self.label_val}-{self.target}' + return self + + def property(self, key: str, value=None): + if value is None and value is not False: + return self.properties.get(key) + elif value is None: # remove + self.properties.pop(key, None) + else: + self.properties[key] = value + 
return self + + def source_node(self): + if self._cached_source_node is None and self._graph is not None: + self._cached_source_node = self._graph.nodes.get(self.source, None) + return self._cached_source_node + + def target_node(self): + if self._cached_target_node is None and self._graph is not None: + self._cached_target_node = self._graph.nodes.get(self.target, None) + return self._cached_target_node + def to_dict(self): return { 'data': { 'id': self.id, 'source': self.source, 'target': self.target, - 'label': self.label, + 'label': self.label_val, 'properties': self.properties } } @@ -45,314 +143,215 @@ def __repr__(self): return json.dumps(self.to_dict()) def invert(edge_list: List[Edge], new_label: Optional[str] = None) -> List[Edge]: - """ - Inverts the direction of edges in the given edge list. - - Args: - edge_list (list): A list of edges to invert. - new_label (str, optional): A new label for the inverted edges. Defaults to None. - - Returns: - list: A list of inverted edges with updated labels. - """ - return [ - Edge( - source=edge.target, + aggregated = [] + for edge in edge_list: + lbl = new_label if new_label else f"inv_{edge.label_val}" + e = Edge( + source=edge.target, target=edge.source, - label=new_label if new_label else f"inv_{edge.label}", - **edge.properties) - for edge in edge_list - ] + label=lbl, + **edge.properties + ) + aggregated.append(e) + return aggregated def compose(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = None) -> List[Edge]: - """ - Composes two lists of edges. - - Args: - edges1 (list): The first list of edges. - edges2 (list): The second list of edges. - new_label (str, optional): A new label for the composed edges. Defaults to None. - - Returns: - list: A list of composed edges. 
- """ - # Create a mapping that allows multiple targets for each source - mapping = defaultdict(list) - for edge in edges2: - mapping[edge.source].append({ - 'target': edge.target, - 'label': edge.label, - 'weight': edge.properties.get('weight', 1) - }) - - composed_edges = [] - for edge in edges1: - if edge.target in mapping: - for target_mapping in mapping[edge.target]: - new_weight = target_mapping['weight'] * edge.properties.get('weight', 1) - composed_edge = Edge( - source=edge.source, - target=target_mapping['target'], - label=new_label if new_label else f"{edge.label},{target_mapping['label']}", - properties={"weight": new_weight} - ) - composed_edges.append(composed_edge) - - return composed_edges - + mapping = defaultdict(list) + for edge in edges2: + w = edge.properties.get('weight', 1) + mapping[edge.source].append({ + 'target': edge.target, + 'label': edge.label_val, + 'weight': w + }) + + aggregated = {} + for edge in edges1: + w1 = edge.properties.get('weight', 1) + if edge.target in mapping: + for m in mapping[edge.target]: + new_w = w1 * m['weight'] + key = f"{edge.source}-{m['target']}" + if key not in aggregated: + lbl = new_label if new_label else f"{edge.label_val}-{m['label']}" + e = Edge(source=edge.source, target=m['target'], label=lbl, weight=new_w) + aggregated[key] = e + else: + aggregated[key].properties['weight'] += new_w + return list(aggregated.values()) def lift(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = None) -> List[Edge]: - """ - Lifts relations by composing two lists of edges and their inverses. - - Args: - edges1 (list): The first list of edges. - edges2 (list): The second list of edges. - new_label (str, optional): A new label for the lifted edges. Defaults to None. - - Returns: - list: A list of lifted edges. 
- """ return compose(compose(edges1, edges2), invert(edges1), new_label) - def triplets(edge_list1: List[Edge], edge_list2: List[Edge]) -> Set[Tuple[str, str, str]]: - source_mapping = DefaultDict(list) - + source_mapping = defaultdict(list) for edge in edge_list1: source_mapping[edge.target].append(edge.source) - # {edge.target: edge.source for edge in edge_list1} paths = set() for edge in edge_list2: if edge.source in source_mapping: sources = source_mapping[edge.source] for source1 in sources: - triplet = (source1, edge.source, edge.target) - paths.add(triplet) - + paths.add((source1, edge.source, edge.target)) return paths - class Graph: - """ - A class to represent a graph with nodes and edges. - - Attributes: - nodes (dict): A dictionary of nodes. - edges (dict): A dictionary of edges categorized by labels. - """ - def __init__(self, graph_data: dict = None) -> None: - """ - Initializes the Graph with nodes and edges from the provided data. - - Args: - graph_data (dict): A dictionary containing graph data with nodes and edges. 
- """ if not graph_data: - self.nodes = dict() - self.edges = dict() - else: - self.nodes: Dict[str, Node] = { - node['data']['id']: Node(node['data']['id'], *node['data']['labels'], **node['data']['properties']) - for node in graph_data['elements']['nodes'] - } + self.nodes: Dict[str, Node] = {} self.edges: Dict[str, List[Edge]] = {} - for edge in graph_data['elements']['edges']: - edge_data = edge['data'] - edge_obj = Edge(edge_data['source'], edge_data['target'], edge_data['label'], **edge_data['properties']) - if edge_obj.label not in self.edges: - self.edges[edge_obj.label] = [] - self.edges[edge_obj.label].append(edge_obj) + return + + self.nodes: Dict[str, Node] = {} + for node_data in graph_data['elements']['nodes']: + n = Node(node_data['data']['id'], *node_data['data']['labels'], **node_data['data']['properties']) + self.nodes[n.id] = n + + self.edges: Dict[str, List[Edge]] = {} + for edge_data in graph_data['elements']['edges']: + d = edge_data['data'] + e = Edge(d['source'], d['target'], d['label'], **d['properties']) + if e.label_val not in self.edges: + self.edges[e.label_val] = [] + self.edges[e.label_val].append(e) + + self._set_graph_refs() + + def _set_graph_refs(self): + for node in self.nodes.values(): + node.set_graph(self) + for elist in self.edges.values(): + for edge in elist: + edge.set_graph(self) + + def add_node(self, _id: str, labels=None, properties=None): + if _id in self.nodes: + pass # Overwrite or warn if needed + n = Node(_id, *(labels or []), **(properties or {})) + self.nodes[_id] = n + n.set_graph(self) + + def add_edge(self, source_id: str, target_id: str, edge_label: str, properties=None): + if source_id not in self.nodes or target_id not in self.nodes: + raise ValueError("Source or target not in graph") + + e = Edge(source_id, target_id, edge_label, **(properties or {})) + if edge_label not in self.edges: + self.edges[edge_label] = [] + self.edges[edge_label].append(e) + e.set_graph(self) + + # Invalidate caching for the 
involved nodes + self.nodes[source_id]._invalidate_cache() + self.nodes[target_id]._invalidate_cache() def invert_edges(self, edge_label: str, new_label: Optional[str] = None) -> None: - """ - Inverts the edges with the specified label and saves them under a new label. - - Args: - edge_label (str): The label of the edges to invert. - new_label (str, optional): The label for the inverted edges. Defaults to None. - """ if edge_label in self.edges: inverted = invert(self.edges[edge_label], new_label) - new_label = new_label or f"inv_{edge_label}" - self.edges[new_label] = inverted + nlabel = new_label or f"inv_{edge_label}" + self.edges[nlabel] = inverted + self._set_graph_refs() def compose_edges(self, edge_label1: str, edge_label2: str, new_label: Optional[str] = None) -> None: - """ - Composes edges with the specified labels and saves them under a new label. - - Args: - edge_label1 (str): The label of the first list of edges. - edge_label2 (str): The label of the second list of edges. - new_label (str, optional): The label for the composed edges. Defaults to None. - """ if (edge_label1 in self.edges) and (edge_label2 in self.edges): - new_label = new_label or f"{edge_label1}_{edge_label2}" - composed = compose(self.edges[edge_label1], self.edges[edge_label2], new_label) - self.edges[new_label] = composed + nlabel = new_label or f"{edge_label1}_{edge_label2}" + composed_list = compose(self.edges[edge_label1], self.edges[edge_label2], nlabel) + self.edges[nlabel] = composed_list + self._set_graph_refs() def lift_edges(self, edge_label1: str, edge_label2: str, new_label: Optional[str] = None) -> None: - """ - Lifts relations by composing edges with the specified labels and their inverses, then saves them under a new label. - - Args: - edge_label1 (str): The label of the first list of edges. - edge_label2 (str): The label of the second list of edges. - new_label (str, optional): The label for the lifted edges. Defaults to None. 
- """ if (edge_label1 in self.edges) and (edge_label2 in self.edges): - lifted = lift(self.edges[edge_label1], self.edges[edge_label2], new_label) - new_label = new_label or f"lifted_{edge_label1}_{edge_label2}" - self.edges[new_label] = lifted + lifted_list = lift(self.edges[edge_label1], self.edges[edge_label2], new_label) + nlabel = new_label or f"lifted_{edge_label1}_{edge_label2}" + self.edges[nlabel] = lifted_list + self._set_graph_refs() def filter_nodes_by_labels(self, labels: Union[List[str], Set[str]]) -> Dict[str, Node]: - """ - Filters nodes by the specified labels. - - Args: - labels (list or set): A list of labels to filter nodes by. - - Returns: - dict: A dictionary of filtered nodes. - """ return { - key: node - for key, node in self.nodes.items() - if any(label in labels for label in node.labels) + k: v + for k, v in self.nodes.items() + if any(label in v.labels for label in labels) } def get_all_node_labels(self) -> Set[str]: - """ - Retrieves all unique node labels present in the graph. - - Returns: - set: A set of all node labels. - """ - return { - label - for node in self.nodes.values() - for label in node.labels - } + return {label for node in self.nodes.values() for label in node.labels} def get_all_edge_labels(self) -> Set[str]: - """ - Retrieves all unique edge labels present in the graph. - - Returns: - set: A set of all edge labels. - """ return set(self.edges.keys()) def get_edges_with_node_labels(self, edge_label: str, node_label: str) -> List[Edge]: - """ - Retrieves edges whose source and target nodes have the specified labels. - - Args: - edge_label (str): The label of the edges to retrieve. - node_label (str): The label of the nodes to filter by. - - Returns: - list: A list of edges that match the criteria. 
- """ if edge_label in self.edges: return [ - edge - for edge in self.edges[edge_label] - if (node_label in self.nodes[edge.source].labels) - and (node_label in self.nodes[edge.target].labels) + edge for edge in self.edges[edge_label] + if node_label in self.nodes[edge.source].labels + and node_label in self.nodes[edge.target].labels ] return [] def get_edge_node_labels(self, edge: Edge) -> List[Tuple[str, str]]: - """ - Retrieves the labels of the source and target nodes for a given edge. - - Args: - edge (Edge): The edge to retrieve node labels for. - - Returns: - list: A list of tuples containing source and target node labels. - """ source_labels = self.nodes.get(edge.source, Node(None)).labels target_labels = self.nodes.get(edge.target, Node(None)).labels - return [ - (source_label, target_label) - for source_label in source_labels - for target_label in target_labels - ] + return [(sl, tl) for sl in source_labels for tl in target_labels] - def get_source_and_target_labels(self, edge_label: str) -> Set[str]: - """ - Retrieves the set of source and target labels for a given list of edges. - - Args: - edge_label (str): The label of the edges to retrieve labels for. - - Returns: - set: A set of source and target labels. - """ - edge_node_labels: Set[str] = { - label - for edge in self.edges[edge_label] - for label in self.get_edge_node_labels(edge) + def get_source_and_target_labels(self, edge_label: str) -> Set[Tuple[str, str]]: + if edge_label not in self.edges: + return set() + return { + (sl, tl) + for e in self.edges[edge_label] + for (sl, tl) in self.get_edge_node_labels(e) } - return edge_node_labels - - def generate_ontology(self) -> Dict[str, Set[str]]: - """ - Generates an ontology from the graph's edges and nodes. 
- """ - ontology = { - label: self.get_source_and_target_labels(label) - for label in self.edges + + def generate_ontology(self) -> 'Graph': + ontology_map = { + label: self.get_source_and_target_labels(label) for label in self.edges } onto_graph = Graph() - onto_graph.edges = {label:[Edge(src,tgt,label) for src,tgt in ontology[label]] for label in ontology} - sources = {src for label in ontology for src,_ in ontology[label]} - targets = {tgt for label in ontology for _,tgt in ontology[label]} - node_ids = sources | targets - onto_graph.nodes = {id:Node(id,id) for id in node_ids} + onto_graph.edges = { + lbl: [ + Edge(src, tgt, lbl) for (src, tgt) in ontology_map[lbl] + ] for lbl in ontology_map + } + sources = {src for lbl in ontology_map for (src, _) in ontology_map[lbl]} + targets = {tgt for lbl in ontology_map for (_, tgt) in ontology_map[lbl]} + all_ids = sources.union(targets) + onto_graph.nodes = {i: Node(i, i) for i in all_ids} + onto_graph._set_graph_refs() return onto_graph def find_nodes(self, label=None, where=None) -> List[Node]: - return [node for node in self.nodes.values() if (not label or label in node.labels) and (not where or where(node))] + return [ + node for node in self.nodes.values() + if (not label or label in node.labels) and (not where or where(node)) + ] - def find_edges(self, label=None, source_label=None, target_label=None, where_edge=None, where_source=None, where_target=None): + def find_edges(self, + label=None, + source_label=None, + target_label=None, + where_edge=None, + where_source=None, + where_target=None + ): if label: edge_list = self.edges.get(label, []) else: - edge_list = [edge for edges in self.edges.values() for edge in edges] - + edge_list = [e for edges in self.edges.values() for e in edges] + return [ - edge for edge in edge_list - if (not source_label or source_label in self.nodes[edge.source].labels) - and (not target_label or target_label in self.nodes[edge.target].labels) - and (not where_edge or 
where_edge(edge)) - and (not where_source or where_source(self.nodes[edge.source])) - and (not where_target or where_target(self.nodes[edge.target])) + e for e in edge_list + if (not source_label or source_label in self.nodes[e.source].labels) + and (not target_label or target_label in self.nodes[e.target].labels) + and (not where_edge or where_edge(e)) + and (not where_source or where_source(self.nodes[e.source])) + and (not where_target or where_target(self.nodes[e.target])) ] def find_source(self, edge_list: List[Edge], start_node: Node, predicate, default: Node = None): - """ - Optimized version to find the first source node for a given node `start_node` - that satisfies a predicate. - - Parameters: - - edges: List of tuples (source, target) representing the graph. - - start_node: The target node to trace back from. - - predicate: A function that takes a node and returns True if the node satisfies the condition. - - Returns: - - The first source node that satisfies the predicate, or None if no such node exists. - """ - # Create an in-memory adjacency list for the graph predecessors = defaultdict(list) - for edge in edge_list: - predecessors[edge.target].append(edge.source) - - # Perform DFS directly without building a reverse adjacency list + for e in edge_list: + predecessors[e.target].append(e.source) visited = set() stack = [start_node.id] @@ -361,213 +360,141 @@ def find_source(self, edge_list: List[Edge], start_node: Node, predicate, defaul if current in visited: continue visited.add(current) - - # Check if the current node satisfies the predicate if predicate(self.nodes[current]): return self.nodes[current] - - # Add predecessors directly to the stack stack.extend(predecessors[current]) - return default + @staticmethod def _adj_list(edge_list: List[Edge]): - """ - Build adjacency list and outdegree dictionary from a list of edges. 
- """ adj_list = {} outdegree = {} - - for edge in edge_list: - source_id = edge.source - target_id = edge.target - if source_id not in adj_list: - adj_list[source_id] = [] - if source_id not in outdegree: - outdegree[source_id] = 0 - if target_id not in adj_list: - adj_list[target_id] = [] - if target_id not in outdegree: - outdegree[target_id] = 0 - - adj_list[source_id].append(target_id) - outdegree[source_id] += 1 - + for e in edge_list: + s, t = e.source, e.target + if s not in adj_list: + adj_list[s] = [] + if s not in outdegree: + outdegree[s] = 0 + if t not in adj_list: + adj_list[t] = [] + if t not in outdegree: + outdegree[t] = 0 + adj_list[s].append(t) + outdegree[s] += 1 return adj_list, outdegree - + @staticmethod def _outdeg_leaf_nodes(outdegree): - """ - Get nodes with an outdegree of 0. - """ - return [node for node, count in outdegree.items() if count == 0] - - - def process_nodes(self, edges, node_processor): - """ - Process nodes in a directed graph using a lazy approach to handle cyclic and self-dependencies. - - Parameters: - - edges: List of tuples (source, target) representing the graph. - - node_processor: A user-defined function that takes a node and its dependencies and returns a result. - - Returns: - - Dictionary with nodes as keys and results of the node_processor as values. 
- """ - # Build the adj list + return [n for n, c in outdegree.items() if c == 0] + + def process_nodes(self, edges: List[Edge], node_processor): adj_list, outdegree = Graph._adj_list(edges) results = {} - - # Start processing with leaf nodes queue = Graph._outdeg_leaf_nodes(outdegree) - while queue: next_queue = [] - for node_id in queue: - # Apply the processing function to the current node - dependencies = adj_list.get(node_id, []) - resolved_dependencies = {dep: results[dep] for dep in dependencies if dep in results} - results[node_id] = node_processor(self.nodes[node_id], resolved_dependencies) - - # Update outdegree for nodes that call this node + for n_id in queue: + dependencies = adj_list.get(n_id, []) + resolved = {dep: results[dep] for dep in dependencies if dep in results} + results[n_id] = node_processor(self.nodes[n_id], resolved) for caller, targets in adj_list.items(): - if node_id in targets: + if n_id in targets: outdegree[caller] -= 1 if outdegree[caller] == 0: next_queue.append(caller) - queue = next_queue - - # Handle unresolved nodes (due to cycles or recursion) - for node_id, degree in outdegree.items(): - if degree > 0: - dependencies = adj_list.get(node_id, []) - resolved_dependencies = {dep: results[dep] for dep in dependencies if dep in results} - results[node_id] = node_processor(self.nodes[node_id], resolved_dependencies) - + for n_id, deg in outdegree.items(): + if deg > 0 and n_id not in results: + dependencies = adj_list.get(n_id, []) + resolved = {dep: results[dep] for dep in dependencies if dep in results} + results[n_id] = node_processor(self.nodes[n_id], resolved) return results - def toposorted_nodes(self, edges): - # Build the adj list + def toposorted_nodes(self, edges: List[Edge]): adj_list, outdegree = Graph._adj_list(edges) sorted_nodes = [] node_deps = {} - - # Start processing with leaf nodes queue = Graph._outdeg_leaf_nodes(outdegree) while queue: next_queue = [] - for node_id in queue: - dependencies = 
adj_list.get(node_id, []) - sorted_nodes.append(node_id) - node_deps[node_id] = dependencies - - # Update outdegree for nodes that call this node + for n_id in queue: + dependencies = adj_list.get(n_id, []) + sorted_nodes.append(n_id) + node_deps[n_id] = dependencies for caller, targets in adj_list.items(): - if node_id in targets: + if n_id in targets: outdegree[caller] -= 1 if outdegree[caller] == 0: next_queue.append(caller) - queue = next_queue - # Handle unresolved nodes (due to cycles or recursion) - for node_id, degree in outdegree.items(): - if degree > 0: - dependencies = adj_list.get(node_id, []) - sorted_nodes.append(node_id) - node_deps[node_id] = dependencies - + for n_id, deg in outdegree.items(): + if deg > 0 and n_id not in sorted_nodes: + dependencies = adj_list.get(n_id, []) + sorted_nodes.append(n_id) + node_deps[n_id] = dependencies return (sorted_nodes, node_deps) - def clean_up(self): for edge_type in list(self.edges.keys()): self.edges[edge_type] = [ - edge for edge in self.edges[edge_type] - if edge.source in self.nodes and edge.target in self.nodes + e for e in self.edges[edge_type] + if e.source in self.nodes and e.target in self.nodes ] - + + def find_paths(self, *edge_sequence: List[str]) -> List[List[Edge]]: + def get_edges(label: str) -> List[Edge]: + if label.startswith('-'): + base_label = label[1:] + if base_label in self.edges: + return invert(self.edges[base_label]) + return [] + return self.edges.get(label, []) + + def find_next(current_paths: List[List[Edge]], lbl: str) -> List[List[Edge]]: + result = [] + for path in current_paths: + last_node = path[-1].target if path else None + for candidate in get_edges(lbl): + if not path or candidate.source == last_node: + result.append(path + [candidate]) + return result + + paths = [[]] + for lbl in edge_sequence: + paths = find_next(paths, lbl) + return paths + def __repr__(self): return json.dumps(self.to_dict()) def to_dict(self, *args: str, node_labels: Optional[Union[str, 
Iterable[str]]] = None) -> dict: - """ - Converts the graph into a dictionary format with specified edge and node labels. - - Args: - *args: Variable length argument list of edge labels to include. - node_labels (str or iterable, optional): Labels of nodes to include. Defaults to None. - - Returns: - dict: A dictionary representation of the graph with specified elements. - """ included_edge_labels = list(args) if args else list(self.edges.keys()) - if node_labels == 'all': included_node_labels = self.get_all_node_labels() else: included_node_labels: Set[str] = { - node_label - for edge_label in included_edge_labels - for node_label_pair in self.get_source_and_target_labels(edge_label) - for node_label in node_label_pair + nlbl + for elbl in included_edge_labels + for nlbl_pair in self.get_source_and_target_labels(elbl) + for nlbl in nlbl_pair } if isinstance(node_labels, str): included_node_labels.add(node_labels) elif isinstance(node_labels, Iterable): included_node_labels.update(node_labels) - included_nodes: Dict[str, Node] = self.filter_nodes_by_labels(included_node_labels) - - included_edges: Dict[str, List[Edge]] = { - label: edge_list - for label, edge_list in self.edges.items() - if label in included_edge_labels + included_nodes = { + k: v + for k, v in self.filter_nodes_by_labels(included_node_labels).items() + } + included_edges = { + lbl: eds for lbl, eds in self.edges.items() if lbl in included_edge_labels } - return { "elements": { - "nodes": [{"data": node.to_dict()['data']} for node in included_nodes.values()], - "edges": [{"data": edge.to_dict()['data']} for edge in sum(included_edges.values(), [])] + "nodes": [{"data": n.to_dict()['data']} for n in included_nodes.values()], + "edges": [{"data": e.to_dict()['data']} for e in sum(included_edges.values(), [])] } } - - def find_paths(self, *edge_sequence: List[str]) -> List[List[Edge]]: - """ - Finds paths in the graph that match a given sequence of edge labels. 
- - Args: - graph (Graph): The graph to search for paths. - edge_sequence (List[str]): The sequence of edge labels to match. - - Returns: - List[List[Edge]]: A list of paths, where each path is a list of edges. - """ - def get_edges(label: str) -> List[Edge]: - """Retrieve edges matching a label, considering `-` for inverse edges.""" - if label.startswith("-"): - base_label = label[1:] - if base_label in self.edges: - return invert(self.edges[base_label]) - return [] - return self.edges.get(label, []) - - def find_next(paths: List[List[Edge]], label: str) -> List[List[Edge]]: - """Extend existing paths by one step based on the label.""" - next_paths = [] - for path in paths: - last_node = path[-1].target if path else None - next_edges = get_edges(label) - for edge in next_edges: - if not path or edge.source == last_node: - next_paths.append(path + [edge]) - return next_paths - - # Start generating paths - paths = [[]] # Begin with an empty path - for label in edge_sequence: - paths = find_next(paths, label) - - return paths From 8f3bd4ef5ed115cd2e878cb5c1feb528076d6b19 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 29 Jan 2025 10:19:43 +0100 Subject: [PATCH 07/34] Client feedback fix --- arcana/filters.py | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index 9d11145..b93c5cf 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -1,20 +1,22 @@ +from collections.abc import Iterable +from io import TextIOWrapper import json import os import re import subprocess import sys import time +from itertools import combinations +from collections import Counter, defaultdict +from typing import Dict, Any, Tuple, List + +from openai import OpenAI +from tqdm.auto import tqdm from arcana import templates from arcanalib.graph import Edge, Graph, Node, triplets, invert, lift from arcanalib.pipefilter import Filter, Seeder -from collections import Counter, defaultdict 
-from collections.abc import Iterable -from io import TextIOWrapper -from itertools import combinations -from openai import OpenAI -from tqdm.auto import tqdm -from typing import Dict, Any, Tuple, List + def remove_author(s): @@ -91,6 +93,8 @@ def __init__(self, command) -> None: """ self.command = command + # sys.stderr.write(f"Command: {self.command}\n") + def generate(self) -> Graph: """ Execute the command, parse the JSON output into a dict, and pass the dict to the Graph constructor. @@ -260,12 +264,19 @@ def describe(self, node: dict, *keys) -> str: lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: lines['docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr,'').replace(sn,' '))} " - + # print(lines) return ' '.join(lines[key] for key in keys if key in lines) def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" + # all_method_ids = [ node.id + # for node + # in (data.find_nodes(label="Operation") + data.find_nodes(label="Constructor")) ] + # independent_method_ids = [ node_id + # for node_id + # in all_method_ids + # if node_id not in [edge.source for edge in data.find_edges(label="invokes")] ] st_contains_st = data.find_edges(label='contains',source_label='Structure',target_label='Structure') ct_contains_st = data.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) new_ct_sources = {edge.target:data.find_source(data.edges['contains'],data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} @@ -321,6 +332,12 @@ def 
process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, class_kind = clasz.properties['kind'] class_kind = 'enum' if class_kind == 'enumeration' else 'abstract class' if class_kind == 'abstract' else class_kind + # for met_id in tqdm(cls_data, desc='Processing methods', position=2, leave=False): + # self.process_method(data, client, model, file, met_id, class_name, class_kind) + + # if os.path.exists('stop'): + # raise StopIteration + self.process_class(data, client, model, jsonl_file, log_file, cls_id, clasz, class_name, class_kind, cls_data) if os.path.exists('stop'): @@ -338,6 +355,11 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, paths = data.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") path_groups = group_paths_by_endpoints(paths) + # for k, v in path_groups.items(): + # print(k) + # for i in v: + # print("*", [(edge.source, edge.target) for edge in i]) + pkg_pairs = list(combinations(sorted_pkg_ids,2)) for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): pkg1 = data.nodes[pkg1_id] @@ -505,6 +527,8 @@ def generate_json_description(self, client: OpenAI, model: str, prompt: str) -> except: description = dict() + if 'description' not in description: + description['description'] = "(no description)" return description def generate_text_description(self, client: OpenAI, model: str, prompt: str) -> dict: @@ -518,7 +542,7 @@ def generate_text_description(self, client: OpenAI, model: str, prompt: str) -> ) description = response.choices[0].message.content except: - description = "" + description = "(no description)" return description From 03c6d3a2f3b3101c590bccae91623e9d55cf21d6 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Mon, 3 Feb 2025 09:36:39 +0100 Subject: [PATCH 08/34] Parameterize layer name and descriptions --- arcana/filters.py | 188 ++++++++++++++++++++++++++------------------ arcana/templates.py | 10 +-- 2 
files changed, 112 insertions(+), 86 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index b93c5cf..34962da 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -1,22 +1,20 @@ -from collections.abc import Iterable -from io import TextIOWrapper import json import os import re import subprocess import sys import time -from itertools import combinations -from collections import Counter, defaultdict -from typing import Dict, Any, Tuple, List - -from openai import OpenAI -from tqdm.auto import tqdm from arcana import templates from arcanalib.graph import Edge, Graph, Node, triplets, invert, lift from arcanalib.pipefilter import Filter, Seeder - +from collections import Counter, defaultdict +from collections.abc import Iterable +from io import TextIOWrapper +from itertools import combinations +from openai import OpenAI +from tqdm.auto import tqdm +from typing import Dict, Any, Tuple, List def remove_author(s): @@ -83,6 +81,18 @@ def prettify_json(obj: dict) -> str: return json.dumps(obj, indent='\t') +def layers_to_list(d): + result = [] + i = 1 + while True: + name_key, desc_key = f"layer{i}name", f"layer{i}desc" + if name_key not in d or desc_key not in d: + break + result.append((d[name_key], d[desc_key])) + i += 1 + return result + + class CLISeeder(Seeder): def __init__(self, command) -> None: @@ -93,8 +103,6 @@ def __init__(self, command) -> None: """ self.command = command - # sys.stderr.write(f"Command: {self.command}\n") - def generate(self) -> Graph: """ Execute the command, parse the JSON output into a dict, and pass the dict to the Graph constructor. 
@@ -211,12 +219,28 @@ def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], L grouped_paths[(start, end)].append(path) return grouped_paths +def format_layers(layers): + return "\n".join(f"- **{name}**: {desc}" for name, desc in layers) + class LLMFilter(Filter): def __init__(self, config: Dict[str, Dict[str, Any]]): super().__init__(config) self.project_name = None self.project_desc = None self.openai_client_args = None + + layers = layers_to_list(config["layers"]) if "layers" in config else None + if not layers: + layers = [ + ("Presentation Layer", "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), + ("Service Layer", "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), + ("Domain Layer", "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), + ("Data Source Layer", "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), + ] + self.layers = layers + self.layers_text = format_layers(layers) + + def process(self, data: Graph) -> Graph: """ @@ -230,6 +254,14 @@ def process(self, data: Graph) -> Graph: """ self.project_name, self.project_desc, self.openai_client_args, model, client = self.setup() timestr = time.strftime("%Y%m%d-%H%M%S") + + for i,(name,desc) in enumerate(self.layers): + data.add_node(f"layer:{name}", "Grouping", kind="architectural layer", simpleName=name, qualifiedName=name, description=desc, layerOrder=i) + + for i in range(len(self.layers)-1): + src = self.layers[i][0] + tgt = self.layers[i+1][0] + data.add_edge(f"layer:{src}", f"layer:{tgt}", "allowedDependency", weight=1) with open(f'arcana-{timestr}.jsonl', 'a', encoding="utf-8") as jsonl_file: with 
open(f'arcana-{timestr}.log', 'a', encoding="utf-8") as log_file: @@ -264,19 +296,12 @@ def describe(self, node: dict, *keys) -> str: lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: lines['docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr,'').replace(sn,' '))} " - # print(lines) + return ' '.join(lines[key] for key in keys if key in lines) def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" - # all_method_ids = [ node.id - # for node - # in (data.find_nodes(label="Operation") + data.find_nodes(label="Constructor")) ] - # independent_method_ids = [ node_id - # for node_id - # in all_method_ids - # if node_id not in [edge.source for edge in data.find_edges(label="invokes")] ] st_contains_st = data.find_edges(label='contains',source_label='Structure',target_label='Structure') ct_contains_st = data.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) new_ct_sources = {edge.target:data.find_source(data.edges['contains'],data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} @@ -332,12 +357,6 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, class_kind = clasz.properties['kind'] class_kind = 'enum' if class_kind == 'enumeration' else 'abstract class' if class_kind == 'abstract' else class_kind - # for met_id in tqdm(cls_data, desc='Processing methods', position=2, leave=False): - # self.process_method(data, client, model, file, met_id, class_name, class_kind) - - # if os.path.exists('stop'): - # 
raise StopIteration - self.process_class(data, client, model, jsonl_file, log_file, cls_id, clasz, class_name, class_kind, cls_data) if os.path.exists('stop'): @@ -355,11 +374,6 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, paths = data.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") path_groups = group_paths_by_endpoints(paths) - # for k, v in path_groups.items(): - # print(k) - # for i in v: - # print("*", [(edge.source, edge.target) for edge in i]) - pkg_pairs = list(combinations(sorted_pkg_ids,2)) for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): pkg1 = data.nodes[pkg1_id] @@ -370,52 +384,6 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, if path_groups[(pkg2_id,pkg1_id)]: self.process_interactions(data, client, model, pkg2, pkg1, path_groups[(pkg2_id,pkg1_id)], hierarchy, jsonl_file, log_file) - def process_interactions(self, data: Graph, client: OpenAI, model: str, pkg1: Node, pkg2: Node, path_groups: List[Edge], hierarchy, jsonl_file: TextIOWrapper, log_file: TextIOWrapper): - pkg1_name = pkg1.properties["qualifiedName"] - pkg2_name = pkg2.properties["qualifiedName"] - pkg1_desc = pkg1.properties["description"] - pkg2_desc = pkg2.properties["description"] - - pkg1_data = hierarchy.get(pkg1.id, dict()) - pkg2_data = hierarchy.get(pkg2.id, dict()) - - cls1_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg1_data.items()) - cls2_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg2_data.items()) - - def describe_path(path): - src_cls = data.nodes[path[1].source] - src_mth = data.nodes[path[1].target] - tgt_mth = data.nodes[path[-2].source] - tgt_cls = data.nodes[path[-2].target] - return f"Method `{src_mth.properties['simpleName']}` 
({src_mth.properties['description']}) of class `{src_cls.properties['qualifiedName']}` invokes method `{tgt_mth.properties['simpleName']}` ({tgt_mth.properties['description']}) of class `{tgt_cls.properties['qualifiedName']}`." - - dep_info = f" - Dependencies from `{pkg1_name}` to `{pkg2_name}`:\n" + "\n".join(f" - {describe_path(path)}" for path in path_groups) if path_groups else "" - - prompt = templates.interaction_analysis.format( - project_name=self.project_name, - project_desc=self.project_desc, - pkg1_name=pkg1_name, - pkg2_name=pkg2_name, - pkg1_desc=pkg1_desc, - pkg2_desc=pkg2_desc, - cls1_info=cls1_info, - cls2_info=cls2_info, - dep_info=dep_info - ) - - log_file.write(prompt) - log_file.write('\n\n======\n\n') - - description = self.generate_text_description(client, model, prompt) - pkg1_edge = Edge(source=pkg1.id, target=pkg2.id, label="dependsOn", description=description) if dep_info else None - - if pkg1_edge: - if "dependsOn" not in data.edges: - data.edges["dependsOn"] = [] - data.edges["dependsOn"].append(pkg1_edge) - jsonl_file.write(json.dumps(pkg1_edge.to_dict(), cls=CustomJSONEncoder)) - jsonl_file.write('\n') - def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: TextIOWrapper, log_file: TextIOWrapper, met_id: str, class_name: str, class_kind: str, node_deps: dict): """Process a single method and generate its description.""" @@ -432,7 +400,8 @@ def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: Te op_src=method_src, other_ops="(none)" if not node_deps[met_id] else "\n".join(f"- `{data.nodes[node_id].properties['simpleName']}`: {self.describe(data.nodes[node_id], 'description', 'returns', 'docComment')}" for node_id in node_deps[met_id]), project_name=self.project_name, - project_desc=self.project_desc + project_desc=self.project_desc, + layers=self.layers_text ) log_file.write(prompt) @@ -441,6 +410,14 @@ def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: Te 
description = self.generate_json_description(client, model, prompt) self.update_method_properties(data, description, method) + layer_id = None + if method.has_property("layer") and \ + method.property("layer") in [name for name, _ in self.layers]: + layer_id = f"layer:{method.property('layer')}" + layer_node = data.find_node(label="Grouping", where=lambda node: node.id == layer_id) + if layer_node: + data.add_edge(method.id, layer_node.id, "implements", weight=1) + jsonl_file.write(json.dumps({ 'data': { 'id': method.id, @@ -492,7 +469,8 @@ def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, l classes="(none)" if not classes_descriptions else "\n".join(classes_descriptions), packages="(none)" if not package_descriptions else "\n".join(package_descriptions), project_name=self.project_name, - project_desc=self.project_desc + project_desc=self.project_desc, + layers=self.layers_text ) log_file.write(prompt) @@ -501,6 +479,14 @@ def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, l description = self.generate_json_description(client, model, prompt) self.update_package_properties(data, description, package) + layer_id = None + if package.has_property("layer") and \ + package.property("layer") in [name for name, _ in self.layers]: + layer_id = f"layer:{package.property('layer')}" + layer_node = data.find_node(label="Grouping", where=lambda node: node.id == layer_id) + if layer_node: + data.add_edge(package.id, layer_node.id, "implements", weight=1) + jsonl_file.write(json.dumps({ 'data': { 'id': package.id, @@ -508,6 +494,52 @@ def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, l 'properties': description}})) jsonl_file.write('\n') + def process_interactions(self, data: Graph, client: OpenAI, model: str, pkg1: Node, pkg2: Node, path_groups: List[Edge], hierarchy, jsonl_file: TextIOWrapper, log_file: TextIOWrapper): + pkg1_name = pkg1.properties["qualifiedName"] + pkg2_name = 
pkg2.properties["qualifiedName"] + pkg1_desc = pkg1.properties["description"] + pkg2_desc = pkg2.properties["description"] + + pkg1_data = hierarchy.get(pkg1.id, dict()) + pkg2_data = hierarchy.get(pkg2.id, dict()) + + cls1_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg1_data.items()) + cls2_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg2_data.items()) + + def describe_path(path): + src_cls = data.nodes[path[1].source] + src_mth = data.nodes[path[1].target] + tgt_mth = data.nodes[path[-2].source] + tgt_cls = data.nodes[path[-2].target] + return f"Method `{src_mth.properties['simpleName']}` ({src_mth.properties['description']}) of class `{src_cls.properties['qualifiedName']}` invokes method `{tgt_mth.properties['simpleName']}` ({tgt_mth.properties['description']}) of class `{tgt_cls.properties['qualifiedName']}`." + + dep_info = f" - Dependencies from `{pkg1_name}` to `{pkg2_name}`:\n" + "\n".join(f" - {describe_path(path)}" for path in path_groups) if path_groups else "" + + prompt = templates.interaction_analysis.format( + project_name=self.project_name, + project_desc=self.project_desc, + pkg1_name=pkg1_name, + pkg2_name=pkg2_name, + pkg1_desc=pkg1_desc, + pkg2_desc=pkg2_desc, + cls1_info=cls1_info, + cls2_info=cls2_info, + dep_info=dep_info + ) + + log_file.write(prompt) + log_file.write('\n\n======\n\n') + + description = self.generate_text_description(client, model, prompt) + pkg1_edge = Edge(source=pkg1.id, target=pkg2.id, label="dependsOn", description=description) if dep_info else None + + if pkg1_edge: + if "dependsOn" not in data.edges: + data.edges["dependsOn"] = [] + data.edges["dependsOn"].append(pkg1_edge) + jsonl_file.write(json.dumps(pkg1_edge.to_dict(), cls=CustomJSONEncoder)) + jsonl_file.write('\n') + def generate_json_description(self, client: OpenAI, model: str, prompt: 
str) -> dict: """Generate a description using the OpenAI client.""" try: diff --git a/arcana/templates.py b/arcana/templates.py index c3396a3..2e68fd1 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -25,10 +25,7 @@ For the `layer`, fill the value with one of the following architectural layer which functionality is exhibited by the method source code: -- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. -- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. -- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. -- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. +{layers} In `layerReason`, explain why this method fits your layer of choice but not the other layers. @@ -90,10 +87,7 @@ For the `layer`, consider the functionalities of architectural layers below: -- **Presentation Layer**: Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views. Typically (but not only) contains User Interfacers. -- **Service Layer**: Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI. Typically (but not only) contains Coordinators and (Application) Controllers. -- **Domain Layer**: Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations. Typically (but not only) contains Information Holders, Service Providers, Structurers, Coordinators, and (Domain) Controllers. 
-- **Data Source Layer**: Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity. Typically (but not only) contains External Interfacers. +{layers} In `layerReason`, explain why this package fits your layer of choice but not the other layers. From 9af24af24bbcab114e9a29ecd52afd19aa98fa38 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Mon, 3 Feb 2025 09:38:30 +0100 Subject: [PATCH 09/34] Update config example --- config.ini.example | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/config.ini.example b/config.ini.example index f6b02cb..0dcf0cb 100644 --- a/config.ini.example +++ b/config.ini.example @@ -13,3 +13,11 @@ model=llama3 command={javaexe} -jar {jarfile} -i {input} -a -n {name} -f json javaexe=./javapers/jdk-17.0.11+9-jre/bin/java.exe jarfile=./javapers/javapers-1.1.2-jar-with-dependencies.jar + +[layers] +layer1name=UI +layer1desc=Handles user interface, such as instatiating, setting properties of, or laying out widget objects and capturing user interactions. +layer2name=Logic +layer2desc=Handles application and domain logic, i.e., neither UI nor data access. +layer3name=Data +layer3desc=Handles data access, e.g., managing database connections, querying databases, reading/writing files, and invoking web services. 
From fec7e1b23932fa646e3f6886544556fee0ae43ce Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 18 Feb 2025 09:52:52 +0100 Subject: [PATCH 10/34] Use tool calling mode to constrain response format --- arcana/filters.py | 413 +++++++++++++++++++++++++++----------------- arcana/templates.py | 296 +++++++++++++++++++++---------- arcanalib/graph.py | 29 +++- config.ini.example | 4 +- 4 files changed, 481 insertions(+), 261 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index 34962da..75fd0bb 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -8,7 +8,7 @@ from arcana import templates from arcanalib.graph import Edge, Graph, Node, triplets, invert, lift from arcanalib.pipefilter import Filter, Seeder -from collections import Counter, defaultdict +from collections import Counter, defaultdict, OrderedDict from collections.abc import Iterable from io import TextIOWrapper from itertools import combinations @@ -114,7 +114,8 @@ def generate(self) -> Graph: self.command, capture_output=True, text=True, - encoding="utf-8" + shell=True, + encoding="utf-8" ) sys.stderr.write(process.stderr) @@ -275,7 +276,7 @@ def process(self, data: Graph) -> Graph: def setup(self): """Setup necessary configuration and client.""" project_name = self.config['project']['name'] - project_desc = self.config['project']['desc'] + project_desc = sentence(self.config['project']['desc']) openai_client_args = { 'api_key': self.config['llm'].get('apikey'), @@ -293,11 +294,11 @@ def describe(self, node: dict, *keys) -> str: if not keys: keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'stereotype', 'roleStereotype', 'layer'] - lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))} " for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} + lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))}" for key in keys 
if key in node.properties and key != 'docComment' and node.properties[key]} if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: lines['docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr,'').replace(sn,' '))} " - return ' '.join(lines[key] for key in keys if key in lines) + return ' '.join(lines[key] for key in keys if key in lines).strip() def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" @@ -313,17 +314,15 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, # print('######################################################################') sorted_method_ids, method_deps = data.toposorted_nodes(data.edges['invokes']) # print(sorted_method_ids) - + counter = 0 for met_id in tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): cls_id, pkg_id = met_to_cls_pkg[met_id] + method = data.nodes[met_id] clasz = data.nodes[cls_id] - class_name = clasz.properties['qualifiedName'] - class_kind = clasz.properties['kind'] - class_kind = 'enum' if class_kind == 'enumeration' else 'abstract class' if class_kind == 'abstract' else class_kind - self.process_method(data, client, model, jsonl_file, log_file, met_id, class_name, class_kind, method_deps) + self.process_script(data, client, model, jsonl_file, log_file, method, clasz, method_deps) if os.path.exists('stop'): raise StopIteration @@ -334,18 +333,9 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, jsonl_file.flush() counter %= 10 - hierarchy = build_hierarchy(triplets) - # print(hierarchy) - # print('==========') - sorted_pkg_ids, pkg_deps = data.toposorted_nodes(data.find_edges(label='contains',where_source=lambda node: 'Structure' not in node.labels,where_target=lambda node: 'Structure' not in node.labels)) - # print(sorted_pkg_ids) - # 
print('==========') - - # print(pkg_deps) - # print('==========') - + for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): pkg_data = hierarchy.get(pkg_id, dict()) package = data.nodes[pkg_id] @@ -353,17 +343,12 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=2, leave=False): clasz = data.nodes[cls_id] - class_name = clasz.properties['qualifiedName'] - class_kind = clasz.properties['kind'] - class_kind = 'enum' if class_kind == 'enumeration' else 'abstract class' if class_kind == 'abstract' else class_kind - - self.process_class(data, client, model, jsonl_file, log_file, cls_id, clasz, class_name, class_kind, cls_data) + self.process_structure(data, client, model, jsonl_file, log_file, clasz, cls_data) if os.path.exists('stop'): raise StopIteration - self.process_package(data, client, model, jsonl_file, log_file, pkg_id, package, pkg_data, pkg_deps) - + self.process_component(data, client, model, jsonl_file, log_file, package, pkg_data, pkg_deps) log_file.flush() jsonl_file.flush() @@ -376,6 +361,10 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, pkg_pairs = list(combinations(sorted_pkg_ids,2)) for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): + + if os.path.exists('stop'): + raise StopIteration + pkg1 = data.nodes[pkg1_id] pkg2 = data.nodes[pkg2_id] if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): @@ -384,146 +373,183 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, if path_groups[(pkg2_id,pkg1_id)]: self.process_interactions(data, client, model, pkg2, pkg1, path_groups[(pkg2_id,pkg1_id)], hierarchy, jsonl_file, log_file) - def process_method(self, data: Graph, client: OpenAI, model: str, jsonl_file: TextIOWrapper, log_file: TextIOWrapper, met_id: str, class_name: str, - 
class_kind: str, node_deps: dict): + def compose_prompt(self, p, function_parameters): + prompt = p + for k,v in function_parameters.items(): + if isinstance(v, dict) and len(v): + prompt += f"## {k}\n\n" + for k1,v1 in v.items(): + if v1: + prompt += f"* {k1}: {str(v1)}\n" + prompt += "\n\n" + elif isinstance(v, list) and len(v): + prompt += f"## {k}\n\n" + for v1 in v: + if v1: + prompt += f"* {str(v1)}\n" + prompt += "\n\n" + elif v: + prompt += f"## {k}\n\n{str(v)}\n\n" + return prompt.strip() + + def process_script(self, graph: Graph, client: OpenAI, model: str, jsonl_file: TextIOWrapper, log_file: TextIOWrapper, script: Node, structure: Node, node_deps: dict): """Process a single method and generate its description.""" - method = data.nodes[met_id] - - if 'description' not in method.properties or not method.properties['description']: - method_name = method.properties['simpleName'] - method_src = remove_java_comments(method.properties['sourceText']) - - prompt = templates.script_analysis.format( - op_name=method_name, - struct_kind=class_kind, - struct_name=class_name, - op_src=method_src, - other_ops="(none)" if not node_deps[met_id] else "\n".join(f"- `{data.nodes[node_id].properties['simpleName']}`: {self.describe(data.nodes[node_id], 'description', 'returns', 'docComment')}" for node_id in node_deps[met_id]), - project_name=self.project_name, - project_desc=self.project_desc, - layers=self.layers_text - ) + + if 'description' not in script.properties or not script.properties['description'] or script.properties['description'] == "(no description)": + script_name = script.properties['simpleName'] + script_src = remove_java_comments(script.properties['sourceText']) + script_kind = script.properties.get('kind', 'function') + + structure_name = structure.properties['qualifiedName'] + structure_kind = structure.properties['kind'] + structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind 
+ + prompt = f"Describe the following {script_kind} by using the AnalyzeScript tool.\n\n" + script_parameters = OrderedDict() + script_parameters["Project Name"] = self.project_name + script_parameters["Project Description"] = self.project_desc + script_parameters[f"{script_kind.title()} Declaration"] = f"The {script_kind} {script_name} is declared within the {structure_kind} {structure_name}." + script_parameters[f"{script_kind.title()} Source Code"] = script_src + script_parameters[f"Other Functions/Methods Used"] = { + graph.nodes[node_id].properties['qualifiedName']: f"{self.describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" + for node_id in node_deps[script.id] + } + script_parameters["Possible Architectural Layers"] = dict(self.layers) + + prompt = self.compose_prompt(prompt, script_parameters) log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_json_description(client, model, prompt) - self.update_method_properties(data, description, method) + description = self.generate_json_description(client, model, prompt, "AnalyzeScript") + self.update_method_properties(graph, description, script) layer_id = None - if method.has_property("layer") and \ - method.property("layer") in [name for name, _ in self.layers]: - layer_id = f"layer:{method.property('layer')}" - layer_node = data.find_node(label="Grouping", where=lambda node: node.id == layer_id) + if script.has_property("layer") and \ + script.property("layer") in [name for name, _ in self.layers]: + layer_id = f"layer:{script.property('layer')}" + layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) if layer_node: - data.add_edge(method.id, layer_node.id, "implements", weight=1) + graph.add_edge(script.id, layer_node.id, "implements", weight=1) jsonl_file.write(json.dumps({ 'data': { - 'id': method.id, - 'labels': method.labels, + 'id': script.id, + 'labels': script.labels, 'properties': description } }, 
cls=CustomJSONEncoder)) jsonl_file.write('\n') - def process_class(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file, cls_id: str, clasz: dict, class_name: str, - class_kind: str, cls_data: list): + def process_structure(self, graph: Graph, client: OpenAI, model: str, jsonl_file, log_file, structure: Node, structure_scripts: list): """Process a single class and generate its description.""" - ancestors, fields = self.get_class_relations(data, cls_id) - methods_descriptions = self.get_methods_descriptions(data, cls_data) - - prompt = templates.structure_analysis.format( - struct_type=class_kind, - struct_name=class_name, - ancestors="\n".join([f"- `{ancestor}`" for ancestor in ancestors]) if ancestors else "(none)", - fields="\n".join([f"- `{field}`" for field in fields]) if fields else "(none)", - methods="\n".join(methods_descriptions) if methods_descriptions else "(none)", - project_name=self.project_name, - project_desc=self.project_desc - ) + # if 'description' not in structure.properties or not structure.properties['description'] or structure.properties['description'] == "(no description)": + ancestors, variables = self.get_structure_relations(graph, structure.id) + script_descriptions = self.get_script_descriptions(graph, structure_scripts) + + structure_name = structure.properties['qualifiedName'] + structure_kind = structure.properties['kind'] + structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind + + prompt = f"Describe the following {structure_kind} using the AnalyzeStructure tool.\n\n" + structure_parameters = OrderedDict() + structure_parameters["Project Name"] = self.project_name + structure_parameters["Project Description"] = self.project_desc + structure_parameters[f"{structure_kind.title()} Name"] = structure_name + structure_parameters[f"{structure_kind.title()} Inhertis From"] = ancestors + structure_parameters[f"Enclosed Variables/Fields"] = variables 
+ structure_parameters[f"Enclosed Functions/Methods"] = script_descriptions + + prompt = self.compose_prompt(prompt, structure_parameters) + log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_json_description(client, model, prompt) - self.update_class_properties(data, description, clasz) + description = self.generate_json_description(client, model, prompt, "AnalyzeStructure") + self.update_class_properties(graph, description, structure) jsonl_file.write(json.dumps({ 'data': { - 'id': clasz.id, - 'labels': list(clasz.labels), + 'id': structure.id, + 'labels': list(structure.labels), 'properties': description } })) jsonl_file.write('\n') - def process_package(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file, pkg_id: str, package: dict, - pkg_data: dict, pkg_deps: dict): + def process_component(self, graph: Graph, client: OpenAI, model: str, jsonl_file, log_file, component: Node, + component_contents: dict, component_deps: dict): """Process a single package and generate its description.""" - classes_descriptions = self.get_classes_descriptions(data, pkg_data) - package_descriptions = self.get_packages_descriptions(data, pkg_deps[pkg_id]) - - prompt = templates.component_analysis.format( - pkg_name=package.properties['qualifiedName'], - classes="(none)" if not classes_descriptions else "\n".join(classes_descriptions), - packages="(none)" if not package_descriptions else "\n".join(package_descriptions), - project_name=self.project_name, - project_desc=self.project_desc, - layers=self.layers_text - ) + + # if 'description' not in component.properties or not component.properties['description'] or component.properties['description'] == "(no description)": + structure_descriptions = self.get_structure_descriptions(graph, component_contents) + subcomponent_descriptions = self.get_component_descriptions(graph, component_deps[component.id]) + component_kind = component.properties.get('kind', "component") + + prompt = 
f"Describe the following {component_kind} using the AnalyzeComponent tool.\n\n" + component_parameters = OrderedDict() + component_parameters["Project Name"] = self.project_name + component_parameters["Project Description"] = self.project_desc + component_parameters["Component Type"] = component_kind + component_parameters["Component Name"] = component.properties['qualifiedName'] + component_parameters["Enclosed Sub-components"] = subcomponent_descriptions + component_parameters["Enclosed Classes"] = structure_descriptions + component_parameters["Possible Architectural Layers"] = dict(self.layers) + + prompt = self.compose_prompt(prompt, component_parameters) log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_json_description(client, model, prompt) - self.update_package_properties(data, description, package) + description = self.generate_json_description(client, model, prompt, "AnalyzeComponent") + self.update_package_properties(graph, description, component) layer_id = None - if package.has_property("layer") and \ - package.property("layer") in [name for name, _ in self.layers]: - layer_id = f"layer:{package.property('layer')}" - layer_node = data.find_node(label="Grouping", where=lambda node: node.id == layer_id) + if component.has_property("layer") and \ + component.property("layer") in [name for name, _ in self.layers]: + layer_id = f"layer:{component.property('layer')}" + layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) if layer_node: - data.add_edge(package.id, layer_node.id, "implements", weight=1) + graph.add_edge(component.id, layer_node.id, "implements", weight=1) jsonl_file.write(json.dumps({ 'data': { - 'id': package.id, - 'labels': list(package.labels), + 'id': component.id, + 'labels': list(component.labels), 'properties': description}})) jsonl_file.write('\n') - def process_interactions(self, data: Graph, client: OpenAI, model: str, pkg1: Node, pkg2: Node, path_groups: 
List[Edge], hierarchy, jsonl_file: TextIOWrapper, log_file: TextIOWrapper): - pkg1_name = pkg1.properties["qualifiedName"] - pkg2_name = pkg2.properties["qualifiedName"] - pkg1_desc = pkg1.properties["description"] - pkg2_desc = pkg2.properties["description"] + def process_interactions(self, graph: Graph, client: OpenAI, model: str, c1: Node, c2: Node, path_groups: List[Edge], hierarchy, jsonl_file: TextIOWrapper, log_file: TextIOWrapper): + c1_name = c1.properties["qualifiedName"] + c2_name = c2.properties["qualifiedName"] + c1_desc = c1.properties["description"] + c2_desc = c2.properties["description"] - pkg1_data = hierarchy.get(pkg1.id, dict()) - pkg2_data = hierarchy.get(pkg2.id, dict()) + c1_contents = hierarchy.get(c1.id, dict()) + c2_contents = hierarchy.get(c2.id, dict()) - cls1_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg1_data.items()) - cls2_info = "\n".join(f" - `{data.nodes[c_id].properties['simpleName']}`: {data.nodes[c_id].properties['description']}" for c_id, _ in pkg2_data.items()) + c1_structure_info = "\n".join(f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" for c_id, _ in c1_contents.items()) + c2_structure_info = "\n".join(f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" for c_id, _ in c2_contents.items()) def describe_path(path): - src_cls = data.nodes[path[1].source] - src_mth = data.nodes[path[1].target] - tgt_mth = data.nodes[path[-2].source] - tgt_cls = data.nodes[path[-2].target] - return f"Method `{src_mth.properties['simpleName']}` ({src_mth.properties['description']}) of class `{src_cls.properties['qualifiedName']}` invokes method `{tgt_mth.properties['simpleName']}` ({tgt_mth.properties['description']}) of class `{tgt_cls.properties['qualifiedName']}`." 
+ src_structure = graph.nodes[path[1].source] + src_method = graph.nodes[path[1].target] + tgt_method = graph.nodes[path[-2].source] + tgt_structure = graph.nodes[path[-2].target] + return f"{src_method.properties['kind'].capitalize()} `{src_method.properties['simpleName']}` ({src_method.properties['description']}) of {src_structure.properties['kind']} `{src_structure.properties['qualifiedName']}` invokes {tgt_method.properties['kind']} `{tgt_method.properties['simpleName']}` ({tgt_method.properties['description']}) of {tgt_structure.properties['kind']} `{tgt_structure.properties['qualifiedName']}`." - dep_info = f" - Dependencies from `{pkg1_name}` to `{pkg2_name}`:\n" + "\n".join(f" - {describe_path(path)}" for path in path_groups) if path_groups else "" + dep_info = f" - Dependencies from `{c1_name}` to `{c2_name}`:\n" + "\n".join(f" - {describe_path(path)}" for path in path_groups) if path_groups else "" prompt = templates.interaction_analysis.format( project_name=self.project_name, project_desc=self.project_desc, - pkg1_name=pkg1_name, - pkg2_name=pkg2_name, - pkg1_desc=pkg1_desc, - pkg2_desc=pkg2_desc, - cls1_info=cls1_info, - cls2_info=cls2_info, + pkg1_name=c1_name, + pkg2_name=c2_name, + pkg1_desc=c1_desc, + pkg2_desc=c2_desc, + cls1_info=c1_structure_info, + cls2_info=c2_structure_info, dep_info=dep_info ) @@ -531,33 +557,62 @@ def describe_path(path): log_file.write('\n\n======\n\n') description = self.generate_text_description(client, model, prompt) - pkg1_edge = Edge(source=pkg1.id, target=pkg2.id, label="dependsOn", description=description) if dep_info else None + pkg1_edge = Edge(source=c1.id, target=c2.id, label="dependsOn", description=description) if dep_info else None if pkg1_edge: - if "dependsOn" not in data.edges: - data.edges["dependsOn"] = [] - data.edges["dependsOn"].append(pkg1_edge) + if "dependsOn" not in graph.edges: + graph.edges["dependsOn"] = [] + graph.edges["dependsOn"].append(pkg1_edge) 
jsonl_file.write(json.dumps(pkg1_edge.to_dict(), cls=CustomJSONEncoder)) jsonl_file.write('\n') - def generate_json_description(self, client: OpenAI, model: str, prompt: str) -> dict: + def generate_json_description(self, client: OpenAI, model: str, prompt: str = None, tool: str = None) -> dict: """Generate a description using the OpenAI client.""" try: - response = client.chat.completions.create( - model=model, - response_format={"type": "json_object"}, - messages=[{"role": "user", "content": prompt}], - max_tokens=1024, - temperature=0 - ) - description = response.choices[0].message.content - except: - description = '{}' + if tool: + print(prompt) + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are a software architecture analysis tool."}, + {"role": "user", "content": prompt} + ], + tools=[templates.analyze_script_tool, templates.analyze_structure_tool, templates.analyze_component_tool], + tool_choice={"name": tool}, + temperature=0, + seed=42, + timeout=float(self.config['llm'].get('timeout', 300)) + ) + print(response) + + tool_calls = response.choices[0].message.tool_calls + + if tool_calls: + args_str = tool_calls[0].function.arguments + description = json.loads(args_str) + else: + content = response.choices[0].message.content + json_content = find_first_valid_json(content) + if json_content: + description = json.loads(json_content) + else: + description = dict() - try: - description = json.loads(description) + else: + response = client.chat.completions.create( + model=model, + response_format={"type": "json_object"}, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096, + temperature=0, + seed=42, + timeout=float(self.config['llm'].get('timeout', 300)) + ) + + content = response.choices[0].message.content + description = json.loads(content) except: - description = dict() + description = {} if 'description' not in description: description['description'] = "(no description)" @@ 
-570,7 +625,9 @@ def generate_text_description(self, client: OpenAI, model: str, prompt: str) -> model=model, messages=[{"role": "user", "content": prompt}], max_tokens=4096, - temperature=0 + temperature=0, + seed=42, + timeout=float(self.config['llm'].get('timeout', 300)) ) description = response.choices[0].message.content except: @@ -581,7 +638,6 @@ def generate_text_description(self, client: OpenAI, model: str, prompt: str) -> def update_method_properties(self, data: Graph, description: dict, method: Node): """Update method properties with the generated description.""" - for key, value in description.items(): if key.endswith('Reason'): continue @@ -593,15 +649,16 @@ def update_method_properties(self, data: Graph, description: dict, method: Node) if edge.source == method.id ] for param in value: - matching_params = [ - node - for node in param_nodes - if node.properties['simpleName'] == param['name'] - ] - if matching_params: - param_node_id = matching_params[0].id - if param_node_id in data.nodes: - data.nodes[param_node_id].properties['description'] = param.get('description') + if isinstance(param, dict): + matching_params = [ + node + for node in param_nodes + if node.properties['simpleName'] == param.get('name') + ] + if matching_params: + param_node_id = matching_params[0].id + if param_node_id in data.nodes: + data.nodes[param_node_id].properties['description'] = param.get('description') # elif key_lower == 'returns': # method.properties['returns'] = value.get('description', None) if value and hasattr(value, 'get') else None else: @@ -619,45 +676,77 @@ def update_package_properties(self, data: Graph, description: dict, package: Nod if not key.endswith('Reason'): package.properties[lower1(key)] = description[key] - def get_class_relations(self, data: Graph, cls_id: str) -> tuple: + def get_structure_relations(self, data: Graph, cls_id: str) -> tuple: """Retrieve class ancestors and fields.""" - ancestors = { - edge.target + ancestors = list({ + 
data.nodes[edge.target].property("qualifiedName") for edge in data.edges['specializes'] if edge.source == cls_id - } + }) fields = { - edge.target + data.nodes[edge.target] for edge in data.edges['hasVariable'] if edge.source == cls_id } fields = [ - ' '.join(remove_java_comments(data.nodes[field].properties['sourceText']).split()) + ' '.join(remove_java_comments(field.properties['sourceText']).split()) for field in fields ] return ancestors, fields - def get_methods_descriptions(self, data: Graph, cls_data: list) -> list: + def get_script_descriptions(self, data: Graph, cls_data: list) -> dict: """Generate descriptions for methods.""" - return [ - f"- `{data.nodes[met_id].properties['simpleName']}`: {self.describe(data.nodes[met_id])}" + return { + data.nodes[met_id].properties['simpleName']: self.describe(data.nodes[met_id]) for met_id in cls_data - ] + } - def get_classes_descriptions(self, data: Graph, pkg_data: dict) -> list: + def get_structure_descriptions(self, data: Graph, pkg_data: dict) -> list: """Generate descriptions for classes.""" - return [ - f"- {data.nodes[cls_id].properties['kind']} `{data.nodes[cls_id].properties['qualifiedName']}`: {self.describe(data.nodes[cls_id])}" + return { + f"{data.nodes[cls_id].properties['kind']} {data.nodes[cls_id].properties['qualifiedName']}": self.describe(data.nodes[cls_id]) for cls_id, _ in pkg_data.items() - ] + } - def get_packages_descriptions(self, data: Graph, package_ids: list) -> list: + def get_component_descriptions(self, data: Graph, package_ids: list) -> list: """Generate descriptions for packages.""" - return [ - f"- `{data.nodes[pkg_id].properties['qualifiedName']}`: {self.describe(data.nodes[pkg_id])}" + return { + data.nodes[pkg_id].properties['qualifiedName']: self.describe(data.nodes[pkg_id]) for pkg_id in package_ids - ] + } +def find_first_valid_json(text: str) -> str: + """ + Finds the first valid JSON substring in the given text using a stack-based approach. 
+ + It scans the text from left to right, and when it encounters a '{', it tracks the balanced + braces until a complete JSON object is formed. Once a candidate is found, it attempts to parse + it with json.loads(). If parsing succeeds, that candidate is returned immediately. + + Args: + text (str): The input string that may contain a JSON object. + + Returns: + str: The first valid JSON substring found, or an empty string if none is found. + """ + n = len(text) + for i in range(n): + if text[i] == '{': + stack = 0 + for j in range(i, n): + if text[j] == '{': + stack += 1 + elif text[j] == '}': + stack -= 1 + if stack == 0: + candidate = text[i:j+1] + try: + json.loads(candidate) + return candidate + except json.JSONDecodeError: + # If this candidate isn't valid JSON, break and continue scanning. + break + return "" def merge_node_properties(dict1: Dict[str, Node], dict2: Dict[str, Node], simplify_names=False): for id2, obj2 in dict2.items(): diff --git a/arcana/templates.py b/arcana/templates.py index 2e68fd1..6dd53dc 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -1,97 +1,205 @@ -script_analysis = '''Consider a project {project_name}, {project_desc}. This is method `{op_name}` of {struct_kind} `{struct_name}`: - -```java -{op_src} -``` - -The method may use the following other methods: - -{other_ops} - -Explain the above method on the following aspects: - -{{ description: "Describe the functionality of the method in one sentence.", - parameters: [ {{ name:..., type:..., description:... }}, ... ], // empty list if there is no parameter - returns: "Describe the returned object/value in one sentence. 
(In case of a constructor, consider the newly created instance as the return value.)" - reason: "Explain the reason why the method is provided or the design rationale of the method, in one sentence.", - howToUse: "Describe the usage or the expected set-up of using the method, in less than 3 sentences.", - howItWorks: "Describe the implementation details of the method, in less than 5 sentences.", - assertions: {{ preConditions: ["pre-conditions of the method", ...], postConditions: ["pre-conditions of the method", ...] }}, - stereotype: one of "Accessor", "Mutator", "Creational", "Collaborational", or "Other", - stereotypeReason: "Explain the rationale of the stereotype choice", - layer:..., - layerReason:... -}} - -For the `layer`, fill the value with one of the following architectural layer which functionality is exhibited by the method source code: - -{layers} - -In `layerReason`, explain why this method fits your layer of choice but not the other layers. - -Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values.''' - -structure_analysis = '''Consider a project {project_name}, {project_desc}. A {struct_type} `{struct_name}` specializes the following class(es) or interface(s): - -{ancestors} - -This {struct_type} contains the following field(s) and method(s): - -Fields: - -{fields} - -Methods: - -{methods} - -Explain the above {struct_type} on the following aspects: - -{{ description: "Describe the key responsibilities of the {struct_type} in up to three sentences.", - keywords: ["list", "of", "keywords", "relevant", "to", "the", "{struct_type}"], // try to have nouns as well as verb keywords - roleStereotype:..., - roleStereotypeReason:... }} - -When describing the responsibilities, consider that a responsibility can be fulfilled by a group of methods within the {struct_type}. In other words, an intermediate step for describing the {struct_type} is to cluster its methods into a few method responsibility-type. 
- -For the `roleStereotype`, fill the value with one of the following role stereotypes which responsibility is exhibited by the {struct_type}: - -- **Information Holder** is responsible for knowing facts and providing information to other objects. POJOs and Java Beans are usually information holders. -- **Service Provider** is responsible for handling requests and performing specific services. It usually implements a specific interface with a small number of methods. Concrete strategies are service providers. -- **Structurer** is responsible for managing relationships and constraints among related things. It is usually a collection or mapping of some sort. -- **Controller** is responsible for making decisions, directing the work of others, and handling important events. It directs the flow of the application or business process. -- **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects. It delegates requests to other objects. Very abstract classes and interfaces might be coordinators as they delegate the work to subclasses. -- **User Interfacer** is responsible for transmitting user requests for action or display/render information that can be updated. It handles interactions with users. -- **External Interfacer** is responsible for loading and storing information from/to external services, including database systems, web services, filesystems, hardware, etc. -- **Internal Interfacer** is responsible for interfacing between two subsystems. It may bundle together information of requests from a group of objects to be sent to another object. Abstract adapters, bridges, facades, and proxies are internal interfacers. - -In `roleStereotypeReason`, explain why this {struct_type} fits your stereotype of choice but not the other stereotypes. - -Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values. 
In the `description`, do not mention the name of the role stereotype or layer.''' - -component_analysis = '''Consider a project {project_name}, {project_desc}. Given a package `{pkg_name}` containing the following classes: - -{classes} - -and the following subpackages: - -{packages} - -Explain the above package on the following aspects: - -{{ description: "Describe the functionality of the package in up to five sentences.", - title: "A Noun Phrase that Describes the Package", - keywords: ["list", "of", "keywords", "relevant", "to", "the", "package"], // try to have nouns as well as verb keywords - layer:..., - layerReason:... }} - -For the `layer`, consider the functionalities of architectural layers below: - -{layers} - -In `layerReason`, explain why this package fits your layer of choice but not the other layers. - -Respond with a well-formatted JSON object. Do not use any quote marks ("'`) within the JSON values. In the `description`, do not mention the name of the layer.''' +analyze_script_tool = { + "type": "function", + "function": { + "name": "AnalyzeScript", + "description": "Analyzes a program script given its source code and context. Returns an explanation covering functionality, parameters, return value, design rationale, usage, implementation details, assertions, stereotype, and architectural layer classification.", + "parameters": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "One-sentence description of the script functionality." + }, + "parameters": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Parameter name." + }, + "type": { + "type": "string", + "description": "Parameter type." + }, + "description": { + "type": "string", + "description": "Brief description of the parameter." + } + }, + "required": [ + "name", + "description" + ] + }, + "description": "List of script parameters. Empty if none." 
+ }, + "returns": { + "type": "string", + "description": "One-sentence description of the returned object or value. For constructors, consider the newly created instance as the return." + }, + "howToUse": { + "type": "string", + "description": "Usage instructions in less than three sentences." + }, + "howItWorks": { + "type": "string", + "description": "Implementation details in less than five sentences." + }, + "preConditions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of pre-conditions for the method." + }, + "postConditions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of post-conditions for the method." + }, + "stereotype": { + "type": "string", + "enum": [ + "Accessor", + "Mutator", + "Creational", + "Collaborational", + "Other" + ], + "description": "Design stereotype of the method." + }, + "stereotypeReason": { + "type": "string", + "description": "One-sentence explanation for the chosen stereotype." + }, + "layer": { + "type": "string", + "description": "Architectural layer classification selected from the provided options." + }, + "layerReason": { + "type": "string", + "description": "Explanation why the method fits the chosen architectural layer but not others." + } + }, + "required": [ + "description", + "parameters", + "returns", + "howToUse", + "howItWorks", + "preConditions", + "postConditions", + "stereotype", + "stereotypeReason", + "layer", + "layerReason" + ] + } + } +} + +analyze_structure_tool = { + "type": "function", + "function": { + "name": "AnalyzeStructure", + "description": "Analyzes a software structure based on its inheritance, fields, and methods. 
Returns an explanation covering the key responsibilities of the structure, relevant keywords, role stereotype, and rationale for the chosen stereotype.", + "parameters": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Up to three sentences describing the key responsibilities of the structure." + }, + "keywords": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of keywords relevant to the structure." + }, + "roleStereotype": { + "type": "string", + "enum": [ + "Information Holder", + "Service Provider", + "Structurer", + "Controller", + "Coordinator", + "User Interfacer", + "External Interfacer", + "Internal Interfacer" + ], + "description": "The role stereotype of the structure: \ + **Information Holder** is responsible for knowing facts and providing information to other objects. POJOs, Java Beans, and enumerations are usually information holders. \ + **Service Provider** is responsible for handling requests and performing specific services. It usually implements a specific interface with a small number of methods. Concrete strategies are service providers. \ + **Structurer** is responsible for managing relationships and constraints among related things. It is usually a collection or mapping of some sort, i.e., a subclass of a List, Set, Map, etc. \ + **Controller** is responsible for making decisions, directing the work of others, and handling important events. It directs the flow of the application or business process. \ + **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects. It delegates requests to other objects. Very abstract classes and interfaces might be coordinators as they delegate the work to subclasses. \ + **User Interfacer** is responsible for transmitting user requests for action or display/render information that can be updated. It handles interactions with users. 
\ + **External Interfacer** is responsible for loading and storing information from/to external services, including database systems, web services, filesystems, hardware, etc. \ + **Internal Interfacer** is responsible for interfacing between two subsystems. It may bundle together information of requests from a group of objects to be sent to another object. Abstract adapters, bridges, facades, and proxies are internal interfacers." + }, + "roleStereotypeReason": { + "type": "string", + "description": "One-sentence explanation for the chosen role stereotype." + } + }, + "required": [ + "description", + "keywords", + "roleStereotype", + "roleStereotypeReason" + ] + } + } +} + +analyze_component_tool = { + "type": "function", + "function": { + "name": "AnalyzeComponent", + "description": "Analyzes a software component by examining its contents. Returns an explanation including a description of component functionality, a descriptive title, a list of keywords, the selected architectural layer, and the rationale for that layer.", + "parameters": { + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Describe the functionality of the package in up to five sentences (do not mention the layer name)." + }, + "title": { + "type": "string", + "description": "A noun phrase that describes the package." + }, + "keywords": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of keywords relevant to the package." + }, + "layer": { + "type": "string", + "description": "Architectural layer classification selected from the provided options." + }, + "layerReason": { + "type": "string", + "description": "Explanation why the package fits the chosen layer but not others." 
+ } + }, + "required": [ + "description", + "title", + "keywords", + "layer", + "layerReason" + ] + } + } +} interaction_analysis = '''## Input: diff --git a/arcanalib/graph.py b/arcanalib/graph.py index 8b9ed77..db5c9a6 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -225,16 +225,18 @@ def _set_graph_refs(self): for edge in elist: edge.set_graph(self) - def add_node(self, _id: str, labels=None, properties=None): + def add_node(self, _id: str, *labels, **properties): if _id in self.nodes: - pass # Overwrite or warn if needed + return n = Node(_id, *(labels or []), **(properties or {})) self.nodes[_id] = n n.set_graph(self) - def add_edge(self, source_id: str, target_id: str, edge_label: str, properties=None): + def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties): if source_id not in self.nodes or target_id not in self.nodes: - raise ValueError("Source or target not in graph") + return + if self.find_edges(label=edge_label, where_source=lambda n: n.id == source_id, where_target=lambda n: n.id == target_id): + return e = Edge(source_id, target_id, edge_label, **(properties or {})) if edge_label not in self.edges: @@ -326,6 +328,25 @@ def find_nodes(self, label=None, where=None) -> List[Node]: if (not label or label in node.labels) and (not where or where(node)) ] + def find_node(self, label=None, where=None) -> Node: + nodes = self.find_nodes(label, where) + if nodes: + return nodes[0] + return None + + def find_edge(self, + label=None, + source_label=None, + target_label=None, + where_edge=None, + where_source=None, + where_target=None + ): + edges = self.find_edges(label,source_label,target_label,where_edge,where_source,where_target) + if edges: + return edges[0] + return None + def find_edges(self, label=None, source_label=None, diff --git a/config.ini.example b/config.ini.example index 0dcf0cb..574dd1a 100644 --- a/config.ini.example +++ b/config.ini.example @@ -8,6 +8,8 @@ output=zxing-3.5.3-output.json 
apikey=example-apikey apibase=http://localhost:8000/v1 model=llama3 +; LLM chat completion http request timeout, in seconds. If not specified, the default is 5 minutes. +timeout=120.0 [seeder] command={javaexe} -jar {jarfile} -i {input} -a -n {name} -f json @@ -20,4 +22,4 @@ layer1desc=Handles user interface, such as instatiating, setting properties of, layer2name=Logic layer2desc=Handles application and domain logic, i.e., neither UI nor data access. layer3name=Data -layer3desc=Handles data access, e.g., managing database connections, querying databases, reading/writing files, and invoking web services. +layer3desc=Handles loading and storing data from/to external services, including database systems, web services, filesystems, hardware, etc. From 7948bad9c3ad6daf94e2008f5243c3c78c75c239 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 18 Feb 2025 10:05:02 +0100 Subject: [PATCH 11/34] Update template --- arcana/templates.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arcana/templates.py b/arcana/templates.py index 6dd53dc..bf4fbcd 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -52,14 +52,14 @@ "items": { "type": "string" }, - "description": "List of pre-conditions for the method." + "description": "List of pre-conditions for the script." }, "postConditions": { "type": "array", "items": { "type": "string" }, - "description": "List of post-conditions for the method." + "description": "List of post-conditions for the script." }, "stereotype": { "type": "string", @@ -70,7 +70,7 @@ "Collaborational", "Other" ], - "description": "Design stereotype of the method." + "description": "Design stereotype of the script." }, "stereotypeReason": { "type": "string", @@ -82,7 +82,7 @@ }, "layerReason": { "type": "string", - "description": "Explanation why the method fits the chosen architectural layer but not others." + "description": "Explanation why the script fits the chosen architectural layer but not others." 
} }, "required": [ @@ -119,7 +119,7 @@ "items": { "type": "string" }, - "description": "List of keywords relevant to the structure." + "description": "List of important keywords related to the key responsibilities of the structure." }, "roleStereotype": { "type": "string", @@ -168,18 +168,18 @@ "properties": { "description": { "type": "string", - "description": "Describe the functionality of the package in up to five sentences (do not mention the layer name)." + "description": "Describe the functionality of the component in up to five sentences." }, "title": { "type": "string", - "description": "A noun phrase that describes the package." + "description": "A noun phrase that describes the component." }, "keywords": { "type": "array", "items": { "type": "string" }, - "description": "List of keywords relevant to the package." + "description": "List of important keywords related to the core functionalities of the component." }, "layer": { "type": "string", @@ -187,7 +187,7 @@ }, "layerReason": { "type": "string", - "description": "Explanation why the package fits the chosen layer but not others." + "description": "Explanation why the component fits the chosen layer but not others." } }, "required": [ From d271a2744fa490fa9c3a2fd67f2b2be5f09f3469 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 19 Feb 2025 13:51:26 +0100 Subject: [PATCH 12/34] Edge access bug fix --- arcana/filters.py | 16 ++++++++-------- arcanalib/graph.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index 75fd0bb..b81c1e8 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -141,10 +141,10 @@ def process(self, data: Graph) -> Graph: Returns: Graph: The processed data with dependency profiles. 
""" - parents = {e.source: e.target for e in invert(data.edges['contains'])} + parents = {e.source: e.target for e in invert(data.find_edges(label='contains'))} dependency_profiles = {} - calls = data.edges.get('calls', lift(data.edges['hasScript'], data.edges['invokes'], 'calls')) + calls = data.edges.get('calls', lift(data.find_edges(label='hasScript'), data.find_edges(label='invokes'), 'calls')) for edge in calls: source_id, target_id = edge.source, edge.target @@ -305,14 +305,14 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, st_contains_st = data.find_edges(label='contains',source_label='Structure',target_label='Structure') ct_contains_st = data.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) - new_ct_sources = {edge.target:data.find_source(data.edges['contains'],data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} + new_ct_sources = {edge.target:data.find_source(data.find_edges(label='contains'),data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} ct_contains_st.extend([Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) - triplets = build_triplets(ct_contains_st, data.edges['hasScript']) + triplets = build_triplets(ct_contains_st, data.find_edges(label='hasScript')) met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in triplets} # print(met_to_cls_pkg) # print('######################################################################') - sorted_method_ids, method_deps = data.toposorted_nodes(data.edges['invokes']) + sorted_method_ids, method_deps = data.toposorted_nodes(data.find_edges(label='invokes')) # print(sorted_method_ids) counter = 0 @@ -645,7 +645,7 @@ def update_method_properties(self, data: Graph, description: dict, 
method: Node) if key_lower == 'parameters' and isinstance(value, Iterable): param_nodes = [ data.nodes[edge.target] - for edge in data.edges['hasParameter'] + for edge in data.find_edges(label='hasParameter') if edge.source == method.id ] for param in value: @@ -680,12 +680,12 @@ def get_structure_relations(self, data: Graph, cls_id: str) -> tuple: """Retrieve class ancestors and fields.""" ancestors = list({ data.nodes[edge.target].property("qualifiedName") - for edge in data.edges['specializes'] + for edge in data.find_edges(label='specializes') if edge.source == cls_id }) fields = { data.nodes[edge.target] - for edge in data.edges['hasVariable'] + for edge in data.find_edges(label='hasVariable') if edge.source == cls_id } fields = [ diff --git a/arcanalib/graph.py b/arcanalib/graph.py index db5c9a6..0a52847 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -250,7 +250,7 @@ def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties def invert_edges(self, edge_label: str, new_label: Optional[str] = None) -> None: if edge_label in self.edges: - inverted = invert(self.edges[edge_label], new_label) + inverted = invert(self.edges.get(edge_label,[]), new_label) nlabel = new_label or f"inv_{edge_label}" self.edges[nlabel] = inverted self._set_graph_refs() @@ -258,13 +258,13 @@ def invert_edges(self, edge_label: str, new_label: Optional[str] = None) -> None def compose_edges(self, edge_label1: str, edge_label2: str, new_label: Optional[str] = None) -> None: if (edge_label1 in self.edges) and (edge_label2 in self.edges): nlabel = new_label or f"{edge_label1}_{edge_label2}" - composed_list = compose(self.edges[edge_label1], self.edges[edge_label2], nlabel) + composed_list = compose(self.edges.get(edge_label1, []), self.edges.get(edge_label2,[]), nlabel) self.edges[nlabel] = composed_list self._set_graph_refs() def lift_edges(self, edge_label1: str, edge_label2: str, new_label: Optional[str] = None) -> None: if (edge_label1 in self.edges) and 
(edge_label2 in self.edges): - lifted_list = lift(self.edges[edge_label1], self.edges[edge_label2], new_label) + lifted_list = lift(self.edges.get(edge_label1,[]), self.edges.get(edge_label2,[]), new_label) nlabel = new_label or f"lifted_{edge_label1}_{edge_label2}" self.edges[nlabel] = lifted_list self._set_graph_refs() @@ -285,9 +285,9 @@ def get_all_edge_labels(self) -> Set[str]: def get_edges_with_node_labels(self, edge_label: str, node_label: str) -> List[Edge]: if edge_label in self.edges: return [ - edge for edge in self.edges[edge_label] - if node_label in self.nodes[edge.source].labels - and node_label in self.nodes[edge.target].labels + edge for edge in self.edges.get(edge_label,[]) + if node_label in self.self.nodes.get(edge.source, Node(None)).labels + and node_label in self.nodes.get(edge.target, Node(None)).labels ] return [] @@ -301,7 +301,7 @@ def get_source_and_target_labels(self, edge_label: str) -> Set[Tuple[str, str]]: return set() return { (sl, tl) - for e in self.edges[edge_label] + for e in self.edges.get(edge_label,[]) for (sl, tl) in self.get_edge_node_labels(e) } @@ -460,7 +460,7 @@ def toposorted_nodes(self, edges: List[Edge]): def clean_up(self): for edge_type in list(self.edges.keys()): self.edges[edge_type] = [ - e for e in self.edges[edge_type] + e for e in self.edges.get(edge_type,[]) if e.source in self.nodes and e.target in self.nodes ] @@ -469,7 +469,7 @@ def get_edges(label: str) -> List[Edge]: if label.startswith('-'): base_label = label[1:] if base_label in self.edges: - return invert(self.edges[base_label]) + return invert(self.edges.get(base_label,[])) return [] return self.edges.get(label, []) From 7ec48e7fe9779ea91f3dd6c2c6f7a72a4a55044b Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Thu, 20 Feb 2025 13:14:18 +0100 Subject: [PATCH 13/34] Write additional nodes and edges to jsonl output --- arcana/filters.py | 28 ++++++++++++++---------- arcanalib/graph.py | 13 ++++++----- update_json.py | 54 
++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 16 deletions(-) create mode 100644 update_json.py diff --git a/arcana/filters.py b/arcana/filters.py index b81c1e8..b07cb41 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -240,8 +240,6 @@ def __init__(self, config: Dict[str, Dict[str, Any]]): ] self.layers = layers self.layers_text = format_layers(layers) - - def process(self, data: Graph) -> Graph: """ @@ -255,16 +253,22 @@ def process(self, data: Graph) -> Graph: """ self.project_name, self.project_desc, self.openai_client_args, model, client = self.setup() timestr = time.strftime("%Y%m%d-%H%M%S") - - for i,(name,desc) in enumerate(self.layers): - data.add_node(f"layer:{name}", "Grouping", kind="architectural layer", simpleName=name, qualifiedName=name, description=desc, layerOrder=i) - - for i in range(len(self.layers)-1): - src = self.layers[i][0] - tgt = self.layers[i+1][0] - data.add_edge(f"layer:{src}", f"layer:{tgt}", "allowedDependency", weight=1) with open(f'arcana-{timestr}.jsonl', 'a', encoding="utf-8") as jsonl_file: + + for i,(name,desc) in enumerate(self.layers): + n = data.add_node(f"layer:{name}", "Grouping", kind="architectural layer", simpleName=name, qualifiedName=name, description=desc, layerOrder=i) + jsonl_file.write(json.dumps(n.to_dict(), cls=CustomJSONEncoder)) + jsonl_file.write('\n') + + + for i in range(len(self.layers)-1): + src = self.layers[i][0] + tgt = self.layers[i+1][0] + e = data.add_edge(f"layer:{src}", f"layer:{tgt}", "allowedDependency", weight=1) + jsonl_file.write(json.dumps(e.to_dict(), cls=CustomJSONEncoder)) + jsonl_file.write('\n') + with open(f'arcana-{timestr}.log', 'a', encoding="utf-8") as log_file: try: self.process_hierarchy(data, client, model, jsonl_file, log_file) @@ -430,7 +434,9 @@ def process_script(self, graph: Graph, client: OpenAI, model: str, jsonl_file: T layer_id = f"layer:{script.property('layer')}" layer_node = graph.find_node(label="Grouping", where=lambda 
node: node.id == layer_id) if layer_node: - graph.add_edge(script.id, layer_node.id, "implements", weight=1) + e = graph.add_edge(script.id, layer_node.id, "implements", weight=1) + jsonl_file.write(str(e), cls=CustomJSONEncoder) + jsonl_file.write('\n') jsonl_file.write(json.dumps({ 'data': { diff --git a/arcanalib/graph.py b/arcanalib/graph.py index 0a52847..896307f 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -225,18 +225,19 @@ def _set_graph_refs(self): for edge in elist: edge.set_graph(self) - def add_node(self, _id: str, *labels, **properties): + def add_node(self, _id: str, *labels, **properties) -> Node: if _id in self.nodes: - return + return None n = Node(_id, *(labels or []), **(properties or {})) self.nodes[_id] = n n.set_graph(self) + return n - def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties): + def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties) -> Edge: if source_id not in self.nodes or target_id not in self.nodes: - return + return None if self.find_edges(label=edge_label, where_source=lambda n: n.id == source_id, where_target=lambda n: n.id == target_id): - return + return None e = Edge(source_id, target_id, edge_label, **(properties or {})) if edge_label not in self.edges: @@ -248,6 +249,8 @@ def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties self.nodes[source_id]._invalidate_cache() self.nodes[target_id]._invalidate_cache() + return e + def invert_edges(self, edge_label: str, new_label: Optional[str] = None) -> None: if edge_label in self.edges: inverted = invert(self.edges.get(edge_label,[]), new_label) diff --git a/update_json.py b/update_json.py new file mode 100644 index 0000000..3b6821a --- /dev/null +++ b/update_json.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +import sys +import json + +if len(sys.argv) != 3: + sys.exit(f"Usage: {sys.argv[0]} ") + +json_file = sys.argv[1] +jsonl_file = sys.argv[2] + +with open(json_file, "r") as f: 
+ data = json.load(f) + +elements = data.get("elements", {}) +nodes = elements.get("nodes", []) +edges = elements.get("edges", []) + +node_map = {node["data"]["id"]: node for node in nodes if "data" in node and "id" in node["data"]} +edge_map = {} +for edge in edges: + edata = edge.get("data", {}) + if all(k in edata for k in ("source", "target", "label")): + key = (edata["source"], edata["target"], edata["label"]) + edge_map[key] = edge + +with open(jsonl_file, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + edata = entry.get("data", {}) + if all(k in edata for k in ("source", "target", "label")): + key = (edata["source"], edata["target"], edata["label"]) + new_props = edata.get("properties", {}) + if key in edge_map: + curr_props = edge_map[key]["data"].get("properties", {}) + curr_props.update(new_props) + edge_map[key]["data"]["properties"] = curr_props + else: + edges.append(entry) + edge_map[key] = entry + elif "id" in edata: + node_id = edata["id"] + new_props = edata.get("properties", {}) + if node_id in node_map: + curr_props = node_map[node_id]["data"].get("properties", {}) + curr_props.update(new_props) + node_map[node_id]["data"]["properties"] = curr_props + else: + nodes.append(entry) + node_map[node_id] = entry + +print(json.dumps(data, indent=4)) From 203de2e7bb55fc6fdcfa564aebd7184e46607e97 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Fri, 21 Feb 2025 16:28:41 +0100 Subject: [PATCH 14/34] Bug fix --- arcana/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arcana/filters.py b/arcana/filters.py index b07cb41..a907b8e 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -435,7 +435,7 @@ def process_script(self, graph: Graph, client: OpenAI, model: str, jsonl_file: T layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) if layer_node: e = graph.add_edge(script.id, layer_node.id, "implements", weight=1) - 
jsonl_file.write(str(e), cls=CustomJSONEncoder) + jsonl_file.write(json.dumps(e.to_dict(), cls=CustomJSONEncoder)) jsonl_file.write('\n') jsonl_file.write(json.dumps({ From e78afc8ee271db7255ce37eefca8b80870d3b84d Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Fri, 28 Feb 2025 14:01:02 +0100 Subject: [PATCH 15/34] Refine code, WIP --- arcana/filters.py | 321 +++++++++++++++++++--------------------- arcanalib/pipefilter.py | 42 +++--- 2 files changed, 172 insertions(+), 191 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index a907b8e..aa78648 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -4,36 +4,26 @@ import subprocess import sys import time - -from arcana import templates -from arcanalib.graph import Edge, Graph, Node, triplets, invert, lift -from arcanalib.pipefilter import Filter, Seeder -from collections import Counter, defaultdict, OrderedDict +from collections import Counter, OrderedDict, defaultdict from collections.abc import Iterable from io import TextIOWrapper from itertools import combinations +from typing import Any, Dict, List, Tuple + from openai import OpenAI from tqdm.auto import tqdm -from typing import Dict, Any, Tuple, List +from arcana import templates +from arcanalib.graph import Edge, Graph, Node, invert, lift, triplets +from arcanalib.pipefilter import Filter, Seeder -def remove_author(s): - return '\n'.join(t.strip() for t in s.split('\n') if not '@author' in t) +def remove_author(s: str) -> str: + return "\n".join(line.strip() for line in s.splitlines() if '@author' not in line) +_JAVA_COMMENT_RE = re.compile(r"(//.*?$)|(/\*.*?\*/)", flags=re.MULTILINE | re.DOTALL) def remove_java_comments(java_source: str) -> str: - """ - Remove single-line and multi-line comments from a given Java source code string. - - Args: - java_source (str): The Java source code as a string. - - Returns: - str: The Java source code without comments. 
- """ - pattern = r"(//.*?$)|(/\*.*?\*/)" - return re.sub(pattern, "", java_source, flags=re.MULTILINE | re.DOTALL).strip() - + return _JAVA_COMMENT_RE.sub("", java_source).strip() def sentence(s: str) -> str: """ @@ -55,7 +45,7 @@ def sentence(s: str) -> str: return f'{t[0].upper()}{t[1:]}.' -def lower1(s: str) -> str: +def lower_first(s: str) -> str: """ Lowercase the first character of a string. @@ -78,20 +68,30 @@ def prettify_json(obj: dict) -> str: Returns: str: The pretty-printed JSON string. """ - return json.dumps(obj, indent='\t') + return json.dumps(obj, indent=4) + +def layers_to_list(d: Dict[str, Any]) -> List[Tuple[str, str]]: + result = [] + i = 1 + while True: + name_key, desc_key = f"layer{i}name", f"layer{i}desc" + if name_key not in d or desc_key not in d: + break + result.append((d[name_key], d[desc_key])) + i += 1 + return result -def layers_to_list(d): - result = [] - i = 1 - while True: - name_key, desc_key = f"layer{i}name", f"layer{i}desc" - if name_key not in d or desc_key not in d: - break - result.append((d[name_key], d[desc_key])) - i += 1 - return result +def write_jsonl(file: TextIOWrapper, obj: Any) -> None: + file.write(json.dumps(obj, cls=CustomJSONEncoder) + '\n') +class StopProcessing(Exception): + """Raised when a stop signal is detected.""" + pass + +def check_stop() -> None: + if os.path.exists('stop'): + raise StopProcessing("Stop file detected, halting processing.") class CLISeeder(Seeder): @@ -109,26 +109,28 @@ def generate(self) -> Graph: :return: The generated Graph object. 
""" - # Execute the command process = subprocess.run( self.command, capture_output=True, text=True, shell=True, - encoding="utf-8" + encoding="utf-8", + check=True ) + if process.stderr: + sys.stderr.write(process.stderr) + output_dict = json.loads(process.stdout) + return Graph(output_dict) - sys.stderr.write(process.stderr) - - # Parse the JSON output into a dict - if process.returncode == 0: - output_dict = json.loads(process.stdout) - - # Pass the dict to the Graph constructor and return the Graph object - return Graph(output_dict) - else: - raise "Command execution failed." +def dependency_profile_category(inn: int, out: int) -> str: + if inn == 0 and out > 0: + return "outbound" + elif inn > 0 and out == 0: + return "inbound" + elif inn > 0 and out > 0: + return "transit" + return "hidden" class MetricsFilter(Filter): def process(self, data: Graph) -> Graph: @@ -142,29 +144,21 @@ def process(self, data: Graph) -> Graph: Graph: The processed data with dependency profiles. """ parents = {e.source: e.target for e in invert(data.find_edges(label='contains'))} - dependency_profiles = {} + dependency_profiles = defaultdict(list) - calls = data.edges.get('calls', lift(data.find_edges(label='hasScript'), data.find_edges(label='invokes'), 'calls')) + calls = data.edges.get('calls', lift( + data.find_edges(label='hasScript'), + data.find_edges(label='invokes'), + 'calls' + )) for edge in calls: source_id, target_id = edge.source, edge.target - dependency_profiles.setdefault(source_id, []) - dependency_profiles.setdefault(target_id, []) - - if parents[source_id] != parents[target_id]: + if parents.get(source_id) != parents.get(target_id): dependency_profiles[source_id].append('out') dependency_profiles[target_id].append('in') - dependency_profiles = {id: Counter(profile) for id, profile in dependency_profiles.items()} - - def dependency_profile_category(inn: int, out: int) -> str: - if inn == 0 and out > 0: - return "outbound" - elif inn > 0 and out == 0: - return 
"inbound" - elif inn > 0 and out > 0: - return "transit" - return "hidden" + dependency_profiles = {node_id: Counter(prof) for node_id, prof in dependency_profiles.items()} for id, profile in dependency_profiles.items(): data.nodes[id].properties['dependencyProfile'] = dependency_profile_category( @@ -221,25 +215,20 @@ def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], L return grouped_paths def format_layers(layers): - return "\n".join(f"- **{name}**: {desc}" for name, desc in layers) + return "\n".join(f"- **{name}**: {desc}" for name, desc in layers) class LLMFilter(Filter): def __init__(self, config: Dict[str, Dict[str, Any]]): super().__init__(config) - self.project_name = None - self.project_desc = None - self.openai_client_args = None - layers = layers_to_list(config["layers"]) if "layers" in config else None - if not layers: - layers = [ - ("Presentation Layer", "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), - ("Service Layer", "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), - ("Domain Layer", "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), - ("Data Source Layer", "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), - ] - self.layers = layers - self.layers_text = format_layers(layers) + default_layers = [ + ("Presentation Layer", "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), + ("Service Layer", "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), + ("Domain Layer", "Handles 
business logic, represents domain data and behavior, and performs necessary computations for domain operations."), + ("Data Source Layer", "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), + ] + self.layers = layers_to_list(config.get("layers", {})) or default_layers + self.layers_text = format_layers(self.layers) def process(self, data: Graph) -> Graph: """ @@ -258,16 +247,14 @@ def process(self, data: Graph) -> Graph: for i,(name,desc) in enumerate(self.layers): n = data.add_node(f"layer:{name}", "Grouping", kind="architectural layer", simpleName=name, qualifiedName=name, description=desc, layerOrder=i) - jsonl_file.write(json.dumps(n.to_dict(), cls=CustomJSONEncoder)) - jsonl_file.write('\n') + write_jsonl(jsonl_file, n.to_dict()) for i in range(len(self.layers)-1): src = self.layers[i][0] tgt = self.layers[i+1][0] e = data.add_edge(f"layer:{src}", f"layer:{tgt}", "allowedDependency", weight=1) - jsonl_file.write(json.dumps(e.to_dict(), cls=CustomJSONEncoder)) - jsonl_file.write('\n') + write_jsonl(jsonl_file, e.to_dict()) with open(f'arcana-{timestr}.log', 'a', encoding="utf-8") as log_file: try: @@ -296,7 +283,7 @@ def describe(self, node: dict, *keys) -> str: """Generate a description for a given node.""" sr, sn = '\r', '\n' if not keys: - keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'stereotype', 'roleStereotype', 'layer'] + keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'roleStereotype', 'layer'] lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))}" for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: @@ -304,32 +291,31 @@ def describe(self, node: dict, *keys) 
-> str: return ' '.join(lines[key] for key in keys if key in lines).strip() - def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, log_file): + def process_hierarchy(self, graph: Graph, client: OpenAI, model: str, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" - st_contains_st = data.find_edges(label='contains',source_label='Structure',target_label='Structure') - ct_contains_st = data.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) - new_ct_sources = {edge.target:data.find_source(data.find_edges(label='contains'),data.nodes[edge.target],lambda node:'Structure' not in node.labels,data.nodes[edge.source]).id for edge in st_contains_st} + st_contains_st = graph.find_edges(label='contains',source_label='Structure',target_label='Structure') + ct_contains_st = graph.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) + new_ct_sources = {edge.target:graph.find_source(graph.find_edges(label='contains'),graph.nodes[edge.target],lambda node:'Structure' not in node.labels,graph.nodes[edge.source]).id for edge in st_contains_st} ct_contains_st.extend([Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) - triplets = build_triplets(ct_contains_st, data.find_edges(label='hasScript')) + triplets = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in triplets} # print(met_to_cls_pkg) # print('######################################################################') - sorted_method_ids, method_deps = data.toposorted_nodes(data.find_edges(label='invokes')) + sorted_method_ids, method_deps = graph.toposorted_nodes(graph.find_edges(label='invokes')) # print(sorted_method_ids) counter = 0 for met_id in 
tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): cls_id, pkg_id = met_to_cls_pkg[met_id] - method = data.nodes[met_id] - clasz = data.nodes[cls_id] + method = graph.nodes[met_id] + clasz = graph.nodes[cls_id] - self.process_script(data, client, model, jsonl_file, log_file, method, clasz, method_deps) + self.process_script(graph, client, model, jsonl_file, log_file, method, clasz, method_deps) - if os.path.exists('stop'): - raise StopIteration + check_stop() counter += 1 if counter==10: @@ -338,44 +324,41 @@ def process_hierarchy(self, data: Graph, client: OpenAI, model: str, jsonl_file, counter %= 10 hierarchy = build_hierarchy(triplets) - sorted_pkg_ids, pkg_deps = data.toposorted_nodes(data.find_edges(label='contains',where_source=lambda node: 'Structure' not in node.labels,where_target=lambda node: 'Structure' not in node.labels)) + sorted_pkg_ids, pkg_deps = graph.toposorted_nodes(graph.find_edges(label='contains',where_source=lambda node: 'Structure' not in node.labels,where_target=lambda node: 'Structure' not in node.labels)) for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): pkg_data = hierarchy.get(pkg_id, dict()) - package = data.nodes[pkg_id] + package = graph.nodes[pkg_id] for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=2, leave=False): - clasz = data.nodes[cls_id] + clasz = graph.nodes[cls_id] - self.process_structure(data, client, model, jsonl_file, log_file, clasz, cls_data) + self.process_structure(graph, client, model, jsonl_file, log_file, clasz, cls_data) - if os.path.exists('stop'): - raise StopIteration + check_stop() - self.process_component(data, client, model, jsonl_file, log_file, package, pkg_data, pkg_deps) + self.process_component(graph, client, model, jsonl_file, log_file, package, pkg_data, pkg_deps) log_file.flush() jsonl_file.flush() - if os.path.exists('stop'): - raise StopIteration + check_stop() - paths = data.find_paths("contains", "hasScript", 
"invokes", "-hasScript", "-contains") + paths = graph.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") path_groups = group_paths_by_endpoints(paths) pkg_pairs = list(combinations(sorted_pkg_ids,2)) for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): - - if os.path.exists('stop'): - raise StopIteration + + check_stop() - pkg1 = data.nodes[pkg1_id] - pkg2 = data.nodes[pkg2_id] + pkg1 = graph.nodes[pkg1_id] + pkg2 = graph.nodes[pkg2_id] if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): if path_groups[(pkg1_id,pkg2_id)]: - self.process_interactions(data, client, model, pkg1, pkg2, path_groups[(pkg1_id,pkg2_id)], hierarchy, jsonl_file, log_file) + self.process_interactions(graph, client, model, pkg1, pkg2, path_groups[(pkg1_id,pkg2_id)], hierarchy, jsonl_file, log_file) if path_groups[(pkg2_id,pkg1_id)]: - self.process_interactions(data, client, model, pkg2, pkg1, path_groups[(pkg2_id,pkg1_id)], hierarchy, jsonl_file, log_file) + self.process_interactions(graph, client, model, pkg2, pkg1, path_groups[(pkg2_id,pkg1_id)], hierarchy, jsonl_file, log_file) def compose_prompt(self, p, function_parameters): prompt = p @@ -395,7 +378,7 @@ def compose_prompt(self, p, function_parameters): elif v: prompt += f"## {k}\n\n{str(v)}\n\n" return prompt.strip() - + def process_script(self, graph: Graph, client: OpenAI, model: str, jsonl_file: TextIOWrapper, log_file: TextIOWrapper, script: Node, structure: Node, node_deps: dict): """Process a single method and generate its description.""" @@ -435,17 +418,15 @@ def process_script(self, graph: Graph, client: OpenAI, model: str, jsonl_file: T layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) if layer_node: e = graph.add_edge(script.id, layer_node.id, "implements", weight=1) - jsonl_file.write(json.dumps(e.to_dict(), cls=CustomJSONEncoder)) - jsonl_file.write('\n') + write_jsonl(jsonl_file, 
e.to_dict()) - jsonl_file.write(json.dumps({ + write_jsonl(jsonl_file, { 'data': { 'id': script.id, 'labels': script.labels, 'properties': description } - }, cls=CustomJSONEncoder)) - jsonl_file.write('\n') + }) def process_structure(self, graph: Graph, client: OpenAI, model: str, jsonl_file, log_file, structure: Node, structure_scripts: list): """Process a single class and generate its description.""" @@ -475,14 +456,13 @@ def process_structure(self, graph: Graph, client: OpenAI, model: str, jsonl_file description = self.generate_json_description(client, model, prompt, "AnalyzeStructure") self.update_class_properties(graph, description, structure) - jsonl_file.write(json.dumps({ + write_jsonl(jsonl_file, { 'data': { 'id': structure.id, 'labels': list(structure.labels), 'properties': description } - })) - jsonl_file.write('\n') + }) def process_component(self, graph: Graph, client: OpenAI, model: str, jsonl_file, log_file, component: Node, component_contents: dict, component_deps: dict): @@ -519,12 +499,13 @@ def process_component(self, graph: Graph, client: OpenAI, model: str, jsonl_file if layer_node: graph.add_edge(component.id, layer_node.id, "implements", weight=1) - jsonl_file.write(json.dumps({ + write_jsonl(jsonl_file, { 'data': { 'id': component.id, 'labels': list(component.labels), - 'properties': description}})) - jsonl_file.write('\n') + 'properties': description + } + }) def process_interactions(self, graph: Graph, client: OpenAI, model: str, c1: Node, c2: Node, path_groups: List[Edge], hierarchy, jsonl_file: TextIOWrapper, log_file: TextIOWrapper): c1_name = c1.properties["qualifiedName"] @@ -569,8 +550,8 @@ def describe_path(path): if "dependsOn" not in graph.edges: graph.edges["dependsOn"] = [] graph.edges["dependsOn"].append(pkg1_edge) - jsonl_file.write(json.dumps(pkg1_edge.to_dict(), cls=CustomJSONEncoder)) - jsonl_file.write('\n') + + write_jsonl(jsonl_file, pkg1_edge.to_dict()) def generate_json_description(self, client: OpenAI, model: str, 
prompt: str = None, tool: str = None) -> dict: """Generate a description using the OpenAI client.""" @@ -590,9 +571,9 @@ def generate_json_description(self, client: OpenAI, model: str, prompt: str = No timeout=float(self.config['llm'].get('timeout', 300)) ) print(response) - + tool_calls = response.choices[0].message.tool_calls - + if tool_calls: args_str = tool_calls[0].function.arguments description = json.loads(args_str) @@ -614,10 +595,11 @@ def generate_json_description(self, client: OpenAI, model: str, prompt: str = No seed=42, timeout=float(self.config['llm'].get('timeout', 300)) ) - + content = response.choices[0].message.content description = json.loads(content) - except: + except Exception as e: + sys.stderr.write("Generate JSON description error: %s", e) description = {} if 'description' not in description: @@ -636,7 +618,8 @@ def generate_text_description(self, client: OpenAI, model: str, prompt: str) -> timeout=float(self.config['llm'].get('timeout', 300)) ) description = response.choices[0].message.content - except: + except Exception as e: + sys.stderr.write("Generate text description error: %s", e) description = "(no description)" return description @@ -647,7 +630,7 @@ def update_method_properties(self, data: Graph, description: dict, method: Node) for key, value in description.items(): if key.endswith('Reason'): continue - key_lower = lower1(key) + key_lower = lower_first(key) if key_lower == 'parameters' and isinstance(value, Iterable): param_nodes = [ data.nodes[edge.target] @@ -668,19 +651,19 @@ def update_method_properties(self, data: Graph, description: dict, method: Node) # elif key_lower == 'returns': # method.properties['returns'] = value.get('description', None) if value and hasattr(value, 'get') else None else: - method.properties[key_lower] = value + data.nodes[method.id].properties[key_lower] = value def update_class_properties(self, data: Graph, description: dict, clasz: Node): """Update class properties with the generated 
description.""" for key in description: if not key.endswith('Reason'): - clasz.properties[lower1(key)] = description[key] + data.nodes[clasz.id].properties[lower_first(key)] = description[key] def update_package_properties(self, data: Graph, description: dict, package: Node): """Update package properties with the generated description.""" for key in description: if not key.endswith('Reason'): - package.properties[lower1(key)] = description[key] + data.nodes[package.id].properties[lower_first(key)] = description[key] def get_structure_relations(self, data: Graph, cls_id: str) -> tuple: """Retrieve class ancestors and fields.""" @@ -722,37 +705,45 @@ def get_component_descriptions(self, data: Graph, package_ids: list) -> list: } def find_first_valid_json(text: str) -> str: - """ - Finds the first valid JSON substring in the given text using a stack-based approach. - - It scans the text from left to right, and when it encounters a '{', it tracks the balanced - braces until a complete JSON object is formed. Once a candidate is found, it attempts to parse - it with json.loads(). If parsing succeeds, that candidate is returned immediately. - - Args: - text (str): The input string that may contain a JSON object. - - Returns: - str: The first valid JSON substring found, or an empty string if none is found. - """ - n = len(text) - for i in range(n): - if text[i] == '{': - stack = 0 - for j in range(i, n): - if text[j] == '{': - stack += 1 - elif text[j] == '}': - stack -= 1 - if stack == 0: - candidate = text[i:j+1] - try: - json.loads(candidate) - return candidate - except json.JSONDecodeError: - # If this candidate isn't valid JSON, break and continue scanning. - break - return "" + """ + Finds the first valid JSON substring in the given text using a stack-based approach. + + It scans the text from left to right, and when it encounters a '{', it tracks the balanced + braces until a complete JSON object is formed. 
Once a candidate is found, it attempts to parse + it with json.loads(). If parsing succeeds, that candidate is returned immediately. + + Args: + text (str): The input string that may contain a JSON object. + + Returns: + str: The first valid JSON substring found, or an empty string if none is found. + """ + n = len(text) + for i in range(n): + if text[i] == '{': + stack = 0 + for j in range(i, n): + if text[j] == '{': + stack += 1 + elif text[j] == '}': + stack -= 1 + if stack == 0: + candidate = text[i:j+1] + try: + json.loads(candidate) + return candidate + except json.JSONDecodeError: + # If this candidate isn't valid JSON, break and continue scanning. + break + return "" + +def simplify_name(name): + if '(' in name and name.endswith(')'): + prefix, params = name.split('(', 2) + params = [param.split('.')[-1].split('$')[-1] for param in params.split(')', 1)[0].split(',')] + return prefix + '(' + ','.join(params) + ')' + else: + return name def merge_node_properties(dict1: Dict[str, Node], dict2: Dict[str, Node], simplify_names=False): for id2, obj2 in dict2.items(): @@ -763,14 +754,6 @@ def merge_node_properties(dict1: Dict[str, Node], dict2: Dict[str, Node], simpli elif simplify_names: - def simplify_name(name): - if '(' in name and name.endswith(')'): - prefix, params = name.split('(', 2) - params = [param.split('.')[-1].split('$')[-1] for param in params.split(')', 1)[0].split(',')] - return prefix + '(' + ','.join(params) + ')' - else: - return name - dict1_name_remap = { simplify_name(key): key for key in dict1 diff --git a/arcanalib/pipefilter.py b/arcanalib/pipefilter.py index 46a0869..87509b3 100644 --- a/arcanalib/pipefilter.py +++ b/arcanalib/pipefilter.py @@ -1,9 +1,10 @@ -from typing import Any, Dict, List, Union +from abc import ABC, abstractmethod +from typing import Any, Dict, List from arcanalib.graph import Graph -class Filter: +class Filter(ABC): def __init__(self, config: Dict[str, Dict[str, Any]]) -> None: """ Initialize the filter with a 
configuration. @@ -12,6 +13,7 @@ def __init__(self, config: Dict[str, Dict[str, Any]]) -> None: """ self.config = config + @abstractmethod def process(self, data: Graph) -> Any: """ Process the data. This method should be implemented by subclasses. @@ -19,36 +21,35 @@ def process(self, data: Graph) -> Any: :param data: The input data to be processed. :return: The processed data. """ - raise NotImplementedError("Subclasses must implement this method") + raise NotImplementedError class EndFilter(Filter): - """ - A special type of filter that marks the end of the pipeline processing. - """ + """A special filter that marks the end of pipeline processing.""" pass -class Seeder: +class Seeder(ABC): """ A class that generates graph data. """ + @abstractmethod def generate(self) -> Graph: """ Generate graph data. This method should be implemented by subclasses. """ - raise NotImplementedError("Subclasses must implement this method") + raise NotImplementedError class Pipeline: - def __init__(self, *args: Filter) -> None: - self.filters: List[Filter] = list(args) + def __init__(self, *filters: Filter) -> None: + self.filters: List[Filter] = list(filters) - def add_filter(self, filter: Filter) -> None: - self.filters.append(filter) + def add_filter(self, filt: Filter) -> None: + self.filters.append(filt) - def process(self, data: Union[Graph, Seeder]) -> Any: + def process(self, data: Graph | Seeder) -> Any: """ Process the data through the sequence of filters in the pipeline. If a seeder is provided instead of graph data, use the seeder to generate the graph data. @@ -56,13 +57,10 @@ def process(self, data: Union[Graph, Seeder]) -> Any: :param data: The input data to be processed or a seeder to generate the data. :return: The processed data. 
""" - # If a seeder is provided, use it to generate the graph data - if isinstance(data, Seeder): - data = data.generate() - - # sys.stderr.write(f"Graph stats: {len(data.nodes)} nodes, {len(data.edges)} edge types.") - for filter in self.filters: - data = filter.process(data) - if isinstance(filter, EndFilter): + d = data.generate() if isinstance(data, Seeder) else data + + for filt in self.filters: + d = filt.process(d) + if isinstance(filt, EndFilter): break - return data + return d From 37a208c2ab306ab447fe18ba82278051861d850f Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Fri, 28 Feb 2025 15:53:51 +0100 Subject: [PATCH 16/34] Refactor --- arcana/filters.py | 500 +++++++++++++++++++++------------------------- 1 file changed, 230 insertions(+), 270 deletions(-) diff --git a/arcana/filters.py b/arcana/filters.py index aa78648..a006ed5 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -6,9 +6,8 @@ import time from collections import Counter, OrderedDict, defaultdict from collections.abc import Iterable -from io import TextIOWrapper from itertools import combinations -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, TextIO from openai import OpenAI from tqdm.auto import tqdm @@ -17,14 +16,18 @@ from arcanalib.graph import Edge, Graph, Node, invert, lift, triplets from arcanalib.pipefilter import Filter, Seeder + def remove_author(s: str) -> str: return "\n".join(line.strip() for line in s.splitlines() if '@author' not in line) + _JAVA_COMMENT_RE = re.compile(r"(//.*?$)|(/\*.*?\*/)", flags=re.MULTILINE | re.DOTALL) + def remove_java_comments(java_source: str) -> str: return _JAVA_COMMENT_RE.sub("", java_source).strip() + def sentence(s: str) -> str: """ Capitalize the first letter of a string and ensure it ends with a period. @@ -68,7 +71,7 @@ def prettify_json(obj: dict) -> str: Returns: str: The pretty-printed JSON string. 
""" - return json.dumps(obj, indent=4) + return json.dumps(obj, indent=2) def layers_to_list(d: Dict[str, Any]) -> List[Tuple[str, str]]: @@ -82,16 +85,20 @@ def layers_to_list(d: Dict[str, Any]) -> List[Tuple[str, str]]: i += 1 return result -def write_jsonl(file: TextIOWrapper, obj: Any) -> None: + +def write_jsonl(file: TextIO, obj: Any) -> None: file.write(json.dumps(obj, cls=CustomJSONEncoder) + '\n') + class StopProcessing(Exception): - """Raised when a stop signal is detected.""" - pass + """Raised when a stop signal is detected.""" + pass + def check_stop() -> None: - if os.path.exists('stop'): - raise StopProcessing("Stop file detected, halting processing.") + if os.path.exists('stop'): + raise StopProcessing("Stop file detected, halting processing.") + class CLISeeder(Seeder): @@ -109,14 +116,7 @@ def generate(self) -> Graph: :return: The generated Graph object. """ - process = subprocess.run( - self.command, - capture_output=True, - text=True, - shell=True, - encoding="utf-8", - check=True - ) + process = subprocess.run(self.command, capture_output=True, text=True, shell=True, encoding="utf-8", check=True) if process.stderr: sys.stderr.write(process.stderr) output_dict = json.loads(process.stdout) @@ -132,6 +132,7 @@ def dependency_profile_category(inn: int, out: int) -> str: return "transit" return "hidden" + class MetricsFilter(Filter): def process(self, data: Graph) -> Graph: """ @@ -146,11 +147,8 @@ def process(self, data: Graph) -> Graph: parents = {e.source: e.target for e in invert(data.find_edges(label='contains'))} dependency_profiles = defaultdict(list) - calls = data.edges.get('calls', lift( - data.find_edges(label='hasScript'), - data.find_edges(label='invokes'), - 'calls' - )) + calls = data.edges.get('calls', + lift(data.find_edges(label='hasScript'), data.find_edges(label='invokes'), 'calls')) for edge in calls: source_id, target_id = edge.source, edge.target @@ -160,33 +158,29 @@ def process(self, data: Graph) -> Graph: 
dependency_profiles = {node_id: Counter(prof) for node_id, prof in dependency_profiles.items()} - for id, profile in dependency_profiles.items(): - data.nodes[id].properties['dependencyProfile'] = dependency_profile_category( - profile['in'], - profile['out'] - ) + for node_id, profile in dependency_profiles.items(): + data.nodes[node_id].properties['dependencyProfile'] = dependency_profile_category(profile['in'], profile['out']) return data -def build_triplets(edge_list1, edge_list2) -> dict: - + +def build_triplets(edge_list1, edge_list2) -> list: methods = sorted(triplets(edge_list1, edge_list2)) - + return methods - + + def build_hierarchy(method_triplets) -> dict: classes = sorted({(pkg, clz) for pkg, clz, _ in method_triplets}) packages = sorted({pkg for pkg, _ in classes}) hierarchy = { - pkg_id: { - cls_id: [met_id for _, c, met_id in method_triplets if c == cls_id] - for p, cls_id in classes if p == pkg_id - } for pkg_id in packages - } + pkg_id: {cls_id: [met_id for _, c, met_id in method_triplets if c == cls_id] for p, cls_id in classes if + p == pkg_id} for pkg_id in packages} return hierarchy + class CustomJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, set): @@ -195,6 +189,7 @@ def default(self, obj): # Call the default method for other types return super().default(obj) + def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], List[List[Edge]]]: """ Groups paths by the tuple of (first edge's source, last edge's target). @@ -203,7 +198,7 @@ def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], L paths (List[List[Edge]]): The list of paths to group. Returns: - Dict[Tuple[str, str], List[List[Edge]]]: A dictionary where keys are + Dict[Tuple[str, str], List[List[Edge]]]: A dictionary where keys are tuples (first edge's source, last edge's target), and values are lists of paths. 
""" grouped_paths = defaultdict(list) @@ -214,22 +209,43 @@ def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], L grouped_paths[(start, end)].append(path) return grouped_paths + def format_layers(layers): return "\n".join(f"- **{name}**: {desc}" for name, desc in layers) + class LLMFilter(Filter): def __init__(self, config: Dict[str, Dict[str, Any]]): super().__init__(config) - - default_layers = [ - ("Presentation Layer", "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), - ("Service Layer", "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), - ("Domain Layer", "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), - ("Data Source Layer", "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), - ] + + self.project_name = None + self.project_desc = None + self.model = None + self.client = None + self.timeout = None + + default_layers = [("Presentation Layer", + "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), + ("Service Layer", + "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), + ("Domain Layer", + "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), + ("Data Source Layer", + "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), ] self.layers = layers_to_list(config.get("layers", {})) or default_layers 
self.layers_text = format_layers(self.layers) + self.setup() + + def setup(self) -> None: + self.project_name = self.config['project']['name'] + self.project_desc = sentence(self.config['project']['desc']) + openai_client_args = {'api_key': self.config['llm'].get('apikey'), + 'base_url': self.config['llm'].get('apibase')} + self.model = self.config['llm'].get('model', "gpt-4o-mini") + self.client = OpenAI(**openai_client_args) + self.timeout = float(self.config['llm'].get('timeout', 300)) + def process(self, data: Graph) -> Graph: """ Process the data using a language model to generate descriptions. @@ -240,92 +256,81 @@ def process(self, data: Graph) -> Graph: Returns: Graph: The processed data with generated descriptions. """ - self.project_name, self.project_desc, self.openai_client_args, model, client = self.setup() - timestr = time.strftime("%Y%m%d-%H%M%S") - - with open(f'arcana-{timestr}.jsonl', 'a', encoding="utf-8") as jsonl_file: - - for i,(name,desc) in enumerate(self.layers): - n = data.add_node(f"layer:{name}", "Grouping", kind="architectural layer", simpleName=name, qualifiedName=name, description=desc, layerOrder=i) + current_time_str = time.strftime("%Y%m%d-%H%M%S") + with open(f'arcana-{current_time_str}.jsonl', 'a', encoding="utf-8") as jsonl_file: + for i, (layer_name, layer_desc) in enumerate(self.layers): + n = data.add_node(f"layer:{layer_name}", "Grouping", kind="architectural layer", simpleName=layer_name, + qualifiedName=layer_name, description=layer_desc, layerOrder=i) write_jsonl(jsonl_file, n.to_dict()) - - - for i in range(len(self.layers)-1): + for i in range(len(self.layers) - 1): src = self.layers[i][0] - tgt = self.layers[i+1][0] + tgt = self.layers[i + 1][0] e = data.add_edge(f"layer:{src}", f"layer:{tgt}", "allowedDependency", weight=1) write_jsonl(jsonl_file, e.to_dict()) - - with open(f'arcana-{timestr}.log', 'a', encoding="utf-8") as log_file: + with open(f'arcana-{current_time_str}.log', 'a', encoding="utf-8") as log_file: 
try: - self.process_hierarchy(data, client, model, jsonl_file, log_file) - except StopIteration: + self.process_hierarchy(data, jsonl_file, log_file) + except Exception as e: pass - return data - def setup(self): - """Setup necessary configuration and client.""" - project_name = self.config['project']['name'] - project_desc = sentence(self.config['project']['desc']) - - openai_client_args = { - 'api_key': self.config['llm'].get('apikey'), - 'base_url': self.config['llm'].get('apibase') - } - model = self.config['llm'].get('model', "gpt-4o-mini") - - client = OpenAI(**openai_client_args) - - return project_name, project_desc, openai_client_args, model, client - - def describe(self, node: dict, *keys) -> str: + @staticmethod + def describe(node: Node, *keys) -> str: """Generate a description for a given node.""" sr, sn = '\r', '\n' if not keys: - keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', 'roleStereotype', 'layer'] + keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', + 'roleStereotype', 'layer'] - lines = {key:f"**{key}**: {sentence(str(node.properties[key]).replace(sr,'').replace(sn,' '))}" for key in keys if key in node.properties and key != 'docComment' and node.properties[key]} + lines = {key: f"**{key}**: {sentence(str(node.properties[key]).replace(sr, '').replace(sn, ' '))}" for key in + keys if key in node.properties and key != 'docComment' and node.properties[key]} if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: - lines['docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr,'').replace(sn,' '))} " + lines[ + 'docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr, '').replace(sn, ' '))} " return ' '.join(lines[key] for key in keys if key in lines).strip() - def process_hierarchy(self, graph: Graph, client: OpenAI, 
model: str, jsonl_file, log_file): + def process_hierarchy(self, graph: Graph, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" - - st_contains_st = graph.find_edges(label='contains',source_label='Structure',target_label='Structure') - ct_contains_st = graph.find_edges(label='contains',target_label='Structure', where_source=lambda node: 'Container' in node.labels and 'Structure' not in node.labels) - new_ct_sources = {edge.target:graph.find_source(graph.find_edges(label='contains'),graph.nodes[edge.target],lambda node:'Structure' not in node.labels,graph.nodes[edge.source]).id for edge in st_contains_st} - ct_contains_st.extend([Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) - - triplets = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) - met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in triplets} + + st_contains_st = graph.find_edges(label='contains', source_label='Structure', target_label='Structure') + ct_contains_st = graph.find_edges(label='contains', target_label='Structure', where_source=lambda + node: 'Container' in node.labels and 'Structure' not in node.labels) + new_ct_sources = {edge.target: graph.find_source(graph.find_edges(label='contains'), graph.nodes[edge.target], + lambda node: 'Structure' not in node.labels, + graph.nodes[edge.source]).id for edge in st_contains_st} + ct_contains_st.extend( + [Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) + + trips = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) + met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in trips} # print(met_to_cls_pkg) # print('######################################################################') sorted_method_ids, method_deps = graph.toposorted_nodes(graph.find_edges(label='invokes')) # print(sorted_method_ids) - + counter = 0 - + for met_id in 
tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): cls_id, pkg_id = met_to_cls_pkg[met_id] method = graph.nodes[met_id] clasz = graph.nodes[cls_id] - self.process_script(graph, client, model, jsonl_file, log_file, method, clasz, method_deps) + self.process_script(graph, jsonl_file, log_file, method, clasz, method_deps) check_stop() counter += 1 - if counter==10: + if counter == 10: log_file.flush() jsonl_file.flush() counter %= 10 - hierarchy = build_hierarchy(triplets) - sorted_pkg_ids, pkg_deps = graph.toposorted_nodes(graph.find_edges(label='contains',where_source=lambda node: 'Structure' not in node.labels,where_target=lambda node: 'Structure' not in node.labels)) - + hierarchy = build_hierarchy(trips) + sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( + graph.find_edges(label='contains', where_source=lambda node: 'Structure' not in node.labels, + where_target=lambda node: 'Structure' not in node.labels)) + for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): pkg_data = hierarchy.get(pkg_id, dict()) package = graph.nodes[pkg_id] @@ -333,11 +338,11 @@ def process_hierarchy(self, graph: Graph, client: OpenAI, model: str, jsonl_file for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=2, leave=False): clasz = graph.nodes[cls_id] - self.process_structure(graph, client, model, jsonl_file, log_file, clasz, cls_data) + self.process_structure(graph, jsonl_file, log_file, clasz, cls_data) check_stop() - self.process_component(graph, client, model, jsonl_file, log_file, package, pkg_data, pkg_deps) + self.process_component(graph, jsonl_file, log_file, package, pkg_data, pkg_deps) log_file.flush() jsonl_file.flush() @@ -346,26 +351,29 @@ def process_hierarchy(self, graph: Graph, client: OpenAI, model: str, jsonl_file paths = graph.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") path_groups = group_paths_by_endpoints(paths) - - pkg_pairs = 
list(combinations(sorted_pkg_ids,2)) + + pkg_pairs = list(combinations(sorted_pkg_ids, 2)) for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): - + check_stop() pkg1 = graph.nodes[pkg1_id] pkg2 = graph.nodes[pkg2_id] if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): - if path_groups[(pkg1_id,pkg2_id)]: - self.process_interactions(graph, client, model, pkg1, pkg2, path_groups[(pkg1_id,pkg2_id)], hierarchy, jsonl_file, log_file) - if path_groups[(pkg2_id,pkg1_id)]: - self.process_interactions(graph, client, model, pkg2, pkg1, path_groups[(pkg2_id,pkg1_id)], hierarchy, jsonl_file, log_file) - - def compose_prompt(self, p, function_parameters): + if path_groups[(pkg1_id, pkg2_id)]: + self.process_interactions(graph, pkg1, pkg2, path_groups[(pkg1_id, pkg2_id)], hierarchy, jsonl_file, + log_file) + if path_groups[(pkg2_id, pkg1_id)]: + self.process_interactions(graph, pkg2, pkg1, path_groups[(pkg2_id, pkg1_id)], hierarchy, jsonl_file, + log_file) + + @staticmethod + def compose_prompt(p, function_parameters): prompt = p - for k,v in function_parameters.items(): + for k, v in function_parameters.items(): if isinstance(v, dict) and len(v): prompt += f"## {k}\n\n" - for k1,v1 in v.items(): + for k1, v1 in v.items(): if v1: prompt += f"* {k1}: {str(v1)}\n" prompt += "\n\n" @@ -378,15 +386,17 @@ def compose_prompt(self, p, function_parameters): elif v: prompt += f"## {k}\n\n{str(v)}\n\n" return prompt.strip() - - def process_script(self, graph: Graph, client: OpenAI, model: str, jsonl_file: TextIOWrapper, log_file: TextIOWrapper, script: Node, structure: Node, node_deps: dict): + + def process_script(self, graph: Graph, jsonl_file: TextIO, log_file: TextIO, script: Node, + structure: Node, node_deps: dict): """Process a single method and generate its description.""" - if 'description' not in script.properties or not script.properties['description'] or script.properties['description'] == "(no 
description)": + if 'description' not in script.properties or not script.properties['description'] or script.properties[ + 'description'] == "(no description)": script_name = script.properties['simpleName'] script_src = remove_java_comments(script.properties['sourceText']) script_kind = script.properties.get('kind', 'function') - + structure_name = structure.properties['qualifiedName'] structure_kind = structure.properties['kind'] structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind @@ -395,42 +405,35 @@ def process_script(self, graph: Graph, client: OpenAI, model: str, jsonl_file: T script_parameters = OrderedDict() script_parameters["Project Name"] = self.project_name script_parameters["Project Description"] = self.project_desc - script_parameters[f"{script_kind.title()} Declaration"] = f"The {script_kind} {script_name} is declared within the {structure_kind} {structure_name}." + script_parameters[ + f"{script_kind.title()} Declaration"] = f"The {script_kind} {script_name} is declared within the {structure_kind} {structure_name}." 
script_parameters[f"{script_kind.title()} Source Code"] = script_src - script_parameters[f"Other Functions/Methods Used"] = { - graph.nodes[node_id].properties['qualifiedName']: f"{self.describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" - for node_id in node_deps[script.id] - } + script_parameters[f"Other Functions/Methods Used"] = {graph.nodes[node_id].properties[ + 'qualifiedName']: f"{self.describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" + for node_id in node_deps[script.id]} script_parameters["Possible Architectural Layers"] = dict(self.layers) prompt = self.compose_prompt(prompt, script_parameters) - + log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_json_description(client, model, prompt, "AnalyzeScript") + description = self.generate_json_description(prompt, "AnalyzeScript") self.update_method_properties(graph, description, script) layer_id = None - if script.has_property("layer") and \ - script.property("layer") in [name for name, _ in self.layers]: + if script.has_property("layer") and script.property("layer") in [name for name, _ in self.layers]: layer_id = f"layer:{script.property('layer')}" layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) if layer_node: e = graph.add_edge(script.id, layer_node.id, "implements", weight=1) write_jsonl(jsonl_file, e.to_dict()) - write_jsonl(jsonl_file, { - 'data': { - 'id': script.id, - 'labels': script.labels, - 'properties': description - } - }) - - def process_structure(self, graph: Graph, client: OpenAI, model: str, jsonl_file, log_file, structure: Node, structure_scripts: list): + write_jsonl(jsonl_file, {'data': {'id': script.id, 'labels': script.labels, 'properties': description}}) + + def process_structure(self, graph: Graph, jsonl_file, log_file, structure: Node, structure_scripts: list): """Process a single class and generate its description.""" - + # if 'description' 
not in structure.properties or not structure.properties['description'] or structure.properties['description'] == "(no description)": ancestors, variables = self.get_structure_relations(graph, structure.id) script_descriptions = self.get_script_descriptions(graph, structure_scripts) @@ -453,21 +456,16 @@ def process_structure(self, graph: Graph, client: OpenAI, model: str, jsonl_file log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_json_description(client, model, prompt, "AnalyzeStructure") + description = self.generate_json_description(prompt, "AnalyzeStructure") self.update_class_properties(graph, description, structure) - write_jsonl(jsonl_file, { - 'data': { - 'id': structure.id, - 'labels': list(structure.labels), - 'properties': description - } - }) + write_jsonl(jsonl_file, + {'data': {'id': structure.id, 'labels': list(structure.labels), 'properties': description}}) - def process_component(self, graph: Graph, client: OpenAI, model: str, jsonl_file, log_file, component: Node, - component_contents: dict, component_deps: dict): + def process_component(self, graph: Graph, jsonl_file, log_file, component: Node, component_contents: dict, + component_deps: dict): """Process a single package and generate its description.""" - + # if 'description' not in component.properties or not component.properties['description'] or component.properties['description'] == "(no description)": structure_descriptions = self.get_structure_descriptions(graph, component_contents) subcomponent_descriptions = self.get_component_descriptions(graph, component_deps[component.id]) @@ -488,92 +486,71 @@ def process_component(self, graph: Graph, client: OpenAI, model: str, jsonl_file log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_json_description(client, model, prompt, "AnalyzeComponent") + description = self.generate_json_description(prompt, "AnalyzeComponent") self.update_package_properties(graph, description, 
component) layer_id = None - if component.has_property("layer") and \ - component.property("layer") in [name for name, _ in self.layers]: + if component.has_property("layer") and component.property("layer") in [name for name, _ in self.layers]: layer_id = f"layer:{component.property('layer')}" layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) if layer_node: graph.add_edge(component.id, layer_node.id, "implements", weight=1) - write_jsonl(jsonl_file, { - 'data': { - 'id': component.id, - 'labels': list(component.labels), - 'properties': description - } - }) + write_jsonl(jsonl_file, + {'data': {'id': component.id, 'labels': list(component.labels), 'properties': description}}) - def process_interactions(self, graph: Graph, client: OpenAI, model: str, c1: Node, c2: Node, path_groups: List[Edge], hierarchy, jsonl_file: TextIOWrapper, log_file: TextIOWrapper): + def process_interactions(self, graph: Graph, c1: Node, c2: Node, path_groups: List[List[Edge]], hierarchy, + jsonl_file: TextIO, log_file: TextIO): c1_name = c1.properties["qualifiedName"] c2_name = c2.properties["qualifiedName"] c1_desc = c1.properties["description"] c2_desc = c2.properties["description"] - + c1_contents = hierarchy.get(c1.id, dict()) c2_contents = hierarchy.get(c2.id, dict()) - - c1_structure_info = "\n".join(f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" for c_id, _ in c1_contents.items()) - c2_structure_info = "\n".join(f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" for c_id, _ in c2_contents.items()) - - def describe_path(path): - src_structure = graph.nodes[path[1].source] - src_method = graph.nodes[path[1].target] - tgt_method = graph.nodes[path[-2].source] - tgt_structure = graph.nodes[path[-2].target] - return f"{src_method.properties['kind'].capitalize()} `{src_method.properties['simpleName']}` ({src_method.properties['description']}) of 
{src_structure.properties['kind']} `{src_structure.properties['qualifiedName']}` invokes {tgt_method.properties['kind']} `{tgt_method.properties['simpleName']}` ({tgt_method.properties['description']}) of {tgt_structure.properties['kind']} `{tgt_structure.properties['qualifiedName']}`." - - dep_info = f" - Dependencies from `{c1_name}` to `{c2_name}`:\n" + "\n".join(f" - {describe_path(path)}" for path in path_groups) if path_groups else "" - - prompt = templates.interaction_analysis.format( - project_name=self.project_name, - project_desc=self.project_desc, - pkg1_name=c1_name, - pkg2_name=c2_name, - pkg1_desc=c1_desc, - pkg2_desc=c2_desc, - cls1_info=c1_structure_info, - cls2_info=c2_structure_info, - dep_info=dep_info - ) + + c1_structure_info = "\n".join( + f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" + for c_id, _ in c1_contents.items()) + c2_structure_info = "\n".join( + f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" + for c_id, _ in c2_contents.items()) + + dep_info = f" - Dependencies from `{c1_name}` to `{c2_name}`:\n" + "\n".join( + f" - {describe_path(graph, path)}" for path in path_groups) if path_groups else "" + + prompt = templates.interaction_analysis.format(project_name=self.project_name, project_desc=self.project_desc, + pkg1_name=c1_name, pkg2_name=c2_name, pkg1_desc=c1_desc, + pkg2_desc=c2_desc, cls1_info=c1_structure_info, + cls2_info=c2_structure_info, dep_info=dep_info) log_file.write(prompt) log_file.write('\n\n======\n\n') - description = self.generate_text_description(client, model, prompt) + description = self.generate_text_description(prompt) pkg1_edge = Edge(source=c1.id, target=c2.id, label="dependsOn", description=description) if dep_info else None if pkg1_edge: if "dependsOn" not in graph.edges: graph.edges["dependsOn"] = [] graph.edges["dependsOn"].append(pkg1_edge) - + write_jsonl(jsonl_file, pkg1_edge.to_dict()) - def 
generate_json_description(self, client: OpenAI, model: str, prompt: str = None, tool: str = None) -> dict: + def generate_json_description(self, prompt: str = None, tool: str = None) -> dict: """Generate a description using the OpenAI client.""" try: if tool: - print(prompt) - response = client.chat.completions.create( - model=model, - messages=[ - {"role": "system", "content": "You are a software architecture analysis tool."}, - {"role": "user", "content": prompt} - ], - tools=[templates.analyze_script_tool, templates.analyze_structure_tool, templates.analyze_component_tool], - tool_choice={"name": tool}, - temperature=0, - seed=42, - timeout=float(self.config['llm'].get('timeout', 300)) - ) - print(response) - + response = self.client.chat.completions.create(model=self.model, messages=[ + {"role": "system", "content": "You are a software architecture analysis tool."}, + {"role": "user", "content": prompt}], tools=[templates.analyze_script_tool, + templates.analyze_structure_tool, + templates.analyze_component_tool], + tool_choice="required", temperature=0, seed=42, + timeout=self.timeout) + tool_calls = response.choices[0].message.tool_calls - + if tool_calls: args_str = tool_calls[0].function.arguments description = json.loads(args_str) @@ -586,64 +563,51 @@ def generate_json_description(self, client: OpenAI, model: str, prompt: str = No description = dict() else: - response = client.chat.completions.create( - model=model, - response_format={"type": "json_object"}, - messages=[{"role": "user", "content": prompt}], - max_tokens=4096, - temperature=0, - seed=42, - timeout=float(self.config['llm'].get('timeout', 300)) - ) - + response = self.client.chat.completions.create(model=self.model, + response_format={"type": "json_object"}, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096, temperature=0, seed=42, + timeout=self.timeout) + content = response.choices[0].message.content description = json.loads(content) except Exception as e: - 
sys.stderr.write("Generate JSON description error: %s", e) + sys.stderr.write(f"Generate JSON description error: {e}") description = {} if 'description' not in description: description['description'] = "(no description)" return description - def generate_text_description(self, client: OpenAI, model: str, prompt: str) -> dict: + def generate_text_description(self, prompt: str) -> str: """Generate a description using the OpenAI client.""" try: - response = client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": prompt}], - max_tokens=4096, - temperature=0, - seed=42, - timeout=float(self.config['llm'].get('timeout', 300)) - ) + response = self.client.chat.completions.create(model=self.model, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096, temperature=0, seed=42, + timeout=float(self.config['llm'].get('timeout', 300))) description = response.choices[0].message.content except Exception as e: - sys.stderr.write("Generate text description error: %s", e) + sys.stderr.write(f"Generate text description error: {e}") description = "(no description)" return description - def update_method_properties(self, data: Graph, description: dict, method: Node): + @staticmethod + def update_method_properties(data: Graph, description: dict, method: Node): """Update method properties with the generated description.""" - + for key, value in description.items(): if key.endswith('Reason'): continue key_lower = lower_first(key) if key_lower == 'parameters' and isinstance(value, Iterable): - param_nodes = [ - data.nodes[edge.target] - for edge in data.find_edges(label='hasParameter') - if edge.source == method.id - ] + param_nodes = [data.nodes[edge.target] for edge in data.find_edges(label='hasParameter') if + edge.source == method.id] for param in value: if isinstance(param, dict): - matching_params = [ - node - for node in param_nodes - if node.properties['simpleName'] == param.get('name') - ] + matching_params = [node for node in 
param_nodes if + node.properties['simpleName'] == param.get('name')] if matching_params: param_node_id = matching_params[0].id if param_node_id in data.nodes: @@ -653,68 +617,65 @@ def update_method_properties(self, data: Graph, description: dict, method: Node) else: data.nodes[method.id].properties[key_lower] = value - def update_class_properties(self, data: Graph, description: dict, clasz: Node): + @staticmethod + def update_class_properties(data: Graph, description: dict, clasz: Node): """Update class properties with the generated description.""" for key in description: if not key.endswith('Reason'): data.nodes[clasz.id].properties[lower_first(key)] = description[key] - def update_package_properties(self, data: Graph, description: dict, package: Node): + @staticmethod + def update_package_properties(data: Graph, description: dict, package: Node): """Update package properties with the generated description.""" for key in description: if not key.endswith('Reason'): data.nodes[package.id].properties[lower_first(key)] = description[key] - def get_structure_relations(self, data: Graph, cls_id: str) -> tuple: + @staticmethod + def get_structure_relations(data: Graph, cls_id: str) -> tuple: """Retrieve class ancestors and fields.""" - ancestors = list({ - data.nodes[edge.target].property("qualifiedName") - for edge in data.find_edges(label='specializes') - if edge.source == cls_id - }) - fields = { - data.nodes[edge.target] - for edge in data.find_edges(label='hasVariable') - if edge.source == cls_id - } - fields = [ - ' '.join(remove_java_comments(field.properties['sourceText']).split()) - for field in fields - ] + ancestors = list( + {data.nodes[edge.target].property("qualifiedName") for edge in data.find_edges(label='specializes') if + edge.source == cls_id}) + fields = {data.nodes[edge.target] for edge in data.find_edges(label='hasVariable') if edge.source == cls_id} + fields = [' '.join(remove_java_comments(field.properties['sourceText']).split()) for field in 
fields] return ancestors, fields - def get_script_descriptions(self, data: Graph, cls_data: list) -> dict: + def get_script_descriptions(self, data: Graph, cls_data: list) -> dict[str,str]: """Generate descriptions for methods.""" - return { - data.nodes[met_id].properties['simpleName']: self.describe(data.nodes[met_id]) - for met_id in cls_data - } + return {data.nodes[met_id].properties['simpleName']: self.describe(data.nodes[met_id]) for met_id in cls_data} - def get_structure_descriptions(self, data: Graph, pkg_data: dict) -> list: + def get_structure_descriptions(self, data: Graph, pkg_data: dict) -> dict[str,str]: """Generate descriptions for classes.""" return { - f"{data.nodes[cls_id].properties['kind']} {data.nodes[cls_id].properties['qualifiedName']}": self.describe(data.nodes[cls_id]) - for cls_id, _ in pkg_data.items() - } + f"{data.nodes[cls_id].properties['kind']} {data.nodes[cls_id].properties['qualifiedName']}": self.describe( + data.nodes[cls_id]) for cls_id, _ in pkg_data.items()} - def get_component_descriptions(self, data: Graph, package_ids: list) -> list: + def get_component_descriptions(self, data: Graph, package_ids: list) -> dict[str,str]: """Generate descriptions for packages.""" - return { - data.nodes[pkg_id].properties['qualifiedName']: self.describe(data.nodes[pkg_id]) - for pkg_id in package_ids - } + return {data.nodes[pkg_id].properties['qualifiedName']: self.describe(data.nodes[pkg_id]) for pkg_id in + package_ids} + + +def describe_path(graph, path): + src_structure = graph.nodes[path[1].source] + src_method = graph.nodes[path[1].target] + tgt_method = graph.nodes[path[-2].source] + tgt_structure = graph.nodes[path[-2].target] + return f"{src_method.properties['kind'].capitalize()} `{src_method.properties['simpleName']}` ({src_method.properties['description']}) of {src_structure.properties['kind']} `{src_structure.properties['qualifiedName']}` invokes {tgt_method.properties['kind']} `{tgt_method.properties['simpleName']}` 
({tgt_method.properties['description']}) of {tgt_structure.properties['kind']} `{tgt_structure.properties['qualifiedName']}`." + def find_first_valid_json(text: str) -> str: """ Finds the first valid JSON substring in the given text using a stack-based approach. - + It scans the text from left to right, and when it encounters a '{', it tracks the balanced braces until a complete JSON object is formed. Once a candidate is found, it attempts to parse it with json.loads(). If parsing succeeds, that candidate is returned immediately. - + Args: text (str): The input string that may contain a JSON object. - + Returns: str: The first valid JSON substring found, or an empty string if none is found. """ @@ -728,7 +689,7 @@ def find_first_valid_json(text: str) -> str: elif text[j] == '}': stack -= 1 if stack == 0: - candidate = text[i:j+1] + candidate = text[i:j + 1] try: json.loads(candidate) return candidate @@ -737,6 +698,7 @@ def find_first_valid_json(text: str) -> str: break return "" + def simplify_name(name): if '(' in name and name.endswith(')'): prefix, params = name.split('(', 2) @@ -745,20 +707,18 @@ def simplify_name(name): else: return name + def merge_node_properties(dict1: Dict[str, Node], dict2: Dict[str, Node], simplify_names=False): for id2, obj2 in dict2.items(): - matched_obj: Node = None + matched_obj = None if id2 in dict1 and set(dict1[id2].labels) & set(obj2.labels): matched_obj = dict1[id2] elif simplify_names: - dict1_name_remap = { - simplify_name(key): key - for key in dict1 - if {'Script', 'Operation', 'Constructor'} & set(dict1[key].labels) - } + dict1_name_remap = {simplify_name(key): key for key in dict1 if + {'Script', 'Operation', 'Constructor'} & set(dict1[key].labels)} if id2 in dict1_name_remap and set(dict1[dict1_name_remap[id2]].labels) & set(obj2.labels): matched_obj = dict1[dict1_name_remap[id2]] From 83c3e3e4aa87b2e936d408030cd0cb8f06081f2a Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Fri, 28 Feb 2025 16:24:17 +0100 
Subject: [PATCH 17/34] Refactor --- arcanalib/graph.py | 187 +++++++++++++--------------------------- arcanalib/pipefilter.py | 4 +- 2 files changed, 63 insertions(+), 128 deletions(-) diff --git a/arcanalib/graph.py b/arcanalib/graph.py index 896307f..4c4a6fc 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -3,6 +3,7 @@ from collections.abc import Iterable from typing import Optional, List, Dict, Union, Set, Tuple + class Node: def __init__(self, _id, *labels, **properties): self.id = _id @@ -10,9 +11,9 @@ def __init__(self, _id, *labels, **properties): self.properties = properties # Meta/cache references - self._graph = None # The parent Graph, for on-demand lookups - self._sources_cache = {} # edge_label -> List[Node] - self._targets_cache = {} # edge_label -> List[Node] + self._graph = None # The parent Graph, for on-demand lookups + self._sources_cache = {} # edge_label -> List[Node] + self._targets_cache = {} # edge_label -> List[Node] def set_graph(self, graph): self._graph = graph @@ -56,9 +57,7 @@ def sources(self, edge_label: str): if not self._graph: return [] es = self._graph.edges.get(edge_label, []) - self._sources_cache[edge_label] = [ - self._graph.nodes[e.source] for e in es if e.target == self.id - ] + self._sources_cache[edge_label] = [self._graph.nodes[e.source] for e in es if e.target == self.id] return self._sources_cache[edge_label] def targets(self, edge_label: str): @@ -66,23 +65,16 @@ def targets(self, edge_label: str): if not self._graph: return [] es = self._graph.edges.get(edge_label, []) - self._targets_cache[edge_label] = [ - self._graph.nodes[e.target] for e in es if e.source == self.id - ] + self._targets_cache[edge_label] = [self._graph.nodes[e.target] for e in es if e.source == self.id] return self._targets_cache[edge_label] def to_dict(self): - return { - 'data': { - 'id': self.id, - 'labels': list(self.labels), - 'properties': self.properties - } - } + return {'data': {'id': self.id, 'labels': list(self.labels), 
'properties': self.properties}} def __repr__(self): return json.dumps(self.to_dict()) + class Edge: def __init__(self, source, target, label, **properties): self.id = f'{source}-{label}-{target}' @@ -129,41 +121,27 @@ def target_node(self): return self._cached_target_node def to_dict(self): - return { - 'data': { - 'id': self.id, - 'source': self.source, - 'target': self.target, - 'label': self.label_val, - 'properties': self.properties - } - } + return {'data': {'id': self.id, 'source': self.source, 'target': self.target, 'label': self.label_val, + 'properties': self.properties}} def __repr__(self): return json.dumps(self.to_dict()) + def invert(edge_list: List[Edge], new_label: Optional[str] = None) -> List[Edge]: aggregated = [] for edge in edge_list: lbl = new_label if new_label else f"inv_{edge.label_val}" - e = Edge( - source=edge.target, - target=edge.source, - label=lbl, - **edge.properties - ) + e = Edge(source=edge.target, target=edge.source, label=lbl, **edge.properties) aggregated.append(e) return aggregated + def compose(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = None) -> List[Edge]: mapping = defaultdict(list) for edge in edges2: w = edge.properties.get('weight', 1) - mapping[edge.source].append({ - 'target': edge.target, - 'label': edge.label_val, - 'weight': w - }) + mapping[edge.source].append({'target': edge.target, 'label': edge.label_val, 'weight': w}) aggregated = {} for edge in edges1: @@ -180,9 +158,11 @@ def compose(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = N aggregated[key].properties['weight'] += new_w return list(aggregated.values()) + def lift(edges1: List[Edge], edges2: List[Edge], new_label: Optional[str] = None) -> List[Edge]: return compose(compose(edges1, edges2), invert(edges1), new_label) + def triplets(edge_list1: List[Edge], edge_list2: List[Edge]) -> Set[Tuple[str, str, str]]: source_mapping = defaultdict(list) for edge in edge_list1: @@ -196,6 +176,7 @@ def triplets(edge_list1: 
List[Edge], edge_list2: List[Edge]) -> Set[Tuple[str, s paths.add((source1, edge.source, edge.target)) return paths + class Graph: def __init__(self, graph_data: dict = None) -> None: if not graph_data: @@ -225,7 +206,7 @@ def _set_graph_refs(self): for edge in elist: edge.set_graph(self) - def add_node(self, _id: str, *labels, **properties) -> Node: + def add_node(self, _id: str, *labels, **properties) -> Optional[Node]: if _id in self.nodes: return None n = Node(_id, *(labels or []), **(properties or {})) @@ -233,10 +214,11 @@ def add_node(self, _id: str, *labels, **properties) -> Node: n.set_graph(self) return n - def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties) -> Edge: + def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties) -> Optional[Edge]: if source_id not in self.nodes or target_id not in self.nodes: return None - if self.find_edges(label=edge_label, where_source=lambda n: n.id == source_id, where_target=lambda n: n.id == target_id): + if self.find_edges(label=edge_label, where_source=lambda n: n.id == source_id, + where_target=lambda n: n.id == target_id): return None e = Edge(source_id, target_id, edge_label, **(properties or {})) @@ -253,7 +235,7 @@ def add_edge(self, source_id: str, target_id: str, edge_label: str, **properties def invert_edges(self, edge_label: str, new_label: Optional[str] = None) -> None: if edge_label in self.edges: - inverted = invert(self.edges.get(edge_label,[]), new_label) + inverted = invert(self.edges.get(edge_label, []), new_label) nlabel = new_label or f"inv_{edge_label}" self.edges[nlabel] = inverted self._set_graph_refs() @@ -261,23 +243,19 @@ def invert_edges(self, edge_label: str, new_label: Optional[str] = None) -> None def compose_edges(self, edge_label1: str, edge_label2: str, new_label: Optional[str] = None) -> None: if (edge_label1 in self.edges) and (edge_label2 in self.edges): nlabel = new_label or f"{edge_label1}_{edge_label2}" - composed_list = 
compose(self.edges.get(edge_label1, []), self.edges.get(edge_label2,[]), nlabel) + composed_list = compose(self.edges.get(edge_label1, []), self.edges.get(edge_label2, []), nlabel) self.edges[nlabel] = composed_list self._set_graph_refs() def lift_edges(self, edge_label1: str, edge_label2: str, new_label: Optional[str] = None) -> None: if (edge_label1 in self.edges) and (edge_label2 in self.edges): - lifted_list = lift(self.edges.get(edge_label1,[]), self.edges.get(edge_label2,[]), new_label) + lifted_list = lift(self.edges.get(edge_label1, []), self.edges.get(edge_label2, []), new_label) nlabel = new_label or f"lifted_{edge_label1}_{edge_label2}" self.edges[nlabel] = lifted_list self._set_graph_refs() def filter_nodes_by_labels(self, labels: Union[List[str], Set[str]]) -> Dict[str, Node]: - return { - k: v - for k, v in self.nodes.items() - if any(label in v.labels for label in labels) - } + return {k: v for k, v in self.nodes.items() if any(label in v.labels for label in labels)} def get_all_node_labels(self) -> Set[str]: return {label for node in self.nodes.values() for label in node.labels} @@ -287,11 +265,9 @@ def get_all_edge_labels(self) -> Set[str]: def get_edges_with_node_labels(self, edge_label: str, node_label: str) -> List[Edge]: if edge_label in self.edges: - return [ - edge for edge in self.edges.get(edge_label,[]) - if node_label in self.self.nodes.get(edge.source, Node(None)).labels - and node_label in self.nodes.get(edge.target, Node(None)).labels - ] + return [edge for edge in self.edges.get(edge_label, []) if + node_label in self.nodes.get(edge.source, Node(None)).labels and node_label in self.nodes.get( + edge.target, Node(None)).labels] return [] def get_edge_node_labels(self, edge: Edge) -> List[Tuple[str, str]]: @@ -302,22 +278,12 @@ def get_edge_node_labels(self, edge: Edge) -> List[Tuple[str, str]]: def get_source_and_target_labels(self, edge_label: str) -> Set[Tuple[str, str]]: if edge_label not in self.edges: return set() - return { - 
(sl, tl) - for e in self.edges.get(edge_label,[]) - for (sl, tl) in self.get_edge_node_labels(e) - } + return {(sl, tl) for e in self.edges.get(edge_label, []) for (sl, tl) in self.get_edge_node_labels(e)} def generate_ontology(self) -> 'Graph': - ontology_map = { - label: self.get_source_and_target_labels(label) for label in self.edges - } + ontology_map = {label: self.get_source_and_target_labels(label) for label in self.edges} onto_graph = Graph() - onto_graph.edges = { - lbl: [ - Edge(src, tgt, lbl) for (src, tgt) in ontology_map[lbl] - ] for lbl in ontology_map - } + onto_graph.edges = {lbl: [Edge(src, tgt, lbl) for (src, tgt) in ontology_map[lbl]] for lbl in ontology_map} sources = {src for lbl in ontology_map for (src, _) in ontology_map[lbl]} targets = {tgt for lbl in ontology_map for (_, tgt) in ontology_map[lbl]} all_ids = sources.union(targets) @@ -326,51 +292,34 @@ def generate_ontology(self) -> 'Graph': return onto_graph def find_nodes(self, label=None, where=None) -> List[Node]: - return [ - node for node in self.nodes.values() - if (not label or label in node.labels) and (not where or where(node)) - ] + return [node for node in self.nodes.values() if + (not label or label in node.labels) and (not where or where(node))] - def find_node(self, label=None, where=None) -> Node: + def find_node(self, label=None, where=None) -> Optional[Node]: nodes = self.find_nodes(label, where) if nodes: return nodes[0] return None - - def find_edge(self, - label=None, - source_label=None, - target_label=None, - where_edge=None, - where_source=None, - where_target=None - ): - edges = self.find_edges(label,source_label,target_label,where_edge,where_source,where_target) + + def find_edge(self, label=None, source_label=None, target_label=None, where_edge=None, where_source=None, + where_target=None): + edges = self.find_edges(label, source_label, target_label, where_edge, where_source, where_target) if edges: return edges[0] return None - - def find_edges(self, - 
label=None, - source_label=None, - target_label=None, - where_edge=None, - where_source=None, - where_target=None - ): + + def find_edges(self, label=None, source_label=None, target_label=None, where_edge=None, where_source=None, + where_target=None): if label: edge_list = self.edges.get(label, []) else: edge_list = [e for edges in self.edges.values() for e in edges] - return [ - e for e in edge_list - if (not source_label or source_label in self.nodes[e.source].labels) - and (not target_label or target_label in self.nodes[e.target].labels) - and (not where_edge or where_edge(e)) - and (not where_source or where_source(self.nodes[e.source])) - and (not where_target or where_target(self.nodes[e.target])) - ] + return [e for e in edge_list if (not source_label or source_label in self.nodes[e.source].labels) and ( + not target_label or target_label in self.nodes[e.target].labels) and ( + not where_edge or where_edge(e)) and ( + not where_source or where_source(self.nodes[e.source])) and ( + not where_target or where_target(self.nodes[e.target]))] def find_source(self, edge_list: List[Edge], start_node: Node, predicate, default: Node = None): predecessors = defaultdict(list) @@ -434,7 +383,8 @@ def process_nodes(self, edges: List[Edge], node_processor): results[n_id] = node_processor(self.nodes[n_id], resolved) return results - def toposorted_nodes(self, edges: List[Edge]): + @staticmethod + def toposorted_nodes(edges: List[Edge]): adj_list, outdegree = Graph._adj_list(edges) sorted_nodes = [] node_deps = {} @@ -458,29 +408,27 @@ def toposorted_nodes(self, edges: List[Edge]): dependencies = adj_list.get(n_id, []) sorted_nodes.append(n_id) node_deps[n_id] = dependencies - return (sorted_nodes, node_deps) + return sorted_nodes, node_deps def clean_up(self): for edge_type in list(self.edges.keys()): - self.edges[edge_type] = [ - e for e in self.edges.get(edge_type,[]) - if e.source in self.nodes and e.target in self.nodes - ] + self.edges[edge_type] = [e for e in 
self.edges.get(edge_type, []) if + e.source in self.nodes and e.target in self.nodes] - def find_paths(self, *edge_sequence: List[str]) -> List[List[Edge]]: + def find_paths(self, *edge_sequence: str) -> List[List[Edge]]: def get_edges(label: str) -> List[Edge]: if label.startswith('-'): base_label = label[1:] if base_label in self.edges: - return invert(self.edges.get(base_label,[])) + return invert(self.edges.get(base_label, [])) return [] return self.edges.get(label, []) - def find_next(current_paths: List[List[Edge]], lbl: str) -> List[List[Edge]]: + def find_next(current_paths: List[List[Edge]], label: str) -> List[List[Edge]]: result = [] for path in current_paths: last_node = path[-1].target if path else None - for candidate in get_edges(lbl): + for candidate in get_edges(label): if not path or candidate.source == last_node: result.append(path + [candidate]) return result @@ -498,27 +446,14 @@ def to_dict(self, *args: str, node_labels: Optional[Union[str, Iterable[str]]] = if node_labels == 'all': included_node_labels = self.get_all_node_labels() else: - included_node_labels: Set[str] = { - nlbl - for elbl in included_edge_labels - for nlbl_pair in self.get_source_and_target_labels(elbl) - for nlbl in nlbl_pair - } + included_node_labels: Set[str] = {nlbl for elbl in included_edge_labels for nlbl_pair in + self.get_source_and_target_labels(elbl) for nlbl in nlbl_pair} if isinstance(node_labels, str): included_node_labels.add(node_labels) elif isinstance(node_labels, Iterable): included_node_labels.update(node_labels) - included_nodes = { - k: v - for k, v in self.filter_nodes_by_labels(included_node_labels).items() - } - included_edges = { - lbl: eds for lbl, eds in self.edges.items() if lbl in included_edge_labels - } - return { - "elements": { - "nodes": [{"data": n.to_dict()['data']} for n in included_nodes.values()], - "edges": [{"data": e.to_dict()['data']} for e in sum(included_edges.values(), [])] - } - } + included_nodes = {k: v for k, v in 
self.filter_nodes_by_labels(included_node_labels).items()} + included_edges = {lbl: eds for lbl, eds in self.edges.items() if lbl in included_edge_labels} + return {"elements": {"nodes": [{"data": n.to_dict()['data']} for n in included_nodes.values()], + "edges": [{"data": e.to_dict()['data']} for e in sum(included_edges.values(), [])]}} diff --git a/arcanalib/pipefilter.py b/arcanalib/pipefilter.py index 87509b3..4e5fcf2 100644 --- a/arcanalib/pipefilter.py +++ b/arcanalib/pipefilter.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, List +from typing import Any, Dict, List, Union from arcanalib.graph import Graph @@ -49,7 +49,7 @@ def __init__(self, *filters: Filter) -> None: def add_filter(self, filt: Filter) -> None: self.filters.append(filt) - def process(self, data: Graph | Seeder) -> Any: + def process(self, data: Union[Graph,Seeder]) -> Any: """ Process the data through the sequence of filters in the pipeline. If a seeder is provided instead of graph data, use the seeder to generate the graph data. 
From 1d5e5b34035188c19b18710c5bf7eb43e06da8da Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 21 May 2025 17:46:30 +0200 Subject: [PATCH 18/34] Restructure and migrate to ontology v2 --- arcana/__main__.py | 18 +- arcana/checkpoint.py | 34 ++ arcana/custom_encoder.py | 9 + arcana/filters.py | 738 +------------------------------- arcana/graph_utils.py | 58 +++ arcana/llm_filter/client.py | 66 +++ arcana/llm_filter/filter.py | 274 ++++++++++++ arcana/llm_filter/processors.py | 191 +++++++++ arcana/llm_filter/prompt.py | 101 +++++ arcana/merge_filter.py | 19 + arcana/metrics.py | 67 +++ arcana/seeder.py | 29 ++ arcana/templates.py | 393 +++++++++-------- arcana/utils.py | 126 ++++++ arcanalib/graph.py | 8 +- 15 files changed, 1206 insertions(+), 925 deletions(-) create mode 100644 arcana/checkpoint.py create mode 100644 arcana/custom_encoder.py create mode 100644 arcana/graph_utils.py create mode 100644 arcana/llm_filter/client.py create mode 100644 arcana/llm_filter/filter.py create mode 100644 arcana/llm_filter/processors.py create mode 100644 arcana/llm_filter/prompt.py create mode 100644 arcana/merge_filter.py create mode 100644 arcana/metrics.py create mode 100644 arcana/seeder.py create mode 100644 arcana/utils.py diff --git a/arcana/__main__.py b/arcana/__main__.py index 5827e96..280b87f 100644 --- a/arcana/__main__.py +++ b/arcana/__main__.py @@ -1,9 +1,15 @@ import argparse import configparser +import time import json +import logging import sys -from arcana.filters import CLISeeder, MetricsFilter, LLMFilter, MergeFilter +from arcana.checkpoint import writer +from arcana.llm_filter.filter import LLMFilter +from arcana.merge_filter import MergeFilter +from arcana.metrics import MetricsFilter +from arcana.seeder import CLISeeder from arcanalib.graph import Graph from arcanalib.pipefilter import Pipeline @@ -55,6 +61,16 @@ def main(): commands = args.command.split('-') if commands: + current_time_str = time.strftime("%Y%m%d-%H%M%S") + jsonl_file = 
f'arcana-{current_time_str}.jsonl' + w = writer(jsonl_file) + log_file = f'arcana-{current_time_str}.log' + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s %(name)s %(levelname)s: %(message)s", + filename=log_file, + filemode="a", # append + ) pipeline = Pipeline(*[ filters[command](config) for command in commands diff --git a/arcana/checkpoint.py b/arcana/checkpoint.py new file mode 100644 index 0000000..df7ee8a --- /dev/null +++ b/arcana/checkpoint.py @@ -0,0 +1,34 @@ +# checkpoint.py +import json +from threading import Lock + +from arcana.custom_encoder import CustomJSONEncoder + +class JSONLWriter: + _instance = None + _lock = Lock() + + def __new__(cls, path): + with cls._lock: + if cls._instance is None: + inst = super().__new__(cls) + inst._file = open(path, "a", buffering=1) + cls._instance = inst + return cls._instance + + def write(self, data: dict): + self._file.write(json.dumps(data, cls=CustomJSONEncoder) + "\n") + + def flush(self): + try: + self._file.flush() + finally: + pass + +def writer(path=None): + """ + If path is provided and writer not yet instantiated, uses it. + Otherwise, falls back to default. 
+ """ + effective = path or "checkpoints.jsonl" + return JSONLWriter(effective) diff --git a/arcana/custom_encoder.py b/arcana/custom_encoder.py new file mode 100644 index 0000000..f42bd95 --- /dev/null +++ b/arcana/custom_encoder.py @@ -0,0 +1,9 @@ +import json + +class CustomJSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + # Convert set to list + return list(obj) + # Call the default method for other types + return super().default(obj) \ No newline at end of file diff --git a/arcana/filters.py b/arcana/filters.py index a006ed5..88b019c 100644 --- a/arcana/filters.py +++ b/arcana/filters.py @@ -1,78 +1,5 @@ -import json import os -import re -import subprocess -import sys -import time -from collections import Counter, OrderedDict, defaultdict -from collections.abc import Iterable -from itertools import combinations -from typing import Any, Dict, List, Tuple, TextIO - -from openai import OpenAI -from tqdm.auto import tqdm - -from arcana import templates -from arcanalib.graph import Edge, Graph, Node, invert, lift, triplets -from arcanalib.pipefilter import Filter, Seeder - - -def remove_author(s: str) -> str: - return "\n".join(line.strip() for line in s.splitlines() if '@author' not in line) - - -_JAVA_COMMENT_RE = re.compile(r"(//.*?$)|(/\*.*?\*/)", flags=re.MULTILINE | re.DOTALL) - - -def remove_java_comments(java_source: str) -> str: - return _JAVA_COMMENT_RE.sub("", java_source).strip() - - -def sentence(s: str) -> str: - """ - Capitalize the first letter of a string and ensure it ends with a period. - - Args: - s (str): The input string. - - Returns: - str: The formatted string. - """ - if not s: - return "" - t = s.strip() - if not t: - return "" - if t[-1] in '.?!…~–—': - return f'{t[0].upper()}{t[1:]}' - return f'{t[0].upper()}{t[1:]}.' - - -def lower_first(s: str) -> str: - """ - Lowercase the first character of a string. - - Args: - s (str): The input string. 
- - Returns: - str: The string with the first character lowercased. - """ - return s[0].lower() + s[1:] if s else s - - -def prettify_json(obj: dict) -> str: - """ - Convert a dictionary to a pretty-printed JSON string. - - Args: - obj (dict): The input dictionary. - - Returns: - str: The pretty-printed JSON string. - """ - return json.dumps(obj, indent=2) - +from typing import Any, Dict, List, Tuple def layers_to_list(d: Dict[str, Any]) -> List[Tuple[str, str]]: result = [] @@ -85,663 +12,26 @@ def layers_to_list(d: Dict[str, Any]) -> List[Tuple[str, str]]: i += 1 return result +from collections import OrderedDict +from typing import Any, Dict, OrderedDict as OD -def write_jsonl(file: TextIO, obj: Any) -> None: - file.write(json.dumps(obj, cls=CustomJSONEncoder) + '\n') +def layers_to_ordereddict(d: Dict[str, Any]) -> OD[str, str]: + layers: OD[str, str] = OrderedDict() + i = 1 + while True: + name_key = f"layer{i}name" + desc_key = f"layer{i}desc" + if name_key not in d or desc_key not in d: + break + layers[d[name_key]] = d[desc_key] + i += 1 + return layers class StopProcessing(Exception): """Raised when a stop signal is detected.""" pass - def check_stop() -> None: if os.path.exists('stop'): raise StopProcessing("Stop file detected, halting processing.") - - -class CLISeeder(Seeder): - - def __init__(self, command) -> None: - """ - Initialize the seeder with a command. - - :param command: The command to be executed. - """ - self.command = command - - def generate(self) -> Graph: - """ - Execute the command, parse the JSON output into a dict, and pass the dict to the Graph constructor. - - :return: The generated Graph object. 
- """ - process = subprocess.run(self.command, capture_output=True, text=True, shell=True, encoding="utf-8", check=True) - if process.stderr: - sys.stderr.write(process.stderr) - output_dict = json.loads(process.stdout) - return Graph(output_dict) - - -def dependency_profile_category(inn: int, out: int) -> str: - if inn == 0 and out > 0: - return "outbound" - elif inn > 0 and out == 0: - return "inbound" - elif inn > 0 and out > 0: - return "transit" - return "hidden" - - -class MetricsFilter(Filter): - def process(self, data: Graph) -> Graph: - """ - Process the data to generate dependency profiles and categorize nodes. - - Args: - data (Graph): The input data. - - Returns: - Graph: The processed data with dependency profiles. - """ - parents = {e.source: e.target for e in invert(data.find_edges(label='contains'))} - dependency_profiles = defaultdict(list) - - calls = data.edges.get('calls', - lift(data.find_edges(label='hasScript'), data.find_edges(label='invokes'), 'calls')) - - for edge in calls: - source_id, target_id = edge.source, edge.target - if parents.get(source_id) != parents.get(target_id): - dependency_profiles[source_id].append('out') - dependency_profiles[target_id].append('in') - - dependency_profiles = {node_id: Counter(prof) for node_id, prof in dependency_profiles.items()} - - for node_id, profile in dependency_profiles.items(): - data.nodes[node_id].properties['dependencyProfile'] = dependency_profile_category(profile['in'], profile['out']) - - return data - - -def build_triplets(edge_list1, edge_list2) -> list: - methods = sorted(triplets(edge_list1, edge_list2)) - - return methods - - -def build_hierarchy(method_triplets) -> dict: - classes = sorted({(pkg, clz) for pkg, clz, _ in method_triplets}) - packages = sorted({pkg for pkg, _ in classes}) - - hierarchy = { - pkg_id: {cls_id: [met_id for _, c, met_id in method_triplets if c == cls_id] for p, cls_id in classes if - p == pkg_id} for pkg_id in packages} - - return hierarchy - - -class 
CustomJSONEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - # Convert set to list - return list(obj) - # Call the default method for other types - return super().default(obj) - - -def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], List[List[Edge]]]: - """ - Groups paths by the tuple of (first edge's source, last edge's target). - - Args: - paths (List[List[Edge]]): The list of paths to group. - - Returns: - Dict[Tuple[str, str], List[List[Edge]]]: A dictionary where keys are - tuples (first edge's source, last edge's target), and values are lists of paths. - """ - grouped_paths = defaultdict(list) - for path in paths: - if path: # Ensure the path is not empty - start = path[0].source - end = path[-1].target - grouped_paths[(start, end)].append(path) - return grouped_paths - - -def format_layers(layers): - return "\n".join(f"- **{name}**: {desc}" for name, desc in layers) - - -class LLMFilter(Filter): - def __init__(self, config: Dict[str, Dict[str, Any]]): - super().__init__(config) - - self.project_name = None - self.project_desc = None - self.model = None - self.client = None - self.timeout = None - - default_layers = [("Presentation Layer", - "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), - ("Service Layer", - "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), - ("Domain Layer", - "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), - ("Data Source Layer", - "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), ] - self.layers = layers_to_list(config.get("layers", {})) or default_layers - self.layers_text = format_layers(self.layers) - 
- self.setup() - - def setup(self) -> None: - self.project_name = self.config['project']['name'] - self.project_desc = sentence(self.config['project']['desc']) - openai_client_args = {'api_key': self.config['llm'].get('apikey'), - 'base_url': self.config['llm'].get('apibase')} - self.model = self.config['llm'].get('model', "gpt-4o-mini") - self.client = OpenAI(**openai_client_args) - self.timeout = float(self.config['llm'].get('timeout', 300)) - - def process(self, data: Graph) -> Graph: - """ - Process the data using a language model to generate descriptions. - - Args: - data (Graph): The input data. - - Returns: - Graph: The processed data with generated descriptions. - """ - current_time_str = time.strftime("%Y%m%d-%H%M%S") - with open(f'arcana-{current_time_str}.jsonl', 'a', encoding="utf-8") as jsonl_file: - for i, (layer_name, layer_desc) in enumerate(self.layers): - n = data.add_node(f"layer:{layer_name}", "Grouping", kind="architectural layer", simpleName=layer_name, - qualifiedName=layer_name, description=layer_desc, layerOrder=i) - write_jsonl(jsonl_file, n.to_dict()) - for i in range(len(self.layers) - 1): - src = self.layers[i][0] - tgt = self.layers[i + 1][0] - e = data.add_edge(f"layer:{src}", f"layer:{tgt}", "allowedDependency", weight=1) - write_jsonl(jsonl_file, e.to_dict()) - with open(f'arcana-{current_time_str}.log', 'a', encoding="utf-8") as log_file: - try: - self.process_hierarchy(data, jsonl_file, log_file) - except Exception as e: - pass - return data - - @staticmethod - def describe(node: Node, *keys) -> str: - """Generate a description for a given node.""" - sr, sn = '\r', '\n' - if not keys: - keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', - 'roleStereotype', 'layer'] - - lines = {key: f"**{key}**: {sentence(str(node.properties[key]).replace(sr, '').replace(sn, ' '))}" for key in - keys if key in node.properties and key != 'docComment' and node.properties[key]} - if 'docComment' in keys 
and 'docComment' in node.properties and node.properties['docComment']: - lines[ - 'docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr, '').replace(sn, ' '))} " - - return ' '.join(lines[key] for key in keys if key in lines).strip() - - def process_hierarchy(self, graph: Graph, jsonl_file, log_file): - """Process each package, class, and method in the hierarchy.""" - - st_contains_st = graph.find_edges(label='contains', source_label='Structure', target_label='Structure') - ct_contains_st = graph.find_edges(label='contains', target_label='Structure', where_source=lambda - node: 'Container' in node.labels and 'Structure' not in node.labels) - new_ct_sources = {edge.target: graph.find_source(graph.find_edges(label='contains'), graph.nodes[edge.target], - lambda node: 'Structure' not in node.labels, - graph.nodes[edge.source]).id for edge in st_contains_st} - ct_contains_st.extend( - [Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) - - trips = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) - met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in trips} - # print(met_to_cls_pkg) - # print('######################################################################') - sorted_method_ids, method_deps = graph.toposorted_nodes(graph.find_edges(label='invokes')) - # print(sorted_method_ids) - - counter = 0 - - for met_id in tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): - cls_id, pkg_id = met_to_cls_pkg[met_id] - method = graph.nodes[met_id] - clasz = graph.nodes[cls_id] - - self.process_script(graph, jsonl_file, log_file, method, clasz, method_deps) - - check_stop() - - counter += 1 - if counter == 10: - log_file.flush() - jsonl_file.flush() - counter %= 10 - - hierarchy = build_hierarchy(trips) - sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( - graph.find_edges(label='contains', where_source=lambda node: 
'Structure' not in node.labels, - where_target=lambda node: 'Structure' not in node.labels)) - - for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): - pkg_data = hierarchy.get(pkg_id, dict()) - package = graph.nodes[pkg_id] - - for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=2, leave=False): - clasz = graph.nodes[cls_id] - - self.process_structure(graph, jsonl_file, log_file, clasz, cls_data) - - check_stop() - - self.process_component(graph, jsonl_file, log_file, package, pkg_data, pkg_deps) - - log_file.flush() - jsonl_file.flush() - - check_stop() - - paths = graph.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") - path_groups = group_paths_by_endpoints(paths) - - pkg_pairs = list(combinations(sorted_pkg_ids, 2)) - for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): - - check_stop() - - pkg1 = graph.nodes[pkg1_id] - pkg2 = graph.nodes[pkg2_id] - if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): - if path_groups[(pkg1_id, pkg2_id)]: - self.process_interactions(graph, pkg1, pkg2, path_groups[(pkg1_id, pkg2_id)], hierarchy, jsonl_file, - log_file) - if path_groups[(pkg2_id, pkg1_id)]: - self.process_interactions(graph, pkg2, pkg1, path_groups[(pkg2_id, pkg1_id)], hierarchy, jsonl_file, - log_file) - - @staticmethod - def compose_prompt(p, function_parameters): - prompt = p - for k, v in function_parameters.items(): - if isinstance(v, dict) and len(v): - prompt += f"## {k}\n\n" - for k1, v1 in v.items(): - if v1: - prompt += f"* {k1}: {str(v1)}\n" - prompt += "\n\n" - elif isinstance(v, list) and len(v): - prompt += f"## {k}\n\n" - for v1 in v: - if v1: - prompt += f"* {str(v1)}\n" - prompt += "\n\n" - elif v: - prompt += f"## {k}\n\n{str(v)}\n\n" - return prompt.strip() - - def process_script(self, graph: Graph, jsonl_file: TextIO, log_file: TextIO, script: Node, - structure: Node, node_deps: dict): - 
"""Process a single method and generate its description.""" - - if 'description' not in script.properties or not script.properties['description'] or script.properties[ - 'description'] == "(no description)": - script_name = script.properties['simpleName'] - script_src = remove_java_comments(script.properties['sourceText']) - script_kind = script.properties.get('kind', 'function') - - structure_name = structure.properties['qualifiedName'] - structure_kind = structure.properties['kind'] - structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind - - prompt = f"Describe the following {script_kind} by using the AnalyzeScript tool.\n\n" - script_parameters = OrderedDict() - script_parameters["Project Name"] = self.project_name - script_parameters["Project Description"] = self.project_desc - script_parameters[ - f"{script_kind.title()} Declaration"] = f"The {script_kind} {script_name} is declared within the {structure_kind} {structure_name}." 
- script_parameters[f"{script_kind.title()} Source Code"] = script_src - script_parameters[f"Other Functions/Methods Used"] = {graph.nodes[node_id].properties[ - 'qualifiedName']: f"{self.describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" - for node_id in node_deps[script.id]} - script_parameters["Possible Architectural Layers"] = dict(self.layers) - - prompt = self.compose_prompt(prompt, script_parameters) - - log_file.write(prompt) - log_file.write('\n\n======\n\n') - - description = self.generate_json_description(prompt, "AnalyzeScript") - self.update_method_properties(graph, description, script) - - layer_id = None - if script.has_property("layer") and script.property("layer") in [name for name, _ in self.layers]: - layer_id = f"layer:{script.property('layer')}" - layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) - if layer_node: - e = graph.add_edge(script.id, layer_node.id, "implements", weight=1) - write_jsonl(jsonl_file, e.to_dict()) - - write_jsonl(jsonl_file, {'data': {'id': script.id, 'labels': script.labels, 'properties': description}}) - - def process_structure(self, graph: Graph, jsonl_file, log_file, structure: Node, structure_scripts: list): - """Process a single class and generate its description.""" - - # if 'description' not in structure.properties or not structure.properties['description'] or structure.properties['description'] == "(no description)": - ancestors, variables = self.get_structure_relations(graph, structure.id) - script_descriptions = self.get_script_descriptions(graph, structure_scripts) - - structure_name = structure.properties['qualifiedName'] - structure_kind = structure.properties['kind'] - structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind - - prompt = f"Describe the following {structure_kind} using the AnalyzeStructure tool.\n\n" - structure_parameters = OrderedDict() - 
structure_parameters["Project Name"] = self.project_name - structure_parameters["Project Description"] = self.project_desc - structure_parameters[f"{structure_kind.title()} Name"] = structure_name - structure_parameters[f"{structure_kind.title()} Inhertis From"] = ancestors - structure_parameters[f"Enclosed Variables/Fields"] = variables - structure_parameters[f"Enclosed Functions/Methods"] = script_descriptions - - prompt = self.compose_prompt(prompt, structure_parameters) - - log_file.write(prompt) - log_file.write('\n\n======\n\n') - - description = self.generate_json_description(prompt, "AnalyzeStructure") - self.update_class_properties(graph, description, structure) - - write_jsonl(jsonl_file, - {'data': {'id': structure.id, 'labels': list(structure.labels), 'properties': description}}) - - def process_component(self, graph: Graph, jsonl_file, log_file, component: Node, component_contents: dict, - component_deps: dict): - """Process a single package and generate its description.""" - - # if 'description' not in component.properties or not component.properties['description'] or component.properties['description'] == "(no description)": - structure_descriptions = self.get_structure_descriptions(graph, component_contents) - subcomponent_descriptions = self.get_component_descriptions(graph, component_deps[component.id]) - component_kind = component.properties.get('kind', "component") - - prompt = f"Describe the following {component_kind} using the AnalyzeComponent tool.\n\n" - component_parameters = OrderedDict() - component_parameters["Project Name"] = self.project_name - component_parameters["Project Description"] = self.project_desc - component_parameters["Component Type"] = component_kind - component_parameters["Component Name"] = component.properties['qualifiedName'] - component_parameters["Enclosed Sub-components"] = subcomponent_descriptions - component_parameters["Enclosed Classes"] = structure_descriptions - component_parameters["Possible Architectural 
Layers"] = dict(self.layers) - - prompt = self.compose_prompt(prompt, component_parameters) - - log_file.write(prompt) - log_file.write('\n\n======\n\n') - - description = self.generate_json_description(prompt, "AnalyzeComponent") - self.update_package_properties(graph, description, component) - - layer_id = None - if component.has_property("layer") and component.property("layer") in [name for name, _ in self.layers]: - layer_id = f"layer:{component.property('layer')}" - layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) - if layer_node: - graph.add_edge(component.id, layer_node.id, "implements", weight=1) - - write_jsonl(jsonl_file, - {'data': {'id': component.id, 'labels': list(component.labels), 'properties': description}}) - - def process_interactions(self, graph: Graph, c1: Node, c2: Node, path_groups: List[List[Edge]], hierarchy, - jsonl_file: TextIO, log_file: TextIO): - c1_name = c1.properties["qualifiedName"] - c2_name = c2.properties["qualifiedName"] - c1_desc = c1.properties["description"] - c2_desc = c2.properties["description"] - - c1_contents = hierarchy.get(c1.id, dict()) - c2_contents = hierarchy.get(c2.id, dict()) - - c1_structure_info = "\n".join( - f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" - for c_id, _ in c1_contents.items()) - c2_structure_info = "\n".join( - f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" - for c_id, _ in c2_contents.items()) - - dep_info = f" - Dependencies from `{c1_name}` to `{c2_name}`:\n" + "\n".join( - f" - {describe_path(graph, path)}" for path in path_groups) if path_groups else "" - - prompt = templates.interaction_analysis.format(project_name=self.project_name, project_desc=self.project_desc, - pkg1_name=c1_name, pkg2_name=c2_name, pkg1_desc=c1_desc, - pkg2_desc=c2_desc, cls1_info=c1_structure_info, - cls2_info=c2_structure_info, dep_info=dep_info) - - 
log_file.write(prompt) - log_file.write('\n\n======\n\n') - - description = self.generate_text_description(prompt) - pkg1_edge = Edge(source=c1.id, target=c2.id, label="dependsOn", description=description) if dep_info else None - - if pkg1_edge: - if "dependsOn" not in graph.edges: - graph.edges["dependsOn"] = [] - graph.edges["dependsOn"].append(pkg1_edge) - - write_jsonl(jsonl_file, pkg1_edge.to_dict()) - - def generate_json_description(self, prompt: str = None, tool: str = None) -> dict: - """Generate a description using the OpenAI client.""" - try: - if tool: - response = self.client.chat.completions.create(model=self.model, messages=[ - {"role": "system", "content": "You are a software architecture analysis tool."}, - {"role": "user", "content": prompt}], tools=[templates.analyze_script_tool, - templates.analyze_structure_tool, - templates.analyze_component_tool], - tool_choice="required", temperature=0, seed=42, - timeout=self.timeout) - - tool_calls = response.choices[0].message.tool_calls - - if tool_calls: - args_str = tool_calls[0].function.arguments - description = json.loads(args_str) - else: - content = response.choices[0].message.content - json_content = find_first_valid_json(content) - if json_content: - description = json.loads(json_content) - else: - description = dict() - - else: - response = self.client.chat.completions.create(model=self.model, - response_format={"type": "json_object"}, - messages=[{"role": "user", "content": prompt}], - max_tokens=4096, temperature=0, seed=42, - timeout=self.timeout) - - content = response.choices[0].message.content - description = json.loads(content) - except Exception as e: - sys.stderr.write(f"Generate JSON description error: {e}") - description = {} - - if 'description' not in description: - description['description'] = "(no description)" - return description - - def generate_text_description(self, prompt: str) -> str: - """Generate a description using the OpenAI client.""" - try: - response = 
self.client.chat.completions.create(model=self.model, - messages=[{"role": "user", "content": prompt}], - max_tokens=4096, temperature=0, seed=42, - timeout=float(self.config['llm'].get('timeout', 300))) - description = response.choices[0].message.content - except Exception as e: - sys.stderr.write(f"Generate text description error: {e}") - description = "(no description)" - - return description - - @staticmethod - def update_method_properties(data: Graph, description: dict, method: Node): - """Update method properties with the generated description.""" - - for key, value in description.items(): - if key.endswith('Reason'): - continue - key_lower = lower_first(key) - if key_lower == 'parameters' and isinstance(value, Iterable): - param_nodes = [data.nodes[edge.target] for edge in data.find_edges(label='hasParameter') if - edge.source == method.id] - for param in value: - if isinstance(param, dict): - matching_params = [node for node in param_nodes if - node.properties['simpleName'] == param.get('name')] - if matching_params: - param_node_id = matching_params[0].id - if param_node_id in data.nodes: - data.nodes[param_node_id].properties['description'] = param.get('description') - # elif key_lower == 'returns': - # method.properties['returns'] = value.get('description', None) if value and hasattr(value, 'get') else None - else: - data.nodes[method.id].properties[key_lower] = value - - @staticmethod - def update_class_properties(data: Graph, description: dict, clasz: Node): - """Update class properties with the generated description.""" - for key in description: - if not key.endswith('Reason'): - data.nodes[clasz.id].properties[lower_first(key)] = description[key] - - @staticmethod - def update_package_properties(data: Graph, description: dict, package: Node): - """Update package properties with the generated description.""" - for key in description: - if not key.endswith('Reason'): - data.nodes[package.id].properties[lower_first(key)] = description[key] - - 
@staticmethod - def get_structure_relations(data: Graph, cls_id: str) -> tuple: - """Retrieve class ancestors and fields.""" - ancestors = list( - {data.nodes[edge.target].property("qualifiedName") for edge in data.find_edges(label='specializes') if - edge.source == cls_id}) - fields = {data.nodes[edge.target] for edge in data.find_edges(label='hasVariable') if edge.source == cls_id} - fields = [' '.join(remove_java_comments(field.properties['sourceText']).split()) for field in fields] - return ancestors, fields - - def get_script_descriptions(self, data: Graph, cls_data: list) -> dict[str,str]: - """Generate descriptions for methods.""" - return {data.nodes[met_id].properties['simpleName']: self.describe(data.nodes[met_id]) for met_id in cls_data} - - def get_structure_descriptions(self, data: Graph, pkg_data: dict) -> dict[str,str]: - """Generate descriptions for classes.""" - return { - f"{data.nodes[cls_id].properties['kind']} {data.nodes[cls_id].properties['qualifiedName']}": self.describe( - data.nodes[cls_id]) for cls_id, _ in pkg_data.items()} - - def get_component_descriptions(self, data: Graph, package_ids: list) -> dict[str,str]: - """Generate descriptions for packages.""" - return {data.nodes[pkg_id].properties['qualifiedName']: self.describe(data.nodes[pkg_id]) for pkg_id in - package_ids} - - -def describe_path(graph, path): - src_structure = graph.nodes[path[1].source] - src_method = graph.nodes[path[1].target] - tgt_method = graph.nodes[path[-2].source] - tgt_structure = graph.nodes[path[-2].target] - return f"{src_method.properties['kind'].capitalize()} `{src_method.properties['simpleName']}` ({src_method.properties['description']}) of {src_structure.properties['kind']} `{src_structure.properties['qualifiedName']}` invokes {tgt_method.properties['kind']} `{tgt_method.properties['simpleName']}` ({tgt_method.properties['description']}) of {tgt_structure.properties['kind']} `{tgt_structure.properties['qualifiedName']}`." 
- - -def find_first_valid_json(text: str) -> str: - """ - Finds the first valid JSON substring in the given text using a stack-based approach. - - It scans the text from left to right, and when it encounters a '{', it tracks the balanced - braces until a complete JSON object is formed. Once a candidate is found, it attempts to parse - it with json.loads(). If parsing succeeds, that candidate is returned immediately. - - Args: - text (str): The input string that may contain a JSON object. - - Returns: - str: The first valid JSON substring found, or an empty string if none is found. - """ - n = len(text) - for i in range(n): - if text[i] == '{': - stack = 0 - for j in range(i, n): - if text[j] == '{': - stack += 1 - elif text[j] == '}': - stack -= 1 - if stack == 0: - candidate = text[i:j + 1] - try: - json.loads(candidate) - return candidate - except json.JSONDecodeError: - # If this candidate isn't valid JSON, break and continue scanning. - break - return "" - - -def simplify_name(name): - if '(' in name and name.endswith(')'): - prefix, params = name.split('(', 2) - params = [param.split('.')[-1].split('$')[-1] for param in params.split(')', 1)[0].split(',')] - return prefix + '(' + ','.join(params) + ')' - else: - return name - - -def merge_node_properties(dict1: Dict[str, Node], dict2: Dict[str, Node], simplify_names=False): - for id2, obj2 in dict2.items(): - - matched_obj = None - if id2 in dict1 and set(dict1[id2].labels) & set(obj2.labels): - matched_obj = dict1[id2] - - elif simplify_names: - - dict1_name_remap = {simplify_name(key): key for key in dict1 if - {'Script', 'Operation', 'Constructor'} & set(dict1[key].labels)} - - if id2 in dict1_name_remap and set(dict1[dict1_name_remap[id2]].labels) & set(obj2.labels): - matched_obj = dict1[dict1_name_remap[id2]] - - if matched_obj: - # sys.stderr.write(f"{id2}->{matched_obj['id']}\n") - # Merge properties from obj2 into matched_obj - matched_obj.properties.update(obj2.properties) - else: - # 
sys.stderr.write(f"{id2}->None\n") - pass - - -# Note: dict1 is updated in place, no need to return anything - -class MergeFilter(Filter): - def __init__(self, config: Dict[str, Dict[str, Any]]): - super().__init__(config) - - with open(config['merge']['input'], 'r', encoding="utf-8") as file: - data = json.load(file) - self.node_dict_to_merge = data - - def process(self, data: Graph) -> Any: - merge_node_properties(data.nodes, self.node_dict_to_merge, True) - return data diff --git a/arcana/graph_utils.py b/arcana/graph_utils.py new file mode 100644 index 0000000..30b327e --- /dev/null +++ b/arcana/graph_utils.py @@ -0,0 +1,58 @@ +from collections import OrderedDict, defaultdict +from typing import Dict, List, Tuple + +from arcanalib.graph import Edge, triplets + + +def dependency_profile_category(inn: int, out: int) -> str: + if inn == 0 and out > 0: + return "outbound" + elif inn > 0 and out == 0: + return "inbound" + elif inn > 0 and out > 0: + return "transit" + return "hidden" + +def build_triplets(edge_list1, edge_list2) -> list: + methods = sorted(triplets(edge_list1, edge_list2)) + + return methods + +def build_hierarchy(method_triplets) -> dict: + classes = sorted({(pkg, clz) for pkg, clz, _ in method_triplets}) + packages = sorted({pkg for pkg, _ in classes}) + + hierarchy = { + pkg_id: {cls_id: [met_id for _, c, met_id in method_triplets if c == cls_id] for p, cls_id in classes if + p == pkg_id} for pkg_id in packages} + + return hierarchy + +def group_paths_by_endpoints(paths: List[List[Edge]]) -> Dict[Tuple[str, str], List[List[Edge]]]: + """ + Groups paths by the tuple of (first edge's source, last edge's target). + + Args: + paths (List[List[Edge]]): The list of paths to group. + + Returns: + Dict[Tuple[str, str], List[List[Edge]]]: A dictionary where keys are + tuples (first edge's source, last edge's target), and values are lists of paths. 
+ """ + grouped_paths = defaultdict(list) + for path in paths: + if path: # Ensure the path is not empty + start = path[0].source + end = path[-1].target + grouped_paths[(start, end)].append(path) + return grouped_paths + +def format_layers(layers: OrderedDict): + return "\n".join(f"- **{name}**: {desc}" for name, desc in layers.items()) + +def describe_path(graph, path): + src_structure = graph.nodes[path[1].source] + src_method = graph.nodes[path[1].target] + tgt_method = graph.nodes[path[-2].source] + tgt_structure = graph.nodes[path[-2].target] + return f"{src_method.properties['kind'].capitalize()} `{src_method.properties['simpleName']}` ({src_method.properties['description']}) of {src_structure.properties['kind']} `{src_structure.properties['qualifiedName']}` invokes {tgt_method.properties['kind']} `{tgt_method.properties['simpleName']}` ({tgt_method.properties['description']}) of {tgt_structure.properties['kind']} `{tgt_structure.properties['qualifiedName']}`." \ No newline at end of file diff --git a/arcana/llm_filter/client.py b/arcana/llm_filter/client.py new file mode 100644 index 0000000..7134ab5 --- /dev/null +++ b/arcana/llm_filter/client.py @@ -0,0 +1,66 @@ +from openai import OpenAI +import json, sys + +from arcana import templates +from arcana.utils import find_first_valid_json + +class LLMClient: + def __init__(self, llm_cfg, project_cfg): + self.client = OpenAI(api_key=llm_cfg['apikey'], base_url=llm_cfg.get('apibase')) + self.model = llm_cfg.get('model', 'gpt-4o-mini') + self.timeout = float(llm_cfg.get('timeout', 300)) + + def generate_json(self, prompt, tool): + """Generate a description using the OpenAI client.""" + try: + if tool: + response = self.client.chat.completions.create(model=self.model, messages=[ + {"role": "system", "content": "You are a software architecture analysis tool."}, + {"role": "user", "content": prompt}], tools=[templates.analyze_script_tool, + templates.analyze_structure_tool, + templates.analyze_component_tool], + 
tool_choice="required", temperature=0, seed=42, + timeout=self.timeout) + + tool_calls = response.choices[0].message.tool_calls + + if tool_calls: + args_str = tool_calls[0].function.arguments + description = json.loads(args_str) + else: + content = response.choices[0].message.content + json_content = find_first_valid_json(content) + if json_content: + description = json.loads(json_content) + else: + description = dict() + + else: + response = self.client.chat.completions.create(model=self.model, + response_format={"type": "json_object"}, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096, temperature=0, seed=42, + timeout=self.timeout) + + content = response.choices[0].message.content + description = json.loads(content) + except Exception as e: + sys.stderr.write(f"Generate JSON description error: {e}") + description = {} + + if 'description' not in description: + description['description'] = "(no description)" + return description + + def generate_text(self, prompt): + try: + response = self.client.chat.completions.create(model=self.model, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096, temperature=0, seed=42, + timeout=float(self.config['llm'].get('timeout', 300))) + description = response.choices[0].message.content + except Exception as e: + sys.stderr.write(f"Generate text description error: {e}") + description = "(no description)" + + return description \ No newline at end of file diff --git a/arcana/llm_filter/filter.py b/arcana/llm_filter/filter.py new file mode 100644 index 0000000..c8aca13 --- /dev/null +++ b/arcana/llm_filter/filter.py @@ -0,0 +1,274 @@ +import time +from collections import OrderedDict +from itertools import combinations +from typing import Any, Dict, List, TextIO + +from tqdm.auto import tqdm + +from arcana import templates +from arcana.filters import check_stop, layers_to_ordereddict +from arcana.graph_utils import (build_hierarchy, build_triplets, describe_path, + group_paths_by_endpoints) +from 
arcana.llm_filter.client import LLMClient +from arcana.llm_filter.processors import ComponentProcessor, InteractionProcessor, ScriptProcessor, StructureProcessor +from arcana.llm_filter.prompt import PromptBuilder +from arcana.utils import (lower_first, remove_java_comments, write_jsonl) +from arcanalib.graph import Edge, Graph, Node +from arcanalib.pipefilter import Filter + +def default_layers(): + return OrderedDict([ + ('Presentation Layer', "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), + ('Service Layer', "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), + ('Domain Layer', "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), + ('Data Source Layer', "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), + ]) + +def default_role_stereotypes(): + # **Information Holder** is responsible for knowing facts and providing information to other objects. POJOs, Java Beans, and enumerations are usually information holders. \ + # **Service Provider** is responsible for handling requests and performing specific services. It usually implements a specific interface with a small number of methods. Concrete strategies are service providers. \ + # **Structurer** is responsible for managing relationships and constraints among related things. It is usually a collection or mapping of some sort, i.e., a subclass of a List, Set, Map, etc. \ + # **Controller** is responsible for making decisions, directing the work of others, and handling important events. It directs the flow of the application or business process. 
\ + # **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects. It delegates requests to other objects. Very abstract classes and interfaces might be coordinators as they delegate the work to subclasses. \ + # **User Interfacer** is responsible for transmitting user requests for action or display/render information that can be updated. It handles interactions with users. \ + # **External Interfacer** is responsible for loading and storing information from/to external services, including database systems, web services, filesystems, hardware, etc. \ + # **Internal Interfacer** is responsible for interfacing between two subsystems. It may bundle together information of requests from a group of objects to be sent to another object. Abstract adapters, bridges, facades, and proxies are internal interfacers." + return OrderedDict([ + ("Information Holder", "Knows facts and provides information (POJOs, beans, enums)."), + ("Service Provider", "Handles requests, performs services; implements a specific interface with a small number of methods (strategies, handlers)."), + ("Structurer", "Manages relationships among things (collections, maps)."), + ("Controller", "Makes decisions, directs flow of the program."), + ("Coordinator", "Delegates work across workers."), + ("User Interfacer", "Handles user input/output."), + ("External Interfacer", "Loads/stores from external services."), + ("Internal Interfacer", "Bridges subsystems (adapters, bridges, facades, proxies)."), + ]) + +class LLMFilter(Filter): + def __init__(self, config: Dict[str, Dict[str, Any]]): + super().__init__(config) + self.client = LLMClient(config['llm'], config['project']) + + layer_cfg = config.get('layers') + self.layers = layers_to_ordereddict(layer_cfg) if layer_cfg else default_layers() + + stereo_cfg = config.get('stereotypes') + self.role_stereotypes = OrderedDict(stereo_cfg) if stereo_cfg else default_role_stereotypes() + 
+ self.prompt_builder = PromptBuilder(config['project'], self.layers, self.role_stereotypes) + self.script_processor = ScriptProcessor(self.client, self.prompt_builder) + self.structure_processor = StructureProcessor(self.client, self.prompt_builder) + self.component_processor = ComponentProcessor(self.client, self.prompt_builder) + self.interaction_processor = InteractionProcessor(self.client, self.prompt_builder) + + def process(self, graph): + # 1. initialize layers and write baseline nodes/edges + self.prompt_builder.initialize_layers(graph) + + # 2. process methods + self.script_processor.process_all(graph) + + # 3. process classes + self.structure_processor.process_all(graph) + + # 4. process packages + self.component_processor.process_all(graph) + + # 5. process interactions + self.interaction_processor.process_all(graph) + + return graph + + def process_old(self, data: Graph) -> Graph: + """ + Process the data using a language model to generate descriptions. + + Args: + data (Graph): The input data. + + Returns: + Graph: The processed data with generated descriptions. 
+ """ + current_time_str = time.strftime("%Y%m%d-%H%M%S") + with open(f'arcana-{current_time_str}.jsonl', 'a', encoding="utf-8") as jsonl_file: + + with open(f'arcana-{current_time_str}.log', 'a', encoding="utf-8") as log_file: + try: + self.process_hierarchy(data, jsonl_file, log_file) + except Exception as e: + pass + return data + + def process_hierarchy(self, graph: Graph, jsonl_file, log_file): + """Process each package, class, and method in the hierarchy.""" + + st_contains_st = graph.find_edges(label='contains', source_label='Structure', target_label='Structure') + ct_contains_st = graph.find_edges(label='contains', target_label='Structure', where_source=lambda + node: 'Container' in node.labels and 'Structure' not in node.labels) + new_ct_sources = {edge.target: graph.find_source(graph.find_edges(label='contains'), graph.nodes[edge.target], + lambda node: 'Structure' not in node.labels, + graph.nodes[edge.source]).id for edge in st_contains_st} + ct_contains_st.extend( + [Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) + + trips = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) + met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in trips} + # print(met_to_cls_pkg) + # print('######################################################################') + sorted_method_ids, method_deps = graph.toposorted_nodes(graph.find_edges(label='invokes')) + # print(sorted_method_ids) + + counter = 0 + + for met_id in tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): + cls_id, pkg_id = met_to_cls_pkg[met_id] + method = graph.nodes[met_id] + clasz = graph.nodes[cls_id] + + self.process_script(graph, jsonl_file, log_file, method, clasz, method_deps) + + check_stop() + + counter += 1 + if counter == 10: + log_file.flush() + jsonl_file.flush() + counter %= 10 + + hierarchy = build_hierarchy(trips) + sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( + 
graph.find_edges(label='contains', where_source=lambda node: 'Structure' not in node.labels, + where_target=lambda node: 'Structure' not in node.labels)) + + for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): + pkg_data = hierarchy.get(pkg_id, dict()) + package = graph.nodes[pkg_id] + + for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=2, leave=False): + clasz = graph.nodes[cls_id] + + self.process_structure(graph, jsonl_file, log_file, clasz, cls_data) + + check_stop() + + self.process_component(graph, jsonl_file, log_file, package, pkg_data, pkg_deps) + + log_file.flush() + jsonl_file.flush() + + check_stop() + + paths = graph.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") + path_groups = group_paths_by_endpoints(paths) + + pkg_pairs = list(combinations(sorted_pkg_ids, 2)) + for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): + + check_stop() + + pkg1 = graph.nodes[pkg1_id] + pkg2 = graph.nodes[pkg2_id] + if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): + if path_groups[(pkg1_id, pkg2_id)]: + self.process_interactions(graph, pkg1, pkg2, path_groups[(pkg1_id, pkg2_id)], hierarchy, jsonl_file, + log_file) + if path_groups[(pkg2_id, pkg1_id)]: + self.process_interactions(graph, pkg2, pkg1, path_groups[(pkg2_id, pkg1_id)], hierarchy, jsonl_file, + log_file) + + def process_component(self, graph: Graph, jsonl_file, log_file, component: Node, component_contents: dict, + component_deps: dict): + """Process a single package and generate its description.""" + + # if 'description' not in component.properties or not component.properties['description'] or component.properties['description'] == "(no description)": + structure_descriptions = self.get_structure_descriptions(graph, component_contents) + subcomponent_descriptions = self.get_component_descriptions(graph, component_deps[component.id]) + component_kind = 
component.properties.get('kind', "component") + + prompt = f"Describe the following {component_kind} using the AnalyzeComponent tool.\n\n" + component_parameters = OrderedDict() + component_parameters["Project Name"] = self.project_name + component_parameters["Project Description"] = self.project_desc + component_parameters["Component Type"] = component_kind + component_parameters["Component Name"] = component.properties['qualifiedName'] + component_parameters["Enclosed Sub-components"] = subcomponent_descriptions + component_parameters["Enclosed Classes"] = structure_descriptions + component_parameters["Possible Architectural Layers"] = dict(self.layers) + + prompt = self.compose_prompt(prompt, component_parameters) + + log_file.write(prompt) + log_file.write('\n\n======\n\n') + + description = generate_json(prompt, "AnalyzeComponent") + self.update_package_properties(graph, description, component) + + layer_id = None + if component.has_property("layer") and component.property("layer") in [name for name, _ in self.layers]: + layer_id = f"layer:{component.property('layer')}" + layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) + if layer_node: + graph.add_edge(component.id, layer_node.id, "implements", weight=1) + + write_jsonl(jsonl_file, + {'data': {'id': component.id, 'labels': list(component.labels), 'properties': description}}) + + def process_interactions(self, graph: Graph, c1: Node, c2: Node, path_groups: List[List[Edge]], hierarchy, + jsonl_file: TextIO, log_file: TextIO): + c1_name = c1.properties["qualifiedName"] + c2_name = c2.properties["qualifiedName"] + c1_desc = c1.properties["description"] + c2_desc = c2.properties["description"] + + c1_contents = hierarchy.get(c1.id, dict()) + c2_contents = hierarchy.get(c2.id, dict()) + + c1_structure_info = "\n".join( + f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" + for c_id, _ in c1_contents.items()) + c2_structure_info = 
"\n".join( + f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" + for c_id, _ in c2_contents.items()) + + dep_info = f" - Dependencies from `{c1_name}` to `{c2_name}`:\n" + "\n".join( + f" - {describe_path(graph, path)}" for path in path_groups) if path_groups else "" + + prompt = templates.interaction_analysis.format(project_name=self.project_name, project_desc=self.project_desc, + pkg1_name=c1_name, pkg2_name=c2_name, pkg1_desc=c1_desc, + pkg2_desc=c2_desc, cls1_info=c1_structure_info, + cls2_info=c2_structure_info, dep_info=dep_info) + + log_file.write(prompt) + log_file.write('\n\n======\n\n') + + description = generate_text(prompt) + pkg1_edge = Edge(source=c1.id, target=c2.id, label="dependsOn", description=description) if dep_info else None + + if pkg1_edge: + if "dependsOn" not in graph.edges: + graph.edges["dependsOn"] = [] + graph.edges["dependsOn"].append(pkg1_edge) + + write_jsonl(jsonl_file, pkg1_edge.to_dict()) + + @staticmethod + def update_package_properties(data: Graph, description: dict, package: Node): + """Update package properties with the generated description.""" + for key in description: + if not key.endswith('Reason'): + data.nodes[package.id].properties[lower_first(key)] = description[key] + + def get_script_descriptions(self, data: Graph, cls_data: list) -> dict[str,str]: + """Generate descriptions for methods.""" + return {data.nodes[met_id].properties['simpleName']: self.describe(data.nodes[met_id]) for met_id in cls_data} + + def get_structure_descriptions(self, data: Graph, pkg_data: dict) -> dict[str,str]: + """Generate descriptions for classes.""" + return { + f"{data.nodes[cls_id].properties['kind']} {data.nodes[cls_id].properties['qualifiedName']}": self.describe( + data.nodes[cls_id]) for cls_id, _ in pkg_data.items()} + + def get_component_descriptions(self, data: Graph, package_ids: list) -> dict[str,str]: + """Generate descriptions for packages.""" + return 
{data.nodes[pkg_id].properties['qualifiedName']: self.describe(data.nodes[pkg_id]) for pkg_id in + package_ids} \ No newline at end of file diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py new file mode 100644 index 0000000..0831a5f --- /dev/null +++ b/arcana/llm_filter/processors.py @@ -0,0 +1,191 @@ +from abc import ABC, abstractmethod +from collections import OrderedDict +from collections.abc import Iterable +import logging + +from tqdm.auto import tqdm + +from arcana.checkpoint import writer +from arcana.filters import check_stop +from arcana.llm_filter.client import LLMClient +from arcana.llm_filter.prompt import PromptBuilder, describe +from arcana.utils import lower_first, remove_java_comments +from arcanalib.graph import Graph, Node + +logger = logging.getLogger(__name__) + +class Processor(ABC): + def __init__(self, client, prompt_builder): + self.client: LLMClient = client + self.prompt: PromptBuilder = prompt_builder + + @abstractmethod + def process_all(self, graph): + raise NotImplementedError + +class ScriptProcessor(Processor): + + def process_all(self, graph: Graph): + sorted_method_ids, method_deps = Graph.toposorted_nodes(graph.find_edges(label='invokes'), graph.find_nodes('Operation')) + counter = 0 + logger.debug(sorted_method_ids) + logger.debug(method_deps) + + for met_id in tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): + method: Node = graph.nodes[met_id] + clasz: Node = [n for n in method.sources('encapsulates') if n.has_label('Type')][0] + self.process_one(graph, method, clasz, method_deps) + + check_stop() + + counter += 1 + if counter == 10: + writer().flush() + counter %= 10 + + def process_one(self, graph: Graph, method: Node, clasz: Node, method_deps): + if 'description' not in method.properties or not method.properties['description'] or method.properties[ + 'description'] == "(no description)": + script_name = method.properties['simpleName'] + script_src = 
remove_java_comments(method.properties['sourceText']) + script_kind = method.properties.get('kind', 'function') + + structure_name = clasz.properties['qualifiedName'] + structure_kind = clasz.properties['kind'] + structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind + + prompt = f"Describe the following {script_kind} by using the AnalyzeScript tool.\n\n" + script_parameters = OrderedDict() + script_parameters["Project Name"] = self.prompt.project_name + script_parameters["Project Description"] = self.prompt.project_desc + script_parameters[f"{script_kind.title()} Declaration"] = f"The {script_kind} {script_name} is declared within the {structure_kind} {structure_name}." + script_parameters[f"{script_kind.title()} Source Code"] = script_src + script_parameters["Outgoing Dependencies (Invokes)"] = {graph.nodes[node_id].properties[ + 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" + for node_id in method_deps[method.id]} + script_parameters["Incoming Dependencies (Invoked By)"] = [m.properties['qualifiedName'] for m in method.sources('invokes')] + script_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) + + prompt = self.prompt.compose(prompt, **script_parameters) + + logger.debug(prompt) + + description = self.client.generate_json(prompt, "AnalyzeScript") + + layer = description.pop('layer', None) + if layer: + node_id = f"layer:{layer}" + target = graph.find_node(label="Category", where=lambda n: n.id == node_id) + if target: + impl_edge = graph.add_edge(clasz.id, target.id, "implements", weight=1, reason=description.get('layerReason')) + writer().write(impl_edge.to_dict()) + + self.update_method_properties(graph, description, method) + + writer().write({'data': {'id': method.id, 'labels': method.labels, 'properties': description}}) + + @staticmethod + def update_method_properties(data: Graph, description: 
dict, method: Node): + """Update method properties with the generated description.""" + + for key, value in description.items(): + if key.endswith('Reason'): + continue + key_lower = lower_first(key) + if key_lower == 'parameters' and isinstance(value, Iterable): + param_nodes = [data.nodes[edge.target] for edge in data.find_edges(label='hasParameter') if + edge.source == method.id] + for param in value: + if isinstance(param, dict): + matching_params = [node for node in param_nodes if + node.properties['simpleName'] == param.get('name')] + if matching_params: + param_node_id = matching_params[0].id + if param_node_id in data.nodes: + data.nodes[param_node_id].properties['description'] = param.get('description') + # elif key_lower == 'returns': + # method.properties['returns'] = value.get('description', None) if value and hasattr(value, 'get') else None + else: + data.nodes[method.id].properties[key_lower] = value + + +class StructureProcessor(Processor): + + def process_all(self, graph: Graph): + sorted_class_ids, class_deps = Graph.toposorted_nodes(graph.find_edges(label='specializes'), graph.find_nodes('Type')) + counter = 0 + + for cls_id in tqdm(sorted_class_ids, desc='Processing classes', position=1, leave=False): + clasz: Node = graph.nodes[cls_id] + package: Node = [n for n in clasz.sources('encloses') if n.has_label('Scope')][0] + self.process_one(graph, clasz, package, class_deps) + + check_stop() + + counter += 1 + if counter == 10: + writer().flush() + counter %= 10 + + def process_one(self, graph: Graph, clasz: Node, package: Node, class_deps): + _, variables = StructureProcessor.get_structure_relations(graph, clasz.id) + script_descriptions = { method.properties['qualifiedName']: describe(method) for method in clasz.targets('encapsulates') if method.has_label('Operation') } + + structure_name = clasz.properties['qualifiedName'] + structure_kind = clasz.properties['kind'] + structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract 
class' if structure_kind == 'abstract' else structure_kind + + prompt = f"Describe the following {structure_kind} using the AnalyzeStructure tool.\n\n" + structure_parameters = OrderedDict() + structure_parameters["Project Name"] = self.prompt.project_name + structure_parameters["Project Description"] = self.prompt.project_desc + structure_parameters[f"{structure_kind.title()} Name"] = structure_name + structure_parameters[f"{structure_kind.title()} Inhertis From"] = {graph.nodes[node_id].properties[ + 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'docComment')}" + for node_id in class_deps[clasz.id]} + structure_parameters[f"Enclosed Variables/Fields"] = variables + structure_parameters[f"Enclosed Functions/Methods"] = script_descriptions + structure_parameters['Possible Role Stereotypes'] = dict(self.prompt.role_stereotypes) + + prompt = self.prompt.compose(prompt, **structure_parameters) + + logger.debug(prompt) + + description = self.client.generate_json(prompt, "AnalyzeStructure") + + st = description.pop('roleStereotype', None) + if st: + node_id = f"rs:{st}" + target = graph.find_node(label="Category", where=lambda n: n.id == node_id) + if target: + impl_edge = graph.add_edge(clasz.id, target.id, "implements", weight=1, reason=description.get('roleStereotypeReason')) + writer().write(impl_edge.to_dict()) + + for k, v in description.items(): + if not k.endswith('Reason'): + graph.nodes[clasz.id].properties[lower_first(k)] = v + + writer().write({'data': {'id': clasz.id, 'labels': list(clasz.labels), 'properties': description}}) + + @staticmethod + def get_structure_relations(data: Graph, cls_id: str) -> tuple: + """Retrieve class ancestors and fields.""" + ancestors = list( + {data.nodes[edge.target] for edge in data.find_edges(label='specializes') if + edge.source == cls_id}) + fields = {data.nodes[edge.target] for edge in data.find_edges(label='encapsulates') if edge.source == cls_id} + fields = [' 
'.join(remove_java_comments(field.properties['sourceText']).split()) for field in fields if field.has_label('Variable')] + return ancestors, fields + + +class ComponentProcessor(Processor): + + def process_all(self, graph): + # iterate and describe each package + pass + +class InteractionProcessor(Processor): + + def process_all(self, graph): + # compute and describe interactions between packages + pass diff --git a/arcana/llm_filter/prompt.py b/arcana/llm_filter/prompt.py new file mode 100644 index 0000000..62ae5a4 --- /dev/null +++ b/arcana/llm_filter/prompt.py @@ -0,0 +1,101 @@ +from collections import OrderedDict +from arcana.checkpoint import writer +from arcana.utils import remove_author, sentence +from arcanalib.graph import Graph, Node + + +class PromptBuilder: + def __init__(self, project_cfg, layers_cfg=None, stereotypes_cfg=None): + self.project_name = project_cfg['name'] + self.project_desc = project_cfg['desc'] + self.layers = layers_cfg or OrderedDict() + self.role_stereotypes = stereotypes_cfg or OrderedDict() + # self.layers_str = format_layers(layers_cfg) + + def initialize_layers(self, graph: Graph): + layer_dimension = graph.add_node( + f"Architectural Layer", + "Dimension", + kind="categorical-ordered", + simpleName="Architectural Layer", + qualifiedName="Architectural Layer") + writer().write(layer_dimension.to_dict()) + + for i, (name, desc) in enumerate(self.layers.items()): + cat = graph.add_node( + f"layer:{name}", "Category", + kind="architectural layer", + simpleName=name, + qualifiedName=name, + description=desc, + order=i + ) + writer().write(cat.to_dict()) + e = graph.add_edge(cat.id, layer_dimension.id, "composes", weight=1) + writer().write(e.to_dict()) + + t_layers = list(self.layers.items()) + for i in range(len(t_layers) - 1): + src = t_layers[i][0] + tgt = t_layers[i + 1][0] + e = graph.add_edge(f"layer:{src}", f"layer:{tgt}", "succeeds", weight=1) + writer().write(e.to_dict()) + + + stereo_dimension = graph.add_node( + "Role 
Stereotype", + "Dimension", + kind="categorical-nominal", + simpleName="Role Stereotype", + qualifiedName="Role Stereotype" + ) + writer().write(stereo_dimension.to_dict()) + + for i, (name, desc) in enumerate(self.role_stereotypes.items()): + cat = graph.add_node( + f"rs:{name}", "Category", + kind="role stereotype", + simpleName=name, + qualifiedName=name, + description=desc, + order=i + ) + writer().write(cat.to_dict()) + e = graph.add_edge(cat.id, stereo_dimension.id, "composes", weight=1) + writer().write(e.to_dict()) + + def compose(self, base_prompt, **parameters): + + prompt = base_prompt + for k, v in parameters.items(): + if isinstance(v, dict) and len(v): + prompt += f"## {k}\n\n" + for k1, v1 in v.items(): + if v1: + prompt += f"* {k1}: {str(v1)}\n" + prompt += "\n\n" + elif isinstance(v, list) and len(v): + prompt += f"## {k}\n\n" + for v1 in v: + if v1: + prompt += f"* {str(v1)}\n" + prompt += "\n\n" + elif v: + prompt += f"## {k}\n\n{str(v)}\n\n" + return prompt.strip() + + +def describe(node: Node, *keys) -> str: + """Generate a description for a given node.""" + sr, sn = '\r', '\n' + if not keys: + keys = ['description', 'docComment', 'returns', 'reason', 'howToUse', 'howItWorks', 'assertions', + 'roleStereotype', 'layer'] + + lines = {key: f"**{key}**: {sentence(str(node.properties[key]).replace(sr, '').replace(sn, ' '))}" for key in + keys if key in node.properties and key != 'docComment' and node.properties[key]} + if 'docComment' in keys and 'docComment' in node.properties and node.properties['docComment']: + lines[ + 'docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr, '').replace(sn, ' '))} " + + return ' '.join(lines[key] for key in keys if key in lines).strip() \ No newline at end of file diff --git a/arcana/merge_filter.py b/arcana/merge_filter.py new file mode 100644 index 0000000..03f9634 --- /dev/null +++ b/arcana/merge_filter.py @@ -0,0 +1,19 @@ +import json +from typing import Any, 
Dict + +from arcana.utils import merge_node_properties +from arcanalib.graph import Graph +from arcanalib.pipefilter import Filter + + +class MergeFilter(Filter): + def __init__(self, config: Dict[str, Dict[str, Any]]): + super().__init__(config) + + with open(config['merge']['input'], 'r', encoding="utf-8") as file: + data = json.load(file) + self.node_dict_to_merge = data + + def process(self, data: Graph) -> Any: + merge_node_properties(data.nodes, self.node_dict_to_merge, True) + return data \ No newline at end of file diff --git a/arcana/metrics.py b/arcana/metrics.py new file mode 100644 index 0000000..e70baf2 --- /dev/null +++ b/arcana/metrics.py @@ -0,0 +1,67 @@ +from collections import Counter, OrderedDict, defaultdict + +from arcana.graph_utils import dependency_profile_category +from arcanalib.graph import Graph, invert, lift +from arcanalib.pipefilter import Filter + + +class MetricsFilter(Filter): + def process(self, data: Graph) -> Graph: + """ + Process the data to generate dependency profiles and categorize nodes. + + Args: + data (Graph): The input data. + + Returns: + Graph: The processed data with dependency profiles. + """ + # 1. Create a Dependency Profile dimension with its 4 categories + dim = data.add_node( + "Dependency Profile", "Dimension", + kind="categorical", + simpleName="Dependency Profile", + qualifiedName="Dependency Profile" + ) + categories = OrderedDict([ + ("outbound", "Calls leaving the module"), + ("inbound", "Calls entering the module"), + ("transit", "Both inbound and outbound"), + ("hidden", "Neither inbound nor outbound"), + ]) + cat_ids = {} + for idx, (key, desc) in enumerate(categories.items()): + cat = data.add_node( + f"dp:{key}", "Category", + kind="dependency profile", + simpleName=key, + qualifiedName=key, + description=desc, + order=idx + ) + cat_ids[key] = cat.id + data.add_edge(cat.id, dim.id, "composes", weight=1) + + # 2. 
Compute raw in/out counts + parents = {e.source: e.target for e in invert(data.find_edges(label='encloses'))} + dependency_profiles = defaultdict(list) + + calls = data.edges.get('calls', + lift(data.find_edges(label='encapsulates'), data.find_edges(label='invokes'), 'calls')) + + for edge in calls: + source_id, target_id = edge.source, edge.target + if parents.get(source_id) != parents.get(target_id): + dependency_profiles[source_id].append('out') + dependency_profiles[target_id].append('in') + + dependency_profiles = {node_id: Counter(prof) for node_id, prof in dependency_profiles.items()} + + # 3. Attach classification edges instead of setting a string property + for node_id, profile in dependency_profiles.items(): + cat_key = dependency_profile_category(profile['in'], profile['out']) + target_id = cat_ids.get(cat_key) + if target_id: + data.add_edge(node_id, target_id, "implements", weight=1) + + return data \ No newline at end of file diff --git a/arcana/seeder.py b/arcana/seeder.py new file mode 100644 index 0000000..340cffd --- /dev/null +++ b/arcana/seeder.py @@ -0,0 +1,29 @@ +import json +import subprocess +import sys + +from arcanalib.graph import Graph +from arcanalib.pipefilter import Seeder + + +class CLISeeder(Seeder): + + def __init__(self, command) -> None: + """ + Initialize the seeder with a command. + + :param command: The command to be executed. + """ + self.command = command + + def generate(self) -> Graph: + """ + Execute the command, parse the JSON output into a dict, and pass the dict to the Graph constructor. + + :return: The generated Graph object. 
+ """ + process = subprocess.run(self.command, capture_output=True, text=True, encoding="utf-8", check=True) + if process.stderr: + sys.stderr.write(process.stderr) + output_dict = json.loads(process.stdout) + return Graph(output_dict) \ No newline at end of file diff --git a/arcana/templates.py b/arcana/templates.py index bf4fbcd..83ef9f8 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -1,204 +1,199 @@ +script_description = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ScriptDescription", + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "One-sentence description of the method/constructor/function functionality." + }, + "parameters": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Parameter name." + }, + "type": { + "type": "string", + "description": "Parameter type." + }, + "description": { + "type": "string", + "description": "Brief description of the parameter." + } + }, + "required": [ + "name", + "description" + ] + }, + "description": "List of script parameters. Empty if none." + }, + "returns": { + "type": "string", + "description": "One-sentence description of the returned object or value. For constructors, consider the newly created instance as the return." + }, + "howToUse": { + "type": "string", + "description": "Usage instructions in less than three sentences." + }, + "howItWorks": { + "type": "string", + "description": "Implementation details in less than five sentences." + }, + "preConditions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of pre-conditions for the script." + }, + "postConditions": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of post-conditions for the script." 
+ }, + "stereotype": { + "type": "string", + "enum": [ + "Accessor", + "Mutator", + "Creational", + "Collaborational", + "Other" + ], + "description": "Design stereotype of the script." + }, + "stereotypeReason": { + "type": "string", + "description": "One-sentence explanation for the chosen stereotype." + }, + "layer": { + "type": "string", + "description": "Architectural layer classification selected from the provided options." + }, + "layerReason": { + "type": "string", + "description": "Explanation why the script fits the chosen architectural layer but not others." + } + }, + "required": [ + "description", + "howItWorks", + "howToUse", + "layer", + "layerReason", + "parameters", + "postConditions", + "preConditions", + "returns", + "stereotype", + "stereotypeReason" + ], + "additionalProperties": False +} + analyze_script_tool = { - "type": "function", - "function": { - "name": "AnalyzeScript", - "description": "Analyzes a program script given its source code and context. Returns an explanation covering functionality, parameters, return value, design rationale, usage, implementation details, assertions, stereotype, and architectural layer classification.", - "parameters": { - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "One-sentence description of the script functionality." - }, - "parameters": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "Parameter name." - }, - "type": { - "type": "string", - "description": "Parameter type." - }, - "description": { - "type": "string", - "description": "Brief description of the parameter." - } - }, - "required": [ - "name", - "description" - ] - }, - "description": "List of script parameters. Empty if none." - }, - "returns": { - "type": "string", - "description": "One-sentence description of the returned object or value. For constructors, consider the newly created instance as the return." 
- }, - "howToUse": { - "type": "string", - "description": "Usage instructions in less than three sentences." - }, - "howItWorks": { - "type": "string", - "description": "Implementation details in less than five sentences." - }, - "preConditions": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of pre-conditions for the script." - }, - "postConditions": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of post-conditions for the script." - }, - "stereotype": { - "type": "string", - "enum": [ - "Accessor", - "Mutator", - "Creational", - "Collaborational", - "Other" - ], - "description": "Design stereotype of the script." - }, - "stereotypeReason": { - "type": "string", - "description": "One-sentence explanation for the chosen stereotype." - }, - "layer": { - "type": "string", - "description": "Architectural layer classification selected from the provided options." - }, - "layerReason": { - "type": "string", - "description": "Explanation why the script fits the chosen architectural layer but not others." - } - }, - "required": [ - "description", - "parameters", - "returns", - "howToUse", - "howItWorks", - "preConditions", - "postConditions", - "stereotype", - "stereotypeReason", - "layer", - "layerReason" - ] - } - } + "type": "function", + "function": { + "name": "AnalyzeScript", + "description": "Analyzes a program method/constructor/function given its source code and context. Returns an explanation covering functionality, parameters, return value, design rationale, usage, implementation details, assertions, stereotype, and architectural layer classification.", + "parameters": script_description + } +} + +structure_description = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "StructureDescription", + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Up to three sentences describing the key responsibilities of the class/struct/type." 
+ }, + "keywords": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of important keywords related to the key responsibilities of the class/struct/type." + }, + "roleStereotype": { + "type": "string", + "description": "Role stereotype of the class/struct/type; options are supplied at runtime." + }, + "roleStereotypeReason": { + "type": "string", + "description": "One-sentence explanation for the chosen role stereotype." + } + }, + "required": [ + "description", + "keywords", + "roleStereotype", + "roleStereotypeReason" + ] } analyze_structure_tool = { - "type": "function", - "function": { - "name": "AnalyzeStructure", - "description": "Analyzes a software structure based on its inheritance, fields, and methods. Returns an explanation covering the key responsibilities of the structure, relevant keywords, role stereotype, and rationale for the chosen stereotype.", - "parameters": { - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Up to three sentences describing the key responsibilities of the structure." - }, - "keywords": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of important keywords related to the key responsibilities of the structure." - }, - "roleStereotype": { - "type": "string", - "enum": [ - "Information Holder", - "Service Provider", - "Structurer", - "Controller", - "Coordinator", - "User Interfacer", - "External Interfacer", - "Internal Interfacer" - ], - "description": "The role stereotype of the structure: \ - **Information Holder** is responsible for knowing facts and providing information to other objects. POJOs, Java Beans, and enumerations are usually information holders. \ - **Service Provider** is responsible for handling requests and performing specific services. It usually implements a specific interface with a small number of methods. Concrete strategies are service providers. 
\ - **Structurer** is responsible for managing relationships and constraints among related things. It is usually a collection or mapping of some sort, i.e., a subclass of a List, Set, Map, etc. \ - **Controller** is responsible for making decisions, directing the work of others, and handling important events. It directs the flow of the application or business process. \ - **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects. It delegates requests to other objects. Very abstract classes and interfaces might be coordinators as they delegate the work to subclasses. \ - **User Interfacer** is responsible for transmitting user requests for action or display/render information that can be updated. It handles interactions with users. \ - **External Interfacer** is responsible for loading and storing information from/to external services, including database systems, web services, filesystems, hardware, etc. \ - **Internal Interfacer** is responsible for interfacing between two subsystems. It may bundle together information of requests from a group of objects to be sent to another object. Abstract adapters, bridges, facades, and proxies are internal interfacers." - }, - "roleStereotypeReason": { - "type": "string", - "description": "One-sentence explanation for the chosen role stereotype." - } - }, - "required": [ - "description", - "keywords", - "roleStereotype", - "roleStereotypeReason" - ] - } - } + "type": "function", + "function": { + "name": "AnalyzeStructure", + "description": "Analyzes a software class/struct/type based on its inheritance, fields, and methods. 
Returns an explanation covering the key responsibilities of the structure, relevant keywords, role stereotype, and rationale for the chosen stereotype.", + "parameters": structure_description + } +} + +component_description = { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "ComponentDescription", + "type": "object", + "properties": { + "description": { + "type": "string", + "description": "Describe the functionality of the component/package in up to five sentences." + }, + "title": { + "type": "string", + "description": "A noun phrase that describes the component/package." + }, + "keywords": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of important keywords related to the core functionalities of the component/package." + }, + "layer": { + "type": "string", + "description": "Architectural layer classification selected from the provided options." + }, + "layerReason": { + "type": "string", + "description": "Explanation why the component/package fits the chosen layer but not others." + } + }, + "required": [ + "description", + "title", + "keywords", + "layer", + "layerReason" + ] } analyze_component_tool = { - "type": "function", - "function": { - "name": "AnalyzeComponent", - "description": "Analyzes a software component by examining its contents. Returns an explanation including a description of component functionality, a descriptive title, a list of keywords, the selected architectural layer, and the rationale for that layer.", - "parameters": { - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Describe the functionality of the component in up to five sentences." - }, - "title": { - "type": "string", - "description": "A noun phrase that describes the component." - }, - "keywords": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of important keywords related to the core functionalities of the component." 
- }, - "layer": { - "type": "string", - "description": "Architectural layer classification selected from the provided options." - }, - "layerReason": { - "type": "string", - "description": "Explanation why the component fits the chosen layer but not others." - } - }, - "required": [ - "description", - "title", - "keywords", - "layer", - "layerReason" - ] - } - } + "type": "function", + "function": { + "name": "AnalyzeComponent", + "description": "Analyzes a software component/package by examining its contents. Returns an explanation including a description of component/package responsibility, a descriptive title, a list of keywords, the selected architectural layer, and the rationale for that layer.", + "parameters": component_description + } } interaction_analysis = '''## Input: @@ -206,13 +201,13 @@ Consider a project {project_name}, {project_desc}. - Package Information: - - `{pkg1_name}`: {pkg1_desc} - - `{pkg2_name}`: {pkg2_desc} + - `{pkg1_name}`: {pkg1_desc} + - `{pkg2_name}`: {pkg2_desc} - Class Information: - - `{pkg1_name}`: + - `{pkg1_name}`: {cls1_info} - - `{pkg2_name}`: + - `{pkg2_name}`: {cls2_info} - Inter-Package Dependencies: @@ -224,7 +219,7 @@ - The purpose and nature of their dependency in terms of design. - An abstract, high-level description of the relationship without referencing specific classes or methods. - + ## Output: Provide a cohesive explanation of the interaction in one to two sentences. 
Keep the response plain text.''' diff --git a/arcana/utils.py b/arcana/utils.py new file mode 100644 index 0000000..2021b63 --- /dev/null +++ b/arcana/utils.py @@ -0,0 +1,126 @@ +import json +import re +from typing import Any, Dict, TextIO + +from arcana.custom_encoder import CustomJSONEncoder +from arcanalib.graph import Node + +def remove_author(s: str) -> str: + return "\n".join(line.strip() for line in s.splitlines() if '@author' not in line) + +_JAVA_COMMENT_RE = re.compile(r"(//.*?$)|(/\*.*?\*/)", flags=re.MULTILINE | re.DOTALL) + +def remove_java_comments(java_source: str) -> str: + return _JAVA_COMMENT_RE.sub("", java_source).strip() + +def sentence(s: str) -> str: + """ + Capitalize the first letter of a string and ensure it ends with a period. + + Args: + s (str): The input string. + + Returns: + str: The formatted string. + """ + if not s: + return "" + t = s.strip() + if not t: + return "" + if t[-1] in '.?!…~–—': + return f'{t[0].upper()}{t[1:]}' + return f'{t[0].upper()}{t[1:]}.' + +def lower_first(s: str) -> str: + """ + Lowercase the first character of a string. + + Args: + s (str): The input string. + + Returns: + str: The string with the first character lowercased. + """ + return s[0].lower() + s[1:] if s else s + +def prettify_json(obj: dict) -> str: + """ + Convert a dictionary to a pretty-printed JSON string. + + Args: + obj (dict): The input dictionary. + + Returns: + str: The pretty-printed JSON string. + """ + return json.dumps(obj, indent=2) + +def write_jsonl(file: TextIO, obj: Any) -> None: + file.write(json.dumps(obj, cls=CustomJSONEncoder) + '\n') + +def find_first_valid_json(text: str) -> str: + """ + Finds the first valid JSON substring in the given text using a stack-based approach. + + It scans the text from left to right, and when it encounters a '{', it tracks the balanced + braces until a complete JSON object is formed. Once a candidate is found, it attempts to parse + it with json.loads(). 
If parsing succeeds, that candidate is returned immediately. + + Args: + text (str): The input string that may contain a JSON object. + + Returns: + str: The first valid JSON substring found, or an empty string if none is found. + """ + n = len(text) + for i in range(n): + if text[i] == '{': + stack = 0 + for j in range(i, n): + if text[j] == '{': + stack += 1 + elif text[j] == '}': + stack -= 1 + if stack == 0: + candidate = text[i:j + 1] + try: + json.loads(candidate) + return candidate + except json.JSONDecodeError: + # If this candidate isn't valid JSON, break and continue scanning. + break + return "" + +def simplify_name(name): + if '(' in name and name.endswith(')'): + prefix, params = name.split('(', 2) + params = [param.split('.')[-1].split('$')[-1] for param in params.split(')', 1)[0].split(',')] + return prefix + '(' + ','.join(params) + ')' + else: + return name + +def merge_node_properties(dict1: Dict[str, Node], dict2: Dict[str, Node], simplify_names=False): + for id2, obj2 in dict2.items(): + + matched_obj = None + if id2 in dict1 and set(dict1[id2].labels) & set(obj2.labels): + matched_obj = dict1[id2] + + elif simplify_names: + + dict1_name_remap = {simplify_name(key): key for key in dict1 if + {'Script', 'Operation', 'Constructor'} & set(dict1[key].labels)} + + if id2 in dict1_name_remap and set(dict1[dict1_name_remap[id2]].labels) & set(obj2.labels): + matched_obj = dict1[dict1_name_remap[id2]] + + if matched_obj: + # sys.stderr.write(f"{id2}->{matched_obj['id']}\n") + # Merge properties from obj2 into matched_obj + matched_obj.properties.update(obj2.properties) + else: + # sys.stderr.write(f"{id2}->None\n") + pass + + diff --git a/arcanalib/graph.py b/arcanalib/graph.py index 4c4a6fc..e1faf05 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -384,7 +384,7 @@ def process_nodes(self, edges: List[Edge], node_processor): return results @staticmethod - def toposorted_nodes(edges: List[Edge]): + def toposorted_nodes(edges: List[Edge], nodes: 
List[Node] = None): adj_list, outdegree = Graph._adj_list(edges) sorted_nodes = [] node_deps = {} @@ -408,6 +408,12 @@ def toposorted_nodes(edges: List[Edge]): dependencies = adj_list.get(n_id, []) sorted_nodes.append(n_id) node_deps[n_id] = dependencies + + for node in nodes: + if node.id not in sorted_nodes: + sorted_nodes.insert(0, node.id) + node_deps[node.id] = [] + return sorted_nodes, node_deps def clean_up(self): From e6ec776ddc88eb3eec406c8f6d9e1d37d198eb60 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 21 May 2025 18:24:02 +0200 Subject: [PATCH 19/34] Restructure WIP --- arcana/llm_filter/processors.py | 161 +++++++++++++++++++++----------- arcanalib/graph.py | 3 +- 2 files changed, 108 insertions(+), 56 deletions(-) diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py index 0831a5f..ccbe4e2 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -31,7 +31,7 @@ def process_all(self, graph: Graph): logger.debug(sorted_method_ids) logger.debug(method_deps) - for met_id in tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): + for met_id in tqdm(sorted_method_ids, desc='Processing methods'): method: Node = graph.nodes[met_id] clasz: Node = [n for n in method.sources('encapsulates') if n.has_label('Type')][0] self.process_one(graph, method, clasz, method_deps) @@ -43,30 +43,30 @@ def process_all(self, graph: Graph): writer().flush() counter %= 10 - def process_one(self, graph: Graph, method: Node, clasz: Node, method_deps): - if 'description' not in method.properties or not method.properties['description'] or method.properties[ + def process_one(self, graph: Graph, operation: Node, type: Node, operation_deps): + if 'description' not in operation.properties or not operation.properties['description'] or operation.properties[ 'description'] == "(no description)": - script_name = method.properties['simpleName'] - script_src = 
remove_java_comments(method.properties['sourceText']) - script_kind = method.properties.get('kind', 'function') - - structure_name = clasz.properties['qualifiedName'] - structure_kind = clasz.properties['kind'] - structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind - - prompt = f"Describe the following {script_kind} by using the AnalyzeScript tool.\n\n" - script_parameters = OrderedDict() - script_parameters["Project Name"] = self.prompt.project_name - script_parameters["Project Description"] = self.prompt.project_desc - script_parameters[f"{script_kind.title()} Declaration"] = f"The {script_kind} {script_name} is declared within the {structure_kind} {structure_name}." - script_parameters[f"{script_kind.title()} Source Code"] = script_src - script_parameters["Outgoing Dependencies (Invokes)"] = {graph.nodes[node_id].properties[ + op_name = operation.properties['simpleName'] + op_src = remove_java_comments(operation.properties['sourceText']) + op_kind = operation.properties.get('kind', 'function') + + typ_name = type.properties['qualifiedName'] + typ_kind = type.properties['kind'] + typ_kind = 'enum' if typ_kind == 'enumeration' else 'abstract class' if typ_kind == 'abstract' else typ_kind + + prompt = f"Describe the following {op_kind} by using the AnalyzeScript tool.\n\n" + op_parameters = OrderedDict() + op_parameters["Project Name"] = self.prompt.project_name + op_parameters["Project Description"] = self.prompt.project_desc + op_parameters[f"{op_kind.title()} Declaration"] = f"The {op_kind} {op_name} is declared within the {typ_kind} {typ_name}." 
+ op_parameters[f"{op_kind.title()} Source Code"] = op_src + op_parameters["Outgoing Dependencies (Invokes)"] = {graph.nodes[node_id].properties[ 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" - for node_id in method_deps[method.id]} - script_parameters["Incoming Dependencies (Invoked By)"] = [m.properties['qualifiedName'] for m in method.sources('invokes')] - script_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) + for node_id in operation_deps[operation.id]} + op_parameters["Incoming Dependencies (Invoked By)"] = [m.properties['qualifiedName'] for m in operation.sources('invokes')] + op_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) - prompt = self.prompt.compose(prompt, **script_parameters) + prompt = self.prompt.compose(prompt, **op_parameters) logger.debug(prompt) @@ -77,12 +77,12 @@ def process_one(self, graph: Graph, method: Node, clasz: Node, method_deps): node_id = f"layer:{layer}" target = graph.find_node(label="Category", where=lambda n: n.id == node_id) if target: - impl_edge = graph.add_edge(clasz.id, target.id, "implements", weight=1, reason=description.get('layerReason')) + impl_edge = graph.add_edge(operation.id, target.id, "implements", weight=1, reason=description.get('layerReason')) writer().write(impl_edge.to_dict()) - self.update_method_properties(graph, description, method) + self.update_method_properties(graph, description, operation) - writer().write({'data': {'id': method.id, 'labels': method.labels, 'properties': description}}) + writer().write({'data': {'id': operation.id, 'labels': operation.labels, 'properties': description}}) @staticmethod def update_method_properties(data: Graph, description: dict, method: Node): @@ -115,7 +115,7 @@ def process_all(self, graph: Graph): sorted_class_ids, class_deps = Graph.toposorted_nodes(graph.find_edges(label='specializes'), graph.find_nodes('Type')) counter = 0 - for cls_id in 
tqdm(sorted_class_ids, desc='Processing classes', position=1, leave=False): + for cls_id in tqdm(sorted_class_ids, desc='Processing classes'): clasz: Node = graph.nodes[cls_id] package: Node = [n for n in clasz.sources('encloses') if n.has_label('Scope')][0] self.process_one(graph, clasz, package, class_deps) @@ -127,48 +127,52 @@ def process_all(self, graph: Graph): writer().flush() counter %= 10 - def process_one(self, graph: Graph, clasz: Node, package: Node, class_deps): - _, variables = StructureProcessor.get_structure_relations(graph, clasz.id) - script_descriptions = { method.properties['qualifiedName']: describe(method) for method in clasz.targets('encapsulates') if method.has_label('Operation') } - - structure_name = clasz.properties['qualifiedName'] - structure_kind = clasz.properties['kind'] - structure_kind = 'enum' if structure_kind == 'enumeration' else 'abstract class' if structure_kind == 'abstract' else structure_kind - - prompt = f"Describe the following {structure_kind} using the AnalyzeStructure tool.\n\n" - structure_parameters = OrderedDict() - structure_parameters["Project Name"] = self.prompt.project_name - structure_parameters["Project Description"] = self.prompt.project_desc - structure_parameters[f"{structure_kind.title()} Name"] = structure_name - structure_parameters[f"{structure_kind.title()} Inhertis From"] = {graph.nodes[node_id].properties[ + def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): + _, variables = StructureProcessor.get_type_relations(graph, type.id) + op_descriptions = { method.properties['qualifiedName']: describe(method) for method in type.targets('encapsulates') if method.has_label('Operation') } + + typ_name = type.properties['qualifiedName'] + typ_kind = type.properties.get('kind', "type") + typ_kind = 'enum' if typ_kind == 'enumeration' else 'abstract class' if typ_kind == 'abstract' else typ_kind + + scope_name = scope.properties['qualifiedName'] + scope_kind = scope.properties.get('kind', 
"scope") + + prompt = f"Describe the following {typ_kind} using the AnalyzeStructure tool.\n\n" + typ_parameters = OrderedDict() + typ_parameters["Project Name"] = self.prompt.project_name + typ_parameters["Project Description"] = self.prompt.project_desc + typ_parameters[f"{typ_kind.title()} Name"] = typ_name + typ_parameters[f"{typ_kind.title()} Declaration"] = f"The {typ_kind} {typ_name} is declared within the {scope_kind} {scope_name}." + typ_parameters[f"{typ_kind.title()} Inhertis From"] = {graph.nodes[node_id].properties[ 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'docComment')}" - for node_id in class_deps[clasz.id]} - structure_parameters[f"Enclosed Variables/Fields"] = variables - structure_parameters[f"Enclosed Functions/Methods"] = script_descriptions - structure_parameters['Possible Role Stereotypes'] = dict(self.prompt.role_stereotypes) + for node_id in type_deps[type.id]} + typ_parameters[f"Enclosed Variables/Fields"] = variables + typ_parameters[f"Enclosed Functions/Methods"] = op_descriptions + typ_parameters['Possible Role Stereotypes'] = dict(self.prompt.role_stereotypes) - prompt = self.prompt.compose(prompt, **structure_parameters) + prompt = self.prompt.compose(prompt, **typ_parameters) logger.debug(prompt) description = self.client.generate_json(prompt, "AnalyzeStructure") - st = description.pop('roleStereotype', None) - if st: - node_id = f"rs:{st}" + rs = description.pop('roleStereotype', None) + if rs: + node_id = f"rs:{rs}" target = graph.find_node(label="Category", where=lambda n: n.id == node_id) if target: - impl_edge = graph.add_edge(clasz.id, target.id, "implements", weight=1, reason=description.get('roleStereotypeReason')) + impl_edge = graph.add_edge(type.id, target.id, "implements", weight=1, reason=description.get('roleStereotypeReason')) writer().write(impl_edge.to_dict()) for k, v in description.items(): if not k.endswith('Reason'): - graph.nodes[clasz.id].properties[lower_first(k)] = v + 
graph.nodes[type.id].properties[lower_first(k)] = v - writer().write({'data': {'id': clasz.id, 'labels': list(clasz.labels), 'properties': description}}) + writer().write({'data': {'id': type.id, 'labels': list(type.labels), 'properties': description}}) @staticmethod - def get_structure_relations(data: Graph, cls_id: str) -> tuple: + def get_type_relations(data: Graph, cls_id: str) -> tuple: """Retrieve class ancestors and fields.""" ancestors = list( {data.nodes[edge.target] for edge in data.find_edges(label='specializes') if @@ -180,9 +184,56 @@ def get_structure_relations(data: Graph, cls_id: str) -> tuple: class ComponentProcessor(Processor): - def process_all(self, graph): - # iterate and describe each package - pass + def process_all(self, graph: Graph): + sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( + graph.find_edges(label='encloses', where_source=lambda node: node.has_label('Scope') and not node.has_label('Type'), + where_target=lambda node: node.has_label('Scope') and not node.has_label('Type')), graph.find_nodes('Scope', where=lambda node: not node.has_label('Type'))) + counter = 0 + + for pkg_id in tqdm(sorted_pkg_ids, desc='Processing packages'): + package: Node = graph.nodes[pkg_id] + self.process_one(graph, package, pkg_deps) + + check_stop() + + counter += 1 + if counter == 10: + writer().flush() + counter %= 10 + + def process_one(self, graph: Graph, scope: Node, scope_deps): + typ_descriptions = { f"{type.properties['kind']} {type.properties['qualifiedName']}": describe(type) for type in scope.targets('encloses') if type.has_label('Type') } + subscp_descriptions = {graph.nodes[node_id].properties['qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" + for node_id in scope_deps[scope.id]} + scp_kind = scope.properties.get('kind', "component") + + prompt = f"Describe the following {scp_kind} using the AnalyzeComponent tool.\n\n" + scp_parameters = OrderedDict() + scp_parameters["Project 
Name"] = self.prompt.project_name + scp_parameters["Project Description"] = self.prompt.project_desc + scp_parameters[f"{scp_kind.title()} Kind"] = scp_kind + scp_parameters[f"{scp_kind.title()} Name"] = scope.properties['qualifiedName'] + scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions + scp_parameters["Enclosed Classes"] = typ_descriptions + scp_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) + + prompt = self.prompt.compose(prompt, **scp_parameters) + + logger.debug(prompt) + + description = self.client.generate_json(prompt, "AnalyzeComponent") + + layer = description.pop('layer', None) + if layer: + node_id = f"layer:{layer}" + target = graph.find_node(label="Category", where=lambda n: n.id == node_id) + if target: + impl_edge = graph.add_edge(scope.id, target.id, "implements", weight=1, reason=description.get('layerReason')) + writer().write(impl_edge.to_dict()) + + self.update_package_properties(graph, description, scope) + + writer().write({'data': {'id': scope.id, 'labels': list(scope.labels), 'properties': description}}) class InteractionProcessor(Processor): diff --git a/arcanalib/graph.py b/arcanalib/graph.py index e1faf05..d3fe0bb 100644 --- a/arcanalib/graph.py +++ b/arcanalib/graph.py @@ -408,7 +408,8 @@ def toposorted_nodes(edges: List[Edge], nodes: List[Node] = None): dependencies = adj_list.get(n_id, []) sorted_nodes.append(n_id) node_deps[n_id] = dependencies - + + nodes = nodes or list() for node in nodes: if node.id not in sorted_nodes: sorted_nodes.insert(0, node.id) From 758b29d46640658a083d7408a53e8c2a535cdce8 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 21 May 2025 18:50:39 +0200 Subject: [PATCH 20/34] Fix dependency profile bug --- arcana/llm_filter/processors.py | 30 +++++++++++++++++++++--------- arcana/metrics.py | 8 ++++++-- arcana/templates.py | 12 +++++++++++- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/arcana/llm_filter/processors.py 
b/arcana/llm_filter/processors.py index ccbe4e2..4e8bb4c 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -128,7 +128,7 @@ def process_all(self, graph: Graph): counter %= 10 def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): - _, variables = StructureProcessor.get_type_relations(graph, type.id) + vars = StructureProcessor.get_type_relations(graph, type.id) op_descriptions = { method.properties['qualifiedName']: describe(method) for method in type.targets('encapsulates') if method.has_label('Operation') } typ_name = type.properties['qualifiedName'] @@ -147,9 +147,10 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): typ_parameters[f"{typ_kind.title()} Inhertis From"] = {graph.nodes[node_id].properties[ 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'docComment')}" for node_id in type_deps[type.id]} - typ_parameters[f"Enclosed Variables/Fields"] = variables + typ_parameters[f"Enclosed Variables/Fields"] = vars typ_parameters[f"Enclosed Functions/Methods"] = op_descriptions typ_parameters['Possible Role Stereotypes'] = dict(self.prompt.role_stereotypes) + typ_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) prompt = self.prompt.compose(prompt, **typ_parameters) @@ -165,6 +166,14 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): impl_edge = graph.add_edge(type.id, target.id, "implements", weight=1, reason=description.get('roleStereotypeReason')) writer().write(impl_edge.to_dict()) + layer = description.pop('layer', None) + if layer: + node_id = f"layer:{layer}" + target = graph.find_node(label="Category", where=lambda n: n.id == node_id) + if target: + impl_edge = graph.add_edge(type.id, target.id, "implements", weight=1, reason=description.get('layerReason')) + writer().write(impl_edge.to_dict()) + for k, v in description.items(): if not k.endswith('Reason'): graph.nodes[type.id].properties[lower_first(k)] = v @@ 
-173,13 +182,10 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): @staticmethod def get_type_relations(data: Graph, cls_id: str) -> tuple: - """Retrieve class ancestors and fields.""" - ancestors = list( - {data.nodes[edge.target] for edge in data.find_edges(label='specializes') if - edge.source == cls_id}) + """Retrieve class fields.""" fields = {data.nodes[edge.target] for edge in data.find_edges(label='encapsulates') if edge.source == cls_id} fields = [' '.join(remove_java_comments(field.properties['sourceText']).split()) for field in fields if field.has_label('Variable')] - return ancestors, fields + return fields class ComponentProcessor(Processor): @@ -211,7 +217,6 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): scp_parameters = OrderedDict() scp_parameters["Project Name"] = self.prompt.project_name scp_parameters["Project Description"] = self.prompt.project_desc - scp_parameters[f"{scp_kind.title()} Kind"] = scp_kind scp_parameters[f"{scp_kind.title()} Name"] = scope.properties['qualifiedName'] scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions scp_parameters["Enclosed Classes"] = typ_descriptions @@ -231,10 +236,17 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): impl_edge = graph.add_edge(scope.id, target.id, "implements", weight=1, reason=description.get('layerReason')) writer().write(impl_edge.to_dict()) - self.update_package_properties(graph, description, scope) + ComponentProcessor.update_package_properties(graph, description, scope) writer().write({'data': {'id': scope.id, 'labels': list(scope.labels), 'properties': description}}) + @staticmethod + def update_package_properties(data: Graph, description: dict, package: Node): + """Update package properties with the generated description.""" + for key in description: + if not key.endswith('Reason'): + data.nodes[package.id].properties[lower_first(key)] = description[key] + class InteractionProcessor(Processor): def 
process_all(self, graph): diff --git a/arcana/metrics.py b/arcana/metrics.py index e70baf2..3a97f09 100644 --- a/arcana/metrics.py +++ b/arcana/metrics.py @@ -1,10 +1,13 @@ -from collections import Counter, OrderedDict, defaultdict +from collections import Counter, OrderedDict +import logging from arcana.graph_utils import dependency_profile_category from arcanalib.graph import Graph, invert, lift from arcanalib.pipefilter import Filter +logger = logging.getLogger(__name__) + class MetricsFilter(Filter): def process(self, data: Graph) -> Graph: """ @@ -44,7 +47,7 @@ def process(self, data: Graph) -> Graph: # 2. Compute raw in/out counts parents = {e.source: e.target for e in invert(data.find_edges(label='encloses'))} - dependency_profiles = defaultdict(list) + dependency_profiles = {node.id:list() for node in data.find_nodes('Type')} calls = data.edges.get('calls', lift(data.find_edges(label='encapsulates'), data.find_edges(label='invokes'), 'calls')) @@ -56,6 +59,7 @@ def process(self, data: Graph) -> Graph: dependency_profiles[target_id].append('in') dependency_profiles = {node_id: Counter(prof) for node_id, prof in dependency_profiles.items()} + logger.debug(dependency_profiles) # 3. Attach classification edges instead of setting a string property for node_id, profile in dependency_profiles.items(): diff --git a/arcana/templates.py b/arcana/templates.py index 83ef9f8..65af77f 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -130,13 +130,23 @@ "roleStereotypeReason": { "type": "string", "description": "One-sentence explanation for the chosen role stereotype." + }, + "layer": { + "type": "string", + "description": "Architectural layer classification selected from the provided options." + }, + "layerReason": { + "type": "string", + "description": "Explanation why the script fits the chosen architectural layer but not others." 
} }, "required": [ "description", "keywords", "roleStereotype", - "roleStereotypeReason" + "roleStereotypeReason", + "layer", + "layerReason" ] } From 948d83822dc15daf8e928d6865dc7cde96370e14 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Thu, 22 May 2025 16:44:27 +0200 Subject: [PATCH 21/34] Restructure WIP ... --- arcana/llm_filter/filter.py | 98 --------------------------------- arcana/llm_filter/processors.py | 7 +-- 2 files changed, 3 insertions(+), 102 deletions(-) diff --git a/arcana/llm_filter/filter.py b/arcana/llm_filter/filter.py index c8aca13..9b5e0ef 100644 --- a/arcana/llm_filter/filter.py +++ b/arcana/llm_filter/filter.py @@ -79,26 +79,6 @@ def process(self, graph): return graph - def process_old(self, data: Graph) -> Graph: - """ - Process the data using a language model to generate descriptions. - - Args: - data (Graph): The input data. - - Returns: - Graph: The processed data with generated descriptions. - """ - current_time_str = time.strftime("%Y%m%d-%H%M%S") - with open(f'arcana-{current_time_str}.jsonl', 'a', encoding="utf-8") as jsonl_file: - - with open(f'arcana-{current_time_str}.log', 'a', encoding="utf-8") as log_file: - try: - self.process_hierarchy(data, jsonl_file, log_file) - except Exception as e: - pass - return data - def process_hierarchy(self, graph: Graph, jsonl_file, log_file): """Process each package, class, and method in the hierarchy.""" @@ -112,52 +92,11 @@ def process_hierarchy(self, graph: Graph, jsonl_file, log_file): [Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) trips = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) - met_to_cls_pkg = {met_id: (cls_id, pkg_id) for pkg_id, cls_id, met_id in trips} - # print(met_to_cls_pkg) - # print('######################################################################') - sorted_method_ids, method_deps = graph.toposorted_nodes(graph.find_edges(label='invokes')) - # print(sorted_method_ids) - - counter 
= 0 - - for met_id in tqdm(sorted_method_ids, desc='Processing methods', position=0, leave=False): - cls_id, pkg_id = met_to_cls_pkg[met_id] - method = graph.nodes[met_id] - clasz = graph.nodes[cls_id] - - self.process_script(graph, jsonl_file, log_file, method, clasz, method_deps) - - check_stop() - - counter += 1 - if counter == 10: - log_file.flush() - jsonl_file.flush() - counter %= 10 - hierarchy = build_hierarchy(trips) sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( graph.find_edges(label='contains', where_source=lambda node: 'Structure' not in node.labels, where_target=lambda node: 'Structure' not in node.labels)) - for pkg_id in tqdm(sorted_pkg_ids, desc="Processing packages", position=1): - pkg_data = hierarchy.get(pkg_id, dict()) - package = graph.nodes[pkg_id] - - for cls_id, cls_data in tqdm(pkg_data.items(), desc="Processing classes", position=2, leave=False): - clasz = graph.nodes[cls_id] - - self.process_structure(graph, jsonl_file, log_file, clasz, cls_data) - - check_stop() - - self.process_component(graph, jsonl_file, log_file, package, pkg_data, pkg_deps) - - log_file.flush() - jsonl_file.flush() - - check_stop() - paths = graph.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") path_groups = group_paths_by_endpoints(paths) @@ -176,43 +115,6 @@ def process_hierarchy(self, graph: Graph, jsonl_file, log_file): self.process_interactions(graph, pkg2, pkg1, path_groups[(pkg2_id, pkg1_id)], hierarchy, jsonl_file, log_file) - def process_component(self, graph: Graph, jsonl_file, log_file, component: Node, component_contents: dict, - component_deps: dict): - """Process a single package and generate its description.""" - - # if 'description' not in component.properties or not component.properties['description'] or component.properties['description'] == "(no description)": - structure_descriptions = self.get_structure_descriptions(graph, component_contents) - subcomponent_descriptions = self.get_component_descriptions(graph, 
component_deps[component.id]) - component_kind = component.properties.get('kind', "component") - - prompt = f"Describe the following {component_kind} using the AnalyzeComponent tool.\n\n" - component_parameters = OrderedDict() - component_parameters["Project Name"] = self.project_name - component_parameters["Project Description"] = self.project_desc - component_parameters["Component Type"] = component_kind - component_parameters["Component Name"] = component.properties['qualifiedName'] - component_parameters["Enclosed Sub-components"] = subcomponent_descriptions - component_parameters["Enclosed Classes"] = structure_descriptions - component_parameters["Possible Architectural Layers"] = dict(self.layers) - - prompt = self.compose_prompt(prompt, component_parameters) - - log_file.write(prompt) - log_file.write('\n\n======\n\n') - - description = generate_json(prompt, "AnalyzeComponent") - self.update_package_properties(graph, description, component) - - layer_id = None - if component.has_property("layer") and component.property("layer") in [name for name, _ in self.layers]: - layer_id = f"layer:{component.property('layer')}" - layer_node = graph.find_node(label="Grouping", where=lambda node: node.id == layer_id) - if layer_node: - graph.add_edge(component.id, layer_node.id, "implements", weight=1) - - write_jsonl(jsonl_file, - {'data': {'id': component.id, 'labels': list(component.labels), 'properties': description}}) - def process_interactions(self, graph: Graph, c1: Node, c2: Node, path_groups: List[List[Edge]], hierarchy, jsonl_file: TextIO, log_file: TextIO): c1_name = c1.properties["qualifiedName"] diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py index 4e8bb4c..e8ce12f 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -58,7 +58,7 @@ def process_one(self, graph: Graph, operation: Node, type: Node, operation_deps) op_parameters = OrderedDict() op_parameters["Project Name"] = self.prompt.project_name 
op_parameters["Project Description"] = self.prompt.project_desc - op_parameters[f"{op_kind.title()} Declaration"] = f"The {op_kind} {op_name} is declared within the {typ_kind} {typ_name}." + op_parameters[f"{op_kind.title()} to Analyze"] = f"`{op_name}` from the {typ_kind} `{typ_name}`." op_parameters[f"{op_kind.title()} Source Code"] = op_src op_parameters["Outgoing Dependencies (Invokes)"] = {graph.nodes[node_id].properties[ 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" @@ -142,8 +142,7 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): typ_parameters = OrderedDict() typ_parameters["Project Name"] = self.prompt.project_name typ_parameters["Project Description"] = self.prompt.project_desc - typ_parameters[f"{typ_kind.title()} Name"] = typ_name - typ_parameters[f"{typ_kind.title()} Declaration"] = f"The {typ_kind} {typ_name} is declared within the {scope_kind} {scope_name}." + typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}` from the {scope_kind} `{scope_name}`." 
typ_parameters[f"{typ_kind.title()} Inhertis From"] = {graph.nodes[node_id].properties[ 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'docComment')}" for node_id in type_deps[type.id]} @@ -217,7 +216,7 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): scp_parameters = OrderedDict() scp_parameters["Project Name"] = self.prompt.project_name scp_parameters["Project Description"] = self.prompt.project_desc - scp_parameters[f"{scp_kind.title()} Name"] = scope.properties['qualifiedName'] + scp_parameters[f"{scp_kind.title()} to Analyze"] = scope.properties['qualifiedName'] scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions scp_parameters["Enclosed Classes"] = typ_descriptions scp_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) From d061b4d5c441f11c2b24c0434e4bb55f63810a35 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 28 May 2025 17:22:40 +0200 Subject: [PATCH 22/34] Minor improvements --- arcana/llm_filter/client.py | 30 ++++++++++++++++++------------ arcana/llm_filter/processors.py | 28 +++++++++++++++++----------- arcana/llm_filter/prompt.py | 29 ++++++++++++++++++++--------- arcana/metrics.py | 2 +- arcana/templates.py | 4 ++-- 5 files changed, 58 insertions(+), 35 deletions(-) diff --git a/arcana/llm_filter/client.py b/arcana/llm_filter/client.py index 7134ab5..1c8aa48 100644 --- a/arcana/llm_filter/client.py +++ b/arcana/llm_filter/client.py @@ -14,13 +14,16 @@ def generate_json(self, prompt, tool): """Generate a description using the OpenAI client.""" try: if tool: - response = self.client.chat.completions.create(model=self.model, messages=[ - {"role": "system", "content": "You are a software architecture analysis tool."}, - {"role": "user", "content": prompt}], tools=[templates.analyze_script_tool, - templates.analyze_structure_tool, - templates.analyze_component_tool], - tool_choice="required", temperature=0, seed=42, - timeout=self.timeout) + response = 
self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": "You are a tool for analyzing software architecture of code implementations."}, + {"role": "user", "content": prompt}], + tools=[templates.analyze_script_tool, + templates.analyze_structure_tool, + templates.analyze_component_tool], + tool_choice="required", temperature=0, seed=42, + timeout=self.timeout) tool_calls = response.choices[0].message.tool_calls @@ -36,11 +39,14 @@ def generate_json(self, prompt, tool): description = dict() else: - response = self.client.chat.completions.create(model=self.model, - response_format={"type": "json_object"}, - messages=[{"role": "user", "content": prompt}], - max_tokens=4096, temperature=0, seed=42, - timeout=self.timeout) + response = self.client.chat.completions.create( + model=self.model, + response_format={"type": "json_object"}, + messages=[ + {"role": "system", "content": "You are an expert in analyzing software architecture of code implementations."}, + {"role": "user", "content": prompt}], + max_tokens=4096, temperature=0, seed=42, + timeout=self.timeout) content = response.choices[0].message.content description = json.loads(content) diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py index e8ce12f..de4e30d 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -28,8 +28,8 @@ class ScriptProcessor(Processor): def process_all(self, graph: Graph): sorted_method_ids, method_deps = Graph.toposorted_nodes(graph.find_edges(label='invokes'), graph.find_nodes('Operation')) counter = 0 - logger.debug(sorted_method_ids) - logger.debug(method_deps) + # logger.debug(sorted_method_ids) + # logger.debug(method_deps) for met_id in tqdm(sorted_method_ids, desc='Processing methods'): method: Node = graph.nodes[met_id] @@ -64,7 +64,7 @@ def process_one(self, graph: Graph, operation: Node, type: Node, operation_deps) 'qualifiedName']: f"{describe(graph.nodes[node_id], 
'description', 'returns', 'howToUse', 'docComment')}" for node_id in operation_deps[operation.id]} op_parameters["Incoming Dependencies (Invoked By)"] = [m.properties['qualifiedName'] for m in operation.sources('invokes')] - op_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) + op_parameters["Possible Architectural Layers"] = self.prompt.layers prompt = self.prompt.compose(prompt, **op_parameters) @@ -117,7 +117,8 @@ def process_all(self, graph: Graph): for cls_id in tqdm(sorted_class_ids, desc='Processing classes'): clasz: Node = graph.nodes[cls_id] - package: Node = [n for n in clasz.sources('encloses') if n.has_label('Scope')][0] + enclosers = [n for n in clasz.sources('encloses') if n.has_label('Scope')] + package: Node = enclosers[0] if enclosers else None self.process_one(graph, clasz, package, class_deps) check_stop() @@ -135,21 +136,26 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): typ_kind = type.properties.get('kind', "type") typ_kind = 'enum' if typ_kind == 'enumeration' else 'abstract class' if typ_kind == 'abstract' else typ_kind - scope_name = scope.properties['qualifiedName'] - scope_kind = scope.properties.get('kind', "scope") - prompt = f"Describe the following {typ_kind} using the AnalyzeStructure tool.\n\n" typ_parameters = OrderedDict() typ_parameters["Project Name"] = self.prompt.project_name typ_parameters["Project Description"] = self.prompt.project_desc - typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}` from the {scope_kind} `{scope_name}`." + + if scope: + scope_name = scope.properties['qualifiedName'] + scope_kind = scope.properties.get('kind', "scope") + typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}` from the {scope_kind} `{scope_name}`." + else: + typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}`." 
+ typ_parameters[f"{typ_kind.title()} Inhertis From"] = {graph.nodes[node_id].properties[ 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'docComment')}" for node_id in type_deps[type.id]} + typ_parameters["Inherited By"] = [f"{t.properties['kind']} {t.properties['qualifiedName']}" for t in type.sources('specializes')] typ_parameters[f"Enclosed Variables/Fields"] = vars typ_parameters[f"Enclosed Functions/Methods"] = op_descriptions - typ_parameters['Possible Role Stereotypes'] = dict(self.prompt.role_stereotypes) - typ_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) + typ_parameters['Possible Role Stereotypes'] = self.prompt.role_stereotypes + typ_parameters["Possible Architectural Layers"] = self.prompt.layers prompt = self.prompt.compose(prompt, **typ_parameters) @@ -219,7 +225,7 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): scp_parameters[f"{scp_kind.title()} to Analyze"] = scope.properties['qualifiedName'] scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions scp_parameters["Enclosed Classes"] = typ_descriptions - scp_parameters["Possible Architectural Layers"] = dict(self.prompt.layers) + scp_parameters["Possible Architectural Layers"] = self.prompt.layers prompt = self.prompt.compose(prompt, **scp_parameters) diff --git a/arcana/llm_filter/prompt.py b/arcana/llm_filter/prompt.py index 62ae5a4..a3766c0 100644 --- a/arcana/llm_filter/prompt.py +++ b/arcana/llm_filter/prompt.py @@ -9,7 +9,15 @@ def __init__(self, project_cfg, layers_cfg=None, stereotypes_cfg=None): self.project_name = project_cfg['name'] self.project_desc = project_cfg['desc'] self.layers = layers_cfg or OrderedDict() + self.layers.update({ + 'Undetermined': "Architectural layer cannot be determined for this element." 
+ }) + # self.layers.move_to_end('Undetermined', False) self.role_stereotypes = stereotypes_cfg or OrderedDict() + self.role_stereotypes.update({ + 'Undetermined': "Role stereotype cannot be determined for this element." + }) + # self.role_stereotypes.move_to_end('Undetermined', False) # self.layers_str = format_layers(layers_cfg) def initialize_layers(self, graph: Graph): @@ -20,22 +28,24 @@ def initialize_layers(self, graph: Graph): simpleName="Architectural Layer", qualifiedName="Architectural Layer") writer().write(layer_dimension.to_dict()) - - for i, (name, desc) in enumerate(self.layers.items()): + + layers = self.layers.copy() + layers.move_to_end('Undetermined', False) + for i, (name, desc) in enumerate(layers.items()): cat = graph.add_node( f"layer:{name}", "Category", kind="architectural layer", simpleName=name, qualifiedName=name, description=desc, - order=i + order=i-1 ) writer().write(cat.to_dict()) e = graph.add_edge(cat.id, layer_dimension.id, "composes", weight=1) writer().write(e.to_dict()) - - t_layers = list(self.layers.items()) - for i in range(len(t_layers) - 1): + + t_layers = list(layers.items()) + for i in range(1, len(t_layers) - 1): src = t_layers[i][0] tgt = t_layers[i + 1][0] e = graph.add_edge(f"layer:{src}", f"layer:{tgt}", "succeeds", weight=1) @@ -51,14 +61,15 @@ def initialize_layers(self, graph: Graph): ) writer().write(stereo_dimension.to_dict()) - for i, (name, desc) in enumerate(self.role_stereotypes.items()): + role_stereotypes = self.role_stereotypes.copy() + role_stereotypes.move_to_end('Undetermined', False) + for i, (name, desc) in enumerate(role_stereotypes.items()): cat = graph.add_node( f"rs:{name}", "Category", kind="role stereotype", simpleName=name, qualifiedName=name, - description=desc, - order=i + description=desc ) writer().write(cat.to_dict()) e = graph.add_edge(cat.id, stereo_dimension.id, "composes", weight=1) diff --git a/arcana/metrics.py b/arcana/metrics.py index 3a97f09..ed8e29d 100644 --- 
a/arcana/metrics.py +++ b/arcana/metrics.py @@ -59,7 +59,7 @@ def process(self, data: Graph) -> Graph: dependency_profiles[target_id].append('in') dependency_profiles = {node_id: Counter(prof) for node_id, prof in dependency_profiles.items()} - logger.debug(dependency_profiles) + # logger.debug(dependency_profiles) # 3. Attach classification edges instead of setting a string property for node_id, profile in dependency_profiles.items(): diff --git a/arcana/templates.py b/arcana/templates.py index 65af77f..9dda8f5 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -5,7 +5,7 @@ "properties": { "description": { "type": "string", - "description": "One-sentence description of the method/constructor/function functionality." + "description": "One-sentence description – suitable for a documentation comment – of the method/constructor/function functionality, in imperative mood." }, "parameters": { "type": "array", @@ -114,7 +114,7 @@ "properties": { "description": { "type": "string", - "description": "Up to three sentences describing the key responsibilities of the class/struct/type." + "description": "Up to three sentences, suitable for a documentation comment, describing the key responsibilities of the class/struct/type." 
}, "keywords": { "type": "array", From a8d1800f9b412bff7737f2c8b2f767a316d857f6 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Sun, 1 Mar 2026 11:53:29 +0100 Subject: [PATCH 23/34] Generalise classification (Codex) --- arcana/llm_filter/classification.py | 109 +++++++++++++++++++++++++ arcana/llm_filter/filter.py | 39 ++------- arcana/llm_filter/processors.py | 69 ++++++++-------- arcana/llm_filter/prompt.py | 118 +++++++++++++--------------- 4 files changed, 204 insertions(+), 131 deletions(-) create mode 100644 arcana/llm_filter/classification.py diff --git a/arcana/llm_filter/classification.py b/arcana/llm_filter/classification.py new file mode 100644 index 0000000..f179edd --- /dev/null +++ b/arcana/llm_filter/classification.py @@ -0,0 +1,109 @@ +from dataclasses import dataclass +from collections import OrderedDict +from collections.abc import Iterable + + +def default_layers(): + return OrderedDict([ + ('Presentation Layer', "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), + ('Service Layer', "Controls the application flow, orchestrates domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), + ('Domain Layer', "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), + ('Data Source Layer', "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), + ]) + + +def default_role_stereotypes(): + return OrderedDict([ + ("Information Holder", "Knows facts and provides information (POJOs, beans, enums)."), + ("Service Provider", "Handles requests, performs services; implements a specific interface with a small number of methods (strategies, handlers)."), + ("Structurer", "Manages relationships among things (collections, maps)."), + ("Controller", 
"Makes decisions, directs flow of the program."), + ("Coordinator", "Delegates work across workers."), + ("User Interfacer", "Handles user input/output."), + ("External Interfacer", "Loads/stores from external services."), + ("Internal Interfacer", "Bridges subsystems (adapters, bridges, facades, proxies)."), + ]) + + +@dataclass +class ClassificationScheme: + name: str + dimension_id: str + dimension_name: str + dimension_kind: str + category_prefix: str + category_kind: str + prompt_label: str + response_key: str + response_reason_key: str + options: OrderedDict + undetermined_description: str + ordered: bool = False + applies_to: tuple = () + + def options_with_undetermined(self) -> OrderedDict: + result = OrderedDict(self.options or OrderedDict()) + result['Undetermined'] = self.undetermined_description + return result + + def ordered_options(self) -> OrderedDict: + result = self.options_with_undetermined() + result.move_to_end('Undetermined', False) + return result + + def category_id(self, category_name: str) -> str: + return f"{self.category_prefix}:{category_name}" + + +def ordered_dict_from_mapping(mapping) -> OrderedDict: + if not mapping: + return OrderedDict() + if isinstance(mapping, OrderedDict): + return mapping + if isinstance(mapping, dict): + return OrderedDict(mapping) + if isinstance(mapping, Iterable): + return OrderedDict(mapping) + return OrderedDict() + + +def default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None): + layers = ordered_dict_from_mapping(layers_cfg) or default_layers() + role_stereotypes = ordered_dict_from_mapping(role_stereotypes_cfg) or default_role_stereotypes() + + layer_scheme = ClassificationScheme( + name="layer", + dimension_id="Architectural Layer", + dimension_name="Architectural Layer", + dimension_kind="categorical-ordered", + category_prefix="layer", + category_kind="architectural layer", + prompt_label="Possible Architectural Layers", + response_key="layer", + 
response_reason_key="layerReason", + options=layers, + undetermined_description="Architectural layer cannot be determined for this element.", + ordered=True, + applies_to=("script", "structure", "component") + ) + + role_scheme = ClassificationScheme( + name="roleStereotype", + dimension_id="Role Stereotype", + dimension_name="Role Stereotype", + dimension_kind="categorical-nominal", + category_prefix="rs", + category_kind="role stereotype", + prompt_label="Possible Role Stereotypes", + response_key="roleStereotype", + response_reason_key="roleStereotypeReason", + options=role_stereotypes, + undetermined_description="Role stereotype cannot be determined for this element.", + ordered=False, + applies_to=("structure",) + ) + + return OrderedDict([ + (layer_scheme.name, layer_scheme), + (role_scheme.name, role_scheme), + ]) diff --git a/arcana/llm_filter/filter.py b/arcana/llm_filter/filter.py index 9b5e0ef..6a03359 100644 --- a/arcana/llm_filter/filter.py +++ b/arcana/llm_filter/filter.py @@ -1,4 +1,3 @@ -import time from collections import OrderedDict from itertools import combinations from typing import Any, Dict, List, TextIO @@ -9,6 +8,7 @@ from arcana.filters import check_stop, layers_to_ordereddict from arcana.graph_utils import (build_hierarchy, build_triplets, describe_path, group_paths_by_endpoints) +from arcana.llm_filter.classification import default_classification_schemes from arcana.llm_filter.client import LLMClient from arcana.llm_filter.processors import ComponentProcessor, InteractionProcessor, ScriptProcessor, StructureProcessor from arcana.llm_filter.prompt import PromptBuilder @@ -16,46 +16,19 @@ from arcanalib.graph import Edge, Graph, Node from arcanalib.pipefilter import Filter -def default_layers(): - return OrderedDict([ - ('Presentation Layer', "Manages the user interface, defines UI elements and behavior, displays information, responds to user input, and updates views."), - ('Service Layer', "Controls the application flow, orchestrates 
domain operations, connects UI events with domain logic, and synchronizes domain changes with the UI."), - ('Domain Layer', "Handles business logic, represents domain data and behavior, and performs necessary computations for domain operations."), - ('Data Source Layer', "Interacts with databases, filesystems, hardware, messaging systems, or other data sources, performs CRUD operations, handles data conversion, and ensures data integrity."), - ]) - -def default_role_stereotypes(): - # **Information Holder** is responsible for knowing facts and providing information to other objects. POJOs, Java Beans, and enumerations are usually information holders. \ - # **Service Provider** is responsible for handling requests and performing specific services. It usually implements a specific interface with a small number of methods. Concrete strategies are service providers. \ - # **Structurer** is responsible for managing relationships and constraints among related things. It is usually a collection or mapping of some sort, i.e., a subclass of a List, Set, Map, etc. \ - # **Controller** is responsible for making decisions, directing the work of others, and handling important events. It directs the flow of the application or business process. \ - # **Coordinator** is responsible for managing the actions of a group of workers and facilitating communication and work of other objects. It delegates requests to other objects. Very abstract classes and interfaces might be coordinators as they delegate the work to subclasses. \ - # **User Interfacer** is responsible for transmitting user requests for action or display/render information that can be updated. It handles interactions with users. \ - # **External Interfacer** is responsible for loading and storing information from/to external services, including database systems, web services, filesystems, hardware, etc. \ - # **Internal Interfacer** is responsible for interfacing between two subsystems. 
It may bundle together information of requests from a group of objects to be sent to another object. Abstract adapters, bridges, facades, and proxies are internal interfacers." - return OrderedDict([ - ("Information Holder", "Knows facts and provides information (POJOs, beans, enums)."), - ("Service Provider", "Handles requests, performs services; implements a specific interface with a small number of methods (strategies, handlers)."), - ("Structurer", "Manages relationships among things (collections, maps)."), - ("Controller", "Makes decisions, directs flow of the program."), - ("Coordinator", "Delegates work across workers."), - ("User Interfacer", "Handles user input/output."), - ("External Interfacer", "Loads/stores from external services."), - ("Internal Interfacer", "Bridges subsystems (adapters, bridges, facades, proxies)."), - ]) - class LLMFilter(Filter): def __init__(self, config: Dict[str, Dict[str, Any]]): super().__init__(config) self.client = LLMClient(config['llm'], config['project']) layer_cfg = config.get('layers') - self.layers = layers_to_ordereddict(layer_cfg) if layer_cfg else default_layers() + self.layers = layers_to_ordereddict(layer_cfg) if layer_cfg else OrderedDict() stereo_cfg = config.get('stereotypes') - self.role_stereotypes = OrderedDict(stereo_cfg) if stereo_cfg else default_role_stereotypes() + self.role_stereotypes = OrderedDict(stereo_cfg) if stereo_cfg else OrderedDict() - self.prompt_builder = PromptBuilder(config['project'], self.layers, self.role_stereotypes) + classifications = default_classification_schemes(self.layers, self.role_stereotypes) + self.prompt_builder = PromptBuilder(config['project'], classifications) self.script_processor = ScriptProcessor(self.client, self.prompt_builder) self.structure_processor = StructureProcessor(self.client, self.prompt_builder) self.component_processor = ComponentProcessor(self.client, self.prompt_builder) @@ -173,4 +146,4 @@ def get_structure_descriptions(self, data: Graph, pkg_data: 
dict) -> dict[str,st def get_component_descriptions(self, data: Graph, package_ids: list) -> dict[str,str]: """Generate descriptions for packages.""" return {data.nodes[pkg_id].properties['qualifiedName']: self.describe(data.nodes[pkg_id]) for pkg_id in - package_ids} \ No newline at end of file + package_ids} diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py index de4e30d..eece678 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -7,6 +7,7 @@ from arcana.checkpoint import writer from arcana.filters import check_stop +from arcana.llm_filter.classification import ClassificationScheme from arcana.llm_filter.client import LLMClient from arcana.llm_filter.prompt import PromptBuilder, describe from arcana.utils import lower_first, remove_java_comments @@ -23,6 +24,31 @@ def __init__(self, client, prompt_builder): def process_all(self, graph): raise NotImplementedError + def add_classification_options(self, parameters: OrderedDict, element_kind: str): + for scheme in self.prompt.classification_schemes(element_kind): + parameters[scheme.prompt_label] = scheme.options_with_undetermined() + + def apply_classifications(self, graph: Graph, element: Node, description: dict, element_kind: str): + for scheme in self.prompt.classification_schemes(element_kind): + self._apply_classification(graph, element, description, scheme) + + @staticmethod + def _apply_classification(graph: Graph, element: Node, description: dict, scheme: ClassificationScheme): + classification = description.pop(scheme.response_key, None) + if not classification: + return + + target = graph.find_node(label="Category", where=lambda n: n.id == scheme.category_id(classification)) + if target: + impl_edge = graph.add_edge( + element.id, + target.id, + "implements", + weight=1, + reason=description.get(scheme.response_reason_key), + ) + writer().write(impl_edge.to_dict()) + class ScriptProcessor(Processor): def process_all(self, graph: Graph): @@ 
-64,21 +90,15 @@ def process_one(self, graph: Graph, operation: Node, type: Node, operation_deps) 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" for node_id in operation_deps[operation.id]} op_parameters["Incoming Dependencies (Invoked By)"] = [m.properties['qualifiedName'] for m in operation.sources('invokes')] - op_parameters["Possible Architectural Layers"] = self.prompt.layers + self.add_classification_options(op_parameters, "script") prompt = self.prompt.compose(prompt, **op_parameters) logger.debug(prompt) description = self.client.generate_json(prompt, "AnalyzeScript") - - layer = description.pop('layer', None) - if layer: - node_id = f"layer:{layer}" - target = graph.find_node(label="Category", where=lambda n: n.id == node_id) - if target: - impl_edge = graph.add_edge(operation.id, target.id, "implements", weight=1, reason=description.get('layerReason')) - writer().write(impl_edge.to_dict()) + + self.apply_classifications(graph, operation, description, "script") self.update_method_properties(graph, description, operation) @@ -154,30 +174,15 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): typ_parameters["Inherited By"] = [f"{t.properties['kind']} {t.properties['qualifiedName']}" for t in type.sources('specializes')] typ_parameters[f"Enclosed Variables/Fields"] = vars typ_parameters[f"Enclosed Functions/Methods"] = op_descriptions - typ_parameters['Possible Role Stereotypes'] = self.prompt.role_stereotypes - typ_parameters["Possible Architectural Layers"] = self.prompt.layers + self.add_classification_options(typ_parameters, "structure") prompt = self.prompt.compose(prompt, **typ_parameters) logger.debug(prompt) description = self.client.generate_json(prompt, "AnalyzeStructure") - - rs = description.pop('roleStereotype', None) - if rs: - node_id = f"rs:{rs}" - target = graph.find_node(label="Category", where=lambda n: n.id == node_id) - if target: - impl_edge = 
graph.add_edge(type.id, target.id, "implements", weight=1, reason=description.get('roleStereotypeReason')) - writer().write(impl_edge.to_dict()) - - layer = description.pop('layer', None) - if layer: - node_id = f"layer:{layer}" - target = graph.find_node(label="Category", where=lambda n: n.id == node_id) - if target: - impl_edge = graph.add_edge(type.id, target.id, "implements", weight=1, reason=description.get('layerReason')) - writer().write(impl_edge.to_dict()) + + self.apply_classifications(graph, type, description, "structure") for k, v in description.items(): if not k.endswith('Reason'): @@ -225,7 +230,7 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): scp_parameters[f"{scp_kind.title()} to Analyze"] = scope.properties['qualifiedName'] scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions scp_parameters["Enclosed Classes"] = typ_descriptions - scp_parameters["Possible Architectural Layers"] = self.prompt.layers + self.add_classification_options(scp_parameters, "component") prompt = self.prompt.compose(prompt, **scp_parameters) @@ -233,13 +238,7 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): description = self.client.generate_json(prompt, "AnalyzeComponent") - layer = description.pop('layer', None) - if layer: - node_id = f"layer:{layer}" - target = graph.find_node(label="Category", where=lambda n: n.id == node_id) - if target: - impl_edge = graph.add_edge(scope.id, target.id, "implements", weight=1, reason=description.get('layerReason')) - writer().write(impl_edge.to_dict()) + self.apply_classifications(graph, scope, description, "component") ComponentProcessor.update_package_properties(graph, description, scope) diff --git a/arcana/llm_filter/prompt.py b/arcana/llm_filter/prompt.py index a3766c0..8c44bc9 100644 --- a/arcana/llm_filter/prompt.py +++ b/arcana/llm_filter/prompt.py @@ -1,79 +1,71 @@ from collections import OrderedDict from arcana.checkpoint import writer from arcana.utils import remove_author, 
sentence +from arcana.llm_filter.classification import ClassificationScheme from arcanalib.graph import Graph, Node class PromptBuilder: - def __init__(self, project_cfg, layers_cfg=None, stereotypes_cfg=None): + def __init__(self, project_cfg, classifications=None): self.project_name = project_cfg['name'] self.project_desc = project_cfg['desc'] - self.layers = layers_cfg or OrderedDict() - self.layers.update({ - 'Undetermined': "Architectural layer cannot be determined for this element." - }) - # self.layers.move_to_end('Undetermined', False) - self.role_stereotypes = stereotypes_cfg or OrderedDict() - self.role_stereotypes.update({ - 'Undetermined': "Role stereotype cannot be determined for this element." - }) - # self.role_stereotypes.move_to_end('Undetermined', False) - # self.layers_str = format_layers(layers_cfg) + self.classifications = classifications or OrderedDict() + self.layers = self.classification_options('layer') + self.role_stereotypes = self.classification_options('roleStereotype') - def initialize_layers(self, graph: Graph): - layer_dimension = graph.add_node( - f"Architectural Layer", - "Dimension", - kind="categorical-ordered", - simpleName="Architectural Layer", - qualifiedName="Architectural Layer") - writer().write(layer_dimension.to_dict()) + def classification_options(self, classification_name: str) -> OrderedDict: + scheme: ClassificationScheme = self.classifications.get(classification_name) + if not scheme: + return OrderedDict() + return scheme.options_with_undetermined() + + def classification_schemes(self, element_kind: str = None): + schemes = list(self.classifications.values()) + if not element_kind: + return schemes + return [scheme for scheme in schemes if element_kind in scheme.applies_to] - layers = self.layers.copy() - layers.move_to_end('Undetermined', False) - for i, (name, desc) in enumerate(layers.items()): - cat = graph.add_node( - f"layer:{name}", "Category", - kind="architectural layer", - simpleName=name, - 
qualifiedName=name, - description=desc, - order=i-1 + def initialize_classifications(self, graph: Graph): + for scheme in self.classification_schemes(): + dimension = graph.add_node( + scheme.dimension_id, + "Dimension", + kind=scheme.dimension_kind, + simpleName=scheme.dimension_name, + qualifiedName=scheme.dimension_name, ) - writer().write(cat.to_dict()) - e = graph.add_edge(cat.id, layer_dimension.id, "composes", weight=1) - writer().write(e.to_dict()) + writer().write(dimension.to_dict()) - t_layers = list(layers.items()) - for i in range(1, len(t_layers) - 1): - src = t_layers[i][0] - tgt = t_layers[i + 1][0] - e = graph.add_edge(f"layer:{src}", f"layer:{tgt}", "succeeds", weight=1) - writer().write(e.to_dict()) - - - stereo_dimension = graph.add_node( - "Role Stereotype", - "Dimension", - kind="categorical-nominal", - simpleName="Role Stereotype", - qualifiedName="Role Stereotype" - ) - writer().write(stereo_dimension.to_dict()) + categories = scheme.ordered_options() + category_names = list(categories.keys()) + for i, (name, desc) in enumerate(categories.items()): + cat_kwargs = dict( + kind=scheme.category_kind, + simpleName=name, + qualifiedName=name, + description=desc, + ) + if scheme.ordered: + cat_kwargs["order"] = i - 1 + cat = graph.add_node( + scheme.category_id(name), "Category", **cat_kwargs + ) + writer().write(cat.to_dict()) + e = graph.add_edge(cat.id, dimension.id, "composes", weight=1) + writer().write(e.to_dict()) - role_stereotypes = self.role_stereotypes.copy() - role_stereotypes.move_to_end('Undetermined', False) - for i, (name, desc) in enumerate(role_stereotypes.items()): - cat = graph.add_node( - f"rs:{name}", "Category", - kind="role stereotype", - simpleName=name, - qualifiedName=name, - description=desc - ) - writer().write(cat.to_dict()) - e = graph.add_edge(cat.id, stereo_dimension.id, "composes", weight=1) - writer().write(e.to_dict()) + if scheme.ordered: + for i in range(1, len(category_names) - 1): + src = category_names[i] + 
tgt = category_names[i + 1] + e = graph.add_edge( + scheme.category_id(src), scheme.category_id(tgt), "succeeds", weight=1 + ) + writer().write(e.to_dict()) + + def initialize_layers(self, graph: Graph): + # Backward-compatible alias. + self.initialize_classifications(graph) def compose(self, base_prompt, **parameters): @@ -109,4 +101,4 @@ def describe(node: Node, *keys) -> str: lines[ 'docComment'] = f"**docComment**: {sentence(remove_author(str(node.properties['docComment'])).replace(sr, '').replace(sn, ' '))} " - return ' '.join(lines[key] for key in keys if key in lines).strip() \ No newline at end of file + return ' '.join(lines[key] for key in keys if key in lines).strip() From 44d6e4fd98159496cbb9d6248c46428f7ad29eed Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Sun, 1 Mar 2026 20:21:26 +0100 Subject: [PATCH 24/34] Change hasParameter to parameterizes (SABO-2) --- arcana/llm_filter/processors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py index eece678..539fff1 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -113,8 +113,8 @@ def update_method_properties(data: Graph, description: dict, method: Node): continue key_lower = lower_first(key) if key_lower == 'parameters' and isinstance(value, Iterable): - param_nodes = [data.nodes[edge.target] for edge in data.find_edges(label='hasParameter') if - edge.source == method.id] + param_nodes = [data.nodes[edge.source] for edge in data.find_edges(label='parameterizes') if + edge.traget == method.id] for param in value: if isinstance(param, dict): matching_params = [node for node in param_nodes if From 6a4e4f94fe5aaae232341e81db8fd9e9cd59df49 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Mon, 9 Mar 2026 15:32:12 +0700 Subject: [PATCH 25/34] Add optional SecDFD classification --- .gitignore | 5 + README.md | 1 + arcana/llm_filter/classification.py | 41 +++++- 
arcana/llm_filter/filter.py | 20 ++- arcana/llm_filter/processors.py | 198 +++++++++++++++++++++++++--- arcana/templates.py | 28 +++- config.ini.example | 9 ++ legacy/arvisaninator.ipynb | 14 +- 8 files changed, 279 insertions(+), 37 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66af02d --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.jsonl +*.log +*.ini +.* +!.gitignore \ No newline at end of file diff --git a/README.md b/README.md index f0f0437..20aec19 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ The `command` argument can be one of the following: - `description`: A one-sentence summary of *packages*, *classes*, and *methods*/*constructors*. - `roleStereotype`: A classification of *classes* into one of [Wirfs-Brock's role stereotypes](https://wirfs-brock.com/PDFs/Characterizing%20Classes.pdf). - `layer`: A classification of *packages*, *classes*, and *methods*/*constructors* into architectural layers. + - `secdfdTypes` (optional, enable via `[secdfd]` config): Multi-label SecDFD classification for v2 `Type`, `Operation`, and `Variable` nodes. Currently, this command adds all the properties above, i.e., there is no way to select only one or two properties to add to the graph. 
diff --git a/arcana/llm_filter/classification.py b/arcana/llm_filter/classification.py index f179edd..9697e3d 100644 --- a/arcana/llm_filter/classification.py +++ b/arcana/llm_filter/classification.py @@ -40,6 +40,7 @@ class ClassificationScheme: undetermined_description: str ordered: bool = False applies_to: tuple = () + allow_multi_label: bool = False def options_with_undetermined(self) -> OrderedDict: result = OrderedDict(self.options or OrderedDict()) @@ -67,7 +68,17 @@ def ordered_dict_from_mapping(mapping) -> OrderedDict: return OrderedDict() -def default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None): +def default_secdfd_types(): + return OrderedDict([ + ("External Entity", "Represents an external actor or system that interacts with the software."), + ("DataStore", "Represents persisted storage or a data access boundary."), + ("Process", "Represents non-trivial computation or orchestration logic."), + ("Asset", "Represents data objects with business or security value."), + ("Flow", "Represents data transfer across operations or boundaries."), + ]) + + +def default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None, secdfd_enabled=False): layers = ordered_dict_from_mapping(layers_cfg) or default_layers() role_stereotypes = ordered_dict_from_mapping(role_stereotypes_cfg) or default_role_stereotypes() @@ -84,7 +95,7 @@ def default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None): options=layers, undetermined_description="Architectural layer cannot be determined for this element.", ordered=True, - applies_to=("script", "structure", "component") + applies_to=("operation", "type", "scope") ) role_scheme = ClassificationScheme( @@ -100,10 +111,30 @@ def default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None): options=role_stereotypes, undetermined_description="Role stereotype cannot be determined for this element.", ordered=False, - applies_to=("structure",) + applies_to=("type",) ) - - return 
OrderedDict([ + schemes = OrderedDict([ (layer_scheme.name, layer_scheme), (role_scheme.name, role_scheme), ]) + + if secdfd_enabled: + secdfd_scheme = ClassificationScheme( + name="secdfd", + dimension_id="SecDFD Type", + dimension_name="SecDFD Type", + dimension_kind="categorical-nominal", + category_prefix="secdfd", + category_kind="secdfd type", + prompt_label="Possible SecDFD Types", + response_key="secdfdTypes", + response_reason_key="secdfdEvidence", + options=default_secdfd_types(), + undetermined_description="SecDFD type cannot be determined for this element.", + ordered=False, + applies_to=("operation", "type", "variable"), + allow_multi_label=True, + ) + schemes[secdfd_scheme.name] = secdfd_scheme + + return schemes diff --git a/arcana/llm_filter/filter.py b/arcana/llm_filter/filter.py index 6a03359..64304ee 100644 --- a/arcana/llm_filter/filter.py +++ b/arcana/llm_filter/filter.py @@ -10,7 +10,7 @@ group_paths_by_endpoints) from arcana.llm_filter.classification import default_classification_schemes from arcana.llm_filter.client import LLMClient -from arcana.llm_filter.processors import ComponentProcessor, InteractionProcessor, ScriptProcessor, StructureProcessor +from arcana.llm_filter.processors import ComponentProcessor, InteractionProcessor, ScriptProcessor, StructureProcessor, VariableProcessor from arcana.llm_filter.prompt import PromptBuilder from arcana.utils import (lower_first, remove_java_comments, write_jsonl) from arcanalib.graph import Edge, Graph, Node @@ -27,10 +27,18 @@ def __init__(self, config: Dict[str, Dict[str, Any]]): stereo_cfg = config.get('stereotypes') self.role_stereotypes = OrderedDict(stereo_cfg) if stereo_cfg else OrderedDict() - classifications = default_classification_schemes(self.layers, self.role_stereotypes) + self.secdfd_cfg = config.get('secdfd', {}) + self.secdfd_enabled = str(self.secdfd_cfg.get("enabled", "false")).strip().lower() in {"1", "true", "yes", "on"} + + classifications = default_classification_schemes( 
+ self.layers, + self.role_stereotypes, + secdfd_enabled=self.secdfd_enabled, + ) self.prompt_builder = PromptBuilder(config['project'], classifications) self.script_processor = ScriptProcessor(self.client, self.prompt_builder) self.structure_processor = StructureProcessor(self.client, self.prompt_builder) + self.variable_processor = VariableProcessor(self.client, self.prompt_builder, self.secdfd_cfg) self.component_processor = ComponentProcessor(self.client, self.prompt_builder) self.interaction_processor = InteractionProcessor(self.client, self.prompt_builder) @@ -44,10 +52,14 @@ def process(self, graph): # 3. process classes self.structure_processor.process_all(graph) - # 4. process packages + # 4. process variables for SecDFD (v2 variable semantics) + if self.secdfd_enabled: + self.variable_processor.process_all(graph) + + # 5. process packages self.component_processor.process_all(graph) - # 5. process interactions + # 6. process interactions self.interaction_processor.process_all(graph) return graph diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py index 539fff1..b6de7ca 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -2,6 +2,7 @@ from collections import OrderedDict from collections.abc import Iterable import logging +import re from tqdm.auto import tqdm @@ -34,20 +35,41 @@ def apply_classifications(self, graph: Graph, element: Node, description: dict, @staticmethod def _apply_classification(graph: Graph, element: Node, description: dict, scheme: ClassificationScheme): - classification = description.pop(scheme.response_key, None) - if not classification: + if scheme.allow_multi_label: + classifications = description.get(scheme.response_key, None) + else: + classifications = description.pop(scheme.response_key, None) + if not classifications: return - target = graph.find_node(label="Category", where=lambda n: n.id == scheme.category_id(classification)) - if target: - impl_edge = graph.add_edge( 
- element.id, - target.id, - "implements", - weight=1, - reason=description.get(scheme.response_reason_key), - ) - writer().write(impl_edge.to_dict()) + if isinstance(classifications, str): + classifications = [classifications] + elif isinstance(classifications, (list, tuple, set)): + classifications = list(classifications) + else: + classifications = [] + + if not scheme.allow_multi_label and classifications: + classifications = classifications[:1] + elif scheme.allow_multi_label and classifications and "primarySecdfdType" not in description: + description["primarySecdfdType"] = classifications[0] + + seen = set() + for classification in classifications: + if not classification or classification in seen: + continue + seen.add(classification) + target = graph.find_node(label="Category", where=lambda n: n.id == scheme.category_id(classification)) + if target: + impl_edge = graph.add_edge( + element.id, + target.id, + "implements", + weight=1, + reason=description.get(scheme.response_reason_key), + ) + if impl_edge: + writer().write(impl_edge.to_dict()) class ScriptProcessor(Processor): @@ -90,7 +112,7 @@ def process_one(self, graph: Graph, operation: Node, type: Node, operation_deps) 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" for node_id in operation_deps[operation.id]} op_parameters["Incoming Dependencies (Invoked By)"] = [m.properties['qualifiedName'] for m in operation.sources('invokes')] - self.add_classification_options(op_parameters, "script") + self.add_classification_options(op_parameters, "operation") prompt = self.prompt.compose(prompt, **op_parameters) @@ -98,7 +120,7 @@ def process_one(self, graph: Graph, operation: Node, type: Node, operation_deps) description = self.client.generate_json(prompt, "AnalyzeScript") - self.apply_classifications(graph, operation, description, "script") + self.apply_classifications(graph, operation, description, "operation") self.update_method_properties(graph, 
description, operation) @@ -114,7 +136,7 @@ def update_method_properties(data: Graph, description: dict, method: Node): key_lower = lower_first(key) if key_lower == 'parameters' and isinstance(value, Iterable): param_nodes = [data.nodes[edge.source] for edge in data.find_edges(label='parameterizes') if - edge.traget == method.id] + edge.target == method.id] for param in value: if isinstance(param, dict): matching_params = [node for node in param_nodes if @@ -174,7 +196,7 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): typ_parameters["Inherited By"] = [f"{t.properties['kind']} {t.properties['qualifiedName']}" for t in type.sources('specializes')] typ_parameters[f"Enclosed Variables/Fields"] = vars typ_parameters[f"Enclosed Functions/Methods"] = op_descriptions - self.add_classification_options(typ_parameters, "structure") + self.add_classification_options(typ_parameters, "type") prompt = self.prompt.compose(prompt, **typ_parameters) @@ -182,7 +204,7 @@ def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): description = self.client.generate_json(prompt, "AnalyzeStructure") - self.apply_classifications(graph, type, description, "structure") + self.apply_classifications(graph, type, description, "type") for k, v in description.items(): if not k.endswith('Reason'): @@ -230,7 +252,7 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): scp_parameters[f"{scp_kind.title()} to Analyze"] = scope.properties['qualifiedName'] scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions scp_parameters["Enclosed Classes"] = typ_descriptions - self.add_classification_options(scp_parameters, "component") + self.add_classification_options(scp_parameters, "scope") prompt = self.prompt.compose(prompt, **scp_parameters) @@ -238,7 +260,7 @@ def process_one(self, graph: Graph, scope: Node, scope_deps): description = self.client.generate_json(prompt, "AnalyzeComponent") - self.apply_classifications(graph, scope, description, 
"component") + self.apply_classifications(graph, scope, description, "scope") ComponentProcessor.update_package_properties(graph, description, scope) @@ -256,3 +278,141 @@ class InteractionProcessor(Processor): def process_all(self, graph): # compute and describe interactions between packages pass + + +class VariableProcessor(Processor): + def __init__(self, client, prompt_builder, secdfd_cfg=None): + super().__init__(client, prompt_builder) + cfg = secdfd_cfg or {} + self.label_score_threshold = float(cfg.get("label_score_threshold", 0.60)) + self.process_min_out_invokes = int(cfg.get("process_min_out_invokes", 2)) + self.process_min_in_invokes = int(cfg.get("process_min_in_invokes", 2)) + self.asset_sensitive_term_hit_min = int(cfg.get("asset_sensitive_term_hit_min", 1)) + self.datastore_crud_hit_min = int(cfg.get("datastore_crud_hit_min", 1)) + self.external_entity_max_participation = int(cfg.get("external_entity_max_participation", 2)) + self.external_keywords = set("client rest entity user customer bank".split()) + self.datastore_keywords = set("db database dao repository storage cache data record table".split()) + self.asset_keywords = set("password secret policy user document card money balance account pin token key".split()) + self.flow_keywords = set("request response payload dto input output transfer amount source target".split()) + + def process_all(self, graph: Graph): + counter = 0 + signature_counts = self.build_signature_counts(graph) + for var in tqdm(graph.find_nodes("Variable"), desc="Processing variables"): + self.process_one(graph, var, signature_counts) + check_stop() + counter += 1 + if counter == 50: + writer().flush() + counter = 0 + + def process_one(self, graph: Graph, var: Node, signature_counts: dict): + description = self.infer_secdfd(graph, var, signature_counts) + if not description: + return + self.apply_classifications(graph, var, description, "variable") + for k, v in description.items(): + if not k.endswith("Reason"): + 
graph.nodes[var.id].properties[lower_first(k)] = v + writer().write({"data": {"id": var.id, "labels": list(var.labels), "properties": description}}) + + def infer_secdfd(self, graph: Graph, var: Node, signature_counts: dict) -> dict: + var_name = str(var.properties.get("simpleName", "")).strip() + name_tokens = self.tokenize(var_name) + signature = self.variable_signature(var) + participation = len(var.targets("parameterizes")) + len(var.sources("encapsulates")) + owners = [n for n in var.sources("encapsulates") if n.has_label("Type")] + ops = [n for n in var.targets("parameterizes") if n.has_label("Operation")] + scores = { + "External Entity": 0.0, + "DataStore": 0.0, + "Process": 0.0, + "Asset": 0.0, + "Flow": 0.0, + } + evidence = [] + + external_hits = self.keyword_hits(name_tokens, self.external_keywords) + if external_hits: + scores["External Entity"] += 0.7 + evidence.append(f"external_keywords={','.join(sorted(external_hits))}") + if participation <= self.external_entity_max_participation and (ops or owners): + scores["External Entity"] += 0.2 + evidence.append("low_participation") + + datastore_hits = self.keyword_hits(name_tokens, self.datastore_keywords) + if len(datastore_hits) >= self.datastore_crud_hit_min: + scores["DataStore"] += 0.7 + evidence.append(f"datastore_keywords={','.join(sorted(datastore_hits))}") + if any(any(v in self.tokenize(op.properties.get("simpleName", "")) for v in {"save", "find", "delete", "create", "read", "update"}) for op in ops): + scores["DataStore"] += 0.2 + evidence.append("crud_related_operation") + + asset_hits = self.keyword_hits(name_tokens, self.asset_keywords) + if len(asset_hits) >= self.asset_sensitive_term_hit_min: + scores["Asset"] += 0.8 + evidence.append(f"asset_keywords={','.join(sorted(asset_hits))}") + if owners and not ops: + scores["Asset"] += 0.1 + evidence.append("field_like_variable") + + flow_hits = self.keyword_hits(name_tokens, self.flow_keywords) + if flow_hits: + scores["Flow"] += 0.4 + 
evidence.append(f"flow_keywords={','.join(sorted(flow_hits))}") + if ops: + scores["Flow"] += 0.2 + evidence.append("parameterizes_operation") + if signature_counts.get(signature, 0) > 1: + scores["Flow"] += 0.3 + evidence.append("shared_signature") + + # Variables are not typically processes, keep score near-zero unless explicitly verb-named. + if self.looks_like_verb(var_name): + scores["Process"] += 0.2 + evidence.append("verb_like_name") + if any(len(op.targets("invokes")) >= self.process_min_out_invokes or len(op.sources("invokes")) >= self.process_min_in_invokes for op in ops): + scores["Process"] += 0.3 + evidence.append("connected_to_high_interaction_operation") + + selected = [label for label, score in sorted(scores.items(), key=lambda x: x[1], reverse=True) if score >= self.label_score_threshold][:3] + if not selected: + selected = ["Undetermined"] + primary = "Undetermined" + else: + primary = selected[0] + + return { + "secdfdTypes": selected, + "primarySecdfdType": primary, + "secdfdConfidence": {k: round(v, 3) for k, v in scores.items() if v > 0}, + "secdfdEvidence": "; ".join(evidence) if evidence else "No strong SecDFD evidence found.", + } + + @staticmethod + def keyword_hits(tokens: set, keywords: set) -> set: + return {t for t in tokens if t in keywords} + + @staticmethod + def tokenize(text: str) -> set: + text = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text or "") + text = text.replace("_", " ").replace("-", " ").lower() + return {t for t in re.findall(r"[a-z0-9]+", text)} + + @staticmethod + def looks_like_verb(name: str) -> bool: + l = (name or "").lower() + return l.startswith(("get", "set", "create", "save", "find", "load", "send", "fetch", "verify")) + + @staticmethod + def variable_signature(var: Node): + name = str(var.properties.get("simpleName", "")).strip().lower() + types = tuple(sorted(t.id for t in var.targets("type"))) + return name, types + + def build_signature_counts(self, graph: Graph): + counts = {} + for var in 
graph.find_nodes("Variable"): + sig = self.variable_signature(var) + counts[sig] = counts.get(sig, 0) + 1 + return counts diff --git a/arcana/templates.py b/arcana/templates.py index 9dda8f5..99d1c06 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -80,6 +80,17 @@ "layerReason": { "type": "string", "description": "Explanation why the script fits the chosen architectural layer but not others." + }, + "secdfdTypes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "One or more SecDFD classifications selected from the provided options." + }, + "secdfdEvidence": { + "type": "string", + "description": "Short evidence summary supporting the selected SecDFD classifications." } }, "required": [ @@ -93,7 +104,8 @@ "preConditions", "returns", "stereotype", - "stereotypeReason" + "stereotypeReason", + "secdfdTypes" ], "additionalProperties": False } @@ -138,6 +150,17 @@ "layerReason": { "type": "string", "description": "Explanation why the script fits the chosen architectural layer but not others." + }, + "secdfdTypes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "One or more SecDFD classifications selected from the provided options." + }, + "secdfdEvidence": { + "type": "string", + "description": "Short evidence summary supporting the selected SecDFD classifications." } }, "required": [ @@ -146,7 +169,8 @@ "roleStereotype", "roleStereotypeReason", "layer", - "layerReason" + "layerReason", + "secdfdTypes" ] } diff --git a/config.ini.example b/config.ini.example index 574dd1a..83f28ad 100644 --- a/config.ini.example +++ b/config.ini.example @@ -23,3 +23,12 @@ layer2name=Logic layer2desc=Handles application and domain logic, i.e., neither UI nor data access. layer3name=Data layer3desc=Handles loading and storing data from/to external services, including database systems, web services, filesystems, hardware, etc. 
+ +[secdfd] +enabled=false +label_score_threshold=0.60 +process_min_out_invokes=2 +process_min_in_invokes=2 +external_entity_max_participation=2 +datastore_crud_hit_min=1 +asset_sensitive_term_hit_min=1 diff --git a/legacy/arvisaninator.ipynb b/legacy/arvisaninator.ipynb index f5c1b66..12af60d 100644 --- a/legacy/arvisaninator.ipynb +++ b/legacy/arvisaninator.ipynb @@ -75,10 +75,10 @@ "metadata": {}, "outputs": [], "source": [ - "config = read_ini_file('config.ini')\n", + "config = read_ini_file('../config.ini')\n", "project_name = config['project']['name']\n", "project_desc = config['project']['desc']\n", - "ifile = config['project']['input']\n", + "ifile = config['project']['output']\n", "(project_name,project_desc,ifile)" ] }, @@ -371,15 +371,15 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "from arcanalib import lift\n", "\n", "edges_calls = edges['calls'] if 'calls' in edges else lift(edges['hasScript'], edges['invokes'])" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", @@ -553,7 +553,7 @@ "outputs": [], "source": [ "# (\"id:ID\",\":LABEL\",\"fullName\",\"simpleName\",\"color\",\"dependencyProfileCategory\",\"cohesion\")\n", - "modules = [(id, 'Module', id, node['properties']['simpleName'], roleStereotypeColors[node['properties'].get('roleStereotype', 'Unknown')], dependencyProfiles.get(id, None), None)\n", + "modules = [(id, 'Module', id, node['properties']['simpleName'], roleStereotypeColors[node['properties'].get('roleStereotype', 'Unknown')], node['properties'].get('dependencyProfile', None), None)\n", " for id,node in nodes.items() if 'Structure' in node['labels'] and id != 'java.lang.String']\n", "\n", "modules" From a6b90e95396eadbec7efc625bf722f086422585e Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 28 Apr 2026 13:19:09 +0700 Subject: [PATCH 26/34] Add pydantic>=2.0 and bump openai to >=1.50 Co-Authored-By: Claude Sonnet 4.6 --- 
requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 40a1599..be810f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ -openai>=1.32 -tqdm>=4.66 \ No newline at end of file +openai>=1.50 +pydantic>=2.0 +tqdm>=4.66 From 31d9edf2778fa5d5210ecffe97b5b00b1c472cf2 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 28 Apr 2026 13:19:21 +0700 Subject: [PATCH 27/34] Replace hand-maintained JSON schema dicts with Pydantic v2 models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ScriptDescription, StructureDescription, ComponentDescription are now Pydantic BaseModels. Tool dicts for the legacy tool-calling path are auto-generated via model_json_schema(), eliminating the risk of schema drift. TOOL_MODELS maps tool name → model class for the structured-output client path. Co-Authored-By: Claude Sonnet 4.6 --- arcana/templates.py | 318 +++++++++++++------------------------------- 1 file changed, 92 insertions(+), 226 deletions(-) diff --git a/arcana/templates.py b/arcana/templates.py index 99d1c06..ac3ea02 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -1,235 +1,101 @@ -script_description = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "ScriptDescription", - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "One-sentence description – suitable for a documentation comment – of the method/constructor/function functionality, in imperative mood." - }, - "parameters": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "description": "Parameter name." - }, - "type": { - "type": "string", - "description": "Parameter type." - }, - "description": { - "type": "string", - "description": "Brief description of the parameter." 
- } - }, - "required": [ - "name", - "description" - ] - }, - "description": "List of script parameters. Empty if none." - }, - "returns": { - "type": "string", - "description": "One-sentence description of the returned object or value. For constructors, consider the newly created instance as the return." - }, - "howToUse": { - "type": "string", - "description": "Usage instructions in less than three sentences." - }, - "howItWorks": { - "type": "string", - "description": "Implementation details in less than five sentences." - }, - "preConditions": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of pre-conditions for the script." - }, - "postConditions": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of post-conditions for the script." - }, - "stereotype": { - "type": "string", - "enum": [ - "Accessor", - "Mutator", - "Creational", - "Collaborational", - "Other" - ], - "description": "Design stereotype of the script." - }, - "stereotypeReason": { - "type": "string", - "description": "One-sentence explanation for the chosen stereotype." - }, - "layer": { - "type": "string", - "description": "Architectural layer classification selected from the provided options." - }, - "layerReason": { - "type": "string", - "description": "Explanation why the script fits the chosen architectural layer but not others." - }, - "secdfdTypes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "One or more SecDFD classifications selected from the provided options." - }, - "secdfdEvidence": { - "type": "string", - "description": "Short evidence summary supporting the selected SecDFD classifications." 
- } - }, - "required": [ - "description", - "howItWorks", - "howToUse", - "layer", - "layerReason", - "parameters", - "postConditions", - "preConditions", - "returns", - "stereotype", - "stereotypeReason", - "secdfdTypes" - ], - "additionalProperties": False -} +from typing import Literal +from pydantic import BaseModel, Field -analyze_script_tool = { - "type": "function", - "function": { - "name": "AnalyzeScript", - "description": "Analyzes a program method/constructor/function given its source code and context. Returns an explanation covering functionality, parameters, return value, design rationale, usage, implementation details, assertions, stereotype, and architectural layer classification.", - "parameters": script_description - } -} -structure_description = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "StructureDescription", - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Up to three sentences, suitable for a documentation comment, describing the key responsibilities of the class/struct/type." - }, - "keywords": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of important keywords related to the key responsibilities of the class/struct/type." - }, - "roleStereotype": { - "type": "string", - "description": "Role stereotype of the class/struct/type; options are supplied at runtime." - }, - "roleStereotypeReason": { - "type": "string", - "description": "One-sentence explanation for the chosen role stereotype." - }, - "layer": { - "type": "string", - "description": "Architectural layer classification selected from the provided options." - }, - "layerReason": { - "type": "string", - "description": "Explanation why the script fits the chosen architectural layer but not others." - }, - "secdfdTypes": { - "type": "array", - "items": { - "type": "string" - }, - "description": "One or more SecDFD classifications selected from the provided options." 
- }, - "secdfdEvidence": { - "type": "string", - "description": "Short evidence summary supporting the selected SecDFD classifications." - } - }, - "required": [ - "description", - "keywords", - "roleStereotype", - "roleStereotypeReason", - "layer", - "layerReason", - "secdfdTypes" - ] -} +# --------------------------------------------------------------------------- +# Output models +# --------------------------------------------------------------------------- -analyze_structure_tool = { - "type": "function", - "function": { - "name": "AnalyzeStructure", - "description": "Analyzes a software class/struct/type based on its inheritance, fields, and methods. Returns an explanation covering the key responsibilities of the structure, relevant keywords, role stereotype, and rationale for the chosen stereotype.", - "parameters": structure_description - } -} +class Parameter(BaseModel): + name: str + type: str = "" + description: str -component_description = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "ComponentDescription", - "type": "object", - "properties": { - "description": { - "type": "string", - "description": "Describe the functionality of the component/package in up to five sentences." - }, - "title": { - "type": "string", - "description": "A noun phrase that describes the component/package." - }, - "keywords": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of important keywords related to the core functionalities of the component/package." - }, - "layer": { - "type": "string", - "description": "Architectural layer classification selected from the provided options." - }, - "layerReason": { - "type": "string", - "description": "Explanation why the component/package fits the chosen layer but not others." 
- } - }, - "required": [ - "description", - "title", - "keywords", - "layer", - "layerReason" - ] -} -analyze_component_tool = { - "type": "function", - "function": { - "name": "AnalyzeComponent", - "description": "Analyzes a software component/package by examining its contents. Returns an explanation including a description of component/package responsibility, a descriptive title, a list of keywords, the selected architectural layer, and the rationale for that layer.", - "parameters": component_description - } +class ScriptDescription(BaseModel): + description: str = Field(description="One-sentence description of the method/constructor/function functionality, in imperative mood.") + parameters: list[Parameter] = Field(default_factory=list, description="List of parameters. Empty if none.") + returns: str = Field(description="One-sentence description of the returned value. For constructors, describe the created instance.") + howToUse: str = Field(description="Usage instructions in less than three sentences.") + howItWorks: str = Field(description="Implementation details in less than five sentences.") + preConditions: list[str] = Field(default_factory=list, description="Pre-conditions for the script.") + postConditions: list[str] = Field(default_factory=list, description="Post-conditions for the script.") + stereotype: Literal["Accessor", "Mutator", "Creational", "Collaborational", "Other"] = Field(description="Design stereotype.") + stereotypeReason: str = Field(description="One-sentence explanation for the chosen stereotype.") + layer: str = Field(description="Architectural layer selected from the provided options.") + layerReason: str = Field(description="Explanation why this fits the chosen layer but not others.") + secdfdTypes: list[str] = Field(default_factory=list, description="One or more SecDFD classifications from the provided options.") + secdfdEvidence: str = Field(default="", description="Short evidence summary for the SecDFD classifications.") + + 
+class StructureDescription(BaseModel): + description: str = Field(description="Up to three sentences describing the key responsibilities of the class/struct/type.") + keywords: list[str] = Field(default_factory=list, description="Important keywords related to key responsibilities.") + roleStereotype: str = Field(description="Role stereotype; options are supplied at runtime.") + roleStereotypeReason: str = Field(description="One-sentence explanation for the chosen role stereotype.") + layer: str = Field(description="Architectural layer selected from the provided options.") + layerReason: str = Field(description="Explanation why this fits the chosen layer but not others.") + secdfdTypes: list[str] = Field(default_factory=list, description="One or more SecDFD classifications from the provided options.") + secdfdEvidence: str = Field(default="", description="Short evidence summary for the SecDFD classifications.") + + +class ComponentDescription(BaseModel): + description: str = Field(description="Describe the functionality of the component/package in up to five sentences.") + title: str = Field(description="A noun phrase describing the component/package.") + keywords: list[str] = Field(default_factory=list, description="Important keywords related to the core functionalities.") + layer: str = Field(description="Architectural layer selected from the provided options.") + layerReason: str = Field(description="Explanation why this fits the chosen layer but not others.") + + +# --------------------------------------------------------------------------- +# Tool dicts (OpenAI function-calling format, derived from Pydantic schemas) +# Used as fallback when use_structured_output = false. 
+# --------------------------------------------------------------------------- + +def _tool(name: str, description: str, model: type[BaseModel]) -> dict: + schema = model.model_json_schema() + # Pydantic v2 may emit $defs for nested models; OpenAI function-calling + # accepts these inline definitions without issue. + return { + "type": "function", + "function": { + "name": name, + "description": description, + "parameters": schema, + }, + } + + +analyze_script_tool = _tool( + "AnalyzeScript", + "Analyzes a program method/constructor/function given its source code and context.", + ScriptDescription, +) + +analyze_structure_tool = _tool( + "AnalyzeStructure", + "Analyzes a software class/struct/type based on its inheritance, fields, and methods.", + StructureDescription, +) + +analyze_component_tool = _tool( + "AnalyzeComponent", + "Analyzes a software component/package by examining its contents.", + ComponentDescription, +) + +# Map tool name → Pydantic model (used by the structured-output client path). +TOOL_MODELS: dict[str, type[BaseModel]] = { + "AnalyzeScript": ScriptDescription, + "AnalyzeStructure": StructureDescription, + "AnalyzeComponent": ComponentDescription, } + +# --------------------------------------------------------------------------- +# Interaction analysis prompt template (unchanged) +# --------------------------------------------------------------------------- + interaction_analysis = '''## Input: Consider a project {project_name}, {project_desc}. @@ -253,7 +119,7 @@ - The purpose and nature of their dependency in terms of design. - An abstract, high-level description of the relationship without referencing specific classes or methods. - + ## Output: Provide a cohesive explanation of the interaction in one to two sentences. 
Keep the response plain text.''' From d52d99ea7343688ef2265cc2dddc9ecd2e79ea7a Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 28 Apr 2026 13:19:33 +0700 Subject: [PATCH 28/34] Switch LLM client to structured outputs with retry Primary path uses OpenAI json_schema strict mode and validates responses with Pydantic model_validate_json(). Falls back to the legacy tool-calling path if the provider does not support structured outputs (controlled by [llm] use_structured_output). Both paths retry up to 3 times with exponential backoff (2s/4s/8s) before giving up. Co-Authored-By: Claude Sonnet 4.6 --- arcana/llm_filter/client.py | 199 ++++++++++++++++++++++++------------ 1 file changed, 132 insertions(+), 67 deletions(-) diff --git a/arcana/llm_filter/client.py b/arcana/llm_filter/client.py index 1c8aa48..e64d2c0 100644 --- a/arcana/llm_filter/client.py +++ b/arcana/llm_filter/client.py @@ -1,72 +1,137 @@ -from openai import OpenAI -import json, sys +import json +import sys +import time +import logging + +from pydantic import BaseModel from arcana import templates from arcana.utils import find_first_valid_json +logger = logging.getLogger(__name__) + +_RETRY_DELAYS = (2, 4, 8) + + class LLMClient: - def __init__(self, llm_cfg, project_cfg): - self.client = OpenAI(api_key=llm_cfg['apikey'], base_url=llm_cfg.get('apibase')) - self.model = llm_cfg.get('model', 'gpt-4o-mini') - self.timeout = float(llm_cfg.get('timeout', 300)) - - def generate_json(self, prompt, tool): - """Generate a description using the OpenAI client.""" - try: - if tool: - response = self.client.chat.completions.create( - model=self.model, - messages=[ - {"role": "system", "content": "You are a tool for analyzing software architecture of code implementations."}, - {"role": "user", "content": prompt}], - tools=[templates.analyze_script_tool, - templates.analyze_structure_tool, - templates.analyze_component_tool], - tool_choice="required", temperature=0, seed=42, - timeout=self.timeout) - - 
tool_calls = response.choices[0].message.tool_calls - - if tool_calls: - args_str = tool_calls[0].function.arguments - description = json.loads(args_str) - else: - content = response.choices[0].message.content - json_content = find_first_valid_json(content) - if json_content: - description = json.loads(json_content) - else: - description = dict() - - else: - response = self.client.chat.completions.create( - model=self.model, - response_format={"type": "json_object"}, - messages=[ - {"role": "system", "content": "You are an expert in analyzing software architecture of code implementations."}, - {"role": "user", "content": prompt}], - max_tokens=4096, temperature=0, seed=42, - timeout=self.timeout) - - content = response.choices[0].message.content - description = json.loads(content) - except Exception as e: - sys.stderr.write(f"Generate JSON description error: {e}") - description = {} - - if 'description' not in description: - description['description'] = "(no description)" - return description - - def generate_text(self, prompt): - try: - response = self.client.chat.completions.create(model=self.model, - messages=[{"role": "user", "content": prompt}], - max_tokens=4096, temperature=0, seed=42, - timeout=float(self.config['llm'].get('timeout', 300))) - description = response.choices[0].message.content - except Exception as e: - sys.stderr.write(f"Generate text description error: {e}") - description = "(no description)" - - return description \ No newline at end of file + def __init__(self, llm_cfg, project_cfg): + from openai import OpenAI + self.client = OpenAI(api_key=llm_cfg['apikey'], base_url=llm_cfg.get('apibase')) + self.model = llm_cfg.get('model', 'gpt-4o-mini') + self.timeout = float(llm_cfg.get('timeout', 300)) + self.use_structured_output = str(llm_cfg.get('use_structured_output', 'true')).strip().lower() in {'1', 'true', 'yes', 'on'} + + # ------------------------------------------------------------------ + # Public interface + # 
------------------------------------------------------------------ + + def generate_json(self, prompt: str, tool: str) -> dict: + """Generate a structured description. + + `tool` is the tool name string (e.g. "AnalyzeScript"). The method tries + the modern structured-output path first (if enabled) and falls back to + the legacy tool-calling path on failure or when disabled. + """ + model_class = templates.TOOL_MODELS.get(tool) + + if self.use_structured_output and model_class: + result = self._generate_structured(prompt, tool, model_class) + else: + result = self._generate_tool_call(prompt) + + if 'description' not in result: + result['description'] = "(no description)" + return result + + def generate_text(self, prompt: str) -> str: + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096, temperature=0, seed=42, + timeout=self.timeout, + ) + return response.choices[0].message.content + except Exception as e: + sys.stderr.write(f"Generate text error: {e}\n") + return "(no description)" + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _generate_structured(self, prompt: str, tool_name: str, model_class: type[BaseModel]) -> dict: + """Use OpenAI structured outputs (json_schema strict mode).""" + schema = model_class.model_json_schema() + # Remove $schema key if present — not accepted by the API + schema.pop("$schema", None) + + for attempt, delay in enumerate((*_RETRY_DELAYS, None)): + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": "You are a tool for analyzing software architecture of code implementations."}, + {"role": "user", "content": prompt}, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": tool_name, + "schema": schema, + "strict": True, + }, + }, + 
temperature=0, seed=42, + timeout=self.timeout, + ) + content = response.choices[0].message.content + instance = model_class.model_validate_json(content) + return instance.model_dump() + except Exception as e: + if delay is None: + logger.warning("Structured output failed after retries (%s); falling back to tool-calling.", e) + return self._generate_tool_call(prompt) + logger.warning("Structured output attempt %d failed (%s); retrying in %ds.", attempt + 1, e, delay) + time.sleep(delay) + + return {} # unreachable + + def _generate_tool_call(self, prompt: str) -> dict: + """Legacy path: OpenAI tool-calling to constrain output format.""" + for attempt, delay in enumerate((*_RETRY_DELAYS, None)): + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": "You are a tool for analyzing software architecture of code implementations."}, + {"role": "user", "content": prompt}, + ], + tools=[ + templates.analyze_script_tool, + templates.analyze_structure_tool, + templates.analyze_component_tool, + ], + tool_choice="required", + temperature=0, seed=42, + timeout=self.timeout, + ) + + tool_calls = response.choices[0].message.tool_calls + if tool_calls: + return json.loads(tool_calls[0].function.arguments) + + content = response.choices[0].message.content + json_content = find_first_valid_json(content) + if json_content: + return json.loads(json_content) + return {} + + except Exception as e: + if delay is None: + sys.stderr.write(f"Generate JSON (tool-call) error: {e}\n") + return {} + logger.warning("Tool-call attempt %d failed (%s); retrying in %ds.", attempt + 1, e, delay) + time.sleep(delay) + + return {} # unreachable From 65f0fd1e52b93277ba20ccd67954560cc603436b Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 28 Apr 2026 13:20:37 +0700 Subject: [PATCH 29/34] Add resumable checkpoint support load_checkpoint() reads a JSONL checkpoint file and applies node property updates and edges back into a 
graph, so a resumed run skips already-processed nodes via the existing per-node description guards in processors. configure_writer() sets a stable checkpoint path and open mode: 'w' for fresh runs (overwrite), 'a' for resume runs (append). Writer calls are now lock-protected for concurrent processor access. Co-Authored-By: Claude Sonnet 4.6 --- arcana/checkpoint.py | 119 +++++++++++++++++++++++++++++++++---------- 1 file changed, 93 insertions(+), 26 deletions(-) diff --git a/arcana/checkpoint.py b/arcana/checkpoint.py index df7ee8a..75c8a44 100644 --- a/arcana/checkpoint.py +++ b/arcana/checkpoint.py @@ -1,34 +1,101 @@ -# checkpoint.py import json +import logging from threading import Lock from arcana.custom_encoder import CustomJSONEncoder +logger = logging.getLogger(__name__) + + class JSONLWriter: - _instance = None - _lock = Lock() - - def __new__(cls, path): - with cls._lock: - if cls._instance is None: - inst = super().__new__(cls) - inst._file = open(path, "a", buffering=1) - cls._instance = inst - return cls._instance - - def write(self, data: dict): - self._file.write(json.dumps(data, cls=CustomJSONEncoder) + "\n") - - def flush(self): - try: - self._file.flush() - finally: - pass + _instance = None + _lock = Lock() + + def __new__(cls, path: str, append: bool = False): + with cls._lock: + if cls._instance is None: + inst = super().__new__(cls) + mode = "a" if append else "w" + inst._file = open(path, mode, buffering=1) + cls._instance = inst + return cls._instance + + def write(self, data: dict): + with self._lock: + self._file.write(json.dumps(data, cls=CustomJSONEncoder) + "\n") + + def flush(self): + with self._lock: + try: + self._file.flush() + except Exception: + pass + + +_writer_path: str = "checkpoints.jsonl" +_writer_append: bool = False + + +def configure_writer(path: str, append: bool = False): + """Call once before the first writer() use to set path and open mode.""" + global _writer_path, _writer_append + _writer_path = path + 
_writer_append = append + def writer(path=None): - """ - If path is provided and writer not yet instantiated, uses it. - Otherwise, falls back to default. - """ - effective = path or "checkpoints.jsonl" - return JSONLWriter(effective) + effective = path or _writer_path + return JSONLWriter(effective, _writer_append) + + +# --------------------------------------------------------------------------- +# Checkpoint loading (for resume) +# --------------------------------------------------------------------------- + +def load_checkpoint(path: str, graph): + """Load a JSONL checkpoint file into *graph*, applying node properties and edges. + + Handles both node entries {"data": {"id": ..., "labels": [...], "properties": {...}}} + and edge entries {"data": {"id": ..., "source": ..., "target": ..., "label": ..., "properties": {...}}}. + """ + loaded_nodes = 0 + loaded_edges = 0 + try: + with open(path, encoding="utf-8") as f: + for lineno, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + entry = json.loads(line).get('data', {}) + except json.JSONDecodeError as e: + logger.warning("Skipping malformed checkpoint line %d: %s", lineno, e) + continue + + eid = entry.get('id') + if not eid: + continue + + if 'source' in entry and 'target' in entry and 'label' in entry: + # Edge entry + graph.add_edge( + entry['source'], + entry['target'], + entry['label'], + **entry.get('properties', {}), + ) + loaded_edges += 1 + else: + # Node entry + props = entry.get('properties', {}) + if eid in graph.nodes: + graph.nodes[eid].properties.update(props) + else: + graph.add_node(eid, *entry.get('labels', []), **props) + loaded_nodes += 1 + + except FileNotFoundError: + logger.warning("Checkpoint file not found: %s", path) + return + + logger.info("Loaded checkpoint %s: %d nodes, %d edges.", path, loaded_nodes, loaded_edges) From 8dd08abb97a6c7a4d9e22756b5a6903f27380c2b Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 28 Apr 2026 13:22:00 +0700 
Subject: [PATCH 30/34] Replace topo-sort loop with futures DAG executor; add override/sibling context Execution model: NodeExecutor submits all nodes to a ThreadPoolExecutor immediately. Each worker waits only on the futures of its direct dependencies before making its LLM call, giving finer-grained parallelism than level-by-level topological batching. Workers is configurable via [llm] workers (default 8). Override/overload context: ScriptProcessor collects all methods sharing the same simpleName in the enclosing class (overloads) and ancestor classes (overrides/hides). When present, these are injected into the prompt with an explicit instruction to differentiate implementations while keeping terminology consistent. Sibling-aware class processing: StructureProcessor collects already-processed sibling classes (sharing the same parent) and passes them as context with an instruction to maintain terminological consistency while differentiating responsibilities. Also strengthens the parent-class context instruction and fixes a "Inhertis" typo. 
Co-Authored-By: Claude Sonnet 4.6 --- arcana/llm_filter/processors.py | 1015 +++++++++++++++++++------------ 1 file changed, 631 insertions(+), 384 deletions(-) diff --git a/arcana/llm_filter/processors.py b/arcana/llm_filter/processors.py index b6de7ca..fbf407a 100644 --- a/arcana/llm_filter/processors.py +++ b/arcana/llm_filter/processors.py @@ -1,8 +1,10 @@ from abc import ABC, abstractmethod from collections import OrderedDict from collections.abc import Iterable +from concurrent.futures import Future, ThreadPoolExecutor, wait as futures_wait import logging import re +import threading from tqdm.auto import tqdm @@ -16,403 +18,648 @@ logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Futures-based DAG executor +# --------------------------------------------------------------------------- + +class NodeExecutor: + """Executes graph nodes concurrently while honouring dependency ordering. + + Each node is submitted once. Before running its own processing function a + worker thread waits for the futures of all direct dependencies to finish. + This is equivalent to a topological sort but without a global pre-pass and + with finer-grained parallelism: a node starts as soon as *its own* deps are + ready rather than waiting for an entire topo-level to drain. + + Starvation note: if max_workers < longest dependency chain length every + thread could block and make no progress. Default of 8 is safe for typical + Java inheritance depths. Users can raise it in config if needed. 
+ """ + + def __init__(self, max_workers: int = 8): + self._pool = ThreadPoolExecutor(max_workers=max_workers) + self._futures: dict[str, Future] = {} + self._lock = threading.Lock() + self._completed = 0 + self._completed_lock = threading.Lock() + + def submit(self, node_id: str, dep_ids: list[str], fn) -> Future: + with self._lock: + if node_id in self._futures: + return self._futures[node_id] + # Snapshot dep futures while holding the lock so they cannot + # disappear between the check and the submit. + dep_futures = [self._futures[d] for d in dep_ids if d in self._futures] + f = self._pool.submit(self._run, dep_futures, fn, node_id) + self._futures[node_id] = f + return f + + def wait_all(self): + with self._lock: + all_futures = list(self._futures.values()) + futures_wait(all_futures) + + def shutdown(self): + self._pool.shutdown(wait=True) + + def _run(self, dep_futures: list[Future], fn, node_id: str): + for df in dep_futures: + df.result() # block until dependency is processed + fn(node_id) + with self._completed_lock: + self._completed += 1 + + @property + def completed(self) -> int: + with self._completed_lock: + return self._completed + + +# --------------------------------------------------------------------------- +# Base processor +# --------------------------------------------------------------------------- + class Processor(ABC): - def __init__(self, client, prompt_builder): - self.client: LLMClient = client - self.prompt: PromptBuilder = prompt_builder - - @abstractmethod - def process_all(self, graph): - raise NotImplementedError - - def add_classification_options(self, parameters: OrderedDict, element_kind: str): - for scheme in self.prompt.classification_schemes(element_kind): - parameters[scheme.prompt_label] = scheme.options_with_undetermined() - - def apply_classifications(self, graph: Graph, element: Node, description: dict, element_kind: str): - for scheme in self.prompt.classification_schemes(element_kind): - 
self._apply_classification(graph, element, description, scheme) - - @staticmethod - def _apply_classification(graph: Graph, element: Node, description: dict, scheme: ClassificationScheme): - if scheme.allow_multi_label: - classifications = description.get(scheme.response_key, None) - else: - classifications = description.pop(scheme.response_key, None) - if not classifications: - return - - if isinstance(classifications, str): - classifications = [classifications] - elif isinstance(classifications, (list, tuple, set)): - classifications = list(classifications) - else: - classifications = [] - - if not scheme.allow_multi_label and classifications: - classifications = classifications[:1] - elif scheme.allow_multi_label and classifications and "primarySecdfdType" not in description: - description["primarySecdfdType"] = classifications[0] - - seen = set() - for classification in classifications: - if not classification or classification in seen: - continue - seen.add(classification) - target = graph.find_node(label="Category", where=lambda n: n.id == scheme.category_id(classification)) - if target: - impl_edge = graph.add_edge( - element.id, - target.id, - "implements", - weight=1, - reason=description.get(scheme.response_reason_key), - ) - if impl_edge: - writer().write(impl_edge.to_dict()) + def __init__(self, client: LLMClient, prompt_builder: PromptBuilder, max_workers: int = 8): + self.client = client + self.prompt = prompt_builder + self.max_workers = max_workers + + @abstractmethod + def process_all(self, graph: Graph): + raise NotImplementedError + + def add_classification_options(self, parameters: OrderedDict, element_kind: str): + for scheme in self.prompt.classification_schemes(element_kind): + parameters[scheme.prompt_label] = scheme.options_with_undetermined() + + def apply_classifications(self, graph: Graph, element: Node, description: dict, element_kind: str): + for scheme in self.prompt.classification_schemes(element_kind): + 
self._apply_classification(graph, element, description, scheme) + + @staticmethod + def _apply_classification(graph: Graph, element: Node, description: dict, scheme: ClassificationScheme): + if scheme.allow_multi_label: + classifications = description.get(scheme.response_key, None) + else: + classifications = description.pop(scheme.response_key, None) + if not classifications: + return + + if isinstance(classifications, str): + classifications = [classifications] + elif isinstance(classifications, (list, tuple, set)): + classifications = list(classifications) + else: + classifications = [] + + if not scheme.allow_multi_label and classifications: + classifications = classifications[:1] + elif scheme.allow_multi_label and classifications and "primarySecdfdType" not in description: + description["primarySecdfdType"] = classifications[0] + + seen = set() + for classification in classifications: + if not classification or classification in seen: + continue + seen.add(classification) + target = graph.find_node(label="Category", where=lambda n: n.id == scheme.category_id(classification)) + if target: + impl_edge = graph.add_edge( + element.id, + target.id, + "implements", + weight=1, + reason=description.get(scheme.response_reason_key), + ) + if impl_edge: + writer().write(impl_edge.to_dict()) + + +# --------------------------------------------------------------------------- +# Helper: same-name method family (overloads in same class + overrides in ancestors) +# --------------------------------------------------------------------------- + +def collect_same_name_family( + graph: Graph, + method: Node, + enclosing_type: Node, + type_ancestor_ids: list[str], +) -> dict[str, str]: + """Return qualifiedName → describe() for related methods sharing the same simpleName. + + Covers: + - Overloads: same class, same simpleName, different node id. + - Overrides/hides: any ancestor class method with the same simpleName. 
+ + Only includes methods that already have a description so the context is + always meaningful. + """ + name = method.properties.get('simpleName', '') + family: dict[str, str] = {} + + for op in enclosing_type.targets('encapsulates'): + if (op.has_label('Operation') + and op.id != method.id + and op.properties.get('simpleName') == name + and 'description' in op.properties): + family[op.properties['qualifiedName']] = describe(op, 'description', 'returns', 'howItWorks') + + for ancestor_id in type_ancestor_ids: + ancestor = graph.nodes.get(ancestor_id) + if not ancestor: + continue + for op in ancestor.targets('encapsulates'): + if (op.has_label('Operation') + and op.properties.get('simpleName') == name + and 'description' in op.properties): + family[op.properties['qualifiedName']] = describe(op, 'description', 'returns', 'howItWorks') + + return family + + +# --------------------------------------------------------------------------- +# ScriptProcessor +# --------------------------------------------------------------------------- class ScriptProcessor(Processor): - - def process_all(self, graph: Graph): - sorted_method_ids, method_deps = Graph.toposorted_nodes(graph.find_edges(label='invokes'), graph.find_nodes('Operation')) - counter = 0 - # logger.debug(sorted_method_ids) - # logger.debug(method_deps) - - for met_id in tqdm(sorted_method_ids, desc='Processing methods'): - method: Node = graph.nodes[met_id] - clasz: Node = [n for n in method.sources('encapsulates') if n.has_label('Type')][0] - self.process_one(graph, method, clasz, method_deps) - - check_stop() - - counter += 1 - if counter == 10: - writer().flush() - counter %= 10 - - def process_one(self, graph: Graph, operation: Node, type: Node, operation_deps): - if 'description' not in operation.properties or not operation.properties['description'] or operation.properties[ - 'description'] == "(no description)": - op_name = operation.properties['simpleName'] - op_src = 
remove_java_comments(operation.properties['sourceText']) - op_kind = operation.properties.get('kind', 'function') - - typ_name = type.properties['qualifiedName'] - typ_kind = type.properties['kind'] - typ_kind = 'enum' if typ_kind == 'enumeration' else 'abstract class' if typ_kind == 'abstract' else typ_kind - - prompt = f"Describe the following {op_kind} by using the AnalyzeScript tool.\n\n" - op_parameters = OrderedDict() - op_parameters["Project Name"] = self.prompt.project_name - op_parameters["Project Description"] = self.prompt.project_desc - op_parameters[f"{op_kind.title()} to Analyze"] = f"`{op_name}` from the {typ_kind} `{typ_name}`." - op_parameters[f"{op_kind.title()} Source Code"] = op_src - op_parameters["Outgoing Dependencies (Invokes)"] = {graph.nodes[node_id].properties[ - 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" - for node_id in operation_deps[operation.id]} - op_parameters["Incoming Dependencies (Invoked By)"] = [m.properties['qualifiedName'] for m in operation.sources('invokes')] - self.add_classification_options(op_parameters, "operation") - - prompt = self.prompt.compose(prompt, **op_parameters) - - logger.debug(prompt) - - description = self.client.generate_json(prompt, "AnalyzeScript") - - self.apply_classifications(graph, operation, description, "operation") - - self.update_method_properties(graph, description, operation) - - writer().write({'data': {'id': operation.id, 'labels': operation.labels, 'properties': description}}) - - @staticmethod - def update_method_properties(data: Graph, description: dict, method: Node): - """Update method properties with the generated description.""" - - for key, value in description.items(): - if key.endswith('Reason'): - continue - key_lower = lower_first(key) - if key_lower == 'parameters' and isinstance(value, Iterable): - param_nodes = [data.nodes[edge.source] for edge in data.find_edges(label='parameterizes') if - edge.target == 
method.id] - for param in value: - if isinstance(param, dict): - matching_params = [node for node in param_nodes if - node.properties['simpleName'] == param.get('name')] - if matching_params: - param_node_id = matching_params[0].id - if param_node_id in data.nodes: - data.nodes[param_node_id].properties['description'] = param.get('description') - # elif key_lower == 'returns': - # method.properties['returns'] = value.get('description', None) if value and hasattr(value, 'get') else None - else: - data.nodes[method.id].properties[key_lower] = value + def __init__(self, client, prompt_builder, max_workers=8): + super().__init__(client, prompt_builder, max_workers) + # Populated by LLMFilter before process_all is called. + self.type_ancestor_ids: dict[str, list[str]] = {} + + def process_all(self, graph: Graph): + methods = graph.find_nodes('Operation') + invokes_edges = graph.find_edges(label='invokes') + + # Build a dep map: method_id → [method_id, ...] of methods it invokes. + invokes_map: dict[str, list[str]] = {m.id: [] for m in methods} + for edge in invokes_edges: + if edge.source in invokes_map: + invokes_map[edge.source].append(edge.target) + + executor = NodeExecutor(max_workers=self.max_workers) + total = len(methods) + + def process_fn(met_id: str): + method = graph.nodes[met_id] + enclosers = [n for n in method.sources('encapsulates') if n.has_label('Type')] + if not enclosers: + return + clasz = enclosers[0] + self.process_one(graph, method, clasz) + check_stop() + + for method in methods: + executor.submit(method.id, invokes_map.get(method.id, []), process_fn) + + with tqdm(total=total, desc='Processing methods') as pbar: + last = 0 + while True: + done = executor.completed + if done > last: + pbar.update(done - last) + last = done + if done >= total: + break + import time; time.sleep(0.1) + + executor.wait_all() + executor.shutdown() + writer().flush() + + def process_one(self, graph: Graph, operation: Node, type: Node): + if 
(operation.properties.get('description') + and operation.properties['description'] != "(no description)"): + return + + op_name = operation.properties['simpleName'] + op_src = remove_java_comments(operation.properties.get('sourceText', '')) + op_kind = operation.properties.get('kind', 'function') + + typ_name = type.properties['qualifiedName'] + typ_kind = type.properties.get('kind', 'class') + typ_kind = 'enum' if typ_kind == 'enumeration' else 'abstract class' if typ_kind == 'abstract' else typ_kind + + ancestor_ids = self.type_ancestor_ids.get(type.id, []) + family = collect_same_name_family(graph, operation, type, ancestor_ids) + + base_instruction = f"Describe the following {op_kind} by using the AnalyzeScript tool.\n\n" + if family: + base_instruction = ( + "Related methods sharing the same name are listed below under " + "\"Related Methods\". If this overrides or hides a parent-class method, " + "clearly differentiate what this implementation changes or adds. " + "If this overloads another method in the same class, ensure descriptions " + "are consistent but distinguish the parameter/use-case differences.\n\n" + + base_instruction + ) + + op_parameters = OrderedDict() + op_parameters["Project Name"] = self.prompt.project_name + op_parameters["Project Description"] = self.prompt.project_desc + op_parameters[f"{op_kind.title()} to Analyze"] = f"`{op_name}` from the {typ_kind} `{typ_name}`." 
+ op_parameters[f"{op_kind.title()} Source Code"] = op_src + + invoked_ids = [e.target for e in graph.find_edges(label='invokes') if e.source == operation.id] + op_parameters["Outgoing Dependencies (Invokes)"] = { + graph.nodes[nid].properties['qualifiedName']: describe(graph.nodes[nid], 'description', 'returns', 'howToUse', 'docComment') + for nid in invoked_ids if nid in graph.nodes + } + op_parameters["Incoming Dependencies (Invoked By)"] = [ + m.properties['qualifiedName'] for m in operation.sources('invokes') + ] + if family: + op_parameters["Related Methods (same name — overloads / overrides)"] = family + + self.add_classification_options(op_parameters, "operation") + + prompt = self.prompt.compose(base_instruction, **op_parameters) + logger.debug(prompt) + + description = self.client.generate_json(prompt, "AnalyzeScript") + self.apply_classifications(graph, operation, description, "operation") + self.update_method_properties(graph, description, operation) + writer().write({'data': {'id': operation.id, 'labels': list(operation.labels), 'properties': description}}) + + @staticmethod + def update_method_properties(data: Graph, description: dict, method: Node): + for key, value in description.items(): + if key.endswith('Reason'): + continue + key_lower = lower_first(key) + if key_lower == 'parameters' and isinstance(value, Iterable): + param_nodes = [data.nodes[edge.source] for edge in data.find_edges(label='parameterizes') + if edge.target == method.id] + for param in value: + if isinstance(param, dict): + matching = [n for n in param_nodes if n.properties['simpleName'] == param.get('name')] + if matching and matching[0].id in data.nodes: + data.nodes[matching[0].id].properties['description'] = param.get('description') + else: + data.nodes[method.id].properties[key_lower] = value + + +# --------------------------------------------------------------------------- +# StructureProcessor +# --------------------------------------------------------------------------- 
class StructureProcessor(Processor): - def process_all(self, graph: Graph): - sorted_class_ids, class_deps = Graph.toposorted_nodes(graph.find_edges(label='specializes'), graph.find_nodes('Type')) - counter = 0 - - for cls_id in tqdm(sorted_class_ids, desc='Processing classes'): - clasz: Node = graph.nodes[cls_id] - enclosers = [n for n in clasz.sources('encloses') if n.has_label('Scope')] - package: Node = enclosers[0] if enclosers else None - self.process_one(graph, clasz, package, class_deps) - - check_stop() - - counter += 1 - if counter == 10: - writer().flush() - counter %= 10 - - def process_one(self, graph: Graph, type: Node, scope: Node, type_deps): - vars = StructureProcessor.get_type_relations(graph, type.id) - op_descriptions = { method.properties['qualifiedName']: describe(method) for method in type.targets('encapsulates') if method.has_label('Operation') } - - typ_name = type.properties['qualifiedName'] - typ_kind = type.properties.get('kind', "type") - typ_kind = 'enum' if typ_kind == 'enumeration' else 'abstract class' if typ_kind == 'abstract' else typ_kind - - prompt = f"Describe the following {typ_kind} using the AnalyzeStructure tool.\n\n" - typ_parameters = OrderedDict() - typ_parameters["Project Name"] = self.prompt.project_name - typ_parameters["Project Description"] = self.prompt.project_desc - - if scope: - scope_name = scope.properties['qualifiedName'] - scope_kind = scope.properties.get('kind', "scope") - typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}` from the {scope_kind} `{scope_name}`." - else: - typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}`." 
- - typ_parameters[f"{typ_kind.title()} Inhertis From"] = {graph.nodes[node_id].properties[ - 'qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'docComment')}" - for node_id in type_deps[type.id]} - typ_parameters["Inherited By"] = [f"{t.properties['kind']} {t.properties['qualifiedName']}" for t in type.sources('specializes')] - typ_parameters[f"Enclosed Variables/Fields"] = vars - typ_parameters[f"Enclosed Functions/Methods"] = op_descriptions - self.add_classification_options(typ_parameters, "type") - - prompt = self.prompt.compose(prompt, **typ_parameters) - - logger.debug(prompt) - - description = self.client.generate_json(prompt, "AnalyzeStructure") - - self.apply_classifications(graph, type, description, "type") - - for k, v in description.items(): - if not k.endswith('Reason'): - graph.nodes[type.id].properties[lower_first(k)] = v - - writer().write({'data': {'id': type.id, 'labels': list(type.labels), 'properties': description}}) - - @staticmethod - def get_type_relations(data: Graph, cls_id: str) -> tuple: - """Retrieve class fields.""" - fields = {data.nodes[edge.target] for edge in data.find_edges(label='encapsulates') if edge.source == cls_id} - fields = [' '.join(remove_java_comments(field.properties['sourceText']).split()) for field in fields if field.has_label('Variable')] - return fields - + def process_all(self, graph: Graph): + types = graph.find_nodes('Type') + specializes_edges = graph.find_edges(label='specializes') + + # Build dep map: class_id → [parent_id, ...] 
(parents = classes it specializes) + spec_map: dict[str, list[str]] = {t.id: [] for t in types} + for edge in specializes_edges: + if edge.source in spec_map: + spec_map[edge.source].append(edge.target) + + executor = NodeExecutor(max_workers=self.max_workers) + total = len(types) + + def process_fn(cls_id: str): + clasz = graph.nodes[cls_id] + enclosers = [n for n in clasz.sources('encloses') if n.has_label('Scope')] + package = enclosers[0] if enclosers else None + self.process_one(graph, clasz, package, spec_map) + check_stop() + + for t in types: + executor.submit(t.id, spec_map.get(t.id, []), process_fn) + + with tqdm(total=total, desc='Processing classes') as pbar: + last = 0 + while True: + done = executor.completed + if done > last: + pbar.update(done - last) + last = done + if done >= total: + break + import time; time.sleep(0.1) + + executor.wait_all() + executor.shutdown() + writer().flush() + + def process_one(self, graph: Graph, type: Node, scope: Node, spec_map: dict[str, list[str]]): + vars_list = StructureProcessor.get_type_relations(graph, type.id) + op_descriptions = { + method.properties['qualifiedName']: describe(method) + for method in type.targets('encapsulates') if method.has_label('Operation') + } + + typ_name = type.properties['qualifiedName'] + typ_kind = type.properties.get('kind', 'type') + typ_kind = 'enum' if typ_kind == 'enumeration' else 'abstract class' if typ_kind == 'abstract' else typ_kind + + parent_ids = spec_map.get(type.id, []) + siblings = self._already_processed_siblings(graph, type, parent_ids) + + base_instruction = f"Describe the following {typ_kind} using the AnalyzeStructure tool.\n\n" + if siblings or parent_ids: + base_instruction = ( + "The parent class description is provided below under \"Inherits From\". " + "Build on the parent's context and maintain consistent layer/stereotype " + "choices unless there is a clear reason to differ. 
" + + ( + "Sibling classes (sharing the same parent, already analysed) are also " + "provided — your description should be consistent in terminology with " + "them but must clearly differentiate this class's specific responsibilities. " + if siblings else "" + ) + + "\n\n" + base_instruction + ) + + typ_parameters = OrderedDict() + typ_parameters["Project Name"] = self.prompt.project_name + typ_parameters["Project Description"] = self.prompt.project_desc + + if scope: + scope_name = scope.properties['qualifiedName'] + scope_kind = scope.properties.get('kind', 'scope') + typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}` from the {scope_kind} `{scope_name}`." + else: + typ_parameters[f"{typ_kind.title()} to Analyze"] = f"`{typ_kind} {typ_name}`." + + typ_parameters[f"{typ_kind.title()} Inherits From"] = { + graph.nodes[pid].properties['qualifiedName']: describe(graph.nodes[pid], 'description', 'docComment') + for pid in parent_ids if pid in graph.nodes + } + typ_parameters["Inherited By"] = [ + f"{t.properties['kind']} {t.properties['qualifiedName']}" + for t in type.sources('specializes') + ] + if siblings: + typ_parameters["Sibling Classes (already analysed, same parent)"] = siblings + typ_parameters["Enclosed Variables/Fields"] = vars_list + typ_parameters["Enclosed Functions/Methods"] = op_descriptions + self.add_classification_options(typ_parameters, "type") + + prompt = self.prompt.compose(base_instruction, **typ_parameters) + logger.debug(prompt) + + description = self.client.generate_json(prompt, "AnalyzeStructure") + self.apply_classifications(graph, type, description, "type") + + for k, v in description.items(): + if not k.endswith('Reason'): + graph.nodes[type.id].properties[lower_first(k)] = v + + writer().write({'data': {'id': type.id, 'labels': list(type.labels), 'properties': description}}) + + @staticmethod + def _already_processed_siblings(graph: Graph, type_node: Node, parent_ids: list[str]) -> dict[str, str]: + result = 
{} + for parent_id in parent_ids: + parent = graph.nodes.get(parent_id) + if not parent: + continue + for sibling in parent.sources('specializes'): + if sibling.id != type_node.id and 'description' in sibling.properties: + result[sibling.properties['qualifiedName']] = describe(sibling, 'description', 'roleStereotype') + return result + + @staticmethod + def get_type_relations(data: Graph, cls_id: str) -> list: + fields = {data.nodes[edge.target] for edge in data.find_edges(label='encapsulates') if edge.source == cls_id} + return [' '.join(remove_java_comments(f.properties['sourceText']).split()) + for f in fields if f.has_label('Variable')] + + +# --------------------------------------------------------------------------- +# ComponentProcessor +# --------------------------------------------------------------------------- class ComponentProcessor(Processor): - def process_all(self, graph: Graph): - sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( - graph.find_edges(label='encloses', where_source=lambda node: node.has_label('Scope') and not node.has_label('Type'), - where_target=lambda node: node.has_label('Scope') and not node.has_label('Type')), graph.find_nodes('Scope', where=lambda node: not node.has_label('Type'))) - counter = 0 - - for pkg_id in tqdm(sorted_pkg_ids, desc='Processing packages'): - package: Node = graph.nodes[pkg_id] - self.process_one(graph, package, pkg_deps) - - check_stop() - - counter += 1 - if counter == 10: - writer().flush() - counter %= 10 - - def process_one(self, graph: Graph, scope: Node, scope_deps): - typ_descriptions = { f"{type.properties['kind']} {type.properties['qualifiedName']}": describe(type) for type in scope.targets('encloses') if type.has_label('Type') } - subscp_descriptions = {graph.nodes[node_id].properties['qualifiedName']: f"{describe(graph.nodes[node_id], 'description', 'returns', 'howToUse', 'docComment')}" - for node_id in scope_deps[scope.id]} - scp_kind = scope.properties.get('kind', "component") - - prompt = 
f"Describe the following {scp_kind} using the AnalyzeComponent tool.\n\n" - scp_parameters = OrderedDict() - scp_parameters["Project Name"] = self.prompt.project_name - scp_parameters["Project Description"] = self.prompt.project_desc - scp_parameters[f"{scp_kind.title()} to Analyze"] = scope.properties['qualifiedName'] - scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions - scp_parameters["Enclosed Classes"] = typ_descriptions - self.add_classification_options(scp_parameters, "scope") - - prompt = self.prompt.compose(prompt, **scp_parameters) - - logger.debug(prompt) - - description = self.client.generate_json(prompt, "AnalyzeComponent") - - self.apply_classifications(graph, scope, description, "scope") - - ComponentProcessor.update_package_properties(graph, description, scope) - - writer().write({'data': {'id': scope.id, 'labels': list(scope.labels), 'properties': description}}) - - @staticmethod - def update_package_properties(data: Graph, description: dict, package: Node): - """Update package properties with the generated description.""" - for key in description: - if not key.endswith('Reason'): - data.nodes[package.id].properties[lower_first(key)] = description[key] + def process_all(self, graph: Graph): + scopes = graph.find_nodes('Scope', where=lambda n: not n.has_label('Type')) + encloses_edges = graph.find_edges( + label='encloses', + where_source=lambda n: n.has_label('Scope') and not n.has_label('Type'), + where_target=lambda n: n.has_label('Scope') and not n.has_label('Type'), + ) + + enc_map: dict[str, list[str]] = {s.id: [] for s in scopes} + for edge in encloses_edges: + if edge.source in enc_map: + enc_map[edge.source].append(edge.target) + + # A package depends on its sub-packages: reverse — sub-pkg must be done first. 
+ dep_map: dict[str, list[str]] = {s.id: [] for s in scopes} + for parent_id, child_ids in enc_map.items(): + for child_id in child_ids: + if child_id in dep_map: + dep_map[child_id] # ensure key exists + # parent depends on children being done + dep_map[parent_id] = list(child_ids) + + executor = NodeExecutor(max_workers=self.max_workers) + total = len(scopes) + + def process_fn(pkg_id: str): + scope = graph.nodes[pkg_id] + self.process_one(graph, scope, enc_map) + check_stop() + + for s in scopes: + executor.submit(s.id, dep_map.get(s.id, []), process_fn) + + with tqdm(total=total, desc='Processing packages') as pbar: + last = 0 + while True: + done = executor.completed + if done > last: + pbar.update(done - last) + last = done + if done >= total: + break + import time; time.sleep(0.1) + + executor.wait_all() + executor.shutdown() + writer().flush() + + def process_one(self, graph: Graph, scope: Node, enc_map: dict[str, list[str]]): + typ_descriptions = { + f"{t.properties['kind']} {t.properties['qualifiedName']}": describe(t) + for t in scope.targets('encloses') if t.has_label('Type') + } + sub_ids = enc_map.get(scope.id, []) + subscp_descriptions = { + graph.nodes[nid].properties['qualifiedName']: describe(graph.nodes[nid], 'description', 'returns', 'howToUse', 'docComment') + for nid in sub_ids if nid in graph.nodes + } + scp_kind = scope.properties.get('kind', 'component') + + prompt_base = f"Describe the following {scp_kind} using the AnalyzeComponent tool.\n\n" + scp_parameters = OrderedDict() + scp_parameters["Project Name"] = self.prompt.project_name + scp_parameters["Project Description"] = self.prompt.project_desc + scp_parameters[f"{scp_kind.title()} to Analyze"] = scope.properties['qualifiedName'] + scp_parameters[f"Enclosed Sub-{scp_kind}s"] = subscp_descriptions + scp_parameters["Enclosed Classes"] = typ_descriptions + self.add_classification_options(scp_parameters, "scope") + + prompt = self.prompt.compose(prompt_base, **scp_parameters) + 
logger.debug(prompt) + + description = self.client.generate_json(prompt, "AnalyzeComponent") + self.apply_classifications(graph, scope, description, "scope") + ComponentProcessor.update_package_properties(graph, description, scope) + writer().write({'data': {'id': scope.id, 'labels': list(scope.labels), 'properties': description}}) + + @staticmethod + def update_package_properties(data: Graph, description: dict, package: Node): + for key in description: + if not key.endswith('Reason'): + data.nodes[package.id].properties[lower_first(key)] = description[key] + + +# --------------------------------------------------------------------------- +# InteractionProcessor (stub) +# --------------------------------------------------------------------------- class InteractionProcessor(Processor): + def process_all(self, graph: Graph): + pass - def process_all(self, graph): - # compute and describe interactions between packages - pass +# --------------------------------------------------------------------------- +# VariableProcessor (SecDFD heuristic — unchanged logic) +# --------------------------------------------------------------------------- class VariableProcessor(Processor): - def __init__(self, client, prompt_builder, secdfd_cfg=None): - super().__init__(client, prompt_builder) - cfg = secdfd_cfg or {} - self.label_score_threshold = float(cfg.get("label_score_threshold", 0.60)) - self.process_min_out_invokes = int(cfg.get("process_min_out_invokes", 2)) - self.process_min_in_invokes = int(cfg.get("process_min_in_invokes", 2)) - self.asset_sensitive_term_hit_min = int(cfg.get("asset_sensitive_term_hit_min", 1)) - self.datastore_crud_hit_min = int(cfg.get("datastore_crud_hit_min", 1)) - self.external_entity_max_participation = int(cfg.get("external_entity_max_participation", 2)) - self.external_keywords = set("client rest entity user customer bank".split()) - self.datastore_keywords = set("db database dao repository storage cache data record table".split()) - 
self.asset_keywords = set("password secret policy user document card money balance account pin token key".split()) - self.flow_keywords = set("request response payload dto input output transfer amount source target".split()) - - def process_all(self, graph: Graph): - counter = 0 - signature_counts = self.build_signature_counts(graph) - for var in tqdm(graph.find_nodes("Variable"), desc="Processing variables"): - self.process_one(graph, var, signature_counts) - check_stop() - counter += 1 - if counter == 50: - writer().flush() - counter = 0 - - def process_one(self, graph: Graph, var: Node, signature_counts: dict): - description = self.infer_secdfd(graph, var, signature_counts) - if not description: - return - self.apply_classifications(graph, var, description, "variable") - for k, v in description.items(): - if not k.endswith("Reason"): - graph.nodes[var.id].properties[lower_first(k)] = v - writer().write({"data": {"id": var.id, "labels": list(var.labels), "properties": description}}) - - def infer_secdfd(self, graph: Graph, var: Node, signature_counts: dict) -> dict: - var_name = str(var.properties.get("simpleName", "")).strip() - name_tokens = self.tokenize(var_name) - signature = self.variable_signature(var) - participation = len(var.targets("parameterizes")) + len(var.sources("encapsulates")) - owners = [n for n in var.sources("encapsulates") if n.has_label("Type")] - ops = [n for n in var.targets("parameterizes") if n.has_label("Operation")] - scores = { - "External Entity": 0.0, - "DataStore": 0.0, - "Process": 0.0, - "Asset": 0.0, - "Flow": 0.0, - } - evidence = [] - - external_hits = self.keyword_hits(name_tokens, self.external_keywords) - if external_hits: - scores["External Entity"] += 0.7 - evidence.append(f"external_keywords={','.join(sorted(external_hits))}") - if participation <= self.external_entity_max_participation and (ops or owners): - scores["External Entity"] += 0.2 - evidence.append("low_participation") - - datastore_hits = 
self.keyword_hits(name_tokens, self.datastore_keywords) - if len(datastore_hits) >= self.datastore_crud_hit_min: - scores["DataStore"] += 0.7 - evidence.append(f"datastore_keywords={','.join(sorted(datastore_hits))}") - if any(any(v in self.tokenize(op.properties.get("simpleName", "")) for v in {"save", "find", "delete", "create", "read", "update"}) for op in ops): - scores["DataStore"] += 0.2 - evidence.append("crud_related_operation") - - asset_hits = self.keyword_hits(name_tokens, self.asset_keywords) - if len(asset_hits) >= self.asset_sensitive_term_hit_min: - scores["Asset"] += 0.8 - evidence.append(f"asset_keywords={','.join(sorted(asset_hits))}") - if owners and not ops: - scores["Asset"] += 0.1 - evidence.append("field_like_variable") - - flow_hits = self.keyword_hits(name_tokens, self.flow_keywords) - if flow_hits: - scores["Flow"] += 0.4 - evidence.append(f"flow_keywords={','.join(sorted(flow_hits))}") - if ops: - scores["Flow"] += 0.2 - evidence.append("parameterizes_operation") - if signature_counts.get(signature, 0) > 1: - scores["Flow"] += 0.3 - evidence.append("shared_signature") - - # Variables are not typically processes, keep score near-zero unless explicitly verb-named. 
- if self.looks_like_verb(var_name): - scores["Process"] += 0.2 - evidence.append("verb_like_name") - if any(len(op.targets("invokes")) >= self.process_min_out_invokes or len(op.sources("invokes")) >= self.process_min_in_invokes for op in ops): - scores["Process"] += 0.3 - evidence.append("connected_to_high_interaction_operation") - - selected = [label for label, score in sorted(scores.items(), key=lambda x: x[1], reverse=True) if score >= self.label_score_threshold][:3] - if not selected: - selected = ["Undetermined"] - primary = "Undetermined" - else: - primary = selected[0] - - return { - "secdfdTypes": selected, - "primarySecdfdType": primary, - "secdfdConfidence": {k: round(v, 3) for k, v in scores.items() if v > 0}, - "secdfdEvidence": "; ".join(evidence) if evidence else "No strong SecDFD evidence found.", - } - - @staticmethod - def keyword_hits(tokens: set, keywords: set) -> set: - return {t for t in tokens if t in keywords} - - @staticmethod - def tokenize(text: str) -> set: - text = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text or "") - text = text.replace("_", " ").replace("-", " ").lower() - return {t for t in re.findall(r"[a-z0-9]+", text)} - - @staticmethod - def looks_like_verb(name: str) -> bool: - l = (name or "").lower() - return l.startswith(("get", "set", "create", "save", "find", "load", "send", "fetch", "verify")) - - @staticmethod - def variable_signature(var: Node): - name = str(var.properties.get("simpleName", "")).strip().lower() - types = tuple(sorted(t.id for t in var.targets("type"))) - return name, types - - def build_signature_counts(self, graph: Graph): - counts = {} - for var in graph.find_nodes("Variable"): - sig = self.variable_signature(var) - counts[sig] = counts.get(sig, 0) + 1 - return counts + def __init__(self, client, prompt_builder, secdfd_cfg=None, max_workers=8): + super().__init__(client, prompt_builder, max_workers) + cfg = secdfd_cfg or {} + self.label_score_threshold = float(cfg.get("label_score_threshold", 0.60)) + 
self.process_min_out_invokes = int(cfg.get("process_min_out_invokes", 2)) + self.process_min_in_invokes = int(cfg.get("process_min_in_invokes", 2)) + self.asset_sensitive_term_hit_min = int(cfg.get("asset_sensitive_term_hit_min", 1)) + self.datastore_crud_hit_min = int(cfg.get("datastore_crud_hit_min", 1)) + self.external_entity_max_participation = int(cfg.get("external_entity_max_participation", 2)) + self.external_keywords = set("client rest entity user customer bank".split()) + self.datastore_keywords = set("db database dao repository storage cache data record table".split()) + self.asset_keywords = set("password secret policy user document card money balance account pin token key".split()) + self.flow_keywords = set("request response payload dto input output transfer amount source target".split()) + + def process_all(self, graph: Graph): + counter = 0 + signature_counts = self.build_signature_counts(graph) + for var in tqdm(graph.find_nodes("Variable"), desc="Processing variables"): + self.process_one(graph, var, signature_counts) + check_stop() + counter += 1 + if counter == 50: + writer().flush() + counter = 0 + + def process_one(self, graph: Graph, var: Node, signature_counts: dict): + description = self.infer_secdfd(graph, var, signature_counts) + if not description: + return + self.apply_classifications(graph, var, description, "variable") + for k, v in description.items(): + if not k.endswith("Reason"): + graph.nodes[var.id].properties[lower_first(k)] = v + writer().write({"data": {"id": var.id, "labels": list(var.labels), "properties": description}}) + + def infer_secdfd(self, graph: Graph, var: Node, signature_counts: dict) -> dict: + var_name = str(var.properties.get("simpleName", "")).strip() + name_tokens = self.tokenize(var_name) + signature = self.variable_signature(var) + participation = len(var.targets("parameterizes")) + len(var.sources("encapsulates")) + owners = [n for n in var.sources("encapsulates") if n.has_label("Type")] + ops = [n for n 
in var.targets("parameterizes") if n.has_label("Operation")] + scores = {"External Entity": 0.0, "DataStore": 0.0, "Process": 0.0, "Asset": 0.0, "Flow": 0.0} + evidence = [] + + external_hits = self.keyword_hits(name_tokens, self.external_keywords) + if external_hits: + scores["External Entity"] += 0.7 + evidence.append(f"external_keywords={','.join(sorted(external_hits))}") + if participation <= self.external_entity_max_participation and (ops or owners): + scores["External Entity"] += 0.2 + evidence.append("low_participation") + + datastore_hits = self.keyword_hits(name_tokens, self.datastore_keywords) + if len(datastore_hits) >= self.datastore_crud_hit_min: + scores["DataStore"] += 0.7 + evidence.append(f"datastore_keywords={','.join(sorted(datastore_hits))}") + if any(any(v in self.tokenize(op.properties.get("simpleName", "")) for v in {"save", "find", "delete", "create", "read", "update"}) for op in ops): + scores["DataStore"] += 0.2 + evidence.append("crud_related_operation") + + asset_hits = self.keyword_hits(name_tokens, self.asset_keywords) + if len(asset_hits) >= self.asset_sensitive_term_hit_min: + scores["Asset"] += 0.8 + evidence.append(f"asset_keywords={','.join(sorted(asset_hits))}") + if owners and not ops: + scores["Asset"] += 0.1 + evidence.append("field_like_variable") + + flow_hits = self.keyword_hits(name_tokens, self.flow_keywords) + if flow_hits: + scores["Flow"] += 0.4 + evidence.append(f"flow_keywords={','.join(sorted(flow_hits))}") + if ops: + scores["Flow"] += 0.2 + evidence.append("parameterizes_operation") + if signature_counts.get(signature, 0) > 1: + scores["Flow"] += 0.3 + evidence.append("shared_signature") + + if self.looks_like_verb(var_name): + scores["Process"] += 0.2 + evidence.append("verb_like_name") + if any(len(op.targets("invokes")) >= self.process_min_out_invokes or len(op.sources("invokes")) >= self.process_min_in_invokes for op in ops): + scores["Process"] += 0.3 + 
evidence.append("connected_to_high_interaction_operation") + + selected = [label for label, score in sorted(scores.items(), key=lambda x: x[1], reverse=True) if score >= self.label_score_threshold][:3] + if not selected: + selected = ["Undetermined"] + primary = "Undetermined" + else: + primary = selected[0] + + return { + "secdfdTypes": selected, + "primarySecdfdType": primary, + "secdfdConfidence": {k: round(v, 3) for k, v in scores.items() if v > 0}, + "secdfdEvidence": "; ".join(evidence) if evidence else "No strong SecDFD evidence found.", + } + + @staticmethod + def keyword_hits(tokens: set, keywords: set) -> set: + return {t for t in tokens if t in keywords} + + @staticmethod + def tokenize(text: str) -> set: + text = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text or "") + text = text.replace("_", " ").replace("-", " ").lower() + return {t for t in re.findall(r"[a-z0-9]+", text)} + + @staticmethod + def looks_like_verb(name: str) -> bool: + l = (name or "").lower() + return l.startswith(("get", "set", "create", "save", "find", "load", "send", "fetch", "verify")) + + @staticmethod + def variable_signature(var: Node): + name = str(var.properties.get("simpleName", "")).strip().lower() + types = tuple(sorted(t.id for t in var.targets("type"))) + return name, types + + def build_signature_counts(self, graph: Graph): + counts = {} + for var in graph.find_nodes("Variable"): + sig = self.variable_signature(var) + counts[sig] = counts.get(sig, 0) + 1 + return counts From a03449fe13f70335ae59eff09f464f74da488fcc Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 28 Apr 2026 13:22:13 +0700 Subject: [PATCH 31/34] Wire max_workers, resume, and ancestor map into LLMFilter - Reads [llm] workers, checkpoint_file, and resume from config; calls configure_writer() so processors share the right checkpoint file/mode. - Calls load_checkpoint() before processing when resume = true. 
- Builds a type_ancestor_ids map (BFS over specializes edges) and passes it to ScriptProcessor for same-name family / override context lookups. Co-Authored-By: Claude Sonnet 4.6 --- arcana/llm_filter/filter.py | 320 ++++++++++++++++++++---------------- 1 file changed, 175 insertions(+), 145 deletions(-) diff --git a/arcana/llm_filter/filter.py b/arcana/llm_filter/filter.py index 64304ee..15b4f12 100644 --- a/arcana/llm_filter/filter.py +++ b/arcana/llm_filter/filter.py @@ -1,161 +1,191 @@ +import os from collections import OrderedDict from itertools import combinations from typing import Any, Dict, List, TextIO +import logging from tqdm.auto import tqdm from arcana import templates +from arcana.checkpoint import configure_writer, load_checkpoint, writer from arcana.filters import check_stop, layers_to_ordereddict from arcana.graph_utils import (build_hierarchy, build_triplets, describe_path, - group_paths_by_endpoints) + group_paths_by_endpoints) from arcana.llm_filter.classification import default_classification_schemes from arcana.llm_filter.client import LLMClient -from arcana.llm_filter.processors import ComponentProcessor, InteractionProcessor, ScriptProcessor, StructureProcessor, VariableProcessor +from arcana.llm_filter.processors import (ComponentProcessor, InteractionProcessor, + ScriptProcessor, StructureProcessor, VariableProcessor) from arcana.llm_filter.prompt import PromptBuilder -from arcana.utils import (lower_first, remove_java_comments, write_jsonl) +from arcana.utils import lower_first, remove_java_comments, write_jsonl from arcanalib.graph import Edge, Graph, Node from arcanalib.pipefilter import Filter +logger = logging.getLogger(__name__) + + class LLMFilter(Filter): - def __init__(self, config: Dict[str, Dict[str, Any]]): - super().__init__(config) - self.client = LLMClient(config['llm'], config['project']) - - layer_cfg = config.get('layers') - self.layers = layers_to_ordereddict(layer_cfg) if layer_cfg else OrderedDict() - - stereo_cfg = 
config.get('stereotypes') - self.role_stereotypes = OrderedDict(stereo_cfg) if stereo_cfg else OrderedDict() - - self.secdfd_cfg = config.get('secdfd', {}) - self.secdfd_enabled = str(self.secdfd_cfg.get("enabled", "false")).strip().lower() in {"1", "true", "yes", "on"} - - classifications = default_classification_schemes( - self.layers, - self.role_stereotypes, - secdfd_enabled=self.secdfd_enabled, - ) - self.prompt_builder = PromptBuilder(config['project'], classifications) - self.script_processor = ScriptProcessor(self.client, self.prompt_builder) - self.structure_processor = StructureProcessor(self.client, self.prompt_builder) - self.variable_processor = VariableProcessor(self.client, self.prompt_builder, self.secdfd_cfg) - self.component_processor = ComponentProcessor(self.client, self.prompt_builder) - self.interaction_processor = InteractionProcessor(self.client, self.prompt_builder) - - def process(self, graph): - # 1. initialize layers and write baseline nodes/edges - self.prompt_builder.initialize_layers(graph) - - # 2. process methods - self.script_processor.process_all(graph) - - # 3. process classes - self.structure_processor.process_all(graph) - - # 4. process variables for SecDFD (v2 variable semantics) - if self.secdfd_enabled: - self.variable_processor.process_all(graph) - - # 5. process packages - self.component_processor.process_all(graph) - - # 6. 
process interactions - self.interaction_processor.process_all(graph) - - return graph - - def process_hierarchy(self, graph: Graph, jsonl_file, log_file): - """Process each package, class, and method in the hierarchy.""" - - st_contains_st = graph.find_edges(label='contains', source_label='Structure', target_label='Structure') - ct_contains_st = graph.find_edges(label='contains', target_label='Structure', where_source=lambda - node: 'Container' in node.labels and 'Structure' not in node.labels) - new_ct_sources = {edge.target: graph.find_source(graph.find_edges(label='contains'), graph.nodes[edge.target], - lambda node: 'Structure' not in node.labels, - graph.nodes[edge.source]).id for edge in st_contains_st} - ct_contains_st.extend( - [Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) - - trips = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) - hierarchy = build_hierarchy(trips) - sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( - graph.find_edges(label='contains', where_source=lambda node: 'Structure' not in node.labels, - where_target=lambda node: 'Structure' not in node.labels)) - - paths = graph.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") - path_groups = group_paths_by_endpoints(paths) - - pkg_pairs = list(combinations(sorted_pkg_ids, 2)) - for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): - - check_stop() - - pkg1 = graph.nodes[pkg1_id] - pkg2 = graph.nodes[pkg2_id] - if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): - if path_groups[(pkg1_id, pkg2_id)]: - self.process_interactions(graph, pkg1, pkg2, path_groups[(pkg1_id, pkg2_id)], hierarchy, jsonl_file, - log_file) - if path_groups[(pkg2_id, pkg1_id)]: - self.process_interactions(graph, pkg2, pkg1, path_groups[(pkg2_id, pkg1_id)], hierarchy, jsonl_file, - log_file) - - def process_interactions(self, graph: Graph, c1: Node, c2: 
Node, path_groups: List[List[Edge]], hierarchy, - jsonl_file: TextIO, log_file: TextIO): - c1_name = c1.properties["qualifiedName"] - c2_name = c2.properties["qualifiedName"] - c1_desc = c1.properties["description"] - c2_desc = c2.properties["description"] - - c1_contents = hierarchy.get(c1.id, dict()) - c2_contents = hierarchy.get(c2.id, dict()) - - c1_structure_info = "\n".join( - f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" - for c_id, _ in c1_contents.items()) - c2_structure_info = "\n".join( - f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties['description']}" - for c_id, _ in c2_contents.items()) - - dep_info = f" - Dependencies from `{c1_name}` to `{c2_name}`:\n" + "\n".join( - f" - {describe_path(graph, path)}" for path in path_groups) if path_groups else "" - - prompt = templates.interaction_analysis.format(project_name=self.project_name, project_desc=self.project_desc, - pkg1_name=c1_name, pkg2_name=c2_name, pkg1_desc=c1_desc, - pkg2_desc=c2_desc, cls1_info=c1_structure_info, - cls2_info=c2_structure_info, dep_info=dep_info) - - log_file.write(prompt) - log_file.write('\n\n======\n\n') - - description = generate_text(prompt) - pkg1_edge = Edge(source=c1.id, target=c2.id, label="dependsOn", description=description) if dep_info else None - - if pkg1_edge: - if "dependsOn" not in graph.edges: - graph.edges["dependsOn"] = [] - graph.edges["dependsOn"].append(pkg1_edge) - - write_jsonl(jsonl_file, pkg1_edge.to_dict()) - - @staticmethod - def update_package_properties(data: Graph, description: dict, package: Node): - """Update package properties with the generated description.""" - for key in description: - if not key.endswith('Reason'): - data.nodes[package.id].properties[lower_first(key)] = description[key] - - def get_script_descriptions(self, data: Graph, cls_data: list) -> dict[str,str]: - """Generate descriptions for methods.""" - return 
{data.nodes[met_id].properties['simpleName']: self.describe(data.nodes[met_id]) for met_id in cls_data} - - def get_structure_descriptions(self, data: Graph, pkg_data: dict) -> dict[str,str]: - """Generate descriptions for classes.""" - return { - f"{data.nodes[cls_id].properties['kind']} {data.nodes[cls_id].properties['qualifiedName']}": self.describe( - data.nodes[cls_id]) for cls_id, _ in pkg_data.items()} - - def get_component_descriptions(self, data: Graph, package_ids: list) -> dict[str,str]: - """Generate descriptions for packages.""" - return {data.nodes[pkg_id].properties['qualifiedName']: self.describe(data.nodes[pkg_id]) for pkg_id in - package_ids} + def __init__(self, config: Dict[str, Dict[str, Any]]): + super().__init__(config) + self.client = LLMClient(config['llm'], config['project']) + + llm_cfg = config.get('llm', {}) + self.max_workers = int(llm_cfg.get('workers', 8)) + + # Checkpoint / resume config + self.checkpoint_file = llm_cfg.get('checkpoint_file', 'checkpoints.jsonl') + self.resume = str(llm_cfg.get('resume', 'false')).strip().lower() in {'1', 'true', 'yes', 'on'} + configure_writer(self.checkpoint_file, append=self.resume) + + layer_cfg = config.get('layers') + self.layers = layers_to_ordereddict(layer_cfg) if layer_cfg else OrderedDict() + + stereo_cfg = config.get('stereotypes') + self.role_stereotypes = OrderedDict(stereo_cfg) if stereo_cfg else OrderedDict() + + self.secdfd_cfg = config.get('secdfd', {}) + self.secdfd_enabled = str(self.secdfd_cfg.get("enabled", "false")).strip().lower() in {"1", "true", "yes", "on"} + + classifications = default_classification_schemes( + self.layers, + self.role_stereotypes, + secdfd_enabled=self.secdfd_enabled, + ) + self.prompt_builder = PromptBuilder(config['project'], classifications) + self.script_processor = ScriptProcessor(self.client, self.prompt_builder, self.max_workers) + self.structure_processor = StructureProcessor(self.client, self.prompt_builder, self.max_workers) + 
self.variable_processor = VariableProcessor(self.client, self.prompt_builder, self.secdfd_cfg, self.max_workers) + self.component_processor = ComponentProcessor(self.client, self.prompt_builder, self.max_workers) + self.interaction_processor = InteractionProcessor(self.client, self.prompt_builder, self.max_workers) + + def process(self, graph: Graph) -> Graph: + # 0. Resume: pre-load previous checkpoint so processors skip done nodes. + if self.resume and os.path.exists(self.checkpoint_file): + logger.info("Resuming from checkpoint: %s", self.checkpoint_file) + load_checkpoint(self.checkpoint_file, graph) + + # 1. Initialize classification dimension/category nodes. + self.prompt_builder.initialize_layers(graph) + + # 2. Build ancestor map for ScriptProcessor (same-name family / override context). + # Maps type_id → [all ancestor type_ids] via specializes edges. + type_ancestor_ids = self._build_type_ancestor_map(graph) + self.script_processor.type_ancestor_ids = type_ancestor_ids + + # 3. Process methods. + self.script_processor.process_all(graph) + + # 4. Process classes. + self.structure_processor.process_all(graph) + + # 5. Process variables (SecDFD). + if self.secdfd_enabled: + self.variable_processor.process_all(graph) + + # 6. Process packages. + self.component_processor.process_all(graph) + + # 7. Process interactions (stub). + self.interaction_processor.process_all(graph) + + return graph + + @staticmethod + def _build_type_ancestor_map(graph: Graph) -> dict[str, list[str]]: + """Return type_id → ordered list of ancestor type ids (parents first) via specializes.""" + specializes_edges = graph.find_edges(label='specializes') + # Build direct parent map: child_id → [parent_id, ...] + parents: dict[str, list[str]] = {} + for edge in specializes_edges: + parents.setdefault(edge.source, []).append(edge.target) + + # BFS from each type to collect all ancestors in order. 
+ result: dict[str, list[str]] = {} + for t in graph.find_nodes('Type'): + visited: list[str] = [] + seen: set[str] = set() + queue = list(parents.get(t.id, [])) + while queue: + pid = queue.pop(0) + if pid in seen: + continue + seen.add(pid) + visited.append(pid) + queue.extend(parents.get(pid, [])) + result[t.id] = visited + return result + + # ------------------------------------------------------------------ + # Legacy interaction processing (kept from original, not currently + # wired into process() but available for future use) + # ------------------------------------------------------------------ + + def process_hierarchy(self, graph: Graph, jsonl_file, log_file): + st_contains_st = graph.find_edges(label='contains', source_label='Structure', target_label='Structure') + ct_contains_st = graph.find_edges(label='contains', target_label='Structure', where_source=lambda + node: 'Container' in node.labels and 'Structure' not in node.labels) + new_ct_sources = {edge.target: graph.find_source(graph.find_edges(label='contains'), graph.nodes[edge.target], + lambda node: 'Structure' not in node.labels, + graph.nodes[edge.source]).id for edge in st_contains_st} + ct_contains_st.extend( + [Edge(source=source, target=target, label='contains') for target, source in new_ct_sources.items()]) + + trips = build_triplets(ct_contains_st, graph.find_edges(label='hasScript')) + hierarchy = build_hierarchy(trips) + sorted_pkg_ids, pkg_deps = graph.toposorted_nodes( + graph.find_edges(label='contains', where_source=lambda node: 'Structure' not in node.labels, + where_target=lambda node: 'Structure' not in node.labels)) + + paths = graph.find_paths("contains", "hasScript", "invokes", "-hasScript", "-contains") + path_groups = group_paths_by_endpoints(paths) + + pkg_pairs = list(combinations(sorted_pkg_ids, 2)) + for pkg2_id, pkg1_id in tqdm(pkg_pairs, desc='Processing package interactions', position=0, leave=False): + check_stop() + pkg1 = graph.nodes[pkg1_id] + pkg2 = 
graph.nodes[pkg2_id] + if ('Structure' not in pkg1.labels) and ('Structure' not in pkg2.labels): + if path_groups[(pkg1_id, pkg2_id)]: + self.process_interactions(graph, pkg1, pkg2, path_groups[(pkg1_id, pkg2_id)], hierarchy, jsonl_file, log_file) + if path_groups[(pkg2_id, pkg1_id)]: + self.process_interactions(graph, pkg2, pkg1, path_groups[(pkg2_id, pkg1_id)], hierarchy, jsonl_file, log_file) + + def process_interactions(self, graph: Graph, c1: Node, c2: Node, path_groups: List[List[Edge]], hierarchy, + jsonl_file: TextIO, log_file: TextIO): + c1_name = c1.properties["qualifiedName"] + c2_name = c2.properties["qualifiedName"] + c1_desc = c1.properties.get("description", "") + c2_desc = c2.properties.get("description", "") + + c1_contents = hierarchy.get(c1.id, dict()) + c2_contents = hierarchy.get(c2.id, dict()) + + c1_structure_info = "\n".join( + f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties.get('description', '')}" + for c_id, _ in c1_contents.items()) + c2_structure_info = "\n".join( + f" - `{graph.nodes[c_id].properties['simpleName']}`: {graph.nodes[c_id].properties.get('description', '')}" + for c_id, _ in c2_contents.items()) + + dep_info = f" - Dependencies from `{c1_name}` to `{c2_name}`:\n" + "\n".join( + f" - {describe_path(graph, path)}" for path in path_groups) if path_groups else "" + + prompt = templates.interaction_analysis.format( + project_name=self.prompt_builder.project_name, + project_desc=self.prompt_builder.project_desc, + pkg1_name=c1_name, pkg2_name=c2_name, + pkg1_desc=c1_desc, pkg2_desc=c2_desc, + cls1_info=c1_structure_info, cls2_info=c2_structure_info, + dep_info=dep_info, + ) + + log_file.write(prompt) + log_file.write('\n\n======\n\n') + + description = self.client.generate_text(prompt) + pkg1_edge = Edge(source=c1.id, target=c2.id, label="dependsOn", description=description) if dep_info else None + + if pkg1_edge: + if "dependsOn" not in graph.edges: + graph.edges["dependsOn"] = [] + 
graph.edges["dependsOn"].append(pkg1_edge) + write_jsonl(jsonl_file, pkg1_edge.to_dict()) From fe7e85e6151186ee6a0a5d44ea29d64c7ed9a483 Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Tue, 28 Apr 2026 13:36:11 +0700 Subject: [PATCH 32/34] Add Stereocode method and class stereotype classification Adds two new classification dimensions based on the Dragan/Al-Ramadan Stereocode taxonomy (17 method stereotypes across five categories: structural accessors, structural mutators, creational, collaborational, degenerate; and 14 class stereotypes). Each becomes a Dimension node with Category nodes in the graph, and elements receive 'implements' edges to their assigned categories. Method stereotypes are multi-label (e.g. 'get collaborator'); class stereotypes are single-label. Both are gated behind [stereocode] enabled in config so existing runs are unaffected. Pydantic output models gain optional stereocodeStereotype / stereocodeClassStereotype fields (default empty) so they parse cleanly regardless of whether stereocode is enabled. 
Co-Authored-By: Claude Sonnet 4.6 --- arcana/llm_filter/classification.py | 84 ++++++++++++++++++++++++++++- arcana/llm_filter/filter.py | 4 ++ arcana/templates.py | 4 ++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/arcana/llm_filter/classification.py b/arcana/llm_filter/classification.py index 9697e3d..86b99e4 100644 --- a/arcana/llm_filter/classification.py +++ b/arcana/llm_filter/classification.py @@ -78,7 +78,53 @@ def default_secdfd_types(): ]) -def default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None, secdfd_enabled=False): +def stereocode_method_stereotypes(): + return OrderedDict([ + # Structural Accessors + ("get", "Returns a data member directly."), + ("predicate", "Returns a Boolean value that is not itself a data member."), + ("property", "Returns information derived from or about data members (non-Boolean)."), + ("void-accessor", "Returns information about data members through method parameters (out/ref params) rather than the return value."), + # Structural Mutators + ("set", "Modifies a single data member."), + ("command", "Performs a complex change to the object's state (e.g., modifies multiple data members); returns void."), + ("non-void-command", "Like command but also returns a value."), + # Creational + ("constructor", "Creates (or initialises) a new object instance."), + ("copy-constructor", "Creates a new object by copying an existing one."), + ("destructor", "Destroys or cleans up an object."), + ("factory", "Creates and returns an instance of another class."), + # Collaborational + ("collaborator", "Works primarily with objects belonging to classes other than itself (passed as parameter, stored as local/data member, or returned)."), + ("controller", "Changes only the state of an external object, not 'this'."), + ("wrapper", "Does not change the object's state but delegates to at least one free function call."), + # Degenerate + ("incidental", "Does not read or change the object's state and makes no calls 
to other class methods or free functions."), + ("stateless", "Does not read or change the object's state but has at least one call to other class methods or free functions."), + ("empty", "Has no statements at all."), + ]) + + +def stereocode_class_stereotypes(): + return OrderedDict([ + ("entity", "Encapsulates both data and behaviour; keeper of the data model and/or business logic."), + ("minimal-entity", "Special case of entity consisting only of get, set, and command methods."), + ("data-provider", "Encapsulates data and consists mainly of accessors (get/property/predicate)."), + ("commander", "Encapsulates behaviour and consists mainly of mutators (set/command)."), + ("boundary", "Communicator with a large percentage of collaborational methods and a low percentage of controller methods; few factory methods."), + ("factory", "Creator of objects; has mostly factory methods."), + ("controller", "Provides functionality to control external objects; consists mostly of controller and factory methods."), + ("pure-controller", "Special case of controller consisting only of controller and factory methods."), + ("large-class", "Contains a large number of methods combining multiple roles such as data-provider, commander, controller, and factory."), + ("lazy-class", "Consists mostly of get, set, and degenerate methods; occurrence of other methods is low."), + ("degenerate", "Consists mostly of degenerate methods that do not read or write to the object's state."), + ("data-class", "Consists only of get and set methods."), + ("small-class", "Consists of only one or two methods."), + ("empty", "Has no methods."), + ]) + + +def default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None, secdfd_enabled=False, stereocode_enabled=False): layers = ordered_dict_from_mapping(layers_cfg) or default_layers() role_stereotypes = ordered_dict_from_mapping(role_stereotypes_cfg) or default_role_stereotypes() @@ -137,4 +183,40 @@ def 
default_classification_schemes(layers_cfg=None, role_stereotypes_cfg=None, s ) schemes[secdfd_scheme.name] = secdfd_scheme + if stereocode_enabled: + stereocode_method_scheme = ClassificationScheme( + name="stereocodeMethod", + dimension_id="Stereocode Method Stereotype", + dimension_name="Stereocode Method Stereotype", + dimension_kind="categorical-nominal", + category_prefix="scm", + category_kind="stereocode method stereotype", + prompt_label="Possible Stereocode Method Stereotypes", + response_key="stereocodeStereotype", + response_reason_key="stereocodeStereotypeReason", + options=stereocode_method_stereotypes(), + undetermined_description="Stereocode method stereotype cannot be determined.", + ordered=False, + applies_to=("operation",), + allow_multi_label=True, + ) + stereocode_class_scheme = ClassificationScheme( + name="stereocodeClass", + dimension_id="Stereocode Class Stereotype", + dimension_name="Stereocode Class Stereotype", + dimension_kind="categorical-nominal", + category_prefix="scc", + category_kind="stereocode class stereotype", + prompt_label="Possible Stereocode Class Stereotypes", + response_key="stereocodeClassStereotype", + response_reason_key="stereocodeClassStereotypeReason", + options=stereocode_class_stereotypes(), + undetermined_description="Stereocode class stereotype cannot be determined.", + ordered=False, + applies_to=("type",), + allow_multi_label=False, + ) + schemes[stereocode_method_scheme.name] = stereocode_method_scheme + schemes[stereocode_class_scheme.name] = stereocode_class_scheme + return schemes diff --git a/arcana/llm_filter/filter.py b/arcana/llm_filter/filter.py index 15b4f12..727c864 100644 --- a/arcana/llm_filter/filter.py +++ b/arcana/llm_filter/filter.py @@ -45,10 +45,14 @@ def __init__(self, config: Dict[str, Dict[str, Any]]): self.secdfd_cfg = config.get('secdfd', {}) self.secdfd_enabled = str(self.secdfd_cfg.get("enabled", "false")).strip().lower() in {"1", "true", "yes", "on"} + stereocode_cfg = 
config.get('stereocode', {}) + self.stereocode_enabled = str(stereocode_cfg.get("enabled", "false")).strip().lower() in {"1", "true", "yes", "on"} + classifications = default_classification_schemes( self.layers, self.role_stereotypes, secdfd_enabled=self.secdfd_enabled, + stereocode_enabled=self.stereocode_enabled, ) self.prompt_builder = PromptBuilder(config['project'], classifications) self.script_processor = ScriptProcessor(self.client, self.prompt_builder, self.max_workers) diff --git a/arcana/templates.py b/arcana/templates.py index ac3ea02..d3c04b7 100644 --- a/arcana/templates.py +++ b/arcana/templates.py @@ -26,6 +26,8 @@ class ScriptDescription(BaseModel): layerReason: str = Field(description="Explanation why this fits the chosen layer but not others.") secdfdTypes: list[str] = Field(default_factory=list, description="One or more SecDFD classifications from the provided options.") secdfdEvidence: str = Field(default="", description="Short evidence summary for the SecDFD classifications.") + stereocodeStereotype: list[str] = Field(default_factory=list, description="One or more Stereocode method stereotypes from the provided options. Methods may combine stereotypes, e.g. 
'get collaborator'.") + stereocodeStereotypeReason: str = Field(default="", description="One-sentence explanation for the chosen Stereocode method stereotype(s).") class StructureDescription(BaseModel): @@ -37,6 +39,8 @@ class StructureDescription(BaseModel): layerReason: str = Field(description="Explanation why this fits the chosen layer but not others.") secdfdTypes: list[str] = Field(default_factory=list, description="One or more SecDFD classifications from the provided options.") secdfdEvidence: str = Field(default="", description="Short evidence summary for the SecDFD classifications.") + stereocodeClassStereotype: str = Field(default="", description="Stereocode class stereotype selected from the provided options.") + stereocodeClassStereotypeReason: str = Field(default="", description="One-sentence explanation for the chosen Stereocode class stereotype.") class ComponentDescription(BaseModel): From c200ac54b57eff52b5d018b50a6029fdaa0c353c Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 6 May 2026 15:04:47 +0700 Subject: [PATCH 33/34] Add setup.py --- setup.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6218ab3 --- /dev/null +++ b/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup, find_packages + +def read_requirements(): + with open('requirements.txt') as req: + return [line.strip() for line in req if line.strip() and not line.startswith('#')] + +setup( + name="arcana", + version="0.1.0", + packages=find_packages(), + install_requires=read_requirements(), +) From 961bc6a9ae561aa5f5f846dcb02ec6eec1c5019f Mon Sep 17 00:00:00 2001 From: Satrio Adi Rukmono Date: Wed, 6 May 2026 16:57:24 +0700 Subject: [PATCH 34/34] Expand config example --- config.ini.example | 51 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/config.ini.example b/config.ini.example index 83f28ad..5d3adfa 100644 
--- a/config.ini.example +++ b/config.ini.example @@ -1,22 +1,41 @@ [project] name=zxing-3.5.3 desc=a Java application +# The input directory to be parsed, or the path to an existing graph JSON file if using just the LLM filter. +# If 'input' is missing or 'stdin', it reads from standard input. input=D:/Code/zxing-zxing-3.5.3/core/src/main/java/ +# The final JSON graph output path. If missing or 'stdout', prints to standard output. output=zxing-3.5.3-output.json +[seeder] +# Command string to invoke the extractor. Placeholders like {input}, {name}, and other keys will be formatted. +command={javaexe} -jar {jarfile} -i {input} -a -n {name} -f json +javaexe=./javapers/jdk-17.0.11+9-jre/bin/java.exe +jarfile=./javapers/javapers-1.1.2-jar-with-dependencies.jar + +[merge] +# Specifies the input graph for the merge_filter to process +input=another-graph-to-merge-with.json + [llm] apikey=example-apikey +# Optional custom base URL for local LLMs (e.g. Ollama, LMStudio) apibase=http://localhost:8000/v1 +# Name of the model. Defaults to 'gpt-4o-mini' if not specified. model=llama3 -; LLM chat completion http request timeout, in seconds. If not specified, the default is 5 minutes. +# HTTP request timeout in seconds. Defaults to 300 (5 minutes). timeout=120.0 - -[seeder] -command={javaexe} -jar {jarfile} -i {input} -a -n {name} -f json -javaexe=./javapers/jdk-17.0.11+9-jre/bin/java.exe -jarfile=./javapers/javapers-1.1.2-jar-with-dependencies.jar +# Whether to use OpenAI structured outputs (requires a compatible model). Defaults to true. +use_structured_output=true +# Maximum number of parallel worker threads for LLM API calls. Defaults to 8. +workers=8 +# Path to the file where checkpoints are saved. Defaults to 'checkpoints.jsonl'. +checkpoint_file=checkpoints.jsonl +# Set to true/1/yes/on to resume processing from the checkpoint_file. Defaults to false. 
+resume=true [layers] +# Defines architectural layers used during categorization layer1name=UI layer1desc=Handles user interface, such as instatiating, setting properties of, or laying out widget objects and capturing user interactions. layer2name=Logic @@ -24,11 +43,25 @@ layer2desc=Handles application and domain logic, i.e., neither UI nor data acces layer3name=Data layer3desc=Handles loading and storing data from/to external services, including database systems, web services, filesystems, hardware, etc. +[stereotypes] +# Similar to [layers], defines architectural role stereotypes for components/types +stereo1name=Controller +stereo1desc=A component that orchestrates interactions and controls the flow of data. +stereo2name=Repository +stereo2desc=A component that abstracts data access from the underlying storage. + [secdfd] -enabled=false +# Enables or disables the Security Data Flow Diagram processing step. Defaults to false. +enabled=true +# Float threshold for determining component categories based on semantic similarity/LLM label. Defaults to 0.60. label_score_threshold=0.60 +# Heuristic rules thresholds for DFD type classification process_min_out_invokes=2 process_min_in_invokes=2 -external_entity_max_participation=2 -datastore_crud_hit_min=1 asset_sensitive_term_hit_min=1 +datastore_crud_hit_min=1 +external_entity_max_participation=2 + +[stereocode] +# Enables or disables Stereocode method and class stereotype classification. Defaults to false. +enabled=true