From c8e789cd565e07d3b83a6f367454294263b368f6 Mon Sep 17 00:00:00 2001 From: Steven West Date: Mon, 14 Jul 2025 10:43:36 -0700 Subject: [PATCH] Add helper utilities to interface with databases to dapper-python library --- dataset-generation/Create_Linux_DB.py | 178 ++++++++++++--------- dataset-generation/Create_PyPI_DB.py | 85 ++-------- python/dapper_python/databases/database.py | 81 ++++++++++ python/dapper_python/databases/linuxDB.py | 68 ++++++++ python/dapper_python/databases/pythonDB.py | 92 +++++++++++ 5 files changed, 350 insertions(+), 154 deletions(-) create mode 100644 python/dapper_python/databases/database.py create mode 100644 python/dapper_python/databases/linuxDB.py create mode 100644 python/dapper_python/databases/pythonDB.py diff --git a/dataset-generation/Create_Linux_DB.py b/dataset-generation/Create_Linux_DB.py index 1d67773..2a730f2 100644 --- a/dataset-generation/Create_Linux_DB.py +++ b/dataset-generation/Create_Linux_DB.py @@ -37,11 +37,12 @@ from typing_extensions import Self +from dapper_python.databases.database import Database from dapper_python.normalize import NormalizedFileName, normalize_file_name @dataclass -class PackageInfo: +class PackageDetails: full_package_name: str file_path: PurePosixPath @@ -59,8 +60,8 @@ def __post_init__(self): @classmethod def from_linux_package_file(cls, line:str) -> Self: - """Creates a PackageInfo object out of a single line from the linux contents file - Uses simple parsing to split the line into package_name and file_path and then construct the PackageInfo object + """Creates a PackageDetails object out of a single line from the linux contents file + Uses simple parsing to split the line into package_name and file_path and then construct the PackageDetails object :param line: A line of text from the linux contents file :return: The package info for that line @@ -72,7 +73,91 @@ def from_linux_package_file(cls, line:str) -> Self: ) -def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: 
+class LinuxDatabase(Database): + + def __init__(self, db_path:Path) -> None: + super().__init__(db_path, mode='rwc') + self._init_database() + + def _init_database(self) -> None: + with self.cursor() as cursor: + #Would there be any benefit to having a separate package table + #Which the files table references as a foreign key vs directly saving the package into the files table? + create_table_cmd = """ + CREATE TABLE + IF NOT EXISTS package_files( + id INTEGER PRIMARY KEY, + file_name TEXT, + normalized_file_name TEXT, + file_path TEXT, + package_name TEXT, + full_package_name TEXT + ) + """ + cursor.execute(create_table_cmd) + + #Index the filename column for fast lookups + #Currently does not index package name as use case does not require fast lookups on package name and reduces filesize + index_cmd = """ + CREATE INDEX idx_file_name + ON package_files(file_name); + """ + cursor.execute(index_cmd) + index_cmd = """ + CREATE INDEX idx_normalized_file_name + ON package_files(normalized_file_name); + """ + cursor.execute(index_cmd) + + #Metadata information about dataset + create_table_cmd = """ + CREATE TABLE + IF NOT EXISTS dataset_version( + version INTEGER PRIMARY KEY, + format TEXT, + timestamp INTEGER + ) + """ + cursor.execute(create_table_cmd) + + def set_version(self, version:int) -> None: + with self.cursor() as cursor: + metadata_remove_cmd = """ + DELETE FROM dataset_version + """ + cursor.execute(metadata_remove_cmd) + + metadata_add_cmd = """ + INSERT INTO dataset_version(version, format, timestamp) + VALUES (?, "Linux", ?) + """ + cursor.execute(metadata_add_cmd, (version, int(datetime.now().timestamp()))) + + def add_package(self, package_details:PackageDetails) -> None: + #Lower seems like it should work? 
As far as the OS is concerned ß.json is not the same file as ss.json + normalized_file = normalize_file_name(package_details.file_name) + match normalized_file: + case str(name): + normalized_file_name = name.lower() + case NormalizedFileName(): + normalized_file_name = normalized_file.name.lower() + case _: + raise TypeError(f"Failed to normalize file: {package_details.file_name}") + + cursor = self.cursor() + insert_cmd = """ + INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name) + VALUES (?, ?, ?, ?, ?) + """ + + cursor.execute( + insert_cmd, + (package_details.file_name, normalized_file_name, str(package_details.file_path), + package_details.package_name, package_details.full_package_name,) + ) + + +def read_package_data(uri: str | Path, *, encoding='utf-8') -> TextIOWrapper: """Reads a file either from disk or by downloading it from the provided URL Will attempt to read the provided file as a text file @@ -83,7 +168,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: if isinstance(uri, Path): if not uri.exists(): raise FileNotFoundError(f"File {uri} does not exist") - return TextIOWrapper(FileIO(uri, mode='rb'), encoding=encoding) elif isinstance(uri, str): @@ -99,11 +183,11 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: content = BytesIO() progress_bar = tqdm( - total=file_size, - desc='Downloading package file', colour='blue', - unit='B', unit_divisor=1024, unit_scale=True, - position=None, leave=None, - ) + total=file_size, + desc='Downloading package file', colour='blue', + unit='B', unit_divisor=1024, unit_scale=True, + position=None, leave=None, + ) with progress_bar: for chunk in web_request.iter_content(chunk_size=8*1024): content.write(chunk) @@ -125,7 +209,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: else: raise TypeError(f"Invalid input: {uri}") - def main(): parser = argparse.ArgumentParser( description="Create Linux DB by parsing 
the Linux Contents file" @@ -156,31 +239,11 @@ def main(): if args.output.exists(): raise FileExistsError(f"File {args.output} already exists") - file = read_data(args.input) + file = read_package_data(args.input) line_count = sum(1 for _ in file) file.seek(0) - with sqlite3.connect(args.output) as db: - cursor = db.cursor() - - #Would there be any benefit to having a separate package table - #Which the files table references as a foreign key vs directly saving the package into the files table? - create_table_cmd = """ - CREATE TABLE package_files( - id INTEGER PRIMARY KEY, - file_name TEXT, - normalized_file_name TEXT, - file_path TEXT, - package_name TEXT, - full_package_name TEXT - ) - """ - cursor.execute(create_table_cmd) - - insert_cmd = """ - INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name) - VALUES (?, ?, ?, ?, ?) - """ + with LinuxDatabase(args.output) as db: progress_iter = tqdm( file, total=line_count, @@ -188,53 +251,10 @@ def main(): unit='Entry', ) for line in progress_iter: - package = PackageInfo.from_linux_package_file(line) - - #Lower seems like it should work? 
As far as the OS is concerned ß.json is not the same file as ss.json - normalized_file = normalize_file_name(package.file_name) - match normalized_file: - case str(name): - normalized_name = name.lower() - case NormalizedFileName(): - normalized_name = normalized_file.name.lower() - case _: - raise TypeError(f"Failed to normalize file: {package.file_name}") - - cursor.execute( - insert_cmd, - ( - package.file_name, normalized_name, str(package.file_path), - package.package_name, package.full_package_name, - ) - ) + package = PackageDetails.from_linux_package_file(line) + db.add_package(package) - #Index the filename colum for fast lookups - #Currently does not index package name as use case does not require fast lookups on package name and reduces filesize - index_cmd = """ - CREATE INDEX idx_file_name - ON package_files(file_name); - """ - cursor.execute(index_cmd) - index_cmd = """ - CREATE INDEX idx_normalized_file_name - ON package_files(normalized_file_name); - """ - cursor.execute(index_cmd) - - #Metadata information about table - create_table_cmd = """ - CREATE TABLE dataset_version( - version INTEGER PRIMARY KEY, - format TEXT, - timestamp INTEGER - ) - """ - cursor.execute(create_table_cmd) - metadata_add_cmd = """ - INSERT INTO dataset_version(version, format, timestamp) - VALUES (?, "Linux", ?) 
- """ - cursor.execute(metadata_add_cmd, (args.version, int(datetime.now().timestamp()))) + db.set_version(args.version) if __name__ == "__main__": main() \ No newline at end of file diff --git a/dataset-generation/Create_PyPI_DB.py b/dataset-generation/Create_PyPI_DB.py index a3948b7..7699a58 100644 --- a/dataset-generation/Create_PyPI_DB.py +++ b/dataset-generation/Create_PyPI_DB.py @@ -52,6 +52,7 @@ from typing import Generator, Iterable, Any from typing_extensions import Self +from dapper_python.databases.database import Database from dapper_python.normalize import normalize_file_name @@ -82,74 +83,21 @@ class FileDetails: -T = TypeVar("T") -P = ParamSpec("P") -class PyPIDatabase: +class PyPIDatabase(Database): """Handles reading from and writing to the database - SQLite doesn't support stored procedures, so this class contains several pre-defined function (e.g add_package_imports) + SQLite doesn't support stored procedures, so this class contains several pre-defined functions (e.g add_package_imports) Which take their place of an API-of-sorts for an operation with several backend steps - - Connection opening/closing is handled using context manager - So it should be used as - with PyPIDatabase() as db: #Database is opened here - - #Database is closed here """ - class TransactionCursor(sqlite3.Cursor): - """A modified subclass of sqlite3.Cursor that adds support for transaction handling inside a context manager - - The PyPIDatabase class already uses the context manager to handle database opening/closing - So it can't be used in the same way that the sqlite3.Connection class does to handle transactions - - Since the cursor class does not implement its own context manager (and is normally used to run commands anyway) - It is used to add similar functionality - - This allows for running multiple commands inside the context which will all be treated as part of a transaction - Then upon exiting, all changes are commited, or if an exception occurs, the transaction 
is rolled back and none are commited - This helps ensure atomicity with multiple commands - """ - def __enter__(self) -> Self: - self.connection.__enter__() - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: - self.connection.__exit__(exc_type, exc_val, exc_tb) - return False - - @staticmethod - def _requires_connection(func:Callable[[P], T]) -> Callable[[P], T]: - """Wrapper function which is used to decorate functions that require a database connection to work - If a decorated function is called when the database is not open/connected, then an exception is raised - """ - @functools.wraps(func) - def wrapper(self:PyPIDatabase, *args:P.args, **kwargs:P.kwargs) -> T: - if self._database is None: - raise sqlite3.ProgrammingError("Cannot operate on a closed database") - return func(self, *args, **kwargs) - return wrapper - def __init__(self, db_path:Path): - self._db_path = db_path - self._database:sqlite3.Connection|None = None - - def __enter__(self) -> Self: - self._database = sqlite3.connect(self._db_path) + super().__init__(db_path, mode='rwc') self._init_database() - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: - if self._database is not None: - self._database.close() - self._database = None - return False - @_requires_connection def _init_database(self) -> None: """Initializes the database to create the required tables, indexes, and views """ - with self.get_cursor() as cursor: + with self.cursor() as cursor: #The last_serial column is used for checking values when resuming from a partially-built DB or updating the DB create_table_cmd = """ CREATE TABLE @@ -241,9 +189,8 @@ def _init_database(self) -> None: cursor.execute(create_table_cmd) - @_requires_connection def set_version(self, version:int) -> None: - with self.get_cursor() as cursor: + with self.cursor() as cursor: #We only want a single version information row, so if the table already has values, clear it metadata_remove_cmd = """ 
DELETE FROM dataset_version @@ -256,16 +203,6 @@ def set_version(self, version:int) -> None: """ cursor.execute(metadata_add_cmd, (version, int(datetime.now().timestamp()))) - @_requires_connection - def get_cursor(self) -> TransactionCursor: - """Gets a cursor for interacting with the database - Uses the TransactionCursor subclass to allow for automatic transaction handling - - :return: Cursor object with additional support for use with a context manager to automatically run transactions - """ - return self._database.cursor(factory=self.TransactionCursor) - - @_requires_connection def get_processed_packages(self) -> Generator[tuple[str,int], None, None]: """Gets a list of all the packages that have been added to the database along with their serial number @@ -274,18 +211,17 @@ def get_processed_packages(self) -> Generator[tuple[str,int], None, None]: :return: A generator of tuples with the format (package_name, serial) """ - cursor = self.get_cursor() + cursor = self.cursor() package_query = """ SELECT package_name, last_serial FROM packages """ packages = ( (package_name, last_serial) - for package_name, last_serial, *_ in cursor.execute(package_query).fetchall() + for package_name, last_serial, *_ in cursor.execute(package_query).fetchall_chunked() ) yield from packages - @_requires_connection def add_package(self, package_details:PackageDetails) -> None: """Adds a package and import names to the database @@ -298,7 +234,7 @@ def add_package(self, package_details:PackageDetails) -> None: #Nothing useful to add to the database return - with self.get_cursor() as cursor: + with self.cursor() as cursor: insert_package_cmd = """ INSERT INTO packages(package_name, last_serial) values (?, ?) 
@@ -326,7 +262,6 @@ def add_package(self, package_details:PackageDetails) -> None: ) cursor.executemany(insert_file_cmd, data) - @_requires_connection def remove_packages(self, package_names:str|Iterable[str]) -> None: """Removes the specified package(s) from the database @@ -338,7 +273,7 @@ def remove_packages(self, package_names:str|Iterable[str]) -> None: if isinstance(package_names, str): package_names = (package_names, ) - with self.get_cursor() as cursor: + with self.cursor() as cursor: remove_package_cmd = """ DELETE FROM packages WHERE package_name = ? diff --git a/python/dapper_python/databases/database.py b/python/dapper_python/databases/database.py new file mode 100644 index 0000000..fd208ea --- /dev/null +++ b/python/dapper_python/databases/database.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import sqlite3 +import functools + +from pathlib import Path + +from typing import Literal, Callable, TypeVar, ParamSpec +from typing import Any +from collections.abc import Generator + + +T = TypeVar("T") +P = ParamSpec("P") +class Database: + """Base class for databases""" + + class Cursor(sqlite3.Cursor): + """A modified subclass of sqlite3.Cursor that adds support for transaction handling inside a context manager + + This allows for running multiple commands inside the context which will all be treated as part of a transaction + Upon exiting, all changes are committed, or if an exception occurs, the transaction is rolled back and none are committed + This helps ensure atomicity with multiple commands + Behaves the same as using the connection's context manager, but is usable on the cursor object as well + + Adds support for streaming_based functionality similar to fetch_all but does not read all values in memory + Allowing for lighter-weight memory usage + """ + #TODO: Potentially replace with typing_extensions.self + def __enter__(self) -> Database.Cursor: + self.connection.__enter__() + return self + + def __exit__(self, exc_type, exc_val, 
exc_tb) -> Literal[False]: + self.connection.__exit__(exc_type, exc_val, exc_tb) + return False + + def fetchall_chunked(self, *, chunk_size:int=1024) -> Generator[Any, None, None]: + """Streams the results of previously executed command(s) + + Behaves similarly to fetchall() in that it will retrieve all results + But fetches them in chunks of N entries (specified by chunk_size) and yields each entry one at a time + Instead of loading all results into memory to reduce memory overhead + """ + while True: + entries = self.fetchmany(chunk_size) + if not entries: + return + yield from entries + + @staticmethod + def _requires_connection(func:Callable[[P], T]) -> Callable[[P], T]: + """Wrapper function which is used to decorate functions that require a database connection to work + If a decorated function is called when the database is not open/connected, then an exception is raised + """ + @functools.wraps(func) + def wrapper(self:Database, *args:P.args, **kwargs:P.kwargs) -> T: + if self._db is None: + raise sqlite3.ProgrammingError("Cannot operate on a closed database") + return func(self, *args, **kwargs) + return wrapper + + def __init__(self, db_path:Path, *, mode='rw') -> None: + #Base class defaults to allow both reading and writing + #Implementations intended for end-use should set to read-only + self._db_path = db_path + + uri = f'file:{db_path}?mode={mode}' + self._db = sqlite3.connect(uri, uri=True) + + def __enter__(self) -> Database: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: + return False + + @_requires_connection + def cursor(self) -> Database.Cursor: + #Allow user to access the cursor to perform queries outside of things we've thought of + #End-use applications should be opened in readonly mode to prevent changes, and only used for querying + return self._db.cursor(factory=self.Cursor) \ No newline at end of file diff --git a/python/dapper_python/databases/linuxDB.py b/python/dapper_python/databases/linuxDB.py 
new file mode 100644 index 0000000..33d30dc --- /dev/null +++ b/python/dapper_python/databases/linuxDB.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from pathlib import PurePosixPath, Path +from dataclasses import dataclass + +from collections.abc import Generator + +from dapper_python.normalize import normalize_file_name +from dapper_python.databases.database import Database + + +@dataclass +class PackageFile: + file_name: str + normalized_name: str + file_path: PurePosixPath + package_name: str + full_package_name: str + + +class LinuxDB(Database): + """Helper class to read and query the Linux dataset + + Contains some of the most commonly used query functionality + And also exposes the database to enable specific queries for specific use cases + """ + + def __init__(self, db_path:Path) -> None: + super().__init__(db_path, mode='ro') + + def list_packages(self) -> Generator[str]: + """Lists names of all packages available in the database""" + cursor = self.cursor() + query = """ + SELECT DISTINCT package_name + FROM package_files + ORDER BY package_name + """ + + packages = ( + pkg + for pkg, *_ in cursor.execute(query).fetchall_chunked() + ) + yield from packages + + def query_filename(self, file_name:str, *, normalize=True) -> list[PackageFile]: + if normalize: + file_name = str(normalize_file_name(file_name)) + + cursor = self.cursor() + query = f""" + SELECT (file_name, normalized_file_name, file_path, package_name, full_package_name) + FROM package_files + WHERE {'normalized_file_name' if normalize else 'file_name'} = ? 
+ """ + + package_files = [ + PackageFile( + file_name=file_name, + normalized_name=normalized_filename, + file_path=PurePosixPath(file_path), + package_name=package_name, + full_package_name=full_package_name + ) + for file_name, normalized_filename, file_path, package_name, full_package_name, *_ + in cursor.execute(query, (file_name,)).fetchall_chunked() + ] + return package_files diff --git a/python/dapper_python/databases/pythonDB.py b/python/dapper_python/databases/pythonDB.py new file mode 100644 index 0000000..9cc4c1c --- /dev/null +++ b/python/dapper_python/databases/pythonDB.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from pathlib import PurePosixPath, Path +from dataclasses import dataclass + +from collections.abc import Generator + +from dapper_python.databases.database import Database + + +@dataclass +class PackageFile: + file_name: str + normalized_file_name: str + file_path: PurePosixPath + mime_type: str + magic_string: str # Magic_string is named as such as it is the result string from checking the file with libmagic + + +class PythonDB(Database): + """Helper class to read and query the PyPI dataset + + Contains some of the most commonly used query functionality + And also exposes the database to enable specific queries for specific use cases + """ + + def __init__(self, db_path:Path) -> None: + super().__init__(db_path, mode='ro') + + def list_packages(self) -> Generator[str, None, None]: + """Lists names of all packages available in the database""" + cursor = self.cursor() + query = """ + SELECT DISTINCT package_name + FROM packages + ORDER BY package_name + """ + + packages = ( + pkg + for pkg, *_ in cursor.execute(query).fetchall_chunked() + ) + yield from packages + + def query_import(self, import_name:str) -> list[str]: + """Queries packages by their import name + The package(s) often use the same name for both the package and the import, but this is not always the case. 
+ + E.g: + Numpy is imported as "numpy" + Whereas BeautifulSoup[4] is imported as "bs4" + """ + cursor = self.cursor() + query = """ + SELECT package_name + FROM packages JOIN package_imports ON packages.id = package_imports.package_id + WHERE import_as = ? + """ + + packages = [ + package + for package, *_ in cursor.execute(query, (import_name,)).fetchall_chunked() + ] + return packages + + + def query_package_files(self, package_name:str) -> list[PackageFile]: + """Gets a list of the files contained in the package + Along with additional details of the file inspected with libmagic + + File lists are extracted from the published tarball for the package on PyPI + Paths are relative to the root directory of the tarball + """ + cursor = self.cursor() + query = """ + SELECT (file_name, normalized_file_name, file_path, mime_type, magic_string) + FROM packages JOIN package_files ON packages.id = package_files.package_id + WHERE package_name = ? + """ + + package_files = [ + PackageFile( + file_name=file_name, + normalized_file_name=normalized_file_name, + file_path=PurePosixPath(file_path), + mime_type=mime_type, + magic_string=magic_string, + ) + for file_name, normalized_file_name, file_path, mime_type, magic_string, *_ + in cursor.execute(query, (package_name,)).fetchall_chunked() + ] + return package_files