From c8e789cd565e07d3b83a6f367454294263b368f6 Mon Sep 17 00:00:00 2001 From: Steven West Date: Mon, 14 Jul 2025 10:43:36 -0700 Subject: [PATCH] Add helper utilities to interface with databases to dapper-python library --- dataset-generation/Create_Linux_DB.py | 178 ++++++++++++--------- dataset-generation/Create_PyPI_DB.py | 85 ++-------- python/dapper_python/databases/database.py | 81 ++++++++++ python/dapper_python/databases/linuxDB.py | 68 ++++++++ python/dapper_python/databases/pythonDB.py | 92 +++++++++++ 5 files changed, 350 insertions(+), 154 deletions(-) create mode 100644 python/dapper_python/databases/database.py create mode 100644 python/dapper_python/databases/linuxDB.py create mode 100644 python/dapper_python/databases/pythonDB.py diff --git a/dataset-generation/Create_Linux_DB.py b/dataset-generation/Create_Linux_DB.py index 1d67773..2a730f2 100644 --- a/dataset-generation/Create_Linux_DB.py +++ b/dataset-generation/Create_Linux_DB.py @@ -37,11 +37,12 @@ from typing_extensions import Self +from dapper_python.databases.database import Database from dapper_python.normalize import NormalizedFileName, normalize_file_name @dataclass -class PackageInfo: +class PackageDetails: full_package_name: str file_path: PurePosixPath @@ -59,8 +60,8 @@ def __post_init__(self): @classmethod def from_linux_package_file(cls, line:str) -> Self: - """Creates a PackageInfo object out of a single line from the linux contents file - Uses simple parsing to split the line into package_name and file_path and then construct the PackageInfo object + """Creates a PackageDetails object out of a single line from the linux contents file + Uses simple parsing to split the line into package_name and file_path and then construct the PackageDetails object :param line: A line of text from the linux contents file :return: The package info for that line @@ -72,7 +73,91 @@ def from_linux_package_file(cls, line:str) -> Self: ) -def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: 
+class LinuxDatabase(Database): + + def __init__(self, db_path:Path) -> None: + super().__init__(db_path, mode='rwc') + self._init_database() + + def _init_database(self) -> None: + with self.cursor() as cursor: + #Would there be any benefit to having a separate package table + #Which the files table references as a foreign key vs directly saving the package into the files table? + create_table_cmd = """ + CREATE TABLE + IF NOT EXISTS package_files( + id INTEGER PRIMARY KEY, + file_name TEXT, + normalized_file_name TEXT, + file_path TEXT, + package_name TEXT, + full_package_name TEXT + ) + """ + cursor.execute(create_table_cmd) + + #Index the filename column for fast lookups + #Currently does not index package name as use case does not require fast lookups on package name and reduces filesize + index_cmd = """ + CREATE INDEX idx_file_name + ON package_files(file_name); + """ + cursor.execute(index_cmd) + index_cmd = """ + CREATE INDEX idx_normalized_file_name + ON package_files(normalized_file_name); + """ + cursor.execute(index_cmd) + + #Metadata information about dataset + create_table_cmd = """ + CREATE TABLE + IF NOT EXISTS dataset_version( + version INTEGER PRIMARY KEY, + format TEXT, + timestamp INTEGER + ) + """ + cursor.execute(create_table_cmd) + + def set_version(self, version:int) -> None: + with self.cursor() as cursor: + metadata_remove_cmd = """ + DELETE FROM dataset_version + """ + cursor.execute(metadata_remove_cmd) + + metadata_add_cmd = """ + INSERT INTO dataset_version(version, format, timestamp) + VALUES (?, "Linux", ?) + """ + cursor.execute(metadata_add_cmd, (version, int(datetime.now().timestamp()))) + + def add_package(self, package_details:PackageDetails) -> None: + #Lower seems like it should work? 
As far as the OS is concerned ß.json is not the same file as ss.json + normalized_file = normalize_file_name(package_details.file_name) + match normalized_file: + case str(name): + normalized_file_name = name.lower() + case NormalizedFileName(): + normalized_file_name = normalized_file.name.lower() + case _: + raise TypeError(f"Failed to normalize file: {package_details.file_name}") + + cursor = self.cursor() + insert_cmd = """ + INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name) + VALUES (?, ?, ?, ?, ?) + """ + + cursor.execute( + insert_cmd, + (package_details.file_name, normalized_file_name, str(package_details.file_path), + package_details.package_name, package_details.full_package_name,) + ) + + +def read_package_data(uri: str | Path, *, encoding='utf-8') -> TextIOWrapper: """Reads a file either from disk or by downloading it from the provided URL Will attempt to read the provided file as a text file @@ -83,7 +168,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: if isinstance(uri, Path): if not uri.exists(): raise FileNotFoundError(f"File {uri} does not exist") - return TextIOWrapper(FileIO(uri, mode='rb'), encoding=encoding) elif isinstance(uri, str): @@ -99,11 +183,11 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: content = BytesIO() progress_bar = tqdm( - total=file_size, - desc='Downloading package file', colour='blue', - unit='B', unit_divisor=1024, unit_scale=True, - position=None, leave=None, - ) + total=file_size, + desc='Downloading package file', colour='blue', + unit='B', unit_divisor=1024, unit_scale=True, + position=None, leave=None, + ) with progress_bar: for chunk in web_request.iter_content(chunk_size=8*1024): content.write(chunk) @@ -125,7 +209,6 @@ def read_data(uri:str|Path, *, encoding='utf-8') -> TextIOWrapper: else: raise TypeError(f"Invalid input: {uri}") - def main(): parser = argparse.ArgumentParser( description="Create Linux DB by parsing 
the Linux Contents file" @@ -156,31 +239,11 @@ def main(): if args.output.exists(): raise FileExistsError(f"File {args.output} already exists") - file = read_data(args.input) + file = read_package_data(args.input) line_count = sum(1 for _ in file) file.seek(0) - with sqlite3.connect(args.output) as db: - cursor = db.cursor() - - #Would there be any benefit to having a separate package table - #Which the files table references as a foreign key vs directly saving the package into the files table? - create_table_cmd = """ - CREATE TABLE package_files( - id INTEGER PRIMARY KEY, - file_name TEXT, - normalized_file_name TEXT, - file_path TEXT, - package_name TEXT, - full_package_name TEXT - ) - """ - cursor.execute(create_table_cmd) - - insert_cmd = """ - INSERT INTO package_files(file_name, normalized_file_name, file_path, package_name, full_package_name) - VALUES (?, ?, ?, ?, ?) - """ + with LinuxDatabase(args.output) as db: progress_iter = tqdm( file, total=line_count, @@ -188,53 +251,10 @@ def main(): unit='Entry', ) for line in progress_iter: - package = PackageInfo.from_linux_package_file(line) - - #Lower seems like it should work? 
As far as the OS is concerned ß.json is not the same file as ss.json - normalized_file = normalize_file_name(package.file_name) - match normalized_file: - case str(name): - normalized_name = name.lower() - case NormalizedFileName(): - normalized_name = normalized_file.name.lower() - case _: - raise TypeError(f"Failed to normalize file: {package.file_name}") - - cursor.execute( - insert_cmd, - ( - package.file_name, normalized_name, str(package.file_path), - package.package_name, package.full_package_name, - ) - ) + package = PackageDetails.from_linux_package_file(line) + db.add_package(package) - #Index the filename colum for fast lookups - #Currently does not index package name as use case does not require fast lookups on package name and reduces filesize - index_cmd = """ - CREATE INDEX idx_file_name - ON package_files(file_name); - """ - cursor.execute(index_cmd) - index_cmd = """ - CREATE INDEX idx_normalized_file_name - ON package_files(normalized_file_name); - """ - cursor.execute(index_cmd) - - #Metadata information about table - create_table_cmd = """ - CREATE TABLE dataset_version( - version INTEGER PRIMARY KEY, - format TEXT, - timestamp INTEGER - ) - """ - cursor.execute(create_table_cmd) - metadata_add_cmd = """ - INSERT INTO dataset_version(version, format, timestamp) - VALUES (?, "Linux", ?) 
- """ - cursor.execute(metadata_add_cmd, (args.version, int(datetime.now().timestamp()))) + db.set_version(args.version) if __name__ == "__main__": main() \ No newline at end of file diff --git a/dataset-generation/Create_PyPI_DB.py b/dataset-generation/Create_PyPI_DB.py index a3948b7..7699a58 100644 --- a/dataset-generation/Create_PyPI_DB.py +++ b/dataset-generation/Create_PyPI_DB.py @@ -52,6 +52,7 @@ from typing import Generator, Iterable, Any from typing_extensions import Self +from dapper_python.databases.database import Database from dapper_python.normalize import normalize_file_name @@ -82,74 +83,21 @@ class FileDetails: -T = TypeVar("T") -P = ParamSpec("P") -class PyPIDatabase: +class PyPIDatabase(Database): """Handles reading from and writing to the database - SQLite doesn't support stored procedures, so this class contains several pre-defined function (e.g add_package_imports) + SQLite doesn't support stored procedures, so this class contains several pre-defined functions (e.g add_package_imports) Which take their place of an API-of-sorts for an operation with several backend steps - - Connection opening/closing is handled using context manager - So it should be used as - with PyPIDatabase() as db: #Database is opened here - - #Database is closed here """ - class TransactionCursor(sqlite3.Cursor): - """A modified subclass of sqlite3.Cursor that adds support for transaction handling inside a context manager - - The PyPIDatabase class already uses the context manager to handle database opening/closing - So it can't be used in the same way that the sqlite3.Connection class does to handle transactions - - Since the cursor class does not implement its own context manager (and is normally used to run commands anyway) - It is used to add similar functionality - - This allows for running multiple commands inside the context which will all be treated as part of a transaction - Then upon exiting, all changes are commited, or if an exception occurs, the transaction 
is rolled back and none are commited - This helps ensure atomicity with multiple commands - """ - def __enter__(self) -> Self: - self.connection.__enter__() - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: - self.connection.__exit__(exc_type, exc_val, exc_tb) - return False - - @staticmethod - def _requires_connection(func:Callable[[P], T]) -> Callable[[P], T]: - """Wrapper function which is used to decorate functions that require a database connection to work - If a decorated function is called when the database is not open/connected, then an exception is raised - """ - @functools.wraps(func) - def wrapper(self:PyPIDatabase, *args:P.args, **kwargs:P.kwargs) -> T: - if self._database is None: - raise sqlite3.ProgrammingError("Cannot operate on a closed database") - return func(self, *args, **kwargs) - return wrapper - def __init__(self, db_path:Path): - self._db_path = db_path - self._database:sqlite3.Connection|None = None - - def __enter__(self) -> Self: - self._database = sqlite3.connect(self._db_path) + super().__init__(db_path, mode='rwc') self._init_database() - return self - - def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: - if self._database is not None: - self._database.close() - self._database = None - return False - @_requires_connection def _init_database(self) -> None: """Initializes the database to create the required tables, indexes, and views """ - with self.get_cursor() as cursor: + with self.cursor() as cursor: #The last_serial column is used for checking values when resuming from a partially-built DB or updating the DB create_table_cmd = """ CREATE TABLE @@ -241,9 +189,8 @@ def _init_database(self) -> None: cursor.execute(create_table_cmd) - @_requires_connection def set_version(self, version:int) -> None: - with self.get_cursor() as cursor: + with self.cursor() as cursor: #We only want a single version information row, so if the table already has values, clear it metadata_remove_cmd = """ 
DELETE FROM dataset_version @@ -256,16 +203,6 @@ def set_version(self, version:int) -> None: """ cursor.execute(metadata_add_cmd, (version, int(datetime.now().timestamp()))) - @_requires_connection - def get_cursor(self) -> TransactionCursor: - """Gets a cursor for interacting with the database - Uses the TransactionCursor subclass to allow for automatic transaction handling - - :return: Cursor object with additional support for use with a context manager to automatically run transactions - """ - return self._database.cursor(factory=self.TransactionCursor) - - @_requires_connection def get_processed_packages(self) -> Generator[tuple[str,int], None, None]: """Gets a list of all the packages that have been added to the database along with their serial number @@ -274,18 +211,17 @@ def get_processed_packages(self) -> Generator[tuple[str,int], None, None]: :return: A generator of tuples with the format (package_name, serial) """ - cursor = self.get_cursor() + cursor = self.cursor() package_query = """ SELECT package_name, last_serial FROM packages """ packages = ( (package_name, last_serial) - for package_name, last_serial, *_ in cursor.execute(package_query).fetchall() + for package_name, last_serial, *_ in cursor.execute(package_query).fetchall_chunked() ) yield from packages - @_requires_connection def add_package(self, package_details:PackageDetails) -> None: """Adds a package and import names to the database @@ -298,7 +234,7 @@ def add_package(self, package_details:PackageDetails) -> None: #Nothing useful to add to the database return - with self.get_cursor() as cursor: + with self.cursor() as cursor: insert_package_cmd = """ INSERT INTO packages(package_name, last_serial) values (?, ?) 
@@ -326,7 +262,6 @@ def add_package(self, package_details:PackageDetails) -> None: ) cursor.executemany(insert_file_cmd, data) - @_requires_connection def remove_packages(self, package_names:str|Iterable[str]) -> None: """Removes the specified package(s) from the database @@ -338,7 +273,7 @@ def remove_packages(self, package_names:str|Iterable[str]) -> None: if isinstance(package_names, str): package_names = (package_names, ) - with self.get_cursor() as cursor: + with self.cursor() as cursor: remove_package_cmd = """ DELETE FROM packages WHERE package_name = ? diff --git a/python/dapper_python/databases/database.py b/python/dapper_python/databases/database.py new file mode 100644 index 0000000..fd208ea --- /dev/null +++ b/python/dapper_python/databases/database.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import sqlite3 +import functools + +from pathlib import Path + +from typing import Literal, Callable, TypeVar, ParamSpec +from typing import Any +from collections.abc import Generator + + +T = TypeVar("T") +P = ParamSpec("P") +class Database: + """Base class for databases""" + + class Cursor(sqlite3.Cursor): + """A modified subclass of sqlite3.Cursor that adds support for transaction handling inside a context manager + + This allows for running multiple commands inside the context which will all be treated as part of a transaction + Upon exiting, all changes are committed, or if an exception occurs, the transaction is rolled back and none are committed + This helps ensure atomicity with multiple commands + Behaves the same as using the connection's context manager, but is usable on the cursor object as well + + Adds support for streaming_based functionality similar to fetch_all but does not read all values in memory + Allowing for lighter-weight memory usage + """ + #TODO: Potentially replace with typing_extensions.self + def __enter__(self) -> Database.Cursor: + self.connection.__enter__() + return self + + def __exit__(self, exc_type, exc_val, 
exc_tb) -> Literal[False]: + self.connection.__exit__(exc_type, exc_val, exc_tb) + return False + + def fetchall_chunked(self, *, chunk_size:int=1024) -> Generator[Any, None, None]: + """Streams the results of previously executed command(s) + + Behaves similarly to fetchall() in that it will retrieve all results + But fetches them in chunks of N entries (specified by chunk_size) and yields each entry one at a time + Instead of loading all results into memory to reduce memory overhead + """ + while True: + entries = self.fetchmany(chunk_size) + if not entries: + return + yield from entries + + @staticmethod + def _requires_connection(func:Callable[[P], T]) -> Callable[[P], T]: + """Wrapper function which is used to decorate functions that require a database connection to work + If a decorated function is called when the database is not open/connected, then an exception is raised + """ + @functools.wraps(func) + def wrapper(self:Database, *args:P.args, **kwargs:P.kwargs) -> T: + if self._db is None: + raise sqlite3.ProgrammingError("Cannot operate on a closed database") + return func(self, *args, **kwargs) + return wrapper + + def __init__(self, db_path:Path, *, mode='rw') -> None: + #Base class defaults to allow both reading and writing + #Implementations intended for end-use should set to read-only + self._db_path = db_path + + uri = f'file:{db_path}?mode={mode}' + self._db = sqlite3.connect(uri, uri=True) + + def __enter__(self) -> Database: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: + return False + + @_requires_connection + def cursor(self) -> Database.Cursor: + #Allow user to access the cursor to perform queries outside of things we've thought of + #End-use applications should be opened in readonly mode to prevent changes, and only used for querying + return self._db.cursor(factory=self.Cursor) \ No newline at end of file diff --git a/python/dapper_python/databases/linuxDB.py b/python/dapper_python/databases/linuxDB.py 
new file mode 100644 index 0000000..33d30dc --- /dev/null +++ b/python/dapper_python/databases/linuxDB.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from pathlib import PurePosixPath, Path +from dataclasses import dataclass + +from collections.abc import Generator + +from dapper_python.normalize import normalize_file_name +from dapper_python.databases.database import Database + + +@dataclass +class PackageFile: + file_name: str + normalized_name: str + file_path: PurePosixPath + package_name: str + full_package_name: str + + +class LinuxDB(Database): + """Helper class to read and query the Linux dataset + + Contains some of the most commonly used query functionality + And also exposes the database to enable specific queries for specific use cases + """ + + def __init__(self, db_path:Path) -> None: + super().__init__(db_path, mode='ro') + + def list_packages(self) -> Generator[str]: + """Lists names of all packages available in the database""" + cursor = self.cursor() + query = """ + SELECT DISTINCT package_name + FROM package_files + ORDER BY package_name + """ + + packages = ( + pkg + for pkg, *_ in cursor.execute(query).fetchall_chunked() + ) + yield from packages + + def query_filename(self, file_name:str, *, normalize=True) -> list[PackageFile]: + if normalize: + file_name = str(normalize_file_name(file_name)) + + cursor = self.cursor() + query = f""" + SELECT (file_name, normalized_file_name, file_path, package_name, full_package_name) + FROM package_files + WHERE {'normalized_file_name' if normalize else 'file_name'} = ? 
+ """ + + package_files = [ + PackageFile( + file_name=file_name, + normalized_name=normalized_filename, + file_path=PurePosixPath(file_path), + package_name=package_name, + full_package_name=full_package_name + ) + for file_name, normalized_filename, file_path, package_name, full_package_name, *_ + in cursor.execute(query, (file_name,)).fetchall_chunked() + ] + return package_files diff --git a/python/dapper_python/databases/pythonDB.py b/python/dapper_python/databases/pythonDB.py new file mode 100644 index 0000000..9cc4c1c --- /dev/null +++ b/python/dapper_python/databases/pythonDB.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from pathlib import PurePosixPath, Path +from dataclasses import dataclass + +from collections.abc import Generator + +from dapper_python.databases.database import Database + + +@dataclass +class PackageFile: + file_name: str + normalized_file_name: str + file_path: PurePosixPath + mime_type: str + magic_string: str # Magic_string is named as such as it is the result string from checking the file with libmagic + + +class PythonDB(Database): + """Helper class to read and query the PyPI dataset + + Contains some of the most commonly used query functionality + And also exposes the database to enable specific queries for specific use cases + """ + + def __init__(self, db_path:Path) -> None: + super().__init__(db_path, mode='ro') + + def list_packages(self) -> Generator[str, None, None]: + """Lists names of all packages available in the database""" + cursor = self.cursor() + query = """ + SELECT DISTINCT package_name + FROM packages + ORDER BY package_name + """ + + packages = ( + pkg + for pkg, *_ in cursor.execute(query).fetchall_chunked() + ) + yield from packages + + def query_import(self, import_name:str) -> list[str]: + """Queries packages by their import name + The package(s) often use the same name for both the package and the import, but this is not always the case. 
+ + E.g: + Numpy is imported as "numpy" + Whereas BeautifulSoup[4] is imported as "bs4" + """ + cursor = self.cursor() + query = """ + SELECT package_name + FROM packages JOIN package_imports ON packages.id = package_imports.package_id + WHERE import_as = ? + """ + + packages = [ + package + for package, *_ in cursor.execute(query, (import_name,)).fetchall_chunked() + ] + return packages + + + def query_package_files(self, package_name:str) -> list[PackageFile]: + """Gets a list of the files contained in the package + Along with additional details of the file inspected with libmagic + + File lists are extracted from the published tarball for the package on PyPI + Paths are relative to the root directory of the tarball + """ + cursor = self.cursor() + query = """ + SELECT (file_name, normalized_file_name, file_path, mime_type, magic_string) + FROM packages JOIN package_files ON packages.id = package_files.package_id + WHERE package_name = ? + """ + + package_files = [ + PackageFile( + file_name=file_name, + normalized_file_name=normalized_file_name, + file_path=PurePosixPath(file_path), + mime_type=mime_type, + magic_string=magic_string, + ) + for file_name, normalized_file_name, file_path, mime_type, magic_string, *_ + in cursor.execute(query, (package_name,)).fetchall_chunked() + ] + return package_files