diff --git a/.gitignore b/.gitignore index 2090422..3468d10 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ tests/fixtures *.sf sqlitefid/* pathlesstaken/* +queries diff --git a/demystify.py b/demystify.py index 9ac9fac..e1ddcfa 100644 --- a/demystify.py +++ b/demystify.py @@ -111,7 +111,6 @@ def handle_output(analysis_results, txtout=False, rogues=False, heroes=False): "Please report any feedback you have around its accuracy and helpfulness. " "The feedback received will help improve the feature." ) - if txtout is True: logging.info("Outputting text report") textoutput = FormatAnalysisTextOutput(analysis_results) @@ -133,7 +132,9 @@ def handle_output(analysis_results, txtout=False, rogues=False, heroes=False): print(htmloutput.printHTMLResults().encode("utf8")) -def analysis_from_database(database_path, denylist=None, rogues=False, heroes=False): +def analysis_from_database( + database_path, denylist=None, rogues=False, heroes=False, audit=False +): """Analysis of format identification report from existing database. :param database_path: path to sqlite database containing analysis @@ -145,7 +146,7 @@ def analysis_from_database(database_path, denylist=None, rogues=False, heroes=Fa """ logging.info("Analysis from database: %s", database_path) try: - analysis = DemystifyAnalysis(database_path, get_config(), denylist) + analysis = DemystifyAnalysis(database_path, get_config(), denylist, audit) except AnalysisError as err: raise AnalysisError(err) rogue_analysis = False @@ -155,7 +156,9 @@ def analysis_from_database(database_path, denylist=None, rogues=False, heroes=Fa return analysis -def analysis_from_csv(format_report, analyze, denylist=None, rogues=None, heroes=None): +def analysis_from_csv( + format_report, analyze, denylist=None, rogues=None, heroes=None, audit=False +): """Analysis of format identification report from raw data, i.e. DROID CSV, SF YAML etc. 
@@ -176,7 +179,7 @@ def analysis_from_csv(format_report, analyze, denylist=None, rogues=None, heroes if analyze is not True: logging.error("Analysis is not set: %s", analyze) return - analysis = analysis_from_database(database_path, denylist, rogues, heroes) + analysis = analysis_from_database(database_path, denylist, rogues, heroes, audit) return analysis @@ -221,6 +224,11 @@ def main(): help="Output 'Heroes Gallery' listing", action="store_true", ) + parser.add_argument( + "--audit", + help="Output a listing of all database queries used for a report", + action="store_true", + ) start_time = time.time() if len(sys.argv) == 1: parser.print_help() @@ -233,16 +241,18 @@ def main(): if args.export: args.db = False analysis = analysis_from_csv( - args.export, True, denylist, args.rogues, args.heroes + args.export, True, denylist, args.rogues, args.heroes, args.audit ) if args.db: if not IdentifyDB().identify_export(args.db): logging.error("Not a recognized sqlite database: %s", args.db) sys.exit(1) - analysis = analysis_from_database(args.db, denylist, args.rogues, args.heroes) - if analysis: + analysis = analysis_from_database( + args.db, denylist, args.rogues, args.heroes, args.audit + ) + if analysis and not args.audit: handle_output(analysis.analysis_results, args.txt, args.rogues, args.heroes) - output_time(start_time) + output_time(start_time) logging.info("Demystify: ...analysis complete") diff --git a/libs/AnalysisQueriesClass.py b/libs/AnalysisQueriesClass.py index dd2186b..4125f86 100644 --- a/libs/AnalysisQueriesClass.py +++ b/libs/AnalysisQueriesClass.py @@ -14,38 +14,70 @@ class AnalysisQueries: analysis functions in Demystify. 
""" - SELECT_FILENAMES = "SELECT FILEDATA.NAME FROM FILEDATA" - SELECT_DIRNAMES = "SELECT DISTINCT FILEDATA.DIR_NAME FROM FILEDATA" - - SELECT_HASH = "SELECT DBMD.HASH_TYPE FROM DBMD" - SELECT_TOOL = "SELECT DBMD.TOOL_TYPE FROM DBMD" + SELECT_FILENAMES = ( + "/* demystify: select all filenames from the filedata table \n*/\n" + "SELECT FILEDATA.NAME FROM FILEDATA" + ) + SELECT_DIRNAMES = ( + "/* demystify: select distinct directory names from the database \n*/\n" + "SELECT DISTINCT FILEDATA.DIR_NAME FROM FILEDATA" + ) - SELECT_COLLECTION_SIZE = "SELECT SUM(FILEDATA.SIZE) FROM FILEDATA" - SELECT_COUNT_FILES = "SELECT COUNT(FILEDATA.FILE_ID) FROM FILEDATA WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')" + SELECT_HASH = ( + "/* demystify: select the checksum type used by the database \n*/\n" + "SELECT DBMD.HASH_TYPE FROM DBMD" + ) + SELECT_TOOL = ( + "/* demystify: select the format identification tool used \n*/\n" + "SELECT DBMD.TOOL_TYPE FROM DBMD" + ) + SELECT_COLLECTION_SIZE = ( + "/* demystify: sum the total bytes used by all files in the database \n*/\n" + "SELECT SUM(FILEDATA.SIZE) FROM FILEDATA" + ) + SELECT_COUNT_FILES = ( + "/* demystify: count all files in the database \n*/\n" + "SELECT COUNT(FILEDATA.FILE_ID) FROM FILEDATA WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')" + ) SELECT_COUNT_CONTAINERS = ( + "/* demystify: count the number of container files in the database \n*/\n" "SELECT COUNT(FILEDATA.FILE_ID) FROM FILEDATA WHERE FILEDATA.TYPE='Container'" ) SELECT_CONTAINER_TYPES = ( + "/* demystify: list extensions of container objects in the database \n*/\n" "SELECT DISTINCT FILEDATA.EXT FROM FILEDATA WHERE FILEDATA.TYPE='Container'" ) - SELECT_COUNT_FILES_IN_CONTAINERS = "SELECT COUNT(FILEDATA.FILE_ID) FROM FILEDATA WHERE (FILEDATA.URI_SCHEME!='file') AND (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')" - - SELECT_COUNT_ZERO_BYTE_FILES = "SELECT COUNT(FILEDATA.SIZE) FROM FILEDATA WHERE (FILEDATA.TYPE!='Folder') AND (FILEDATA.SIZE='0')" 
- SELECT_ZERO_BYTE_FILEPATHS = "SELECT FILEDATA.FILE_PATH FROM FILEDATA WHERE FILEDATA.TYPE='File' AND FILEDATA.SIZE='0'" - + SELECT_COUNT_FILES_IN_CONTAINERS = ( + "/* demystify: count the number of files in container objects, e.g. zip \n*/\n" + "SELECT COUNT(FILEDATA.FILE_ID) FROM FILEDATA WHERE (FILEDATA.URI_SCHEME!='file') AND (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')" + ) + SELECT_COUNT_ZERO_BYTE_FILES = ( + "/* demystify: count the number of zero-byte files in the database \n*/\n" + "SELECT COUNT(FILEDATA.SIZE) FROM FILEDATA WHERE (FILEDATA.TYPE!='Folder') AND (FILEDATA.SIZE='0')" + ) + SELECT_ZERO_BYTE_FILEPATHS = ( + "/* demystify: list the paths of the zero-byte files in the database \n*/\n" + "SELECT FILEDATA.FILE_PATH FROM FILEDATA WHERE FILEDATA.TYPE!='Folder' AND FILEDATA.SIZE='0'" + ) SELECT_COUNT_FOLDERS = ( + "/* demystify: count the number of folders in the database \n*/\n" "SELECT COUNT(FILEDATA.FILE_ID) FROM FILEDATA WHERE FILEDATA.TYPE='Folder'" ) - - SELECT_COUNT_UNIQUE_FILENAMES = "SELECT COUNT(DISTINCT FILEDATA.NAME) FROM FILEDATA WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')" + SELECT_COUNT_UNIQUE_FILENAMES = ( + "/* demystify: count the number of distinct file names in the database \n*/\n" + "SELECT COUNT(DISTINCT FILEDATA.NAME) FROM FILEDATA WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')" + ) SELECT_COUNT_UNIQUE_DIRNAMES = ( + "/* demystify: count the number of unique directories in the database \n*/\n" "SELECT COUNT(DISTINCT FILEDATA.DIR_NAME) FROM FILEDATA" ) - - SELECT_COUNT_NAMESPACES = "SELECT COUNT(NSDATA.NS_ID) FROM NSDATA" - + SELECT_COUNT_NAMESPACES = ( + "/* demystify: count the number of namespaces used by an identification report \n*/\n" + "SELECT COUNT(NSDATA.NS_ID) FROM NSDATA" + ) SELECT_FREQUENCY_ERRORS = ( + "/* demystify: create a frequency list of unique errors output by the identification tool \n*/\n" "SELECT FILEDATA.ERROR, COUNT(*) AS TOTAL\n" "FROM FILEDATA\n" "WHERE 
FILEDATA.TYPE!='Folder'\n" @@ -56,6 +88,7 @@ class AnalysisQueries: ns_pattern = "{{ ns_id }}" SELECT_COUNT_ID_METHODS_PATTERN = ( + "/* demystify: select all identification results across the database and order by namespace \n*/\n" "SELECT IDRESULTS.FILE_ID, IDDATA.ID_ID, IDDATA.METHOD, IDDATA.NS_ID\n" "FROM IDRESULTS\n" "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n" @@ -65,9 +98,9 @@ class AnalysisQueries: "ELSE 2\n" "END\n" ) - # Prority of results is based on order input to database... SELECT_COUNT_ID_METHODS_NONE = ( + "/* demystify: select all identification results from across the database \n*/\n" "SELECT IDRESULTS.FILE_ID, IDDATA.ID_ID, IDDATA.METHOD, IDDATA.NS_ID\n" "FROM IDRESULTS\n" "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID" @@ -81,6 +114,7 @@ def methods_return_ns_sort(self, ns_id): return self.SELECT_COUNT_ID_METHODS_NONE SELECT_COUNT_EXT_MISMATCHES = ( + "/* demystify: count of all extension mismatches \n*/\n" "SELECT COUNT(distinct(IDRESULTS.FILE_ID))\n" "FROM IDRESULTS\n" "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n" @@ -88,6 +122,7 @@ def methods_return_ns_sort(self, ns_id): ) SELECT_COUNT_FORMAT_COUNT = ( + "/* demystify: count of pronom signature or container identification results \n*/\n" "SELECT COUNT(DISTINCT IDDATA.ID)\n" "FROM IDRESULTS\n" "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" @@ -97,6 +132,7 @@ def methods_return_ns_sort(self, ns_id): ) SELECT_COUNT_OTHER_FORMAT_COUNT = ( + "/* demystify: count of non-pronom signature or container identification results \n*/\n" "SELECT COUNT(DISTINCT IDDATA.ID)\n" "FROM IDRESULTS\n" "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" @@ -105,13 +141,13 @@ def methods_return_ns_sort(self, ns_id): "AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')" ) - # PRONOM and OTHERS Text identifiers as one result - # PRONOM and OTHERS Text identifiers as one result + # PRONOM and OTHER identifiers as one result. 
@staticmethod def select_count_identifiers(method): # XML, Text, Filename method_pattern = "{{ method }}" SELECT_METHOD_IDENTIFIER = ( + "/* demystify: count all identification results for all identifiers as one \n*/\n" "SELECT COUNT(DISTINCT IDMETHOD)\n" "FROM (SELECT IDRESULTS.FILE_ID, IDDATA.ID as IDMETHOD\n" "FROM IDRESULTS\n" @@ -137,6 +173,7 @@ def select_frequency_identifier_types(method): identifier_text = "%match on filename%" SELECT_IDENTIFIER_COUNT = ( + "/* demystify: select information about namespace and identification based on the given identification method \n*/\n" "SELECT 'ns:' || NSDATA.NS_NAME || ' ' || IDDATA.ID, count(IDDATA.ID) as TOTAL\n" "FROM IDDATA\n" "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" @@ -148,6 +185,7 @@ def select_frequency_identifier_types(method): return SELECT_IDENTIFIER_COUNT.replace(identifier_pattern, identifier_text) SELECT_COUNT_EXTENSION_RANGE = ( + "/* demystify: count of extensions for all signature and container identified files in the database \n*/\n" "SELECT COUNT(DISTINCT FILEDATA.EXT)\n" "FROM FILEDATA\n" "WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')" @@ -163,6 +201,7 @@ def getmimes(idids): """ mimes = [id_[1] for id_ in idids] query1 = ( + "/* demystify: select information about all mimetypes recorded in the database where a mimetype is listed \n*/\n" "SELECT IDDATA.MIME_TYPE, COUNT(*) AS total\n" "FROM IDRESULTS\n" "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n" @@ -177,6 +216,7 @@ def getmimes(idids): return query SELECT_BINARY_MATCH_COUNT = ( + "/* demystify: select information about all signature or container identified files in the database \n*/\n" "SELECT NSDATA.NS_NAME, IDDATA.ID, COUNT(IDDATA.ID) as TOTAL\n" "FROM IDRESULTS\n" "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" @@ -186,6 +226,7 @@ def getmimes(idids): ) SELECT_YEAR_FREQUENCY_COUNT = ( + "/* demystify: create a frequency listing of all 'last-modified' years in the database \n*/\n" "SELECT FILEDATA.YEAR, COUNT(FILEDATA.YEAR) 
AS total\n" "FROM FILEDATA\n" "WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')\n" @@ -193,12 +234,14 @@ def getmimes(idids): ) SELECT_PUIDS_EXTENSION_ONLY = ( + "/* demystify: select information about files identified by extension only \n*/\n" "SELECT DISTINCT IDDATA.ID, IDDATA.FORMAT_NAME\n" "FROM IDDATA\n" "WHERE (IDDATA.METHOD='Extension')" ) SELECT_ALL_UNIQUE_EXTENSIONS = ( + "/* demystify: select all unique file format extensions in the database \n*/\n" "SELECT DISTINCT FILEDATA.EXT\n" "FROM FILEDATA\n" "WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')\n" @@ -206,6 +249,7 @@ def getmimes(idids): ) SELECT_COUNT_EXTENSION_FREQUENCY = ( + "/* demystify: create a frequency listing of file format extensions in the database \n*/\n" "SELECT FILEDATA.EXT, COUNT(*) AS total\n" "FROM FILEDATA\n" "WHERE (FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container')\n" @@ -214,6 +258,7 @@ def getmimes(idids): ) SELECT_COUNT_DUPLICATE_CHECKSUMS = ( + "/* demystify: create a frequency listing of duplicate files in the database \n*/\n" "SELECT FILEDATA.HASH, COUNT(*) AS TOTAL\n" "FROM FILEDATA\n" "WHERE FILEDATA.TYPE='File' OR FILEDATA.TYPE='Container'\n" @@ -225,6 +270,7 @@ def getmimes(idids): # Siegfried only queries... 
SELECT_BYTE_MATCH_BASIS = ( + "/* demystify: select metadata about objects with a byte match identification \n*/\n" "SELECT DISTINCT IDDATA.BASIS, IDDATA.ID, FILEDATA.NAME, FILEDATA.SIZE\n" "FROM IDRESULTS\n" "JOIN FILEDATA on IDRESULTS.FILE_ID = FILEDATA.FILE_ID\n" @@ -242,6 +288,7 @@ def count_multiple_ids(nscount, paths=False): """ if paths is False: body = ( + "/* demystify: count of files with multiple identifications \n*/\n" "SELECT count(FREQUENCY)\n" "FROM (SELECT FILEDATA.FILE_PATH AS PATH, COUNT(FILEDATA.FILE_ID) AS FREQUENCY\n" "FROM IDRESULTS\n" @@ -255,6 +302,7 @@ def count_multiple_ids(nscount, paths=False): query = "{}{}".format(body, nscount) return query body = ( + "/* demystify: select paths for files with multiple identifications \n*/\n" "SELECT PATH\n" "FROM (SELECT FILEDATA.FILE_PATH AS PATH, COUNT(FILEDATA.FILE_ID) AS FREQUENCY\n" "FROM IDRESULTS\n" @@ -270,13 +318,21 @@ def count_multiple_ids(nscount, paths=False): @staticmethod def list_duplicate_paths(checksum): - return "SELECT FILE_PATH FROM FILEDATA WHERE FILEDATA.HASH='{}' ORDER BY FILEDATA.FILE_PATH;".format( - checksum + dupes = ( + "/* demystify: select paths from the database for files with a given checksum \n*/\n" + "SELECT FILE_PATH FROM FILEDATA WHERE FILEDATA.HASH='{}' ORDER BY FILEDATA.FILE_PATH;".format( + checksum + ) ) + return dupes @staticmethod def count_id_instances(id_): - return "SELECT COUNT(*) AS total FROM IDDATA WHERE (IDDATA.ID='{}'".format(id_) + id_instances = ( + "/* demystify: count of all items with a given identification \n*/\n" + "SELECT COUNT(*) AS total FROM IDDATA WHERE (IDDATA.ID='{}'".format(id_) + ) + return id_instances def query_from_idrows(self, idlist, priority=None): list_ = "WHERE IDRESULTS.ID_ID IN " @@ -285,6 +341,7 @@ def query_from_idrows(self, idlist, priority=None): where = "{}{}, ".format(where, i[1]) list_ = "{}{})\n".format(list_, where.strip(", ")) SELECT_NAMESPACE_AND_IDS = ( + "/* demystify: select metadata about all results ordered 
by namespace \n*/\n" "SELECT 'ns:' || NSDATA.NS_NAME || ' ', IDDATA.ID, IDDATA.FORMAT_NAME, IDDATA.BASIS, IDDATA.FORMAT_VERSION, IDDATA.NS_ID, COUNT(IDDATA.ID) AS TOTAL\n" "FROM IDRESULTS\n" "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" @@ -299,7 +356,7 @@ def query_from_idrows(self, idlist, priority=None): "END\n" ) GROUP_TOTAL = """GROUP BY IDDATA.ID ORDER BY TOTAL DESC""" - query = "{}\n{}".format(SELECT_NAMESPACE_AND_IDS, list_) + query = "{}{}".format(SELECT_NAMESPACE_AND_IDS, list_) if priority is not None: query = "{}{}".format( query, PRIORITY_ID.replace(self.ns_pattern, str(priority)) @@ -322,9 +379,13 @@ def query_from_ids(idlist, idmethod=False): where = where + str(i) + ", " list_ = list_ + where.strip(", ") + ")" - SELECT_PATHS = "SELECT FILEDATA.FILE_PATH\nFROM FILEDATA\n" + SELECT_PATHS = ( + "/* demystify: select file paths for a given identification method \n*/\n" + "SELECT FILEDATA.FILE_PATH\nFROM FILEDATA\n" + ) SELECT_NAMESPACE_AND_IDS = ( + "/* demystify: select metadata about identifications for a given identification method \n*/\n" "SELECT 'ns:' || NSDATA.NS_NAME || ' ', IDDATA.ID\n" "FROM IDRESULTS\n" "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" @@ -341,7 +402,9 @@ def query_from_ids(idlist, idmethod=False): return SELECT_PATHS + "\n" + list_ # NAMESPACE QUERIES - SELECT_NS_DATA = "SELECT * FROM NSDATA" + SELECT_NS_DATA = ( + "/* demystify: select all namespace data \n*/\n" "SELECT * FROM NSDATA" + ) @staticmethod def get_ns_gap_count_lists(nsid): @@ -349,6 +412,7 @@ def get_ns_gap_count_lists(nsid): Container methods for a given namespace ID. 
""" return ( + "/* demystify: identify gaps across all namespaces used in an identification run \n*/\n" "SELECT IDRESULTS.FILE_ID\n" "FROM IDRESULTS\n" "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n" @@ -359,6 +423,7 @@ def get_ns_gap_count_lists(nsid): @staticmethod def get_ns_multiple_ids(nsid): SELECT_NAMESPACE_BINARY_IDS1 = ( + "/* demystify: count of files from the database with multiple identifications for a given namespace id \n*/\n" "SELECT count(*)\n" "FROM (SELECT COUNT(FILEDATA.FILE_ID) AS FREQUENCY\n" "FROM IDRESULTS\n" @@ -391,6 +456,7 @@ def get_ns_methods(id_, binary=True, method=False): """ WHERE_NS = "WHERE NS_ID={}".format(id_) COUNT_IDS_NS = ( + "/* demystify: count of files from the database identified using signature or container methods for a given namespace \n*/\n" "SELECT COUNT(DISTINCT IDRESULTS.FILE_ID)\n" "FROM IDRESULTS\n" "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n" @@ -398,7 +464,10 @@ def get_ns_methods(id_, binary=True, method=False): COUNT_IDS_METHODS = ( "AND (IDDATA.METHOD='Signature' or IDDATA.METHOD='Container')" ) - ID_METHODS_COUNT = "SELECT COUNT(*) FROM IDDATA\n" + ID_METHODS_COUNT = ( + "/* demystify: count all rows in the iddata table \n*/\n" + "SELECT COUNT(*) FROM IDDATA\n" + ) ID_METHODS_METHOD = "AND IDDATA.METHOD=" query = "" if binary is True: diff --git a/libs/DemystifyAnalysisClass.py b/libs/DemystifyAnalysisClass.py index 22ec4b2..15157d5 100644 --- a/libs/DemystifyAnalysisClass.py +++ b/libs/DemystifyAnalysisClass.py @@ -11,6 +11,7 @@ import logging import sqlite3 +import time from collections import Counter from libs import AnalysisResultsClass @@ -63,7 +64,7 @@ class DemystifyAnalysis(DemystifyBase): TOOLTYPE_DROID = "droid" - def __init__(self, database_path=None, config=False, denylist=None): + def __init__(self, database_path=None, config=False, denylist=None, audit=False): """Constructor for DemystifyAnalysis object.""" logging.debug( "Analysis __init__(): database_path: %s config: %s denylist: %s", 
@@ -82,6 +83,7 @@ def __init__(self, database_path=None, config=False, denylist=None): # generate data for that. self.analysis_results = AnalysisResultsClass.AnalysisResults() self.query = AnalysisQueries() + self.audit = audit # Initialize database connection variables. self._open_database(database_path) @@ -211,7 +213,12 @@ def _querydb(self, query, fetchone=False, numberquery=False, tolist=False): so that a number of different sqlite query calls styles can be used and the caller can do less work to pull those values apart. """ + query_start_time = time.time() self.cursor.execute(query.replace(" ", "")) + if self.audit: + print("---") + print(query.strip()) + print("Query took: {} seconds".format((time.time() - query_start_time))) if fetchone is True and numberquery is False: return self.cursor.fetchone() if fetchone is True and numberquery is True: @@ -487,7 +494,6 @@ def getMethodIDResults(self, methodids, fmt_version=False): methodresults = self._querydb( self.query.query_from_idrows(methodids, self.priority_ns_id) ) - for id_ in methodresults: ns_id = id_[5] name = id_[2] diff --git a/libs/DenylistQueriesClass.py b/libs/DenylistQueriesClass.py index ae71f22..f18478a 100644 --- a/libs/DenylistQueriesClass.py +++ b/libs/DenylistQueriesClass.py @@ -13,6 +13,7 @@ def getfilenames(filenamelist): newlist = '%" or FILEDATA.NAME LIKE "%'.join(filenamelist) newlist = 'FILEDATA.NAME LIKE "%{}%")'.format(newlist) fnamequery = ( + "/* deny-list: select file paths and names from the database for file names in a given list \n*/\n" "SELECT DISTINCT FILEDATA.FILE_PATH, FILEDATA.NAME\n" "FROM FILEDATA\n" "WHERE FILEDATA.TYPE != 'Folder'\n" @@ -25,6 +26,7 @@ def getdirnames(dirlist): newlist = '%" or FILEDATA.NAME LIKE "%'.join(dirlist) newlist = 'FILEDATA.NAME LIKE "%{}%")'.format(newlist) dirquery = ( + "/* deny-list: select file paths and names from the database for folder names in a given list \n*/\n" "SELECT DISTINCT FILEDATA.FILE_PATH, FILEDATA.NAME\n" "FROM FILEDATA\n" 
"WHERE FILEDATA.TYPE = 'Folder'\n" @@ -38,6 +40,7 @@ def getexts(extlist): newlist = '("{}")'.format(newlist) newlist = newlist.replace(".", "") extquery = ( + "/* deny-list: select file paths and file names from the database for a set of files with extensions in a given list \n*/\n" "SELECT DISTINCT FILEDATA.FILE_PATH, FILEDATA.NAME, FILEDATA.EXT\n" "FROM FILEDATA\n" "WHERE FILEDATA.TYPE != 'Folder'\n" @@ -50,6 +53,7 @@ def getids(idlist): newlist = '","'.join(idlist) newlist = '("{}")'.format(newlist) idquery = ( + "/* deny-list: select file and id metadata from the database for a set of files in a given list of format ids \n*/\n" "SELECT DISTINCT FILEDATA.FILE_PATH, FILEDATA.NAME, IDDATA.ID || ': ' || IDDATA.FORMAT_NAME || ' ' || IDDATA.FORMAT_VERSION\n" "FROM IDRESULTS\n" "JOIN FILEDATA on IDRESULTS.FILE_ID = FILEDATA.FILE_ID\n" diff --git a/libs/RoguesQueriesClass.py b/libs/RoguesQueriesClass.py index 16e3583..6f910fa 100644 --- a/libs/RoguesQueriesClass.py +++ b/libs/RoguesQueriesClass.py @@ -10,13 +10,16 @@ class RogueQueries(object): """ SELECT_ALL_FILEPATHS = ( - """SELECT DISTINCT FILE_PATH FROM FILEDATA WHERE FILEDATA.TYPE != 'Folder'""" + "/* rogues: select all non-folder file paths from the filedata table \n*/\n" + "SELECT DISTINCT FILE_PATH FROM FILEDATA WHERE FILEDATA.TYPE != 'Folder'" ) SELECT_ALL_FOLDERS = ( - """SELECT DISTINCT FILE_PATH FROM FILEDATA WHERE FILEDATA.TYPE = 'Folder'""" + "/* rogues: select all 'folder' file paths from the filedata table \n*/\n" + "SELECT DISTINCT FILE_PATH FROM FILEDATA WHERE FILEDATA.TYPE = 'Folder'" ) SELECT_EXTENSION_MISMATCHES = ( + "/* rogues: select files from the database that have extension mismatches \n*/\n" "SELECT DISTINCT FILEDATA.FILE_PATH\n" "FROM IDRESULTS\n" "JOIN FILEDATA on IDRESULTS.FILE_ID = FILEDATA.FILE_ID\n" @@ -27,6 +30,7 @@ class RogueQueries(object): @staticmethod def get_pronom_identified_files(pro_ns): PRONOM_ONLY = ( + "/* rogues: select files identified using signatures or containers 
using PRONOM \n*/\n" "SELECT DISTINCT FILEDATA.FILE_PATH\n" "FROM IDRESULTS\n" "JOIN FILEDATA on IDRESULTS.FILE_ID = FILEDATA.FILE_ID\n" @@ -43,6 +47,7 @@ def get_all_non_ids(ids): csv = ",".join(ids) csv = "({})".format(csv) ALL_IDS = ( + "/* rogues: select file paths from the database not in a given list \n*/\n" "SELECT DISTINCT FILEDATA.FILE_PATH\n" "FROM FILEDATA\n" "WHERE FILE_ID NOT IN" @@ -52,7 +57,10 @@ def get_all_non_ids(ids): @staticmethod def count_multiple_ids(nscount, paths=False): - count = "SELECT count(FREQUENCY)\n" + count = ( + "/* rogues: select file paths and counts from the database with multiple id results \n*/\n" + "SELECT count(FREQUENCY)\n" + ) pathquery = "SELECT PATH\n" body = ( "FROM (SELECT DISTINCT FILEDATA.FILE_PATH AS PATH, COUNT(FILEDATA.FILE_ID) AS FREQUENCY\n" @@ -74,7 +82,10 @@ def count_multiple_ids(nscount, paths=False): def get_rogue_name_paths(itemlist): csv = '","'.join(itemlist) csv = '("{}")'.format(csv) - PATHS = "SELECT DISTINCT FILEDATA.FILE_PATH FROM FILEDATA WHERE NAME IN " + PATHS = ( + "/* rogues: select file paths from the database where the name is in a given list \n*/\n" + "SELECT DISTINCT FILEDATA.FILE_PATH FROM FILEDATA WHERE NAME IN " + ) query = "{}{}".format(PATHS, csv) return query @@ -84,6 +95,9 @@ def get_rogue_dir_paths(itemlist): return "" csv = '","'.join(itemlist) csv = '("{}")'.format(csv) - PATHS = "SELECT DISTINCT FILEDATA.FILE_PATH FROM FILEDATA WHERE DIR_NAME IN " + PATHS = ( + "/* rogues: select file paths from the database where the directory name is in a given list \n*/\n" + "SELECT DISTINCT FILEDATA.FILE_PATH FROM FILEDATA WHERE DIR_NAME IN " + ) query = "{}{}".format(PATHS, csv) return query