From f21e89898f01544cf99b87faab17aa6335528cc7 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sun, 30 Jul 2017 14:01:32 +0100 Subject: [PATCH 01/31] creates premis CSV proof of concept scripts --- README.md | 35 +++++++--- premiscsv.py | 117 ++++++++++++++++++++++++++++++++ premisobjects.py | 170 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 314 insertions(+), 8 deletions(-) create mode 100755 premiscsv.py create mode 100755 premisobjects.py diff --git a/README.md b/README.md index 9c74f4a..c9688c2 100644 --- a/README.md +++ b/README.md @@ -7,22 +7,25 @@ table of contents 1. [summary](https://github.com/kieranjol/IFIscripts#summary) 2. [Arrangement](https://github.com/kieranjol/IFIscripts#arrangement) * [sipcreator.py](https://github.com/kieranjol/IFIscripts#sipcreator) -3. [Transcodes](https://github.com/kieranjol/IFIscripts#transcodes) +3. [PREMIS](https://github.com/kieranjol/IFIscripts#PREMIS) + * [premisobjects.py](https://github.com/kieranjol/IFIscripts#premisobjectspy) + * [premiscsv.py](https://github.com/kieranjol/IFIscripts#premiscsvpy) +4. [Transcodes](https://github.com/kieranjol/IFIscripts#transcodes) * [makeffv1.py](https://github.com/kieranjol/IFIscripts#makeffv1py) * [bitc.py](https://github.com/kieranjol/IFIscripts#bitcpy) * [prores.py](https://github.com/kieranjol/IFIscripts#prorespy) * [concat.py](https://github.com/kieranjol/IFIscripts#concatpy) -4. [Digital Cinema Package Scripts](https://github.com/kieranjol/IFIscripts#digital-cinema-package-scripts) +5. [Digital Cinema Package Scripts](https://github.com/kieranjol/IFIscripts#digital-cinema-package-scripts) * [dcpaccess.py](https://github.com/kieranjol/IFIscripts#dcpaccesspy) * [dcpfixity.py](https://github.com/kieranjol/IFIscripts#dcpfixitypy) * [dcpsubs2srt.py](https://github.com/kieranjol/IFIscripts#dcpsubs2srtpy) -5. [Fixity Scripts](https://github.com/kieranjol/IFIscripts#fixity-scripts) +6. [Fixity Scripts](https://github.com/kieranjol/IFIscripts#fixity-scripts) * [copyit.py](https://github.com/kieranjol/IFIscripts#copyitpy) * [manifest.py](https://github.com/kieranjol/IFIscripts#manifestpy) * [sha512deep.py](https://github.com/kieranjol/IFIscripts#sha512deeppy) * [validate.py](https://github.com/kieranjol/IFIscripts#validatepy) * [batchfixity.py](https://github.com/kieranjol/IFIscripts#batchfixitypy) -6. [Image Sequences](https://github.com/kieranjol/IFIscripts#image-sequences) +7. [Image Sequences](https://github.com/kieranjol/IFIscripts#image-sequences) * [makedpx.py](https://github.com/kieranjol/IFIscripts#makedpxpy) * [seq2ffv1.py](https://github.com/kieranjol/IFIscripts#seq2ffv1py) * [seq2prores.py](https://github.com/kieranjol/IFIscripts#seq2prorespy) @@ -33,19 +36,19 @@ table of contents * [seq2dv.py](https://github.com/kieranjol/IFIscripts#seq2dvpy) * [batchmetadata.py](https://github.com/kieranjol/IFIscripts#batchmetadata) * [batchrename.py](https://github.com/kieranjol/IFIscripts#batchrename) -7. [Quality Control](https://github.com/kieranjol/IFIscripts#quality-control) +8. [Quality Control](https://github.com/kieranjol/IFIscripts#quality-control) * [qctools.py](https://github.com/kieranjol/IFIscripts#qctoolspy) -8. [Specific Workflows](https://github.com/kieranjol/IFIscripts#specific-workflows) +9. [Specific Workflows](https://github.com/kieranjol/IFIscripts#specific-workflows) * [mezzaninecheck.py](https://github.com/kieranjol/IFIscripts#mezzaninecheckpy) * [loopline.py](https://github.com/kieranjol/IFIscripts#looplinepy) * [masscopy.py](https://github.com/kieranjol/IFIscripts#masscopypy) * [dvsip.py](https://github.com/kieranjol/IFIscripts#dvsippy) -9. [Misc](https://github.com/kieranjol/IFIscripts#misc) +10. [Misc](https://github.com/kieranjol/IFIscripts#misc) * [update.py](https://github.com/kieranjol/IFIscripts#updatepy) * [giffer.py](https://github.com/kieranjol/IFIscripts#gifferpy) * [makeuuid.py](https://github.com/kieranjol/IFIscripts#makeuuidpy) * [durationcheck.py](https://github.com/kieranjol/IFIscripts#durationcheck.py) -10. [Experimental-Premis](https://github.com/kieranjol/IFIscripts#experimental-premis) +11. [Experimental-Premis](https://github.com/kieranjol/IFIscripts#experimental-premis) * [premis.py](https://github.com/kieranjol/IFIscripts#premispy) * [revtmd.py](https://github.com/kieranjol/IFIscripts#revtmdpy) * [as11fixity.py](https://github.com/kieranjol/IFIscripts#as11fixitypy) @@ -68,6 +71,22 @@ Note: Documentation template has been copied from [mediamicroservices](https://g * Usage for more than one directory - `sipcreator.py -i /path/to/directory_name1 /path/to/directory_name2 -o /path/to/output_folder` * Run `sipcreator.py -h` for all options. +## PREMIS ## + +### premisobjects.py ### +* Creates a somewhat PREMIS compliant CSV file describing objects in a package. A seperate script will need to be written in order to transform these CSV files into XML. +* As the flat CSV structure prevents maintaining some of the complex relationships between units, some semantic units have been merged, for example:`relationship_structural_includes` is really a combination of the `relationshipType` and `relationshipSubType` units, which each have the values: `Structural` and `Includes` respectively. +* Assumptions for now: representation UUID already exists as part of the SIP/AIP folder structure. Find a way to supply this, probably via argparse. +* For more information, run `pydoc premisobjects ` +* Usage: `premiscsv.py path/to/SIP path/to/manifest.md5` + +### premiscsv.py ### +* Extracts preservation events from an IFI plain text log file and converts to a CSV using the PREMIS data dictionary. +* For more information, run `pydoc premiscsv` +* Usage: - `premiscsv.py path/to/logfile.log` + + + ## Transcodes ## ### makeffv1.py ### diff --git a/premiscsv.py b/premiscsv.py new file mode 100755 index 0000000..8ee66b8 --- /dev/null +++ b/premiscsv.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +''' +Extracts preservation events from an IFI plain text log file and converts +to a CSV using the PREMIS data dictionary +''' +import os +import sys +# from lxml import etree +import ififuncs +def find_events(logfile): + ''' + A very hacky attempt to extract the relevant preservation events from our + log files. + ''' + sip_test = os.path.basename(logfile).replace('_sip_log.log', '') + if ififuncs.validate_uuid4(sip_test) != False: + linking_object_identifier_value = sip_test + with open(logfile, 'r') as logfile_object: + log_lines = logfile_object.readlines() + for event_test in log_lines: + if 'eventDetail=copyit.py' in event_test: + logsplit = event_test.split(',') + for line_fragment in logsplit: + manifest_event = line_fragment.replace( + 'eventDetail', '' + ).replace('\n', '').split('=')[1] + for log_entry in log_lines: + valid_entries = [ + 'eventType', + 'eventDetail=sipcreator.py', + 'eventDetail=Mediatrace', + 'eventDetail=Technical', + 'eventDetail=copyit.py' + ] + for entry in valid_entries: + if entry in log_entry: + break_loop = '' + event_outcome = '' + event_detail = '' + event_outcome_detail_note = '' + event_type = '' + event_row = [] + datetime = log_entry[:19] + logsplit = log_entry.split(',') + for line_fragment in logsplit: + if 'eventType' in line_fragment: + if 'EVENT =' in line_fragment: + line_fragment = line_fragment.split('EVENT =')[1] + event_type = line_fragment.replace( + ' eventType=', '' + ).replace('assignement', 'assignment') + if ' value' in line_fragment: + # this assumes that the value is the outcome of an identifier assigment. + event_outcome = line_fragment[7:].replace('\n', '') + # we are less concerned with events starting. + if 'status=started' in line_fragment: + break_loop = 'continue' + if 'Generating destination manifest:' in line_fragment: + break_loop = '' + event_detail = manifest_event + # ugh, this might run multiple times. + if 'eventDetail=sipcreator.py' in log_entry: + event_type = 'Information Package Creation' + event_detail = line_fragment.replace( + 'eventDetail', '' + ).replace('\n', '').split('=')[1] + event_outcome_detail_note = 'Submission Information Package' + if ('eventDetail=Mediatrace' in log_entry) or ('eventDetail=Technical' in log_entry): + event_type = 'metadata extraction' + event_outcome = log_entry.split( + 'eventOutcome=', 1 + )[1].replace(', agentName=mediainfo', '').replace('\n', '') + if 'eventDetail=Mediatrace' in log_entry: + event_outcome = event_outcome.replace('mediainfo.xml', 'mediatrace.xml') + if (break_loop == 'continue') or (event_type == ''): + continue + print event_type + event_row = [ + 'UUID', ififuncs.create_uuid(), + event_type, datetime, event_detail, + '', + event_outcome, '', + event_outcome_detail_note, '', + '', '', + '', 'UUID', + linking_object_identifier_value, '' + ] + ififuncs.append_csv('bla.csv', event_row) + +def make_events_csv(): + ''' + Generates a CSV with PREMIS-esque headings. Currently it's just called + 'bla.csv' but it will probably be called: + UUID_premisevents.csv + and sit in the metadata directory. + ''' + premis_events = [ + 'eventIdentifierType', 'eventIdentifierValue', + 'eventType', 'eventDateTime', 'eventDetail', + 'eventDetailExtension', + 'eventOutcome', 'eventOutcomeDetail', + 'eventOutcomeDetailNote', 'eventOutcomeDetailExtension', + 'linkingAgentIdentifierType', 'linkingAgentIdentifierValue', + 'linkingAgentIdentifierRole', 'linkingObjectIdentifierType', + 'linkingObjectIdentifierValue', 'linkingObjectRole' + ] + ififuncs.create_csv('bla.csv', premis_events) + +def main(): + ''' + Launches all the other functions when run from the command line. + ''' + logfile = sys.argv[1] + find_events(logfile) + +if __name__ == '__main__': + main() diff --git a/premisobjects.py b/premisobjects.py new file mode 100755 index 0000000..a534280 --- /dev/null +++ b/premisobjects.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +''' +Creates a somewhat PREMIS compliant CSV file describing objects in a package. +A seperate script will need to be written in order to transform these +CSV files into XML. +As the flat CSV structure prevents maintaining some of the complex +relationships between units, some semantic units have been merged, for example: +relation_structural_includes is really a combination of the +relationshipType and relationshipSubType units, which each have the values: +Structural and Includes respectively. + +todo: +Document identifier assignment for files and IE. Probably in events sheet? +Allow for derivation to be entered +Link with events sheet +Link mediainfo xml in /metadata to the objectCharacteristicsExtension field. + + +Assumptions for now: representation UUID already exists as part of the +SIP/AIP folder structure. Find a way to supply this, probably via argparse. +''' + +import os +import sys +import ififuncs + + +def get_checksum(manifest, filename): + ''' + Extracts checksum from manifest, rather than generating a fresh one. + ''' + if os.path.isfile(manifest): + with open(manifest, 'r') as manifest_object: + manifest_lines = manifest_object.readlines() + for md5 in manifest_lines: + if filename in md5: + return md5[:32] + + +def make_skeleton_csv(): + ''' + Generates a CSV with PREMIS-esque headings. Currently it's just called + 'cle.csv' but it will probably be called: + UUID_premisobjects.csv + and sit in the metadata directory. + ''' + premis_object_units = [ + 'objectIdentifier', + 'objectCategory', + 'messageDigestAlgorithm', 'messageDigest', 'messageDigestOriginator', + 'size', 'formatName', 'formatVersion', + 'objectCharacteristicsExtension', 'originalName', + 'contentLocationType', 'contentLocationValue', + 'relatedObjectIdentifierType', 'relatedObjectIdentifierValue', + 'relatedObjectSequence', + 'relatedEventIdentifierType', 'relatedEventIdentifierValue', + 'relatedEventSequence', + 'linkingEventIdentifierType', 'linkingEventIdentifierValue', + 'relationship_structural_includes', + 'relationship_structural_isincludedin', + 'relationship_structural_represents', + 'relationship_structural_hasroot', + 'relationship_derivation_hassource' + ] + ififuncs.create_csv('cle.csv', premis_object_units) + + +def file_description(source, manifest, representation_uuid): + ''' + Generate PREMIS descriptions for items and write to CSV. + ''' + item_ids = [] + for root, _, filenames in os.walk(source): + if os.path.basename(root) == 'objects': + filenames = [f for f in filenames if f[0] != '.'] + for item in filenames: + item_uuid = ififuncs.create_uuid() + full_path = os.path.join(root, item) + item_dictionary = {} + item_dictionary['objectIdentifier'] = ['UUID', item_uuid] + item_dictionary['objectCategory'] = 'file' + item_dictionary['size'] = str(os.path.getsize(full_path)) + item_dictionary['originalName'] = item + item_dictionary['relationship_structural_isincludedin'] = representation_uuid + item_ids.append(item_uuid) + file_data = [ + item_dictionary['objectIdentifier'], + item_dictionary['objectCategory'], + 'md5', get_checksum(manifest, item), 'internal', + item_dictionary['size'], '', '', + '', '', + '', '', + '', '', + '', + '', '', + '', + '', '', + '', + item_dictionary['relationship_structural_isincludedin'], + '', + '', + '' + ] + ififuncs.append_csv('cle.csv', file_data) + return item_ids +def representation_description(representation_uuid, item_ids): + ''' + Generate PREMIS descriptions for a representation and write to CSV. + ''' + + representation_dictionary = {} + representation_dictionary['objectIdentifier'] = ['UUID', representation_uuid] + representation_dictionary['objectCategory'] = 'representation' + representation_dictionary['relationship_structural_includes'] = '' + for item_id in item_ids: + representation_dictionary['relationship_structural_includes'] += item_id + '|' + representation_data = [ + representation_dictionary['objectIdentifier'], + representation_dictionary['objectCategory'], + '', '', '', + '', '', '', + '', '', + '', '', + '', '', + '', + '', '', + '', + '', '', + representation_dictionary['relationship_structural_includes'], + '', + '', + '', + '' + ] + ififuncs.append_csv('cle.csv', representation_data) + + +def intellectual_entity_description(): + ''' + Generate PREMIS descriptions for Intellectual Entities and write to CSV. + ''' + intellectual_entity_dictionary = {} + intellectual_entity_dictionary['objectIdentifier'] = ['UUID', ififuncs.create_uuid()] + intellectual_entity_dictionary['objectCategory'] = 'intellectual entity' + print intellectual_entity_dictionary +def find_representation_uuid(source): + ''' + This extracts the representation UUID from a directory name. + This should be moved to ififuncs as it can be used by other scripts. + ''' + for root, _, _ in os.walk(source): + if 'objects' in root: + return os.path.basename(os.path.dirname(root)) + + +def main(): + ''' + Launches all the other functions when run from the command line. + ''' + make_skeleton_csv() + source = sys.argv[1] + manifest = sys.argv[2] + representation_uuid = find_representation_uuid(source) + item_ids = file_description(source, manifest, representation_uuid) + #intellectual_entity_description() + representation_description(representation_uuid, item_ids) + +if __name__ == '__main__': + main() + From 53aac7d6e2a6721076ce60b05a7d760a7e6a9ecd Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sun, 30 Jul 2017 22:39:48 +0100 Subject: [PATCH 02/31] adds premis csv to xml draft script --- premiscsv2xml.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100755 premiscsv2xml.py diff --git a/premiscsv2xml.py b/premiscsv2xml.py new file mode 100755 index 0000000..c595f8d --- /dev/null +++ b/premiscsv2xml.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +import os +import sys +import csv +from lxml import etree + +def extract_metadata(csv_file): + object_dictionaries = [] + input_file = csv.DictReader(open(csv_file)) + for rows in input_file: + object_dictionaries.append(rows) + return object_dictionaries + +def add_value(value, element): + element.text = value + + +def write_premis(doc, premisxml): + with open(premisxml, 'w') as outFile: + doc.write(outFile, pretty_print=True) + + +def create_unit(index, parent, unitname): + premis_namespace = "http://www.loc.gov/premis/v3" + unitname = etree.Element("{%s}%s" % (premis_namespace, unitname)) + parent.insert(index, unitname) + return unitname + +def setup_xml(object_dictionaries): + namespace = '' + premis_namespace = "http://www.loc.gov/premis/v3" + premis = etree.fromstring(namespace) + doc = etree.ElementTree(premis) + for objects in object_dictionaries: + id_list = objects['objectIdentifier'].replace( + '[', '' + ).replace(']', '').replace('\'', '').split(', ') + object_parent = create_unit(0, premis, 'object') + object_identifier_uuid = create_unit(2, object_parent, 'objectIdentifier') + object_identifier_uuid_type = create_unit(1, object_identifier_uuid, 'objectIdentifierType') + object_identifier_uuid_value = create_unit(2, object_identifier_uuid, 'objectIdentifierValue') + object_category = create_unit(4, object_parent, 'objectCategory') + add_value(id_list[0], object_identifier_uuid_type) + add_value(id_list[1], object_identifier_uuid_value) + add_value(objects['objectCategory'], object_category) + if objects['objectCategory'] == 'file': + object_characteristics = create_unit(10, object_parent, 'objectCharacteristics') + fixity = create_unit(0, object_characteristics, 'fixity') + size = create_unit(1, object_characteristics, 'size') + size.text = objects['size'] + message_digest_algorithm = create_unit(0, fixity, 'messageDigestAlgorithm') + message_digest = create_unit(1, fixity, 'messageDigest') + message_digest_originator = create_unit(2, fixity, 'messageDigestOriginator') + message_digest_originator.text = objects['messageDigestOriginator'] + message_digest.text = objects['messageDigest'] + message_digest_algorithm.text = objects['messageDigestAlgorithm'] + print(etree.tostring(doc, pretty_print=True)) + return premis_namespace, doc, premis +def main(): + csv_file = sys.argv[1] + object_dictionaries = extract_metadata(csv_file) + setup_xml(object_dictionaries) + for x in object_dictionaries: + for i in x: + if x[i] != '': + print i, x[i] + print '\n' + +if __name__ == '__main__': + main() From 88b94bea1a3f3adadc8967d94f668fb3d10b4be5 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sun, 30 Jul 2017 22:56:58 +0100 Subject: [PATCH 03/31] premiscsv2xml - PEP-08 cleanup --- premiscsv2xml.py | 73 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index c595f8d..8483bb4 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -1,11 +1,15 @@ #!/usr/bin/env python - -import os +''' +Takes a PREMIS CSV file, as generated by premiscsv.py, and transform into XML. +''' import sys import csv from lxml import etree def extract_metadata(csv_file): + ''' + Read the PREMIS csv and store the metadata in a list of dictionaries. + ''' object_dictionaries = [] input_file = csv.DictReader(open(csv_file)) for rows in input_file: @@ -13,21 +17,34 @@ def extract_metadata(csv_file): return object_dictionaries def add_value(value, element): + ''' + Useless helper function - remove! + ''' element.text = value def write_premis(doc, premisxml): - with open(premisxml, 'w') as outFile: - doc.write(outFile, pretty_print=True) + ''' + Writes the PREMIS object to a file. + ''' + with open(premisxml, 'w') as out_file: + doc.write(out_file, pretty_print=True) def create_unit(index, parent, unitname): + ''' + Helper function that adds an XML element. + ''' premis_namespace = "http://www.loc.gov/premis/v3" unitname = etree.Element("{%s}%s" % (premis_namespace, unitname)) parent.insert(index, unitname) return unitname def setup_xml(object_dictionaries): + ''' + This should just create the PREMIS lxml object. + Actual metadata generation should be moved to other functions. + ''' namespace = '' premis_namespace = "http://www.loc.gov/premis/v3" premis = etree.fromstring(namespace) @@ -36,28 +53,54 @@ def setup_xml(object_dictionaries): id_list = objects['objectIdentifier'].replace( '[', '' ).replace(']', '').replace('\'', '').split(', ') - object_parent = create_unit(0, premis, 'object') - object_identifier_uuid = create_unit(2, object_parent, 'objectIdentifier') - object_identifier_uuid_type = create_unit(1, object_identifier_uuid, 'objectIdentifierType') - object_identifier_uuid_value = create_unit(2, object_identifier_uuid, 'objectIdentifierValue') - object_category = create_unit(4, object_parent, 'objectCategory') + object_parent = create_unit( + 0, premis, 'object' + ) + object_identifier_uuid = create_unit( + 2, object_parent, 'objectIdentifier' + ) + object_identifier_uuid_type = create_unit( + 1, object_identifier_uuid, 'objectIdentifierType' + ) + object_identifier_uuid_value = create_unit( + 2, object_identifier_uuid, 'objectIdentifierValue' + ) + object_category = create_unit( + 4, object_parent, 'objectCategory' + ) add_value(id_list[0], object_identifier_uuid_type) add_value(id_list[1], object_identifier_uuid_value) add_value(objects['objectCategory'], object_category) if objects['objectCategory'] == 'file': - object_characteristics = create_unit(10, object_parent, 'objectCharacteristics') - fixity = create_unit(0, object_characteristics, 'fixity') - size = create_unit(1, object_characteristics, 'size') + object_characteristics = create_unit( + 10, object_parent, 'objectCharacteristics' + ) + fixity = create_unit( + 0, object_characteristics, 'fixity' + ) + size = create_unit( + 1, object_characteristics, 'size' + ) size.text = objects['size'] - message_digest_algorithm = create_unit(0, fixity, 'messageDigestAlgorithm') - message_digest = create_unit(1, fixity, 'messageDigest') - message_digest_originator = create_unit(2, fixity, 'messageDigestOriginator') + message_digest_algorithm = create_unit( + 0, fixity, 'messageDigestAlgorithm' + ) + message_digest = create_unit( + 1, fixity, 'messageDigest' + ) + message_digest_originator = create_unit( + 2, fixity, 'messageDigestOriginator' + ) message_digest_originator.text = objects['messageDigestOriginator'] message_digest.text = objects['messageDigest'] message_digest_algorithm.text = objects['messageDigestAlgorithm'] print(etree.tostring(doc, pretty_print=True)) return premis_namespace, doc, premis def main(): + ''' + Launches all the other functions when run from the command line. + For debugging purposes, the contents of the CSV is printed to screen. + ''' csv_file = sys.argv[1] object_dictionaries = extract_metadata(csv_file) setup_xml(object_dictionaries) From 713d2aff3eaa1fbcc2af8d94edd7668202a22b72 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 31 Jul 2017 00:53:10 +0100 Subject: [PATCH 04/31] premiscsv2xml - removes objectCategory, move information to attribute in order to validate against schema --- premiscsv2xml.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 8483bb4..8d85c45 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -47,6 +47,7 @@ def setup_xml(object_dictionaries): ''' namespace = '' premis_namespace = "http://www.loc.gov/premis/v3" + xsi_namespace = "http://www.w3.org/2001/XMLSchema-instance" premis = etree.fromstring(namespace) doc = etree.ElementTree(premis) for objects in object_dictionaries: @@ -56,6 +57,9 @@ def setup_xml(object_dictionaries): object_parent = create_unit( 0, premis, 'object' ) + object_parent.attrib[ + "{%s}type" % xsi_namespace + ] = "premis:%s" % objects['objectCategory'] object_identifier_uuid = create_unit( 2, object_parent, 'objectIdentifier' ) @@ -65,12 +69,8 @@ def setup_xml(object_dictionaries): object_identifier_uuid_value = create_unit( 2, object_identifier_uuid, 'objectIdentifierValue' ) - object_category = create_unit( - 4, object_parent, 'objectCategory' - ) add_value(id_list[0], object_identifier_uuid_type) add_value(id_list[1], object_identifier_uuid_value) - add_value(objects['objectCategory'], object_category) if objects['objectCategory'] == 'file': object_characteristics = create_unit( 10, object_parent, 'objectCharacteristics' From de5b4b025b22b203f5090f3db953c83e3534d36d Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 31 Jul 2017 11:12:55 +0100 Subject: [PATCH 05/31] premiscsv2xml - performs recursive item search --- premiscsv2xml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 8d85c45..f761127 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -104,11 +104,12 @@ def main(): csv_file = sys.argv[1] object_dictionaries = extract_metadata(csv_file) setup_xml(object_dictionaries) + ''' for x in object_dictionaries: for i in x: if x[i] != '': print i, x[i] print '\n' - + ''' if __name__ == '__main__': main() From 1986b32696d0cf23986aedd0db378b579e7cfe49 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 31 Jul 2017 11:13:21 +0100 Subject: [PATCH 06/31] premisobjects - remove debug statements --- premisobjects.py | 63 ++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/premisobjects.py b/premisobjects.py index a534280..1024344 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -72,36 +72,37 @@ def file_description(source, manifest, representation_uuid): item_ids = [] for root, _, filenames in os.walk(source): if os.path.basename(root) == 'objects': - filenames = [f for f in filenames if f[0] != '.'] - for item in filenames: - item_uuid = ififuncs.create_uuid() - full_path = os.path.join(root, item) - item_dictionary = {} - item_dictionary['objectIdentifier'] = ['UUID', item_uuid] - item_dictionary['objectCategory'] = 'file' - item_dictionary['size'] = str(os.path.getsize(full_path)) - item_dictionary['originalName'] = item - item_dictionary['relationship_structural_isincludedin'] = representation_uuid - item_ids.append(item_uuid) - file_data = [ - item_dictionary['objectIdentifier'], - item_dictionary['objectCategory'], - 'md5', get_checksum(manifest, item), 'internal', - item_dictionary['size'], '', '', - '', '', - '', '', - '', '', - '', - '', '', - '', - '', '', - '', - item_dictionary['relationship_structural_isincludedin'], - '', - '', - '' - ] - ififuncs.append_csv('cle.csv', file_data) + for root, _, filenames in os.walk(root): + filenames = [f for f in filenames if f[0] != '.'] + for item in filenames: + item_uuid = ififuncs.create_uuid() + full_path = os.path.join(root, item) + item_dictionary = {} + item_dictionary['objectIdentifier'] = ['UUID', item_uuid] + item_dictionary['objectCategory'] = 'file' + item_dictionary['size'] = str(os.path.getsize(full_path)) + item_dictionary['originalName'] = item + item_dictionary['relationship_structural_isincludedin'] = representation_uuid + item_ids.append(item_uuid) + file_data = [ + item_dictionary['objectIdentifier'], + item_dictionary['objectCategory'], + 'md5', get_checksum(manifest, item), 'internal', + item_dictionary['size'], '', '', + '', '', + '', '', + '', '', + '', + '', '', + '', + '', '', + '', + item_dictionary['relationship_structural_isincludedin'], + '', + '', + '' + ] + ififuncs.append_csv('cle.csv', file_data) return item_ids def representation_description(representation_uuid, item_ids): ''' @@ -142,7 +143,7 @@ def intellectual_entity_description(): intellectual_entity_dictionary = {} intellectual_entity_dictionary['objectIdentifier'] = ['UUID', ififuncs.create_uuid()] intellectual_entity_dictionary['objectCategory'] = 'intellectual entity' - print intellectual_entity_dictionary + #print intellectual_entity_dictionary def find_representation_uuid(source): ''' This extracts the representation UUID from a directory name. From 2af635607add7b86c0a8de1acf3dc810bbeca58c Mon Sep 17 00:00:00 2001 From: kieranjol Date: Wed, 2 Aug 2017 22:54:50 +0100 Subject: [PATCH 07/31] ififuncs - adds pronom/siegfied function --- ififuncs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ififuncs.py b/ififuncs.py index 385c6d1..d587049 100755 --- a/ififuncs.py +++ b/ififuncs.py @@ -12,6 +12,7 @@ import uuid import tempfile import csv +import json from glob import glob from email.mime.multipart import MIMEMultipart from email.mime.audio import MIMEAudio @@ -774,3 +775,18 @@ def checksum_replace(manifest, logname): with open(manifest, 'wb') as fo: for lines in updated_manifest: fo.write(lines) + +def get_pronom_format(filename): + ''' + Uses siegfried to return a tuple that contains: + pronom_id, authority, siegfried version + ''' + siegfried_json = subprocess.check_output( + ['sf', '-json', filename] + ) + json_object = json.loads(siegfried_json) + pronom_id = str(json_object['files'][0]['matches'][0]['id']) + authority = str(json_object['files'][0]['matches'][0]['ns']) + version = str(json_object['siegfried']) + return (pronom_id, authority, version) + From 55b6c86e535840ec84cfbd2525ab7ff32bbcab5e Mon Sep 17 00:00:00 2001 From: kieranjol Date: Wed, 2 Aug 2017 23:14:03 +0100 Subject: [PATCH 08/31] premiscsv - ads siegfried/pronom format registry --- premiscsv2xml.py | 18 ++++++++++++++++++ premisobjects.py | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index f761127..8e5319e 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -81,6 +81,21 @@ def setup_xml(object_dictionaries): size = create_unit( 1, object_characteristics, 'size' ) + format_ = create_unit( + 2, object_characteristics, 'format' + ) + format_registry = create_unit( + 1, format_, 'formatRegistry' + ) + format_registry_name = create_unit( + 0, format_registry, 'formatRegistryName' + ) + format_registry_key = create_unit( + 1, format_registry, 'formatRegistryKey' + ) + format_registry_role = create_unit( + 2, format_registry, 'formatRegistryRole' + ) size.text = objects['size'] message_digest_algorithm = create_unit( 0, fixity, 'messageDigestAlgorithm' @@ -94,6 +109,9 @@ def setup_xml(object_dictionaries): message_digest_originator.text = objects['messageDigestOriginator'] message_digest.text = objects['messageDigest'] message_digest_algorithm.text = objects['messageDigestAlgorithm'] + format_registry_name.text = objects['formatRegistryName'] + format_registry_key.text = objects['formatRegistryKey'] + format_registry_role.text = objects['formatRegistryRole'] print(etree.tostring(doc, pretty_print=True)) return premis_namespace, doc, premis def main(): diff --git a/premisobjects.py b/premisobjects.py index 1024344..082466e 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -49,6 +49,7 @@ def make_skeleton_csv(): 'objectCategory', 'messageDigestAlgorithm', 'messageDigest', 'messageDigestOriginator', 'size', 'formatName', 'formatVersion', + 'formatRegistryName', 'formatRegistryKey', 'formatRegistryRole', 'objectCharacteristicsExtension', 'originalName', 'contentLocationType', 'contentLocationValue', 'relatedObjectIdentifierType', 'relatedObjectIdentifierValue', @@ -77,6 +78,9 @@ def file_description(source, manifest, representation_uuid): for item in filenames: item_uuid = ififuncs.create_uuid() full_path = os.path.join(root, item) + pronom_id, authority, version = ififuncs.get_pronom_format( + full_path + ) item_dictionary = {} item_dictionary['objectIdentifier'] = ['UUID', item_uuid] item_dictionary['objectCategory'] = 'file' @@ -89,6 +93,7 @@ def file_description(source, manifest, representation_uuid): item_dictionary['objectCategory'], 'md5', get_checksum(manifest, item), 'internal', item_dictionary['size'], '', '', + authority, pronom_id, 'identification', '', '', '', '', '', '', @@ -120,6 +125,7 @@ def representation_description(representation_uuid, item_ids): representation_dictionary['objectCategory'], '', '', '', '', '', '', + '', '', '', '', '', '', '', '', '', From f4af73e27e08eae269dde1b62fc5f5cc8ec8eb52 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Thu, 3 Aug 2017 22:47:35 +0100 Subject: [PATCH 09/31] premiscsv - fix mediainfo eventDetail --- premiscsv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/premiscsv.py b/premiscsv.py index 8ee66b8..c0dde87 100755 --- a/premiscsv.py +++ b/premiscsv.py @@ -67,6 +67,9 @@ def find_events(logfile): event_outcome_detail_note = 'Submission Information Package' if ('eventDetail=Mediatrace' in log_entry) or ('eventDetail=Technical' in log_entry): event_type = 'metadata extraction' + event_detail = log_entry.split( + 'eventDetail=', 1 + )[1].split(',')[0] event_outcome = log_entry.split( 'eventOutcome=', 1 )[1].replace(', agentName=mediainfo', '').replace('\n', '') @@ -85,7 +88,7 @@ def find_events(logfile): '', 'UUID', linking_object_identifier_value, '' ] - ififuncs.append_csv('bla.csv', event_row) + ififuncs.append_csv('events.csv', event_row) def make_events_csv(): ''' @@ -104,12 +107,13 @@ def make_events_csv(): 'linkingAgentIdentifierRole', 'linkingObjectIdentifierType', 'linkingObjectIdentifierValue', 'linkingObjectRole' ] - ififuncs.create_csv('bla.csv', premis_events) + ififuncs.create_csv('events.csv', premis_events) def main(): ''' Launches all the other functions when run from the command line. ''' + make_events_csv() logfile = sys.argv[1] find_events(logfile) From 33b78d5c9a1d446dcd0a7055dc1302e25f195404 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Thu, 3 Aug 2017 23:06:06 +0100 Subject: [PATCH 10/31] premisobjects - extract relative path of object for contentLocation --- premisobjects.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/premisobjects.py b/premisobjects.py index 082466e..ae0ba84 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -27,14 +27,14 @@ def get_checksum(manifest, filename): ''' - Extracts checksum from manifest, rather than generating a fresh one. + Extracts the checksum and path within a manifest, returning both as a tuple. ''' if os.path.isfile(manifest): with open(manifest, 'r') as manifest_object: manifest_lines = manifest_object.readlines() for md5 in manifest_lines: if filename in md5: - return md5[:32] + return md5[:32], md5[34:] def make_skeleton_csv(): @@ -63,7 +63,7 @@ def make_skeleton_csv(): 'relationship_structural_hasroot', 'relationship_derivation_hassource' ] - ififuncs.create_csv('cle.csv', premis_object_units) + ififuncs.create_csv('objects.csv', premis_object_units) def file_description(source, manifest, representation_uuid): @@ -76,6 +76,7 @@ def file_description(source, manifest, representation_uuid): for root, _, filenames in os.walk(root): filenames = [f for f in filenames if f[0] != '.'] for item in filenames: + md5, urn = get_checksum(manifest, item) item_uuid = ififuncs.create_uuid() full_path = os.path.join(root, item) pronom_id, authority, version = ififuncs.get_pronom_format( @@ -91,11 +92,11 @@ def file_description(source, manifest, representation_uuid): file_data = [ item_dictionary['objectIdentifier'], item_dictionary['objectCategory'], - 'md5', get_checksum(manifest, item), 'internal', + 'md5', md5, 'internal', item_dictionary['size'], '', '', authority, pronom_id, 'identification', - '', '', - '', '', + '', item, + 'urn', urn, '', '', '', '', '', @@ -107,7 +108,7 @@ def file_description(source, manifest, representation_uuid): '', '' ] - ififuncs.append_csv('cle.csv', file_data) + ififuncs.append_csv('objects.csv', file_data) return item_ids def representation_description(representation_uuid, item_ids): ''' @@ -139,7 +140,7 @@ def representation_description(representation_uuid, item_ids): '', '' ] - ififuncs.append_csv('cle.csv', representation_data) + ififuncs.append_csv('objects.csv', representation_data) def intellectual_entity_description(): From 7bc4538ca2c3ed5f413b5a1a813c1773e4a3f19c Mon Sep 17 00:00:00 2001 From: kieranjol Date: Fri, 4 Aug 2017 00:51:49 +0100 Subject: [PATCH 11/31] premiscsv - extract linkingObjectIdentifier for metadata extraction --- premiscsv.py | 26 ++++++++++++++++++++++++++ premisobjects.py | 5 +++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/premiscsv.py b/premiscsv.py index c0dde87..5874219 100755 --- a/premiscsv.py +++ b/premiscsv.py @@ -5,8 +5,19 @@ ''' import os import sys +import csv # from lxml import etree import ififuncs + +def extract_metadata(csv_file): + ''' + Read the PREMIS csv and store the metadata in a list of dictionaries. + ''' + object_dictionaries = [] + input_file = csv.DictReader(open(csv_file)) + for rows in input_file: + object_dictionaries.append(rows) + return object_dictionaries def find_events(logfile): ''' A very hacky attempt to extract the relevant preservation events from our @@ -24,6 +35,10 @@ def find_events(logfile): manifest_event = line_fragment.replace( 'eventDetail', '' ).replace('\n', '').split('=')[1] + object_info = extract_metadata('objects.csv') + object_locations = {} + for i in object_info: + object_locations[i['contentLocationValue']] = i['objectIdentifier'].split(', ')[1].replace(']', '') for log_entry in log_lines: valid_entries = [ 'eventType', @@ -75,6 +90,17 @@ def find_events(logfile): )[1].replace(', agentName=mediainfo', '').replace('\n', '') if 'eventDetail=Mediatrace' in log_entry: event_outcome = event_outcome.replace('mediainfo.xml', 'mediatrace.xml') + for x in object_locations: + ''' + This is trying to get the UUID of the source object + that relates to the mediainfo xmls. This is + achieved via a dictionary. + ''' + if 'objects' in x: + a = os.path.basename(event_outcome).replace('_mediainfo.xml', '').replace('_mediatrace.xml', '')[:-1] + b = os.path.basename(x) + if a == b: + linking_object_identifier_value = object_locations[x].replace('\'','') if (break_loop == 'continue') or (event_type == ''): continue print event_type diff --git a/premisobjects.py b/premisobjects.py index ae0ba84..503fc9e 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -33,8 +33,9 @@ def get_checksum(manifest, filename): with open(manifest, 'r') as manifest_object: manifest_lines = manifest_object.readlines() for md5 in manifest_lines: - if filename in md5: - return md5[:32], md5[34:] + if 'objects' in md5: + if filename in md5: + return md5[:32], md5[34:].rstrip() def make_skeleton_csv(): From 0f6e048728c408eae69545e418c097314724988d Mon Sep 17 00:00:00 2001 From: kieranjol Date: Fri, 4 Aug 2017 01:13:34 +0100 Subject: [PATCH 12/31] premiscsv - ads contentLocation to xml transform --- premiscsv2xml.py | 16 +++++++++++++++- premisobjects.py | 4 ++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 8e5319e..266a28d 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -73,7 +73,19 @@ def setup_xml(object_dictionaries): add_value(id_list[1], object_identifier_uuid_value) if objects['objectCategory'] == 'file': object_characteristics = create_unit( - 10, object_parent, 'objectCharacteristics' + 5, object_parent, 'objectCharacteristics' + ) + storage = create_unit( + 7, object_parent, 'storage' + ) + content_location = create_unit( + 0, storage, 'contentLocation' + ) + content_location_type = create_unit( + 0, content_location, 'contentLocationType' + ) + content_location_value = create_unit( + 1, content_location, 'contentLocationValue' ) fixity = create_unit( 0, object_characteristics, 'fixity' @@ -112,6 +124,8 @@ def setup_xml(object_dictionaries): format_registry_name.text = objects['formatRegistryName'] format_registry_key.text = objects['formatRegistryKey'] format_registry_role.text = objects['formatRegistryRole'] + content_location_type.text = objects['contentLocationType'] + content_location_value.text = objects['contentLocationValue'] print(etree.tostring(doc, pretty_print=True)) return premis_namespace, doc, premis def main(): diff --git a/premisobjects.py b/premisobjects.py index 503fc9e..991249b 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -77,7 +77,7 @@ def file_description(source, manifest, representation_uuid): for root, _, filenames in os.walk(root): filenames = [f for f in filenames if f[0] != '.'] for item in filenames: - md5, urn = get_checksum(manifest, item) + md5, uri = get_checksum(manifest, item) item_uuid = ififuncs.create_uuid() full_path = os.path.join(root, item) pronom_id, authority, version = ififuncs.get_pronom_format( @@ -97,7 +97,7 @@ def file_description(source, manifest, representation_uuid): item_dictionary['size'], '', '', authority, pronom_id, 'identification', '', item, - 'urn', urn, + 'uri', uri, '', '', '', '', '', From 4ab7049d3f1ff2e2fbb305ee1d0c274ced0caac6 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sat, 5 Aug 2017 18:45:40 +0100 Subject: [PATCH 13/31] adds linkingEventIdentifierValue to object descriptions --- premiscsv.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/premiscsv.py b/premiscsv.py index 5874219..0270d96 100755 --- a/premiscsv.py +++ b/premiscsv.py @@ -116,6 +116,53 @@ def find_events(logfile): ] ififuncs.append_csv('events.csv', event_row) +def update_objects(): + ''' + Update the object description with the linkingEventIdentifiers + ''' + link_dict = {} + event_dicts = extract_metadata('events.csv') + for i in event_dicts: + a = i['eventIdentifierValue'] + try: + link_dict[i['linkingObjectIdentifierValue']] += a + '|' + except KeyError: + link_dict[i['linkingObjectIdentifierValue']] = a + '|' + print link_dict + object_dicts = extract_metadata('objects.csv') + for x in object_dicts: + for link in link_dict: + + if link == x['objectIdentifier'].split(', ')[1].replace(']', '').replace('\'',''): + x['linkingEventIdentifierValue'] = link_dict[link] + premis_object_units = [ + 'objectIdentifier', + 'objectCategory', + 'messageDigestAlgorithm', 'messageDigest', 'messageDigestOriginator', + 'size', 'formatName', 'formatVersion', + 'formatRegistryName', 'formatRegistryKey', 'formatRegistryRole', + 'objectCharacteristicsExtension', 'originalName', + 'contentLocationType', 'contentLocationValue', + 'relatedObjectIdentifierType', 'relatedObjectIdentifierValue', + 'relatedObjectSequence', + 'relatedEventIdentifierType', 'relatedEventIdentifierValue', + 'relatedEventSequence', + 'linkingEventIdentifierType', 'linkingEventIdentifierValue', + 'relationship_structural_includes', + 'relationship_structural_isincludedin', + 'relationship_structural_represents', + 'relationship_structural_hasroot', + 'relationship_derivation_hassource' + ] + with open('mycsvfile.csv', 'wb') as f: + counter = 0 + for i in object_dicts: + w = csv.DictWriter(f, fieldnames=premis_object_units) + if counter == 0: + w.writeheader() + counter += 1 + w.writerow(i) + def make_events_csv(): ''' Generates a CSV with PREMIS-esque headings. Currently it's just called @@ -142,6 +189,7 @@ def main(): make_events_csv() logfile = sys.argv[1] find_events(logfile) + update_objects() if __name__ == '__main__': main() From 494a167f8913f09a984e40c4a6d0a6343e67449d Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sat, 5 Aug 2017 19:24:44 +0100 Subject: [PATCH 14/31] premiscsv2xml - converts Events CSV to XML --- premiscsv2xml.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 266a28d..1469bf8 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -40,7 +40,7 @@ def create_unit(index, parent, unitname): parent.insert(index, unitname) return unitname -def setup_xml(object_dictionaries): +def setup_xml(object_dictionaries, event_dictionaries): ''' This should just create the PREMIS lxml object. Actual metadata generation should be moved to other functions. @@ -126,6 +126,29 @@ def setup_xml(object_dictionaries): format_registry_role.text = objects['formatRegistryRole'] content_location_type.text = objects['contentLocationType'] content_location_value.text = objects['contentLocationValue'] + for x in event_dictionaries: + event_parent = create_unit( + 99, premis, 'event' + ) + event_identifier_uuid = create_unit( + 1, event_parent, 'eventIdentifier' + ) + event_identifier_uuid_type = create_unit( + 1, event_identifier_uuid, 'eventIdentifierType' + ) + event_identifier_uuid_value = create_unit( + 2, event_identifier_uuid, 'eventIdentifierValue' + ) + event_type = create_unit( + 1, event_parent, 'eventType' + ) + event_date_time = create_unit( + 1, event_parent, 'eventDateTime' + ) + event_identifier_uuid_type.text = x['eventIdentifierType'] + event_identifier_uuid_value.text = x['eventIdentifierValue'] + event_type.text = x['eventType'] + event_date_time.text = x['eventDateTime'] print(etree.tostring(doc, pretty_print=True)) return premis_namespace, doc, premis def main(): @@ -134,8 +157,10 @@ def main(): For debugging purposes, the contents of the CSV is printed to screen. ''' csv_file = sys.argv[1] + events_csv = sys.argv[2] object_dictionaries = extract_metadata(csv_file) - setup_xml(object_dictionaries) + event_dictionaries = extract_metadata(events_csv) + setup_xml(object_dictionaries, event_dictionaries) ''' for x in object_dictionaries: for i in x: From 39c3235fa27af94c700d3f51e391bc66e595ab6c Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sat, 5 Aug 2017 19:27:08 +0100 Subject: [PATCH 15/31] premiscsv2xml - fixes element order, XML validates again against schema --- premiscsv2xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 1469bf8..1361432 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -143,7 +143,7 @@ def setup_xml(object_dictionaries, event_dictionaries): 1, event_parent, 'eventType' ) event_date_time = create_unit( - 1, event_parent, 'eventDateTime' + 2, event_parent, 'eventDateTime' ) event_identifier_uuid_type.text = x['eventIdentifierType'] event_identifier_uuid_value.text = x['eventIdentifierValue'] From 63cc919bae0bcfcd4f0c4f4ac3e60af535c9fec4 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sat, 5 Aug 2017 23:04:18 +0100 Subject: [PATCH 16/31] premiscsv2xml - more PREMIS event info --- premiscsv2xml.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 1361432..1815719 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -145,10 +145,31 @@ def setup_xml(object_dictionaries, event_dictionaries): event_date_time = create_unit( 2, event_parent, 'eventDateTime' ) + event_detail_information = create_unit( + 3, event_parent, 'eventDetailInformation' + ) + event_detail = create_unit( + 1, event_detail_information, 'eventDetail' + ) + event_outcome_information = create_unit( + 4, event_parent, 'eventOutcomeInformation' + ) + event_outcome = create_unit( + 1, event_outcome_information, 'eventOutcome' + ) + event_outcome_detail = create_unit( + 2, event_outcome_information, 'eventOutcomeDetail' + ) + event_outcome_detail_note = create_unit( + 1, event_outcome_detail, 'eventOutcomeDetailNote' + ) event_identifier_uuid_type.text = x['eventIdentifierType'] event_identifier_uuid_value.text = x['eventIdentifierValue'] event_type.text = x['eventType'] event_date_time.text = x['eventDateTime'] + event_detail.text = x['eventDetail'] + event_outcome.text = x['eventOutcome'] + event_outcome_detail_note.text = x['eventOutcomeDetailNote'] print(etree.tostring(doc, pretty_print=True)) return premis_namespace, doc, premis def main(): From d079604d57ea4a488ccfb46b47028c6f0290d864 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sat, 5 Aug 2017 23:22:32 +0100 Subject: [PATCH 17/31] premiscsv2xml - adds linkingEventIdentifiers --- premiscsv2xml.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 1815719..920b9ee 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -126,6 +126,20 @@ def setup_xml(object_dictionaries, event_dictionaries): format_registry_role.text = objects['formatRegistryRole'] content_location_type.text = objects['contentLocationType'] content_location_value.text = objects['contentLocationValue'] + linked_events = objects['linkingEventIdentifierValue'].split('|') + for event in linked_events: + if event != '': + linking_event_identifier = create_unit( + 99, object_parent, 'linkingEventIdentifier' + ) + linking_event_identifier_type = create_unit( + 1, linking_event_identifier, 'linkingEventIdentifierType' + ) + linking_event_identifier_value = create_unit( + 2, linking_event_identifier, 'linkingEventIdentifierValue' + ) + linking_event_identifier_type.text = 'UUID' + linking_event_identifier_value.text = event for x in event_dictionaries: event_parent = create_unit( 99, premis, 'event' From bd4259f04833ad94de4b16a717ea4dc8f2f922c9 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sun, 6 Aug 2017 18:33:25 +0100 Subject: [PATCH 18/31] ififuncs/premiscsv - moves functions form premis scripts into ififuncs --- ififuncs.py | 31 +++++++++++++++++++++++++++++++ premiscsv.py | 18 ++++++------------ premiscsv2xml.py | 15 +++------------ premisobjects.py | 24 +++--------------------- 4 files changed, 43 insertions(+), 45 deletions(-) diff --git a/ififuncs.py b/ififuncs.py index d587049..87a8d1d 100755 --- a/ififuncs.py +++ b/ififuncs.py @@ -790,3 +790,34 @@ def get_pronom_format(filename): version = str(json_object['siegfried']) return (pronom_id, authority, version) +def get_checksum(manifest, filename): + ''' + Extracts the checksum and path within a manifest, returning both as a tuple. + ''' + if os.path.isfile(manifest): + with open(manifest, 'r') as manifest_object: + manifest_lines = manifest_object.readlines() + for md5 in manifest_lines: + if 'objects' in md5: + if filename in md5: + return md5[:32], md5[34:].rstrip() + +def find_representation_uuid(source): + ''' + This extracts the representation UUID from a directory name. + This should be moved to ififuncs as it can be used by other scripts. + ''' + for root, _, _ in os.walk(source): + if 'objects' in root: + return os.path.basename(os.path.dirname(root)) + +def extract_metadata(csv_file): + ''' + Read the PREMIS csv and store the metadata in a list of dictionaries. + ''' + object_dictionaries = [] + input_file = csv.DictReader(open(csv_file)) + for rows in input_file: + object_dictionaries.append(rows) + return object_dictionaries + diff --git a/premiscsv.py b/premiscsv.py index 0270d96..f0a2317 100755 --- a/premiscsv.py +++ b/premiscsv.py @@ -6,18 +6,11 @@ import os import sys import csv +import shutil # from lxml import etree import ififuncs -def extract_metadata(csv_file): - ''' - Read the PREMIS csv and store the metadata in a list of dictionaries. - ''' - object_dictionaries = [] - input_file = csv.DictReader(open(csv_file)) - for rows in input_file: - object_dictionaries.append(rows) - return object_dictionaries + def find_events(logfile): ''' A very hacky attempt to extract the relevant preservation events from our @@ -35,7 +28,7 @@ def find_events(logfile): manifest_event = line_fragment.replace( 'eventDetail', '' ).replace('\n', '').split('=')[1] - object_info = extract_metadata('objects.csv') + object_info = ififuncs.extract_metadata('objects.csv') object_locations = {} for i in object_info: object_locations[i['contentLocationValue']] = i['objectIdentifier'].split(', ')[1].replace(']', '') @@ -121,7 +114,7 @@ def update_objects(): Update the object description with the linkingEventIdentifiers ''' link_dict = {} - event_dicts = extract_metadata('events.csv') + event_dicts = ififuncs.extract_metadata('events.csv') for i in event_dicts: a = i['eventIdentifierValue'] try: @@ -129,7 +122,7 @@ def update_objects(): except KeyError: link_dict[i['linkingObjectIdentifierValue']] = a + '|' print link_dict - object_dicts = extract_metadata('objects.csv') + object_dicts = ififuncs.extract_metadata('objects.csv') for x in object_dicts: for link in link_dict: @@ -162,6 +155,7 @@ def update_objects(): w.writeheader() counter += 1 w.writerow(i) + shutil.move('mycsvfile.csv', 'objects.csv') def make_events_csv(): ''' diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 920b9ee..1fc0921 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -5,16 +5,7 @@ import sys import csv from lxml import etree - -def extract_metadata(csv_file): - ''' - Read the PREMIS csv and store the metadata in a list of dictionaries. - ''' - object_dictionaries = [] - input_file = csv.DictReader(open(csv_file)) - for rows in input_file: - object_dictionaries.append(rows) - return object_dictionaries +import ififuncs def add_value(value, element): ''' @@ -193,8 +184,8 @@ def main(): ''' csv_file = sys.argv[1] events_csv = sys.argv[2] - object_dictionaries = extract_metadata(csv_file) - event_dictionaries = extract_metadata(events_csv) + object_dictionaries = ififuncs.extract_metadata(csv_file) + event_dictionaries = ififuncs.extract_metadata(events_csv) setup_xml(object_dictionaries, event_dictionaries) ''' for x in object_dictionaries: diff --git a/premisobjects.py b/premisobjects.py index 991249b..f0fe6db 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -25,17 +25,7 @@ import ififuncs -def get_checksum(manifest, filename): - ''' - Extracts the checksum and path within a manifest, returning both as a tuple. - ''' - if os.path.isfile(manifest): - with open(manifest, 'r') as manifest_object: - manifest_lines = manifest_object.readlines() - for md5 in manifest_lines: - if 'objects' in md5: - if filename in md5: - return md5[:32], md5[34:].rstrip() + def make_skeleton_csv(): @@ -77,7 +67,7 @@ def file_description(source, manifest, representation_uuid): for root, _, filenames in os.walk(root): filenames = [f for f in filenames if f[0] != '.'] for item in filenames: - md5, uri = get_checksum(manifest, item) + md5, uri = ififuncs.get_checksum(manifest, item) item_uuid = ififuncs.create_uuid() full_path = os.path.join(root, item) pronom_id, authority, version = ififuncs.get_pronom_format( @@ -152,14 +142,6 @@ def intellectual_entity_description(): intellectual_entity_dictionary['objectIdentifier'] = ['UUID', ififuncs.create_uuid()] intellectual_entity_dictionary['objectCategory'] = 'intellectual entity' #print intellectual_entity_dictionary -def find_representation_uuid(source): - ''' - This extracts the representation UUID from a directory name. - This should be moved to ififuncs as it can be used by other scripts. - ''' - for root, _, _ in os.walk(source): - if 'objects' in root: - return os.path.basename(os.path.dirname(root)) def main(): @@ -169,7 +151,7 @@ def main(): make_skeleton_csv() source = sys.argv[1] manifest = sys.argv[2] - representation_uuid = find_representation_uuid(source) + representation_uuid = ififuncs.find_representation_uuid(source) item_ids = file_description(source, manifest, representation_uuid) #intellectual_entity_description() representation_description(representation_uuid, item_ids) From a455a45cedf2f92deedeb6fec0b88391ffa2ee7b Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sun, 6 Aug 2017 20:36:38 +0100 Subject: [PATCH 19/31] premiscsv2xml - removes stupid add_value() function --- premiscsv2xml.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 1fc0921..90ae7f5 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -7,12 +7,6 @@ from lxml import etree import ififuncs -def add_value(value, element): - ''' - Useless helper function - remove! - ''' - element.text = value - def write_premis(doc, premisxml): ''' @@ -60,8 +54,8 @@ def setup_xml(object_dictionaries, event_dictionaries): object_identifier_uuid_value = create_unit( 2, object_identifier_uuid, 'objectIdentifierValue' ) - add_value(id_list[0], object_identifier_uuid_type) - add_value(id_list[1], object_identifier_uuid_value) + object_identifier_uuid_type.text = id_list[0] + object_identifier_uuid_value.text = id_list[1] if objects['objectCategory'] == 'file': object_characteristics = create_unit( 5, object_parent, 'objectCharacteristics' From 4d762b53adf1cac2aa6f1073066f058759b67194 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sun, 6 Aug 2017 21:55:28 +0100 Subject: [PATCH 20/31] premiscsv2xml - cleanup and docstrings --- premiscsv2xml.py | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 90ae7f5..be5bea5 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -3,7 +3,6 @@ Takes a PREMIS CSV file, as generated by premiscsv.py, and transform into XML. ''' import sys -import csv from lxml import etree import ififuncs @@ -25,16 +24,20 @@ def create_unit(index, parent, unitname): parent.insert(index, unitname) return unitname -def setup_xml(object_dictionaries, event_dictionaries): +def setup_xml(): ''' This should just create the PREMIS lxml object. Actual metadata generation should be moved to other functions. ''' namespace = '' - premis_namespace = "http://www.loc.gov/premis/v3" - xsi_namespace = "http://www.w3.org/2001/XMLSchema-instance" premis = etree.fromstring(namespace) - doc = etree.ElementTree(premis) + return premis + +def describe_objects(premis, object_dictionaries): + ''' + Converts the CSV object metadata into PREMIS XML. + ''' + xsi_namespace = "http://www.w3.org/2001/XMLSchema-instance" for objects in object_dictionaries: id_list = objects['objectIdentifier'].replace( '[', '' @@ -115,20 +118,25 @@ def setup_xml(object_dictionaries, event_dictionaries): for event in linked_events: if event != '': linking_event_identifier = create_unit( - 99, object_parent, 'linkingEventIdentifier' - ) + 99, object_parent, 'linkingEventIdentifier' + ) linking_event_identifier_type = create_unit( - 1, linking_event_identifier, 'linkingEventIdentifierType' - ) + 1, linking_event_identifier, 'linkingEventIdentifierType' + ) linking_event_identifier_value = create_unit( - 2, linking_event_identifier, 'linkingEventIdentifierValue' - ) + 2, linking_event_identifier, 'linkingEventIdentifierValue' + ) linking_event_identifier_type.text = 'UUID' linking_event_identifier_value.text = event + return premis, linked_events +def describe_events(premis, event_dictionaries): + ''' + Converts the CSV object metadata into PREMIS XML. + ''' for x in event_dictionaries: event_parent = create_unit( - 99, premis, 'event' - ) + 99, premis, 'event' + ) event_identifier_uuid = create_unit( 1, event_parent, 'eventIdentifier' ) @@ -169,8 +177,9 @@ def setup_xml(object_dictionaries, event_dictionaries): event_detail.text = x['eventDetail'] event_outcome.text = x['eventOutcome'] event_outcome_detail_note.text = x['eventOutcomeDetailNote'] - print(etree.tostring(doc, pretty_print=True)) - return premis_namespace, doc, premis + print(etree.tostring(premis, pretty_print=True)) + + def main(): ''' Launches all the other functions when run from the command line. @@ -180,7 +189,9 @@ def main(): events_csv = sys.argv[2] object_dictionaries = ififuncs.extract_metadata(csv_file) event_dictionaries = ififuncs.extract_metadata(events_csv) - setup_xml(object_dictionaries, event_dictionaries) + premis = setup_xml() + premis = describe_objects(premis, object_dictionaries) + describe_events(premis, event_dictionaries) ''' for x in object_dictionaries: for i in x: From 73f08d619a92c6d5f8f3d6c463c9ff74e15fb539 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Sun, 6 Aug 2017 23:10:48 +0100 Subject: [PATCH 21/31] premisobjects/premiscsv2xml - more cleanup --- premiscsv2xml.py | 11 +++-------- premisobjects.py | 1 - 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index be5bea5..127fb8d 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -128,7 +128,7 @@ def describe_objects(premis, object_dictionaries): ) linking_event_identifier_type.text = 'UUID' linking_event_identifier_value.text = event - return premis, linked_events + return premis def describe_events(premis, event_dictionaries): ''' Converts the CSV object metadata into PREMIS XML. @@ -192,12 +192,7 @@ def main(): premis = setup_xml() premis = describe_objects(premis, object_dictionaries) describe_events(premis, event_dictionaries) - ''' - for x in object_dictionaries: - for i in x: - if x[i] != '': - print i, x[i] - print '\n' - ''' + + if __name__ == '__main__': main() diff --git a/premisobjects.py b/premisobjects.py index f0fe6db..af788ce 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -12,7 +12,6 @@ todo: Document identifier assignment for files and IE. Probably in events sheet? Allow for derivation to be entered -Link with events sheet Link mediainfo xml in /metadata to the objectCharacteristicsExtension field. From a6288cc032c3af17de1fe97b0c6770dd4ab1aba7 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 7 Aug 2017 00:57:09 +0100 Subject: [PATCH 22/31] ififuncs/premiscsv - adds argparse and changes variables in all PREMIS CSV scripts --- premiscsv.py | 58 ++++++++++++++++++++++++++++++++++++------------ premiscsv2xml.py | 33 ++++++++++++++++++++++----- premisobjects.py | 56 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 114 insertions(+), 33 deletions(-) diff --git a/premiscsv.py b/premiscsv.py index f0a2317..3d8a4dd 100755 --- a/premiscsv.py +++ b/premiscsv.py @@ -7,11 +7,12 @@ import sys import csv import shutil +import argparse # from lxml import etree import ififuncs -def find_events(logfile): +def find_events(logfile, objects_csv, output): ''' A very hacky attempt to extract the relevant preservation events from our log files. @@ -107,14 +108,14 @@ def find_events(logfile): '', 'UUID', linking_object_identifier_value, '' ] - ififuncs.append_csv('events.csv', event_row) + ififuncs.append_csv(output, event_row) -def update_objects(): +def update_objects(output, objects_csv): ''' Update the object description with the linkingEventIdentifiers ''' link_dict = {} - event_dicts = ififuncs.extract_metadata('events.csv') + event_dicts = ififuncs.extract_metadata(output) for i in event_dicts: a = i['eventIdentifierValue'] try: @@ -122,7 +123,7 @@ def update_objects(): except KeyError: link_dict[i['linkingObjectIdentifierValue']] = a + '|' print link_dict - object_dicts = ififuncs.extract_metadata('objects.csv') + object_dicts = ififuncs.extract_metadata(objects_csv) for x in object_dicts: for link in link_dict: @@ -155,9 +156,9 @@ def update_objects(): w.writeheader() counter += 1 w.writerow(i) - shutil.move('mycsvfile.csv', 'objects.csv') + shutil.move('mycsvfile.csv', objects_csv) -def make_events_csv(): +def make_events_csv(output): ''' Generates a CSV with PREMIS-esque headings. Currently it's just called 'bla.csv' but it will probably be called: @@ -174,16 +175,45 @@ def make_events_csv(): 'linkingAgentIdentifierRole', 'linkingObjectIdentifierType', 'linkingObjectIdentifierValue', 'linkingObjectRole' ] - ififuncs.create_csv('events.csv', premis_events) + ififuncs.create_csv(output, premis_events) -def main(): +def parse_args(args_): + ''' + Parse command line arguments. + ''' + parser = argparse.ArgumentParser( + description='Describes events using PREMIS data dictionary via CSV' + ' Written by Kieran O\'Leary.' + ) + parser.add_argument( + '-i', + help='full path of a log textfile', required=True + ) + parser.add_argument( + '-o', + help='full path of output csv', required=True + ) + parser.add_argument( + '-object_csv', + help='full path of object description csv', required=True + ) + parser.add_argument( + '-user', + help='Declare who you are. If this is not set, you will be prompted.' + ) + parsed_args = parser.parse_args(args_) + return parsed_args +def main(args_): ''' Launches all the other functions when run from the command line. ''' - make_events_csv() - logfile = sys.argv[1] - find_events(logfile) - update_objects() + args = parse_args(args_) + logfile = args.i + output = args.o + objects_csv = args.object_csv + make_events_csv(output) + find_events(logfile, objects_csv, output) + update_objects(output, objects_csv) if __name__ == '__main__': - main() + main(sys.argv[1:]) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 127fb8d..3611923 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -3,6 +3,7 @@ Takes a PREMIS CSV file, as generated by premiscsv.py, and transform into XML. ''' import sys +import argparse from lxml import etree import ififuncs @@ -179,14 +180,36 @@ def describe_events(premis, event_dictionaries): event_outcome_detail_note.text = x['eventOutcomeDetailNote'] print(etree.tostring(premis, pretty_print=True)) - -def main(): +def parse_args(args_): + ''' + Parse command line arguments. + ''' + parser = argparse.ArgumentParser( + description='Converts PREMIS CSV to XML' + ' Written by Kieran O\'Leary.' + ) + parser.add_argument( + '-i', + help='full path of objects csv', required=True + ) + parser.add_argument( + '-ev', + help='full path of events csv', required=True + ) + parser.add_argument( + '-user', + help='Declare who you are. If this is not set, you will be prompted.' + ) + parsed_args = parser.parse_args(args_) + return parsed_args +def main(args_): ''' Launches all the other functions when run from the command line. For debugging purposes, the contents of the CSV is printed to screen. ''' - csv_file = sys.argv[1] - events_csv = sys.argv[2] + args = parse_args(args_) + csv_file = args.i + events_csv = args.ev object_dictionaries = ififuncs.extract_metadata(csv_file) event_dictionaries = ififuncs.extract_metadata(events_csv) premis = setup_xml() @@ -195,4 +218,4 @@ def main(): if __name__ == '__main__': - main() + main(sys.argv[1:]) diff --git a/premisobjects.py b/premisobjects.py index af788ce..f8d8f84 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -21,13 +21,14 @@ import os import sys +import argparse import ififuncs -def make_skeleton_csv(): +def make_skeleton_csv(output): ''' Generates a CSV with PREMIS-esque headings. Currently it's just called 'cle.csv' but it will probably be called: @@ -53,10 +54,10 @@ def make_skeleton_csv(): 'relationship_structural_hasroot', 'relationship_derivation_hassource' ] - ififuncs.create_csv('objects.csv', premis_object_units) + ififuncs.create_csv(output, premis_object_units) -def file_description(source, manifest, representation_uuid): +def file_description(source, manifest, representation_uuid, output): ''' Generate PREMIS descriptions for items and write to CSV. ''' @@ -98,9 +99,9 @@ def file_description(source, manifest, representation_uuid): '', '' ] - ififuncs.append_csv('objects.csv', file_data) + ififuncs.append_csv(output, file_data) return item_ids -def representation_description(representation_uuid, item_ids): +def representation_description(representation_uuid, item_ids, output): ''' Generate PREMIS descriptions for a representation and write to CSV. ''' @@ -130,7 +131,7 @@ def representation_description(representation_uuid, item_ids): '', '' ] - ififuncs.append_csv('objects.csv', representation_data) + ififuncs.append_csv(output, representation_data) def intellectual_entity_description(): @@ -142,19 +143,46 @@ def intellectual_entity_description(): intellectual_entity_dictionary['objectCategory'] = 'intellectual entity' #print intellectual_entity_dictionary - -def main(): +def parse_args(args_): + ''' + Parse command line arguments. + ''' + parser = argparse.ArgumentParser( + description='Describes objects using PREMIS data dictionary using CSV' + ' Written by Kieran O\'Leary.' + ) + parser.add_argument( + '-i', + help='full path of input directory', required=True + ) + parser.add_argument( + '-o', '-output', + help='full path of output directory', required=True + ) + parser.add_argument( + '-m', '-manifest', + help='full path to a pre-existing manifest', required=True + ) + parser.add_argument( + '-user', + help='Declare who you are. If this is not set, you will be prompted.' + ) + parsed_args = parser.parse_args(args_) + return parsed_args +def main(args_): ''' Launches all the other functions when run from the command line. ''' - make_skeleton_csv() - source = sys.argv[1] - manifest = sys.argv[2] + args = parse_args(args_) + source = args.i + output = args.o + manifest = args.m + make_skeleton_csv(output) representation_uuid = ififuncs.find_representation_uuid(source) - item_ids = file_description(source, manifest, representation_uuid) + item_ids = file_description(source, manifest, representation_uuid, output) #intellectual_entity_description() - representation_description(representation_uuid, item_ids) + representation_description(representation_uuid, item_ids, output) if __name__ == '__main__': - main() + main(sys.argv[1:]) From 8c452284a2f41207602b1bee78be520ac95c35cf Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 7 Aug 2017 01:50:34 +0100 Subject: [PATCH 23/31] cleans up PREMIS csv scripts --- premiscsv.py | 25 ++++++++++++++++--------- premiscsv2xml.py | 7 +++++++ premisobjects.py | 10 ++++++---- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/premiscsv.py b/premiscsv.py index 3d8a4dd..9d6b34d 100755 --- a/premiscsv.py +++ b/premiscsv.py @@ -12,7 +12,7 @@ import ififuncs -def find_events(logfile, objects_csv, output): +def find_events(logfile, output): ''' A very hacky attempt to extract the relevant preservation events from our log files. @@ -32,7 +32,9 @@ def find_events(logfile, objects_csv, output): object_info = ififuncs.extract_metadata('objects.csv') object_locations = {} for i in object_info: - object_locations[i['contentLocationValue']] = i['objectIdentifier'].split(', ')[1].replace(']', '') + object_locations[ + i['contentLocationValue'] + ] = i['objectIdentifier'].split(', ')[1].replace(']', '') for log_entry in log_lines: valid_entries = [ 'eventType', @@ -94,7 +96,7 @@ def find_events(logfile, objects_csv, output): a = os.path.basename(event_outcome).replace('_mediainfo.xml', '').replace('_mediatrace.xml', '')[:-1] b = os.path.basename(x) if a == b: - linking_object_identifier_value = object_locations[x].replace('\'','') + linking_object_identifier_value = object_locations[x].replace('\'', '') if (break_loop == 'continue') or (event_type == ''): continue print event_type @@ -110,6 +112,7 @@ def find_events(logfile, objects_csv, output): ] ififuncs.append_csv(output, event_row) + def update_objects(output, objects_csv): ''' Update the object description with the linkingEventIdentifiers @@ -117,17 +120,16 @@ def update_objects(output, objects_csv): link_dict = {} event_dicts = ififuncs.extract_metadata(output) for i in event_dicts: - a = i['eventIdentifierValue'] + a = i['eventIdentifierValue'] try: - link_dict[i['linkingObjectIdentifierValue']] += a + '|' + link_dict[i['linkingObjectIdentifierValue']] += a + '|' except KeyError: - link_dict[i['linkingObjectIdentifierValue']] = a + '|' + link_dict[i['linkingObjectIdentifierValue']] = a + '|' print link_dict object_dicts = ififuncs.extract_metadata(objects_csv) for x in object_dicts: for link in link_dict: - - if link == x['objectIdentifier'].split(', ')[1].replace(']', '').replace('\'',''): + if link == x['objectIdentifier'].split(', ')[1].replace(']', '').replace('\'', ''): x['linkingEventIdentifierValue'] = link_dict[link] premis_object_units = [ 'objectIdentifier', @@ -158,6 +160,7 @@ def update_objects(output, objects_csv): w.writerow(i) shutil.move('mycsvfile.csv', objects_csv) + def make_events_csv(output): ''' Generates a CSV with PREMIS-esque headings. Currently it's just called @@ -177,6 +180,7 @@ def make_events_csv(output): ] ififuncs.create_csv(output, premis_events) + def parse_args(args_): ''' Parse command line arguments. @@ -203,6 +207,8 @@ def parse_args(args_): ) parsed_args = parser.parse_args(args_) return parsed_args + + def main(args_): ''' Launches all the other functions when run from the command line. @@ -212,8 +218,9 @@ def main(args_): output = args.o objects_csv = args.object_csv make_events_csv(output) - find_events(logfile, objects_csv, output) + find_events(logfile, output) update_objects(output, objects_csv) + if __name__ == '__main__': main(sys.argv[1:]) diff --git a/premiscsv2xml.py b/premiscsv2xml.py index 3611923..610799a 100755 --- a/premiscsv2xml.py +++ b/premiscsv2xml.py @@ -25,6 +25,7 @@ def create_unit(index, parent, unitname): parent.insert(index, unitname) return unitname + def setup_xml(): ''' This should just create the PREMIS lxml object. @@ -34,6 +35,7 @@ def setup_xml(): premis = etree.fromstring(namespace) return premis + def describe_objects(premis, object_dictionaries): ''' Converts the CSV object metadata into PREMIS XML. @@ -130,6 +132,8 @@ def describe_objects(premis, object_dictionaries): linking_event_identifier_type.text = 'UUID' linking_event_identifier_value.text = event return premis + + def describe_events(premis, event_dictionaries): ''' Converts the CSV object metadata into PREMIS XML. @@ -180,6 +184,7 @@ def describe_events(premis, event_dictionaries): event_outcome_detail_note.text = x['eventOutcomeDetailNote'] print(etree.tostring(premis, pretty_print=True)) + def parse_args(args_): ''' Parse command line arguments. @@ -202,6 +207,8 @@ def parse_args(args_): ) parsed_args = parser.parse_args(args_) return parsed_args + + def main(args_): ''' Launches all the other functions when run from the command line. diff --git a/premisobjects.py b/premisobjects.py index f8d8f84..c10f65d 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -25,13 +25,10 @@ import ififuncs - - - def make_skeleton_csv(output): ''' Generates a CSV with PREMIS-esque headings. Currently it's just called - 'cle.csv' but it will probably be called: + 'objects.csv' but it will probably be called: UUID_premisobjects.csv and sit in the metadata directory. ''' @@ -70,6 +67,7 @@ def file_description(source, manifest, representation_uuid, output): md5, uri = ififuncs.get_checksum(manifest, item) item_uuid = ififuncs.create_uuid() full_path = os.path.join(root, item) + print 'Using Siegfriend to analyze %s' % item pronom_id, authority, version = ififuncs.get_pronom_format( full_path ) @@ -143,6 +141,7 @@ def intellectual_entity_description(): intellectual_entity_dictionary['objectCategory'] = 'intellectual entity' #print intellectual_entity_dictionary + def parse_args(args_): ''' Parse command line arguments. @@ -169,6 +168,8 @@ def parse_args(args_): ) parsed_args = parser.parse_args(args_) return parsed_args + + def main(args_): ''' Launches all the other functions when run from the command line. @@ -183,6 +184,7 @@ def main(args_): #intellectual_entity_description() representation_description(representation_uuid, item_ids, output) + if __name__ == '__main__': main(sys.argv[1:]) From 9bad84c12172b44544ded8540fab4d636b90bc7f Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 7 Aug 2017 11:05:46 +0100 Subject: [PATCH 24/31] premisobjects - be less IFI folder structure specific --- premisobjects.py | 80 +++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/premisobjects.py b/premisobjects.py index c10f65d..421fd91 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -10,7 +10,8 @@ Structural and Includes respectively. todo: -Document identifier assignment for files and IE. Probably in events sheet? +Document identifier assignment for files and IE. Probably in events sheet?\ +This would ideally just add to the log in the helper script. Allow for derivation to be entered Link mediainfo xml in /metadata to the objectCharacteristicsExtension field. @@ -60,50 +61,47 @@ def file_description(source, manifest, representation_uuid, output): ''' item_ids = [] for root, _, filenames in os.walk(source): - if os.path.basename(root) == 'objects': - for root, _, filenames in os.walk(root): - filenames = [f for f in filenames if f[0] != '.'] - for item in filenames: - md5, uri = ififuncs.get_checksum(manifest, item) - item_uuid = ififuncs.create_uuid() - full_path = os.path.join(root, item) - print 'Using Siegfriend to analyze %s' % item - pronom_id, authority, version = ififuncs.get_pronom_format( - full_path - ) - item_dictionary = {} - item_dictionary['objectIdentifier'] = ['UUID', item_uuid] - item_dictionary['objectCategory'] = 'file' - item_dictionary['size'] = str(os.path.getsize(full_path)) - item_dictionary['originalName'] = item - item_dictionary['relationship_structural_isincludedin'] = representation_uuid - item_ids.append(item_uuid) - file_data = [ - item_dictionary['objectIdentifier'], - item_dictionary['objectCategory'], - 'md5', md5, 'internal', - item_dictionary['size'], '', '', - authority, pronom_id, 'identification', - '', item, - 'uri', uri, - '', '', - '', - '', '', - '', - '', '', - '', - item_dictionary['relationship_structural_isincludedin'], - '', - '', - '' - ] - ififuncs.append_csv(output, file_data) + filenames = [f for f in filenames if f[0] != '.'] + for item in filenames: + md5, uri = ififuncs.get_checksum(manifest, item) + item_uuid = ififuncs.create_uuid() + full_path = os.path.join(root, item) + print 'Using Siegfriend to analyze %s' % item + pronom_id, authority, version = ififuncs.get_pronom_format( + full_path + ) + item_dictionary = {} + item_dictionary['objectIdentifier'] = ['UUID', item_uuid] + item_dictionary['objectCategory'] = 'file' + item_dictionary['size'] = str(os.path.getsize(full_path)) + item_dictionary['originalName'] = item + item_dictionary['relationship_structural_isincludedin'] = representation_uuid + item_ids.append(item_uuid) + file_data = [ + item_dictionary['objectIdentifier'], + item_dictionary['objectCategory'], + 'md5', md5, 'internal', + item_dictionary['size'], '', '', + authority, pronom_id, 'identification', + '', item, + 'uri', uri, + '', '', + '', + '', '', + '', + '', '', + '', + item_dictionary['relationship_structural_isincludedin'], + '', + '', + '' + ] + ififuncs.append_csv(output, file_data) return item_ids def representation_description(representation_uuid, item_ids, output): ''' Generate PREMIS descriptions for a representation and write to CSV. ''' - representation_dictionary = {} representation_dictionary['objectIdentifier'] = ['UUID', representation_uuid] representation_dictionary['objectCategory'] = 'representation' @@ -152,7 +150,7 @@ def parse_args(args_): ) parser.add_argument( '-i', - help='full path of input directory', required=True + help='full path of input objects directory', required=True ) parser.add_argument( '-o', '-output', From f0683108b673f258b63c3768b50e83ce51c16236 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 7 Aug 2017 11:14:31 +0100 Subject: [PATCH 25/31] logs2premis - renames premiscsv to logs2premis --- logs2premis.py | 226 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100755 logs2premis.py diff --git a/logs2premis.py b/logs2premis.py new file mode 100755 index 0000000..9d6b34d --- /dev/null +++ b/logs2premis.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python +''' +Extracts preservation events from an IFI plain text log file and converts +to a CSV using the PREMIS data dictionary +''' +import os +import sys +import csv +import shutil +import argparse +# from lxml import etree +import ififuncs + + +def find_events(logfile, output): + ''' + A very hacky attempt to extract the relevant preservation events from our + log files. + ''' + sip_test = os.path.basename(logfile).replace('_sip_log.log', '') + if ififuncs.validate_uuid4(sip_test) != False: + linking_object_identifier_value = sip_test + with open(logfile, 'r') as logfile_object: + log_lines = logfile_object.readlines() + for event_test in log_lines: + if 'eventDetail=copyit.py' in event_test: + logsplit = event_test.split(',') + for line_fragment in logsplit: + manifest_event = line_fragment.replace( + 'eventDetail', '' + ).replace('\n', '').split('=')[1] + object_info = ififuncs.extract_metadata('objects.csv') + object_locations = {} + for i in object_info: + object_locations[ + i['contentLocationValue'] + ] = i['objectIdentifier'].split(', ')[1].replace(']', '') + for log_entry in log_lines: + valid_entries = [ + 'eventType', + 'eventDetail=sipcreator.py', + 'eventDetail=Mediatrace', + 'eventDetail=Technical', + 'eventDetail=copyit.py' + ] + for entry in valid_entries: + if entry in log_entry: + break_loop = '' + event_outcome = '' + event_detail = '' + event_outcome_detail_note = '' + event_type = '' + event_row = [] + datetime = log_entry[:19] + logsplit = log_entry.split(',') + for line_fragment in logsplit: + if 'eventType' in line_fragment: + if 'EVENT =' in line_fragment: + line_fragment = line_fragment.split('EVENT =')[1] + event_type = line_fragment.replace( + ' eventType=', '' + ).replace('assignement', 'assignment') + if ' value' in line_fragment: + # this assumes that the value is the outcome of an identifier assigment. + event_outcome = line_fragment[7:].replace('\n', '') + # we are less concerned with events starting. + if 'status=started' in line_fragment: + break_loop = 'continue' + if 'Generating destination manifest:' in line_fragment: + break_loop = '' + event_detail = manifest_event + # ugh, this might run multiple times. + if 'eventDetail=sipcreator.py' in log_entry: + event_type = 'Information Package Creation' + event_detail = line_fragment.replace( + 'eventDetail', '' + ).replace('\n', '').split('=')[1] + event_outcome_detail_note = 'Submission Information Package' + if ('eventDetail=Mediatrace' in log_entry) or ('eventDetail=Technical' in log_entry): + event_type = 'metadata extraction' + event_detail = log_entry.split( + 'eventDetail=', 1 + )[1].split(',')[0] + event_outcome = log_entry.split( + 'eventOutcome=', 1 + )[1].replace(', agentName=mediainfo', '').replace('\n', '') + if 'eventDetail=Mediatrace' in log_entry: + event_outcome = event_outcome.replace('mediainfo.xml', 'mediatrace.xml') + for x in object_locations: + ''' + This is trying to get the UUID of the source object + that relates to the mediainfo xmls. This is + achieved via a dictionary. + ''' + if 'objects' in x: + a = os.path.basename(event_outcome).replace('_mediainfo.xml', '').replace('_mediatrace.xml', '')[:-1] + b = os.path.basename(x) + if a == b: + linking_object_identifier_value = object_locations[x].replace('\'', '') + if (break_loop == 'continue') or (event_type == ''): + continue + print event_type + event_row = [ + 'UUID', ififuncs.create_uuid(), + event_type, datetime, event_detail, + '', + event_outcome, '', + event_outcome_detail_note, '', + '', '', + '', 'UUID', + linking_object_identifier_value, '' + ] + ififuncs.append_csv(output, event_row) + + +def update_objects(output, objects_csv): + ''' + Update the object description with the linkingEventIdentifiers + ''' + link_dict = {} + event_dicts = ififuncs.extract_metadata(output) + for i in event_dicts: + a = i['eventIdentifierValue'] + try: + link_dict[i['linkingObjectIdentifierValue']] += a + '|' + except KeyError: + link_dict[i['linkingObjectIdentifierValue']] = a + '|' + print link_dict + object_dicts = ififuncs.extract_metadata(objects_csv) + for x in object_dicts: + for link in link_dict: + if link == x['objectIdentifier'].split(', ')[1].replace(']', '').replace('\'', ''): + x['linkingEventIdentifierValue'] = link_dict[link] + premis_object_units = [ + 'objectIdentifier', + 'objectCategory', + 'messageDigestAlgorithm', 'messageDigest', 'messageDigestOriginator', + 'size', 'formatName', 'formatVersion', + 'formatRegistryName', 'formatRegistryKey', 'formatRegistryRole', + 'objectCharacteristicsExtension', 'originalName', + 'contentLocationType', 'contentLocationValue', + 'relatedObjectIdentifierType', 'relatedObjectIdentifierValue', + 'relatedObjectSequence', + 'relatedEventIdentifierType', 'relatedEventIdentifierValue', + 'relatedEventSequence', + 'linkingEventIdentifierType', 'linkingEventIdentifierValue', + 'relationship_structural_includes', + 'relationship_structural_isincludedin', + 'relationship_structural_represents', + 'relationship_structural_hasroot', + 'relationship_derivation_hassource' + ] + with open('mycsvfile.csv', 'wb') as f: + counter = 0 + for i in object_dicts: + w = csv.DictWriter(f, fieldnames=premis_object_units) + if counter == 0: + w.writeheader() + counter += 1 + w.writerow(i) + shutil.move('mycsvfile.csv', objects_csv) + + +def make_events_csv(output): + ''' + Generates a CSV with PREMIS-esque headings. Currently it's just called + 'bla.csv' but it will probably be called: + UUID_premisevents.csv + and sit in the metadata directory. + ''' + premis_events = [ + 'eventIdentifierType', 'eventIdentifierValue', + 'eventType', 'eventDateTime', 'eventDetail', + 'eventDetailExtension', + 'eventOutcome', 'eventOutcomeDetail', + 'eventOutcomeDetailNote', 'eventOutcomeDetailExtension', + 'linkingAgentIdentifierType', 'linkingAgentIdentifierValue', + 'linkingAgentIdentifierRole', 'linkingObjectIdentifierType', + 'linkingObjectIdentifierValue', 'linkingObjectRole' + ] + ififuncs.create_csv(output, premis_events) + + +def parse_args(args_): + ''' + Parse command line arguments. + ''' + parser = argparse.ArgumentParser( + description='Describes events using PREMIS data dictionary via CSV' + ' Written by Kieran O\'Leary.' + ) + parser.add_argument( + '-i', + help='full path of a log textfile', required=True + ) + parser.add_argument( + '-o', + help='full path of output csv', required=True + ) + parser.add_argument( + '-object_csv', + help='full path of object description csv', required=True + ) + parser.add_argument( + '-user', + help='Declare who you are. If this is not set, you will be prompted.' + ) + parsed_args = parser.parse_args(args_) + return parsed_args + + +def main(args_): + ''' + Launches all the other functions when run from the command line. + ''' + args = parse_args(args_) + logfile = args.i + output = args.o + objects_csv = args.object_csv + make_events_csv(output) + find_events(logfile, output) + update_objects(output, objects_csv) + + +if __name__ == '__main__': + main(sys.argv[1:]) From 784665dd1ce9a1c236982b6f79c1a2a15d362e8e Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 7 Aug 2017 12:34:51 +0100 Subject: [PATCH 26/31] deletes renamed premiscsv --- premiscsv.py | 226 --------------------------------------------------- 1 file changed, 226 deletions(-) delete mode 100755 premiscsv.py diff --git a/premiscsv.py b/premiscsv.py deleted file mode 100755 index 9d6b34d..0000000 --- a/premiscsv.py +++ /dev/null @@ -1,226 +0,0 @@ -#!/usr/bin/env python -''' -Extracts preservation events from an IFI plain text log file and converts -to a CSV using the PREMIS data dictionary -''' -import os -import sys -import csv -import shutil -import argparse -# from lxml import etree -import ififuncs - - -def find_events(logfile, output): - ''' - A very hacky attempt to extract the relevant preservation events from our - log files. - ''' - sip_test = os.path.basename(logfile).replace('_sip_log.log', '') - if ififuncs.validate_uuid4(sip_test) != False: - linking_object_identifier_value = sip_test - with open(logfile, 'r') as logfile_object: - log_lines = logfile_object.readlines() - for event_test in log_lines: - if 'eventDetail=copyit.py' in event_test: - logsplit = event_test.split(',') - for line_fragment in logsplit: - manifest_event = line_fragment.replace( - 'eventDetail', '' - ).replace('\n', '').split('=')[1] - object_info = ififuncs.extract_metadata('objects.csv') - object_locations = {} - for i in object_info: - object_locations[ - i['contentLocationValue'] - ] = i['objectIdentifier'].split(', ')[1].replace(']', '') - for log_entry in log_lines: - valid_entries = [ - 'eventType', - 'eventDetail=sipcreator.py', - 'eventDetail=Mediatrace', - 'eventDetail=Technical', - 'eventDetail=copyit.py' - ] - for entry in valid_entries: - if entry in log_entry: - break_loop = '' - event_outcome = '' - event_detail = '' - event_outcome_detail_note = '' - event_type = '' - event_row = [] - datetime = log_entry[:19] - logsplit = log_entry.split(',') - for line_fragment in logsplit: - if 'eventType' in line_fragment: - if 'EVENT =' in line_fragment: - line_fragment = line_fragment.split('EVENT =')[1] - event_type = line_fragment.replace( - ' eventType=', '' - ).replace('assignement', 'assignment') - if ' value' in line_fragment: - # this assumes that the value is the outcome of an identifier assigment. - event_outcome = line_fragment[7:].replace('\n', '') - # we are less concerned with events starting. - if 'status=started' in line_fragment: - break_loop = 'continue' - if 'Generating destination manifest:' in line_fragment: - break_loop = '' - event_detail = manifest_event - # ugh, this might run multiple times. - if 'eventDetail=sipcreator.py' in log_entry: - event_type = 'Information Package Creation' - event_detail = line_fragment.replace( - 'eventDetail', '' - ).replace('\n', '').split('=')[1] - event_outcome_detail_note = 'Submission Information Package' - if ('eventDetail=Mediatrace' in log_entry) or ('eventDetail=Technical' in log_entry): - event_type = 'metadata extraction' - event_detail = log_entry.split( - 'eventDetail=', 1 - )[1].split(',')[0] - event_outcome = log_entry.split( - 'eventOutcome=', 1 - )[1].replace(', agentName=mediainfo', '').replace('\n', '') - if 'eventDetail=Mediatrace' in log_entry: - event_outcome = event_outcome.replace('mediainfo.xml', 'mediatrace.xml') - for x in object_locations: - ''' - This is trying to get the UUID of the source object - that relates to the mediainfo xmls. This is - achieved via a dictionary. - ''' - if 'objects' in x: - a = os.path.basename(event_outcome).replace('_mediainfo.xml', '').replace('_mediatrace.xml', '')[:-1] - b = os.path.basename(x) - if a == b: - linking_object_identifier_value = object_locations[x].replace('\'', '') - if (break_loop == 'continue') or (event_type == ''): - continue - print event_type - event_row = [ - 'UUID', ififuncs.create_uuid(), - event_type, datetime, event_detail, - '', - event_outcome, '', - event_outcome_detail_note, '', - '', '', - '', 'UUID', - linking_object_identifier_value, '' - ] - ififuncs.append_csv(output, event_row) - - -def update_objects(output, objects_csv): - ''' - Update the object description with the linkingEventIdentifiers - ''' - link_dict = {} - event_dicts = ififuncs.extract_metadata(output) - for i in event_dicts: - a = i['eventIdentifierValue'] - try: - link_dict[i['linkingObjectIdentifierValue']] += a + '|' - except KeyError: - link_dict[i['linkingObjectIdentifierValue']] = a + '|' - print link_dict - object_dicts = ififuncs.extract_metadata(objects_csv) - for x in object_dicts: - for link in link_dict: - if link == x['objectIdentifier'].split(', ')[1].replace(']', '').replace('\'', ''): - x['linkingEventIdentifierValue'] = link_dict[link] - premis_object_units = [ - 'objectIdentifier', - 'objectCategory', - 'messageDigestAlgorithm', 'messageDigest', 'messageDigestOriginator', - 'size', 'formatName', 'formatVersion', - 'formatRegistryName', 'formatRegistryKey', 'formatRegistryRole', - 'objectCharacteristicsExtension', 'originalName', - 'contentLocationType', 'contentLocationValue', - 'relatedObjectIdentifierType', 'relatedObjectIdentifierValue', - 'relatedObjectSequence', - 'relatedEventIdentifierType', 'relatedEventIdentifierValue', - 'relatedEventSequence', - 'linkingEventIdentifierType', 'linkingEventIdentifierValue', - 'relationship_structural_includes', - 'relationship_structural_isincludedin', - 'relationship_structural_represents', - 'relationship_structural_hasroot', - 'relationship_derivation_hassource' - ] - with open('mycsvfile.csv', 'wb') as f: - counter = 0 - for i in object_dicts: - w = csv.DictWriter(f, fieldnames=premis_object_units) - if counter == 0: - w.writeheader() - counter += 1 - w.writerow(i) - shutil.move('mycsvfile.csv', objects_csv) - - -def make_events_csv(output): - ''' - Generates a CSV with PREMIS-esque headings. Currently it's just called - 'bla.csv' but it will probably be called: - UUID_premisevents.csv - and sit in the metadata directory. - ''' - premis_events = [ - 'eventIdentifierType', 'eventIdentifierValue', - 'eventType', 'eventDateTime', 'eventDetail', - 'eventDetailExtension', - 'eventOutcome', 'eventOutcomeDetail', - 'eventOutcomeDetailNote', 'eventOutcomeDetailExtension', - 'linkingAgentIdentifierType', 'linkingAgentIdentifierValue', - 'linkingAgentIdentifierRole', 'linkingObjectIdentifierType', - 'linkingObjectIdentifierValue', 'linkingObjectRole' - ] - ififuncs.create_csv(output, premis_events) - - -def parse_args(args_): - ''' - Parse command line arguments. - ''' - parser = argparse.ArgumentParser( - description='Describes events using PREMIS data dictionary via CSV' - ' Written by Kieran O\'Leary.' - ) - parser.add_argument( - '-i', - help='full path of a log textfile', required=True - ) - parser.add_argument( - '-o', - help='full path of output csv', required=True - ) - parser.add_argument( - '-object_csv', - help='full path of object description csv', required=True - ) - parser.add_argument( - '-user', - help='Declare who you are. If this is not set, you will be prompted.' - ) - parsed_args = parser.parse_args(args_) - return parsed_args - - -def main(args_): - ''' - Launches all the other functions when run from the command line. - ''' - args = parse_args(args_) - logfile = args.i - output = args.o - objects_csv = args.object_csv - make_events_csv(output) - find_events(logfile, output) - update_objects(output, objects_csv) - - -if __name__ == '__main__': - main(sys.argv[1:]) From 951ae948e56e6b7e528c0fa27ea8ed0224678d68 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Mon, 7 Aug 2017 13:30:51 +0100 Subject: [PATCH 27/31] makepremis - makepremis - adds helper script that launches premisobjects/logs2premis/premiscsv2xml --- makepremis.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 makepremis.py diff --git a/makepremis.py b/makepremis.py new file mode 100755 index 0000000..d02be77 --- /dev/null +++ b/makepremis.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +''' +Creates PREMIS CSV and XML descriptions by launching other IFIscripts, +such as logs2premis.py, premisobjects.py, premiscsv2xml.py' +''' +import os +import argparse +import premisobjects +import premiscsv2xml +import logs2premis + + +def parse_args(): + ''' + Parse command line arguments. + ''' + parser = argparse.ArgumentParser( + description='Creates PREMIS CSV and XML descriptions by launching' + 'other IFIscripts, such as logs2premis.py, premisobjects.py,' + 'premiscsv2xml.py' + ' Written by Kieran O\'Leary.' + ) + parser.add_argument( + 'input', + help='full path to your input directory' + ) + parser.add_argument( + '-user', + help='Declare who you are. If this is not set, you will be prompted.' + ) + parsed_args = parser.parse_args() + return parsed_args + + +def launch_scripts(source): + ''' + Launches premisobjects, logs2premis and premiscsv2xml in input directory + ''' + for root, _, _ in os.walk(source): + if os.path.basename(root) == 'objects': + objects_csv = 'kelvin.csv' + events_csv = 'temp.csv' + uuid_dir = os.path.dirname(root) + logs_dir = os.path.join( + uuid_dir, 'logs' + ) + logname = os.path.join( + logs_dir, os.path.basename(uuid_dir + '_sip_log.log') + ) + manifest = os.path.join( + os.path.dirname(uuid_dir), os.path.basename(uuid_dir + '_manifest.md5') + ) + premisobjects.main( + ['-i', root, '-m', manifest, '-o', objects_csv] + ) + logs2premis.main( + ['-i', logname, '-object_csv', objects_csv, '-o', events_csv] + ) + premiscsv2xml.main( + ['-i', objects_csv, '-ev', events_csv] + ) + + +def main(): + ''' + Launch the other functions when called from the command line + ''' + args = parse_args() + source = args.input + launch_scripts(source) + +if __name__ == '__main__': + main() From 01a52546d4a69c167eb1e23f12211afe6558f3eb Mon Sep 17 00:00:00 2001 From: Kieran O'Leary Date: Fri, 18 Aug 2017 21:00:23 +0100 Subject: [PATCH 28/31] makepremis - adds arguments for object/events csv filenames --- makepremis.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/makepremis.py b/makepremis.py index d02be77..b8819aa 100755 --- a/makepremis.py +++ b/makepremis.py @@ -28,18 +28,26 @@ def parse_args(): '-user', help='Declare who you are. If this is not set, you will be prompted.' ) + parser.add_argument( + '-object_csv', required=True, + help='full path of objects csv' + ) + parser.add_argument( + '-event_csv', required=True, + help='full path of events csv' + ) parsed_args = parser.parse_args() return parsed_args -def launch_scripts(source): +def launch_scripts(source, args): ''' Launches premisobjects, logs2premis and premiscsv2xml in input directory ''' for root, _, _ in os.walk(source): if os.path.basename(root) == 'objects': - objects_csv = 'kelvin.csv' - events_csv = 'temp.csv' + objects_csv = args.object_csv + events_csv = args.event_csv uuid_dir = os.path.dirname(root) logs_dir = os.path.join( uuid_dir, 'logs' @@ -67,7 +75,7 @@ def main(): ''' args = parse_args() source = args.input - launch_scripts(source) + launch_scripts(source, args) if __name__ == '__main__': main() From 61fd02e90f382e1c93cb08cea6b9bf75c83239c8 Mon Sep 17 00:00:00 2001 From: Kieran O'Leary Date: Sat, 19 Aug 2017 16:31:21 +0100 Subject: [PATCH 29/31] README.md - updates PREMIS scripts documentation --- README.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9a8fcc5..9debac5 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,9 @@ table of contents * [sipcreator.py](https://github.com/kieranjol/IFIscripts#sipcreator) 3. [PREMIS](https://github.com/kieranjol/IFIscripts#PREMIS) * [premisobjects.py](https://github.com/kieranjol/IFIscripts#premisobjectspy) - * [premiscsv.py](https://github.com/kieranjol/IFIscripts#premiscsvpy) + * [logs2premis.py](https://github.com/kieranjol/IFIscripts#logs2premispy) + * [makepremis.py](https://github.com/kieranjol/IFIscripts#makepremispremispy) + * [premiscsv2xml.py](https://github.com/kieranjol/IFIscripts#premiscsv2xmlpy) 4. [Transcodes](https://github.com/kieranjol/IFIscripts#transcodes) * [makeffv1.py](https://github.com/kieranjol/IFIscripts#makeffv1py) * [bitc.py](https://github.com/kieranjol/IFIscripts#bitcpy) @@ -74,18 +76,28 @@ Note: Documentation template has been copied from [mediamicroservices](https://g ## PREMIS ## +### makepremis.py ### +* Creates PREMIS CSV and XML descriptions by launching other IFIscripts, such as logs2premis.py, premisobjects.py, premiscsv2xml.py. +* Assumptions for now: representation UUID already exists as part of the SIP/AIP folder structure. Find a way to supply this, probably via argparse. +* For more information, run `pydoc makepremis ` +* Usage: `makepremis.py -event_csv path/to/events.csv -object_csv path/to/objects.csv` + ### premisobjects.py ### * Creates a somewhat PREMIS compliant CSV file describing objects in a package. A seperate script will need to be written in order to transform these CSV files into XML. -* As the flat CSV structure prevents maintaining some of the complex relationships between units, some semantic units have been merged, for example:`relationship_structural_includes` is really a combination of the `relationshipType` and `relationshipSubType` units, which each have the values: `Structural` and `Includes` respectively. +* As the flat CSV structure prevents maintaining some of the relationships between units, some semantic units have been merged, for example:`relationship_structural_includes` is really a combination of the `relationshipType` and `relationshipSubType` units, which each have the values: `Structural` and `Includes` respectively. * Assumptions for now: representation UUID already exists as part of the SIP/AIP folder structure. Find a way to supply this, probably via argparse. -* For more information, run `pydoc premisobjects ` -* Usage: `premiscsv.py path/to/SIP path/to/manifest.md5` +* For more information, run `pydoc premisobjects` +* Usage: `premisobjects.py -i path/to/SIP -m path/to/manifest.md5 -o path/to/output.csv` -### premiscsv.py ### +### logs2premis.py ### * Extracts preservation events from an IFI plain text log file and converts to a CSV using the PREMIS data dictionary. * For more information, run `pydoc premiscsv` -* Usage: - `premiscsv.py path/to/logfile.log` +* Usage: - `premiscsv.py -i path/to/logfile.log -o path/to/output.csv -object_csv path/to/objects.csv` +### premiscsv2xml.py ### +* Transforms PREMIS csv files into XML. +* For more information, run `pydoc premiscsv2xml` +* Usage: `premiscsv2xml.py -ev path/to/events.csv -i path/to/objects.csv` ## Transcodes ## From ebc6ef4b1d5941341d65d619999c72690702d2a7 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Tue, 16 Jan 2018 14:41:15 +0000 Subject: [PATCH 30/31] premisobjects/makepremis - clarifies argparse and fixes typo --- makepremis.py | 4 ++-- premisobjects.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/makepremis.py b/makepremis.py index b8819aa..5180869 100755 --- a/makepremis.py +++ b/makepremis.py @@ -30,11 +30,11 @@ def parse_args(): ) parser.add_argument( '-object_csv', required=True, - help='full path of objects csv' + help='full path and filename of the output objects CSV.' ) parser.add_argument( '-event_csv', required=True, - help='full path of events csv' + help='full path and filename of the output events CSV' ) parsed_args = parser.parse_args() return parsed_args diff --git a/premisobjects.py b/premisobjects.py index 421fd91..c2c5f5e 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -66,7 +66,7 @@ def file_description(source, manifest, representation_uuid, output): md5, uri = ififuncs.get_checksum(manifest, item) item_uuid = ififuncs.create_uuid() full_path = os.path.join(root, item) - print 'Using Siegfriend to analyze %s' % item + print 'Using Siegfried to analyze %s' % item pronom_id, authority, version = ififuncs.get_pronom_format( full_path ) From ec2e17b924d94baaa7b0d39795850c2f8c44ba93 Mon Sep 17 00:00:00 2001 From: kieranjol Date: Wed, 17 Jan 2018 22:26:22 +0000 Subject: [PATCH 31/31] premisobjects - adds placeholder relationships function --- premisobjects.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/premisobjects.py b/premisobjects.py index c2c5f5e..883d5e8 100755 --- a/premisobjects.py +++ b/premisobjects.py @@ -98,6 +98,23 @@ def file_description(source, manifest, representation_uuid, output): ] ififuncs.append_csv(output, file_data) return item_ids + +def build_relationships(): + ''' + Placeholder function that will produce a CSV containing the relationships + within a PREMIS object description. + ''' + relationships = [ + "relationship_uuid", + "objectIdentifierValue", + "relationshipType", + "relationshipSubType", + "relatedObjectIdentifierType", + "relatedObjectIdentifierValue", + "relatedEventIdentifierType", + "relatedEventIdentifierValue", + "relatedEventSequence" + ] def representation_description(representation_uuid, item_ids, output): ''' Generate PREMIS descriptions for a representation and write to CSV.