From 39fc05debe5e8d5f98814dc4d88d23e8ec6dff09 Mon Sep 17 00:00:00 2001 From: Jithu Sunny Date: Wed, 25 Apr 2018 18:11:57 +0530 Subject: [PATCH 1/5] Create xml_to_json.py --- scripts/xml_to_json.py | 139 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 scripts/xml_to_json.py diff --git a/scripts/xml_to_json.py b/scripts/xml_to_json.py new file mode 100644 index 0000000..a8422ea --- /dev/null +++ b/scripts/xml_to_json.py @@ -0,0 +1,139 @@ +import os +import bz2 +import json +from glob import glob +from lxml import etree +from optparse import OptionParser +from pathos.multiprocessing import ProcessingPool as Pool + + +class XMLToJson(): + def __init__(self, region=None, flow=None, path=None, output=None): + self.process_options(region, flow, path, output) + + def process_options(self, region, flow, path, output): + parser = OptionParser() + parser.add_option("-r", "--region", dest="region", help="Enter region", metavar="PATH") + parser.add_option("-f", "--flow", dest="flow",help="enter flow", metavar="PATH") + parser.add_option("-p", "--path", dest="input_dir", help="bz2 file path", metavar="PATH") + parser.add_option("-o", "--output", dest="json", help="json file path", metavar="OUTPUT") + parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, + help="don't print status messages to stdout") + + (options, args) = parser.parse_args() + + if not options.region: + if not region: + parser.error('region not provided (-r option)') + else: + options.region = region + + if not options.flow: + if not flow: + parser.error('flow value not provided (-f option)') + else: + options.flow = flow + + if not options.input_dir: + if not path: + parser.error('input directory not provided (-p option)') + else: + options.input_dir = path + + if not os.path.exists(options.input_dir): + parser.error('input directory does not exist (-p option)') + + if not os.path.isdir(options.input_dir): + parser.error('please enter a directory name (-p option)') + + if not options.json: + if not output: + parser.error('json output filename not provided (-o option)') + else: + options.json = output + + if not options.json.endswith('.json'): + parser.error('please enter a json filename (-o option)') + + self.options = options + + + def parseXML(self, xml_data): + root = etree.fromstring(xml_data) + + data = { + 'UITID': '', + 'MESSAGE_ID': '', + 'TRD_DATE': '' + } + + imexmlTradeNotificationMessageHeader = root.find('imexml:imexmlTradeNotificationMessageHeader', root.nsmap) + imexmlMessageId = imexmlTradeNotificationMessageHeader.find('imexml:imexmlMessageId', root.nsmap) + if imexmlMessageId is not None: + data['MESSAGE_ID'] = imexmlMessageId.text + + imexmlNotificationBundleDetails = root.find('imexml:imexmlNotificationBundleDetails', root.nsmap) + imexmlPostEventTrades = imexmlNotificationBundleDetails.find('imexml:imexmlPostEventTrades', root.nsmap) + + for imexmlPostEventTrade in imexmlPostEventTrades: + postEventTrade = imexmlPostEventTrade.find('imexml:postEventTrade', root.nsmap) + + tradeHeader = postEventTrade.find('fpml:tradeHeader', root.nsmap) + tradeDate = tradeHeader.find('fpml:tradeDate', root.nsmap) + if tradeDate is not None: + data['TRD_DATE'] = tradeDate.text + + for partyTradeIdentifier in tradeHeader: + tradeId = partyTradeIdentifier.find('fpml:tradeId', root.nsmap) + + if tradeId is not None and tradeId.attrib['tradeIdScheme'] == 'http://www.dtcc.com/internal-reference-id': + data['UITID'] = tradeId.text + elif tradeId is not None and tradeId.attrib['tradeIdScheme'] == 'UniqueInternalTradeID': + data['UITID'] = tradeId.text + + return data + + + def read_file(self, filename): + try: + file_data = bz2.BZ2File(filename, 'rb') + return file_data.read() + except: + print('Error while reading ', filename) + + + def process_file(self, f): + xml_data = self.read_file(f) + try: + d = self.parseXML(xml_data) + except: + print('Error while parsing the XML file', f) + return {} + + d['REGION'] = self.options.region + d['FLOW'] = self.options.flow + return d + + + def get_bz2_files(self): + files_l = [] + for x in os.walk(self.options.input_dir): + for y in glob(os.path.join(x[0], '*.bz2')): + files_l.append(y) + + return files_l + + + def run(self): + pool = Pool(processes=10) + self.json_data = pool.map(self.process_file, self.get_bz2_files()) + + with open(self.options.json, 'w') as fp: + json.dump(self.json_data, fp) + + +def run_from_cmd(): + XMLToJson().run() + +if __name__ == '__main__': + XMLToJson().run() From 2e9983bbd438aa1a619e4bba103ba1eee1dddf68 Mon Sep 17 00:00:00 2001 From: Jithu Sunny Date: Wed, 25 Apr 2018 22:14:27 +0530 Subject: [PATCH 2/5] Update xml_to_json.py --- scripts/xml_to_json.py | 97 +++++++++++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 20 deletions(-) diff --git a/scripts/xml_to_json.py b/scripts/xml_to_json.py index a8422ea..c9300fd 100644 --- a/scripts/xml_to_json.py +++ b/scripts/xml_to_json.py @@ -1,14 +1,16 @@ import os +import re +import sys import bz2 import json from glob import glob from lxml import etree from optparse import OptionParser -from pathos.multiprocessing import ProcessingPool as Pool class XMLToJson(): def __init__(self, region=None, flow=None, path=None, output=None): + self.json_data = [] self.process_options(region, flow, path, output) def process_options(self, region, flow, path, output): @@ -16,7 +18,7 @@ def process_options(self, region, flow, path, output): parser.add_option("-r", "--region", dest="region", help="Enter region", metavar="PATH") parser.add_option("-f", "--flow", dest="flow",help="enter flow", metavar="PATH") parser.add_option("-p", "--path", dest="input_dir", help="bz2 file path", metavar="PATH") - parser.add_option("-o", "--output", dest="json", help="json file path", metavar="OUTPUT") + parser.add_option("-o", "--output", dest="output_dir", help="json file path", metavar="OUTPUT") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") @@ -46,18 +48,41 @@ def process_options(self, region, flow, path, output): if not os.path.isdir(options.input_dir): parser.error('please enter a directory name (-p option)') - if not options.json: + if not options.output_dir: if not output: - parser.error('json output filename not provided (-o option)') + parser.error('output folder not provided (-o option)') else: - options.json = output + options.output_dir = output - if not options.json.endswith('.json'): - parser.error('please enter a json filename (-o option)') + if not os.path.exists(options.output_dir) or not os.path.isdir(options.output_dir): + os.makedirs(options.output_dir) self.options = options + def sanitize(self, content): + _illegal_unichrs = [(0x00, 0x08), (0x0D, 0x1F), + (0x7F, 0x84), (0x86, 0x9F), (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF)] + + if sys.maxunicode >= 0x10000: # not narrow build + _illegal_unichrs.extend([(0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF), + (0x3FFFE, 0x3FFFF), (0x4FFFE, 0x4FFFF), + (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF), + (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF), + (0x9FFFE, 0x9FFFF), (0xAFFFE, 0xAFFFF), + (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF), + (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF), + (0xFFFFE, 0xFFFFF), (0x10FFFE, 0x10FFFF)]) + + _illegal_ranges = ["%s-%s" % (chr(low), chr(high)) + for (low, high) in _illegal_unichrs] + _illegal_xml_chars_RE = re.compile(u'[%s]' % u''.join(_illegal_ranges)) + + content = re.sub(_illegal_xml_chars_RE, "", content) + content = re.sub(r"[\x0A-\x0C]", "\n", content) + return content + + def parseXML(self, xml_data): root = etree.fromstring(xml_data) @@ -103,15 +128,29 @@ def read_file(self, filename): def process_file(self, f): - xml_data = self.read_file(f) - try: - d = self.parseXML(xml_data) - except: - print('Error while parsing the XML file', f) - return {} - - d['REGION'] = self.options.region - d['FLOW'] = self.options.flow + file_data = self.read_file(f).replace('\x02', '') + if file_data.count('') <= 1: + try: + d = self.parseXML(file_data) + d['REGION'] = self.options.region + d['FLOW'] = self.options.flow + d['PATH'] = f + except: + print('Error while parsing the XML file', f) + return {} + else: + d = [] + for xml_data in file_data.split(''): + if not xml_data: + continue + try: + t = self.parseXML(xml_data) + t['REGION'] = self.options.region + t['FLOW'] = self.options.flow + t['PATH'] = f + d.append(t) + except: + pass return d @@ -125,11 +164,29 @@ def get_bz2_files(self): def run(self): - pool = Pool(processes=10) - self.json_data = pool.map(self.process_file, self.get_bz2_files()) + for f in self.get_bz2_files(): + + output_f = self.options.output_dir + os.sep + f + + d = self.process_file(f) - with open(self.options.json, 'w') as fp: - json.dump(self.json_data, fp) + if not d: + continue + + if not os.path.exists(os.path.dirname(output_f)): + try: + os.makedirs(os.path.dirname(output_f)) + except OSError as exc: # Guard against race condition + if exc.errno != errno.EEXIST: + raise + + if isinstance(d, (list,)): + for i, data in enumerate(d): + with open(output_f + '_' + str(i + 1) + '.json', 'w') as fp: + json.dump(data, fp) + else: + with open(output_f + '.json', 'w') as fp: + json.dump(d, fp) def run_from_cmd(): From b3c2e99947aa10813d4be8dbf34cd59bd48761c4 Mon Sep 17 00:00:00 2001 From: Jithu Sunny Date: Wed, 25 Apr 2018 22:17:52 +0530 Subject: [PATCH 3/5] Update xml_to_json.py 1) Multiprocessing is removed 2) Install lxml: https://stackoverflow.com/questions/4642942/setup-py-install-lxml-with-python2-6-on-centos https://stackoverflow.com/questions/20886069/cant-install-lxml-on-centos/30440011 3) now the output is a directory 4) I can't post anything in your repo. Please make me a contributor to do so.. 5) Paths are accepted for input/output. This is fine To run: python2.7 xml_to_json.py -r xregion -f xflow -p input_dir -o ~/output_dir --- scripts/xml_to_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/xml_to_json.py b/scripts/xml_to_json.py index c9300fd..36b6f18 100644 --- a/scripts/xml_to_json.py +++ b/scripts/xml_to_json.py @@ -80,7 +80,7 @@ def sanitize(self, content): content = re.sub(_illegal_xml_chars_RE, "", content) content = re.sub(r"[\x0A-\x0C]", "\n", content) - return content + return content def parseXML(self, xml_data): From d0811dbecd1a76e68c0aac39e7a15d98c7e93b03 Mon Sep 17 00:00:00 2001 From: Jithu Sunny Date: Wed, 25 Apr 2018 22:42:58 +0530 Subject: [PATCH 4/5] Update xml_to_json.py --- scripts/xml_to_json.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/xml_to_json.py b/scripts/xml_to_json.py index 36b6f18..701e349 100644 --- a/scripts/xml_to_json.py +++ b/scripts/xml_to_json.py @@ -80,7 +80,7 @@ def sanitize(self, content): content = re.sub(_illegal_xml_chars_RE, "", content) content = re.sub(r"[\x0A-\x0C]", "\n", content) - return content + return content def parseXML(self, xml_data): @@ -105,16 +105,17 @@ def parseXML(self, xml_data): tradeHeader = postEventTrade.find('fpml:tradeHeader', root.nsmap) tradeDate = tradeHeader.find('fpml:tradeDate', root.nsmap) - if tradeDate is not None: + if tradeDate is not None and data['TRD_DATE'] == '': data['TRD_DATE'] = tradeDate.text for partyTradeIdentifier in tradeHeader: - tradeId = partyTradeIdentifier.find('fpml:tradeId', root.nsmap) + tradeIds = partyTradeIdentifier.findall('.//fpml:tradeId', root.nsmap) - if tradeId is not None and tradeId.attrib['tradeIdScheme'] == 'http://www.dtcc.com/internal-reference-id': - data['UITID'] = tradeId.text - elif tradeId is not None and tradeId.attrib['tradeIdScheme'] == 'UniqueInternalTradeID': - data['UITID'] = tradeId.text + for tradeId in tradeIds: + if tradeId is not None and tradeId.attrib['tradeIdScheme'] == 'http://www.dtcc.com/internal-reference-id' and data['UITID'] == '': + data['UITID'] = tradeId.text + elif tradeId is not None and tradeId.attrib['tradeIdScheme'] == 'UniqueInternalTradeID' and data['UITID'] == '': + data['UITID'] = tradeId.text return data From a38a7f342dd0b5f61dd9afa913c4fa0af23fcc28 Mon Sep 17 00:00:00 2001 From: Jithu Sunny Date: Wed, 25 Apr 2018 23:46:57 +0530 Subject: [PATCH 5/5] Update xml_to_json.py --- scripts/xml_to_json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/xml_to_json.py b/scripts/xml_to_json.py index 701e349..e9d313c 100644 --- a/scripts/xml_to_json.py +++ b/scripts/xml_to_json.py @@ -129,6 +129,7 @@ def read_file(self, filename): def process_file(self, f): + print('Processing ' + f) file_data = self.read_file(f).replace('\x02', '') if file_data.count('') <= 1: try: @@ -165,9 +166,10 @@ def get_bz2_files(self): def run(self): + parent_dir_of_input_dir = os.path.abspath(self.options.input_dir).rstrip('/').rstrip('\\').replace(os.path.basename(self.options.input_dir), '').rstrip('/').rstrip('\\') for f in self.get_bz2_files(): - output_f = self.options.output_dir + os.sep + f + output_f = os.path.abspath(self.options.output_dir) + os.path.abspath(f).replace(parent_dir_of_input_dir, '') d = self.process_file(f)