Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 63 additions & 42 deletions acf/acf_parse-docx-to-xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
import json
from collections import defaultdict
import urllib.parse
import requests
import csv
import pandas as pd

def find_source_link(cell, target_text, preceding_target_text=None):
def find_source_link(details, cell, target_text, preceding_target_text=None):
# Get the XML for the cell
cell_xml = cell._element

Expand Down Expand Up @@ -63,6 +66,7 @@ def find_source_link(cell, target_text, preceding_target_text=None):
full_target = rel._target
if anchor:
full_target += f"#{anchor}" # Append anchor if present

return full_target

# OPTION 2: LINKS ARE STORED IN <w:instrText> TAGS, NESTED INSIDE <p> tags alongside text
Expand Down Expand Up @@ -91,8 +95,7 @@ def find_source_link(cell, target_text, preceding_target_text=None):
anchor_text = anchor_text[1:-1] # Remove surrounding quotes
hyperlink_url += f"#{anchor_text}"

return hyperlink_url

return hyperlink_url

# If no matching hyperlink was found, return empty string (no None so that we avoid string processing issues downstream)
print(f"\n\nWARNING: Word doc does not appear to include a hyperlink for this string: {orig_text}")
Expand Down Expand Up @@ -262,7 +265,7 @@ def parse_requirement_blocks(data, cell, temp_list, details, parent_position):
record['label'] = label
record['description'] = description
record['state_code'] = state_code
record['source'] = find_source_link(cell, state_code)
record['source'] = find_source_link(details, cell, state_code)

if record['source'] is None:
print('\tREQ LABEL', label)
Expand Down Expand Up @@ -293,6 +296,7 @@ def parse_requirement_blocks(data, cell, temp_list, details, parent_position):
if tags_match:
tags_text = tags_match.group(1)
else:
#print(data_slice)
tags_text = data_slice[2].split(':', 1)[1]

# check our terms and assign to dict
Expand Down Expand Up @@ -324,7 +328,7 @@ def parse_to_dict(list_of_strings, cell, search_term):
temp_dict = {
"current_position": t.strip(),
"name": list_of_strings[t_idx+1],
"source": find_source_link(cell, list_of_strings[t_idx+1], t),
"source": find_source_link(details, cell, list_of_strings[t_idx+1], t),
"start_idx": t_idx,
"end_idx": t_idx+1
}
Expand Down Expand Up @@ -401,8 +405,7 @@ def parse_tables(doc, details, record_data, object_type):
#some rows will actually have more than 3 cells: need to verify where indexes will start
if len(row.cells) > 3:
title_idx = return_index(row.cells, details['titleName'])
office_idx = return_index(row.cells, 'ACF Offices Associated')

office_idx = return_index(row.cells, 'ACF Offices Associated')
else:
title_idx = 1
office_idx = 2
Expand Down Expand Up @@ -435,7 +438,7 @@ def parse_tables(doc, details, record_data, object_type):
else:
source_text = category_name

category_source = find_source_link(row.cells[title_idx], source_text)
category_source = find_source_link(details, row.cells[title_idx], source_text)

#update 'current position'
current_position = f"{category_name} - {current_position}"
Expand Down Expand Up @@ -492,7 +495,10 @@ def parse_tables(doc, details, record_data, object_type):
record_data[current_title_key].setdefault('articles', [])

# get article info
temp_article_dict = parse_to_dict(article_overview, overview_cell, details['articleName'])
if details['articleName']:
temp_article_dict = parse_to_dict(article_overview, overview_cell, details['articleName'])
else:
temp_article_dict = None

# Provide warning if we didn't get an article; continue on to next item
if temp_article_dict is None:
Expand Down Expand Up @@ -584,19 +590,18 @@ def parse_tables(doc, details, record_data, object_type):
# set up lists to carry definition and requirement information
definitions = []
requirements = []
regulations = []
found_regs = False

for index, item in enumerate(statutes):
if any(item.lower().startswith(phrase) for phrase in ['definitions related to', 'definitions for ']):
def_index = index
break

for index, item in enumerate(statutes):
if any(item.lower().startswith(phrase) for phrase in ['requirements related ', 'requirements for ', 'Regulations regarding ']):
if any(item.lower().startswith(phrase) for phrase in ['requirements related ', 'requirements for ', 'requirements regarding ', 'regulations regarding ']):
req_index = index
# NOTE: we need to edit Word doc to replace 'Regulations' with 'Requirements'; give a warning if so
if "Regulations" in statutes[index]:
found_regs = True
print(f'\n\nWARNING: Found REGULATION at {current_position}')
break

# we may not have definitions; make sure we found them
Expand Down Expand Up @@ -630,7 +635,7 @@ def parse_tables(doc, details, record_data, object_type):
# add state code, source, and defined terms to a dictionary; append to our definitions
temp_defn_dict = {
"state_code": state_code.strip(),
"source": find_source_link(statutes_cell, state_code),
"source": find_source_link(details, statutes_cell, state_code),
"defined_terms": [t.strip() for t in terms.split(',')]
}

Expand All @@ -656,13 +661,9 @@ def parse_tables(doc, details, record_data, object_type):
# slice our original list to include all the statute information
req_info = statutes[check_index:]

# parse out requirements and add to temp dict; note tht we need to differentiate between 'regulations' and 'requirements'
if found_regs:
regulations = parse_requirement_blocks(req_info, statutes_cell, regulations, details, current_position)
temp_article_dict['regulations'] = regulations
else:
requirements = parse_requirement_blocks(req_info, statutes_cell, requirements, details, current_position)
temp_article_dict['requirements'] = requirements
# parse out requirements and add to temp dict
requirements = parse_requirement_blocks(req_info, statutes_cell, requirements, details, current_position)
temp_article_dict['requirements'] = requirements

if temp_article_dict not in record_data[current_title_key]['articles']:
record_data[current_title_key]['articles'].append(temp_article_dict)
Expand Down Expand Up @@ -831,25 +832,25 @@ def write_xml(details, record_data):
etree.SubElement(terms_elem, "term").text = tag.replace('&amp;', '&')

#add regulations, if present
if article.get("regulations", []):
reg_elem = etree.SubElement(article_elem, "regulations")
for reg in article.get("regulations", []):
reg_statute_elem = etree.SubElement(reg_elem, "statute")
etree.SubElement(reg_statute_elem, "label").text = reg.get("label", '')
etree.SubElement(reg_statute_elem, "description").text = reg.get("description", '')
etree.SubElement(reg_statute_elem, "stateCode").text = reg.get("state_code", '')
etree.SubElement(reg_statute_elem, "source").text = reg.get("source")

# Add appliesTo entities
applies_elem = etree.SubElement(reg_statute_elem, "appliesTo")
for ent in reg.get("entities", ['']):
etree.SubElement(applies_elem, "entity").text = ent

# Add terms tags
terms_elem = etree.SubElement(reg_statute_elem, "terms")
for tag in reg.get("tags", []):
#watch out for lxml double-escaping ampersands...
etree.SubElement(terms_elem, "term").text = tag.replace('&amp;', '&')
# if article.get("regulations", []):
# reg_elem = etree.SubElement(article_elem, "regulations")
# for reg in article.get("regulations", []):
# reg_statute_elem = etree.SubElement(reg_elem, "statute")
# etree.SubElement(reg_statute_elem, "label").text = reg.get("label", '')
# etree.SubElement(reg_statute_elem, "description").text = reg.get("description", '')
# etree.SubElement(reg_statute_elem, "stateCode").text = reg.get("state_code", '')
# etree.SubElement(reg_statute_elem, "source").text = reg.get("source")

# # Add appliesTo entities
# applies_elem = etree.SubElement(reg_statute_elem, "appliesTo")
# for ent in reg.get("entities", ['']):
# etree.SubElement(applies_elem, "entity").text = ent

# # Add terms tags
# terms_elem = etree.SubElement(reg_statute_elem, "terms")
# for tag in reg.get("tags", []):
# #watch out for lxml double-escaping ampersands...
# etree.SubElement(terms_elem, "term").text = tag.replace('&amp;', '&')

# Create the ElementTree object
tree = etree.ElementTree(root)
Expand Down Expand Up @@ -960,15 +961,15 @@ def check_all_hyperlinks_from_docx(doc, xml_file):
# Extract the hyperlink URL from the instrText
parts = instr_text.text.split('"')
if len(parts) > 1:
hyperlink_url = parts[1]
hyperlink_url = parts[1].lower()

if "\\l" in instr_text.text:
# Find the text following \l
l_index = instr_text.text.index("\\l") + 2
anchor_text = instr_text.text[l_index:].strip()
if anchor_text.startswith('"') and anchor_text.endswith('"'):
anchor_text = anchor_text[1:-1] # Remove surrounding quotes
hyperlink_url += f"#{anchor_text}"
hyperlink_url += f"#{anchor_text}".lower()

if hyperlink_url.lower() not in xml_hyperlinks:

Expand All @@ -981,6 +982,20 @@ def check_all_hyperlinks_from_docx(doc, xml_file):
missing_docx_hyperlinks[hyperlink_url] = []
missing_docx_hyperlinks[hyperlink_url].append(paragraph_text)

#For New Jersey record, check to see if bad_links were replaced with good_links
# excel_path = 'C:/ACF/bad_links.xlsx'
# df = pd.read_excel(excel_path)
# bad_to_good = {k.lower(): v.lower() for k, v in zip(df['Bad Link'], df['Good Link'])}

# remove_from_list = []

# for url, text in missing_docx_hyperlinks.items():
# if url in bad_to_good:
# if bad_to_good[url].lower() in xml_hyperlinks:
# remove_from_list.append(url)
# for url in remove_from_list:
# del missing_docx_hyperlinks[url]

# If there are missing hyperlinks, print them
if missing_docx_hyperlinks:
print(f"\n\nFound {len(missing_docx_hyperlinks)} hyperlinks in DOCX that are not in the XML:")
Expand Down Expand Up @@ -1072,10 +1087,16 @@ def main(details):
details['tmp_audit_log'] = os.path.join(details['out_dir'], f'tmp_{details['state'].lower().replace(' ', '_')}_audit-log.txt')
if os.path.exists(details['tmp_audit_log']):
os.remove(details['tmp_audit_log'])

details["audit_log"] = os.path.join(details['out_dir'], f'{details['state'].lower().replace(' ', '_')}_audit-log.txt')
if os.path.exists(details['audit_log']):
os.remove(details['audit_log'])


details['bad_link_log'] = os.path.join(details['out_dir'], f'{details['state'].lower().replace(' ', '_')}_bad-link-log.csv')
if os.path.exists(details['bad_link_log']):
os.remove(details['bad_link_log'])

# make sure boolean values are set
for term in ["category", "titleContent"]:
try:
Expand Down
Loading