Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions acf/acf_parse-docx-to-xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def find_source_link(cell, target_text, preceding_target_text=None):
text_elements = paragraph.findall('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})

# Concatenate text from all <w:t> elements and clean it
paragraph_text = ''.join([t.text.strip().replace(' ', '').replace('(', '').replace(')', '').replace("’", "'").lower() for t in text_elements if t.text])
paragraph_text = ''.join([t.text.strip().replace(' ', '').replace('(', '').replace(')', '').replace('\u200b', '').replace("\u2009", "").replace("’", "'").lower() for t in text_elements if t.text])

# Check for preceding target text if needed
if preceding_target_text and not preceding_text_found:
Expand All @@ -48,7 +48,7 @@ def find_source_link(cell, target_text, preceding_target_text=None):
for hyperlink in hyperlinks:
# Extract all <w:t> elements in the hyperlink
link_text_elements = hyperlink.findall('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
full_text = ''.join([t.text.strip().replace(' ', '').replace("’", "'").replace('(', '').replace(')', '').lower() for t in link_text_elements if t.text])
full_text = ''.join([t.text.strip().replace(' ', '').replace('\u200b', '').replace("\u2009", "").replace("’", "'").replace('(', '').replace(')', '').lower() for t in link_text_elements if t.text])

# Match the full hyperlink text with target_text
if target_text == full_text:
Expand Down Expand Up @@ -544,8 +544,13 @@ def parse_tables(doc, details, record_data, object_type):
if len(part_slice) > 0:
temp_article_dict['part'] = parse_to_dict(part_slice, overview_cell, details['partName'])

if temp_article_dict['part']['current_position'] not in current_position:
current_position += f" - {temp_article_dict['part']['current_position']}"
#NOTE: we sometimes have issues with the parsing; exit if we have an error so that we can figure out the issue
try:
if temp_article_dict['part']['current_position'] not in current_position:
current_position += f" - {temp_article_dict['part']['current_position']}"
except TypeError:
print(part_slice)
sys.exit(1)

# we will only have a sub-part if there is a part
if details['subPartName']:
Expand Down Expand Up @@ -1067,6 +1072,8 @@ def main(details):
if os.path.exists(details['tmp_audit_log']):
os.remove(details['tmp_audit_log'])
details["audit_log"] = os.path.join(details['out_dir'], f'{details['state'].lower().replace(' ', '_')}_audit-log.txt')
if os.path.exists(details['audit_log']):
os.remove(details['audit_log'])

# make sure boolean values are set
for term in ["category", "titleContent"]:
Expand Down
12 changes: 7 additions & 5 deletions acf/acf_parse_config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"input_doc": "C:/ACF/FINAL APPROVED STATE RECORDS/Nevada/Nevada_Copy.docx",
"out_dir": "C:/ACF/FINAL APPROVED STATE RECORDS/Florida",
"input_doc": "C:/ACF/FINAL APPROVED STATE RECORDS/Navajo/Navajo Nation_Copy.docx",
"out_dir": "C:/ACF/FINAL APPROVED STATE RECORDS/Navajo",
"xsd_file": "schema_final.xsd",
"xsl_file": "statetemplatev5.xsl",
"category": "false",
Expand All @@ -10,9 +10,9 @@
"partName": [],
"subPartName": "",
"titleContent": "false",
"state": "Nevada",
"state_code_pattern": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(FLA\\.\\sSTAT\\.\\s\\u00A7\\s\\d+\\.\\d+)\\)?$",
"statute_pattern": "^(FLA\\.\\sSTAT\\.\\s\\u00A7\\s\\d+\\.\\d+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"state": "Navajo Nation",
"state_code_pattern": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(\\d+\\sN\\.N\\.C\\.\\s\\u00A7\\s\\d+),\\sp\\.\\d+\\)?$",
"statute_pattern": "^(\\d+\\sN\\.N\\.C\\.\\s\\u00A7\\s\\d+),\\sp\\.\\d+\\s+[-\\u2013\\u2014]\\s+(.*)$",
"patterns": {
"base": "(.+)\\s?\\u2013\\s?(.*)\\s+",
"Alabama": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(AL\\sCode\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
Expand All @@ -25,6 +25,7 @@
"Idaho": "(.+)\\s?\\u2013\\s?(.*)\\s+\\(?(I\\.C\\. Stat\\s.+\\d+[A-Za-z]*)\\)?$",
"Illinois": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(\\d+\\sILCS\\s[-\\d\\.\\/]+)\\)?$",
"Indiana": "(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(IN Code\\s\\u00A7\\s[-\\.\\w]+)\\)?$",
"Iowa": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(IA\\sCode\\s\\u00A7\\s[\\w\\.]+)\\)?$",
"Kentucky": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(K\\.Y\\.\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"Maine": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(M\\.R\\.S\\.\\s[-\\w]+\\s\\u00A7\\s[-\\w]+)\\)?$",
"Maryland": "(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(MD Code,?\\s(?:Family Law|Local Government|General Provisions|Health\\s*[-\\u2013]\\s*General|Human Services),?\\s\u00A7\\s\\d+(?:\\.\\d+)?-[-–\\.\\w]*)\\)?$",
Expand Down Expand Up @@ -61,6 +62,7 @@
"Idaho": "^(I\\.C\\. Stat\\s.+\\d+[A-Za-z]*)\\s?[\\u2013\\u2014]\\s?(.*)",
"Illinois": "^(\\d+\\sILCS\\s[-\\d\\.\\/]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Indiana": "^(IN Code\\s\\u00A7\\s[-\\.\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Iowa": "^(IA\\sCode\\s\\u00A7\\s[\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Kentucky": "^(K\\.Y\\.\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Maine": "^(M\\.R\\.S\\.\\s[-\\w]+\\s\\u00A7\\s[-\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Maryland": "^(MD Code,?\\s(?:Family Law|Local Government|General Provisions|Health\\s*[-\\u2013]\\s*General|Human Services),?\\s\u00A7\\s\\d+(?:\\.\\d+)?-[-\\.\\w]*)\\s+[-\\u2013\\u2014]\\s+(.*)$",
Expand Down
90 changes: 90 additions & 0 deletions acf/state_configs/IA_acf_parse_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"input_doc": "C:/ACF/FINAL APPROVED STATE RECORDS/Iowa/Iowa_Copy.docx",
"out_dir": "C:/ACF/FINAL APPROVED STATE RECORDS/Iowa",
"xsd_file": "schema_final.xsd",
"xsl_file": "statetemplatev5.xsl",
"category": "false",
"titleName": "Title",
"subtitleName": "",
"articleName": "Chapter",
"partName": [],
"subPartName": "",
"titleContent": "false",
"state": "Iowa",
"state_code_pattern": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(IA\\sCode\\s\\u00A7\\s[\\w\\.]+)\\)?$",
"statute_pattern": "^(IA\\sCode\\s\\u00A7\\s[\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"patterns": {
"base": "(.+)\\s?\\u2013\\s?(.*)\\s+",
"Alabama": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(AL\\sCode\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"Arizona": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(AZ\\sRev\\sStat\\s\\u00A7\\s\\d+-[-\\w\\.]+)\\)?$",
"Colorado": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(CO\\sCode\\s\\u00A7\\s\\[-\\d\\.]+)\\)?$",
"Connecticut": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(CT\\sGen\\sStat\\s\\u00A7\\s[-\\w]+)\\)?$",
"Delaware": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(\\d+\\sDE\\sCode\\s\\u00A7\\s[-\\w]+)\\)?$",
"Flandreau": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(Flandreau\\sSantee\\sSioux\\sTribal\\sCode\\s\\u00A7\\s[-\\d]+)\\)?$",
"Florida": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(FLA\\.\\sSTAT\\.\\s\\u00A7\\s\\d+\\.\\d+)\\)?$",
"Idaho": "(.+)\\s?\\u2013\\s?(.*)\\s+\\(?(I\\.C\\. Stat\\s.+\\d+[A-Za-z]*)\\)?$",
"Illinois": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(\\d+\\sILCS\\s[-\\d\\.\\/]+)\\)?$",
"Indiana": "(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(IN Code\\s\\u00A7\\s[-\\.\\w]+)\\)?$",
"Iowa": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(IA\\sCode\\s\\u00A7\\s[\\w\\.]+)\\)?$",
"Kentucky": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(K\\.Y\\.\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"Maine": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(M\\.R\\.S\\.\\s[-\\w]+\\s\\u00A7\\s[-\\w]+)\\)?$",
"Maryland": "(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(MD Code,?\\s(?:Family Law|Local Government|General Provisions|Health\\s*[-\\u2013]\\s*General|Human Services),?\\s\u00A7\\s\\d+(?:\\.\\d+)?-[-–\\.\\w]*)\\)?$",
"Massachusetts": "(.+)\\s?\\u2013\\s?(.*)\\s+\\(?(MA Gen L.+\\d+[A-Za-z]*)\\s*\\)?$",
"Michigan": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(MI\\sComp\\sLaws\\s\\u00A7\\s[\\w\\.]+)\\)?$",
"Minnesota": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(MN\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"Mississippi": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(MS\\sCode\\s\\u00A7\\s?[-\\w\\.]+)\\)?$",
"Navajo Nation": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(\\d+\\sN\\.N\\.C\\.\\s\\u00A7\\s\\d+),\\sp\\.\\d+\\)?$",
"New Jersey": "(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(NJ Rev Stat \\u00A7 \\d+:\\d+[-\\.\\w]*)\\)?$",
"New York": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(NY\\s.+\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"Nevada": "(.+)\\s?\\u2013\\s?(.*)\\s+\\(?(NRS[\\s\\.][A-Za-z0-9\\.]+)\\)?$",
"North Dakota": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(N\\.D\\.\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"Ohio": "(.+?)\\s?\\u2013\\s?(.*?)\\s+\\(?(Ohio Rev Code\\s\\u00A7\\s+\\d+(?:\\.\\d*)?)\\)?$",
"Oklahoma": "(.+?)\\s?\\u2013\\s?(.*?)\\s+\\(?(\\d+[A-Za-z]?\\sOK STAT \\u00A7\\s[\\d-]+([A-Za-z0-9\\-\\.]+)?)\\)?$",
"Oregon": "(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(ORS\\s+[A-Za-z0-9\\.]+)\\)?$",
"Rhode Island": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(R\\.I\\.\\sStat\\s\\u00A7\\s[-\\d\\.]+)\\)?$",
"South Carolina": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\((SC\\sCode\\s\\u00A7\\s[-\\d]+)\\)\\.?$",
"Tennessee": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(TN Code\\s\\u00A7\\s[-\\.\\w]+)\\)?$",
"Texas": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(TX\\s(?:Hum Res Code|Govt Code|Health & Safety Code|Fam Code)\\s\\u00A7\\s[\\d\\.]+)\\)?$",
"Vermont": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(\\d+[A-Z]?\\sV\\.S\\.A\\.\\s\\u00A7\\s[-\\w]+)\\)?$",
"Virginia": "^(VA Code\\s\\u00A7\\s+\\d+(?:\\.\\d+)?-\\d+(?:\\.\\d+)?(?::\\d+)?)\\s+[\\u2013\\u2014]\\s+(.*)$",
"Wisconsin": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(WI\\sStat\\s\\u00A7\\s[\\d\\.]+)\\)?$",
"Wyoming": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(WY\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"Yurok": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(YTC\\s[\\d\\.]+)\\)?$"
},
"state_statutes": {
"Alabama": "^(AL\\sCode\\s\\u00A7\\s[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Arizona": "^(AZ\\sRev\\sStat\\s\\u00A7\\s\\d+-[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Colorado": "^(CO\\sCode\\s\\u00A7\\s\\[-\\d\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Connecticut": "^(CT\\sGen\\sStat\\s\\u00A7\\s[-\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Delaware": "^(\\d+\\sDE\\sCode\\s\\u00A7\\s[-\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Flandreau": "^(Flandreau\\sSantee\\sSioux\\sTribal\\sCode\\s\\u00A7\\s[-\\d]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Florida": "^(FLA\\.\\sSTAT\\.\\s\\u00A7\\s\\d+\\.\\d+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Idaho": "^(I\\.C\\. Stat\\s.+\\d+[A-Za-z]*)\\s?[\\u2013\\u2014]\\s?(.*)",
"Illinois": "^(\\d+\\sILCS\\s[-\\d\\.\\/]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Indiana": "^(IN Code\\s\\u00A7\\s[-\\.\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Iowa": "^(IA\\sCode\\s\\u00A7\\s[\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Kentucky": "^(K\\.Y\\.\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Maine": "^(M\\.R\\.S\\.\\s[-\\w]+\\s\\u00A7\\s[-\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Maryland": "^(MD Code,?\\s(?:Family Law|Local Government|General Provisions|Health\\s*[-\\u2013]\\s*General|Human Services),?\\s\u00A7\\s\\d+(?:\\.\\d+)?-[-\\.\\w]*)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Massachusetts": "^(MA Gen L.+\\d+[A-Za-z]*)\\s?[\\u2013\\u2014]\\s?(.*)",
"Michigan": "^(MI\\sComp\\sLaws\\s\\u00A7\\s[\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Minnesota": "^(M\\.R\\.S\\.\\s[-\\w]+\\s\\u00A7\\s[-\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Mississippi": "^(MS\\sCode\\s\\u00A7\\s?[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Navajo Nation": "^(\\d+\\sN\\.N\\.C\\.\\s\\u00A7\\s\\d+),\\sp\\.\\d+\\s+[-\\u2013\\u2014]\\s+(.*)$",
"New Jersey": "^(NJ Rev Stat \\u00A7 \\d+:\\d+[-\\.\\w]*)\\s+[\u2013\u2014]\\s+(.*)$",
"New York": "^(NY\\s.+\\s\\u00A7\\s[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"North Dakota": "^(N\\.D\\.\\sStat\\s\\u00A7\\s[-\\w\\.\\s\\(\\)]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Oklahoma": "^(\\d+[A-Za-z]?\\sOK STAT \\u00A7\\s[\\d-]+[A-Za-z0-9\\-\\.]+)\\s?[\\u2013\\u2014]\\s?(.*)",
"Ohio": "^(Ohio Rev Code\\s\\u00A7\\s+\\d+(?:\\.\\d*-)?)\\s+[\u2013\u2014]\\s+(.*)",
"Oregon": "^(ORS\\s+[A-Za-z0-9\\.]+)\\s+[\\u2013\\u2014]\\s+(.*)$",
"Rhode Island": "^(R\\.I\\.\\sStat\\s\\u00A7\\s[-\\d\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"South Carolina": "^(SC\\sCode\\s\\u00A7\\s[-\\d]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Tennessee": "^(TN Code\\s\\u00A7\\s[-\\.\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Texas": "^(TX\\s(?:Hum Res Code|Govt Code|Health & Safety Code|Fam Code)\\s\u00A7\\s[\\d\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Vermont": "^(\\d+[A-Z]?\\sV\\.S\\.A\\.\\s\\u00A7\\s[-\\w]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Virginia": "(.+?)\\s?\\u2013\\s?(.*?)\\s+\\(?(VA Code\\s\\u00A7\\s+\\d+(?:\\.\\d+)?-\\d+(?:\\.\\d+)?(?::\\d+)?)\\)?$",
"Wyoming": "^(WY\\sStat\\s\\u00A7\\s[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Wisconsin": "^(WI\\sStat\\s\\u00A7\\s[\\d\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"Yurok": "^(YTC\\s[\\d\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$"
}
}
Loading