ICPSR · shallcro · Apr 11, 2025 · Apr 1, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/acf/acf_parse-docx-to-xml.py b/acf/acf_parse-docx-to-xml.py
@@ -9,8 +9,11 @@
 import json
 from collections import defaultdict
 import urllib.parse
+import requests
+import csv
+import pandas as pd
 
-def find_source_link(cell, target_text, preceding_target_text=None):
+def find_source_link(details, cell, target_text, preceding_target_text=None):
     # Get the XML for the cell
     cell_xml = cell._element
 
@@ -63,6 +66,7 @@ def find_source_link(cell, target_text, preceding_target_text=None):
                             full_target = rel._target
                             if anchor:
                                 full_target += f"#{anchor}"  # Append anchor if present
+
                             return full_target
 
     # OPTION 2: LINKS ARE STORED IN <w:instrText> TAGS, NESTED INSIDE <p> tags alongside text
@@ -91,8 +95,7 @@ def find_source_link(cell, target_text, preceding_target_text=None):
                                     anchor_text = anchor_text[1:-1]  # Remove surrounding quotes
                                 hyperlink_url += f"#{anchor_text}"
 
-                        return hyperlink_url
-
+                        return hyperlink_url    
 
     # If no matching hyperlink was found, return empty string (no None so that we avoid string processing issues downstream)
     print(f"\n\nWARNING: Word doc does not appear to include a hyperlink for this string: {orig_text}")
@@ -262,7 +265,7 @@ def parse_requirement_blocks(data, cell, temp_list, details, parent_position):
             record['label'] = label
             record['description'] = description
             record['state_code'] = state_code
-            record['source'] = find_source_link(cell, state_code)
+            record['source'] = find_source_link(details, cell, state_code)
 
             if record['source'] is None:
                 print('\tREQ LABEL', label)
@@ -293,6 +296,7 @@ def parse_requirement_blocks(data, cell, temp_list, details, parent_position):
         if tags_match:
             tags_text = tags_match.group(1)
         else:
+            #print(data_slice)
             tags_text = data_slice[2].split(':', 1)[1]
 
         # check our terms and assign to dict
@@ -324,7 +328,7 @@ def parse_to_dict(list_of_strings, cell, search_term):
             temp_dict = {
                 "current_position": t.strip(),
                 "name": list_of_strings[t_idx+1],
-                "source": find_source_link(cell, list_of_strings[t_idx+1], t),
+                "source": find_source_link(details, cell, list_of_strings[t_idx+1], t),
                 "start_idx": t_idx,
                 "end_idx": t_idx+1
             }
@@ -401,8 +405,7 @@ def parse_tables(doc, details, record_data, object_type):
                     #some rows will actually have more than 3 cells: need to verify where indexes will start
                     if len(row.cells) > 3:
                         title_idx = return_index(row.cells, details['titleName'])
-                        office_idx = return_index(row.cells, 'ACF Offices Associated')
-
+                        office_idx = return_index(row.cells, 'ACF Offices Associated')              
                     else:
                         title_idx = 1
                         office_idx = 2 
@@ -435,7 +438,7 @@ def parse_tables(doc, details, record_data, object_type):
                                 else:
                                     source_text = category_name
 
-                                category_source = find_source_link(row.cells[title_idx], source_text)
+                                category_source = find_source_link(details, row.cells[title_idx], source_text)
 
                                 #update 'current position'
                                 current_position = f"{category_name} - {current_position}"
@@ -492,7 +495,10 @@ def parse_tables(doc, details, record_data, object_type):
                     record_data[current_title_key].setdefault('articles', [])
 
                     # get article info
-                    temp_article_dict = parse_to_dict(article_overview, overview_cell, details['articleName'])
+                    if details['articleName']:
+                        temp_article_dict = parse_to_dict(article_overview, overview_cell, details['articleName'])
+                    else:
+                        temp_article_dict = None
 
                     # Provide warning if we didn't get an article; continue on to next item
                     if temp_article_dict is None:
@@ -584,19 +590,18 @@ def parse_tables(doc, details, record_data, object_type):
                     # set up lists to carry definition and requirement information
                     definitions = []
                     requirements = []
-                    regulations = []
-                    found_regs = False
 
                     for index, item in enumerate(statutes):
                         if any(item.lower().startswith(phrase) for phrase in ['definitions related to', 'definitions for ']):
                             def_index = index
                             break
 
                     for index, item in enumerate(statutes):
-                        if any(item.lower().startswith(phrase) for phrase in ['requirements related ', 'requirements for ', 'Regulations regarding ']):
+                        if any(item.lower().startswith(phrase) for phrase in ['requirements related ', 'requirements for ', 'requirements regarding ', 'regulations regarding ']):
                             req_index = index
+                            # NOTE: we need to edit Word doc to replace 'Regulations' with 'Requirements'; give a warning if so
                             if "Regulations" in statutes[index]:
-                                found_regs = True
+                                print(f'\n\nWARNING: Found REGULATION at {current_position}')
                             break
 
                     # we may not have definitions; make sure we found them
@@ -630,7 +635,7 @@ def parse_tables(doc, details, record_data, object_type):
                             # add state code, source, and defined terms to a dictionary; append to our definitions
                             temp_defn_dict = { 
                                 "state_code": state_code.strip(),
-                                "source": find_source_link(statutes_cell, state_code),
+                                "source": find_source_link(details, statutes_cell, state_code),
                                 "defined_terms": [t.strip() for t in terms.split(',')]
                             }
 
@@ -656,13 +661,9 @@ def parse_tables(doc, details, record_data, object_type):
                         # slice our original list to include all the statute information
                         req_info = statutes[check_index:]
 
-                        # parse out requirements and add to temp dict; note tht we need to differentiate between 'regulations' and 'requirements'
-                        if found_regs:
-                            regulations = parse_requirement_blocks(req_info, statutes_cell, regulations, details, current_position)
-                            temp_article_dict['regulations'] = regulations
-                        else:
-                            requirements = parse_requirement_blocks(req_info, statutes_cell, requirements, details, current_position)
-                            temp_article_dict['requirements'] = requirements
+                        # parse out requirements and add to temp dict
+                        requirements = parse_requirement_blocks(req_info, statutes_cell, requirements, details, current_position)
+                        temp_article_dict['requirements'] = requirements
 
                     if temp_article_dict not in record_data[current_title_key]['articles']:
                         record_data[current_title_key]['articles'].append(temp_article_dict)
@@ -831,25 +832,25 @@ def write_xml(details, record_data):
                         etree.SubElement(terms_elem, "term").text = tag.replace('&amp;', '&')
 
             #add regulations, if present
-            if article.get("regulations", []):
-                reg_elem = etree.SubElement(article_elem, "regulations")
-                for reg in article.get("regulations", []):
-                    reg_statute_elem = etree.SubElement(reg_elem, "statute")
-                    etree.SubElement(reg_statute_elem, "label").text = reg.get("label", '')
-                    etree.SubElement(reg_statute_elem, "description").text = reg.get("description", '')
-                    etree.SubElement(reg_statute_elem, "stateCode").text = reg.get("state_code", '')
-                    etree.SubElement(reg_statute_elem, "source").text = reg.get("source")
-
-                    # Add appliesTo entities
-                    applies_elem = etree.SubElement(reg_statute_elem, "appliesTo")
-                    for ent in reg.get("entities", ['']):
-                        etree.SubElement(applies_elem, "entity").text = ent
-
-                    # Add terms tags
-                    terms_elem = etree.SubElement(reg_statute_elem, "terms")
-                    for tag in reg.get("tags", []):
-                        #watch out for lxml double-escaping ampersands...
-                        etree.SubElement(terms_elem, "term").text = tag.replace('&amp;', '&')
+            # if article.get("regulations", []):
+            #     reg_elem = etree.SubElement(article_elem, "regulations")
+            #     for reg in article.get("regulations", []):
+            #         reg_statute_elem = etree.SubElement(reg_elem, "statute")
+            #         etree.SubElement(reg_statute_elem, "label").text = reg.get("label", '')
+            #         etree.SubElement(reg_statute_elem, "description").text = reg.get("description", '')
+            #         etree.SubElement(reg_statute_elem, "stateCode").text = reg.get("state_code", '')
+            #         etree.SubElement(reg_statute_elem, "source").text = reg.get("source")
+
+            #         # Add appliesTo entities
+            #         applies_elem = etree.SubElement(reg_statute_elem, "appliesTo")
+            #         for ent in reg.get("entities", ['']):
+            #             etree.SubElement(applies_elem, "entity").text = ent
+
+            #         # Add terms tags
+            #         terms_elem = etree.SubElement(reg_statute_elem, "terms")
+            #         for tag in reg.get("tags", []):
+            #             #watch out for lxml double-escaping ampersands...
+            #             etree.SubElement(terms_elem, "term").text = tag.replace('&amp;', '&')
 
     # Create the ElementTree object
     tree = etree.ElementTree(root)
@@ -960,15 +961,15 @@ def check_all_hyperlinks_from_docx(doc, xml_file):
                         # Extract the hyperlink URL from the instrText
                         parts = instr_text.text.split('"')
                         if len(parts) > 1:
-                            hyperlink_url = parts[1]
+                            hyperlink_url = parts[1].lower()
 
                             if "\\l" in instr_text.text:
                                 # Find the text following \l
                                 l_index = instr_text.text.index("\\l") + 2
                                 anchor_text = instr_text.text[l_index:].strip()
                                 if anchor_text.startswith('"') and anchor_text.endswith('"'):
                                     anchor_text = anchor_text[1:-1]  # Remove surrounding quotes
-                                hyperlink_url += f"#{anchor_text}"
+                                hyperlink_url += f"#{anchor_text}".lower()
 
                             if hyperlink_url.lower() not in xml_hyperlinks:
 
@@ -981,6 +982,20 @@ def check_all_hyperlinks_from_docx(doc, xml_file):
                                     missing_docx_hyperlinks[hyperlink_url] = []
                                 missing_docx_hyperlinks[hyperlink_url].append(paragraph_text)
 
+    #For New Jersey record, check to see if bad_links were replaced with good_links
+    # excel_path = 'C:/ACF/bad_links.xlsx'
+    # df = pd.read_excel(excel_path)
+    # bad_to_good = {k.lower(): v.lower() for k, v in zip(df['Bad Link'], df['Good Link'])}
+
+    # remove_from_list = []
+
+    # for url, text in missing_docx_hyperlinks.items():
+    #     if url in bad_to_good:
+    #         if bad_to_good[url].lower() in xml_hyperlinks:
+    #              remove_from_list.append(url)
+    # for url in remove_from_list:
+    #     del missing_docx_hyperlinks[url]
+
     # If there are missing hyperlinks, print them
     if missing_docx_hyperlinks:
         print(f"\n\nFound {len(missing_docx_hyperlinks)} hyperlinks in DOCX that are not in the XML:")
@@ -1072,10 +1087,16 @@ def main(details):
     details['tmp_audit_log'] = os.path.join(details['out_dir'], f'tmp_{details['state'].lower().replace(' ', '_')}_audit-log.txt')
     if os.path.exists(details['tmp_audit_log']):
         os.remove(details['tmp_audit_log'])
+
     details["audit_log"] = os.path.join(details['out_dir'], f'{details['state'].lower().replace(' ', '_')}_audit-log.txt')
     if os.path.exists(details['audit_log']):
         os.remove(details['audit_log'])
 
+
+    details['bad_link_log'] = os.path.join(details['out_dir'], f'{details['state'].lower().replace(' ', '_')}_bad-link-log.csv')
+    if os.path.exists(details['bad_link_log']):
+        os.remove(details['bad_link_log'])
+
     # make sure boolean values are set
     for term in ["category", "titleContent"]:            
         try: