Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
somar/drafts/
somar/XML/
somar/prod_credentials.json
somar/test_credentials.json
somar/draft_dois.csv
projects/somar/drafts/
projects/somar/XML/
projects/somar/prod_credentials.json
projects/somar/test_credentials.json
projects/somar/draft_dois.csv
projects/ror/
projects/terms_of_use/
projects/jinja_tests/export_request-*.json
projects/datacite/
projects/ddi_doi_cleanup/input/
projects/ddi_doi_cleanup/output
66 changes: 0 additions & 66 deletions acf/state_configs/NN_acf_parse_config.json

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,16 @@ def check_controlled_vocabs(cv_list, cv_used, details, current_position):

if cv_used in ('offices', 'federal'):
if cv_list != sorted(cv_list, key=str.lower):
write_error(details, f"{current_position} - {cv_used.capitalize()} list is not in alphabetical order; correct in MS Word.")
write_error(details, f"{current_position} - {cv_used.capitalize()} list is not in alphabetical order; correct in MS Word: {cv_list}.")
cv_list = sorted(cv_list, key=str.lower)


return cv_list

def parse_requirement_blocks(data, cell, temp_list, details, parent_position):

en_dash = '\u2013'

# Patterns to identify the different parts of each entry
label_desc_state_code_pattern = rf"{details['state_code_pattern']}"
entities_pattern = re.compile(r"Who Law Applies To:(.*)")
Expand Down Expand Up @@ -266,6 +268,41 @@ def parse_requirement_blocks(data, cell, temp_list, details, parent_position):
except ValueError:
print(f'\nValueError: unable to parse this content: {data_slice[0]}')

# WE NEED TO CHECK / ACCOUNT FOR EN DASH ISSUES
# In particular, we need to watch out for instances where there are multiple en dashes
if en_dash in description:
# print("\nMultiple en dashes found. Here’s the string:")

text = f"{label} {en_dash} {description}"
# positions = [m.start() for m in re.finditer(en_dash, text)]
# preview = text
# for j, pos in enumerate(positions[::-1], 1):
# idx = positions[-j]
# preview = preview[:idx] + f"[{j}]{en_dash}" + preview[idx + 1:]
# print(preview)

# while True:
# try:
# choice = int(input(f"Which en dash # should be used to split label/description (1–{len(positions)})? "))
# if 1 <= choice <= len(positions):
# split_index = positions[choice - 1]
# break
# else:
# print("Invalid number. Try again.")
# except ValueError:
# print("Please enter a valid integer.")

# Find all matches
en_dash_matches = list(re.finditer(en_dash, text))

if en_dash_matches:
last_match = en_dash_matches[-1] # The last en dash
split_index = last_match.start()

# Split using selected en dash only
label = text[:split_index].strip()
description = text[split_index + 1:].strip()

try:
# update dict
record['label'] = label
Expand Down Expand Up @@ -327,8 +364,12 @@ def parse_to_dict(list_of_strings, cell, search_term):

for t_idx, t in enumerate(list_of_strings):

#the search terms should always occur at beginning of string
if any(t.lower().startswith(term.lower()) for term in search_term):
# Find the first search term that matches the start of t; the search terms should always occur at beginning of string
# This was the old way we matched; had to change to accommodate labels with a space:
# if any(t.lower().startswith(term.lower()) for term in search_term):
matched_term = next((term for term in search_term if t.lower().startswith(term.lower())), None)

if matched_term:

# if found, assign values to dictionary
temp_dict = {
Expand All @@ -341,7 +382,7 @@ def parse_to_dict(list_of_strings, cell, search_term):

# NOTE: we have at least one state that does not include Title #'s. Add handling for this...
try:
temp_dict['number'] = t.split(' ', 1)[1].strip()
temp_dict['number'] = t.split(f'{matched_term} ')[1].strip()
except IndexError:
temp_dict['number'] = None

Expand Down Expand Up @@ -526,7 +567,14 @@ def parse_tables(doc, details, record_data, object_type):

# if we expect to find subtitles, check to see if there is any text between the Title and the Article
if details['subtitleName']:
subtitle_slice = article_overview[title_end_idx+1:temp_article_dict['start_idx']]

if found_titleContent:
subtitle_slice = []
for sub_index, item in enumerate(article_overview[title_end_idx+1:]):
if item.startswith(details['subtitleName']):
subtitle_slice = article_overview[sub_index:]
else:
subtitle_slice = article_overview[title_end_idx+1:temp_article_dict['start_idx']]

# if there is actually text here, return subtitle info
if len(subtitle_slice) > 0:
Expand Down Expand Up @@ -767,6 +815,11 @@ def write_xml(details, record_data):
else:
article_elem = etree.SubElement(title_elem, "article")
etree.SubElement(article_elem, "domain").text = article.get("domain", '')
if record_data[title].get("subtitle", {}):
subtitle_elem = etree.SubElement(article_elem, "subtitle")
etree.SubElement(subtitle_elem, "number").text = record_data[title]['subtitle'].get("number", "")
etree.SubElement(subtitle_elem, "name").text = record_data[title]['subtitle'].get("name", "")
etree.SubElement(subtitle_elem, "source").text = record_data[title]['subtitle'].get("source", "")

# add subtitle info, if present
if article.get("subtitle", {}):
Expand Down Expand Up @@ -1022,6 +1075,8 @@ def main(details):
# delete tmp file
os.remove(details['tmp_audit_log'])

print('\n\nERRORS IN WORD DOC! CHECK AUDIT FILE!!!')

print('\n\n----------------------------------------------------\n\nProcess complete!')

if __name__ == "__main__":
Expand Down Expand Up @@ -1088,5 +1143,7 @@ def main(details):
print('\n\nThe "partName" value must be a list!')
sys.exit(1)

print(f'\n\nWorking on {details['state'].upper()} record')

main(details)

10 changes: 5 additions & 5 deletions acf/acf_parse_config.json → projects/acf/acf_parse_config.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"input_doc": "C:/ACF/FINAL APPROVED STATE RECORDS/Utah/Utah_copy.docx",
"out_dir": "C:/ACF/FINAL APPROVED STATE RECORDS/Utah",
"input_doc": "C:/ACF/FINAL APPROVED STATE RECORDS/Yurok Tribe/Yurok Tribe_Copy.docx",
"out_dir": "C:/ACF/FINAL APPROVED STATE RECORDS/Yurok Tribe",
"xsd_file": "schema_final.xsd",
"xsl_file": "statetemplatev5.xsl",
"category": "false",
Expand All @@ -10,9 +10,9 @@
"partName": [],
"subPartName": "",
"titleContent": "false",
"state": "Utah",
"state_code_pattern": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(Utah\\s\\sCode\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
"statute_pattern": "^(Utah\\s\\sCode\\s\\u00A7\\s[-\\w\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"state": "Yurok Tribe",
"state_code_pattern": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(YTC\\s[\\d\\.]+)\\)?$",
"statute_pattern": "^(YTC\\s[\\d\\.]+)\\s+[-\\u2013\\u2014]\\s+(.*)$",
"patterns": {
"base": "(.+)\\s?\\u2013\\s?(.*)\\s+",
"Alabama": "^(.+?)\\s?\\u2013\\s?(.*?)\\s?\\(?(AL\\sCode\\s\\u00A7\\s[-\\w\\.]+)\\)?$",
Expand Down
39 changes: 39 additions & 0 deletions projects/acf/docx-fixes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
DOCX Fixes

1) Fix MS Word smart quotes
- Replace: ’ and ‘
- Use: '

- Replace: “ and ”
- Use: "

2) Fix delimiters used with statute definitions and requirements
- Replace: ' - '
- With: ' – '

- Replace: --
- With: –

4) Ensure consistent text with statute information
- Use "Definitions related to" instead of "Definitions for"

- Use "Requirements related to" instead of "Requirements for"

5) Search for and fix placeholder text:
- ()
- text
- 999

6) Make sure 'ACF Offices Associated' is underlined, with no colon

7) Wild card search example: Act [0-9]{1,} (*)\) Article

8) Replace 'thin space' with regular
-  

9) En dash spacing:
- –([A-Za-z])
- – \1

- ([A-Za-z])–
- \1 –
File renamed without changes.
1 change: 1 addition & 0 deletions acf/schema_final.xsd → projects/acf/schema_final.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
<xs:complexType>
<xs:sequence>
<xs:element ref="domain"/>
<xs:element minOccurs="0" ref="subtitle"/>
<xs:element ref="associatedFederalRecords"/>
<xs:choice minOccurs="1" maxOccurs="1">
<xs:sequence>
Expand Down
Loading