-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsort_tr_docs.py
More file actions
177 lines (145 loc) · 5.92 KB
/
sort_tr_docs.py
File metadata and controls
177 lines (145 loc) · 5.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
Sort and organize PDF files based on metadata from Trade Republic events.
This module processes PDF files in a specified directory by:
- Matching filenames with events in all_events.json
- Extracting metadata (postboxType, title, detail, subtitle, timestamp)
- Organizing files into categorized subdirectories
- Renaming files with timestamps and descriptive names
"""
import json
import logging
import sys
from datetime import datetime
from pathlib import Path
import coloredlogs
from jsonpath_ng.ext import parse
from rule_based_sort.rule_based_sorter import FileSorter
import csv
# Module-level logger; coloredlogs is installed on it at DEBUG level with a
# "timestamp - level - message" line format.
logger = logging.getLogger(__name__)
coloredlogs.install(
    fmt="%(asctime)s - %(levelname)s - %(message)s",
    level="DEBUG",
    logger=logger,
)
def main(target_directory: Path):
    """
    Sort the PDF files found directly inside ``target_directory``.

    For every ``*.pdf`` in the directory:
    - Search the document entry in all_events.json whose payload contains the file name
    - Retrieve the document and event metadata from that entry
    - Append the metadata to docs_with_metadata.csv
    - Ask the rule-based sorter for the new location and move the file there

    A missing directory, a missing events file or an empty set of PDF files ends
    the run with a log message. A file without exactly one matching document is
    logged and skipped; a file whose timestamp cannot be parsed or whose move
    fails is logged and counted as not sorted. A summary of the counters is
    logged at the end.

    Args:
        target_directory: Directory holding the PDF files and all_events.json.
            It is also handed to the sorter as its base output directory.
    """
    # Validate the inputs first so nothing is constructed on top of a bad path
    if not target_directory.exists():
        logger.error("Directory '%s' does not exist", target_directory)
        return

    events_file = Path(target_directory, "all_events.json")
    if not events_file.exists():
        logger.error("File '%s' does not exist", events_file)
        return

    sorter = FileSorter("config/tr_sorting_rules.yaml", base_output_dir=target_directory)

    # Load all events
    logger.info("Loading events from '%s'...", events_file)
    with open(events_file, "r", encoding="utf-8") as f:
        events = json.load(f)
    logger.info("Loaded %d events", len(events))

    # Build index of documents from events (performance optimization)
    logger.info("Building document index...")
    document_index = _build_document_index(events)
    logger.info("Document index built with %d entries", len(document_index))

    # Process all PDF files
    pdf_files = list(target_directory.glob("*.pdf"))
    logger.info("Found %d PDF files in '%s'", len(pdf_files), target_directory)
    if not pdf_files:
        logger.warning("No PDF files found")
        return

    csv_file = target_directory / "docs_with_metadata.csv"

    matched_count = 0
    not_found_count = 0
    sorted_count = 0
    not_sorted_count = 0

    for pdf_file in pdf_files:
        filename = pdf_file.name

        # The payload is matched by substring, so every index entry is checked
        all_matches = [
            doc_match
            for payload, doc_match in document_index.items()
            if filename in payload
        ]
        if not all_matches:
            logger.error("No matches found for file '%s'", filename)
            not_found_count += 1
            continue
        if len(all_matches) > 1:
            # An ambiguous match is reported together with the "not found" files
            logger.error(
                "Multiple matches (%d) found for file '%s'", len(all_matches), filename
            )
            not_found_count += 1
            continue

        doc_match = all_matches[0]
        matched_count += 1

        document = doc_match.value
        # Walk up from the matched document (data item -> data -> section ->
        # sections -> details -> root) to reach the event the document belongs to
        event = doc_match.context.context.context.context.context.value

        postbox_type = document.get("postboxType")
        document_title = document.get("title")
        document_detail = document.get("detail")
        event_title = event.get("title")
        event_subtitle = event.get("subtitle")
        timestamp = event.get("timestamp")

        # Write document metadata to CSV file
        _append_metadata_row(
            csv_file,
            [
                filename,
                postbox_type,
                document_title,
                document_detail,
                event_title,
                event_subtitle,
            ],
        )

        try:
            # Parsed inside the try so that one malformed timestamp only fails
            # this file instead of aborting the whole run
            date_time_str = (
                datetime.fromisoformat(timestamp).strftime("%Y-%m-%d %H:%M")
                if timestamp
                else None
            )
            path, name, _rule = sorter.get_new_location(
                original_filepath=pdf_file,
                postbox_type=postbox_type,
                document_title=document_title,
                document_detail=document_detail,
                event_title=event_title,
                event_subtitle=event_subtitle,
                date_time_str=date_time_str,
            )
            # Move the file
            sorter.move_file(
                original_filepath=pdf_file,
                new_path=path,
                new_filename=name,
                create_dirs=True,
                overwrite=False,
            )
        except Exception as e:
            # Per-file boundary: log the failure and carry on with the next file
            logger.error("Failed to sort file '%s': %s", filename, e)
            not_sorted_count += 1
        else:
            sorted_count += 1
            logger.info("Successfully sorted file '%s'", filename)

    # Print summary
    separator = "=" * 60
    logger.info("\n" + separator)
    logger.info("Processing Summary:")
    logger.info(separator)
    logger.info("Total PDF files found: %d", len(pdf_files))
    logger.info("Events matched: %d", matched_count)
    logger.info("Events not found: %d", not_found_count)
    logger.info("Files successfully sorted: %d", sorted_count)
    logger.info("Files failed to sort: %d", not_sorted_count)
    logger.info(separator)


def _build_document_index(events):
    """Map each non-empty document payload found in ``events`` to its JSONPath match."""
    jsonpath_expr = parse("$.details.sections[?(@.type=='documents')].data[*]")
    document_index = {}
    for event in events:
        for doc_match in jsonpath_expr.find(event):
            payload = doc_match.value.get("action", {}).get("payload", "")
            if payload:
                # Keep the match object (not just its value) so the enclosing
                # event can be reached later; a repeated payload overwrites
                # the earlier entry
                document_index[payload] = doc_match
    return document_index


def _append_metadata_row(csv_file, row):
    """Append ``row`` to ``csv_file``, writing the header line first if the file is new."""
    file_exists = csv_file.exists()
    with open(csv_file, "a", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        if not file_exists:
            writer.writerow(
                [
                    "filename",
                    "postbox_type",
                    "document_title",
                    "document_detail",
                    "event_title",
                    "event_subtitle",
                ]
            )
        writer.writerow(row)
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the directory to process
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python sort_tr_docs.py <directory>")
        sys.exit(1)

    directory = Path(cli_args[0])
    main(directory)