-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmonitor.py
More file actions
112 lines (106 loc) · 4.64 KB
/
monitor.py
File metadata and controls
112 lines (106 loc) · 4.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from ocr import process_pdf, process_image, process_text, process_docx, process_xlsx, process_csv
from docx import Document
import pandas as pd
class Watcher(FileSystemEventHandler):
    """
    Watchdog event handler that tags newly created files of supported types.

    For each created file whose name ends with a known extension, the matching
    ``process_*`` function extracts its text, ``extract_tags`` turns the text
    into tags, and the result is stored in ``tagged_data`` under the file path.
    """

    # Lower-case filename suffix -> (label used in the log line, text extractor).
    _HANDLERS = {
        ".pdf": ("PDF", process_pdf),
        ".jpg": ("image", process_image),
        ".png": ("image", process_image),
        ".txt": ("text file", process_text),
        ".docx": ("Word document", process_docx),
        ".xlsx": ("Excel file", process_xlsx),
        ".csv": ("CSV file", process_csv),
    }

    def __init__(self, tagged_data):
        """
        :param tagged_data: dict: Mapping of file path -> tags, updated in place.
        """
        super().__init__()
        self.tagged_data = tagged_data

    def on_created(self, event):
        """
        Tag a newly created file if its extension is supported.
        :param event: The watchdog event; ``src_path`` is the created path.
        """
        path = event.src_path
        # Skip directories (a folder may be named "x.pdf") and known paths.
        if event.is_directory or path in self.tagged_data:
            return

        lowered = path.lower()  # so ".PDF" / ".Jpg" match as well
        for suffix, (label, extractor) in self._HANDLERS.items():
            if lowered.endswith(suffix):
                break
        else:
            return  # unsupported file type

        print(f"New {label} found: {path}")
        try:
            tags = extract_tags(extractor(path))
        except Exception as exc:
            # This callback runs on the observer's dispatch thread; letting an
            # error from one unreadable file escape would end that thread and
            # stop monitoring, so report it and carry on.
            print(f"Failed to process {path}: {exc}")
            return
        self.tagged_data[path] = tags
        print(tags)
def monitor_directory(path, tagged_data):
    """
    Start watching a directory (non-recursively) for newly created files.
    :param path: str: The directory to watch.
    :param tagged_data: dict: Dictionary the watcher updates with tagged data.
    :return: Observer: The started observer, so the caller can ``stop()`` and
        ``join()`` it on shutdown. Callers that ignore the result are unaffected.
    """
    observer = Observer()
    observer.schedule(Watcher(tagged_data), path=path, recursive=False)
    observer.start()
    print("Monitoring started...")
    # NOTE(review): this returns right after start(); the caller presumably
    # has to keep the process alive for monitoring to continue - confirm.
    return observer
def _extractor_for(filename):
    """
    Pick the text-extraction function for a file based on its extension.
    :param filename: str: File name (or path) to inspect.
    :return: The matching ``process_*`` function, or None if the extension
        is not supported.
    """
    extractors = {
        ".pdf": process_pdf,
        ".jpg": process_image,
        ".png": process_image,
        ".txt": process_text,
        ".docx": process_docx,
        ".xlsx": process_xlsx,
        ".csv": process_csv,
    }
    lowered = filename.lower()  # so ".PDF" / ".Jpg" match as well
    for suffix, extractor in extractors.items():
        if lowered.endswith(suffix):
            return extractor
    return None


def process_existing_files(directory, tagged_data):
    """
    Function to process all existing files in the directory.

    Only direct children of ``directory`` are considered. Sub-directories and
    paths already present in ``tagged_data`` are skipped. Files with an
    unsupported extension are announced but not tagged. Errors raised by an
    extractor or by ``extract_tags`` propagate to the caller.
    :param directory: str: The directory path to check for existing files.
    :param tagged_data: dict: Dictionary to update with tagged data.
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path) or file_path in tagged_data:
            continue
        print(f"Processing existing file: {file_path}")
        extractor = _extractor_for(filename)
        if extractor is None:
            continue
        tags = extract_tags(extractor(file_path))
        tagged_data[file_path] = tags
        print(tags)
_NLP_MODEL = None  # spaCy pipeline, loaded on first use and then reused


def _get_nlp():
    """
    Return the shared spaCy pipeline, loading "en_core_web_sm" on first use.
    :return: The loaded spaCy pipeline.
    """
    global _NLP_MODEL
    if _NLP_MODEL is None:
        # Imported here so spaCy is only needed once tagging actually happens.
        import spacy
        _NLP_MODEL = spacy.load("en_core_web_sm")
    return _NLP_MODEL


def extract_tags(text):
    """
    Function to extract named entities from the text using SpaCy.

    The model is loaded once and reused, rather than reloaded for every
    document. Empty or missing text yields an empty dictionary without
    loading the model.
    :param text: str: The extracted text from a document.
    :return: dict: Dictionary mapping entity text to its label (e.g., PERSON, DATE).
        If the same entity text occurs more than once, the last label wins.
    """
    if not text:
        return {}
    doc = _get_nlp()(text)
    return {ent.text: ent.label_ for ent in doc.ents}
# Other functions for processing documents remain unchanged...