-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleaner.py
More file actions
119 lines (90 loc) · 4.95 KB
/
Copy pathcleaner.py
File metadata and controls
119 lines (90 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# cleaner_v2.py
# Safe cleaning for parliamentary transcript-like text.
# - Preserves line breaks (keeps same number of lines).
# - Removes slash/dash technical markers, page-number-only lines, and interrupt-noise like ...(Interruptions)...
# - Keeps English names / normal uppercase names intact.
import re
from pathlib import Path
# ---- CONFIG: change these paths if needed ----
INPUT_PATH = Path("cleaned_parliament_2016_8_3_11-12.txt")
OUTPUT_PATH = Path("cleaned_parliament_2016_8_3_11-12.cleaned_v2.txt")
# ---------------------------------------------

# All patterns below are compiled once at import time and reused for every line.

# Devanagari combining signs (candrabindu/anusvara/visarga, nukta, vowel signs,
# virama, accent marks).  Python's ``\w`` matches letters and digits but NOT
# these combining marks, so ``\w`` / ``\b`` see a "word boundary" in the middle
# of a Hindi word.  Patterns that reason about whole words must add this range
# to their word-character class explicitly.
_DEVANAGARI_MARKS = r'\u0900-\u0903\u093A-\u094F\u0951-\u0957\u0962\u0963'
_WORD_CHAR = r'[\w' + _DEVANAGARI_MARKS + r']'

# remove ( ... / ... ): any parenthetical that contains a slash
RE_PAREN_WITH_SLASH = re.compile(r'\([^)]*\/[^)]*\)', flags=re.UNICODE)

# remove -.../... fragments: a hyphen followed by ASCII tokens joined by '-' or '/'
RE_HYPHEN_SLASH_FRAGMENT = re.compile(
    r'-\s*(?:[A-Za-z0-9]+(?:\s*[-/]\s*[A-Za-z0-9\.]+)+(?:\s*[-/]\s*[A-Za-z0-9\.]+)*)',
    flags=re.UNICODE)

# remove any substring (up to 120 characters on each side) that contains at least one slash
# NOTE(review): the character class includes ASCII letters and whitespace, so this
# also consumes ordinary English words surrounding a '/', not just the marker token.
# Confirm that this is intended before relying on it for English-heavy input.
RE_ANY_CONTAIN_SLASH = re.compile(r'[A-Za-z0-9\-\./\s]{0,120}\/[A-Za-z0-9\-\./\s]{0,120}', flags=re.UNICODE)

# remove dotted noise patterns like ...(Interruptions)... or ...(व्यवधान)...
RE_DOTTED_NOISE = re.compile(r'\.{2,}\s*\(?\s*(Interruptions|व्यवधान)\s*\)?\s*\.{0,}', flags=re.IGNORECASE | re.UNICODE)

# small parenthetical acronyms/noise to strip if they appear alone in parenthesis
RE_SMALL_PARENS_NOISE = re.compile(
    r'\(\s*(?:AIA|AIM|HK/C|HK-SC|HK/SC|KSK/D|Contd\.?|Contd|Contd by|Ends\.)\s*\)',
    flags=re.IGNORECASE | re.UNICODE)

# runs of three or more dots (the caller decides what to replace them with)
RE_MANY_DOTS = re.compile(r'\.{3,}', flags=re.UNICODE)

# detect page-number-only lines: optional '*', digits, nothing else but whitespace
RE_PAGE_NUMBER = re.compile(r'^\s*\*?\d+\s*$')

# helper: collapse repeated identical words like "and and" -> "and".
# Group 1 is the word to keep.  Explicit lookarounds replace ``\b`` so that a
# Devanagari vowel sign counts as part of the word; with plain ``\b`` the text
# "राम मेरा" matched as "म म" and was rewritten to "रामेरा".
RE_DUP_WORDS = re.compile(
    r'(?<!' + _WORD_CHAR + r')(' + _WORD_CHAR + r'+)(?:\s+\1(?!' + _WORD_CHAR + r'))+',
    flags=re.IGNORECASE)

# optional: runs of six or more punctuation/symbol characters (conservative).
# Devanagari combining marks are excluded so that the trailing vowel sign of a
# word is never swallowed together with punctuation that follows it.
RE_LONG_GARBAGE = re.compile(r'[^\w\s' + _DEVANAGARI_MARKS + r']{6,}', flags=re.UNICODE)
def clean_line_preserve_breaks(line: str) -> str:
    """
    Return the cleaned form of one line of transcript text.

    ``line`` is expected without its trailing newline; the result never
    contains one either.  An empty result means nothing was left after
    cleaning; the caller is responsible for emitting a blank line in that
    case so the output keeps the same number of lines as the input.
    """
    def _drop_if_mostly_punctuation(match):
        # Keep a punctuation run only when at least a quarter of it consists
        # of ASCII letters/digits, Latin-extended or Devanagari characters.
        run = match.group(0)
        keepers = re.findall(r'[A-Za-z\u0900-\u097F\u00C0-\u024F0-9]', run)
        ratio = len(keepers) / max(1, len(run))
        return run if ratio >= 0.25 else ''

    # The order of this table matters: each pass sees the output of the
    # previous one.
    substitutions = (
        (RE_PAREN_WITH_SLASH, ''),        # parentheticals containing a slash
        (RE_HYPHEN_SLASH_FRAGMENT, ''),   # "-xx/yy" style fragments
        (RE_ANY_CONTAIN_SLASH, ''),       # any remaining slash-bearing span
        (RE_DOTTED_NOISE, ''),            # ...(Interruptions)... markers
        (RE_SMALL_PARENS_NOISE, ''),      # short parenthetical acronyms
        (RE_MANY_DOTS, '...'),            # long dot runs -> one ellipsis
        (RE_DUP_WORDS, r'\1'),            # "and and" -> "and"
        (RE_LONG_GARBAGE, _drop_if_mostly_punctuation),  # punctuation junk
    )

    text = line
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    # Finally squeeze runs of spaces/tabs and trim both ends.
    squeezed = re.sub(r'[ \t]{2,}', ' ', text)
    return squeezed.strip()
def clean_file(input_path: Path, output_path: Path):
    """
    Clean ``input_path`` line by line and write the result to ``output_path``.

    Both files are opened as UTF-8 text.  Every input line yields exactly one
    output line: page-number-only lines and lines that clean down to nothing
    are written as a bare newline, so the line count is unchanged.  A
    confirmation message is printed once the output has been written.
    """
    def _render(raw_line):
        # Only an unterminated final line lacks the trailing '\n'.
        terminator = '\n' if raw_line.endswith('\n') else ''
        body = raw_line[:-1] if terminator else raw_line

        # Page numbers are blanked without running the cleaner at all.
        if RE_PAGE_NUMBER.match(body):
            return '\n'

        cleaned = clean_line_preserve_breaks(body)
        if not cleaned:
            # Nothing survived cleaning: keep the slot as a blank line.
            return '\n'
        return cleaned + terminator

    with input_path.open('r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    rendered = [_render(raw_line) for raw_line in raw_lines]

    with output_path.open('w', encoding='utf-8') as sink:
        sink.writelines(rendered)
    print("✅ Done. Cleaned file saved to:", output_path)
if __name__ == "__main__":
    # Script entry point: clean the configured input file into the configured output file.
    clean_file(input_path=INPUT_PATH, output_path=OUTPUT_PATH)