-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinsert_errors.py
More file actions
115 lines (89 loc) · 5.8 KB
/
insert_errors.py
File metadata and controls
115 lines (89 loc) · 5.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Format /wikiextract/insert_errors.py
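'''Generates grammatically erroneous variants of the POS-tagged Hindi sentences in hindi_pos_tags.txt.

Each erroneous variant is written next to its correct sentence in mono_hi_train.csv.
'''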
import random
import csv
from math import ceil
import ast
# Tokens that insert_single_error never modifies (checked before any other rule).
skip_tokens = ('गे', 'दे', 'ले', 'गा','जाए','जा', 'ला', 'ले', 'पा', 'खा', 'चाहिए', 'समाजवादी', 'विधानसभा',
'थालिपीठे', 'मराठवाड़ा', 'अन्यथा', 'खुफिया', 'अनसुना', 'इन्होंने')
# Groups of related word forms: a token found in one group is replaced by a
# different, randomly chosen member of the same group.
exceptions = (('है', 'हैं'), ('था', 'थे', 'थी', 'थीं'), ('हुआ', 'हुई', 'हुए', 'हुईं'))
# Adjectives listed in their 'ा' form. An ADJ token is looked up here after its
# last character is replaced by adj_endings[0].
conjugated_adj = ('लंबा', 'ऊंचा', 'धीमा', 'महंगा', 'गीला', 'भूरा', 'मोटा', 'हल्का', 'पुराना',
'ताज़ा', 'बुरा', 'फिस्लाहा', 'कड़वा', 'चौड़ा', 'चौड़ा', 'सुखा', 'नमा', 'खट्टा',
'पतला', 'लम्बा', 'अच्छा', 'हरा', 'थोड़ा', 'बड़ा', 'बूढा', 'कड़वा', 'निचा',
'चमकीला', 'मीठा', 'पीला', 'भोला', 'गाढ़ा', 'खुरदुरा', 'ठंडा', 'गंदा', 'तीता',
'सस्ता', 'छोटा', 'नया', 'गीरा', 'सूखा', 'गहरा', 'सीधा', 'खारा', 'दुबला',
'चिपचिपा', 'नीला','तीखा','डरावना','सुनहरा','इकलौता','तीखा','समूचा','पुरा',
'अनूठा', 'सुरीला','ख़रीदा','संकरा','रूखा','अंधा','बहरा','बौना','ठिगना','पैना',
'घना','डरावना','अनूठा','झूठा','इकट्ठा','भरा','अधूरा', 'नुकीला','उबला','ढीला',
'पक्का', 'पहला', 'दूसरा','तीसरा','चौथा','पांचवा','छठा','पचवा','सातवा','आठवा',
'नौवा', 'दसवा')
# Final characters that are swapped for one another on adjective-like tokens.
adj_endings = ('ा', 'े', 'ी')
# Endings referenced by the AUX branch of insert_single_error.
vb_endings = ('ा', 'े', 'ी', 'ीं')
# Suffixes that the AUX branch of insert_single_error looks for at the end of a token.
endings1 = ('या', 'ए', 'ई', 'ईं',)
# NOTE(review): not referenced by any code visible in this file.
endings2 = ('या', 'ये', 'यी', 'यीं')
def random_except(options, choice):
    """Return a random element of ``options`` that is different from ``choice``.

    Every occurrence of ``choice`` is excluded, so the result can never equal
    ``choice`` even when ``options`` contains duplicates. If ``choice`` does
    not occur in ``options`` at all, any element may be returned.

    Args:
        options: Iterable of candidate values.
        choice: The value that must not be returned.

    Returns:
        One randomly selected element of ``options`` other than ``choice``.

    Raises:
        IndexError: If no candidate remains after excluding ``choice``.
    """
    remaining = [option for option in options if option != choice]
    return random.choice(remaining)
def endswith_any(word, endings):
    """Return the first entry of ``endings`` that ``word`` ends with, else ``None``.

    Candidates are tried in the order given, so an earlier entry wins over a
    later one even if both match.
    """
    matching_suffixes = (suffix for suffix in endings if word.endswith(suffix))
    return next(matching_suffixes, None)
def _replace_word(words, index, replacement):
    """Return ``words`` joined by spaces with ``words[index]`` swapped for ``replacement``."""
    return ' '.join(words[:index] + [replacement] + words[index + 1:])


def insert_single_error(sentence):
    """Build erroneous variants of a sentence, each differing in exactly one word.

    Every token is examined in turn; when one of the rules below applies, a
    copy of the sentence with only that token altered is added to the result.
    Rules are tried in this order and the first match wins:

    1. Tokens in ``skip_tokens`` or shorter than two characters are left alone.
    2. A token that belongs to a group in ``exceptions`` is replaced by another
       member of the same group.
    3. Tokens longer than four characters whose last four characters are
       'वाल' plus an ``adj_endings`` character get a different final character.
    4. ADJ tokens whose 'ा' form is listed in ``conjugated_adj`` get a
       different final character.
    5. PRON tokens ending in an ``adj_endings`` character, with 'र' or 'क' as
       the second-to-last character or starting with 'अप', get a different
       final character.
    6. AUX tokens get their ending replaced (see inline comments).

    Args:
        sentence: Sequence of ``(token, tag)`` pairs. It is iterated twice, so
            it must not be a one-shot iterator.

    Returns:
        List of space-joined sentence strings, one per modified token.
    """
    words = [word for word, _ in sentence]
    erroneous_sentences = []
    for i, (token, tag) in enumerate(sentence):
        if token in skip_tokens or len(token) < 2:
            continue

        last_char = token[-1]
        modified_token = None
        matching_exception = next((ex for ex in exceptions if token in ex), None)

        if matching_exception is not None:
            modified_token = random_except(matching_exception, token)
        elif len(token) > 4 and last_char in adj_endings and token[-4:-1] == 'वाल':
            modified_token = token[:-1] + random_except(adj_endings, last_char)
        # The tag is compared with ==. The previous `tag in ('ADJ')` was a
        # substring test against the string 'ADJ' and also matched tags such
        # as 'A', 'AD' or ''.
        elif tag == 'ADJ' and last_char in adj_endings and (token[:-1] + adj_endings[0]) in conjugated_adj:
            modified_token = token[:-1] + random_except(adj_endings, last_char)
        elif tag == 'PRON' and last_char in adj_endings and (token[-2] in ('र', 'क') or token.startswith('अप')):
            modified_token = token[:-1] + random_except(adj_endings, last_char)
        elif tag == 'AUX':
            # NOTE(review): last_char is a single character, so of the two
            # entries in vb_endings[-2:] only 'ी' can ever match here.
            if len(token) == 2 and last_char in vb_endings[-2:]:
                modified_token = token[0] + random.choice(endings1[:2])
            else:
                ending = endswith_any(token, endings1)
                if ending:
                    # Swap the matched ending for a different one from the same
                    # set. Previously this substitute was computed but then
                    # discarded in favour of an unrelated vb_endings value.
                    substitute = random_except(endings1, ending)
                    modified_token = token[:-len(ending)] + substitute

        if modified_token is not None:
            erroneous_sentences.append(_replace_word(words, i, modified_token))
    return erroneous_sentences
def process_sentences(input_file, output_file):
    """Read tagged sentences, generate erroneous variants and write them as CSV.

    Each non-blank line of ``input_file`` is parsed with ``ast.literal_eval``
    and is expected to yield a sequence of ``(word, tag)`` pairs. For every
    variant returned by ``insert_single_error`` one row
    ``[variant, original sentence]`` is written to ``output_file`` below the
    header ``hi_err,hi_corr``.

    Blank lines (for example the one produced by a trailing newline at the end
    of the file) are skipped before parsing; handing an empty string to
    ``ast.literal_eval`` would raise ``SyntaxError``.

    Args:
        input_file: Path of the UTF-8 text file with one Python literal per line.
        output_file: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            non-blank line is not a valid Python literal.
    """
    output_rows = []
    processed = 0
    with open(input_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            sentence = ast.literal_eval(line)
            if not sentence:
                continue
            processed += 1
            untagged_sentence = " ".join(word for word, _ in sentence)
            for variant in insert_single_error(sentence):
                output_rows.append([variant, untagged_sentence])

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["hi_err", "hi_corr"])
        writer.writerows(output_rows)

    # Report the sentences that were actually processed, not the raw line count.
    print(f"Processed {processed} sentences. Results saved to {output_file}.")
if __name__ == "__main__":
    # Script entry point: convert the default tagged corpus into the training CSV.
    process_sentences(input_file="hindi_pos_tags.txt", output_file="mono_hi_train.csv")