# gemma2hin/insert_errors.py at main · kashvigarg/gemma2hin · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Format /wikiextract/insert_errors.py
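'''Inserts single grammatical errors into POS-tagged Hindi sentences.

Reads the tagged sentences from hindi_pos_tags.txt and writes
(erroneous sentence, original sentence) pairs to mono_hi_train.csv.
'''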

import random
import csv
from math import ceil
import ast

# Tokens that insert_single_error never modifies; this check runs before
# every other rule.
skip_tokens = ('गे', 'दे', 'ले', 'गा','जाए','जा', 'ला', 'ले', 'पा', 'खा', 'चाहिए', 'समाजवादी', 'विधानसभा',
               'थालिपीठे', 'मराठवाड़ा', 'अन्यथा', 'खुफिया', 'अनसुना', 'इन्होंने')
# Groups of interchangeable forms. A token found in one of these groups is
# replaced by a different, randomly chosen member of the same group.
exceptions = (('है', 'हैं'), ('था', 'थे', 'थी', 'थीं'), ('हुआ', 'हुई', 'हुए', 'हुईं'))
# Adjectives whose final character may be swapped. An ADJ token is looked up
# here after its last character has been replaced by adj_endings[0], so the
# entries are listed in that form. A few entries are repeated; duplicates do
# not affect the membership test.
conjugated_adj = ('लंबा', 'ऊंचा', 'धीमा', 'महंगा', 'गीला', 'भूरा', 'मोटा', 'हल्का', 'पुराना',
                  'ताज़ा', 'बुरा', 'फिस्लाहा', 'कड़वा', 'चौड़ा', 'चौड़ा', 'सुखा', 'नमा', 'खट्टा',
                  'पतला', 'लम्बा', 'अच्छा', 'हरा', 'थोड़ा', 'बड़ा', 'बूढा', 'कड़वा', 'निचा',
                  'चमकीला', 'मीठा', 'पीला', 'भोला', 'गाढ़ा', 'खुरदुरा', 'ठंडा', 'गंदा', 'तीता',
                  'सस्ता', 'छोटा', 'नया', 'गीरा', 'सूखा', 'गहरा', 'सीधा', 'खारा', 'दुबला',
                  'चिपचिपा', 'नीला','तीखा','डरावना','सुनहरा','इकलौता','तीखा','समूचा','पुरा',
                  'अनूठा', 'सुरीला','ख़रीदा','संकरा','रूखा','अंधा','बहरा','बौना','ठिगना','पैना',
                  'घना','डरावना','अनूठा','झूठा','इकट्ठा','भरा','अधूरा', 'नुकीला','उबला','ढीला',
                  'पक्का', 'पहला', 'दूसरा','तीसरा','चौथा','पांचवा','छठा','पचवा','सातवा','आठवा',
                  'नौवा', 'दसवा')
# Final characters that are swapped for one another in the adjective-like
# rules (presumably gender/number agreement markers -- confirm with the author).
adj_endings = ('ा', 'े', 'ी')
# Endings consulted by the AUX rule in insert_single_error.
vb_endings = ('ा', 'े', 'ी', 'ीं')
# Multi-character endings matched against AUX tokens via endswith_any; the
# first two entries are also used as replacements for two-character AUX tokens.
endings1 = ('या', 'ए', 'ई', 'ईं',)
# NOTE(review): endings2 is not referenced anywhere in the visible code.
endings2 = ('या', 'ये', 'यी', 'यीं')

def random_except(options, choice):
    """Return a random element of ``options`` other than ``choice``.

    ``options`` is copied, so the caller's collection is never mutated.
    Only the first occurrence of ``choice`` is dropped from the copy; if
    ``choice`` is absent, every option stays eligible.

    Raises:
        IndexError: if nothing is left to choose from.
    """
    pool = list(options)
    try:
        pool.remove(choice)
    except ValueError:
        # ``choice`` was not among the options; keep the pool unchanged.
        pass
    return random.choice(pool)

def endswith_any(word, endings):
    """Return the first entry of ``endings`` that ``word`` ends with.

    Entries are tested in the order given. ``None`` is returned when no
    entry matches.
    """
    matches = (suffix for suffix in endings if word.endswith(suffix))
    return next(matches, None)

def insert_single_error(sentence):
    """Build erroneous variants of a tagged sentence, one error per variant.

    Each token is examined in turn. When a rule below applies, a copy of the
    sentence is produced in which only that token is replaced by a randomly
    chosen alternative form; every other token is left as it was.

    Rules, tried in order (the first matching rule wins):
      1. Tokens in ``skip_tokens`` or shorter than two characters are skipped.
      2. A token listed in an ``exceptions`` group is replaced by another
         member of that group.
      3. A token longer than four characters that ends in 'वाल' followed by
         one of ``adj_endings`` gets a different ending from ``adj_endings``.
      4. An ``ADJ`` token ending in one of ``adj_endings`` whose
         ``adj_endings[0]`` form is in ``conjugated_adj`` gets a different
         ending from ``adj_endings``.
      5. A ``PRON`` token ending in one of ``adj_endings`` whose second-to-last
         character is 'र' or 'क', or which starts with 'अप', gets a different
         ending from ``adj_endings``.
      6. An ``AUX`` token: a two-character token whose last character is in
         ``vb_endings[-2:]`` has that character replaced by one of the first
         two ``endings1`` entries; otherwise a token ending in one of
         ``endings1`` has that ending replaced by a different ``endings1``
         entry.

    Args:
        sentence: Sequence of ``(token, tag)`` pairs. It is iterated twice,
            so it must not be a one-shot iterator.

    Returns:
        A list of space-joined sentences, one for each token that matched a
        rule. The list is empty when no token matched.
    """
    words = [token for token, _ in sentence]
    erroneous_sentences = []

    for i, (token, tag) in enumerate(sentence):
        if token in skip_tokens or len(token) < 2:
            continue

        last = token[-1]
        modified_token = None
        matching_exception = next((ex for ex in exceptions if token in ex), None)

        if matching_exception is not None:
            modified_token = random_except(matching_exception, token)

        elif len(token) > 4 and last in adj_endings and token[-4:-1] == 'वाल':
            modified_token = token[:-1] + random_except(adj_endings, last)

        # Compare the tag with ==: ``tag in ('ADJ')`` is a substring test on
        # the string 'ADJ' (no trailing comma, so not a tuple) and would also
        # accept tags such as '' or 'A'.
        elif tag == 'ADJ' and last in adj_endings and (token[:-1] + adj_endings[0]) in conjugated_adj:
            modified_token = token[:-1] + random_except(adj_endings, last)

        elif tag == 'PRON' and last in adj_endings and (token[-2] in ('र', 'क') or token.startswith('अप')):
            modified_token = token[:-1] + random_except(adj_endings, last)

        elif tag == 'AUX':
            # ``last`` is a single character, so of the two entries in
            # vb_endings[-2:] only the single-character one can match here.
            if len(token) == 2 and last in vb_endings[-2:]:
                modified_token = token[0] + random.choice(endings1[:2])
            else:
                ending = endswith_any(token, endings1)
                if ending:
                    # Swap the matched ending for a different one from the
                    # same family. Drawing from vb_endings here would ignore
                    # the matched ending entirely and glue a bare vowel sign
                    # onto the stem.
                    modified_token = token[:-len(ending)] + random_except(endings1, ending)

        if modified_token is not None:
            erroneous_sentences.append(' '.join(words[:i] + [modified_token] + words[i + 1:]))

    return erroneous_sentences

def process_sentences(input_file, output_file):
    """Create a CSV of (erroneous, correct) sentence pairs from a tagged corpus.

    Every non-blank line of ``input_file`` is parsed with
    ``ast.literal_eval`` and is expected to yield a sequence of
    ``(word, tag)`` pairs. For each variant returned by
    ``insert_single_error`` one row is written, pairing the variant with the
    original sentence (the words joined by single spaces).

    Args:
        input_file: Path of the UTF-8 text file, one tagged sentence per line.
        output_file: Path of the CSV file to write. It starts with the header
            row ``hi_err,hi_corr``.

    Raises:
        SyntaxError, ValueError: if a non-blank line is not a valid Python
            literal.
    """
    output_rows = []
    processed = 0

    with open(input_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            # Skip blank lines BEFORE parsing: ast.literal_eval('') raises
            # SyntaxError, which a trailing newline would otherwise trigger.
            if not line:
                continue

            sentence = ast.literal_eval(line)
            if not sentence:
                continue
            processed += 1

            untagged_sentence = " ".join(word for word, _ in sentence)
            for variant in insert_single_error(sentence):
                output_rows.append([variant, untagged_sentence])

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["hi_err", "hi_corr"])
        writer.writerows(output_rows)

    # Report the sentences actually processed, not the raw line count.
    print(f"Processed {processed} sentences. Results saved to {output_file}.")


if __name__ == "__main__":
    # Script entry point: read the tagged corpus and write the training CSV,
    # both relative to the current working directory.
    input_file, output_file = "hindi_pos_tags.txt", "mono_hi_train.csv"
    process_sentences(input_file, output_file)