# Recovered from a git format-patch; the patch metadata is kept below as
# comments so the provenance of the two files is not lost.
#
# From e4d95c202d1c779168b9505387fb96efd5e4b8c3 Mon Sep 17 00:00:00 2001
# From: vishnu vinay
# Date: Mon, 5 May 2025 12:55:07 +0530
# Subject: [PATCH] Task #239065: Python LC for te,kn
# ---
#  lc - python/configs.py        |  21 +++++++
#  lc - python/langcomplexity.py | 105 ++++++++++++++++++++++++++++++++++
#  2 files changed, 126 insertions(+)
#  create mode 100644 lc - python/configs.py
#  create mode 100644 lc - python/langcomplexity.py

# ---------------------------------------------------------------------------
# diff --git a/lc - python/configs.py b/lc - python/configs.py
# new file mode 100644
# index 0000000..8d6e3a7
# --- /dev/null
# +++ b/lc - python/configs.py
# @@ -0,0 +1,21 @@
# ---------------------------------------------------------------------------

# Per-language scoring tables, keyed by language code ("te", "kn").
#
# Each language entry holds:
#   "score"     - per-character table.  As consumed by get_score():
#                   'Weight'      is added once for every occurrence of the
#                                 character in a word;
#                   'Weight_base' is added for characters that take part in a
#                                 conjunct (i.e. sit next to the virama);
#                   'similar'     > 0 adds a flat 0.9, once per character per
#                                 syllable.
#                 NOTE(review): 'similar' presumably marks glyphs that look
#                 like another letter - confirm with the data owner.
#   "virama"    - the virama sign of the script.
#   "regex"     - character-class fragments used to split a word into
#                 syllables ("letter", "trailing_letter", "control").
#   "arkavattu" - (only present for "kn") the consonant that earns the
#                 arkavattu bonus when it leads a conjunct.
language_data = {
    "te": {
        "score": {
            'అ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ఆ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ఇ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ఈ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ఉ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1},
            'ఊ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ఋ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 0},
            'ౠ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1},
            'ఎ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ఏ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ఐ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ఒ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ఓ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ఔ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'క': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ఖ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 0},
            'గ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ఘ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'ఙ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1},
            'చ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ఛ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'జ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ఝ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'ఞ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1},
            'ట': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ఠ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'డ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ఢ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'ణ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0},
            'త': {'Weight': 0.2, 'Weight_base': 2, 'similar': 0},
            'థ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'ద': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ధ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'న': {'Weight': 0.3, 'Weight_base': 2, 'similar': 0},
            'ప': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ఫ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'బ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'భ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'మ': {'Weight': 0.3, 'Weight_base': 2, 'similar': 1},
            'య': {'Weight': 0.1, 'Weight_base': 2, 'similar': 1},
            'ర': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0},
            'ఱ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ల': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0},
            'ళ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0},
            'వ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 1},
            'శ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0},
            'ష': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'స': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0},
            'హ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0},
            '్': {'Weight': 1.2, 'Weight_base': 0, 'similar': 0},
            'ా': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ి': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ీ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ు': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ూ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ృ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 0},
            'ౄ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 0},
            'ె': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ే': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ై': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ొ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ో': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ౌ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ం': {'Weight': 0.3, 'Weight_base': 0, 'similar': 0},
            'ః': {'Weight': 0.3, 'Weight_base': 0, 'similar': 0},
        },
        "virama": '్',
        "regex": {
            "letter": r'[\u0C00-\u0C7F]',
            "trailing_letter": r'[\u0C00-\u0C04\u0C3E-\u0C56\u0C62-\u0C63]',
            "control": r'\u0C4D',
        },
    },
    "kn": {
        "score": {
            'ಅ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1},
            'ಆ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ಇ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ಈ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ಉ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1},
            'ಊ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ಋ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1},
            'ೠ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1},
            'ಎ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ಏ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ಐ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ಒ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ಓ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ಔ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ಕ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ಖ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0},
            'ಗ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ಘ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'ಙ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1},
            'ಚ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ಛ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'ಜ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 1},
            'ಝ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'ಞ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1},
            'ಟ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ಠ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'ಡ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ಢ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'ಣ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0},
            'ತ': {'Weight': 0.2, 'Weight_base': 2, 'similar': 0},
            'ಥ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'ದ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ಧ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'ನ': {'Weight': 0.3, 'Weight_base': 2, 'similar': 0},
            'ಪ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0},
            'ಫ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'ಬ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0},
            'ಭ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1},
            'ಮ': {'Weight': 0.3, 'Weight_base': 2, 'similar': 0},
            'ಯ': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0},
            'ರ': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0},
            'ಱ': {'Weight': 0.3, 'Weight_base': 0, 'similar': 0},
            'ಲ': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0},
            'ಳ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0},
            'ವ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 1},
            'ಶ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0},
            'ಷ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1},
            'ಸ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0},
            'ಹ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0},
            'ೞ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0},
            '್': {'Weight': 1.2, 'Weight_base': 0, 'similar': 0},
            'ಾ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0},
            'ಿ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ೀ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ು': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0},
            'ೂ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 2},
            'ೃ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1},
            'ೆ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1},
            'ೇ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1},
            'ೈ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ೊ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 2},
            'ೋ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 2},
            'ೌ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1},
            'ಂ': {'Weight': 2.0, 'Weight_base': 0, 'similar': 0},
            'ಃ': {'Weight': 2.0, 'Weight_base': 0, 'similar': 0},
        },
        "virama": '್',
        "regex": {
            "letter": r'[\u0C80-\u0CFF]',
            "trailing_letter": r'[\u0C81-\u0C83\u0CBC\u0CBE-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3]',
            "control": r'\u0CCD',  # virama
        },
        "arkavattu": 'ರ',
    },
}

# ---------------------------------------------------------------------------
# diff --git a/lc - python/langcomplexity.py b/lc - python/langcomplexity.py
# new file mode 100644
# index 0000000..1ea23a6
# --- /dev/null
# +++ b/lc - python/langcomplexity.py
# @@ -0,0 +1,105 @@
# ---------------------------------------------------------------------------
import regex
import configs
import pandas as pd  # NOTE(review): not referenced by any code visible here
import unicodedata

# Language whose tables are used by every function below; edit to switch.
language = 'kn'
# Sample word scored when the module is run.
word = 'ಸಾಮರ್ಥ್ಯ'

# All per-language data, and the per-character weight table for `language`.
config = configs.language_data
scores = config[language]["score"]


def find_syllables(word):
    """Split *word* into syllables using the active language's regex parts.

    A syllable is one "letter" followed by any run of either
    "control" + "letter" pairs or single "trailing_letter" characters,
    e.g. ಸಾಮರ್ಥ್ಯ ==> ['ಸಾ', 'ಮ', 'ರ್ಥ್ಯ'].

    Returns the list of matched syllables in order of appearance.
    """
    fragments = config[language]["regex"]
    base = fragments["letter"]
    mark = fragments["trailing_letter"]
    joiner = fragments["control"]
    syllable_pattern = f"{base}(?:{joiner}{base}|{mark})*"
    return regex.findall(syllable_pattern, word)


def is_samyukta(syllable):
    """Return the characters of *syllable* that sit next to a virama.

    A character is collected when the character immediately before it or
    immediately after it is the language's virama.  An empty list means the
    syllable is not a samyuktakshara (conjunct).
    """
    halant = config[language]["virama"]
    last_index = len(syllable) - 1

    def touches_virama(pos):
        after_virama = pos > 0 and syllable[pos - 1] == halant
        before_virama = pos < last_index and syllable[pos + 1] == halant
        return after_virama or before_virama

    return [ch for pos, ch in enumerate(syllable) if touches_virama(pos)]


# This function checks whether a syllable has arkavattu and returns the score.
def add_arkavattu_score(samyukta_arr):
    """Return the arkavattu bonus for the consonants of one conjunct.

    Args:
        samyukta_arr: consonant list as returned by ``is_samyukta``.

    Returns:
        1.5 when the first consonant equals the active language's
        ``"arkavattu"`` character, otherwise 0.  Also 0 when the list is
        empty or the language defines no arkavattu (e.g. ``"te"``), instead
        of raising ``IndexError`` / ``KeyError``.
    """
    arkavattu = config[language].get("arkavattu")
    if arkavattu is not None and samyukta_arr and samyukta_arr[0] == arkavattu:
        return 1.5
    return 0


def _debug(message, *args):
    """Send a diagnostic record to the module logger (replaces bare prints)."""
    # Imported here so this block does not depend on the file's import header.
    import logging

    logging.getLogger(__name__).debug(message, *args)


def _weight_base(char):
    """Return the 'Weight_base' of *char*, or 0 if it has no score entry."""
    entry = scores.get(char)
    return entry['Weight_base'] if entry is not None else 0


def _samyukta_score(consonants):
    """Return the conjunct bonus for *consonants*, or None if not a conjunct.

    * two identical consonants: that consonant's Weight_base + 1
    * two different consonants: both Weight_base values + 2
    * three or more: Weight_base of the first three + 3
    """
    count = len(consonants)
    if count < 2:
        return None
    if count == 2:
        first, second = consonants
        if first == second:
            return _weight_base(first) + 1
        return _weight_base(first) + _weight_base(second) + 2
    return sum(_weight_base(char) for char in consonants[:3]) + 3


def get_score(word):
    """Compute the complexity score of a single word.

    Spaces are removed and the word is NFC-normalised before it is split
    with ``find_syllables``.  The score is the sum of:

    * a length weight: 2 for exactly four syllables, 4 for more than four;
    * per syllable, the arkavattu bonus (``add_arkavattu_score``);
    * per syllable, the conjunct bonus (see ``_samyukta_score``);
    * per character, its 'Weight';
    * 0.9 for each character whose 'similar' is > 0, counted once per
      character per syllable.

    Characters that the syllable regex accepts but that have no entry in the
    score table (for example the Kannada nukta U+0CBC, which is listed in
    ``trailing_letter`` but not in ``score``) are skipped instead of raising
    ``KeyError``.

    Args:
        word: the word to score.

    Returns:
        ``(score, addedscores)`` - the total rounded to 2 decimals, and the
        list of individual contributions in the order they were added.
    """
    word = unicodedata.normalize('NFC', word.replace(" ", ""))
    syllables = find_syllables(word)
    _debug("syllables: %s", syllables)

    total = 0
    addedscores = []

    def add(points):
        nonlocal total
        total += points
        addedscores.append(points)

    syllable_count = len(syllables)
    if syllable_count == 4:      # exactly four syllables: weight of 2
        add(2)
    elif syllable_count > 4:     # more than four syllables: weight of 4
        add(4)

    for syllable in syllables:
        # The "similar" bonus is tracked per syllable, not per word.
        seen_similar = set()
        consonants = is_samyukta(syllable)
        _debug("conjunct consonants of %r: %s", syllable, consonants)

        arka_score = add_arkavattu_score(consonants)
        if arka_score > 0:
            add(arka_score)

        conjunct_bonus = _samyukta_score(consonants)
        if conjunct_bonus is not None:
            add(conjunct_bonus)

        for char in syllable:
            entry = scores.get(char)
            if entry is None:
                _debug("no score entry for %r; character skipped", char)
                continue
            add(entry['Weight'])
            if entry['similar'] > 0 and char not in seen_similar:
                add(0.9)
                seen_similar.add(char)

    return round(total, 2), addedscores


def score(content):
    """Score every whitespace-separated word of *content*.

    Characters that are neither word characters nor whitespace are removed
    first; each remaining word is scored with ``get_score``.

    Args:
        content: text containing one or more words.

    Returns:
        ``(final_scores, scores_data)`` - the sum of the word scores rounded
        to 2 decimals (so float noise such as 7.300000000000001 does not leak
        out), and a list holding each word's contribution list.
    """
    content = regex.sub(r'[^\w\s]', '', content)
    _debug("cleaned content: %s", content)

    final_scores = 0
    scores_data = []
    for token in content.split():
        word_score, contributions = get_score(token)
        final_scores += word_score
        scores_data.append(contributions)

    return round(final_scores, 2), scores_data


# Run the sample only when executed as a script, not when imported.
if __name__ == "__main__":
    print(score(word))