Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions lc - python/configs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Per-language configuration, keyed by language code ("te" and "kn" below).
# Every language entry provides:
#   "score"  - per-character table; each entry has 'Weight', 'Weight_base'
#              and 'similar' fields
#   "virama" - the virama character of the script
#   "regex"  - regular-expression fragments: "letter" (any code point of the
#              script's Unicode block), "trailing_letter" (code points allowed
#              to follow a base letter) and "control" (the virama code point)
# The "kn" entry additionally defines "arkavattu".
language_data = {
# "te": code points from U+0C00-U+0C7F (Telugu Unicode block)
"te":{
"score":{'అ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ఆ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ఇ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ఈ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ఉ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1}, 'ఊ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ఋ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 0}, 'ౠ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1}, 'ఎ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ఏ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ఐ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ఒ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ఓ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ఔ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'క': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ఖ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 0}, 'గ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ఘ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'ఙ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1}, 'చ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ఛ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'జ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ఝ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'ఞ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1}, 'ట': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ఠ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'డ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ఢ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'ణ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0}, 'త': {'Weight': 0.2, 'Weight_base': 2, 'similar': 0}, 'థ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'ద': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ధ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'న': {'Weight': 0.3, 'Weight_base': 2, 'similar': 0}, 'ప': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ఫ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'బ': {'Weight': 0.1, 'Weight_base': 1, 
'similar': 0}, 'భ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'మ': {'Weight': 0.3, 'Weight_base': 2, 'similar': 1}, 'య': {'Weight': 0.1, 'Weight_base': 2, 'similar': 1}, 'ర': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0}, 'ఱ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ల': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0}, 'ళ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0}, 'వ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 1}, 'శ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0}, 'ష': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'స': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0}, 'హ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0}, '్': {'Weight': 1.2, 'Weight_base': 0, 'similar': 0}, 'ా': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ి': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ీ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ు': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ూ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ృ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 0}, 'ౄ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 0}, 'ె': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ే': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ై': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ొ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ో': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ౌ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ం': {'Weight': 0.3, 'Weight_base': 0, 'similar': 0}, 'ః': {'Weight': 0.3, 'Weight_base': 0, 'similar': 0}},
"virama":'్',
"regex": {
"letter": r'[\u0C00-\u0C7F]',
"trailing_letter": r'[\u0C00-\u0C04\u0C3E-\u0C56\u0C62-\u0C63]',
"control": r'\u0C4D'
}
},
# "kn": code points from U+0C80-U+0CFF (Kannada Unicode block)
"kn": {
"score":{'ಅ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1}, 'ಆ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ಇ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ಈ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ಉ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1}, 'ಊ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ಋ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1}, 'ೠ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1}, 'ಎ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ಏ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ಐ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ಒ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ಓ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ಔ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ಕ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ಖ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0}, 'ಗ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ಘ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'ಙ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1}, 'ಚ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ಛ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'ಜ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 1}, 'ಝ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'ಞ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 1}, 'ಟ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ಠ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'ಡ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ಢ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'ಣ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0}, 'ತ': {'Weight': 0.2, 'Weight_base': 2, 'similar': 0}, 'ಥ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'ದ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 0}, 'ಧ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'ನ': {'Weight': 0.3, 'Weight_base': 2, 'similar': 0}, 'ಪ': {'Weight': 0.2, 'Weight_base': 1, 'similar': 0}, 'ಫ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'ಬ': {'Weight': 0.1, 'Weight_base': 1, 
'similar': 0}, 'ಭ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 1}, 'ಮ': {'Weight': 0.3, 'Weight_base': 2, 'similar': 0}, 'ಯ': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0}, 'ರ': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0}, 'ಱ': {'Weight': 0.3, 'Weight_base': 0, 'similar': 0}, 'ಲ': {'Weight': 0.1, 'Weight_base': 2, 'similar': 0}, 'ಳ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0}, 'ವ': {'Weight': 0.1, 'Weight_base': 1, 'similar': 1}, 'ಶ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0}, 'ಷ': {'Weight': 0.5, 'Weight_base': 1, 'similar': 1}, 'ಸ': {'Weight': 0.4, 'Weight_base': 1, 'similar': 0}, 'ಹ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0}, 'ೞ': {'Weight': 0.3, 'Weight_base': 1, 'similar': 0}, '್': {'Weight': 1.2, 'Weight_base': 0, 'similar': 0}, 'ಾ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 0}, 'ಿ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ೀ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ು': {'Weight': 0.1, 'Weight_base': 0, 'similar': 0}, 'ೂ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 2}, 'ೃ': {'Weight': 0.4, 'Weight_base': 0, 'similar': 1}, 'ೆ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1}, 'ೇ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 1}, 'ೈ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ೊ': {'Weight': 0.1, 'Weight_base': 0, 'similar': 2}, 'ೋ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 2}, 'ೌ': {'Weight': 0.2, 'Weight_base': 0, 'similar': 1}, 'ಂ': {'Weight': 2.0, 'Weight_base': 0, 'similar': 0}, 'ಃ': {'Weight': 2.0, 'Weight_base': 0, 'similar': 0}},
"virama":'್',
"regex":{
"letter":r'[\u0C80-\u0CFF]',
# NOTE: the \u0CBE-\u0CCD range below includes the virama (\u0CCD) itself.
"trailing_letter":r'[\u0C81-\u0C83\u0CBC\u0CBE-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3]',
"control":r'\u0CCD' #virama
},
# Consonant compared against the first conjunct consonant of a syllable
# when the arkavattu bonus is evaluated.
"arkavattu":'ರ'
}
}
105 changes: 105 additions & 0 deletions lc - python/langcomplexity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import regex
import configs
import pandas as pd
import unicodedata

# Active language code; it is used as a key into configs.language_data
# (the entries visible there are 'te' and 'kn').
language = 'kn' # change the language.
# Sample input that is scored and printed by the last statement of this file.
word='ಸಾಮರ್ಥ್ಯ'



config = configs.language_data # Language-based data (score tables, virama, regex fragments)
scores = config[language]["score"] # Per-character weight table for the selected language

def find_syllables(word):
    """Split *word* into syllables for the active ``language``.

    A syllable is one base letter followed by any number of
    "control + letter" pairs (virama-joined letters) or trailing letters,
    as described by the regex fragments in ``config[language]["regex"]``.
    Characters that do not match the ``letter`` class are not part of any
    returned syllable.

    Example from the original author (language ``'kn'``):
    ``ಸಾಮರ್ಥ್ಯ ==> ['ಸಾ', 'ಮ', 'ರ್ಥ್ಯ']``

    Returns:
        The list of matched syllable strings, in order of appearance.
    """
    base = config[language]["regex"]["letter"]
    dependent = config[language]["regex"]["trailing_letter"]
    joiner = config[language]["regex"]["control"]
    # base letter, then zero or more of: (joiner + base letter) | dependent sign
    syllable_pattern = rf'{base}(?:{joiner}{base}|{dependent})*'
    return regex.findall(syllable_pattern, word)

def is_samyukta(syllable):
    """Return the characters of *syllable* that are adjacent to a virama.

    A character is collected when the character immediately before it or
    immediately after it equals the virama of the active ``language``.
    For a samyuktakshara (conjunct) these are the consonants it is built
    from, e.g. ``'ರ್ಥ್ಯ'`` gives ``['ರ', 'ಥ', 'ಯ']``.

    Despite the ``is_`` prefix the result is a list, not a boolean; an
    empty list means the syllable contains no virama-joined characters.
    """
    halant = config[language]["virama"]
    last_index = len(syllable) - 1

    def touches_halant(pos):
        # True when the neighbour on either side of ``pos`` is the virama.
        after_halant = pos > 0 and syllable[pos - 1] == halant
        before_halant = pos < last_index and syllable[pos + 1] == halant
        return after_halant or before_halant

    return [ch for pos, ch in enumerate(syllable) if touches_halant(pos)]

def add_arkavattu_score(samyukta_arr):
    """Return the arkavattu bonus for the conjunct consonants of a syllable.

    Args:
        samyukta_arr: Consonants of one syllable, as returned by
            ``is_samyukta``.

    Returns:
        ``1.5`` when the first consonant equals the ``"arkavattu"``
        character configured for the active ``language``; otherwise ``0``.
        ``0`` is also returned when ``samyukta_arr`` is empty or the
        language defines no arkavattu, instead of raising
        ``IndexError`` / ``KeyError``.
    """
    arkavattu = config[language].get("arkavattu")
    if arkavattu is None or not samyukta_arr:
        return 0
    return 1.5 if samyukta_arr[0] == arkavattu else 0

def _weight_base(char):
    """Return the 'Weight_base' of *char*, or 0 when it has no score entry."""
    return scores.get(char, {}).get('Weight_base', 0)


def _conjunct_bonus(consonants):
    """Return the samyuktakshara bonus for *consonants*, or None if there is none."""
    count = len(consonants)
    if count < 2:
        return None
    if count == 2 and consonants[0] == consonants[1]:
        # Doubled consonant: its Weight_base is counted once, plus 1.
        return _weight_base(consonants[0]) + 1
    # Two different consonants: both Weight_base values plus 2.
    # More than two: only the first three Weight_base values plus 3.
    considered = consonants[:3]
    return sum(_weight_base(c) for c in considered) + len(considered)


def get_score(word):
    """Compute the complexity score of a single word in the active ``language``.

    The word is stripped of spaces, NFC-normalised and split into syllables
    with ``find_syllables``. The score is the sum of:

    * a length bonus: 2 for exactly four syllables, 4 for more than four;
    * per syllable, the value of ``add_arkavattu_score`` when the language
      defines an ``"arkavattu"`` character, that character occurs in the
      syllable and ``is_samyukta`` returned at least one consonant;
    * per syllable, a conjunct bonus based on the consonants returned by
      ``is_samyukta`` (see ``_conjunct_bonus``);
    * per character, its ``'Weight'``, plus 0.9 the first time a character
      whose ``'similar'`` value is positive appears within a syllable.

    Characters matched by ``find_syllables`` that have no entry in the
    score table (for example digits of the script) contribute nothing
    instead of raising ``KeyError``.

    Side effect: prints the syllable list and, for every syllable, the
    list returned by ``is_samyukta`` (debug output kept from the original).

    Returns:
        A tuple ``(score, addedscores)``: the total rounded to two decimals
        and the list of individual contributions in the order they were added.
    """
    addedscores = []
    word = word.replace(" ", "")
    word = unicodedata.normalize('NFC', word)
    syllables = find_syllables(word)
    print(syllables)
    score = 0

    syllable_count = len(syllables)
    if syllable_count == 4:  # exactly four syllables: add 2
        score += 2
        addedscores.append(2)
    elif syllable_count > 4:  # more than four syllables: add 4
        score += 4
        addedscores.append(4)

    language_config = config[language]
    for syllable in syllables:
        # "Similar" characters are tracked per syllable, not per word.
        similarities = set()
        resp = is_samyukta(syllable)
        print(resp)

        # Arkavattu bonus, only for languages that define the character.
        if resp and "arkavattu" in language_config and language_config["arkavattu"] in syllable:
            arka_score = add_arkavattu_score(resp)
            if arka_score > 0:
                addedscores.append(arka_score)
            score += arka_score

        # Samyuktakshara (conjunct) bonus.
        bonus = _conjunct_bonus(resp)
        if bonus is not None:
            score += bonus
            addedscores.append(bonus)

        for char in syllable:
            entry = scores.get(char)
            if entry is None:
                # No weight configured for this character: skip it.
                continue
            score += entry['Weight']
            addedscores.append(entry['Weight'])

            if entry['similar'] > 0 and char not in similarities:
                score += 0.9  # charged once per distinct character per syllable
                addedscores.append(0.9)
                similarities.add(char)

    score = round(score, 2)
    return score, addedscores


def score(content):
    """Score every whitespace-separated word of *content*.

    Characters that are neither word characters nor whitespace are removed
    first; the cleaned text is printed; each remaining word is then scored
    with ``get_score``.

    Returns:
        A tuple ``(total, per_word)`` where ``total`` is the sum of the word
        scores rounded to two decimals and ``per_word`` holds, for each
        word in order, the contribution list returned by ``get_score``.
    """
    # NOTE(review): this relies on the `regex` module's \w keeping the
    # script's combining signs (vowel signs, virama) - confirm before
    # swapping in the stdlib `re` module.
    content = regex.sub(r'[^\w\s]', '', content)
    print(content)

    total = 0
    per_word = []
    for token in content.split():
        word_score, contributions = get_score(token)
        total += word_score
        per_word.append(contributions)

    # Word scores are rounded to two decimals; round the sum as well so
    # binary floating-point drift (e.g. 7.300000000000001) does not leak out.
    return round(total, 2), per_word

# Runs at import time as well as when executed as a script: scores the
# sample `word` and prints the (total, per-word contributions) tuple.
print(score(word))