Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 73 additions & 1 deletion CommcareTranslationChecker/CommcareTranslationChecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
# Punctuation/symbol characters.
# NOTE(review): presumably the set of characters treated as non-linguistic
# when comparing text; its usage is not visible in this excerpt - confirm.
NON_LINGUISTIC_CHARACTERS = "~`!@#$%^&*()_-+={[}]|\\:;\"'<,>.?/"
# Names under which the highlight styles are registered on a workbook by
# register_styles() and later assigned to cell.style.
MISMATCH_FILL_STYLE_NAME = "mismatchFillStyle"
LESSER_MISMATCH_FILL_STYLE_NAME = "lesserMismatchFillStyle"
# Style applied when a translated column shares words with the base column
# and is flagged by linguisticCharChecker().
LANG_MISMATCH_FILL_STYLE_NAME = "langMismatchFillStyle"

# DEFINE COLORS
# 8-digit hex strings passed to xl.styles.colors.Color() as the foreground
# colour of a solid PatternFill.
# NOTE(review): presumably aRGB (alpha + RGB) as openpyxl expects - confirm.
RED = '00FF0000'
YELLOW = '00FFFF00'
BLUE = '000000FF'


# DEFINE METHODS #
Expand Down Expand Up @@ -95,10 +97,16 @@ def register_styles(wb):
name=LESSER_MISMATCH_FILL_STYLE_NAME,
fill=xl.styles.PatternFill(fgColor=xl.styles.colors.Color(YELLOW), fill_type="solid"),
alignment=xl.styles.Alignment(wrap_text=True))
langMismatchFillStyle = xl.styles.NamedStyle(
name=LANG_MISMATCH_FILL_STYLE_NAME,
fill=xl.styles.PatternFill(fgColor=xl.styles.colors.Color(BLUE), fill_type="solid"),
alignment=xl.styles.Alignment(wrap_text=True))
if MISMATCH_FILL_STYLE_NAME not in wb.named_styles:
wb.add_named_style(mismatchFillStyle)
if LESSER_MISMATCH_FILL_STYLE_NAME not in wb.named_styles:
wb.add_named_style(lesserMismatchFillStyle)
if LANG_MISMATCH_FILL_STYLE_NAME not in wb.named_styles:
wb.add_named_style(langMismatchFillStyle)


def convertCellToOutputValueList(cell):
Expand Down Expand Up @@ -140,6 +148,54 @@ def convertCellToOutputValueList(cell):
return outputList, messages


def convertCellToDict(cell):
    """
    Convert an Excel cell to a dict of the unique words it contains.

    <output value...> tags are stripped first. Each remaining
    whitespace-separated token then has its non-word characters and
    underscores removed; tokens that end up empty or consist only of digits
    are dropped. Repeated words are deliberately NOT counted: a word is
    stored once and its value is always 1.

    Input:
    cell (xl.cell.cell.Cell): Cell whose contents are to be parsed

    Output:
    Dict with strings as key and value as 1. All keys are unique in the dict.
    An empty dict is returned when the cell is empty (value is None) or when
    its text contains 'jr://file/'.

    Raises:
    FatalError if the tag-stripping substitutions fail.

    """
    outputDict = {}
    text = cell.value

    # Empty cells carry None; there is nothing to tokenize.
    if text is None:
        return outputDict
    # Numeric/boolean/date cells are not strings; coerce them so the
    # substring test and regexes below do not raise TypeError.
    if not isinstance(text, str):
        text = str(text)

    # Media references are not translatable text.
    if 'jr://file/' in text:
        return outputDict

    try:
        text = re.sub(r'(?:\s)<output[^, ]*', '', text)
        text = re.sub(r'(?:\s)value=[^, ]*', '', text)
    except Exception as e:
        raise FatalError("FATAL ERROR determining string values for worksheet %s cell %s : %s" %
                         (cell.parent.title, cell.coordinate, str(e)))

    for token in text.split():
        # Strip punctuation and underscores so "word," and "word" compare equal.
        word = re.sub(r'[\W\_]', '', token)
        if not word or word in outputDict:
            continue
        # Pure numbers are language-independent, so they are ignored.
        if re.match(r'^[0-9]*$', word):
            continue
        outputDict[word] = 1

    return outputDict

def linguisticCharChecker(baseDict, colDict):
    """
    Compare the word dictionary of the base column with that of another column.

    Only the sets of unique words are compared; occurrence counts are not
    considered (the values of both dicts are ignored).

    Input:
    baseDict (dict): Unique words of the base column cell, as keys
    colDict (dict): Unique words of the current column cell, as keys

    Output:
    Tuple (sharedWords, flag):
    sharedWords (list): Words present in both dicts (order not guaranteed)
    flag (bool): True when there are shared words and the first word of
    colDict that is not shared contains characters other than A-Z, a-z
    and 0-9. False when nothing is shared, when that first non-shared
    word is plain ASCII alphanumeric, or when every word of colDict is
    shared with the base column.

    """
    sharedWords = list(set(baseDict).intersection(colDict))
    if not sharedWords:
        return sharedWords, False

    sharedLookup = set(sharedWords)
    for word in colDict:
        if word in sharedLookup:
            continue
        # The first non-shared word decides the result.
        isAsciiWord = re.match(r"^[A-Za-z0-9]*$", word) is not None
        return sharedWords, not isAsciiWord

    # Every word of colDict is shared with the base column. Return an explicit
    # tuple so callers that unpack the result do not fail on None.
    return sharedWords, False

def createOutputCell(cell, wsOut):
'''
Make a copy of a Cell object into the exact same coordinates in the target Worksheet.
Expand Down Expand Up @@ -268,6 +324,7 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
if baseColumnIdx is None:
baseColumnIdx = sorted(columnDictKeyList)[0]
baseOutputValueList, error_messages = convertCellToOutputValueList(row[baseColumnIdx])
baseValueDict = convertCellToDict(row[baseColumnIdx])
messages.extend(error_messages)
if ignoreOrder:
baseOutputValueList = sorted(baseOutputValueList)
Expand All @@ -285,6 +342,15 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
if ignoreOrder:
curOutputValueList = sorted(curOutputValueList)
curFormatDict = {}
sharedWords =[]
if (colIdx != baseColumnIdx):
curValueDict = convertCellToDict(row[colIdx])
sharedWords, bool_translation = linguisticCharChecker(baseValueDict, curValueDict)
if wsOut:
cellOut = getOutputCell(row[colIdx], wsOut)
if len(sharedWords) > 0 and bool_translation:
curMismatchFillStyle = LANG_MISMATCH_FILL_STYLE_NAME
cellOut.style = curMismatchFillStyle

# Initialize block_tags_fixed_flag to False, if any fix is applied, set to True
block_tags_fixed_flag = False
Expand Down Expand Up @@ -431,6 +497,11 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
(row[colIdx].parent.title, row[colIdx].coordinate, str(e)))

mismatchCell = wsOut.cell(row=getOutputCell(row[0], wsOut).row, column=1).offset(column=mismatchFlagIdx)
if len(sharedWords) > 0:
if bool_translation:
curMismatchFillStyle = LANG_MISMATCH_FILL_STYLE_NAME
mismatchCell.value = "Y"
mismatchCell.style = curMismatchFillStyle
if len(mismatchDict) > 0:
curMismatchFillStyle = LESSER_MISMATCH_FILL_STYLE_NAME
for key in mismatchDict:
Expand All @@ -439,7 +510,8 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
mismatchCell.value = "Y"
mismatchCell.style = curMismatchFillStyle
else:
mismatchCell.value = "N"
if mismatchCell.value != "Y":
mismatchCell.value = "N"

return baseColumnDict, mismatchDict

Expand Down