dimagi · LambdaLearner · Sep 4, 2020 · Sep 4, 2020 · Sep 4, 2020 · Sep 21, 2020
diff --git a/CommcareTranslationChecker/CommcareTranslationChecker.py b/CommcareTranslationChecker/CommcareTranslationChecker.py
@@ -19,10 +19,12 @@
 NON_LINGUISTIC_CHARACTERS = "~`!@#$%^&*()_-+={[}]|\\:;\"'<,>.?/"
 MISMATCH_FILL_STYLE_NAME = "mismatchFillStyle"
 LESSER_MISMATCH_FILL_STYLE_NAME = "lesserMismatchFillStyle"
+LANG_MISMATCH_FILL_STYLE_NAME = "langMismatchFillStyle"
 
 # DEFINE COLORS
 RED = '00FF0000'
 YELLOW = '00FFFF00'
+BLUE = '000000FF'
 
 
 # DEFINE METHODS #
@@ -95,10 +97,16 @@ def register_styles(wb):
         name=LESSER_MISMATCH_FILL_STYLE_NAME,
         fill=xl.styles.PatternFill(fgColor=xl.styles.colors.Color(YELLOW), fill_type="solid"),
         alignment=xl.styles.Alignment(wrap_text=True))
+    langMismatchFillStyle = xl.styles.NamedStyle(
+        name=LANG_MISMATCH_FILL_STYLE_NAME,
+        fill=xl.styles.PatternFill(fgColor=xl.styles.colors.Color(BLUE), fill_type="solid"),
+        alignment=xl.styles.Alignment(wrap_text=True))
     if MISMATCH_FILL_STYLE_NAME not in wb.named_styles:
         wb.add_named_style(mismatchFillStyle)
     if LESSER_MISMATCH_FILL_STYLE_NAME not in wb.named_styles:
         wb.add_named_style(lesserMismatchFillStyle)
+    if LANG_MISMATCH_FILL_STYLE_NAME not in wb.named_styles:
+        wb.add_named_style(langMismatchFillStyle)
 
 
 def convertCellToOutputValueList(cell):
@@ -140,6 +148,54 @@ def convertCellToOutputValueList(cell):
     return outputList, messages
 
 
+def convertCellToDict(cell):
+    """
+    Convert an Excel cell into a dict of the distinct words found in its text.
+
+    Fragments made of whitespace followed by '<output' or 'value=' are removed up to
+    the next comma or space; each remaining word is stripped of non-word characters
+    and underscores, and empty or purely numeric words are skipped.
+
+    Input:
+    cell (xl.cell.cell.Cell): Cell whose contents are to be parsed
+
+    Output:
+    Dict mapping each distinct word to 1. Empty if the cell value is not text
+    (e.g. None or a number) or if the text contains 'jr://file/'.
+
+    Raises FatalError if removing the output fragments fails.
+    """
+    text = cell.value
+    # Non-text values (None, numbers) have no words and would break the 'in' test below.
+    if not hasattr(text, 'split') or 'jr://file/' in text:
+        return {}
+    try:
+        text = re.sub(r'(?:\s)<output[^, ]*', '', text)
+        text = re.sub(r'(?:\s)value=[^, ]*', '', text)
+    except Exception as e:
+        raise FatalError("FATAL ERROR determining string values for worksheet %s cell %s : %s" %
+                         (cell.parent.title, cell.coordinate, str(e)))
+    words = (re.sub(r'[\W\_]', '', token) for token in text.split())
+    return {word: 1 for word in words if word and not re.match(r'^[0-9]*$', word)}
+
+def linguisticCharChecker(baseDict, colDict):
+    """
+    Compare the word dict of the base column with that of the current column.
+
+    Returns (sharedWords, flag): the words present in both dicts, and True only when
+    words are shared and the first unshared word of colDict is not plain ASCII
+    alphanumeric. flag is False when nothing is shared or every colDict word is shared.
+    """
+    sharedWords = list(set(baseDict).intersection(colDict))
+    if not sharedWords:
+        return sharedWords, False
+    # Only the first unshared word of colDict is inspected. The None default covers
+    # the case where every word is shared, so the caller always receives a 2-tuple
+    # it can unpack instead of an implicit bare None.
+    firstUnshared = next((word for word in colDict if word not in sharedWords), None)
+    isForeign = firstUnshared is not None and not re.match(r"^[A-Za-z0-9]*$", firstUnshared)
+    return sharedWords, isForeign
+
 def createOutputCell(cell, wsOut):
     '''
     Make a copy of a Cell object into the exact same coordinates in the target Worksheet.
@@ -268,6 +324,7 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
     if baseColumnIdx is None:
         baseColumnIdx = sorted(columnDictKeyList)[0]
     baseOutputValueList, error_messages = convertCellToOutputValueList(row[baseColumnIdx])
+    baseValueDict = convertCellToDict(row[baseColumnIdx])
     messages.extend(error_messages)
     if ignoreOrder:
         baseOutputValueList = sorted(baseOutputValueList)
@@ -285,6 +342,15 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
             if ignoreOrder:
                 curOutputValueList = sorted(curOutputValueList)
             curFormatDict = {}
+            sharedWords =[]
+            if (colIdx != baseColumnIdx):
+                curValueDict = convertCellToDict(row[colIdx])
+                sharedWords, bool_translation = linguisticCharChecker(baseValueDict, curValueDict)
+                if wsOut:
+                    cellOut = getOutputCell(row[colIdx], wsOut)
+                    if len(sharedWords) > 0 and bool_translation:
+                        curMismatchFillStyle = LANG_MISMATCH_FILL_STYLE_NAME
+                        cellOut.style = curMismatchFillStyle
 
             # Initialize block_tags_fixed_flag to False, if any fix is applied, set to True
             block_tags_fixed_flag = False
@@ -431,6 +497,11 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
                              (row[colIdx].parent.title, row[colIdx].coordinate, str(e)))
 
     mismatchCell = wsOut.cell(row=getOutputCell(row[0], wsOut).row, column=1).offset(column=mismatchFlagIdx)
+    if len(sharedWords) > 0:
+        if bool_translation:
+            curMismatchFillStyle = LANG_MISMATCH_FILL_STYLE_NAME
+            mismatchCell.value = "Y"
+            mismatchCell.style = curMismatchFillStyle
     if len(mismatchDict) > 0:
         curMismatchFillStyle = LESSER_MISMATCH_FILL_STYLE_NAME
         for key in mismatchDict:
@@ -439,7 +510,8 @@ def checkRowForMismatch(row, columnDict, fixedColumnDict, baseColumnIdx=None, ig
         mismatchCell.value = "Y"
         mismatchCell.style = curMismatchFillStyle
     else:
-        mismatchCell.value = "N"
+        if mismatchCell.value != "Y":
+            mismatchCell.value = "N"
 
     return baseColumnDict, mismatchDict