tpgillam · wanderingstan · Feb 15, 2021 · Feb 15, 2021 · Feb 15, 2021 · Feb 15, 2021
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+.vscode/*
+__pycache__/*
+bin/*
+lib/*
+pyvenv.cfg
diff --git a/Procfile b/Procfile
@@ -0,0 +1 @@
+web: gunicorn wsgi:application
diff --git a/README.md b/README.md
@@ -1,6 +1,74 @@
+# cuneifyplus
+
 This is the source code for http://cuneifyplus.arch.cam.ac.uk
 
 It is a wrapper around Steve Tinney's 'cuneify' tool, which allows for more complex conversion of transliterated
 Babylonian and Akkadian into various cuneiform fonts.
 
+## Local Server
+
+```bash
+$ python3 wsgi.py
+Serving on http://localhost:8051
+```
+
+Open the URL in a browser to use the HTML interface.
+
+## Remote Server
+
+The app can be deployed as a WSGI application to a hosting service such as Heroku.
+
+Example: https://cuneify.herokuapp.com/
+
+```bash
+$ heroku create cuneify
+Creating app... done, ⬢ cuneify
+Created http://cuneify.herokuapp.com/ | git@heroku.com:cuneify.git
+
+$ git push heroku master
+Enumerating objects: 5, done.
+Counting objects: 100% (5/5), done.
+Delta compression using up to 16 threads
+Compressing objects: 100% (3/3), done.
+Writing objects: 100% (3/3), 287 bytes | 287.00 KiB/s, done.
+Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
+.
+.
+.
+remote:        https://cuneify.herokuapp.com/ deployed to Heroku
+```
+
+## Command line usage
+
+```bash
+$ python3 cuneify_interface.py test_file.txt
+𒇻 𒀠 𒌨 𒆠 𒆗 𒄯 𒃻 𒀀 𒉌
+𒁕
+𒃮
+```
 
+```bash
+$ python3 cuneify_interface.py --parse-atf test_file.atf
+&P232701 = RIME 3/1.01.07.031, ex. 117
+#atf: lang sux
+# reconstruction
+@object cone
+@surface a
+1. {d}nin-dar-a
+# 𒀭 𒊩𒌆 𒁯 𒀀
+2. lugal uru16
+# 𒈗 𒂗
+3. lugal-a-ni
+# 𒈗 𒀀 𒉌
+4. gu3-de2-a
+# 𒅗 𒌤 𒀀
+5. ensi2
+# 𒉺𒋼𒋛
+6. lagasz{ki}-ke4
+# 𒉢𒁓𒆷 𒆠 𒆤
+7. e2 gir2-su{ki}-ka-ni
+# 𒂍 𒄈 𒋢 𒆠 𒅗 𒉌
+8. mu-na-du3
+# 𒈬 𒈾 𒆕
+...
+```
diff --git a/cuneify_interface.py b/cuneify_interface.py
@@ -1,3 +1,5 @@
+# -*- coding: utf8 -*-
+
 import itertools
 import os
 import pickle
@@ -140,8 +142,8 @@ def __enter__(self):
 
     @abstractmethod
     def __exit__(self, type_, value, traceback):
-        """Update the cache with the current transliteration, cuneiform pairs. 
-        
+        """Update the cache with the current transliteration, cuneiform pairs.
+
         It will overwrite the given values if present
         """
 
@@ -253,46 +255,79 @@ def _write_cache_file(self):
         self._cache_modified = False
 
 
-def cuneify_line(cache, transliteration, show_transliteration):
-    """ Take a line of transliteration and display the output, nicely formatted, on the terminal.
-        Should be used whilst in the context of cache.
+def cuneify_line_structured(cache, transliteration):
+    """ Take a line of transliteration and return structured data (tuple) of
+        - tokens, eg `tok1`
+        - separators, eg `.`
+        - cuneiform symbols, eg `𒌉`
+        - unrecognized tokens, eg `Howdy`
     """
     transliteration = transliteration.strip()
     # Split using alphanumeric characters (\w)
     tokens = re.split(TOKEN_REGEX, transliteration)
 
-    # It's a much easier code path if we just show the cuneiform
-    if not show_transliteration:
-        return " ".join(cache.get_cuneiform(token) for token in tokens)
-
-    # Otherwise format something like this:
-    #
-    # tok1.tok2  tok3-tok4-5-   6
-    # A    BBBBB CC   DDD  EEEE F
     separators = re.findall(TOKEN_REGEX, transliteration)
     separators.append("")
-
     line_original = ""
     line_cuneiform = ""
-    for token, separator in zip(tokens, separators):
-        symbol = cache.get_cuneiform(token)
-        # FIXME -- take into account separator length (could be more than one
-        # character
-        n_spaces_after_symbol = 1 + max(len(separator) + len(token) - len(symbol), 0)
-        n_spaces_after_token_separator = 1 + max(len(symbol) - len(token), 0)
-        line_original += token + separator + " " * n_spaces_after_token_separator
-        line_cuneiform += symbol + " " * n_spaces_after_symbol
+    symbols = []
+    unrecognized_tokens = []
+    for token in tokens:
+        try:
+            symbol = cache.get_cuneiform(token)
+        except (UnrecognisedSymbol, TransliterationNotUnderstood):
+            symbol = None
+            unrecognized_tokens.append(token)
+        symbols.append(symbol)
+
+    return (tokens, separators, symbols, unrecognized_tokens)
+
+def cuneify_line(cache, transliteration, show_transliteration, unrecognized_indicator=""):
+    """ Take a line of transliteration and return the output, nicely formatted for the terminal, as a string.
+        Should be used whilst in the context of cache.
+        unrecognized_indicator : String to substitute for a token that is not recognized. If it is
+          the empty string, the unrecognized token itself is used as-is.
+    """
 
-    return "{}\n{}".format(line_original, line_cuneiform)
+    (tokens, separators, raw_symbols, unrecognized_tokens) = cuneify_line_structured(cache, transliteration)
+
+    # Substitute the chosen string for unrecognized tokens
+    symbols = [s if s is not None else t if unrecognized_indicator=="" else unrecognized_indicator for (t, s) in zip(tokens, raw_symbols)]
+
+    if show_transliteration:
+        # Format the transliteration above the cuneiform, something like this:
+        #
+        # tok1.tok2  tok3-tok4-5-   6
+        # A    BBBBB CC   DDD  EEEE F
+        line_original = ""
+        line_cuneiform = ""
+        for token, separator, symbol in zip(tokens, separators, symbols):
+            if symbol is None:
+                symbol = token if unrecognized_indicator=="" else unrecognized_indicator
+            width = max(len(token + separator), len(symbol))
+            line_original += (token + separator).ljust(width)
+            line_cuneiform += symbol.ljust(width)
+        return "{}\n{}".format(line_original, line_cuneiform)
+    else:
+        return " ".join(symbols)
 
 
-def cuneify_file(cache, file_name, show_transliteration):
-    """ Given a text file with one or more lines of transliterated text, print out the corresponding
+def cuneify_iterator(cache, iterator, show_transliteration, parse_atf=True):
+    """ Given an iterator object that yields lines of transliterated text, return the corresponding
         version in cuneiform
     """
     output = ""
-    with open(file_name) as input_file:
-        for line in input_file:
+    if parse_atf:
+        for line in iterator:
+            atf_line_parts = re.search("^([0-9]+\.)([ \t]*)(.*)", line)
+            if atf_line_parts:
+                transliteration = atf_line_parts.group(3)
+                output += line
+                output += "#" + atf_line_parts.group(2) + cuneify_line(cache, transliteration, show_transliteration) + "\n"
+            else:
+                output += line
+    else:
+        for line in iterator:
             output += cuneify_line(cache, line, show_transliteration)
             output += "\n"
             # If also showing transliteration then an extra blank line aids legibility
@@ -301,6 +336,14 @@ def cuneify_file(cache, file_name, show_transliteration):
     return output
 
 
+def cuneify_file(cache, file_name, show_transliteration, parse_atf=True):
+    """ Given a text file with one or more lines of transliterated text, return the corresponding
+        version in cuneiform
+    """
+    with open(file_name) as iterator:
+        return cuneify_iterator(cache, iterator, show_transliteration, parse_atf=parse_atf)
+
+
 def ordered_symbol_to_transliterations(
     cache, transliteration, return_unrecognised=False
 ):
@@ -355,6 +398,9 @@ def main():
         help="By default just show cuneiform. If this is set, "
         "also display original transliteration",
     )
+    parser.add_argument(
+        "--parse-atf", action="store_true",
+        help="If this is set parse file as .atf formatted")
     parser.add_argument(
         "--symbol-list",
         action="store_true",
@@ -380,7 +426,7 @@ def main():
                 print("Unrecognised symbols:")
                 print(unrecognised_tokens)
         else:
-            print(cuneify_file(cache, args.input_file, args.show_transliteration))
+            print(cuneify_file(cache, args.input_file, args.show_transliteration, args.parse_atf))
 
 
 if __name__ == "__main__":

diff --git a/environment.py b/environment.py
@@ -6,22 +6,31 @@
 from cuneify_interface import FileCuneiformCache
 
 
-if not "mws" in socket.gethostname().lower():
-    raise RuntimeError(
-        "Unrecognised environment: {}".format(socket.gethostname().lower())
-    )
+if "mws" in socket.gethostname().lower():
 
+    # Running in MWS
+    MY_URL = "http://cuneifyplus.arch.cam.ac.uk"
 
-MY_URL = "http://cuneifyplus.arch.cam.ac.uk"
+    def get_font_directory(environ):
+        return os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "fonts")
 
+    def get_cache(environ):
+        """ Return the standard cuneiform cache """
+        cache_file_path = os.path.normpath(
+            os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "cuneiform_cache.pickle")
+        )
+        return FileCuneiformCache(cache_file_path=cache_file_path, read_only=True)
 
-def get_font_directory(environ):
-    return os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "fonts")
+else:
 
+    # Running locally
+    MY_URL = ""
 
-def get_cache(environ):
-    """ Return the standard cuneiform cache """
-    cache_file_path = os.path.normpath(
-        os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "cuneiform_cache.pickle")
-    )
-    return FileCuneiformCache(cache_file_path=cache_file_path, read_only=True)
+    def get_font_directory(environ):
+        return "fonts"
+
+    def get_cache(environ):
+        """ Return the standard cuneiform cache """
+        # We use a cache in the data directory. This isn't touched by the deployment process
+        cache_file_path = "cuneiform_cache.pickle"
+        return FileCuneiformCache(cache_file_path=cache_file_path)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1 @@
+gunicorn==20.0.4
diff --git a/test_file.atf b/test_file.atf
@@ -0,0 +1,58 @@
+&P232701 = RIME 3/1.01.07.031, ex. 117
+#atf: lang sux
+# reconstruction
+@object cone
+@surface a
+1. {d}nin-dar-a
+2. lugal uru16
+3. lugal-a-ni
+4. gu3-de2-a
+5. ensi2
+6. lagasz{ki}-ke4
+7. e2 gir2-su{ki}-ka-ni
+8. mu-na-du3
+
+
+&P232702 = RIME 3/1.01.07.031, ex. 118
+#atf: lang sux
+# reconstruction
+@object cone
+@surface a
+1. {d}nin-dar-a
+2. lugal uru16
+3. lugal-a-ni
+4. gu3-de2-a
+5. ensi2
+6. lagasz{ki}-ke4
+7. e2 gir2-su{ki}-ka-ni
+8. mu-na-du3
+
+
+&P232703 = RIME 3/1.01.07.031, ex. 119
+#atf: lang sux
+# reconstruction
+@object cone
+@surface a
+1. {d}nin-dar-a
+2. lugal uru16
+3. lugal-a-ni
+4. gu3-de2-a
+5. ensi2
+6. lagasz{ki}-ke4
+7. e2 gir2-su{ki}-ka-ni
+8. mu-na-du3
+
+
+&P232704 = RIME 3/1.01.07.031, ex. 120
+#atf: lang sux
+# reconstruction
+@object cone
+@surface a
+1. {d}nin-dar-a
+2. lugal uru16
+3. lugal-a-ni
+4. gu3-de2-a
+5. ensi2
+6. lagasz{ki}-ke4
+7. e2 gir2-su{ki}-ka-ni
+8. mu-na-du3