diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6c784e5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.vscode/* +__pycache__/* +bin/* +lib/* +pyvenv.cfg diff --git a/Procfile b/Procfile new file mode 100644 index 0000000..908eb48 --- /dev/null +++ b/Procfile @@ -0,0 +1 @@ +web: gunicorn wsgi:application diff --git a/README.md b/README.md index 208b608..9079c07 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,74 @@ +# cuneifyplus + This is the source code for http://cuneifyplus.arch.cam.ac.uk It is a wrapper around Steve Tinney's 'cuneify' tool, which allows for more complex conversion of transliterated Babylonian and Akkadian into various cuneiform fonts. +## Local Server + +```bash +$ python3 wsgi.py +Serving on http://localhost:8051 +``` + +Open the URL in browser to use the HTML interface. + +## Remote Server + +Can be deployed to e.g. Heroku as wsgi app. + +Example: https://cuneify.herokuapp.com/ + +```bash +$ heroku create cuneify +Creating app... done, ⬢ cuneify +Created http://cuneify.herokuapp.com/ | git@heroku.com:cuneify.git + +$ git push heroku master +Enumerating objects: 5, done. +Counting objects: 100% (5/5), done. +Delta compression using up to 16 threads +Compressing objects: 100% (3/3), done. +Writing objects: 100% (3/3), 287 bytes | 287.00 KiB/s, done. +Total 3 (delta 2), reused 0 (delta 0), pack-reused 0 +. +. +. +remote: https://cuneify.herokuapp.com/ deployed to Heroku +``` + +## Command line usage + +```bash +$ python3 cuneify_interface.py test_file.txt +𒇻 𒀠 𒌨 𒆠 𒆗 𒄯 𒃻 𒀀 𒉌 +𒁕 +𒃮 +``` +```bash +$ python3 cuneify_interface.py --parse-atf test_file.atf +&P232701 = RIME 3/1.01.07.031, ex. 117 +#atf: lang sux +# reconstruction +@object cone +@surface a +1. {d}nin-dar-a +# 𒀭 𒊩𒌆 𒁯 𒀀 +2. lugal uru16 +# 𒈗 𒂗 +3. lugal-a-ni +# 𒈗 𒀀 𒉌 +4. gu3-de2-a +# 𒅗 𒌤 𒀀 +5. ensi2 +# 𒉺𒋼𒋛 +6. lagasz{ki}-ke4 +# 𒉢𒁓𒆷 𒆠 𒆤 +7. e2 gir2-su{ki}-ka-ni +# 𒂍 𒄈 𒋢 𒆠 𒅗 𒉌 +8. mu-na-du3 +# 𒈬 𒈾 𒆕 +... 
+``` diff --git a/cuneify_interface.py b/cuneify_interface.py index faada8b..efb03a9 100644 --- a/cuneify_interface.py +++ b/cuneify_interface.py @@ -1,3 +1,5 @@ +# -*- coding: utf8 -*- + import itertools import os import pickle @@ -140,8 +142,8 @@ def __enter__(self): @abstractmethod def __exit__(self, type_, value, traceback): - """Update the cache with the current transliteration, cuneiform pairs. - + """Update the cache with the current transliteration, cuneiform pairs. + It will overwrite the given values if present """ @@ -253,46 +255,79 @@ def _write_cache_file(self): self._cache_modified = False -def cuneify_line(cache, transliteration, show_transliteration): - """ Take a line of transliteration and display the output, nicely formatted, on the terminal. - Should be used whilst in the context of cache. +def cuneify_line_structured(cache, transliteration): + """ Take a line of transliteration and return structured data (tuple) of + - tokens, eg `tok1` + - separators, eg `.` + - cuneiform symbols, eg `𒌉` + - unrecognized tokens, eg `Howdy` """ transliteration = transliteration.strip() # Split using alphanumeric characters (\w) tokens = re.split(TOKEN_REGEX, transliteration) - # It's a much easier code path if we just show the cuneiform - if not show_transliteration: - return " ".join(cache.get_cuneiform(token) for token in tokens) - - # Otherwise format something like this: - # - # tok1.tok2 tok3-tok4-5- 6 - # A BBBBB CC DDD EEEE F separators = re.findall(TOKEN_REGEX, transliteration) separators.append("") - line_original = "" line_cuneiform = "" - for token, separator in zip(tokens, separators): - symbol = cache.get_cuneiform(token) - # FIXME -- take into account separator length (could be more than one - # character - n_spaces_after_symbol = 1 + max(len(separator) + len(token) - len(symbol), 0) - n_spaces_after_token_separator = 1 + max(len(symbol) - len(token), 0) - line_original += token + separator + " " * n_spaces_after_token_separator - line_cuneiform 
+= symbol + " " * n_spaces_after_symbol + symbols = [] + unrecognized_tokens = [] + for token in tokens: + try: + symbol = cache.get_cuneiform(token) + except (UnrecognisedSymbol, TransliterationNotUnderstood): + symbol = None + unrecognized_tokens.append(token) + symbols.append(symbol) + + return (tokens, separators, symbols, unrecognized_tokens) + +def cuneify_line(cache, transliteration, show_transliteration, unrecognized_indicator=""): + """ Take a line of transliteration and display the output, nicely formatted, on the terminal. + Should be used whilst in the context of cache. + unrecognized_indicator : String to display if token not recognized. If empty string, + the token will be returned as-is. + """ - return "{}\n{}".format(line_original, line_cuneiform) + (tokens, separators, raw_symbols, unrecognized_tokens) = cuneify_line_structured(cache, transliteration) + + # Substitute chosen string for unrecognized tokens + symbols = [s if s is not None else t if unrecognized_indicator=="" else unrecognized_indicator for (t, s) in zip(tokens, raw_symbols)] + + if show_transliteration: + # Format transliteration above cuneiform, something like this: + # + # tok1.tok2 tok3-tok4-5- 6 + # A BBBBB CC DDD EEEE F + line_original = "" + line_cuneiform = "" + for token, separator, symbol in zip(tokens, separators, symbols): + if symbol is None: + symbol = token if unrecognized_indicator=="" else unrecognized_indicator + width = max(len(token + separator), len(symbol)) + line_original += (token + separator).ljust(width) + line_cuneiform += symbol.ljust(width) + return "{}\n{}".format(line_original, line_cuneiform) + else: + return " ".join(symbols) -def cuneify_file(cache, file_name, show_transliteration): - """ Given a text file with one or more lines of transliterated text, print out the corresponding +def cuneify_iterator(cache, iterator, show_transliteration, parse_atf=True): + """ Given an iterator object that yields lines of transliterated text, return the corresponding 
version in cuneiform """ output = "" - with open(file_name) as input_file: - for line in input_file: + if parse_atf: + for line in iterator: + atf_line_parts = re.search("^([0-9]+\.)([ \t]*)(.*)", line) + if atf_line_parts: + transliteration = atf_line_parts.group(3) + output += line + output += "#" + atf_line_parts.group(2) + cuneify_line(cache, transliteration, show_transliteration) + "\n" + else: + output += line + else: + for line in iterator: output += cuneify_line(cache, line, show_transliteration) output += "\n" # If also showing transliteration then an extra blank line aids legibility @@ -301,6 +336,14 @@ def cuneify_file(cache, file_name, show_transliteration): return output +def cuneify_file(cache, file_name, show_transliteration, parse_atf=True): + """ Given a text file with one or more lines of transliterated text, return the corresponding + version in cuneiform + """ + with open(file_name) as iterator: + return cuneify_iterator(cache, iterator, show_transliteration, parse_atf=parse_atf) + + def ordered_symbol_to_transliterations( cache, transliteration, return_unrecognised=False ): @@ -355,6 +398,9 @@ def main(): help="By default just show cuneiform. 
If this is set, " "also display original transliteration", ) + parser.add_argument( + "--parse-atf", action="store_true", + help="If this is set parse file as .atf formatted") parser.add_argument( "--symbol-list", action="store_true", @@ -380,7 +426,7 @@ def main(): print("Unrecognised symbols:") print(unrecognised_tokens) else: - print(cuneify_file(cache, args.input_file, args.show_transliteration)) + print(cuneify_file(cache, args.input_file, args.show_transliteration, args.parse_atf)) if __name__ == "__main__": diff --git a/environment.py b/environment.py index e8714f3..ed370a5 100644 --- a/environment.py +++ b/environment.py @@ -6,22 +6,31 @@ from cuneify_interface import FileCuneiformCache -if not "mws" in socket.gethostname().lower(): - raise RuntimeError( - "Unrecognised environment: {}".format(socket.gethostname().lower()) - ) +if "mws" in socket.gethostname().lower(): + # Running in MWS + MY_URL = "http://cuneifyplus.arch.cam.ac.uk" -MY_URL = "http://cuneifyplus.arch.cam.ac.uk" + def get_font_directory(environ): + return os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "fonts") + def get_cache(environ): + """ Return the standard cuneiform cache """ + cache_file_path = os.path.normpath( + os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "cuneiform_cache.pickle") + ) + return FileCuneiformCache(cache_file_path=cache_file_path, read_only=True) -def get_font_directory(environ): - return os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "fonts") +else: + # Running locally + MY_URL = "" -def get_cache(environ): - """ Return the standard cuneiform cache """ - cache_file_path = os.path.normpath( - os.path.join(environ["DOCUMENT_ROOT"], "cuneifyplus", "cuneiform_cache.pickle") - ) - return FileCuneiformCache(cache_file_path=cache_file_path, read_only=True) + def get_font_directory(environ): + return "fonts" + + def get_cache(environ): + """ Return the standard cuneiform cache """ + # We use a cache in the data directory. 
This isn't touched by the deployment process + cache_file_path = "cuneiform_cache.pickle" + return FileCuneiformCache(cache_file_path=cache_file_path) diff --git a/requirements.txt b/requirements.txt index e69de29..3540093 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1 @@ +gunicorn==20.0.4 diff --git a/test_file.atf b/test_file.atf new file mode 100644 index 0000000..2b8f7eb --- /dev/null +++ b/test_file.atf @@ -0,0 +1,58 @@ +&P232701 = RIME 3/1.01.07.031, ex. 117 +#atf: lang sux +# reconstruction +@object cone +@surface a +1. {d}nin-dar-a +2. lugal uru16 +3. lugal-a-ni +4. gu3-de2-a +5. ensi2 +6. lagasz{ki}-ke4 +7. e2 gir2-su{ki}-ka-ni +8. mu-na-du3 + + +&P232702 = RIME 3/1.01.07.031, ex. 118 +#atf: lang sux +# reconstruction +@object cone +@surface a +1. {d}nin-dar-a +2. lugal uru16 +3. lugal-a-ni +4. gu3-de2-a +5. ensi2 +6. lagasz{ki}-ke4 +7. e2 gir2-su{ki}-ka-ni +8. mu-na-du3 + + +&P232703 = RIME 3/1.01.07.031, ex. 119 +#atf: lang sux +# reconstruction +@object cone +@surface a +1. {d}nin-dar-a +2. lugal uru16 +3. lugal-a-ni +4. gu3-de2-a +5. ensi2 +6. lagasz{ki}-ke4 +7. e2 gir2-su{ki}-ka-ni +8. mu-na-du3 + + +&P232704 = RIME 3/1.01.07.031, ex. 120 +#atf: lang sux +# reconstruction +@object cone +@surface a +1. {d}nin-dar-a +2. lugal uru16 +3. lugal-a-ni +4. gu3-de2-a +5. ensi2 +6. lagasz{ki}-ke4 +7. e2 gir2-su{ki}-ka-ni +8. 
mu-na-du3 diff --git a/wsgi.py b/wsgi.py index 0faea15..850916f 100644 --- a/wsgi.py +++ b/wsgi.py @@ -15,6 +15,7 @@ TransliterationNotUnderstood, UnrecognisedSymbol, cuneify_line, + cuneify_iterator, ordered_symbol_to_transliterations, ) from environment import MY_URL, get_cache, get_font_directory @@ -37,60 +38,37 @@ FONTS_PATH_NAME = "/fonts" -def _get_input_form(initial="Enter transliteration here..."): +def _get_input_form(initial=""): """ Return a form that the user can use to enter some transliterated text """ - font_name_selection = "".join( - [ - ''.format(name, description) - for name, description in FONT_NAMES.items() - ] - ) + font_name_selection = "".join(["".format(name, description) + for name, description in FONT_NAMES.items()]) body = """ -
- + +

Show transliteration with output

""".format( - MY_URL, font_name_selection + MY_URL, cgi.escape(initial), font_name_selection ) - # TODO Use 'initial' when it can be made to disappear on entry into widget return body def _get_cuneify_body(environ, transliteration, show_transliteration, font_name): """ Return the HTML body contents when we've been given a transliteration, and show in the specified font """ body = "" + is_atf = "\n2." in transliteration # Hacky way to decide if this is an atf formatted file with get_cache(environ) as cache: - for line in transliteration.split("\n"): - # Make empty lines appear as breaks in the output - line = line.strip() - if line == "": - body += "
" - continue - - try: - body += '{}
'.format( - font_name.lower(), - cuneify_line(cache, line, show_transliteration).replace( - "\n", "
" - ), - ) - # body += '{}
'.format(cuneify_line(cache, line, show_transliteration).replace('\n', '
')) - except UnrecognisedSymbol as exception: - body += 'Unknown symbol "{}" in "{}"
'.format( - exception.transliteration, line - ) - except TransliterationNotUnderstood: - body += 'Possible formatting error in "{}"
'.format( - line - ) - - # TODO will need javascript to re-populate the text area, I believe - # body += '

Go back
'.format(MY_URL, quote(transliteration)) - body += '

Go back
'.format(MY_URL) + cuneified = cuneify_iterator(cache, iter(transliteration.splitlines(True)), show_transliteration, parse_atf=is_atf) + try: + body += "
{}

".format(font_name.lower(), cuneified) + except UnrecognisedSymbol as exception: + body += "Unknown symbol \"{}\" in \"{}\"
".format(exception.transliteration, line) + except TransliterationNotUnderstood: + body += "Possible formatting error in \"{}\"
".format(line) + # TODO this can probably be neatened up a little bit return body @@ -106,21 +84,17 @@ def _get_symbol_list_body(environ, transliteration, font_name): cache, transliteration, return_unrecognised=True ) for cuneiform_symbol, transliterations in symbol_to_transliterations.items(): - line = '{}: {}
'.format( + line = "{}: {}
".format( font_name.lower(), cuneiform_symbol, ", ".join(transliterations) ) body += line if len(unrecognised_tokens) > 0: # Print out unrecognised tokens if there are any - body += '
These tokens were unrecognised: {}
'.format( + body += "
These tokens were unrecognised: {}
".format( ", ".join(unrecognised_tokens) ) - # TODO will need javascript to re-populate the text area, I believe - # body += '

Go back
'.format(MY_URL, quote(transliteration)) - body += '

Go back
'.format(MY_URL) - # TODO this can probably be neatened up a little bit return body @@ -165,16 +139,15 @@ def application(environ, start_response): form = cgi.FieldStorage( fp=environ["wsgi.input"], environ=environ, keep_blank_values=True ) + cuneiform_output = "" + transliteration = "" if path_info.startswith(FONTS_PATH_NAME): # Return the static font file return construct_font_response(environ, start_response, path_info) - elif path_info == "/cuneify": + elif environ["REQUEST_METHOD"] == "POST": # Whatever else happens, we always need a non-empty transliteration - transliteration = escape(form.getvalue("input")) - if transliteration is None or transliteration == "": - # There is no transliteration, so show the input form again - body = _get_input_form() + transliteration = form.getvalue("input") # Get the values of the other form inputs show_transliteration_value = form.getvalue("show_transliteration") @@ -188,25 +161,24 @@ def application(environ, start_response): # The type of form submission we make determines what we do now if action_value == "Cuneify": # We do a transliteration and show the output - body = _get_cuneify_body( + cuneiform_output += _get_cuneify_body( environ, transliteration, show_transliteration, font_name ) elif action_value == "Create sign list": # Make a symbol list! 
- body = _get_symbol_list_body(environ, transliteration, font_name) + cuneiform_output += _get_symbol_list_body(environ, transliteration, font_name) else: raise RuntimeError("Unrecognised action value {}".format(action_value)) - else: - body = _get_input_form() + body = _get_input_form(initial=transliteration) # All the CSS representing font classes font_info = "\n".join( [ """@font-face {{{{ font-family: {1}; - src: url(fonts/{1}.woff) format('woff'), - url(fonts/{1}.eot) format('embedded-opentype'), - url(fonts/{1}.ttf) format('truetype'); + src: url(fonts/{1}.woff) format("woff"), + url(fonts/{1}.eot) format("embedded-opentype"), + url(fonts/{1}.ttf) format("truetype"); }}}} .{0} {{{{ font-family: {1}; @@ -226,11 +198,14 @@ def application(environ, start_response): + """ +
+ {} +
{}


-Using most browsers, the cuneiform should appear on your screen, as the fonts are embedded in the website. +Using most browsers, the cuneiform should appear on your screen, as the fonts are embedded in the website. However, if you wish to copy-and-paste (e.g. into a Word document), you may need to install the fonts in order for the characters to display correctly. To install the fonts, follow the links below:
@@ -250,11 +225,10 @@ def application(environ, start_response): """ ) - response_body = response_body.format(body) + response_body = response_body.format(cuneiform_output, body) response_body = response_body.encode("utf-8") status = "200 OK" - # ctype = 'text/plain' ctype = "text/html" response_headers = [ ("Content-Type", ctype), @@ -267,9 +241,9 @@ def application(environ, start_response): # Below for testing only # if __name__ == "__main__": + MY_URL="" from wsgiref.simple_server import make_server - httpd = make_server("localhost", 8051, application) - # Wait for a single request, serve it and quit. - httpd.handle_request() - + print("Serving on http://localhost:8051") + while 1: + httpd.handle_request()