From b59f2a255917ed9980afe5bb4cad2ff9c19b4135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Mon, 17 Apr 2023 16:05:36 +0200 Subject: [PATCH 1/4] Add gradio interface --- app.py | 144 ++++++++++++++++++++++++++++++++++++++++++++ src/averell/core.py | 81 +++++++++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 app.py diff --git a/app.py b/app.py new file mode 100644 index 0000000..54e0396 --- /dev/null +++ b/app.py @@ -0,0 +1,144 @@ +import copy +import json + +from pathlib import Path +from zipfile import ZipFile + +import gradio as gr +import pycountry + +from averell.utils import get_ids, CORPORA_SOURCES +from averell.core import export_corpora_ui, get_corpora + +PARAMS = { + "granularity": "poem", + "ouput_format": "JSON", + "corpora_list": [ + 'bibit', 'stichopt', 'disco2_1', + 'disco3', 'adso', 'adso100', + 'plc', 'gongo', 'ecpa', + '4b4v', 'czverse', 'mel' + ], +} + +def get_available_languages(sources): + available_languages = [] + for c in sources: + lang = c["properties"]["language"] + if lang not in available_languages: + available_languages.append(lang) + return available_languages + +def filter_corpus_language(sources, lang): + corpora_sources = copy.deepcopy(sources) + filtered_corpora = [c for c in corpora_sources if c["properties"]["language"] == lang] + return filtered_corpora + +available_languages = get_available_languages(CORPORA_SOURCES) + +with gr.Blocks() as app_averell: + def export_corpora(output_path, output_format, output_granularity): + corpora_list = PARAMS["corpora_list"] + #print(corpora_list) + output_path = output_path["label"] + filename = f"tmp/{output_format}.zip" + if output_granularity == "JSON": + with ZipFile(filename, "w") as zfile: + for corpus in corpora_list: + p = Path(f'{output_path}/{output_format}/{corpus}') + print(corpus) + for f in p.glob("**/*.json"): + #print(f) + zfile.write(f) + return f"tmp/{output_format}.zip" + else: + json_l, filename = 
export_corpora_ui(get_ids(corpora_list), + "stanza", + "tmp/tmp_corp", + None, + False) + with open(f"tmp/{filename}.json", 'w', encoding='utf-8') as f: + json.dump(json_l, f, ensure_ascii=False, indent=4) + return f"tmp/{filename}.json" + + def block_granularity(rad_format): + if rad_format == "TEI": + PARAMS["ouput_format"] = rad_format + return {rad_granularity: gr.update(value="poem", visible=False)} + else: + PARAMS["ouput_format"] = rad_format + return {rad_granularity: gr.update(value="poem", visible=True)} + + def update_granularity(value): + PARAMS["granularity"] = value + true_l = [gr.Checkbox.update(value=True, visible=True)] + false_l = [gr.Checkbox.update(value=False, visible=False)] + if value == "word": + # 000011100110 + 110110 + corp_list = false_l*4 + true_l*3 + false_l*2 + true_l*2 + false_l + lang_list = true_l*2 + false_l + true_l*2 + false_l + return corp_list + lang_list + elif value == "syllable": + # 000011000000 + 100000 + corp_list = false_l*4 + true_l*2 + false_l*6 + lang_list = true_l + false_l*5 + return corp_list + lang_list + else: + #return None + return [gr.Checkbox.update(value=True, visible=True)]*18 + + def change_selection(value): + return value + + def update_corpora_list(added, corpus): + corpora_list = PARAMS["corpora_list"] + if corpus in corpora_list and not added: + corpora_list.remove(corpus) + elif corpus not in corpora_list and added: + corpora_list.append(corpus) + + app_title = gr.HTML("

Averell

") + with gr.Row() as row: + + with gr.Column(scale=1) as c1: + rad_format = gr.Radio(["TEI", "JSON"], label="Output", info="Choose output format", value="TEI", interactive=True) + rad_granularity = gr.Radio(["poem", "stanza", "line", "word", "syllable"], + label="Granularity", + info="Choose output granularity", + value="poem", + interactive=True, + visible=False, + ) + corpus_checkboxes = [] + lang_checkboxes = [] + for lang in available_languages: + language = pycountry.languages.get(alpha_2=lang).name + with gr.Blocks() as corpora: + lang_chk = gr.Checkbox(True, label=language, interactive=True) + filtered_corpus = filter_corpus_language(CORPORA_SOURCES, lang) + for corpus in filtered_corpus: + classes = corpus["properties"]["granularity"] + classes.append("poem") + classes.append(lang) + chk = gr.Checkbox(True, + label=corpus["name"], + info=f'License: {corpus["properties"]["license"]} | Number of poems: {corpus["properties"]["doc_quantity"]}', + interactive=True, + elem_classes=classes, + elem_id=corpus["properties"]["slug"], + ) + label = gr.Textbox(value=corpus["properties"]["slug"], visible=False) + lang_chk.change(change_selection, lang_chk, chk, show_progress=True) + chk.change(update_corpora_list, [chk, label], show_progress=True) + corpus_checkboxes.append(chk) + lang_checkboxes.append(lang_chk) + with gr.Column(scale=1) as c2: + + rad_granularity.change(update_granularity, rad_granularity, [*corpus_checkboxes,*lang_checkboxes], show_progress=True) + rad_format.change(block_granularity, rad_format, rad_granularity, api_name="output_format", show_progress=True) + exp_btn = gr.Button("Export") + folder_path = gr.Label(value="tmp/", visible=False) + out_file = gr.File() + exp_btn.click(export_corpora, [folder_path, rad_format, rad_granularity], out_file, api_name="export") + +app_averell.launch(share=True) \ No newline at end of file diff --git a/src/averell/core.py b/src/averell/core.py index 423b4ba..39c1931 100644 --- a/src/averell/core.py +++ 
b/src/averell/core.py @@ -68,6 +68,8 @@ def export_corpora( export_filename = filename if Path(corpora_folder).exists() or not no_download: if not corpus_ids: + print("ID not in corpora list") + logging.error("No CORPUS ID selected") else: if granularity is not None: @@ -75,6 +77,7 @@ def export_corpora( try: corpus = CORPORA_SOURCES[corpus_id] except IndexError: + print("ID not in corpora list") logging.error("ID not in corpora list") else: corpus_folder = corpus["properties"]["slug"] @@ -90,6 +93,84 @@ def export_corpora( continue granularities_list = corpus["properties"]["granularity"] if granularity not in granularities_list: + print("ID not in corpora list") + + logging.error( + f"'{granularity}' granularity not found on " + f"'{corpus_name}' properties") + continue + features = read_features( + Path(corpora_folder) / corpus_folder) + filtered_features = filter_corpus_features(features, + corpus_id, + granularity) + corpora_features.extend(filtered_features) + print(corpora_features) + else: + print("ID not in corpora list") + + logging.error("No GRANULARITY selected") + + if not export_filename: + export_filename = "_".join(slugs) + export_filename = f"{export_filename}_{granularity}s" + + if corpora_features: + write_json(corpora_features, export_filename) + else: + print("Corpora folder not found") + logging.error("Corpora folder not found") + return corpora_features, export_filename + + +def export_corpora_ui( + corpus_ids, granularity, corpora_folder, filename, no_download=False +): + """ + Generates a single JSON file with the chosen granularity for all of the + selected corpora + + :param corpus_ids: IDs of the corpora that will be exported + :param granularity: Level of parsing granularity + :param corpora_folder: Local folder where the corpora is located + :param filename: Name of the output file + :param no_download: Whether to download or not a corpora when missing + :return: Python dict with the chosen granularity for all of the selected + corpora 
+ """ + corpora_features = [] + slugs = [] + export_filename = filename + if Path(corpora_folder).exists() or not no_download: + if not corpus_ids: + print("No CORPUS ID selected") + logging.error("No CORPUS ID selected") + else: + if granularity is not None: + for corpus_id in corpus_ids: + try: + corpus = CORPORA_SOURCES[corpus_id] + except IndexError: + print("ID not in corpora list") + logging.error("ID not in corpora list") + else: + corpus_folder = corpus["properties"]["slug"] + slugs.append(corpus_folder) + corpus_name = corpus["name"] + if not (Path(corpora_folder) / corpus_folder).exists(): + # Si vamos a descargar el corpus + if not no_download: + get_corpora([corpus_id], corpora_folder) + else: + print("Whatever") + logging.error( + f'"{corpus_name} ({corpus_folder})" not ' + f'found in "{corpora_folder}" folder') + continue + granularities_list = corpus["properties"]["granularity"] + if granularity not in granularities_list: + print("granularity not found") + logging.error( f"'{granularity}' granularity not found on " f"'{corpus_name}' properties") From b23ad498cad4f26dc54af85ee1e3f9ddab36bc0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Mon, 17 Apr 2023 16:05:36 +0200 Subject: [PATCH 2/4] Add gradio interface Add gradio interface/fix TEI parsing --- app.py | 191 +++++++++++++++++++++++++++++-------------- src/averell/core.py | 4 +- src/averell/utils.py | 138 +++++++++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 62 deletions(-) diff --git a/app.py b/app.py index 54e0396..502296b 100644 --- a/app.py +++ b/app.py @@ -7,8 +7,8 @@ import gradio as gr import pycountry -from averell.utils import get_ids, CORPORA_SOURCES -from averell.core import export_corpora_ui, get_corpora +from averell.utils import get_ids, generate_tei, CORPORA_SOURCES +from averell.core import export_corpora_ui PARAMS = { "granularity": "poem", @@ -39,57 +39,74 @@ def filter_corpus_language(sources, lang): with gr.Blocks() as app_averell: def 
export_corpora(output_path, output_format, output_granularity): corpora_list = PARAMS["corpora_list"] - #print(corpora_list) + if not corpora_list: + return {out_file: gr.File.update(label="ERROR: No corpus selected")} output_path = output_path["label"] filename = f"tmp/{output_format}.zip" - if output_granularity == "JSON": + # Export to TEI + if output_format == "TEI": + generate_tei(corpora_list, output_path, True) + return {out_file: gr.File.update(value=f"tmp/{output_format}.zip", + label=output_format)} + # Expor to JSON + if output_granularity == "poem": with ZipFile(filename, "w") as zfile: for corpus in corpora_list: p = Path(f'{output_path}/{output_format}/{corpus}') - print(corpus) for f in p.glob("**/*.json"): - #print(f) zfile.write(f) - return f"tmp/{output_format}.zip" + return {out_file: gr.File.update(value=f"tmp/{output_format}.zip", + label=output_format)} else: json_l, filename = export_corpora_ui(get_ids(corpora_list), - "stanza", + output_granularity, "tmp/tmp_corp", None, False) with open(f"tmp/{filename}.json", 'w', encoding='utf-8') as f: json.dump(json_l, f, ensure_ascii=False, indent=4) - return f"tmp/{filename}.json" + return {out_file: gr.File.update(value=f"tmp/{filename}.json", + label=filename)} - def block_granularity(rad_format): + + def block_granularity(rad_format): if rad_format == "TEI": PARAMS["ouput_format"] = rad_format return {rad_granularity: gr.update(value="poem", visible=False)} else: PARAMS["ouput_format"] = rad_format return {rad_granularity: gr.update(value="poem", visible=True)} - def update_granularity(value): PARAMS["granularity"] = value - true_l = [gr.Checkbox.update(value=True, visible=True)] + true_l = [gr.Checkbox.update(value=True, visible=True)] false_l = [gr.Checkbox.update(value=False, visible=False)] + true_acc = [gr.Box.update(visible=True)] + false_acc = [gr.Box.update(visible=False)] if value == "word": - # 000011100110 + 110110 - corp_list = false_l*4 + true_l*3 + false_l*2 + true_l*2 + false_l - 
lang_list = true_l*2 + false_l + true_l*2 + false_l - return corp_list + lang_list + # 000011100110 + 110110 + 110110 + corp_list = false_l * 4 + true_l * 3 + false_l * 2 + true_l * 2 + false_l + lang_list = true_l * 2 + false_l + true_l * 2 + false_l + acc_list = true_acc * 2 + false_acc + true_acc * 2 + false_acc + return corp_list + lang_list + acc_list elif value == "syllable": - # 000011000000 + 100000 - corp_list = false_l*4 + true_l*2 + false_l*6 - lang_list = true_l + false_l*5 - return corp_list + lang_list + # 000011000000 + 100000 + 100000 + corp_list = false_l * 4 + true_l * 2 + false_l * 6 + lang_list = true_l + false_l * 5 + acc_list = true_acc + false_acc * 5 + return corp_list + lang_list + acc_list else: - #return None - return [gr.Checkbox.update(value=True, visible=True)]*18 + # return None + return true_l * 18 + true_acc * 6 - def change_selection(value): - return value + def change_selection(value, *labels): + for corpus in labels: + update_corpora_list(value, corpus) + return value + def change_global_selection(value, *labels): + for corpus in labels: + update_corpora_list(value, corpus) + return [gr.Checkbox.update(value=value)] * 6 def update_corpora_list(added, corpus): corpora_list = PARAMS["corpora_list"] if corpus in corpora_list and not added: @@ -99,46 +116,100 @@ def update_corpora_list(added, corpus): app_title = gr.HTML("

Averell

") with gr.Row() as row: - with gr.Column(scale=1) as c1: - rad_format = gr.Radio(["TEI", "JSON"], label="Output", info="Choose output format", value="TEI", interactive=True) - rad_granularity = gr.Radio(["poem", "stanza", "line", "word", "syllable"], - label="Granularity", - info="Choose output granularity", - value="poem", - interactive=True, - visible=False, - ) + rad_format = gr.Radio(["TEI", "JSON"], + label="Output", + info="Choose output format", + value="TEI", + interactive=True) + rad_granularity = gr.Radio( + ["poem", "stanza", "line", "word", "syllable"], + label="Granularity", + info="Choose output granularity", + value="poem", + interactive=True, + visible=False, + ) corpus_checkboxes = [] lang_checkboxes = [] - for lang in available_languages: - language = pycountry.languages.get(alpha_2=lang).name - with gr.Blocks() as corpora: - lang_chk = gr.Checkbox(True, label=language, interactive=True) - filtered_corpus = filter_corpus_language(CORPORA_SOURCES, lang) - for corpus in filtered_corpus: - classes = corpus["properties"]["granularity"] - classes.append("poem") - classes.append(lang) - chk = gr.Checkbox(True, - label=corpus["name"], - info=f'License: {corpus["properties"]["license"]} | Number of poems: {corpus["properties"]["doc_quantity"]}', - interactive=True, - elem_classes=classes, - elem_id=corpus["properties"]["slug"], - ) - label = gr.Textbox(value=corpus["properties"]["slug"], visible=False) - lang_chk.change(change_selection, lang_chk, chk, show_progress=True) - chk.change(update_corpora_list, [chk, label], show_progress=True) - corpus_checkboxes.append(chk) - lang_checkboxes.append(lang_chk) + with gr.Box() as b1: + with gr.Row() as rowa: + all_corp_chk = gr.Checkbox(True, label="Select all/none", + interactive=True) + all_label_list = [] + for lang in available_languages: + with gr.Box() as b2: + language = pycountry.languages.get( + alpha_2=lang).name + gr.HTML(language) + with gr.Blocks() as corpora: + with gr.Row() as rowb: + lang_chk = 
gr.Checkbox(True, + label="Select all/none", + interactive=True) + filtered_corpus = filter_corpus_language( + CORPORA_SOURCES, lang) + with gr.Accordion("Expand list", + open=False) as acc: + label_list = [] + for corpus in filtered_corpus: + classes = corpus["properties"][ + "granularity"] + classes.append("poem") + classes.append(lang) + chk = gr.Checkbox(True, + label=corpus[ + "name"], + info=f'License: {corpus["properties"]["license"]} | Number of poems: {corpus["properties"]["doc_quantity"]}', + interactive=True, + elem_classes=classes, + elem_id=corpus[ + "properties"][ + "slug"], + ) + label = gr.Textbox( + value=corpus["properties"][ + "slug"], visible=False) + # Corpus checkboxes change + chk.change(update_corpora_list, + [chk, label], + show_progress=False) + # "Select all" language checkboxes change + lang_chk.change(change_selection, + [lang_chk, + *label_list], + chk, + show_progress=False) + corpus_checkboxes.append(chk) + label_list.append(label) + all_label_list.append(label) + lang_checkboxes.append(lang_chk) + # "Selec All/None" checkbox change + all_corp_chk.change(change_global_selection, + [all_corp_chk, + *all_label_list], + [*lang_checkboxes]) with gr.Column(scale=1) as c2: - - rad_granularity.change(update_granularity, rad_granularity, [*corpus_checkboxes,*lang_checkboxes], show_progress=True) - rad_format.change(block_granularity, rad_format, rad_granularity, api_name="output_format", show_progress=True) + accordions_boxes = rowa.children[1:] + rad_granularity.change(update_granularity, + rad_granularity, + [*corpus_checkboxes, + *lang_checkboxes, + *accordions_boxes], + show_progress=False) + rad_format.change(block_granularity, + rad_format, + rad_granularity, + api_name="output_format", + show_progress=False) exp_btn = gr.Button("Export") folder_path = gr.Label(value="tmp/", visible=False) out_file = gr.File() - exp_btn.click(export_corpora, [folder_path, rad_format, rad_granularity], out_file, api_name="export") - 
-app_averell.launch(share=True) \ No newline at end of file + exp_btn.click(export_corpora, + [folder_path, + rad_format, + rad_granularity], + out_file, + api_name="export") + +app_averell.launch(share=True) diff --git a/src/averell/core.py b/src/averell/core.py index 39c1931..7aa7187 100644 --- a/src/averell/core.py +++ b/src/averell/core.py @@ -188,8 +188,8 @@ def export_corpora_ui( export_filename = "_".join(slugs) export_filename = f"{export_filename}_{granularity}s" - if corpora_features: - write_json(corpora_features, export_filename) + #if corpora_features: + # write_json(corpora_features, export_filename) else: logging.error("Corpora folder not found") return corpora_features, export_filename diff --git a/src/averell/utils.py b/src/averell/utils.py index 125b27b..0244686 100644 --- a/src/averell/utils.py +++ b/src/averell/utils.py @@ -2,6 +2,8 @@ import logging import os import urllib.request +import xml.etree.ElementTree as et + from pathlib import Path from zipfile import ZipFile @@ -17,6 +19,20 @@ TEI_NAMESPACE = "{http://www.tei-c.org/ns/1.0}" XML_NS = "{http://www.w3.org/XML/1998/namespace}" +CORPUS_NAMES = { + "disco2_1": "Disco V2.1", + "disco3": "Disco V3", + "adso": "Sonetos Siglo de Oro", + "adso100": "ADSO 100 poems corpus", + "plc": "Poesía Lírica Castellana Siglo de Oro", + "gongo": "Gongocorpus", + "ecpa": "Eighteenth Century Poetry Archive", + "4b4v": "For Better For Verse", + "mel": "Métrique en Ligne", + "bibit": "Biblioteca Italiana", + "czverse": "Corpus of Czech Verse", + "stichopt": "Stichotheque Portuguese", +} def progress_bar(t): """ from https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5 @@ -347,3 +363,125 @@ def get_ids(values): or props["language"] in values): ids.append(index) return ids + + +def generate_tei(corpora_list, output_path, ui_enabled=False): + filename_list = [] + for corpus in corpora_list: + if ui_enabled: + p = Path(output_path) / "JSON" / corpus + else: + p = Path(output_path) / corpus / "averell" / 
"parser" + poem_path_list = p.glob("**/*.json") + for poem_path in poem_path_list: + with open(poem_path, "r") as poem_file: + poem = json.load(poem_file) + file_name = poem_path.stem + filename_list.append(file_name) + poem_title = poem["poem_title"] + author = poem["author"] + corpus_name = CORPUS_NAMES[poem["corpus"]] + manually_checked = poem["manually_checked"] + + poem_id = f"{author}_{file_name}" + + root = et.Element("TEI") + header = et.SubElement(root, "teiHeader") + + file_desc = et.SubElement(header, "fileDesc") + + title_stmt = et.SubElement(file_desc, "titleStmt") + title_stmt_desc = et.SubElement(title_stmt, "title") + author_stmt_desc = et.SubElement(title_stmt, "author") + title_stmt_desc.text = poem_title + author_stmt_desc.text = author + + extent = et.SubElement(file_desc, "extent") + + pub_stmt = et.SubElement(file_desc, "publicationStmt") + publisher = et.SubElement(pub_stmt, "publisher") + publisher.text = "UNED University" + idno = et.SubElement(pub_stmt, "idno") + idno.text = poem_id + availability = et.SubElement(pub_stmt, "availability") + availability.attrib["status"] = "free" + p = et.SubElement(availability, "p") + p.text = "The text is freely available." 
+ + series_stmt = et.SubElement(file_desc, "seriesStmt") + title_series = et.SubElement(series_stmt, "title") + title_series.text = corpus_name + + source_desc = et.SubElement(file_desc, "sourceDesc") + bibl_source = et.SubElement(source_desc, "bibl") + bibl_title = et.SubElement(bibl_source, "title") + bibl_title.text = poem_title + bibl_author = et.SubElement(bibl_source, "author") + bibl_author.text = author + + measure_st = et.SubElement(extent, "measure") + measure_st.attrib["unit"] = "stanza" + measure_st.text = str(len(poem["stanzas"])) + n_lines = 0 + + text_poem = et.SubElement(root, "text") + front_poem = et.SubElement(text_poem, "front") + head_poem = et.SubElement(front_poem, "head") + head_poem.text = poem_title + body_poem = et.SubElement(text_poem, "body") + + lg_main = et.SubElement(body_poem, "lg") + lg_main.attrib["xmlns"] = "http://www.tei-c.org/ns/1.0" + lg_main.attrib["type"] = "poem" + + for stanza in poem["stanzas"]: + + n_lines += len(stanza["lines"]) + + stanza_number = stanza["stanza_number"] + stanza_type = stanza.get("stanza_type") + lg = et.SubElement(lg_main, "lg") + lg.attrib["n"] = str(stanza_number) + if stanza_type: + lg.attrib["stanza_type"] = stanza_type + for line in stanza["lines"]: + l = et.SubElement(lg, "l") + l.text = line["line_text"] + l.attrib["n"] = str(line["line_number"]) + + metrical_pattern = line.get("metrical_pattern") + rhyme = line.get("rhyme") + line_length = line.get("line_length") + + if metrical_pattern: + l.attrib["met"] = str(metrical_pattern) + if rhyme: + l.attrib["rhyme"] = str(rhyme) + if line_length: + l.attrib["line_length"] = str(line_length) + + measure_l = et.SubElement(extent, "measure") + measure_l.attrib["unit"] = "line" + measure_l.text = str(n_lines) + tree = et.ElementTree(root) + + # output_path = Path('corpora') / f'{poem["corpus"]}' / 'averell' / 'TEI' + output_base_path = Path(output_path) / 'TEI' + output_extended_path = output_base_path / poem["corpus"] / author + + # prefix = 
'{:05d}'.format(filename_list.count(file_name)) + output_file = f"{poem_id}.xml" + if not os.path.exists(output_base_path): + Path.mkdir(output_base_path) + if not os.path.exists(output_extended_path): + Path.mkdir(output_extended_path) + et.indent(tree, space=" ", level=0) + tree.write(f"{Path(output_extended_path) / output_file}", + encoding="UTF-8", + xml_declaration=True) + if ui_enabled: + with ZipFile("tmp/TEI.zip", "w") as zfile: + for corpus in corpora_list: + p = Path(f'tmp/TEI/{corpus}') + for f in p.glob("*/**/*.xml"): + zfile.write(f) From 446fbd3fdd31f0a3d8229267d8c057739de8572e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Wed, 24 May 2023 20:12:55 +0200 Subject: [PATCH 3/4] First UI version --- app.py | 103 ++++++++++++++++++++++++------------------- requirements.txt | 5 +++ setup.py | 2 +- src/averell/core.py | 80 +++------------------------------ src/averell/utils.py | 6 +-- 5 files changed, 71 insertions(+), 125 deletions(-) diff --git a/app.py b/app.py index 502296b..d7832f6 100644 --- a/app.py +++ b/app.py @@ -8,11 +8,11 @@ import pycountry from averell.utils import get_ids, generate_tei, CORPORA_SOURCES -from averell.core import export_corpora_ui +from averell.core import export_corpora PARAMS = { "granularity": "poem", - "ouput_format": "JSON", + "output_format": "JSON", "corpora_list": [ 'bibit', 'stichopt', 'disco2_1', 'disco3', 'adso', 'adso100', @@ -22,22 +22,22 @@ } def get_available_languages(sources): - available_languages = [] + available_langs = [] for c in sources: - lang = c["properties"]["language"] - if lang not in available_languages: - available_languages.append(lang) - return available_languages + lg = c["properties"]["language"] + if lg not in available_langs: + available_langs.append(lg) + return available_langs -def filter_corpus_language(sources, lang): +def filter_corpus_language(sources, lg): corpora_sources = copy.deepcopy(sources) - filtered_corpora = [c for c in corpora_sources if 
c["properties"]["language"] == lang] + filtered_corpora = [c for c in corpora_sources if c["properties"]["language"] == lg] return filtered_corpora available_languages = get_available_languages(CORPORA_SOURCES) with gr.Blocks() as app_averell: - def export_corpora(output_path, output_format, output_granularity): + def export(output_path, output_format, output_granularity): corpora_list = PARAMS["corpora_list"] if not corpora_list: return {out_file: gr.File.update(label="ERROR: No corpus selected")} @@ -48,75 +48,83 @@ def export_corpora(output_path, output_format, output_granularity): generate_tei(corpora_list, output_path, True) return {out_file: gr.File.update(value=f"tmp/{output_format}.zip", label=output_format)} - # Expor to JSON + # Export to JSON if output_granularity == "poem": + export_corpora(get_ids(corpora_list), + None, + "tmp/JSON", + None, + no_download=False, + ui_mode=True) with ZipFile(filename, "w") as zfile: - for corpus in corpora_list: - p = Path(f'{output_path}/{output_format}/{corpus}') + for corp in corpora_list: + p = Path(f'{output_path}/{output_format}/{corp}') for f in p.glob("**/*.json"): zfile.write(f) return {out_file: gr.File.update(value=f"tmp/{output_format}.zip", label=output_format)} else: - json_l, filename = export_corpora_ui(get_ids(corpora_list), + json_l, filename = export_corpora(get_ids(corpora_list), output_granularity, - "tmp/tmp_corp", + "tmp/JSON", None, - False) + no_download=False, + ui_mode=True) with open(f"tmp/{filename}.json", 'w', encoding='utf-8') as f: json.dump(json_l, f, ensure_ascii=False, indent=4) return {out_file: gr.File.update(value=f"tmp/{filename}.json", label=filename)} - def block_granularity(rad_format): - if rad_format == "TEI": - PARAMS["ouput_format"] = rad_format + def block_granularity(r_format): + if r_format == "TEI": + PARAMS["output_format"] = r_format return {rad_granularity: gr.update(value="poem", visible=False)} else: - PARAMS["ouput_format"] = rad_format + PARAMS["output_format"] = 
r_format return {rad_granularity: gr.update(value="poem", visible=True)} def update_granularity(value): PARAMS["granularity"] = value true_l = [gr.Checkbox.update(value=True, visible=True)] - false_l = [gr.Checkbox.update(value=False, visible=False)] + false_l = [gr.Checkbox.update(value=False, visible=False)] # False true_acc = [gr.Box.update(visible=True)] - false_acc = [gr.Box.update(visible=False)] + false_acc = [gr.Box.update(visible=False)] # False if value == "word": # 000011100110 + 110110 + 110110 - corp_list = false_l * 4 + true_l * 3 + false_l * 2 + true_l * 2 + false_l - lang_list = true_l * 2 + false_l + true_l * 2 + false_l - acc_list = true_acc * 2 + false_acc + true_acc * 2 + false_acc + corp_list = false_l*4 + true_l*3 + false_l*2 + true_l*2 + false_l + lang_list = true_l*2 + false_l + true_l*2 + false_l + acc_list = true_acc*2 + false_acc + true_acc*2 + false_acc + PARAMS["corpora_list"] = ["plc", "gongo", "ecpa", "bibit", "czverse"] return corp_list + lang_list + acc_list elif value == "syllable": # 000011000000 + 100000 + 100000 - corp_list = false_l * 4 + true_l * 2 + false_l * 6 - lang_list = true_l + false_l * 5 - acc_list = true_acc + false_acc * 5 + corp_list = false_l*4 + true_l*2 + false_l*6 + lang_list = true_l + false_l*5 + acc_list = true_acc + false_acc*5 + PARAMS["corpora_list"] = ["plc", "gongo"] return corp_list + lang_list + acc_list else: - # return None - return true_l * 18 + true_acc * 6 + return true_l*18 + true_acc*6 def change_selection(value, *labels): - for corpus in labels: - update_corpora_list(value, corpus) + for corpus_name in labels: + update_corpora_list(value, corpus_name) return value def change_global_selection(value, *labels): - for corpus in labels: - update_corpora_list(value, corpus) + for corpus_name in labels: + update_corpora_list(value, corpus_name) return [gr.Checkbox.update(value=value)] * 6 - def update_corpora_list(added, corpus): + def update_corpora_list(added, corpus_name): corpora_list = 
PARAMS["corpora_list"] - if corpus in corpora_list and not added: - corpora_list.remove(corpus) - elif corpus not in corpora_list and added: - corpora_list.append(corpus) + if corpus_name in corpora_list and not added: + corpora_list.remove(corpus_name) + elif corpus_name not in corpora_list and added: + corpora_list.append(corpus_name) app_title = gr.HTML("

Averell

") with gr.Row() as row: - with gr.Column(scale=1) as c1: + with gr.Column(scale=3) as c1: rad_format = gr.Radio(["TEI", "JSON"], label="Output", info="Choose output format", @@ -133,12 +141,15 @@ def update_corpora_list(added, corpus): corpus_checkboxes = [] lang_checkboxes = [] with gr.Box() as b1: + gr.HTML(value="

Corpora list

") + gr.HTML(value="
") + all_corp_chk = gr.Checkbox(True, label="Select all/none", + interactive=True) with gr.Row() as rowa: - all_corp_chk = gr.Checkbox(True, label="Select all/none", - interactive=True) all_label_list = [] + gr.Checkbox(label="dummy",visible=False) for lang in available_languages: - with gr.Box() as b2: + with gr.Column() as b2: language = pycountry.languages.get( alpha_2=lang).name gr.HTML(language) @@ -184,7 +195,7 @@ def update_corpora_list(added, corpus): label_list.append(label) all_label_list.append(label) lang_checkboxes.append(lang_chk) - # "Selec All/None" checkbox change + # "Select All/None" checkbox change all_corp_chk.change(change_global_selection, [all_corp_chk, *all_label_list], @@ -205,11 +216,11 @@ def update_corpora_list(added, corpus): exp_btn = gr.Button("Export") folder_path = gr.Label(value="tmp/", visible=False) out_file = gr.File() - exp_btn.click(export_corpora, + exp_btn.click(export, [folder_path, rad_format, rad_granularity], out_file, api_name="export") -app_averell.launch(share=True) +app_averell.launch(share=False, server_port=5741) diff --git a/requirements.txt b/requirements.txt index ffc9977..c4a2bfa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,8 @@ pyyaml>=5 requests>=2.21.0 tabulate>=0.8.7 tqdm>=4.43.0 + +averell~=1.2.2 +setuptools~=67.7.2 +gradio~=3.27.0 +pycountry~=22.3.5 diff --git a/setup.py b/setup.py index 6faa4e0..095e1de 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def read(*names, **kwargs): keywords=[ # eg: 'keyword1', 'keyword2', 'keyword3', ], - python_requires='>3.6.*', + python_requires='>=3.6', install_requires=read("requirements.txt").split("\n"), extras_require={ # eg: diff --git a/src/averell/core.py b/src/averell/core.py index 7aa7187..7d27004 100644 --- a/src/averell/core.py +++ b/src/averell/core.py @@ -49,7 +49,8 @@ def get_corpora(corpus_indices=None, output_folder=DEFAULT_OUTPUT_FOLDER): def export_corpora( - corpus_ids, granularity, corpora_folder, filename, no_download=False 
+ corpus_ids, granularity, corpora_folder, filename, no_download=False, + ui_mode=False ): """ Generates a single JSON file with the chosen granularity for all of the @@ -60,6 +61,8 @@ def export_corpora( :param corpora_folder: Local folder where the corpora is located :param filename: Name of the output file :param no_download: Whether to download or not a corpora when missing + :param ui_mode: Whether the function is called from the gradio UI + :return: Python dict with the chosen granularity for all of the selected corpora """ @@ -105,7 +108,6 @@ def export_corpora( corpus_id, granularity) corpora_features.extend(filtered_features) - print(corpora_features) else: print("ID not in corpora list") @@ -115,81 +117,9 @@ def export_corpora( export_filename = "_".join(slugs) export_filename = f"{export_filename}_{granularity}s" - if corpora_features: + if corpora_features and not ui_mode: write_json(corpora_features, export_filename) else: print("Corpora folder not found") logging.error("Corpora folder not found") return corpora_features, export_filename - - -def export_corpora_ui( - corpus_ids, granularity, corpora_folder, filename, no_download=False -): - """ - Generates a single JSON file with the chosen granularity for all of the - selected corpora - - :param corpus_ids: IDs of the corpora that will be exported - :param granularity: Level of parsing granularity - :param corpora_folder: Local folder where the corpora is located - :param filename: Name of the output file - :param no_download: Whether to download or not a corpora when missing - :return: Python dict with the chosen granularity for all of the selected - corpora - """ - corpora_features = [] - slugs = [] - export_filename = filename - if Path(corpora_folder).exists() or not no_download: - if not corpus_ids: - print("No CORPUS ID selected") - logging.error("No CORPUS ID selected") - else: - if granularity is not None: - for corpus_id in corpus_ids: - try: - corpus = CORPORA_SOURCES[corpus_id] - except 
IndexError: - print("ID not in corpora list") - logging.error("ID not in corpora list") - else: - corpus_folder = corpus["properties"]["slug"] - slugs.append(corpus_folder) - corpus_name = corpus["name"] - if not (Path(corpora_folder) / corpus_folder).exists(): - # Si vamos a descargar el corpus - if not no_download: - get_corpora([corpus_id], corpora_folder) - else: - print("Whatever") - logging.error( - f'"{corpus_name} ({corpus_folder})" not ' - f'found in "{corpora_folder}" folder') - continue - granularities_list = corpus["properties"]["granularity"] - if granularity not in granularities_list: - print("granularity not found") - - logging.error( - f"'{granularity}' granularity not found on " - f"'{corpus_name}' properties") - continue - features = read_features( - Path(corpora_folder) / corpus_folder) - filtered_features = filter_corpus_features(features, - corpus_id, - granularity) - corpora_features.extend(filtered_features) - else: - logging.error("No GRANULARITY selected") - - if not export_filename: - export_filename = "_".join(slugs) - export_filename = f"{export_filename}_{granularity}s" - - #if corpora_features: - # write_json(corpora_features, export_filename) - else: - logging.error("Corpora folder not found") - return corpora_features, export_filename diff --git a/src/averell/utils.py b/src/averell/utils.py index 0244686..cd9f029 100644 --- a/src/averell/utils.py +++ b/src/averell/utils.py @@ -365,11 +365,11 @@ def get_ids(values): return ids -def generate_tei(corpora_list, output_path, ui_enabled=False): +def generate_tei(corpora_list, output_path, ui_enabled=True): filename_list = [] for corpus in corpora_list: if ui_enabled: - p = Path(output_path) / "JSON" / corpus + p = Path(output_path) / "tmp_corp" / corpus / "averell" / "parser" else: p = Path(output_path) / corpus / "averell" / "parser" poem_path_list = p.glob("**/*.json") @@ -474,7 +474,7 @@ def generate_tei(corpora_list, output_path, ui_enabled=False): if not 
os.path.exists(output_base_path): Path.mkdir(output_base_path) if not os.path.exists(output_extended_path): - Path.mkdir(output_extended_path) + Path.mkdir(output_extended_path, parents=True) et.indent(tree, space=" ", level=0) tree.write(f"{Path(output_extended_path) / output_file}", encoding="UTF-8", From ed19592f2ec380d5475e6a2d8d38a36e270a2a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Thu, 27 Jul 2023 12:17:52 +0200 Subject: [PATCH 4/4] Update README.rst --- README.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.rst b/README.rst index cc79414..b96828e 100644 --- a/README.rst +++ b/README.rst @@ -105,3 +105,24 @@ Usage ===== Check `usage page `_ + + +Docker +====== +A docker for the frontend is available at https://hub.docker.com/repository/docker/linhdpostdata/averell-ui/ + +This is a frontend to the POSTDATA group tool Averell. In order to install and run it, follow these steps: + +* Download Docker desktop from: https://www.docker.com/products/docker-desktop/ + +* Open the Docker desktop app + +* From the Docker Dashboard you can use Quick Search, which is located in the Dashboard header, to search for: + * Any container or Compose app on your local system. You can see an overview of associated environment variables or perform quick actions, such as start, stop, or delete. + * Public Docker Hub images, local images, and images from remote repositories. Depending on the type of image you select, you can either pull the image by tag, view documentation, go to Docker Hub for more details, or run a new container using the image. + * Extensions. From here, you can learn more about the extension and install it with a single click. Or, if you already have an extension installed, you can open it straight from the search results. + * Any volume. From here you can view the associated container. + +* Search for linhdpostdata/averell-ui and download it. 
+ +* Run the container and open http://127.0.0.1:5741 in your browser to access the UI.