diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py index 620fde4..d0247f1 100644 --- a/markup_doc/labeling_utils.py +++ b/markup_doc/labeling_utils.py @@ -11,6 +11,7 @@ # Local application imports from model_ai.models import LlamaModel +from .choices import order_labels MODEL_NAME_GEMINI = 'GEMINI' @@ -836,3 +837,302 @@ def extract_keywords(text): clean_keywords = ", ".join(keywords) return {"title": label, "keywords": clean_keywords} + + +def create_special_content_object(item, stream_data_body, counts): + """Create objects for special content types (image, table, list, compound)""" + obj = {} + + if item.get('type') == 'image': + obj = {} + counts['numfig'] += 1 + obj['type'] = 'image' + obj['value'] = { + 'figid' : f"f{counts['numfig']}", + 'label' : '', + 'image' : item.get('image') + } + + #Obitiene el elemento aterior + try: + prev_element = stream_data_body[-1] + label_title = extract_label_and_title(prev_element['value']['paragraph']) + obj['value']['figlabel'] = label_title['label'] + obj['value']['title'] = label_title['title'] + stream_data_body.pop(-1) + except: + pass + + elif item.get('type') == 'table': + obj = {} + counts['numtab'] += 1 + obj['type'] = 'table' + obj['value'] = { + 'tabid' : f"t{counts['numtab']}", + 'label' : '', + 'content' : item.get('table') + } + + #Obitiene el elemento aterior + try: + prev_element = stream_data_body[-1] + label_title = extract_label_and_title(prev_element['value']['paragraph']) + obj['value']['tablabel'] = label_title['label'] + obj['value']['title'] = label_title['title'] + stream_data_body.pop(-1) + except: + #No hay elemento anterior + pass + + elif item.get('type') == 'list': + obj = {} + obj['type'] = 'paragraph' + obj['value'] = { + 'label' : '', + 'paragraph' : item.get('list') + } + + elif item.get('type') == 'compound': + obj = {} + counts['numeq'] += 1 + obj['type'] = 'compound_paragraph' + obj['value'] = { + 'eid' : f"e{counts['numeq']}", + #'label' : '', + 'content': item.get('text') + } + text_count = sum( + 1 for c in obj['value']['content'] + if c['type'] == 'text' + ) + + if text_count > 1: + obj['value']['label'] = '' + return obj, counts + + if text_count == 0: + obj['value']['label'] = '' + return obj, counts + + text_value = next( + item['value'] + for item in obj['value']['content'] + if item['type'] == 'text' + ) + text = is_number_parenthesis(text_value) + if text: + obj['value']['label'] = '' + next( + item + for item in obj['value']['content'] + if item['type'] == 'text' + )['value'] = text + else: + obj['value']['label'] = '' + + return obj, counts + + +def extract_subsection(text): + # Quitar punto final si existe + text = text.strip() + + # Ver si contiene una etiqueta con dos puntos + match = re.match(r'(?i)\s*(.+?)\s*:\s*(.+)', text) + + if match: + label = match.group(1).strip() + content = match.group(2).strip() + else: + label = None + content = text + + return {"title": label, "content": content} + + +def search_special_id(data_body, label): + for d in data_body: + if d['type'] in ['image', 'table']: + data = d['value'] + clean_label = re.sub(r'^[\s\.,;:–—-]+', '', label).capitalize() + + if d['type'] == 'image': + if clean_label == data['figlabel']: + return data.get('figid') + if data['figid'][0] == clean_label.lower()[0] and data['figid'][1] in clean_label.lower(): + return data.get('figid') + + if d['type'] == 'table': + if clean_label == data['tablabel']: + return data.get('tabid') + if data['tabid'][0] == clean_label.lower()[0] and data['tabid'][1] in clean_label.lower(): + return data.get('tabid') + + for d in data_body: + if d['type'] in ['compound_paragraph']: + data = d['value'] + clean_label = re.sub(r'^[\s\.,;:–—-]+', '', label).lower() + + if d['type'] == 'compound_paragraph': + if data['eid'][0] in clean_label[0] and data['eid'][1] in clean_label: + return data.get('eid') + + return None + + +def is_number_parenthesis(text): + pattern = re.compile(r'^\s*\(\s*(\d+)\s*\)\s*$') + match = pattern.fullmatch(text) + if match: + return f"({match.group(1)})" + return None + + +def remove_unpaired_tags(text): + # Match opening/closing tags, capturing only the tag name (before any space or >) + pattern = re.compile(r'<(/?)([a-zA-Z0-9]+)(?:\s[^>]*)?>') + + result = [] + stack = [] # Stores (tag_name, position_in_result) + + i = 0 + for match in pattern.finditer(text): + is_closing, tag_name = match.groups() + is_closing = bool(is_closing) + + # Text between tags + if match.start() > i: + result.append(text[i:match.start()]) + + tag_text = text[match.start():match.end()] + + if not is_closing: + # Opening tag + stack.append((tag_name, len(result))) + result.append(tag_text) + else: + # Closing tag + if stack and stack[-1][0] == tag_name: + stack.pop() + result.append(tag_text) + else: + # Orphan closing tag - skip + pass + + i = match.end() + + # Append remaining text + if i < len(text): + result.append(text[i:]) + + # Remove unclosed opening tags + for tag_name, pos in sorted(stack, reverse=True, key=lambda x: x[1]): + result.pop(pos) + + return ''.join(result) + + +def append_fragment(node_dest, val): + if not val: + parent = node_dest.getparent() + if parent: + parent.remove(node_dest) + return + + # 1) Limpiezas mínimas + # - eliminar
/
+ # - quitar saltos de línea + clean = re.sub(r"(?i)", "", val) + clean = clean.replace("\n", "") + + # normaliza entidades problemáticas + clean = clean.replace(" ", " ") + clean = re.sub(r'&(?!\w+;|#\d+;)', '&', clean) + + clean = remove_unpaired_tags(clean) + + if clean == "": + parent = node_dest.getparent() + if parent: + parent.remove(node_dest) + return + + # 2) Si no hay etiquetas, es texto plano + if "<" not in clean: + node_dest.text = (node_dest.text or "") + clean + return + + # 3) Envolver para que sea XML bien formado aunque empiece con texto + wrapper = etree.XML(f"<_wrap_>{clean}") + + # 4) Pasar el texto inicial (antes del primer tag) + if wrapper.text: + node_dest.text = (node_dest.text or "") + wrapper.text + + # 5) Mover cada hijo al destino (sus .tail se conservan) + for child in list(wrapper): + node_dest.append(child) + + +def extract_label_and_title(text): + """ + Extrae el Label (Figura/Figure/Tabla/Table/Tabela + número) y el Title (resto del texto limpio). + Ignora mayúsculas y minúsculas y limpia puntuación/espacios entre el número y el título. + """ + # Acepta Figura/Figure y Tabla/Table/Tabela + pattern = r'\b(Imagen|Imágen|Image|Imagem|Figura|Figure|Tabla|Table|Tabela)\s+(\d+)\b' + match = re.search(pattern, text, re.IGNORECASE) + + if match: + word = match.group(1).capitalize() # Normaliza capitalización + number = match.group(2) + label = f"{word} {number}" + + # Texto después del número + rest = text[match.end():] + + # Quita puntuación/espacios iniciales (.,;: guiones, etc.) + rest_clean = re.sub(r'^[\s\.,;:–—-]+', '', rest) + + return {"label": label, "title": rest_clean.strip()} + else: + return {"label": None, "title": text.strip()} + + +def proccess_special_content(text, data_body): + # normaliza espacios no separables por si acaso + text = re.sub(r'[\u00A0\u2007\u202F]', ' ', text) + + pattern = r""" + (?\n" + + # Almacena las combinaciones para las celdas + rowspan_dict = {} # {(row, col): rowspan_count} + colspan_dict = {} # {(row, col): colspan_count} + + # Itera sobre las filas de la tabla + for i, row in enumerate(element.xpath('.//w:tr')): + hiperlinks = self.extract_hiperlink(row, rels_map, namespaces) if found_hiperlinks else None + + html += "
\n" + # Itera sobre las celdas de cada fila + j = 0 # índice de columna + for cell in row.xpath('.//w:tc'): + # Revisa si la celda está en una posición afectada por rowspan + while (i, j) in rowspan_dict and rowspan_dict[(i, j)] > 0: + # Reduce el contador de rowspan + rowspan_dict[(i, j)] -= 1 + j += 1 # Mueve a la siguiente columna + + # Revisa las propiedades de la celda para rowspan y colspan + cell_props = cell.xpath('.//w:tcPr') + rowspan = 1 + colspan = 1 + + # Procesa rowspan (vMerge) + v_merge_fin = False + v_merge = cell.xpath('.//w:vMerge') + if v_merge: + v_merge_val = v_merge[0].get(qn('w:val')) + if v_merge_val == "restart": + # Es el inicio de una combinación vertical + rowspan = 1 + # Busca el total de filas combinadas contando hacia abajo + k = i + 1 + while k < len(element.xpath('.//w:tr')): + try: + next_cell = element.xpath('.//w:tr')[k].xpath('.//w:tc')[j] + next_merge = next_cell.xpath('.//w:tcPr//w:vMerge') + except: + next_cell = None + next_merge = None + + if next_merge and next_merge[0].get(qn('w:val')) is None: + rowspan += 1 + else: + break + k += 1 + + for k in range(rowspan): + rowspan_dict[(i + k, j)] = rowspan - k - 1 + else: + v_merge_fin = True + + # Procesa colspan (gridSpan) + grid_span = cell.xpath('.//w:gridSpan') + if grid_span: + colspan = int(grid_span[0].get(qn('w:val'))) + for k in range(colspan): + colspan_dict[(i, j + k)] = colspan - k - 1 + + if not v_merge_fin: + # Obtén el contenido del texto de la celda + cell_text = "
".join([t.text for t in cell.xpath('.//w:t')]) + cell_text = clean_labels(cell_text) + (f" {hiperlinks}" if hiperlinks else "") + + # Determina el tag a usar (th para el encabezado, td para celdas normales) + tag = "th" if i == 0 else "td" + + # Construye la celda en HTML + cell_html = f" <{tag}" + if rowspan > 1: + cell_html += f' rowspan="{rowspan}"' + if colspan > 1: + cell_html += f' colspan="{colspan}"' + cell_html += f">{cell_text}\n" + + html += cell_html + j += 1 + (colspan - 1) # Avanza las columnas tomando en cuenta el colspan + + html += " \n" + + html += "
" + return html + content = [] sections = [] + images = [] found_fb = False review_fb = True #Palabras a buscar como indicador del primer bloque @@ -228,11 +319,46 @@ def clean_labels(text): hiperlinks = self.extract_hiperlink(element, hiperlinks_info, namespaces) if found_hiperlinks else None - paragraph = element - text_paragraph = [] + obj_image = False + obj_formula = False + + for drawing in element.findall('.//w:drawing', namespaces=namespaces): + if drawing.find('.//a:blip', namespaces=namespaces) is not None: + blip = drawing.find('.//a:blip', namespaces=namespaces) + if blip is not None: + obj_image = True + + rId = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') + image_part = doc.part.related_parts[rId] + image_data = image_part.blob + image_name = image_part.partname.split('/')[-1] + + if image_name not in images: + images.append(image_name) + + # Guardar la imagen en Wagtail + wagtail_image = ImageModel.objects.create( + title=image_name, + file=ContentFile(image_data, name=image_name) + ) + + # Referenciar la imagen guardada en el objeto + obj['type'] = 'image' + obj['image'] = wagtail_image.id + + ns_math = { + 'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math', + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' + } + + for formula in element.findall('.//m:oMathPara', namespaces=ns_math): + obj_formula = True + mathml_result = transform(formula) + mathml_root = etree.fromstring(str(mathml_result)) + mathml_root = self.replace_mfenced_pipe_only(mathml_root) + obj['type'] = 'formula' + obj['formula'] = etree.tostring(mathml_root, pretty_print=True, encoding='unicode') - # Determina si es parte de una lista - is_numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) is not None # obtiene id y nivel if is_numPr: @@ -244,9 +370,29 @@ def clean_labels(text): if objt.get('numId') == numId ] - #Es una lista diferente - if numId != current_num_id: - current_num_id = numId + # obtiene id y nivel + if is_numPr: + numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) + numId = numPr.find('.//w:numId', namespaces=paragraph.nsmap).get(namespaces_p + 'val') + type = [(key, objt) for key, objt in list_types.items() if objt['numId'] == numId] + + #Es una lista diferente + if numId != current_num_id: + current_num_id = numId + if len(current_list) > 0: + current_list.append('[/list]') + objl = {} + objl['type'] = 'list' + objl['list'] = '\n'.join(current_list) + current_list = [] + content.append(objl) + list_type = 'bullet' + if type[0][1][str(0)] == 'decimal': + list_type = 'order' + + current_list.append(f'[list list-type="{list_type}"]') + else: + #Se terminaron de agregar elementos a la lista if len(current_list) > 0: current_list.append('[/list]') objl = {} @@ -325,89 +471,182 @@ def clean_labels(text): if p_pr is not None: i_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap) - if i_tag is not None: - val = i_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') - obj['italic'] = (val is None or val in ['1', 'true', 'True']) - else: - obj['italic'] = False - - s_tag = child.find('.//w:spacing', namespaces=child.nsmap) - - if s_tag is None: - p_pr = paragraph.find('.//w:rPr/w:spacing', namespaces=child.nsmap) - if p_pr is not None: - s_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap) - - if s_tag is not None: - val = s_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}before') - obj['spacing'] = not (val is None) - else: - obj['spacing'] = False - - clean_text = clean_labels(child.text) - - #identifica sección - sections = identify_section(sections, obj['font_size'], obj['bold'] , clean_text) - - if obj['italic']: - text_paragraph.append('' + clean_text + '' + (f" {hiperlinks}" if hiperlinks else "")) - else: - text_paragraph.append(clean_text + (f" {hiperlinks}" if hiperlinks else "")) - - paraph = match_paragraph(clean_text) + for child in paragraph: + if child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink': + for r in child.findall('w:r', namespaces=child.nsmap): + t_elem = r.find('w:t', namespaces=child.nsmap) + if t_elem is not None and t_elem.text: + text_paragraph.append(t_elem.text) + + elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r': + namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' + sz_element = child.find('.//w:sz', namespaces=child.nsmap) + obj['font_size'] = 0 + + if sz_element is None: + p_pr = paragraph.find('.//w:rPr/w:sz', namespaces=child.nsmap) + if p_pr is not None: + sz_element = p_pr.find('.//w:pPr', namespaces=child.nsmap) + + if sz_element is not None: + xml_string = etree.tostring(sz_element, pretty_print=True, encoding='unicode') + size_element = objectify.fromstring(xml_string) + font_size_value = size_element.get(namespaces+'val') + obj['font_size'] = int(font_size_value)/2 + + color_element = child.find('.//w:color', namespaces=child.nsmap) + + if color_element is None: + p_pr = paragraph.find('.//w:pPr', namespaces=child.nsmap) + if p_pr is not None: + color_element = p_pr.find('.//w:rPr/w:color', namespaces=child.nsmap) + + if color_element is not None: + xml_string_color = etree.tostring(color_element, pretty_print=True, encoding='unicode') + object_element = objectify.fromstring(xml_string_color) + color_value = object_element.get(namespaces + 'val') + obj['color'] = color_value + + b_tag = child.find('.//w:b', namespaces=child.nsmap) + + if b_tag is None: + p_pr = paragraph.find('.//w:rPr/w:b', namespaces=child.nsmap) + if p_pr is not None: + b_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap) + + if b_tag is not None: + val = b_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') + obj['bold'] = (val is None or val in ['1', 'true', 'True']) + else: + obj['bold'] = False + + i_tag = child.find('.//w:i', namespaces=child.nsmap) + + if i_tag is None: + p_pr = paragraph.find('.//w:rPr/w:i', namespaces=child.nsmap) + if p_pr is not None: + i_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap) + + if i_tag is not None: + val = i_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') + obj['italic'] = (val is None or val in ['1', 'true', 'True']) + else: + obj['italic'] = False + + s_tag = child.find('.//w:spacing', namespaces=child.nsmap) + + if s_tag is None: + p_pr = paragraph.find('.//w:rPr/w:spacing', namespaces=child.nsmap) + if p_pr is not None: + s_tag = p_pr.find('.//w:pPr', namespaces=child.nsmap) + + if s_tag is not None: + val = s_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}before') + obj['spacing'] = not (val is None) + else: + obj['spacing'] = False + + clean_text = clean_labels(child.text) + + #identifica sección + sections = identify_section(sections, obj['font_size'], obj['bold'] , clean_text) + + if obj['italic']: + text_paragraph.append('' + clean_text + '' + (f" {hiperlinks}" if hiperlinks else "")) + else: + text_paragraph.append(clean_text + (f" {hiperlinks}" if hiperlinks else "")) + + paraph = match_paragraph(clean_text) + if paraph: + obj['paraph'] = paraph + obj['type'] = paraph + + if review_fb: + found_fb = any(word in clean_text.lower() for word in start_text) + + #Si se encontró alguna palabra, incluye todo lo anterior en un sólo bloque + if found_fb: + found_fb = False + review_fb = False + found_hiperlinks = False + sections = [sections[-1]] + first_block = '' + tmp_content = [] + abstract_mode = False + + for c in content: + if abstract_mode: + if c['text'] == '' or c['spacing'] is True: + abstract_mode = False + else: + tmp_content.append(c) + continue + + if 'paraph' in c: + tmp_content.append(c) + abstract_mode = False + if c['paraph'] == '': + abstract_mode = True + continue + else: + if 'text' in c: + first_block = first_block + "\n" + c["text"] + if 'table' in c: + first_block = first_block + "\n" + c["table"] + + obj_b = {} + obj_b['type'] = 'first_block' + obj_b['text'] = first_block + tmp_content.append(obj_b) + content = tmp_content + start_text = [] + + if child.tag == f"{{{ns_math['m']}}}oMath": + if 'text' not in obj or not isinstance(obj['text'], list): + obj['type'] = 'compound' + obj['text'] = [] + if len(text_paragraph) > 0: + obj2 = {} + obj2['type'] = 'text' + obj2['value'] = ' '.join(text_paragraph) + obj['text'].append(obj2) + text_paragraph = [] + + mathml_result = transform(child) + mathml_root = etree.fromstring(str(mathml_result)) + self.replace_mfenced_pipe_only(mathml_root) + obj2 = {} + obj2['type'] = 'formula' + obj2['value'] = etree.tostring(mathml_root, pretty_print=True, encoding='unicode') + obj['text'].append(obj2) + + if 'text' not in obj: + obj['text'] = (' '.join(text_paragraph)).strip() + clean_text = clean_labels(obj['text']) + obj['text'] = clean_text + + paraph = match_paragraph(obj['text']) if paraph: obj['paraph'] = paraph obj['type'] = paraph - if review_fb: - found_fb = any(word in clean_text.lower() for word in start_text) - - #Si se encontró alguna palabra, incluye todo lo anterior en un sólo bloque - if found_fb: - found_fb = False - review_fb = False - found_hiperlinks = False - sections = [sections[-1]] - first_block = '' - tmp_content = [] - abstract_mode = False - - for c in content: - if abstract_mode: - if c['text'] == '' or c['spacing'] is True: - abstract_mode = False - else: - tmp_content.append(c) - continue - - if 'paraph' in c: - tmp_content.append(c) - abstract_mode = False - if c['paraph'] == '': - abstract_mode = True - continue - else: - if 'text' in c: - first_block = first_block + "\n" + c["text"] - if 'table' in c: - first_block = first_block + "\n" + c["table"] - - obj_b = {} - obj_b['type'] = 'first_block' - obj_b['text'] = first_block - tmp_content.append(obj_b) - content = tmp_content - start_text = [] - - if 'text' not in obj: - obj['text'] = (' '.join(text_paragraph)).strip() - clean_text = clean_labels(obj['text']) - obj['text'] = clean_text - - paraph = match_paragraph(obj['text']) - if paraph: - obj['paraph'] = paraph - obj['type'] = paraph + if is_numPr: + if 'font_size' in obj: + del obj['font_size'] + current_list.append(f'[list-item]{obj["text"]}[/list-item]') + if isinstance(obj['text'], list) and len(text_paragraph) > 0: + obj2 = {} + obj2['type'] = 'text' + obj2['value'] = ' '.join(text_paragraph) + obj['text'].append(obj2) + text_paragraph = [] + + elif isinstance(element, CT_Tbl): + namespaces = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', + 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' + } if is_numPr: if 'font_size' in obj: