#!/usr/bin/env python3
"""Regression checks for arXiv metadata LaTeX cleanup."""

import json
import re
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
ARXIV_DIR = ROOT / "sites/arxiv"
PAPERS = ARXIV_DIR / "papers.json"


def main():
    """Run the arXiv metadata cleanup regression checks.

    Four checks run in order against ``papers.json``:

    1. Known fixture papers: each cleaned title must contain its expected
       fragments and none of its duplicated ("forbidden") fragments.
    2. Display formatting: hand-written TeX snippets must render to text
       containing the expected fragments, with no backslash or dollar sign.
    3. No cleaned title in the data set may match a duplicate-fragment
       pattern.
    4. No displayed title/abstract/comments may still contain a raw href or
       url TeX command.

    Raises:
        SystemExit: with a descriptive message at the first failing check.
            Checks 3 and 4 list at most 20 offending entries.
    """
    # The cleaning module lives next to the Flask app, not on sys.path.
    sys.path.insert(0, str(ARXIV_DIR))
    from metadata_cleaning import clean_arxiv_metadata_text as clean
    from metadata_cleaning import format_arxiv_display_text as display

    # Read as UTF-8 explicitly so the result does not depend on the locale.
    papers = json.loads(PAPERS.read_text(encoding="utf-8"))
    by_id = {paper.get("arxiv_id"): paper for paper in papers}

    def field_text(paper, field):
        # A JSON null would otherwise reach clean()/display() as None and
        # break the substring and regex checks below; treat it as empty.
        return paper.get(field) or ""

    known_cases = {
        "2604.07983": {
            "contains": [r"$\gtrsim 100\times$", r"$z=1.37$", "SN 2025mkn"],
            "forbidden": [r"\gtrsim 100\times\gtrsim 100\times", "z=1.37z=1.37"],
        },
        "2604.04709": {
            "contains": [r"\mathbb{P}^1"],
            "forbidden": [r"\mathbb{P}^1\mathbb{P}^1"],
        },
        "2604.07446": {
            "contains": [r"\mathcal{N}=1"],
            "forbidden": [r"\mathcal{N}=1\mathcal{N}=1", "CFT_3_3"],
        },
        "2206.03566": {
            "contains": [r"\mathbb{R}^4"],
            "forbidden": [r"\mathbb{R}^4\mathbb{R}^4"],
        },
        "2511.23096": {
            "contains": [r"GL(d_1)\times GL(d_2)"],
            "forbidden": [r"GL(d_1)\times GL(d_2)GL(d_1)\times GL(d_2)"],
        },
    }

    for arxiv_id, checks in known_cases.items():
        paper = by_id.get(arxiv_id)
        if not paper:
            raise SystemExit(f"missing arXiv fixture {arxiv_id}")
        cleaned_title = clean(field_text(paper, "title")) or ""
        for text in checks["contains"]:
            if text not in cleaned_title:
                raise SystemExit(f"{arxiv_id}: expected cleaned title fragment missing: {text}")
        for text in checks["forbidden"]:
            if text in cleaned_title:
                raise SystemExit(f"{arxiv_id}: duplicated title fragment remains: {text}")

    display_cases = {
        r"A Natural $\gtrsim 100\times$ Telescope at $z=1.37$": [
            "≥ 100×",
            "z=1.37",
        ],
        r"Project page \url{this https URL}": [
            "Project page this https URL",
        ],
        r"available at \href{this https URL}{GitHub}": [
            "available at GitHub",
        ],
        r"$\mathbb{R}^4$ and $\mathcal{N}=2$": [
            "R^4",
            "N=2",
        ],
    }
    for raw, fragments in display_cases.items():
        rendered = display(raw)
        for fragment in fragments:
            if fragment not in rendered:
                raise SystemExit(f"display fragment missing: {fragment!r} from {rendered!r}")
        if "\\" in rendered or "$" in rendered:
            raise SystemExit(f"display still contains raw TeX delimiters: {rendered!r}")

    # Each pattern matches a fragment immediately followed by a copy of itself.
    duplicate_patterns = [
        re.compile(r"(\\mathbb\{[^}]+\}\^?[^\\\s]*)\1"),
        re.compile(r"(\\mathcal\{[^}]+\}=?[0-9A-Za-z]*)\1"),
        re.compile(r"(\\mathrm\{[^}]+\}[_^]?\{?[^{}\s]*\}?)\1"),
        re.compile(r"\b(z=[0-9.]+)\1\b"),
        re.compile(r"(\$[^$]{1,120}\$)\s*\1"),
        re.compile(r"(GL\(d_1\)\\times GL\(d_2\))\1"),
    ]
    failures = []
    for paper in papers:
        cleaned_title = clean(field_text(paper, "title")) or ""
        # Record each offending paper once, however many patterns match.
        if any(pattern.search(cleaned_title) for pattern in duplicate_patterns):
            failures.append((paper.get("arxiv_id"), cleaned_title))
    if failures:
        preview = "\n".join(f"{pid}: {title}" for pid, title in failures[:20])
        raise SystemExit(f"duplicated LaTeX fragments remain in cleaned titles:\n{preview}")

    display_failures = []
    for paper in papers:
        for field in ("title", "abstract", "comments"):
            rendered = display(field_text(paper, field))
            if r"\href{" in rendered or r"\url{" in rendered:
                display_failures.append((paper.get("arxiv_id"), field, rendered[:160]))
    if display_failures:
        preview = "\n".join(f"{pid} {field}: {text}" for pid, field, text in display_failures[:20])
        raise SystemExit(f"raw URL TeX commands remain in display text:\n{preview}")

    print("arXiv metadata LaTeX checks passed")


if __name__ == "__main__":
    main()
flask_wtf import CSRFProtect from sqlalchemy import or_, and_, func +from metadata_cleaning import clean_arxiv_metadata_text, format_arxiv_display_text + BASE_DIR = Path(__file__).parent DB_DIR = BASE_DIR / "instance" DB_DIR.mkdir(exist_ok=True) @@ -197,11 +199,28 @@ def authors_display(self): @property def short_abstract(self): - if not self.abstract: + abstract = self.display_abstract + if not abstract: return "Abstract not available." - if len(self.abstract) > 280: - return self.abstract[:280] + "…" - return self.abstract + if len(abstract) > 280: + return abstract[:280] + "…" + return abstract + + @property + def display_title(self): + return format_arxiv_display_text(self.title or "") + + @property + def display_abstract(self): + return format_arxiv_display_text(self.abstract or "") + + @property + def display_comments(self): + return format_arxiv_display_text(self.comments or "") + + @property + def display_journal_ref(self): + return format_arxiv_display_text(self.journal_ref or "") @property def subject_list(self): @@ -443,6 +462,10 @@ def _synthesize_abstract(title: str, subject_code: str, f"experiments.") +def _clean_arxiv_metadata_text(text): + return clean_arxiv_metadata_text(text) + + def seed_database(): """Populate the DB from categories.json + papers.json.""" if Category.query.first() is not None: @@ -521,7 +544,7 @@ def seed_database(): if primary_category not in primary_cats and subject_code in primary_cats: primary_category = subject_code # Titles - title = rp.get("title", "").strip() + title = _clean_arxiv_metadata_text(rp.get("title", "").strip()) if not title: continue # Parse date, falling back to arxiv-id-encoded yymm (e.g. 
def normalize_paper_metadata():
    """Re-clean the text columns of every stored paper.

    Each paper's title, abstract and comments are passed through
    ``_clean_arxiv_metadata_text``; a column is rewritten only when the
    cleaned text differs from what is stored (``None`` is compared as an
    empty string). The session is committed once, and only if at least one
    column changed. Any exception rolls the session back and is reported on
    stdout instead of propagating.
    """
    text_columns = ("title", "abstract", "comments")
    try:
        rewritten = 0
        for row in Paper.query.all():
            for column in text_columns:
                before = getattr(row, column) or ""
                after = _clean_arxiv_metadata_text(before)
                if after == before:
                    continue
                setattr(row, column, after)
                rewritten += 1
        if not rewritten:
            # Nothing changed, so there is nothing to commit or report.
            return
        db.session.commit()
        print(f" [+] Normalized {rewritten} arXiv metadata fields")
    except Exception as exc:
        # Best-effort startup step: undo partial changes and keep booting.
        db.session.rollback()
        print(f" ! normalize_paper_metadata failed: {exc}")
clean_arxiv_metadata_text(text): + """Repair scrape artifacts from mixed text/MathJax extraction.""" + if not text: + return text + cleaned = re.sub(r"\s+", " ", str(text)).strip() + cleaned = _HREF_RE.sub(r"\2", cleaned) + cleaned = _URL_RE.sub(r"\1", cleaned) + previous = None + while cleaned != previous: + previous = cleaned + cleaned = _DUP_LATEX_FRAGMENT_RE.sub(r"\1", cleaned) + cleaned = _DUP_Z_VALUE_RE.sub(r"\1", cleaned) + cleaned = _DUP_TIMES_EXPR_RE.sub(r"\1", cleaned) + cleaned = _DUP_PLAIN_FRAGMENT_RE.sub(r"\1", cleaned) + cleaned = _DUP_ISOTOPE_RE.sub(r"\1", cleaned) + cleaned = _DUP_ION_PLUS_RE.sub(r"\1+", cleaned) + cleaned = re.sub(r"(\\mathcal\{N\}=)(\d+)\s+to\s+(\d+)", r"\1\2 to \3", cleaned) + cleaned = re.sub(r"\bCFT_(\d+)_(?=\d+\b)", r"CFT_", cleaned) + cleaned = re.sub(r"\bAdS_(\d+)_(?=\d+\b)", r"AdS_", cleaned) + if r"\gtrsim 100\times" in cleaned and "$" not in cleaned: + cleaned = cleaned.replace(r"\gtrsim 100\times", r"$\gtrsim 100\times$") + cleaned = re.sub(r"(?Recent Library {% for item in recent_library %}
  • - {{ item.paper.title }} + {{ item.paper.display_title }}
    {{ item.paper.authors_display }}
    diff --git a/sites/arxiv/templates/author.html b/sites/arxiv/templates/author.html index 49dba9f..28d0c68 100644 --- a/sites/arxiv/templates/author.html +++ b/sites/arxiv/templates/author.html @@ -21,7 +21,7 @@

    {{ name }}'s articles on arXiv

    other]
    -
    {{ p.title }}
    +
    {{ p.display_title }}
    Subjects: {{ p.primary_subject }}
    diff --git a/sites/arxiv/templates/category.html b/sites/arxiv/templates/category.html index ecb3021..f3807f3 100644 --- a/sites/arxiv/templates/category.html +++ b/sites/arxiv/templates/category.html @@ -44,18 +44,18 @@

    Latest papers in {{ category.name }}

    [abs]
    -
    {{ p.title }}
    +
    {{ p.display_title }}
    - {% if p.comments %} -
    Comments: {{ p.comments }}
    + {% if p.display_comments %} +
    Comments: {{ p.display_comments }}
    {% endif %}
    Subjects: {{ p.primary_subject }}
    - {% if p.abstract %} + {% if p.display_abstract %}
    {{ p.short_abstract }}
    {% endif %}
    diff --git a/sites/arxiv/templates/export_detail.html b/sites/arxiv/templates/export_detail.html index f8786cd..4f5a7f1 100644 --- a/sites/arxiv/templates/export_detail.html +++ b/sites/arxiv/templates/export_detail.html @@ -55,7 +55,7 @@

    Included Papers ({{ export.items|length }})

    {% for item in export.items %}
  • - {{ item.paper.title }} + {{ item.paper.display_title }}
    {{ item.paper.authors_display }}
    arXiv:{{ item.paper.arxiv_id }} · {{ item.paper.primary_subject }}
    diff --git a/sites/arxiv/templates/library.html b/sites/arxiv/templates/library.html index 242da1d..7576e3d 100644 --- a/sites/arxiv/templates/library.html +++ b/sites/arxiv/templates/library.html @@ -38,7 +38,7 @@

    Your Library

    {% for item in items %}
  • - {{ item.paper.title }} + {{ item.paper.display_title }}
    {{ item.paper.authors_display }}
    @@ -54,7 +54,7 @@

    Your Library

    - {% if item.paper.abstract %} + {% if item.paper.display_abstract %}
    {{ item.paper.short_abstract }}
    {% endif %}
    diff --git a/sites/arxiv/templates/listing.html b/sites/arxiv/templates/listing.html index 1a8f326..5fbe23c 100644 --- a/sites/arxiv/templates/listing.html +++ b/sites/arxiv/templates/listing.html @@ -37,14 +37,14 @@

    {{ g.label }} ({{ g.count }} entries)

    other]
    -
    {{ p.title }}
    +
    {{ p.display_title }}
    - {% if p.comments %} -
    Comments: {{ p.comments }}
    + {% if p.display_comments %} +
    Comments: {{ p.display_comments }}
    {% endif %}
    Subjects: {{ p.subjects }}
    @@ -61,14 +61,14 @@

    {{ g.label }} ({{ g.count }} entries)

    other]
    -
    {{ p.title }}
    +
    {{ p.display_title }}
    - {% if p.comments %} -
    Comments: {{ p.comments }}
    + {% if p.display_comments %} +
    Comments: {{ p.display_comments }}
    {% endif %}
    Subjects: {{ p.subjects }}
    diff --git a/sites/arxiv/templates/paper.html b/sites/arxiv/templates/paper.html index 1d8fc1f..3675f88 100644 --- a/sites/arxiv/templates/paper.html +++ b/sites/arxiv/templates/paper.html @@ -1,5 +1,5 @@ {% extends "base.html" %} -{% block title %}{{ paper.title }} — arXiv:{{ paper.arxiv_id }}{% endblock %} +{% block title %}{{ paper.display_title }} — arXiv:{{ paper.arxiv_id }}{% endblock %} {% block breadcrumbs %} {% if paper.primary_category_code %} > @@ -29,7 +29,7 @@
    {% if paper.submitted_date %}[Submitted on {{ paper.submitted_date }}]{% endif %}
    -

    {{ paper.title }}

    +

    {{ paper.display_title }}

    {% for entry in paper.get_authors_with_affiliations() %}{{ entry.name }}{% if not loop.last %}, {% endif %}{% endfor %} @@ -37,16 +37,16 @@

    {{ paper.title }}

    Abstract: - {% if paper.abstract %} - {{ paper.abstract }} + {% if paper.display_abstract %} + {{ paper.display_abstract }} {% else %} Abstract not currently available for this paper. {% endif %}
    - {% if paper.comments %} -
    Comments: {{ paper.comments }}
    + {% if paper.display_comments %} +
    Comments: {{ paper.display_comments }}
    {% endif %}
    Subjects: {{ paper.subjects }}
    {% set _versions = paper.get_versions() %} diff --git a/sites/arxiv/templates/pdf_preview.html b/sites/arxiv/templates/pdf_preview.html index c7672f6..4974770 100644 --- a/sites/arxiv/templates/pdf_preview.html +++ b/sites/arxiv/templates/pdf_preview.html @@ -1,5 +1,5 @@ {% extends "base.html" %} -{% block title %}{{ paper.title }} — PDF — arXiv{% endblock %} +{% block title %}{{ paper.display_title }} — PDF — arXiv{% endblock %} {% block content %}
    @@ -10,7 +10,7 @@
    -

    {{ paper.title }}

    +

    {{ paper.display_title }}

    {% for a in paper.get_authors() %} {{ a }}{% if not loop.last %}, {% endif %} @@ -23,8 +23,8 @@

    {{ paper.title }}

    ABSTRACT
    - {% if paper.abstract %} - {{ paper.abstract }} + {% if paper.display_abstract %} + {{ paper.display_abstract }} {% else %} Abstract not currently available. {% endif %} @@ -48,9 +48,9 @@

    2. Content Access

    For citation purposes, please reference the arXiv identifier and version.

    - {% if paper.comments %} + {% if paper.display_comments %}

    Notes

    -

    {{ paper.comments }}

    +

    {{ paper.display_comments }}

    {% endif %}
    diff --git a/sites/arxiv/templates/search.html b/sites/arxiv/templates/search.html index 7e64b28..b3d5d3d 100644 --- a/sites/arxiv/templates/search.html +++ b/sites/arxiv/templates/search.html @@ -59,10 +59,10 @@

    Search arXiv

    html, other]
    - +
    Authors: {{ p.authors_display }}
    {{ p.short_abstract }}
    -
    {{ p.primary_subject }}{% if p.journal_ref %} · {{ p.journal_ref }}{% endif %}{% if p.comments %} · {{ p.comments }}{% endif %}
    +
    {{ p.primary_subject }}{% if p.display_journal_ref %} · {{ p.display_journal_ref }}{% endif %}{% if p.display_comments %} · {{ p.display_comments }}{% endif %}
    {% endfor %} diff --git a/sites/arxiv/templates/starred.html b/sites/arxiv/templates/starred.html index 8127396..90f3bbf 100644 --- a/sites/arxiv/templates/starred.html +++ b/sites/arxiv/templates/starred.html @@ -10,13 +10,13 @@

    ★ Starred Papers

    {% for item in items %}
  • - {{ item.paper.title }} + {{ item.paper.display_title }}
    {{ item.paper.authors_display }}
    arXiv:{{ item.paper.arxiv_id }} · Starred {{ item.starred_at.strftime("%b %d, %Y") }}
    - {% if item.paper.abstract %} + {% if item.paper.display_abstract %}
    {{ item.paper.short_abstract }}
    {% endif %}