Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions scripts/check_arxiv_metadata_latex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""Regression checks for arXiv metadata LaTeX cleanup."""

import json
import re
import sys
from pathlib import Path


# Repository root: two levels up from this script (scripts/ -> repo root).
ROOT = Path(__file__).resolve().parents[1]
# Location of the arXiv site package (imported below via sys.path) and its
# seed fixture data used by the regression checks.
ARXIV_DIR = ROOT / "sites/arxiv"
PAPERS = ARXIV_DIR / "papers.json"


def _check_known_titles(clean, by_id):
    """Verify hand-picked fixture titles clean to the expected fragments.

    Args:
        clean: the clean_arxiv_metadata_text callable.
        by_id: mapping of arxiv_id -> raw paper dict from papers.json.

    Raises:
        SystemExit: when a fixture is missing, an expected fragment is
            absent, or a duplicated fragment survives cleaning.
    """
    known_cases = {
        "2604.07983": {
            "contains": [r"$\gtrsim 100\times$", r"$z=1.37$", "SN 2025mkn"],
            "forbidden": [r"\gtrsim 100\times\gtrsim 100\times", "z=1.37z=1.37"],
        },
        "2604.04709": {
            "contains": [r"\mathbb{P}^1"],
            "forbidden": [r"\mathbb{P}^1\mathbb{P}^1"],
        },
        "2604.07446": {
            "contains": [r"\mathcal{N}=1"],
            "forbidden": [r"\mathcal{N}=1\mathcal{N}=1", "CFT_3_3"],
        },
        "2206.03566": {
            "contains": [r"\mathbb{R}^4"],
            "forbidden": [r"\mathbb{R}^4\mathbb{R}^4"],
        },
        "2511.23096": {
            "contains": [r"GL(d_1)\times GL(d_2)"],
            "forbidden": [r"GL(d_1)\times GL(d_2)GL(d_1)\times GL(d_2)"],
        },
    }
    for arxiv_id, checks in known_cases.items():
        paper = by_id.get(arxiv_id)
        if not paper:
            raise SystemExit(f"missing arXiv fixture {arxiv_id}")
        cleaned_title = clean(paper.get("title", ""))
        for text in checks["contains"]:
            if text not in cleaned_title:
                raise SystemExit(f"{arxiv_id}: expected cleaned title fragment missing: {text}")
        for text in checks["forbidden"]:
            if text in cleaned_title:
                raise SystemExit(f"{arxiv_id}: duplicated title fragment remains: {text}")


def _check_display_rendering(display):
    """Verify display formatting renders known raw strings as readable text.

    Raises:
        SystemExit: when an expected fragment is missing or raw TeX
            delimiters (backslash or $) survive in the rendered output.
    """
    display_cases = {
        r"A Natural $\gtrsim 100\times$ Telescope at $z=1.37$": [
            "≥ 100×",
            "z=1.37",
        ],
        r"Project page \url{this https URL}": [
            "Project page this https URL",
        ],
        r"available at \href{this https URL}{GitHub}": [
            "available at GitHub",
        ],
        r"$\mathbb{R}^4$ and $\mathcal{N}=2$": [
            "R^4",
            "N=2",
        ],
    }
    for raw, fragments in display_cases.items():
        rendered = display(raw)
        for fragment in fragments:
            if fragment not in rendered:
                raise SystemExit(f"display fragment missing: {fragment!r} from {rendered!r}")
        if "\\" in rendered or "$" in rendered:
            raise SystemExit(f"display still contains raw TeX delimiters: {rendered!r}")


def _scan_titles_for_duplicates(clean, papers):
    """Scan every cleaned title for residual duplicated LaTeX fragments.

    Raises:
        SystemExit: listing up to 20 offending (id, title) pairs.
    """
    duplicate_patterns = [
        re.compile(r"(\\mathbb\{[^}]+\}\^?[^\\\s]*)\1"),
        re.compile(r"(\\mathcal\{[^}]+\}=?[0-9A-Za-z]*)\1"),
        re.compile(r"(\\mathrm\{[^}]+\}[_^]?\{?[^{}\s]*\}?)\1"),
        re.compile(r"\b(z=[0-9.]+)\1\b"),
        re.compile(r"(\$[^$]{1,120}\$)\s*\1"),
        re.compile(r"(GL\(d_1\)\\times GL\(d_2\))\1"),
    ]
    failures = []
    for paper in papers:
        cleaned_title = clean(paper.get("title", ""))
        for pattern in duplicate_patterns:
            if pattern.search(cleaned_title):
                failures.append((paper.get("arxiv_id"), cleaned_title))
                break  # one hit per paper is enough
    if failures:
        preview = "\n".join(f"{pid}: {title}" for pid, title in failures[:20])
        raise SystemExit(f"duplicated LaTeX fragments remain in cleaned titles:\n{preview}")


def _scan_display_for_raw_urls(display, papers):
    """Scan rendered display text for surviving \\href / \\url commands.

    Raises:
        SystemExit: listing up to 20 offending (id, field, text) triples.
    """
    display_failures = []
    for paper in papers:
        for field in ("title", "abstract", "comments"):
            rendered = display(paper.get(field, ""))
            if r"\href{" in rendered or r"\url{" in rendered:
                display_failures.append((paper.get("arxiv_id"), field, rendered[:160]))
    if display_failures:
        preview = "\n".join(f"{pid} {field}: {text}" for pid, field, text in display_failures[:20])
        raise SystemExit(f"raw URL TeX commands remain in display text:\n{preview}")


def main():
    """Run all arXiv metadata LaTeX regression checks.

    Exits non-zero (via SystemExit) on the first failing check; prints a
    success line when everything passes.
    """
    # The cleaning helpers live in the site package, which is not on the
    # default path when this script runs from the repo root.
    sys.path.insert(0, str(ARXIV_DIR))
    from metadata_cleaning import clean_arxiv_metadata_text as clean
    from metadata_cleaning import format_arxiv_display_text as display
    papers = json.loads(PAPERS.read_text())
    by_id = {paper.get("arxiv_id"): paper for paper in papers}

    _check_known_titles(clean, by_id)
    _check_display_rendering(display)
    _scan_titles_for_duplicates(clean, papers)
    _scan_display_for_raw_urls(display, papers)

    print("arXiv metadata LaTeX checks passed")


if __name__ == "__main__":
    main()
57 changes: 50 additions & 7 deletions sites/arxiv/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from flask_wtf import CSRFProtect
from sqlalchemy import or_, and_, func

from metadata_cleaning import clean_arxiv_metadata_text, format_arxiv_display_text

BASE_DIR = Path(__file__).parent
DB_DIR = BASE_DIR / "instance"
DB_DIR.mkdir(exist_ok=True)
Expand Down Expand Up @@ -197,11 +199,28 @@ def authors_display(self):

@property
def short_abstract(self):
    """Display abstract truncated to 280 chars (ellipsized), or a stub.

    NOTE: the pasted diff hunk interleaved the pre-change lines (reading
    self.abstract directly) with the post-change lines; only the new
    version, which truncates the LaTeX-rendered display_abstract, is kept.
    """
    abstract = self.display_abstract
    if not abstract:
        return "Abstract not available."
    if len(abstract) > 280:
        return abstract[:280] + "…"
    return abstract

@property
def display_title(self):
    """Title with LaTeX artifacts cleaned and rendered for HTML display."""
    return format_arxiv_display_text(self.title or "")

@property
def display_abstract(self):
    """Abstract with LaTeX artifacts cleaned and rendered for HTML display."""
    return format_arxiv_display_text(self.abstract or "")

@property
def display_comments(self):
    """Comments field with LaTeX artifacts cleaned for HTML display."""
    return format_arxiv_display_text(self.comments or "")

@property
def display_journal_ref(self):
    """Journal reference with LaTeX artifacts cleaned for HTML display."""
    return format_arxiv_display_text(self.journal_ref or "")

@property
def subject_list(self):
Expand Down Expand Up @@ -443,6 +462,10 @@ def _synthesize_abstract(title: str, subject_code: str,
f"experiments.")


def _clean_arxiv_metadata_text(text):
    # Module-local seam delegating to metadata_cleaning.clean_arxiv_metadata_text;
    # seeding and backfill code below calls this name rather than the import.
    return clean_arxiv_metadata_text(text)


def seed_database():
"""Populate the DB from categories.json + papers.json."""
if Category.query.first() is not None:
Expand Down Expand Up @@ -521,7 +544,7 @@ def seed_database():
if primary_category not in primary_cats and subject_code in primary_cats:
primary_category = subject_code
# Titles
title = rp.get("title", "").strip()
title = _clean_arxiv_metadata_text(rp.get("title", "").strip())
if not title:
continue
# Parse date, falling back to arxiv-id-encoded yymm (e.g. 2604.08525 -> 2026-04)
Expand All @@ -547,7 +570,7 @@ def seed_database():
if not authors:
authors = _synthesize_authors(arxiv_id)
# Parse figures, tables, formulas counts from comments
cmt = rp.get("comments", "")
cmt = _clean_arxiv_metadata_text(rp.get("comments", ""))
figs = 0
tbls = 0
frms = 0
Expand All @@ -564,7 +587,7 @@ def seed_database():
versions = rp.get("versions", [])
# Loss function from abstract
loss_fn = ""
abs_text = rp.get("abstract", "") or ""
abs_text = _clean_arxiv_metadata_text(rp.get("abstract", "") or "")
# Backfill empty abstracts for high-traffic categories so the /abs
# and listing pages always surface something meaningful.
if not abs_text:
Expand Down Expand Up @@ -2444,6 +2467,25 @@ def backfill_paper_gaps():
print(f" ! backfill_paper_gaps failed: {e}")


def normalize_paper_metadata():
    """Normalize known duplicated LaTeX/text fragments in existing DB rows.

    Re-runs the metadata cleaner over every stored paper so rows seeded
    before the cleanup logic existed are repaired in place.  Counts each
    changed field and commits once at the end.
    """
    try:
        changed = 0
        for paper in Paper.query.all():
            for field in ("title", "abstract", "comments"):
                current = getattr(paper, field) or ""
                cleaned = _clean_arxiv_metadata_text(current)
                if cleaned != current:
                    setattr(paper, field, cleaned)
                    changed += 1
        # Only touch the session when at least one field actually changed.
        if changed:
            db.session.commit()
            print(f" [+] Normalized {changed} arXiv metadata fields")
    except Exception as e:
        # Best-effort startup maintenance: roll back and report, never crash.
        db.session.rollback()
        print(f" ! normalize_paper_metadata failed: {e}")


def ensure_affiliation_column():
"""Ensure the author_affiliations_json column exists on older DBs."""
try:
Expand Down Expand Up @@ -2495,6 +2537,7 @@ def backfill_affiliations():
seed_database()
seed_benchmark_users()
backfill_paper_gaps()
normalize_paper_metadata()
backfill_affiliations()


Expand Down
151 changes: 151 additions & 0 deletions sites/arxiv/metadata_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""Metadata cleanup helpers for arXiv seed data."""

import re


# A LaTeX command (optional braced argument, optional sub/superscripts), an
# optional "=value" tail, an optional connective (\times, \rightarrow, "to",
# "-", "+"), and an optional trailing operand — immediately repeated once,
# e.g. "\mathbb{R}^4\mathbb{R}^4".
_DUP_LATEX_FRAGMENT_RE = re.compile(
    r"("
    r"(?:\\[A-Za-z]+(?:\{[^{}]*\})?(?:[_^]\{?[^{}\s]+\}?)*)"
    r"(?:=[0-9A-Za-z_.+-]+)?"
    r"(?:\s*(?:\\times|\\rightarrow|to|-|\+)\s*)?"
    r"(?:[A-Za-z0-9_.{}^+\-/]+)?"
    r")\1"
)
# One to six plain words repeated back-to-back with no separator between the
# two copies, e.g. "GravityGravity".
_DUP_PLAIN_FRAGMENT_RE = re.compile(
    r"\b([A-Za-z][A-Za-z0-9_.+-]{1,}(?:\s+[A-Za-z0-9_.+-]{1,}){0,5})\1\b"
)
# Doubled redshift values, e.g. "z=1.37z=1.37".
_DUP_Z_VALUE_RE = re.compile(r"\b(z\s*=\s*[0-9.]+)\1\b")
# Doubled numeric superscripts (isotopes), e.g. "^{26}^{26}".
# NOTE(review): the second copy sits in a zero-width lookahead, so callers
# must delete the consumed copy for the dedup to take effect — substituting
# the group back reproduces the match verbatim.
_DUP_ISOTOPE_RE = re.compile(r"(\^\{?\d+\}?)(?=\1)")
# A "+" following an ion superscript when another "+" follows it (the
# trailing "+" is a lookahead — see the NOTE above about replacements).
_DUP_ION_PLUS_RE = re.compile(r"(\^[A-Za-z0-9_/{}+-]+)\+(?=\+)")
# Doubled group products, e.g. "GL(d_1)\times GL(d_2)" repeated twice.
_DUP_TIMES_EXPR_RE = re.compile(r"([A-Za-z]+\([^)]{1,30}\)\\times [A-Za-z]+\([^)]{1,30}\))\1")
# The same $...$ math span repeated, optionally space-separated.
_DUP_ADJACENT_MATH_RE = re.compile(r"(\$[^$]{1,120}\$)\s*\1")
# \href{url}{label} (group 2 is the label) and \url{url} commands.
_HREF_RE = re.compile(r"\\href\{([^{}]*)\}\{([^{}]*)\}")
_URL_RE = re.compile(r"\\url\{([^{}]*)\}")

# Lowercase Greek-letter command names -> Unicode characters.
_GREEK = {
    "alpha": "α",
    "beta": "β",
    "gamma": "γ",
    "delta": "δ",
    "epsilon": "ε",
    "lambda": "λ",
    "mu": "μ",
    "nu": "ν",
    "phi": "φ",
    "pi": "π",
    "sigma": "σ",
    "theta": "θ",
    "upsilon": "υ",
    "zeta": "ζ",
}

# Common TeX symbol commands -> plain-text/Unicode replacements used when
# rendering math for display.  ("epsilon" also appears in _GREEK with the
# same value, so the update below is harmless.)
_SYMBOLS = {
    "approx": "≈",
    "bullet": "•",
    "cdot": "·",
    "epsilon": "ε",
    "geq": "≥",
    "gt": ">",
    "gtrsim": "≥",
    "in": "∈",
    "infty": "∞",
    "leq": "≤",
    "log": "log",
    "lt": "<",
    "nabla": "∇",
    "odot": "⊙",
    "pm": "±",
    "prime": "′",
    "rightarrow": "→",
    "sim": "∼",
    "sqrt": "√",
    "textendash": "–",
    "times": "×",
    "to": "→",
}
_SYMBOLS.update(_GREEK)


def clean_arxiv_metadata_text(text):
    """Repair scrape artifacts from mixed text/MathJax extraction.

    Collapses duplicated LaTeX/plain fragments (produced when both the raw
    TeX source and the rendered MathJax text were captured), unwraps
    \\href/\\url commands, and re-wraps a few known math fragments in $...$.

    Args:
        text: Raw metadata string (title/abstract/comments); may be falsy.

    Returns:
        The cleaned string, or ``text`` unchanged when it is falsy.
    """
    if not text:
        return text
    # Normalize whitespace first so the duplicate-detection regexes see one
    # canonical spacing.
    cleaned = re.sub(r"\s+", " ", str(text)).strip()
    cleaned = _HREF_RE.sub(r"\2", cleaned)  # \href{url}{label} -> label
    cleaned = _URL_RE.sub(r"\1", cleaned)   # \url{url} -> url
    # Iterate to a fixpoint: collapsing one duplication can expose another.
    previous = None
    while cleaned != previous:
        previous = cleaned
        cleaned = _DUP_LATEX_FRAGMENT_RE.sub(r"\1", cleaned)
        cleaned = _DUP_Z_VALUE_RE.sub(r"\1", cleaned)
        cleaned = _DUP_TIMES_EXPR_RE.sub(r"\1", cleaned)
        cleaned = _DUP_PLAIN_FRAGMENT_RE.sub(r"\1", cleaned)
        # BUG FIX: these two patterns hold the duplicate copy in a zero-width
        # lookahead, so the previous replacements (r"\1" and r"\1+") wrote the
        # consumed text back verbatim — provable no-ops that never deduped
        # anything.  Deleting the consumed copy (the lookahead copy survives)
        # performs the intended collapse: "^{26}^{26}" -> "^{26}",
        # "^Na++" -> "^Na+".
        cleaned = _DUP_ISOTOPE_RE.sub("", cleaned)
        cleaned = _DUP_ION_PLUS_RE.sub(r"\1", cleaned)
    # Targeted repairs for known mangled fragments.
    cleaned = re.sub(r"(\\mathcal\{N\}=)(\d+)\s+to\s+(\d+)", r"\1\2 to \3", cleaned)
    cleaned = re.sub(r"\bCFT_(\d+)_(?=\d+\b)", r"CFT_", cleaned)
    cleaned = re.sub(r"\bAdS_(\d+)_(?=\d+\b)", r"AdS_", cleaned)
    # Re-wrap bare math fragments in $...$; the \gtrsim case only fires when
    # no math delimiters exist yet, to avoid nesting inside existing $...$.
    if r"\gtrsim 100\times" in cleaned and "$" not in cleaned:
        cleaned = cleaned.replace(r"\gtrsim 100\times", r"$\gtrsim 100\times$")
    cleaned = re.sub(r"(?<!\$)\bz=([0-9]+(?:\.[0-9]+)?)(?!\$)", r"$z=\1$", cleaned)
    # Collapse identical adjacent $...$ spans created by the wrapping above.
    previous = None
    while cleaned != previous:
        previous = cleaned
        cleaned = _DUP_ADJACENT_MATH_RE.sub(r"\1", cleaned)
    return cleaned


def _replace_tex_command(match):
    """Render one matched TeX command (plus optional braced arg) as text.

    Expects a match of ``\\([A-Za-z]+)(?:\{([^{}]*)\})?``: group 1 is the
    command name, group 2 the braced argument (or None).
    """
    command = match.group(1)
    braced = match.group(2)
    arg = braced or ""
    # Symbol commands map straight to Unicode, keeping any argument text.
    if command in _SYMBOLS:
        return _SYMBOLS[command] + arg
    # Small caps: upper-case the argument.
    if command == "textsc":
        return arg.upper()
    # Font/style wrappers: keep the argument, drop the command (even when
    # there is no argument at all).
    if command in {
        "mathbb", "mathcal", "mathrm", "mathbf", "textbf", "textit",
        "texttt", "emph", "operatorname", "mathsf", "mathit", "mathscr",
    }:
        return arg
    # Unknown command: keep its argument if present, else leave it as TeX.
    return braced if braced else "\\" + command


def _format_math_text(expr):
    """Translate a TeX math snippet into readable plain text."""
    # Spacing commands and ties become plain spacing before anything else.
    for tex, plain in ((r"\,", ""), (r"\ ", " "), ("~", " ")):
        expr = expr.replace(tex, plain)
    # Expand \command{arg} via the symbol table, then unwrap ^{...}/_{...}.
    expr = re.sub(r"\\([A-Za-z]+)(?:\{([^{}]*)\})?", _replace_tex_command, expr)
    expr = re.sub(r"\^\{([^{}]+)\}", r"^\1", expr)
    expr = re.sub(r"_\{([^{}]+)\}", r"_\1", expr)
    # Drop leftover grouping braces and renormalize whitespace.
    expr = expr.replace("{", "").replace("}", "")
    return re.sub(r"\s+", " ", expr).strip()


def format_arxiv_display_text(text):
    """Return metadata text formatted for human-readable HTML display.

    Cleans scrape artifacts, then renders $...$ spans and any remaining
    bare TeX commands into plain Unicode text.
    """
    if not text:
        return text
    formatted = clean_arxiv_metadata_text(text)
    # Escaped dollars become literal ones so the math regex sees delimiters.
    formatted = formatted.replace(r"\$", "$")
    # clean_arxiv_metadata_text already unwraps these; repeated defensively.
    formatted = _URL_RE.sub(r"\1", _HREF_RE.sub(r"\2", formatted))
    # Render delimited math spans first, then sweep TeX left outside $...$.
    formatted = re.sub(
        r"\$([^$]{1,240})\$",
        lambda m: _format_math_text(m.group(1)),
        formatted,
    )
    formatted = _format_math_text(formatted)
    return re.sub(r"\s+", " ", formatted).strip()
2 changes: 1 addition & 1 deletion sites/arxiv/templates/account.html
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ <h2>Recent Library</h2>
{% for item in recent_library %}
<li class="library-item">
<div class="lib-title">
<a href="{{ url_for('paper_detail', arxiv_id=item.paper.arxiv_id) }}">{{ item.paper.title }}</a>
<a href="{{ url_for('paper_detail', arxiv_id=item.paper.arxiv_id) }}">{{ item.paper.display_title }}</a>
</div>
<div class="lib-meta">{{ item.paper.authors_display }}</div>
<div class="lib-meta muted">
Expand Down
2 changes: 1 addition & 1 deletion sites/arxiv/templates/author.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ <h1>{{ name }}'s articles on arXiv</h1>
<a href="{{ url_for('format_view', arxiv_id=p.arxiv_id) }}">other</a>]
</dt>
<dd class="paper-entry">
<div class="entry-title"><a href="{{ url_for('paper_detail', arxiv_id=p.arxiv_id) }}">{{ p.title }}</a></div>
<div class="entry-title"><a href="{{ url_for('paper_detail', arxiv_id=p.arxiv_id) }}">{{ p.display_title }}</a></div>
<div class="entry-authors"><b>Authors:</b> {{ p.authors_display }}</div>
<div class="entry-subjects"><b>Subjects:</b> {{ p.primary_subject }}</div>
</dd>
Expand Down
Loading