Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .linkcheckerrc-special.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
drop:
https://doi.org: 300
https://anaconda.org: 403
https://claude.ai: 403
https://idm.uab.edu/cgi-cas/xrmi/sites: 423
https://idm.uab.edu/cgi-cas/xrmi/users: 423
replace:
200 OK: [300 Redirect, result]
ConnectTimeout: [408 Timeout, result]
https://padlock.idm.uab.edu: [423 Locked, url-after-redirection]
27 changes: 14 additions & 13 deletions .ruff.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,20 @@ required-version = ">=0.15.6"
show-fixes = true

[lint]
per-file-ignores = { "test/*" = [
"ANN201",
"ANN202",
"D101",
"D102",
"D100",
"PT",
], "**/*.ipynb" = [
"T201",
"ANN401",
], "**/__init__.py" = [
"D104",
] }

ignore = [
"D203", # prefer conflicting D211
"D213", # prefer conflicting D212
Expand All @@ -21,19 +35,6 @@ ignore = [
"Q003", # END
]
select = ["ALL"]
per-file-ignores = { "test/*" = [
"ANN201",
"ANN202",
"D101",
"D102",
"D100",
"PT",
], "**/*.ipynb" = [
"T201",
"ANN401",
], "**/__init__.py" = [
"D104",
] }

[format]
indent-style = "space"
Expand Down
2 changes: 1 addition & 1 deletion docs/account/code.rc/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ hide:

# Create and Manage Code.rc (GitLab) Accounts

UAB Research Computing maintains an on-premises GitLab server, part of the Research Computing System (RCS), called **Code.rc**. Generally speaking, [GitLab](https://about.gitlab.com/) is a service designed for collaborating on software development projects and is similar in structure and purpose to [GitHub](https://github.com/). In contrast to the Git hosting services [GitLab.com](https://gitlab.com) and [GitHub.com](https://github.com), Code.rc is hosted on-premises and stored in a secure physical environment on UAB Campus.
UAB Research Computing maintains an on-premises GitLab server, part of the Research Computing System (RCS), called **Code.rc**. Generally speaking, [GitLab](https://about.gitlab.com/) is a service designed for collaborating on software development projects and is similar in structure and purpose to [GitHub](https://github.com/). In contrast to the Git hosting services [GitLab.com](https://about.gitlab.com) and [GitHub.com](https://github.com), Code.rc is hosted on-premises and stored in a secure physical environment on UAB Campus.

<!-- markdownlint-disable MD046 -->
!!! important
Expand Down
2 changes: 1 addition & 1 deletion docs/data_management/storage/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ Periodically review permissions, clean up unused data, and follow institutional

At this time, Research Computing does not offer a method of archival. If you have need for archival, please feel free to contact [Support](../../help/support.md) to start a conversation.

A possible external resource for archival is available through University of Oklahoma (OU) Supercomputing Center for Education and Research (OSCER). Please see the following link for details: <https://www.ou.edu/oscer/resources/ourrstore--ou---regional-research-store>.
A possible external resource for archival is available through University of Oklahoma (OU) Supercomputing Center for Education and Research (OSCER). Please see the following link for details: <https://ou.edu/oscer/storage/ourrstore--ou---regional-research-store>.

### Backups

Expand Down
234 changes: 127 additions & 107 deletions verification_scripts/linkchecker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
"""Runs linkchecker on docs and produces human-readable output."""
"""Runs linkchecker on docs and produces human-readable output.

Install with `pip install -r requirements-dev.txt`.

Use with `python ./verification_scripts/linkchecker.py`.
"""

from __future__ import annotations

Expand All @@ -9,23 +14,7 @@

import pandas as pd
import yaml

"""
How to use:

python ./scripts/linkchecker.py
"""

# Cleans up output of linkchecker

OUTPUT = PurePath("out")
Path(OUTPUT).mkdir(exist_ok=True)

# FILE PATHS
LINKCHECKER_LOG = OUTPUT / "linkchecker.log"
LINKCHECKER_RAW_CSV = OUTPUT / "linkchecker-raw.csv"
LINKCHECKER_OUT_CSV = OUTPUT / "linkchecker-out.csv"
LINKCHECKER_OUT_YAML = OUTPUT / "linkchecker-out.yml"
from attrs import define

# COLUMNS
## ORIGINAL
Expand All @@ -41,19 +30,25 @@
MARKDOWN_FILE = "document"


def run_linkchecker() -> None:
# READ
def _run_linkchecker(path: PurePath) -> None:
"""Run the linkchecker application."""
with Path(LINKCHECKER_LOG).open("wb", buffering=0) as f:
with Path(path).open("wb", buffering=0) as f:
subprocess.run( # noqa: S603
[_get_linkchecker_path(), "--config", ".linkcheckerrc", "docs"],
stdout=f,
check=False,
)


def load_output() -> pd.DataFrame:
def _get_linkchecker_path() -> PurePath:
    """Return the expected path of the ``linkchecker`` executable.

    The path points into the scripts directory of the running interpreter's
    installation scheme (``Scripts`` on Windows, ``bin`` on POSIX). A
    hard-coded ``Scripts`` folder next to the interpreter only matches some
    Windows layouts; asking ``sysconfig`` gives the same answer there and a
    usable one elsewhere.

    The file is not checked for existence.
    """
    # Function-scope import keeps this change self-contained (stdlib only).
    import sysconfig  # noqa: PLC0415

    return PurePath(sysconfig.get_path("scripts")) / "linkchecker"


# PROCESS
def _load_results(path: PurePath) -> pd.DataFrame:
"""Load the raw linkchecker output dataframe."""
raw_linkchecker_data = pd.read_csv(LINKCHECKER_RAW_CSV)
raw_linkchecker_data = pd.read_csv(path)
raw_linkchecker_data = raw_linkchecker_data[
[RESULT, URLNAME, URL, PARENTNAME, LINE, COLUMN]
]
Expand All @@ -66,7 +61,77 @@ def load_output() -> pd.DataFrame:
)


def replace_rows(
def _drop_ok_with_no_redirects(_df: pd.DataFrame) -> pd.DataFrame:
    """Remove links that resolved with a 200 result and were not redirected.

    A row is removed only when both conditions hold: the URL written in the
    markdown equals the URL after redirection, and the result text starts
    with "200". All other rows are kept.
    """
    not_redirected = _df[URL_IN_MARKDOWN] == _df[URL_AFTER_REDIRECTION]
    is_ok = _df[RESULT].str.startswith("200")
    return _df[~(not_redirected & is_ok)]


@define
class Drop:
    """One rule describing rows to remove from the linkchecker results.

    Built from a single ``url: code`` entry of the ``drop`` mapping in the
    special-cases file.
    """

    # URL text to look for in the markdown URL column.
    url: str
    # Result code that must accompany the URL, stored as a string.
    code: str


@define
class Replace:
    """One rule describing how to rewrite result text in linkchecker output.

    Built from a single ``pattern: [replacement, column]`` entry of the
    ``replace`` mapping in the special-cases file.
    """

    # Text to search for.
    find: str
    # Result text written to the rows where `find` was located.
    replace: str
    # Name of the results column that is searched for `find`.
    where: str


@define
class Cases:
    """Container bundling every special-case rule read from configuration."""

    # Rules for rows that are removed from the results.
    drops: list[Drop]
    # Rules for rows whose result text is rewritten.
    replacements: list[Replace]


def _read_special_cases() -> Cases:
    """Load the drop and replace rules from ``.linkcheckerrc-special.yaml``.

    The file is opened relative to the current working directory. It is
    expected to hold two optional mappings:

    * ``drop``: ``url -> result code``; each code is converted to ``str``.
    * ``replace``: ``pattern -> [replacement, column]``.

    An empty file, or a section that is missing or has no entries, yields an
    empty rule list instead of raising.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with Path(".linkcheckerrc-special.yaml").open("r", encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    # A bare `drop:` / `replace:` key parses to None, so fall back to {}.
    drop_rules = data.get("drop") or {}
    replace_rules = data.get("replace") or {}

    drops = [Drop(url, str(code)) for url, code in drop_rules.items()]
    replaces = [
        Replace(pattern, spec[0], spec[1]) for pattern, spec in replace_rules.items()
    ]
    return Cases(drops, replaces)


def _file_uris_to_paths(_s: pd.Series) -> pd.Series:
    """Rewrite local file URIs as repository-relative paths.

    Entries that start with ``file:`` and contain ``repos/uabrc.github.io``
    are replaced by the text after the last occurrence of that marker,
    normalized through ``PurePath`` with leading path separators removed.
    Every other entry, including missing values, is returned unchanged.

    Example:
        file:///D|/repos/uabrc.github.io/dir/file.md -> dir/file.md

    """
    if _s.empty:
        return _s

    marker = "repos/uabrc.github.io"

    # regex=False: the marker contains ".", which must match literally.
    # na=False: missing values are never file URIs and keep the mask boolean.
    is_file_uri = _s.str.startswith("file:", na=False)
    has_marker = _s.str.contains(marker, regex=False, na=False)
    keep = is_file_uri & has_marker

    # rsplit matches literally; n=1 keeps only the text after the last marker,
    # so rows with a different number of occurrences cannot produce None.
    tails = _s[keep].str.rsplit(marker, n=1).str[-1]
    fixes = tails.map(lambda tail: str(PurePath(tail)).lstrip(os.sep))

    out = _s.copy()
    out[keep] = fixes
    return out


def _find_rows_containing(_s: pd.Series, _containing: str) -> pd.Series:
    """Return a boolean mask of the rows of ``_s`` that contain ``_containing``.

    The text is matched literally rather than as a regular expression, so
    characters such as ``.``, ``?`` or ``+`` (common in URLs) only match
    themselves. Missing values never match, which keeps the mask strictly
    boolean and safe to negate or use for indexing.
    """
    return _s.str.contains(_containing, regex=False, na=False)


def _replace_rows(
_s: pd.Series,
_containing: str,
_with: str,
Expand All @@ -87,15 +152,7 @@ def replace_rows(
return out


def drop_ok_with_no_redirects(_df: pd.DataFrame) -> pd.DataFrame:
    """Remove links that resolved with a 200 result and were not redirected.

    A row is removed only when the URL written in the markdown equals the URL
    after redirection and the result text starts with "200".
    """
    unchanged_url = _df[URL_IN_MARKDOWN] == _df[URL_AFTER_REDIRECTION]
    ok_result = _df[RESULT].str.startswith("200")
    return _df[~(unchanged_url & ok_result)]


def drop_rows(
def _drop_rows(
_df: pd.DataFrame,
_in: str,
_containing: str,
Expand All @@ -115,91 +172,54 @@ def drop_rows(
return _df[~contains]


def modify_file_uris(_s: pd.Series) -> pd.Series:
"""Modify file URIs to a normalized format.

Example:
file:///D|/repos/uabrc.github.io/dir/file.md -> dir/file.md

"""
keep = _s.str.startswith("file:") & _s.str.contains("repos/uabrc.github.io")
splits = _s.str.split("repos/uabrc.github.io", expand=True)
def _handle_special_cases(results: pd.DataFrame) -> pd.DataFrame:
cases = _read_special_cases()
for replace in cases.replacements:
results[RESULT] = _replace_rows(
results[RESULT],
replace.find,
replace.replace,
find_in=results[replace.where],
)

fixes = splits.iloc[:, -1][keep]
fixes = fixes.apply(PurePath) # pyright: ignore[reportCallIssue,reportArgumentType]
fixes = fixes.astype(str)
fixes = fixes.str.lstrip(os.sep)
for drop in cases.drops:
results = _drop_rows(results, URL_IN_MARKDOWN, drop.url, drop.code)

out = _s.copy()
out[keep] = fixes
return out
return results


def _find_rows_containing(_s: pd.Series, _containing: str) -> pd.Series:
    """Return a boolean mask of the rows of ``_s`` that contain ``_containing``.

    The text is matched literally rather than as a regular expression, so
    characters such as ``.``, ``?`` or ``+`` (common in URLs) only match
    themselves. Missing values never match, which keeps the mask strictly
    boolean and safe to negate or use for indexing.
    """
    return _s.str.contains(_containing, regex=False, na=False)
# WRITE
def _to_csv(results: pd.DataFrame, path: PurePath) -> None:
    """Write the results table to ``path`` as CSV, omitting the index column."""
    results.to_csv(path_or_buf=path, index=False)


def _get_linkchecker_path() -> PurePath:
    """Return the expected path of the ``linkchecker`` executable.

    The path points into the scripts directory of the running interpreter's
    installation scheme (``Scripts`` on Windows, ``bin`` on POSIX). A
    hard-coded ``Scripts`` folder next to the interpreter only matches some
    Windows layouts; asking ``sysconfig`` gives the same answer there and a
    usable one elsewhere.

    The file is not checked for existence.
    """
    # Function-scope import keeps this change self-contained (stdlib only).
    import sysconfig  # noqa: PLC0415

    return PurePath(sysconfig.get_path("scripts")) / "linkchecker"
def _to_yaml(results: pd.DataFrame, path: PurePath) -> None:
    """Write the results to ``path`` as YAML, one record per row.

    Keys are emitted in their existing order rather than sorted. When the
    table has no rows, an empty string is dumped instead of a record list.
    """
    if results.empty:
        records = ""
    else:
        records = results.to_dict(orient="records")

    with Path(path).open("w") as handle:
        yaml.safe_dump(records, handle, sort_keys=False)


if __name__ == "__main__":
run_linkchecker()
results = load_output()

### drop good urls
results = drop_ok_with_no_redirects(results)

### replace unhelpful error messages
# change 200 OK to 300 Redirect for human clarity on successful redirects
results[RESULT] = replace_rows(results[RESULT], "200 OK", "300 Redirect")
# replace long error messages with short codes
results[RESULT] = replace_rows(results[RESULT], "ConnectTimeout", "408 Timeout")
# special code for SSO urls
results[RESULT] = replace_rows(
results[RESULT],
"https://padlock.idm.uab.edu",
"423 Locked",
find_in=results[URL_AFTER_REDIRECTION],
)
# ENTRY POINT
def main() -> None:
"""Primary entrypoint."""
# config
output_path = PurePath("out")
Path(output_path).mkdir(exist_ok=True)

### special url ignore rules
# doi.org always redirects, that's its purpose, so we ignore
results = drop_rows(
results,
URL_IN_MARKDOWN,
"https://doi.org",
if_result_code="300",
)
# if anaconda.org goes down we'll surely hear about it
results = drop_rows(
results,
URL_IN_MARKDOWN,
"https://anaconda.org",
if_result_code="403",
)
# UAB specific requiring login
results = drop_rows(
results,
URL_IN_MARKDOWN,
"https://idm.uab.edu/cgi-cas/xrmi/sites",
if_result_code="423",
)
# generate input
_run_linkchecker(output_path / "linkchecker.log")
results = _load_results(output_path / "linkchecker-raw.csv")

### modify file uris to improve readability
results[MARKDOWN_FILE] = modify_file_uris(results[MARKDOWN_FILE])
# process
results = _drop_ok_with_no_redirects(results)
results = _handle_special_cases(results)
results[MARKDOWN_FILE] = _file_uris_to_paths(results[MARKDOWN_FILE])
results = results.sort_values(by=[RESULT, MARKDOWN_FILE, LINE, COLUMN])

### organize
results = results.sort_values(
by=[RESULT, URL_IN_MARKDOWN, MARKDOWN_FILE, LINE, COLUMN],
)
# write output
_to_csv(results, output_path / "linkchecker-out.csv")
_to_yaml(results, output_path / "linkchecker-out.yml")

### output
# csv
results.to_csv(LINKCHECKER_OUT_CSV, index=False)

# yml
records = results.to_dict(orient="records")
with Path(LINKCHECKER_OUT_YAML).open("w") as f:
yaml.safe_dump(records, f, sort_keys=False)
if __name__ == "__main__":
main()
Loading