diff --git a/.linkcheckerrc-special.yaml b/.linkcheckerrc-special.yaml new file mode 100644 index 00000000..f8864f0c --- /dev/null +++ b/.linkcheckerrc-special.yaml @@ -0,0 +1,10 @@ +drop: + https://doi.org: 300 + https://anaconda.org: 403 + https://claude.ai: 403 + https://idm.uab.edu/cgi-cas/xrmi/sites: 423 + https://idm.uab.edu/cgi-cas/xrmi/users: 423 +replace: + 200 OK: [300 Redirect, result] + ConnectTimeout: [408 Timeout, result] + https://padlock.idm.uab.edu: [423 Locked, url-after-redirection] diff --git a/.ruff.toml b/.ruff.toml index 45ab4afd..40615af5 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -5,6 +5,20 @@ required-version = ">=0.15.6" show-fixes = true [lint] +per-file-ignores = { "test/*" = [ + "ANN201", + "ANN202", + "D101", + "D102", + "D100", + "PT", +], "**/*.ipynb" = [ + "T201", + "ANN401", +], "**/__init__.py" = [ + "D104", +] } + ignore = [ "D203", # prefer conflicting D211 "D213", # prefer conflicting D212 @@ -21,19 +35,6 @@ ignore = [ "Q003", # END ] select = ["ALL"] -per-file-ignores = { "test/*" = [ - "ANN201", - "ANN202", - "D101", - "D102", - "D100", - "PT", -], "**/*.ipynb" = [ - "T201", - "ANN401", -], "**/__init__.py" = [ - "D104", -] } [format] indent-style = "space" diff --git a/docs/account/code.rc/index.md b/docs/account/code.rc/index.md index f50b16e6..9a0c230c 100644 --- a/docs/account/code.rc/index.md +++ b/docs/account/code.rc/index.md @@ -5,7 +5,7 @@ hide: # Create and Manage Code.rc (GitLab) Accounts -UAB Research Computing maintains an on-premises GitLab server, part of the Research Computing System (RCS), called **Code.rc**. Generally speaking, [GitLab](https://about.gitlab.com/) is a service designed for collaborating on software development projects and is similar in structure and purpose to [GitHub](https://github.com/). In contrast to the Git hosting services [GitLab.com](https://gitlab.com) and [GitHub.com](https://github.com), Code.rc is hosted on-premises and stored in a secure physical environment on UAB Campus. +UAB Research Computing maintains an on-premises GitLab server, part of the Research Computing System (RCS), called **Code.rc**. Generally speaking, [GitLab](https://about.gitlab.com/) is a service designed for collaborating on software development projects and is similar in structure and purpose to [GitHub](https://github.com/). In contrast to the Git hosting services [GitLab.com](https://about.gitlab.com) and [GitHub.com](https://github.com), Code.rc is hosted on-premises and stored in a secure physical environment on UAB Campus. !!! important diff --git a/docs/data_management/storage/index.md b/docs/data_management/storage/index.md index a8e567a0..cdd89cf2 100644 --- a/docs/data_management/storage/index.md +++ b/docs/data_management/storage/index.md @@ -190,7 +190,7 @@ Periodically review permissions, clean up unused data, and follow institutional At this time, Research Computing does not offer a method of archival. If you have need for archival, please feel free to contact [Support](../../help/support.md) to start a conversation. -A possible external resource for archival is available through University of Oklahoma (OU) Supercomputing Center for Education and Research (OSCER). Please see the following link for details: . +A possible external resource for archival is available through University of Oklahoma (OU) Supercomputing Center for Education and Research (OSCER). Please see the following link for details: . ### Backups diff --git a/verification_scripts/linkchecker.py b/verification_scripts/linkchecker.py index 4216fb77..54690f81 100644 --- a/verification_scripts/linkchecker.py +++ b/verification_scripts/linkchecker.py @@ -1,4 +1,9 @@ -"""Runs linkchecker on docs and produces human-readable output.""" +"""Runs linkchecker on docs and produces human-readable output. + +Install with `pip install -r requirements-dev.txt`. + +Use with `python ./verification_scripts/linkchecker.py`. +""" from __future__ import annotations @@ -9,23 +14,7 @@ import pandas as pd import yaml - -""" -How to use: - -python ./scripts/linkchecker.py -""" - -# Cleans up output of linkchecker - -OUTPUT = PurePath("out") -Path(OUTPUT).mkdir(exist_ok=True) - -# FILE PATHS -LINKCHECKER_LOG = OUTPUT / "linkchecker.log" -LINKCHECKER_RAW_CSV = OUTPUT / "linkchecker-raw.csv" -LINKCHECKER_OUT_CSV = OUTPUT / "linkchecker-out.csv" -LINKCHECKER_OUT_YAML = OUTPUT / "linkchecker-out.yml" +from attrs import define # COLUMNS ## ORIGINAL @@ -41,9 +30,10 @@ MARKDOWN_FILE = "document" -def run_linkchecker() -> None: +# READ +def _run_linkchecker(path: PurePath) -> None: """Run the linkchecker application.""" - with Path(LINKCHECKER_LOG).open("wb", buffering=0) as f: + with Path(path).open("wb", buffering=0) as f: subprocess.run( # noqa: S603 [_get_linkchecker_path(), "--config", ".linkcheckerrc", "docs"], stdout=f, @@ -51,9 +41,14 @@ def run_linkchecker() -> None: ) -def load_output() -> pd.DataFrame: +def _get_linkchecker_path() -> PurePath: + return PurePath(sys.executable).parent / "Scripts" / "linkchecker" + + +# PROCESS +def _load_results(path: PurePath) -> pd.DataFrame: """Load the raw linkchecker output dataframe.""" - raw_linkchecker_data = pd.read_csv(LINKCHECKER_RAW_CSV) + raw_linkchecker_data = pd.read_csv(path) raw_linkchecker_data = raw_linkchecker_data[ [RESULT, URLNAME, URL, PARENTNAME, LINE, COLUMN] ] @@ -66,7 +61,77 @@ def load_output() -> pd.DataFrame: ) -def replace_rows( +def _drop_ok_with_no_redirects(_df: pd.DataFrame) -> pd.DataFrame: + """Drop rows with OK code (200) if there is no redirection.""" + same_url = _df[URL_IN_MARKDOWN] == _df[URL_AFTER_REDIRECTION] + result_ok = _df[RESULT].str.startswith("200") + drop = same_url & result_ok + return _df[~drop] + + +@define +class Drop: + """Information about rows to drop from linkchecker output.""" + + url: str + code: str + + +@define +class Replace: + """Information about rows to replace in linkchecker output.""" + + find: str + replace: str + where: str + + +@define +class Cases: + """All special case information.""" + + drops: list[Drop] + replacements: list[Replace] + + +def _read_special_cases() -> Cases: + with Path(".linkcheckerrc-special.yaml").open("r") as f: + data = yaml.safe_load(f) + + drops = [Drop(url, str(code)) for url, code in data["drop"].items()] + replaces = [Replace(pattern, v[0], v[1]) for pattern, v in data["replace"].items()] + return Cases(drops, replaces) + + +def _file_uris_to_paths(_s: pd.Series) -> pd.Series: + """Modify file URIs to a normalized format. + + Example: + file:///D|/repos/uabrc.github.io/dir/file.md -> dir/file.md + + """ + if _s.empty: + return _s + + keep = _s.str.startswith("file:") & _s.str.contains("repos/uabrc.github.io") + splits = _s.str.split("repos/uabrc.github.io", expand=True) + + fixes = splits.iloc[:, -1][keep] + fixes = fixes.apply(PurePath) # type: ignore[reportCallIssue,reportArgumentType] + fixes = fixes.astype(str) + fixes = fixes.str.lstrip(os.sep) + + out = _s.copy() + out[keep] = fixes + return out + + +def _find_rows_containing(_s: pd.Series, _containing: str) -> pd.Series: + """Find rows containing the supplied string in the supplied series.""" + return _s.str.contains(_containing) + + +def _replace_rows( _s: pd.Series, _containing: str, _with: str, @@ -87,15 +152,7 @@ def replace_rows( return out -def drop_ok_with_no_redirects(_df: pd.DataFrame) -> pd.DataFrame: - """Drop rows with OK code (200) if there is no redirection.""" - same_url = _df[URL_IN_MARKDOWN] == _df[URL_AFTER_REDIRECTION] - result_ok = _df[RESULT].str.startswith("200") - drop = same_url & result_ok - return _df[~drop] - - -def drop_rows( +def _drop_rows( _df: pd.DataFrame, _in: str, _containing: str, @@ -115,91 +172,54 @@ def drop_rows( return _df[~contains] -def modify_file_uris(_s: pd.Series) -> pd.Series: - """Modify file URIs to a normalized format. - - Example: - file:///D|/repos/uabrc.github.io/dir/file.md -> dir/file.md - - """ - keep = _s.str.startswith("file:") & _s.str.contains("repos/uabrc.github.io") - splits = _s.str.split("repos/uabrc.github.io", expand=True) +def _handle_special_cases(results: pd.DataFrame) -> pd.DataFrame: + cases = _read_special_cases() + for replace in cases.replacements: + results[RESULT] = _replace_rows( + results[RESULT], + replace.find, + replace.replace, + find_in=results[replace.where], + ) - fixes = splits.iloc[:, -1][keep] - fixes = fixes.apply(PurePath) # pyright: ignore[reportCallIssue,reportArgumentType] - fixes = fixes.astype(str) - fixes = fixes.str.lstrip(os.sep) + for drop in cases.drops: + results = _drop_rows(results, URL_IN_MARKDOWN, drop.url, drop.code) - out = _s.copy() - out[keep] = fixes - return out + return results -def _find_rows_containing(_s: pd.Series, _containing: str) -> pd.Series: - """Find rows containing the supplied string in the supplied series.""" - return _s.str.contains(_containing) +# WRITE +def _to_csv(results: pd.DataFrame, path: PurePath) -> None: + results.to_csv(path, index=False) -def _get_linkchecker_path() -> PurePath: - return PurePath(sys.executable).parent / "Scripts" / "linkchecker" +def _to_yaml(results: pd.DataFrame, path: PurePath) -> None: + records = results.to_dict(orient="records") if not results.empty else "" + with Path(path).open("w") as f: + yaml.safe_dump(records, f, sort_keys=False) -if __name__ == "__main__": - run_linkchecker() - results = load_output() - - ### drop good urls - results = drop_ok_with_no_redirects(results) - - ### replace unhelpful error messages - # change 200 OK to 300 Redirect for human clarity on successful redirects - results[RESULT] = replace_rows(results[RESULT], "200 OK", "300 Redirect") - # replace long error messages with short codes - results[RESULT] = replace_rows(results[RESULT], "ConnectTimeout", "408 Timeout") - # special code for SSO urls - results[RESULT] = replace_rows( - results[RESULT], - "https://padlock.idm.uab.edu", - "423 Locked", - find_in=results[URL_AFTER_REDIRECTION], - ) +# ENTRY POINT +def main() -> None: + """Primary entrypoint.""" + # config + output_path = PurePath("out") + Path(output_path).mkdir(exist_ok=True) - ### special url ignore rules - # doi.org always redirects, that's its purpose, so we ignore - results = drop_rows( - results, - URL_IN_MARKDOWN, - "https://doi.org", - if_result_code="300", - ) - # if anaconda.org goes down we'll surely hear about it - results = drop_rows( - results, - URL_IN_MARKDOWN, - "https://anaconda.org", - if_result_code="403", - ) - # UAB specific requiring login - results = drop_rows( - results, - URL_IN_MARKDOWN, - "https://idm.uab.edu/cgi-cas/xrmi/sites", - if_result_code="423", - ) + # generate input + _run_linkchecker(output_path / "linkchecker.log") + results = _load_results(output_path / "linkchecker-raw.csv") - ### modify file uris to improve readability - results[MARKDOWN_FILE] = modify_file_uris(results[MARKDOWN_FILE]) + # process + results = _drop_ok_with_no_redirects(results) + results = _handle_special_cases(results) + results[MARKDOWN_FILE] = _file_uris_to_paths(results[MARKDOWN_FILE]) + results = results.sort_values(by=[RESULT, MARKDOWN_FILE, LINE, COLUMN]) - ### organize - results = results.sort_values( - by=[RESULT, URL_IN_MARKDOWN, MARKDOWN_FILE, LINE, COLUMN], - ) + # write output + _to_csv(results, output_path / "linkchecker-out.csv") + _to_yaml(results, output_path / "linkchecker-out.yml") - ### output - # csv - results.to_csv(LINKCHECKER_OUT_CSV, index=False) - # yml - records = results.to_dict(orient="records") - with Path(LINKCHECKER_OUT_YAML).open("w") as f: - yaml.safe_dump(records, f, sort_keys=False) +if __name__ == "__main__": + main()