From db9f645439b56d71d5c3875d30bbdd0412d5b06d Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Sat, 30 May 2026 14:45:48 +0200
Subject: [PATCH 1/2] maintenance: update setup, harden code, add tests

---
 .pre-commit-config.yaml  |  27 ------
 CONTRIBUTING.md          |   1 -
 htmldate/cli.py          |  17 ++--
 htmldate/core.py         | 191 ++++++++++++++++++---------------------
 htmldate/extractors.py   |  56 ++++++------
 htmldate/meta.py         |   1 -
 htmldate/utils.py        |  91 +++++++++----------
 htmldate/validators.py   |  69 +++++++-------
 pyproject.toml           |  14 +--
 setup.py                 |  37 --------
 tests/evaluation.py      |  23 +++--
 tests/realworld_tests.py |   5 +-
 tests/unit_tests.py      |  30 +++---
 13 files changed, 229 insertions(+), 333 deletions(-)
 delete mode 100644 .pre-commit-config.yaml
 delete mode 100644 setup.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
deleted file mode 100644
index 011dd200..00000000
--- a/.pre-commit-config.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
-    hooks:
-      - id: check-yaml
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-
-  - repo: https://github.com/psf/black
-    rev: 24.3.0
-    hooks:
-      - id: black
-
-  #- repo: https://github.com/PyCQA/flake8
-  #  rev: 7.0.0
-  #  hooks:
-  #    - id: flake8
-
-  #- repo: https://github.com/pycqa/isort
-  #  rev: 5.13.2
-  #  hooks:
-  #    - id: isort
-
-  #- repo: https://github.com/pre-commit/mirrors-mypy
-  #  rev: v1.9.0
-  #  hooks:
-  #    - id: mypy
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0f2287ed..8fd3507b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -26,7 +26,6 @@ and if there are no errors.
    - Tests with `pytest`
    - Type checking with `mypy` on the directory: `mypy htmldate/`
    - Code formatting with `black` on the directory as well
-   - Optional: install `pre-commit` to use the corresponding commit hooks
 
 
 For further questions you can use [GitHub issues](https://github.com/adbar/htmldate/issues) or [E-Mail](https://adrien.barbaresi.eu/).
diff --git a/htmldate/cli.py b/htmldate/cli.py
index 4b59d6cf..abba2e13 100644
--- a/htmldate/cli.py
+++ b/htmldate/cli.py
@@ -7,7 +7,6 @@
 import sys
 
 from platform import python_version
-from typing import Any, Optional, Union
 
 from lxml.html import HtmlElement
 
@@ -17,12 +16,12 @@
 
 
 def cli_examine(
-    htmlstring: Union[str, HtmlElement],
-    args: Any,
-) -> Optional[str]:
+    htmlstring: str | HtmlElement | None,
+    args: argparse.Namespace,
+) -> str | None:
     """Generic safeguards and triggers"""
     # safety check
-    if is_wrong_document(htmlstring):
+    if htmlstring is None or is_wrong_document(htmlstring):
         sys.stderr.write("# ERROR: document is empty or too large\n")
         return None
     return find_date(
@@ -35,7 +34,7 @@ def cli_examine(
     )
 
 
-def parse_args(args: Any) -> Any:
+def parse_args(args: list[str]) -> argparse.Namespace:
     """Define parser for command-line arguments"""
     argsparser = argparse.ArgumentParser()
     argsparser.add_argument(
@@ -67,10 +66,10 @@ def parse_args(args: Any) -> Any:
         action="version",
         version=f"Htmldate {__version__} - Python {python_version()}",
     )
-    return argsparser.parse_args()
+    return argsparser.parse_args(args)
 
 
-def process_args(args: Any) -> None:
+def process_args(args: argparse.Namespace) -> None:
     """Process the arguments passed on the command-line."""
     # verbosity
     if args.verbose:
@@ -98,7 +97,7 @@ def process_args(args: Any) -> None:
         with open(args.inputfile, mode="r", encoding="utf-8") as inputfile:
             for line in inputfile:
                 htmltext = fetch_url(line.strip())
-                result = cli_examine(htmltext, args)  # type: ignore[arg-type]
+                result = cli_examine(htmltext, args)
                 sys.stdout.write(f"{line.strip()}\t{result or 'None'}\n")
 
 
diff --git a/htmldate/core.py b/htmldate/core.py
index 98ba9188..f40489d6 100644
--- a/htmldate/core.py
+++ b/htmldate/core.py
@@ -7,10 +7,10 @@
 import re
 
 from collections import Counter
+from collections.abc import Callable
 from copy import deepcopy
 from datetime import datetime
 from functools import lru_cache, partial
-from typing import Match, Optional, Pattern, Union, Counter as Counter_Type
 
 from lxml.html import HtmlElement, tostring
 
@@ -68,7 +68,6 @@
     validate_and_convert,
 )
 
-
 LOGGER = logging.getLogger(__name__)
 
 
@@ -199,7 +198,7 @@ def logstring(element: HtmlElement) -> str:
 def examine_text(
     text: str,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     "Prepare text and try to extract a date."
     text = trim_text(text)
 
@@ -216,7 +215,7 @@ def examine_date_elements(
     tree: HtmlElement,
     expression: str,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """Check HTML elements one by one for date expressions"""
     elements = tree.xpath(expression)
     if not elements or len(elements) > MAX_POSSIBLE_CANDIDATES:
@@ -235,7 +234,7 @@ def examine_date_elements(
 def examine_header(
     tree: HtmlElement,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """
     Parse header elements to find date cues
 
@@ -353,11 +352,11 @@ def examine_header(
 
 
 def select_candidate(
-    occurrences: Counter_Type[str],
-    catch: Pattern[str],
-    yearpat: Pattern[str],
+    occurrences: Counter[str],
+    catch: re.Pattern[str],
+    yearpat: re.Pattern[str],
     options: Extractor,
-) -> Optional[Match[str]]:
+) -> re.Match[str] | None:
     """Select a candidate among the most frequent matches"""
     if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES:
         return None
@@ -381,12 +380,8 @@ def select_candidate(
         if year_match:
             years.append(year_match[1])
 
-    validation = [
-        is_valid_date(
-            datetime(int(year), 1, 1), "%Y", earliest=options.min, latest=options.max
-        )
-        for year in years
-    ]
+    min_year, max_year = options.min.year, options.max.year
+    validation = [min_year <= int(year) <= max_year for year in years]
 
     # safety net: plausibility
     if all(validation):
@@ -409,11 +404,11 @@ def select_candidate(
 
 def search_pattern(
     htmlstring: str,
-    pattern: Pattern[str],
-    catch: Pattern[str],
-    yearpat: Pattern[str],
+    pattern: re.Pattern[str],
+    catch: re.Pattern[str],
+    yearpat: re.Pattern[str],
     options: Extractor,
-) -> Optional[Match[str]]:
+) -> re.Match[str] | None:
     """Chained candidate filtering and selection"""
     candidates = plausible_year_filter(
         htmlstring,
@@ -443,7 +438,7 @@ def compare_reference(
 def examine_abbr_elements(
     tree: HtmlElement,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """Scan the page for abbr elements and check if their content contains an eligible date"""
     elements = tree.findall(".//abbr")
     if 0 < len(elements) < MAX_POSSIBLE_CANDIDATES:
@@ -500,7 +495,7 @@ def examine_abbr_elements(
 def examine_time_elements(
     tree: HtmlElement,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """Scan the page for time elements and check if their content contains an eligible date"""
     elements = tree.findall(".//time")
     if 0 < len(elements) < MAX_POSSIBLE_CANDIDATES:
@@ -521,16 +516,17 @@ def examine_time_elements(
                     LOGGER.debug("shortcut for time pubdate found: %s", datetime_attr)
                 # shortcuts: class attribute
                 elif "class" in elem.attrib:
+                    class_attr = elem.get("class", "")
                     if options.original and (
-                        elem.get("class", "").startswith("entry-date")
-                        or elem.get("class", "").startswith("entry-time")
+                        class_attr.startswith("entry-date")
+                        or class_attr.startswith("entry-time")
                     ):
                         shortcut_flag = True
                         LOGGER.debug(
                             "shortcut for time/datetime found: %s", datetime_attr
                         )
                     # updated time
-                    elif not options.original and elem.get("class") == "updated":
+                    elif not options.original and class_attr == "updated":
                         shortcut_flag = True
                         LOGGER.debug(
                             "shortcut for updated time/datetime found: %s",
@@ -562,7 +558,7 @@ def examine_time_elements(
     return None
 
 
-def normalize_match(match: Optional[Match[str]]) -> str:
+def normalize_match(match: re.Match[str] | None) -> str:
     """Normalize string output by adding "0" if necessary,
     and optionally expand the year from two to four digits."""
     day, month, year = (g.zfill(2) for g in match.groups() if g)  # type: ignore[union-attr]
@@ -571,7 +567,44 @@ def normalize_match(match: Optional[Match[str]]) -> str:
     return f"{year}-{month}-{day}"
 
 
-def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
+def normalize_two_comp(item: str) -> str:
+    """Normalize a MM-YYYY style match into a YYYY-MM-01 string."""
+    match = TWO_COMP_REGEX.match(item)
+    month = match[1].zfill(2)  # type: ignore[index]
+    return "-".join([match[2], month, "01"])  # type: ignore[index]
+
+
+def search_normalized(
+    htmlstring: str,
+    pattern: re.Pattern[str],
+    yearpat: re.Pattern[str],
+    normalizer: Callable[[str], str],
+    copyear: int,
+    options: Extractor,
+    *,
+    incomplete: bool = False,
+) -> str | None:
+    """Filter plausible years, normalize each candidate to the YMD format (merging
+    the counts of equivalent dates), then select the best match and validate it."""
+    candidates = plausible_year_filter(
+        htmlstring,
+        pattern=pattern,
+        yearpat=yearpat,
+        earliest=options.min,
+        latest=options.max,
+        incomplete=incomplete,
+    )
+    # different spellings can normalize to the same date: add up their counts
+    normalized: Counter[str] = Counter()
+    for item, count in candidates.items():
+        normalized[normalizer(item)] += count
+    bestmatch = select_candidate(normalized, YMD_PATTERN, YMD_YEAR, options)
+    return filter_ymd_candidate(
+        bestmatch, pattern, copyear, options.format, options.min, options.max
+    )
+
+
+def search_page(htmlstring: str, options: Extractor) -> str | None:
     """
     Opportunistically search the HTML text for common text patterns
 
@@ -619,7 +652,6 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         result = filter_ymd_candidate(
             bestmatch,
             patterns[0],
-            options.original,
             copyear,
             options.format,
             options.min,
@@ -629,30 +661,13 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
             return result
 
     # YYYY-MM-DD/DD-MM-YYYY
-    candidates = plausible_year_filter(
+    result = search_normalized(
         htmlstring,
-        pattern=SELECT_YMD_PATTERN,
-        yearpat=SELECT_YMD_YEAR,
-        earliest=options.min,
-        latest=options.max,
-    )
-    # revert DD-MM-YYYY patterns before sorting
-    replacement = {}
-    for item in candidates:
-        match = THREE_COMP_REGEX_A.match(item)
-        candidate = normalize_match(match)
-        replacement[candidate] = candidates[item]
-    candidates = Counter(replacement)
-    # select
-    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
-    result = filter_ymd_candidate(
-        bestmatch,
         SELECT_YMD_PATTERN,
-        options.original,
+        SELECT_YMD_YEAR,
+        lambda item: normalize_match(THREE_COMP_REGEX_A.match(item)),
         copyear,
-        options.format,
-        options.min,
-        options.max,
+        options,
     )
     if result is not None:
         return result
@@ -668,7 +683,6 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
     result = filter_ymd_candidate(
         bestmatch,
         DATESTRINGS_PATTERN,
-        options.original,
         copyear,
         options.format,
         options.min,
@@ -678,30 +692,14 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
         return result
 
     # DD?/MM?/YY
-    candidates = plausible_year_filter(
+    result = search_normalized(
         htmlstring,
-        pattern=SLASHES_PATTERN,
-        yearpat=SLASHES_YEAR,
-        earliest=options.min,
-        latest=options.max,
-        incomplete=True,
-    )
-    # revert DD-MM-YYYY patterns before sorting
-    replacement = {}
-    for item in candidates:
-        match = THREE_COMP_REGEX_B.match(item)
-        candidate = normalize_match(match)
-        replacement[candidate] = candidates[item]
-    candidates = Counter(replacement)
-    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
-    result = filter_ymd_candidate(
-        bestmatch,
         SLASHES_PATTERN,
-        options.original,
+        SLASHES_YEAR,
+        lambda item: normalize_match(THREE_COMP_REGEX_B.match(item)),
         copyear,
-        options.format,
-        options.min,
-        options.max,
+        options,
+        incomplete=True,
     )
     if result is not None:
         return result
@@ -732,44 +730,24 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
                 return result
 
     # 2 components, second option
-    candidates = plausible_year_filter(
+    result = search_normalized(
         htmlstring,
-        pattern=MMYYYY_PATTERN,
-        yearpat=MMYYYY_YEAR,
-        earliest=options.min,
-        latest=options.max,
-        incomplete=options.original,
-    )
-    # revert DD-MM-YYYY patterns before sorting
-    replacement = {}
-    for item in candidates:
-        match = TWO_COMP_REGEX.match(item)
-        month = match[1]  # type: ignore[index]
-        if len(month) == 1:
-            month = f"0{month}"
-        candidate = "-".join([match[2], month, "01"])  # type: ignore[index]
-        replacement[candidate] = candidates[item]
-    candidates = Counter(replacement)
-    # select
-    bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options)
-    result = filter_ymd_candidate(
-        bestmatch,
         MMYYYY_PATTERN,
-        options.original,
+        MMYYYY_YEAR,
+        normalize_two_comp,
         copyear,
-        options.format,
-        options.min,
-        options.max,
+        options,
+        incomplete=options.original,
     )
     if result is not None:
         return result
 
     # try full-blown text regex on all HTML?
-    dateobject = regex_parse(htmlstring)  # type: ignore[assignment]
+    text_date = regex_parse(htmlstring)
     # todo: find all candidates and disambiguate?
-    if copyear == 0 or (dateobject and dateobject.year >= copyear):
+    if copyear == 0 or (text_date and text_date.year >= copyear):
         result = validate_and_convert(
-            dateobject, options.format, earliest=options.min, latest=options.max
+            text_date, options.format, earliest=options.min, latest=options.max
         )
         if result is not None:
             return result
@@ -806,16 +784,16 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]:
 
 
 def find_date(
-    htmlobject: Union[bytes, str, HtmlElement],
+    htmlobject: bytes | str | HtmlElement,
     extensive_search: bool = True,
     original_date: bool = False,
     outputformat: str = "%Y-%m-%d",
-    url: Optional[str] = None,
+    url: str | None = None,
     verbose: bool = False,
-    min_date: Optional[Union[datetime, str]] = None,
-    max_date: Optional[Union[datetime, str]] = None,
+    min_date: datetime | str | None = None,
+    max_date: datetime | str | None = None,
     deferred_url_extractor: bool = False,
-) -> Optional[str]:
+) -> str | None:
     """
     Extract dates from HTML documents using markup analysis and text patterns
 
@@ -878,7 +856,6 @@ def find_date(
     # find_date.extensive_search = extensive_search
 
     # URL
-    url_result = None
     if url is None:
         # probe for canonical links
         urlelem = tree.find('.//link[@rel="canonical"]')
@@ -909,9 +886,13 @@ def find_date(
         return abbr_result
 
     # first, prune tree
+    # only copy the tree if the caller passed one in: when we parsed it ourselves
+    # (string/bytes/URL input) we own it and can clean it in place, avoiding a
+    # costly deepcopy of the whole document
+    pruning_tree = deepcopy(tree) if isinstance(htmlobject, HtmlElement) else tree
     try:
         search_tree, discarded = discard_unwanted(
-            clean_html(deepcopy(tree), CLEANING_LIST)
+            clean_html(pruning_tree, CLEANING_LIST)
         )
     # rare LXML error: no NULL bytes or control characters
     except ValueError:  # pragma: no cover
diff --git a/htmldate/extractors.py b/htmldate/extractors.py
index 9435c876..01ad9025 100644
--- a/htmldate/extractors.py
+++ b/htmldate/extractors.py
@@ -8,10 +8,9 @@
 
 from datetime import datetime
 from functools import lru_cache
-from typing import List, Optional, Pattern, Tuple
 
 # coverage for date parsing
-from dateparser import DateDataParser  # type: ignore  # third-party, slow
+from dateparser import DateDataParser  # type: ignore[attr-defined]  # third-party, slow
 
 from dateutil.parser import parse as dateutil_parse
 
@@ -23,7 +22,6 @@
 from .utils import Extractor, trim_text
 from .validators import convert_date, is_valid_date, validate_and_convert
 
-
 LOGGER = logging.getLogger(__name__)
 
 EXTERNAL_PARSER = DateDataParser(
@@ -120,9 +118,7 @@
     rf"""(?P<month>{REGEX_MONTHS})\s
 (?P<day>{DAY_RE})(?:st|nd|rd|th)?,? (?P<year>{YEAR_RE})|
 (?P<day2>{DAY_RE})(?:st|nd|rd|th|\.)? (?:of )?
-(?P<month2>{REGEX_MONTHS})[,.]? (?P<year2>{YEAR_RE})""".replace(
-        "\n", ""
-    ),
+(?P<month2>{REGEX_MONTHS})[,.]? (?P<year2>{YEAR_RE})""".replace("\n", ""),
     re.I,
 )
 
@@ -213,7 +209,7 @@
 SIMPLE_PATTERN = re.compile(rf"(?<!w3.org)\D({YEAR_RE})\D")
 
 
-def discard_unwanted(tree: HtmlElement) -> Tuple[HtmlElement, List[HtmlElement]]:
+def discard_unwanted(tree: HtmlElement) -> tuple[HtmlElement, list[HtmlElement]]:
     """Delete unwanted sections of an HTML document and return them as a list"""
     my_discarded = []
     for subtree in DISCARD_EXPRESSIONS(tree):
@@ -223,9 +219,9 @@ def discard_unwanted(tree: HtmlElement) -> Tuple[HtmlElement, List[HtmlElement]]
 
 
 def extract_url_date(
-    testurl: Optional[str],
+    testurl: str | None,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """Extract the date out of an URL string complying with the Y-M-D format"""
     if testurl is not None:
         match = COMPLETE_URL.search(testurl)
@@ -233,10 +229,9 @@ def extract_url_date(
             LOGGER.debug("found date in URL: %s", match[0])
             try:
                 dateobject = datetime(int(match[1]), int(match[2]), int(match[3]))
-                if is_valid_date(
+                return validate_and_convert(
                     dateobject, options.format, earliest=options.min, latest=options.max
-                ):
-                    return dateobject.strftime(options.format)
+                )
             except ValueError as err:  # pragma: no cover
                 LOGGER.debug("conversion error: %s %s", match[0], err)
     return None
@@ -249,12 +244,12 @@ def correct_year(year: int) -> int:
     return year
 
 
-def try_swap_values(day: int, month: int) -> Tuple[int, int]:
+def try_swap_values(day: int, month: int) -> tuple[int, int]:
     """Swap day and month values if it seems feasible."""
     return (month, day) if month > 12 and day <= 12 else (day, month)
 
 
-def regex_parse(string: str) -> Optional[datetime]:
+def regex_parse(string: str) -> datetime | None:
     """Try full-text parse for date elements using a series of regular expressions
     with particular emphasis on English, French, German and Turkish"""
     # https://github.com/vi3k6i5/flashtext ?
@@ -285,7 +280,7 @@ def regex_parse(string: str) -> Optional[datetime]:
 
 def custom_parse(
     string: str, outputformat: str, min_date: datetime, max_date: datetime
-) -> Optional[str]:
+) -> str | None:
     """Try to bypass the slow dateparser"""
     LOGGER.debug("custom parse test: %s", string)
 
@@ -303,7 +298,7 @@ def custom_parse(
         # b. much faster than extensive parsing
         else:
             try:
-                candidate = datetime.fromisoformat(string)  # type: ignore[attr-defined]
+                candidate = datetime.fromisoformat(string)
             except ValueError:
                 LOGGER.debug("not an ISO date string: %s", string)
                 try:
@@ -383,7 +378,7 @@ def custom_parse(
     )
 
 
-def external_date_parser(string: str, outputformat: str) -> Optional[str]:
+def external_date_parser(string: str, outputformat: str) -> str | None:
     """Use dateutil parser or dateparser module according to system settings"""
     LOGGER.debug("send to external parser: %s", string)
     try:
@@ -393,17 +388,17 @@ def external_date_parser(string: str, outputformat: str) -> Optional[str]:
         target = None
         LOGGER.error("external parser error: %s %s", string, err)
     # issue with data type
-    return datetime.strftime(target, outputformat) if target else None
+    return target.strftime(outputformat) if target else None
 
 
 @lru_cache(maxsize=CACHE_SIZE)
 def try_date_expr(
-    string: Optional[str],
+    string: str | None,
     outputformat: str,
     extensive_search: bool,
     min_date: datetime,
     max_date: datetime,
-) -> Optional[str]:
+) -> str | None:
     """Use a series of heuristics and rules to parse a potential date expression"""
     if not string:
         return None
@@ -440,7 +435,7 @@ def try_date_expr(
 def img_search(
     tree: HtmlElement,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """Skim through image elements"""
     element = tree.find('.//meta[@property="og:image"][@content]')
     if element is not None:
@@ -453,9 +448,9 @@ def img_search(
 
 def pattern_search(
     text: str,
-    date_pattern: Pattern[str],
+    date_pattern: re.Pattern[str],
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     "Look for date expressions using a regular expression on a string of text."
     match = date_pattern.search(text)
     if match and is_valid_date(
@@ -469,7 +464,7 @@ def pattern_search(
 def json_search(
     tree: HtmlElement,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """Look for JSON time patterns in JSON sections of the tree"""
     # determine pattern
     json_pattern = JSON_PUBLISHED if options.original else JSON_MODIFIED
@@ -479,14 +474,16 @@ def json_search(
     ):
         if not elem.text or '"date' not in elem.text:
             continue
-        return pattern_search(elem.text, json_pattern, options)
+        result = pattern_search(elem.text, json_pattern, options)
+        if result is not None:
+            return result
     return None
 
 
 def idiosyncrasies_search(
     htmlstring: str,
     options: Extractor,
-) -> Optional[str]:
+) -> str | None:
     """Look for author-written dates throughout the web page"""
     match = TEXT_PATTERNS.search(htmlstring)  # EN+DE+TR
     if match:
@@ -499,10 +496,9 @@ def idiosyncrasies_search(
                 day, month = try_swap_values(int(parts[0]), int(parts[1]))
                 year = correct_year(int(parts[2]))
                 candidate = datetime(year, month, day)
-            if is_valid_date(
-                candidate, "%Y-%m-%d", earliest=options.min, latest=options.max
-            ):
-                return candidate.strftime(options.format)  # type: ignore[union-attr]
+            return validate_and_convert(
+                candidate, options.format, earliest=options.min, latest=options.max
+            )
         except (IndexError, ValueError):
             LOGGER.debug("cannot process idiosyncrasies: %s", match[0])
 
diff --git a/htmldate/meta.py b/htmldate/meta.py
index c5faf6df..87a34a99 100644
--- a/htmldate/meta.py
+++ b/htmldate/meta.py
@@ -8,7 +8,6 @@
 from .extractors import try_date_expr
 from .validators import filter_ymd_candidate, is_valid_date, is_valid_format
 
-
 LOGGER = logging.getLogger(__name__)
 
 
diff --git a/htmldate/utils.py b/htmldate/utils.py
index 890d7fea..bd95f906 100644
--- a/htmldate/utils.py
+++ b/htmldate/utils.py
@@ -6,15 +6,14 @@
 import logging
 import re
 
+from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, List, Optional, Set, Union
 
 import urllib3
 
-
 # CChardet is faster and can be more accurate
 try:
-    from cchardet import detect as cchardet_detect  # type: ignore
+    from cchardet import detect as cchardet_detect  # type: ignore[import-untyped]
 except ImportError:
     cchardet_detect = None
 from charset_normalizer import from_bytes
@@ -23,10 +22,9 @@
 
 from .settings import MAX_FILE_SIZE
 
-
 LOGGER = logging.getLogger(__name__)
 
-UNICODE_ALIASES: Set[str] = {"utf-8", "utf_8"}
+UNICODE_ALIASES: set[str] = {"utf-8", "utf_8"}
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 RETRY_STRATEGY = urllib3.util.Retry(
@@ -44,32 +42,21 @@
 FAULTY_HTML = re.compile(r"(<html.*?)\s*/>", re.I)
 
 
+# eq=False keeps identity-based hashing so instances stay usable as lru_cache keys
+@dataclass(slots=True, eq=False)
 class Extractor:
     "Defines a class to store all extraction options."
 
-    __slots__ = ["extensive", "format", "max", "min", "original"]
-
-    # consider dataclasses for Python 3.7+
-    def __init__(
-        self,
-        extensive_search: bool,
-        max_date: datetime,
-        min_date: datetime,
-        original_date: bool,
-        outputformat: str,
-    ) -> None:
-        self.extensive: bool = extensive_search
-        self.format: str = outputformat
-        self.max: datetime = max_date
-        self.min: datetime = min_date
-        self.original: bool = original_date
-
-
-def is_wrong_document(data: Any) -> bool:
+    extensive: bool
+    max: datetime
+    min: datetime
+    original: bool
+    format: str
+
+
+def is_wrong_document(data: str | bytes | HtmlElement | None) -> bool:
     "Check if the input object is suitable to be processed."
-    if not data or len(data) > MAX_FILE_SIZE:
-        return True
-    return False
+    return not data or len(data) > MAX_FILE_SIZE
 
 
 def isutf8(data: bytes) -> bool:
@@ -81,7 +68,7 @@ def isutf8(data: bytes) -> bool:
     return True
 
 
-def detect_encoding(bytesobject: bytes) -> List[str]:
+def detect_encoding(bytesobject: bytes) -> list[str]:
     """Read all input or first chunk and return a list of encodings"""
     # alternatives: https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py
     # unicode-test
@@ -103,7 +90,7 @@ def detect_encoding(bytesobject: bytes) -> List[str]:
     return [g for g in guesses if g not in UNICODE_ALIASES]
 
 
-def decode_file(filecontent: Union[bytes, str]) -> str:
+def decode_file(filecontent: bytes | str) -> str:
     """Guess bytestring encoding and try to decode to Unicode string.
     Resort to destructive conversion otherwise."""
     # init
@@ -116,25 +103,24 @@ def decode_file(filecontent: Union[bytes, str]) -> str:
             htmltext = filecontent.decode(guessed_encoding)
         except (LookupError, UnicodeDecodeError):  # VISCII: lookup
             LOGGER.warning("wrong encoding detected: %s", guessed_encoding)
-            htmltext = None
         else:
             break
     # return original content if nothing else succeeded
     return htmltext or str(filecontent, encoding="utf-8", errors="replace")
 
 
-def decode_response(response: Any) -> str:
+def decode_response(response: urllib3.response.HTTPResponse | bytes) -> str:
     """Read the urllib3 object corresponding to the server response, then
     try to guess its encoding and decode it to return a unicode string"""
     # urllib3 response object / bytes switch
-    if isinstance(response, urllib3.response.HTTPResponse) or hasattr(response, "data"):
+    if isinstance(response, urllib3.response.HTTPResponse):
         resp_content = response.data
     else:
         resp_content = response
     return decode_file(resp_content)
 
 
-def fetch_url(url: str) -> Optional[str]:
+def fetch_url(url: str) -> str | None:
     """Fetches page using urllib3 and decodes the response.
 
     Args:
@@ -149,7 +135,7 @@ def fetch_url(url: str) -> Optional[str]:
     try:
         # read by streaming chunks (stream=True, iter_content=xx)
         # so we can stop downloading as soon as MAX_FILE_SIZE is reached
-        response = HTTP_POOL.request("GET", url, timeout=30)  # type: ignore
+        response = HTTP_POOL.request("GET", url, timeout=30)
     except Exception as err:
         LOGGER.error("download error: %s %s", url, err)  # sys.exc_info()[0]
     else:
@@ -175,7 +161,7 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str:
         firstline, _, rest = htmlstring.partition("\n")
         htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
     # other issue with malformed documents: check first three lines
-    for i, line in enumerate(iter(htmlstring.splitlines())):
+    for i, line in enumerate(htmlstring.splitlines()):
         if "<html" in line and line.endswith("/>"):
             htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1)
             break
@@ -184,17 +170,16 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str:
     return htmlstring
 
 
-def fromstring_bytes(htmlobject: str) -> Optional[HtmlElement]:
+def fromstring_bytes(htmlobject: str) -> HtmlElement | None:
     "Try to pass bytes to LXML parser."
-    tree = None
     try:
-        tree = fromstring(htmlobject.encode("utf8"), parser=HTML_PARSER)
+        return fromstring(htmlobject.encode("utf8"), parser=HTML_PARSER)
     except Exception as err:
         LOGGER.error("lxml parser bytestring %s", err)
-    return tree
+    return None
 
 
-def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElement]:
+def load_html(htmlobject: bytes | str | HtmlElement) -> HtmlElement | None:
     """Load object given as input and validate its type
     (accepted: lxml.html tree, bytestring and string)
     """
@@ -203,7 +188,7 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen
         return htmlobject
     # do not accept any other type after this point
     if not isinstance(htmlobject, (bytes, str)):
-        raise TypeError("incompatible input type: %s", type(htmlobject))
+        raise TypeError(f"incompatible input type: {type(htmlobject)}")
     # the string is a URL, download it
     if (
         isinstance(htmlobject, str)
@@ -211,10 +196,11 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen
         and " " not in htmlobject
     ):
         LOGGER.debug("URL detected, downloading: %s", htmlobject)
-        htmlobject = fetch_url(htmlobject)  # type: ignore[assignment]
+        downloaded = fetch_url(htmlobject)
         # log the error and quit
-        if htmlobject is None:
-            raise ValueError("URL couldn't be processed: %s", htmlobject)
+        if downloaded is None:
+            raise ValueError(f"URL couldn't be processed: {htmlobject}")
+        htmlobject = downloaded
     # start processing
     tree = None
     # try to guess encoding and decode file: if None then keep original
@@ -246,15 +232,20 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen
     return tree
 
 
-def clean_html(tree: HtmlElement, elemlist: List[str]) -> HtmlElement:
+def clean_html(tree: HtmlElement, elemlist: list[str]) -> HtmlElement:
     "Delete selected elements."
-    for element in tree.iter(elemlist):  # type: ignore[call-overload]
-        parent = element.getparent()
-        if parent is not None:
-            parent.remove(element)
+    for element in tree.iter(elemlist):
+        # drop_tree() keeps the element's tail text (a date may sit right after a
+        # cleaned media element); fall back to remove() if it is unavailable
+        try:
+            element.drop_tree()
+        except AttributeError:  # pragma: no cover
+            parent = element.getparent()
+            if parent is not None:
+                parent.remove(element)
     return tree
 
 
 def trim_text(string: str) -> str:
     "Remove superfluous space and normalize remaining space."
-    return " ".join(string.split()).strip()
+    return " ".join(string.split())
diff --git a/htmldate/validators.py b/htmldate/validators.py
index c29fbad0..d8c5462d 100644
--- a/htmldate/validators.py
+++ b/htmldate/validators.py
@@ -4,24 +4,22 @@
 """
 
 import logging
+import re
 
 from collections import Counter
 from datetime import datetime
 from functools import lru_cache
-from time import mktime
-from typing import Match, Optional, Pattern, Union, Counter as Counter_Type
 
 from .settings import CACHE_SIZE, MIN_DATE
 from .utils import Extractor
 
-
 LOGGER = logging.getLogger(__name__)
 LOGGER.debug("minimum date setting: %s", MIN_DATE)
 
 
 @lru_cache(maxsize=CACHE_SIZE)
 def is_valid_date(
-    date_input: Optional[Union[datetime, str]],
+    date_input: datetime | str | None,
     outputformat: str,
     earliest: datetime,
     latest: datetime,
@@ -58,16 +56,18 @@ def is_valid_date(
 
 
 def validate_and_convert(
-    date_input: Optional[Union[datetime, str]],
+    date_input: datetime | None,
     outputformat: str,
     earliest: datetime,
     latest: datetime,
-) -> Optional[str]:
+) -> str | None:
     "Robust validation and conversion for plausible dates."
-    if is_valid_date(date_input, outputformat, earliest, latest):
+    if date_input is not None and is_valid_date(
+        date_input, outputformat, earliest, latest
+    ):
         try:
             LOGGER.debug("custom parse result: %s", date_input)
-            return date_input.strftime(outputformat)  # type: ignore
+            return date_input.strftime(outputformat)
         except ValueError as err:  # pragma: no cover
             LOGGER.error("value error during conversion: %s %s", date_input, err)
     return None
@@ -83,8 +83,8 @@ def is_valid_format(outputformat: str) -> bool:
     except (TypeError, ValueError) as err:
         LOGGER.error("wrong output format or type: %s %s", outputformat, err)
         return False
-    # test in abstracto (could be the only test)
-    if not isinstance(outputformat, str) or "%" not in outputformat:
+    # a format without any directive cannot produce a date
+    if "%" not in outputformat:
         LOGGER.error("malformed output format: %s", outputformat)
         return False
     return True
@@ -93,14 +93,15 @@ def is_valid_format(outputformat: str) -> bool:
 def plausible_year_filter(
     htmlstring: str,
     *,
-    pattern: Pattern[str],
-    yearpat: Pattern[str],
+    pattern: re.Pattern[str],
+    yearpat: re.Pattern[str],
     earliest: datetime,
     latest: datetime,
     incomplete: bool = False,
-) -> Counter_Type[str]:
+) -> Counter[str]:
     """Filter the date patterns to find plausible years only"""
     occurrences = Counter(pattern.findall(htmlstring))  # slow!
+    min_year, max_year = earliest.year, latest.year
 
     for item in list(occurrences):  # prevent RuntimeError
         year_match = yearpat.search(item)
@@ -116,7 +117,7 @@ def plausible_year_filter(
             century = "19" if lastdigits[0] == "9" else "20"
             potential_year = int(century + lastdigits)
 
-        if not earliest.year <= potential_year <= latest.year:
+        if not min_year <= potential_year <= max_year:
             LOGGER.debug("no potential year: %s", item)
             del occurrences[item]
 
@@ -126,7 +127,7 @@ def plausible_year_filter(
 def compare_values(reference: int, attempt: str, options: Extractor) -> int:
     """Compare the date expression to a reference"""
     try:
-        timestamp = int(mktime(datetime.strptime(attempt, options.format).timetuple()))
+        timestamp = int(datetime.strptime(attempt, options.format).timestamp())
     except Exception as err:
         LOGGER.debug("datetime.strptime exception: %s for string %s", err, attempt)
         return reference
@@ -139,14 +140,13 @@ def compare_values(reference: int, attempt: str, options: Extractor) -> int:
 
 @lru_cache(maxsize=CACHE_SIZE)
 def filter_ymd_candidate(
-    bestmatch: Match[str],
-    pattern: Pattern[str],
-    original_date: bool,
+    bestmatch: re.Match[str],
+    pattern: re.Pattern[str],
     copyear: int,
     outputformat: str,
     min_date: datetime,
     max_date: datetime,
-) -> Optional[str]:
+) -> str | None:
     """Filter free text candidates in the YMD format"""
     if bestmatch is not None:
         pagedate = "-".join([bestmatch[1], bestmatch[2], bestmatch[3]])
@@ -155,15 +155,6 @@ def filter_ymd_candidate(
         ):
             LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate)
             return convert_date(pagedate, "%Y-%m-%d", outputformat)
-            ## TODO: test and improve
-            # if original_date is True:
-            #    if copyear == 0 or int(bestmatch[1]) <= copyear:
-            #        LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate)
-            #        return convert_date(pagedate, '%Y-%m-%d', outputformat)
-            # else:
-            #    if copyear == 0 or int(bestmatch[1]) >= copyear:
-            #        LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate)
-            #        return convert_date(pagedate, '%Y-%m-%d', outputformat)
     return None
 
 
@@ -180,7 +171,7 @@ def convert_date(datestring: str, inputformat: str, outputformat: str) -> str:
     return dateobject.strftime(outputformat)
 
 
-def check_extracted_reference(reference: int, options: Extractor) -> Optional[str]:
+def check_extracted_reference(reference: int, options: Extractor) -> str | None:
     """Test if the extracted reference date can be returned"""
     if reference > 0:
         dateobject = datetime.fromtimestamp(reference)
@@ -192,25 +183,29 @@ def check_extracted_reference(reference: int, options: Extractor) -> Optional[st
     return None
 
 
-def check_date_input(
-    date_object: Optional[Union[datetime, str]], default: datetime
-) -> datetime:
+def check_date_input(date_object: datetime | str | None, default: datetime) -> datetime:
     "Check if the input is a usable datetime or ISO date string, return default otherwise"
     if isinstance(date_object, datetime):
         return date_object
     if isinstance(date_object, str):
         try:
-            return datetime.fromisoformat(date_object)  # type: ignore[attr-defined]
+            return datetime.fromisoformat(date_object)
         except ValueError:
             LOGGER.warning("invalid datetime string: %s", date_object)
     return default  # no input or error thrown
 
 
-def get_min_date(min_date: Optional[Union[datetime, str]]) -> datetime:
+def get_min_date(min_date: datetime | str | None) -> datetime:
     """Validates the minimum date and/or defaults to earliest plausible date"""
     return check_date_input(min_date, MIN_DATE)
 
 
-def get_max_date(max_date: Optional[Union[datetime, str]]) -> datetime:
-    """Validates the maximum date and/or defaults to latest plausible date"""
-    return check_date_input(max_date, datetime.now())
+def get_max_date(max_date: datetime | str | None) -> datetime:
+    """Validates the maximum date and/or defaults to the end of the current day.
+    A day-granular default stays stable across calls (unlike datetime.now()),
+    which lets the date-validation caches be reused from one document to the
+    next in batch processing, and accepts any timestamp from the current day."""
+    end_of_today = datetime.now().replace(
+        hour=23, minute=59, second=59, microsecond=999999
+    )
+    return check_date_input(max_date, end_of_today)
diff --git a/pyproject.toml b/pyproject.toml
index a6504ec6..428bc284 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ description = "Fast and robust extraction of original and updated publication da
 readme = "README.md"
 license = { text = "Apache 2.0" }
 dynamic = ["version"]
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 authors = [
   {name = "Adrien Barbaresi", email = "adrien.barbaresi@gmail.com"}
 ]
@@ -37,8 +37,6 @@ classifiers = [
     "Operating System :: POSIX :: Linux",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -52,9 +50,7 @@ classifiers = [
 dependencies = [
     "charset_normalizer >= 3.4.0",
     "dateparser >= 1.1.2",  # 1.1.3+ slower
-    # see tests on Github Actions
-    "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
-    "lxml >= 5.3.0 ; platform_system != 'Darwin' or python_version > '3.8'",
+    "lxml >= 5.3.0",
     "python-dateutil >= 2.9.0.post0",
     "urllib3 >= 1.26, < 3",
 ]
@@ -99,5 +95,11 @@ all = [
     "htmldate[speed]",
 ]
 
+[tool.black]
+target-version = ["py310"]
+
+[tool.mypy]
+warn_unused_ignores = true
+
 [tool.pytest.ini_options]
 testpaths = "tests/*test*.py"
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 8f91d615..00000000
--- a/setup.py
+++ /dev/null
@@ -1,37 +0,0 @@
-"""
-Seamlessly extract the date of web pages based on URL, header or body.
-http://github.com/adbar/htmldate
-"""
-
-import sys
-
-from setuptools import setup
-
-
-# add argument to compile with mypyc
-if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc":
-    sys.argv.pop(1)
-    USE_MYPYC = True
-    from mypyc.build import mypycify
-
-    ext_modules = mypycify(
-        [
-            "htmldate/__init__.py",
-            "htmldate/core.py",
-            "htmldate/extractors.py",
-            "htmldate/meta.py",
-            "htmldate/settings.py",
-            "htmldate/utils.py",
-            "htmldate/validators.py",
-        ],
-        opt_level="3",
-        multi_file=True,
-    )
-else:
-    ext_modules = []
-
-
-setup(
-    # mypyc or not
-    ext_modules=ext_modules,
-)
diff --git a/tests/evaluation.py b/tests/evaluation.py
index 2f1f844d..394125dd 100644
--- a/tests/evaluation.py
+++ b/tests/evaluation.py
@@ -11,16 +11,23 @@
 except ImportError:
     from charset_normalizer import detect
 
-from articleDateExtractor import extractArticlePublishedDate
-from date_guesser import guess_date
-from goose3 import Goose
-from newspaper import Article
-from newspaper.article import ArticleDownloadState
-from newsplease import NewsPlease
-
 from htmldate import find_date
 from htmldate.validators import convert_date
 
+# Optional third-party libraries, only needed for the full benchmark
+# (i.e. comparison.py *without* --small). Guard the imports so the
+# htmldate-only run works without these heavy/legacy packages installed.
+try:
+    from articleDateExtractor import extractArticlePublishedDate
+    from date_guesser import guess_date
+    from goose3 import Goose
+    from newspaper import Article
+    from newspaper.article import ArticleDownloadState
+    from newsplease import NewsPlease
+except ImportError:
+    extractArticlePublishedDate = guess_date = Goose = None
+    Article = ArticleDownloadState = NewsPlease = None
+
 
 TEST_DIR = os.path.abspath(os.path.dirname(__file__))
 # list the jsons containing the pages here
@@ -32,7 +39,7 @@
     with open(evalpath, "r", encoding="utf-8") as f:
         EVAL_PAGES.update(json.load(f))
 
-G = Goose()
+G = Goose() if Goose is not None else None
 
 
 def load_document(filename):
diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py
index c646114f..ecc56c4a 100644
--- a/tests/realworld_tests.py
+++ b/tests/realworld_tests.py
@@ -768,9 +768,8 @@ def test_cli():
     "Test the command-line interface"
     # third test: Linux and MacOS only
     if os.name != "nt":
-        testargs = [""]
-        with patch.object(sys, "argv", testargs):
-            args = parse_args(testargs)
+        testargs = []
+        args = parse_args(testargs)
         sys.stdin = open(
             os.path.join(TEST_DIR, "cache", "befifty.montauk.html"),
             "r",
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 533bb32c..00ae20de 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1515,7 +1515,6 @@ def test_idiosyncrasies():
 def test_parser():
     """test argument parsing for the command-line interface"""
     testargs = [
-        "-f",
         "-v",
         "--original",
         "-max",
@@ -1523,26 +1522,23 @@ def test_parser():
         "-u",
         "https://www.example.org",
     ]
-    with patch.object(sys, "argv", testargs):
-        args = parse_args(testargs)
+    args = parse_args(testargs)
     assert args.fast is True
     assert args.original is True
     assert args.verbose is True
     assert args.maxdate == "2015-12-31"
     assert args.URL == "https://www.example.org"
-    testargs = ["-f", "-min", "2015-12-31"]
-    with patch.object(sys, "argv", testargs):
-        args = parse_args(testargs)
+    testargs = ["-min", "2015-12-31"]
+    args = parse_args(testargs)
     assert args.fast is True
     assert args.original is False
     assert args.verbose is False
     assert args.mindate == "2015-12-31"
     # version
     f = io.StringIO()
-    testargs = ["", "--version"]
+    testargs = ["--version"]
     with pytest.raises(SystemExit) as e, redirect_stdout(f):
-        with patch.object(sys, "argv", testargs):
-            args = parse_args(testargs)
+        args = parse_args(testargs)
     assert e.type == SystemExit and e.value.code == 0
     assert re.match(
         r"Htmldate [0-9]\.[0-9]+\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue()
@@ -1552,8 +1548,7 @@ def test_parser():
 def test_cli():
     "Test the command-line interface"
     testargs = ["--original"]
-    with patch.object(sys, "argv", testargs):
-        args = parse_args(testargs)
+    args = parse_args(testargs)
 
     assert cli_examine(None, args) is None
     assert cli_examine(" ", args) is None
@@ -1608,16 +1603,14 @@ def test_cli():
     )
 
     # first test
-    testargs = ["", "-u", "123", "-v"]
-    with patch.object(sys, "argv", testargs):
-        args = parse_args(testargs)
+    testargs = ["-u", "123", "-v"]
+    args = parse_args(testargs)
     with pytest.raises(SystemExit) as err:
         process_args(args)
     assert err.type == SystemExit
     # meaningful test
-    testargs = ["", "-u", "https://httpbun.com/html"]
-    with patch.object(sys, "argv", testargs):
-        args = parse_args(testargs)
+    testargs = ["-u", "https://httpbun.com/html"]
+    args = parse_args(testargs)
     f = io.StringIO()
     with redirect_stdout(f):
         process_args(args)
@@ -1638,8 +1631,7 @@ def test_download():
             main()
 
     testargs = ["--original"]
-    with patch.object(sys, "argv", testargs):
-        args = parse_args(testargs)
+    args = parse_args(testargs)
 
     url = "https://httpbin.org/status/200"
     teststring = fetch_url(url)

From a066bd4702ada47cea094a46241916d3331968ef Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Sat, 30 May 2026 15:07:02 +0200
Subject: [PATCH 2/2] fix issues and use ruff

---
 .github/workflows/tests.yml | 10 +++-------
 CONTRIBUTING.md             |  2 +-
 docs/index.rst              |  6 +++---
 htmldate/utils.py           |  2 +-
 pyproject.toml              | 15 +++++++++++----
 tests/realworld_tests.py    |  1 -
 tests/unit_tests.py         |  6 +++---
 7 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 24cd7629..bb7750ca 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -58,17 +58,13 @@ jobs:
       run: python -m pip install --upgrade -e ".[dev]"
 
     # tests
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Lint with ruff
+      run: ruff check .
 
     - name: Code format and type checking
       if: ${{ matrix.python-version == env.REF_PY_VERSION }}
       run: |
-        black --check --diff htmldate
+        ruff format --check htmldate
         mypy -p htmldate
 
     - name: Install full dependencies
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8fd3507b..4eb1e9e6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -25,7 +25,7 @@ and if there are no errors.
 2. Run the tests and code quality tools:
    - Tests with `pytest`
    - Type checking with `mypy` on the directory: `mypy htmldate/`
-   - Code formatting with `black` on the directory as well
+   - Linting with `ruff check .` and formatting with `ruff format htmldate/`
 
 
 For further questions you can use [GitHub issues](https://github.com/adbar/htmldate/issues) or [E-Mail](https://adrien.barbaresi.eu/).
diff --git a/docs/index.rst b/docs/index.rst
index 5aa83b02..16b85ead 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -21,9 +21,9 @@ Htmldate: Find the Publication Date of Web Pages
    :target: https://doi.org/10.21105/joss.02439
    :alt: JOSS article reference DOI: 10.21105/joss.02439
 
-.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
-   :target: https://github.com/psf/black
-   :alt: Code style: black
+.. image:: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json
+   :target: https://github.com/astral-sh/ruff
+   :alt: Ruff
 
 |
 
diff --git a/htmldate/utils.py b/htmldate/utils.py
index bd95f906..90382d53 100644
--- a/htmldate/utils.py
+++ b/htmldate/utils.py
@@ -13,7 +13,7 @@
 
 # CChardet is faster and can be more accurate
 try:
-    from cchardet import detect as cchardet_detect  # type: ignore[import-untyped]
+    from cchardet import detect as cchardet_detect
 except ImportError:
     cchardet_detect = None
 from charset_normalizer import from_bytes
diff --git a/pyproject.toml b/pyproject.toml
index 428bc284..86bc72e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,11 +75,10 @@ htmldate = "htmldate.cli:main"
 # Development extras
 [project.optional-dependencies]
 dev = [
-    "black",
-    "flake8",
     "mypy",
     "pytest",
     "pytest-cov",
+    "ruff",
     "types-dateparser",
     "types-python-dateutil",
     "types-lxml",
@@ -95,11 +94,19 @@ all = [
     "htmldate[speed]",
 ]
 
-[tool.black]
-target-version = ["py310"]
+[tool.ruff]
+target-version = "py310"
 
 [tool.mypy]
 warn_unused_ignores = true
 
+[[tool.mypy.overrides]]
+# faust-cchardet (optional "speed" extra) ships no type stubs and is absent
+# during the type-checking step, so silence its import here. This behaves the
+# same whether the extra is installed (import-untyped) or not (import-not-found),
+# unlike an inline ignore tied to a single error code.
+module = ["cchardet"]
+ignore_missing_imports = true
+
 [tool.pytest.ini_options]
 testpaths = "tests/*test*.py"
diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py
index ecc56c4a..0e0ee129 100644
--- a/tests/realworld_tests.py
+++ b/tests/realworld_tests.py
@@ -10,7 +10,6 @@
 import sys
 
 from contextlib import redirect_stdout
-from unittest.mock import patch
 
 import pytest
 
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 00ae20de..92b37d32 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1538,8 +1538,8 @@ def test_parser():
     f = io.StringIO()
     testargs = ["--version"]
     with pytest.raises(SystemExit) as e, redirect_stdout(f):
-        args = parse_args(testargs)
-    assert e.type == SystemExit and e.value.code == 0
+        parse_args(testargs)
+    assert e.type is SystemExit and e.value.code == 0
     assert re.match(
         r"Htmldate [0-9]\.[0-9]+\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue()
     )
@@ -1607,7 +1607,7 @@ def test_cli():
     args = parse_args(testargs)
     with pytest.raises(SystemExit) as err:
         process_args(args)
-    assert err.type == SystemExit
+    assert err.type is SystemExit
     # meaningful test
     testargs = ["-u", "https://httpbun.com/html"]
     args = parse_args(testargs)