From db9f645439b56d71d5c3875d30bbdd0412d5b06d Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Sat, 30 May 2026 14:45:48 +0200 Subject: [PATCH 1/2] maintenance: update setup, harden code, add tests --- .pre-commit-config.yaml | 27 ------ CONTRIBUTING.md | 1 - htmldate/cli.py | 17 ++-- htmldate/core.py | 191 ++++++++++++++++++--------------------- htmldate/extractors.py | 56 ++++++------ htmldate/meta.py | 1 - htmldate/utils.py | 91 +++++++++---------- htmldate/validators.py | 69 +++++++------- pyproject.toml | 14 +-- setup.py | 37 -------- tests/evaluation.py | 23 +++-- tests/realworld_tests.py | 5 +- tests/unit_tests.py | 30 +++--- 13 files changed, 229 insertions(+), 333 deletions(-) delete mode 100644 .pre-commit-config.yaml delete mode 100644 setup.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 011dd200..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,27 +0,0 @@ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - - id: check-yaml - - id: end-of-file-fixer - - id: trailing-whitespace - - - repo: https://github.com/psf/black - rev: 24.3.0 - hooks: - - id: black - - #- repo: https://github.com/PyCQA/flake8 - # rev: 7.0.0 - # hooks: - # - id: flake8 - - #- repo: https://github.com/pycqa/isort - # rev: 5.13.2 - # hooks: - # - id: isort - - #- repo: https://github.com/pre-commit/mirrors-mypy - # rev: v1.9.0 - # hooks: - # - id: mypy diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f2287ed..8fd3507b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,7 +26,6 @@ and if there are no errors. - Tests with `pytest` - Type checking with `mypy` on the directory: `mypy htmldate/` - Code formatting with `black` on the directory as well - - Optional: install `pre-commit` to use the corresponding commit hooks For further questions you can use [GitHub issues](https://github.com/adbar/htmldate/issues) or [E-Mail](https://adrien.barbaresi.eu/). diff --git a/htmldate/cli.py b/htmldate/cli.py index 4b59d6cf..abba2e13 100644 --- a/htmldate/cli.py +++ b/htmldate/cli.py @@ -7,7 +7,6 @@ import sys from platform import python_version -from typing import Any, Optional, Union from lxml.html import HtmlElement @@ -17,12 +16,12 @@ def cli_examine( - htmlstring: Union[str, HtmlElement], - args: Any, -) -> Optional[str]: + htmlstring: str | HtmlElement | None, + args: argparse.Namespace, +) -> str | None: """Generic safeguards and triggers""" # safety check - if is_wrong_document(htmlstring): + if htmlstring is None or is_wrong_document(htmlstring): sys.stderr.write("# ERROR: document is empty or too large\n") return None return find_date( @@ -35,7 +34,7 @@ def cli_examine( ) -def parse_args(args: Any) -> Any: +def parse_args(args: list[str]) -> argparse.Namespace: """Define parser for command-line arguments""" argsparser = argparse.ArgumentParser() argsparser.add_argument( @@ -67,10 +66,10 @@ def parse_args(args: Any) -> Any: action="version", version=f"Htmldate {__version__} - Python {python_version()}", ) - return argsparser.parse_args() + return argsparser.parse_args(args) -def process_args(args: Any) -> None: +def process_args(args: argparse.Namespace) -> None: """Process the arguments passed on the command-line.""" # verbosity if args.verbose: @@ -98,7 +97,7 @@ def process_args(args: Any) -> None: with open(args.inputfile, mode="r", encoding="utf-8") as inputfile: for line in inputfile: htmltext = fetch_url(line.strip()) - result = cli_examine(htmltext, args) # type: ignore[arg-type] + result = cli_examine(htmltext, args) sys.stdout.write(f"{line.strip()}\t{result or 'None'}\n") diff --git a/htmldate/core.py b/htmldate/core.py index 98ba9188..f40489d6 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -7,10 +7,10 @@ import re from collections import Counter +from collections.abc import Callable from copy import deepcopy from datetime import datetime from functools import lru_cache, partial -from typing import Match, Optional, Pattern, Union, Counter as Counter_Type from lxml.html import HtmlElement, tostring @@ -68,7 +68,6 @@ validate_and_convert, ) - LOGGER = logging.getLogger(__name__) @@ -199,7 +198,7 @@ def logstring(element: HtmlElement) -> str: def examine_text( text: str, options: Extractor, -) -> Optional[str]: +) -> str | None: "Prepare text and try to extract a date." text = trim_text(text) @@ -216,7 +215,7 @@ def examine_date_elements( tree: HtmlElement, expression: str, options: Extractor, -) -> Optional[str]: +) -> str | None: """Check HTML elements one by one for date expressions""" elements = tree.xpath(expression) if not elements or len(elements) > MAX_POSSIBLE_CANDIDATES: @@ -235,7 +234,7 @@ def examine_date_elements( def examine_header( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """ Parse header elements to find date cues @@ -353,11 +352,11 @@ def examine_header( def select_candidate( - occurrences: Counter_Type[str], - catch: Pattern[str], - yearpat: Pattern[str], + occurrences: Counter[str], + catch: re.Pattern[str], + yearpat: re.Pattern[str], options: Extractor, -) -> Optional[Match[str]]: +) -> re.Match[str] | None: """Select a candidate among the most frequent matches""" if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES: return None @@ -381,12 +380,8 @@ def select_candidate( if year_match: years.append(year_match[1]) - validation = [ - is_valid_date( - datetime(int(year), 1, 1), "%Y", earliest=options.min, latest=options.max - ) - for year in years - ] + min_year, max_year = options.min.year, options.max.year + validation = [min_year <= int(year) <= max_year for year in years] # safety net: plausibility if all(validation): @@ -409,11 +404,11 @@ def select_candidate( def search_pattern( htmlstring: str, - pattern: Pattern[str], - catch: Pattern[str], - yearpat: Pattern[str], + pattern: re.Pattern[str], + catch: re.Pattern[str], + yearpat: re.Pattern[str], options: Extractor, -) -> Optional[Match[str]]: +) -> re.Match[str] | None: """Chained candidate filtering and selection""" candidates = plausible_year_filter( htmlstring, @@ -443,7 +438,7 @@ def compare_reference( def examine_abbr_elements( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Scan the page for abbr elements and check if their content contains an eligible date""" elements = tree.findall(".//abbr") if 0 < len(elements) < MAX_POSSIBLE_CANDIDATES: @@ -500,7 +495,7 @@ def examine_abbr_elements( def examine_time_elements( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Scan the page for time elements and check if their content contains an eligible date""" elements = tree.findall(".//time") if 0 < len(elements) < MAX_POSSIBLE_CANDIDATES: @@ -521,16 +516,17 @@ def examine_time_elements( LOGGER.debug("shortcut for time pubdate found: %s", datetime_attr) # shortcuts: class attribute elif "class" in elem.attrib: + class_attr = elem.get("class", "") if options.original and ( - elem.get("class", "").startswith("entry-date") - or elem.get("class", "").startswith("entry-time") + class_attr.startswith("entry-date") + or class_attr.startswith("entry-time") ): shortcut_flag = True LOGGER.debug( "shortcut for time/datetime found: %s", datetime_attr ) # updated time - elif not options.original and elem.get("class") == "updated": + elif not options.original and class_attr == "updated": shortcut_flag = True LOGGER.debug( "shortcut for updated time/datetime found: %s", @@ -562,7 +558,7 @@ def examine_time_elements( return None -def normalize_match(match: Optional[Match[str]]) -> str: +def normalize_match(match: re.Match[str] | None) -> str: """Normalize string output by adding "0" if necessary, and optionally expand the year from two to four digits.""" day, month, year = (g.zfill(2) for g in match.groups() if g) # type: ignore[union-attr] @@ -571,7 +567,44 @@ def normalize_match(match: Optional[Match[str]]) -> str: return f"{year}-{month}-{day}" -def search_page(htmlstring: str, options: Extractor) -> Optional[str]: +def normalize_two_comp(item: str) -> str: + """Normalize a MM-YYYY style match into a YYYY-MM-01 string.""" + match = TWO_COMP_REGEX.match(item) + month = match[1].zfill(2) # type: ignore[index] + return "-".join([match[2], month, "01"]) # type: ignore[index] + + +def search_normalized( + htmlstring: str, + pattern: re.Pattern[str], + yearpat: re.Pattern[str], + normalizer: Callable[[str], str], + copyear: int, + options: Extractor, + *, + incomplete: bool = False, +) -> str | None: + """Filter plausible years, normalize each candidate to the YMD format, then + select the best match and validate it (shared candidate-selection pipeline).""" + candidates = plausible_year_filter( + htmlstring, + pattern=pattern, + yearpat=yearpat, + earliest=options.min, + latest=options.max, + incomplete=incomplete, + ) + # revert DD-MM-YYYY patterns before sorting + normalized = Counter( + {normalizer(item): count for item, count in candidates.items()} + ) + bestmatch = select_candidate(normalized, YMD_PATTERN, YMD_YEAR, options) + return filter_ymd_candidate( + bestmatch, pattern, copyear, options.format, options.min, options.max + ) + + +def search_page(htmlstring: str, options: Extractor) -> str | None: """ Opportunistically search the HTML text for common text patterns @@ -619,7 +652,6 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: result = filter_ymd_candidate( bestmatch, patterns[0], - options.original, copyear, options.format, options.min, @@ -629,30 +661,13 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: return result # YYYY-MM-DD/DD-MM-YYYY - candidates = plausible_year_filter( + result = search_normalized( htmlstring, - pattern=SELECT_YMD_PATTERN, - yearpat=SELECT_YMD_YEAR, - earliest=options.min, - latest=options.max, - ) - # revert DD-MM-YYYY patterns before sorting - replacement = {} - for item in candidates: - match = THREE_COMP_REGEX_A.match(item) - candidate = normalize_match(match) - replacement[candidate] = candidates[item] - candidates = Counter(replacement) - # select - bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) - result = filter_ymd_candidate( - bestmatch, SELECT_YMD_PATTERN, - options.original, + SELECT_YMD_YEAR, + lambda item: normalize_match(THREE_COMP_REGEX_A.match(item)), copyear, - options.format, - options.min, - options.max, + options, ) if result is not None: return result @@ -668,7 +683,6 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: result = filter_ymd_candidate( bestmatch, DATESTRINGS_PATTERN, - options.original, copyear, options.format, options.min, @@ -678,30 +692,14 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: return result # DD?/MM?/YY - candidates = plausible_year_filter( + result = search_normalized( htmlstring, - pattern=SLASHES_PATTERN, - yearpat=SLASHES_YEAR, - earliest=options.min, - latest=options.max, - incomplete=True, - ) - # revert DD-MM-YYYY patterns before sorting - replacement = {} - for item in candidates: - match = THREE_COMP_REGEX_B.match(item) - candidate = normalize_match(match) - replacement[candidate] = candidates[item] - candidates = Counter(replacement) - bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) - result = filter_ymd_candidate( - bestmatch, SLASHES_PATTERN, - options.original, + SLASHES_YEAR, + lambda item: normalize_match(THREE_COMP_REGEX_B.match(item)), copyear, - options.format, - options.min, - options.max, + options, + incomplete=True, ) if result is not None: return result @@ -732,44 +730,24 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: return result # 2 components, second option - candidates = plausible_year_filter( + result = search_normalized( htmlstring, - pattern=MMYYYY_PATTERN, - yearpat=MMYYYY_YEAR, - earliest=options.min, - latest=options.max, - incomplete=options.original, - ) - # revert DD-MM-YYYY patterns before sorting - replacement = {} - for item in candidates: - match = TWO_COMP_REGEX.match(item) - month = match[1] # type: ignore[index] - if len(month) == 1: - month = f"0{month}" - candidate = "-".join([match[2], month, "01"]) # type: ignore[index] - replacement[candidate] = candidates[item] - candidates = Counter(replacement) - # select - bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) - result = filter_ymd_candidate( - bestmatch, MMYYYY_PATTERN, - options.original, + MMYYYY_YEAR, + normalize_two_comp, copyear, - options.format, - options.min, - options.max, + options, + incomplete=options.original, ) if result is not None: return result # try full-blown text regex on all HTML? - dateobject = regex_parse(htmlstring) # type: ignore[assignment] + text_date = regex_parse(htmlstring) # todo: find all candidates and disambiguate? - if copyear == 0 or (dateobject and dateobject.year >= copyear): + if copyear == 0 or (text_date and text_date.year >= copyear): result = validate_and_convert( - dateobject, options.format, earliest=options.min, latest=options.max + text_date, options.format, earliest=options.min, latest=options.max ) if result is not None: return result @@ -806,16 +784,16 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: def find_date( - htmlobject: Union[bytes, str, HtmlElement], + htmlobject: bytes | str | HtmlElement, extensive_search: bool = True, original_date: bool = False, outputformat: str = "%Y-%m-%d", - url: Optional[str] = None, + url: str | None = None, verbose: bool = False, - min_date: Optional[Union[datetime, str]] = None, - max_date: Optional[Union[datetime, str]] = None, + min_date: datetime | str | None = None, + max_date: datetime | str | None = None, deferred_url_extractor: bool = False, -) -> Optional[str]: +) -> str | None: """ Extract dates from HTML documents using markup analysis and text patterns @@ -878,7 +856,6 @@ def find_date( # find_date.extensive_search = extensive_search # URL - url_result = None if url is None: # probe for canonical links urlelem = tree.find('.//link[@rel="canonical"]') @@ -909,9 +886,13 @@ def find_date( return abbr_result # first, prune tree + # only copy the tree if the caller passed one in: when we parsed it ourselves + # (string/bytes/URL input) we own it and can clean it in place, avoiding a + # costly deepcopy of the whole document + pruning_tree = deepcopy(tree) if isinstance(htmlobject, HtmlElement) else tree try: search_tree, discarded = discard_unwanted( - clean_html(deepcopy(tree), CLEANING_LIST) + clean_html(pruning_tree, CLEANING_LIST) ) # rare LXML error: no NULL bytes or control characters except ValueError: # pragma: no cover diff --git a/htmldate/extractors.py b/htmldate/extractors.py index 9435c876..01ad9025 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -8,10 +8,9 @@ from datetime import datetime from functools import lru_cache -from typing import List, Optional, Pattern, Tuple # coverage for date parsing -from dateparser import DateDataParser # type: ignore # third-party, slow +from dateparser import DateDataParser # type: ignore[attr-defined] # third-party, slow from dateutil.parser import parse as dateutil_parse @@ -23,7 +22,6 @@ from .utils import Extractor, trim_text from .validators import convert_date, is_valid_date, validate_and_convert - LOGGER = logging.getLogger(__name__) EXTERNAL_PARSER = DateDataParser( @@ -120,9 +118,7 @@ rf"""(?P{REGEX_MONTHS})\s (?P{DAY_RE})(?:st|nd|rd|th)?,? (?P{YEAR_RE})| (?P{DAY_RE})(?:st|nd|rd|th|\.)? (?:of )? -(?P{REGEX_MONTHS})[,.]? (?P{YEAR_RE})""".replace( - "\n", "" - ), +(?P{REGEX_MONTHS})[,.]? (?P{YEAR_RE})""".replace("\n", ""), re.I, ) @@ -213,7 +209,7 @@ SIMPLE_PATTERN = re.compile(rf"(? Tuple[HtmlElement, List[HtmlElement]]: +def discard_unwanted(tree: HtmlElement) -> tuple[HtmlElement, list[HtmlElement]]: """Delete unwanted sections of an HTML document and return them as a list""" my_discarded = [] for subtree in DISCARD_EXPRESSIONS(tree): @@ -223,9 +219,9 @@ def discard_unwanted(tree: HtmlElement) -> Tuple[HtmlElement, List[HtmlElement]] def extract_url_date( - testurl: Optional[str], + testurl: str | None, options: Extractor, -) -> Optional[str]: +) -> str | None: """Extract the date out of an URL string complying with the Y-M-D format""" if testurl is not None: match = COMPLETE_URL.search(testurl) @@ -233,10 +229,9 @@ def extract_url_date( LOGGER.debug("found date in URL: %s", match[0]) try: dateobject = datetime(int(match[1]), int(match[2]), int(match[3])) - if is_valid_date( + return validate_and_convert( dateobject, options.format, earliest=options.min, latest=options.max - ): - return dateobject.strftime(options.format) + ) except ValueError as err: # pragma: no cover LOGGER.debug("conversion error: %s %s", match[0], err) return None @@ -249,12 +244,12 @@ def correct_year(year: int) -> int: return year -def try_swap_values(day: int, month: int) -> Tuple[int, int]: +def try_swap_values(day: int, month: int) -> tuple[int, int]: """Swap day and month values if it seems feasible.""" return (month, day) if month > 12 and day <= 12 else (day, month) -def regex_parse(string: str) -> Optional[datetime]: +def regex_parse(string: str) -> datetime | None: """Try full-text parse for date elements using a series of regular expressions with particular emphasis on English, French, German and Turkish""" # https://github.com/vi3k6i5/flashtext ? @@ -285,7 +280,7 @@ def regex_parse(string: str) -> Optional[datetime]: def custom_parse( string: str, outputformat: str, min_date: datetime, max_date: datetime -) -> Optional[str]: +) -> str | None: """Try to bypass the slow dateparser""" LOGGER.debug("custom parse test: %s", string) @@ -303,7 +298,7 @@ def custom_parse( # b. much faster than extensive parsing else: try: - candidate = datetime.fromisoformat(string) # type: ignore[attr-defined] + candidate = datetime.fromisoformat(string) except ValueError: LOGGER.debug("not an ISO date string: %s", string) try: @@ -383,7 +378,7 @@ def custom_parse( ) -def external_date_parser(string: str, outputformat: str) -> Optional[str]: +def external_date_parser(string: str, outputformat: str) -> str | None: """Use dateutil parser or dateparser module according to system settings""" LOGGER.debug("send to external parser: %s", string) try: @@ -393,17 +388,17 @@ def external_date_parser(string: str, outputformat: str) -> Optional[str]: target = None LOGGER.error("external parser error: %s %s", string, err) # issue with data type - return datetime.strftime(target, outputformat) if target else None + return target.strftime(outputformat) if target else None @lru_cache(maxsize=CACHE_SIZE) def try_date_expr( - string: Optional[str], + string: str | None, outputformat: str, extensive_search: bool, min_date: datetime, max_date: datetime, -) -> Optional[str]: +) -> str | None: """Use a series of heuristics and rules to parse a potential date expression""" if not string: return None @@ -440,7 +435,7 @@ def try_date_expr( def img_search( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Skim through image elements""" element = tree.find('.//meta[@property="og:image"][@content]') if element is not None: @@ -453,9 +448,9 @@ def img_search( def pattern_search( text: str, - date_pattern: Pattern[str], + date_pattern: re.Pattern[str], options: Extractor, -) -> Optional[str]: +) -> str | None: "Look for date expressions using a regular expression on a string of text." match = date_pattern.search(text) if match and is_valid_date( @@ -469,7 +464,7 @@ def pattern_search( def json_search( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Look for JSON time patterns in JSON sections of the tree""" # determine pattern json_pattern = JSON_PUBLISHED if options.original else JSON_MODIFIED @@ -479,14 +474,16 @@ def json_search( ): if not elem.text or '"date' not in elem.text: continue - return pattern_search(elem.text, json_pattern, options) + result = pattern_search(elem.text, json_pattern, options) + if result is not None: + return result return None def idiosyncrasies_search( htmlstring: str, options: Extractor, -) -> Optional[str]: +) -> str | None: """Look for author-written dates throughout the web page""" match = TEXT_PATTERNS.search(htmlstring) # EN+DE+TR if match: @@ -499,10 +496,9 @@ def idiosyncrasies_search( day, month = try_swap_values(int(parts[0]), int(parts[1])) year = correct_year(int(parts[2])) candidate = datetime(year, month, day) - if is_valid_date( - candidate, "%Y-%m-%d", earliest=options.min, latest=options.max - ): - return candidate.strftime(options.format) # type: ignore[union-attr] + return validate_and_convert( + candidate, options.format, earliest=options.min, latest=options.max + ) except (IndexError, ValueError): LOGGER.debug("cannot process idiosyncrasies: %s", match[0]) diff --git a/htmldate/meta.py b/htmldate/meta.py index c5faf6df..87a34a99 100644 --- a/htmldate/meta.py +++ b/htmldate/meta.py @@ -8,7 +8,6 @@ from .extractors import try_date_expr from .validators import filter_ymd_candidate, is_valid_date, is_valid_format - LOGGER = logging.getLogger(__name__) diff --git a/htmldate/utils.py b/htmldate/utils.py index 890d7fea..bd95f906 100644 --- a/htmldate/utils.py +++ b/htmldate/utils.py @@ -6,15 +6,14 @@ import logging import re +from dataclasses import dataclass from datetime import datetime -from typing import Any, List, Optional, Set, Union import urllib3 - # CChardet is faster and can be more accurate try: - from cchardet import detect as cchardet_detect # type: ignore + from cchardet import detect as cchardet_detect # type: ignore[import-untyped] except ImportError: cchardet_detect = None from charset_normalizer import from_bytes @@ -23,10 +22,9 @@ from .settings import MAX_FILE_SIZE - LOGGER = logging.getLogger(__name__) -UNICODE_ALIASES: Set[str] = {"utf-8", "utf_8"} +UNICODE_ALIASES: set[str] = {"utf-8", "utf_8"} urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) RETRY_STRATEGY = urllib3.util.Retry( @@ -44,32 +42,21 @@ FAULTY_HTML = re.compile(r"(", re.I) +# eq=False keeps identity-based hashing so instances stay usable as lru_cache keys +@dataclass(slots=True, eq=False) class Extractor: "Defines a class to store all extraction options." - __slots__ = ["extensive", "format", "max", "min", "original"] - - # consider dataclasses for Python 3.7+ - def __init__( - self, - extensive_search: bool, - max_date: datetime, - min_date: datetime, - original_date: bool, - outputformat: str, - ) -> None: - self.extensive: bool = extensive_search - self.format: str = outputformat - self.max: datetime = max_date - self.min: datetime = min_date - self.original: bool = original_date - - -def is_wrong_document(data: Any) -> bool: + extensive: bool + max: datetime + min: datetime + original: bool + format: str + + +def is_wrong_document(data: str | bytes | HtmlElement | None) -> bool: "Check if the input object is suitable to be processed." - if not data or len(data) > MAX_FILE_SIZE: - return True - return False + return not data or len(data) > MAX_FILE_SIZE def isutf8(data: bytes) -> bool: @@ -81,7 +68,7 @@ def isutf8(data: bytes) -> bool: return True -def detect_encoding(bytesobject: bytes) -> List[str]: +def detect_encoding(bytesobject: bytes) -> list[str]: """Read all input or first chunk and return a list of encodings""" # alternatives: https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py # unicode-test @@ -103,7 +90,7 @@ def detect_encoding(bytesobject: bytes) -> List[str]: return [g for g in guesses if g not in UNICODE_ALIASES] -def decode_file(filecontent: Union[bytes, str]) -> str: +def decode_file(filecontent: bytes | str) -> str: """Guess bytestring encoding and try to decode to Unicode string. Resort to destructive conversion otherwise.""" # init @@ -116,25 +103,24 @@ def decode_file(filecontent: Union[bytes, str]) -> str: htmltext = filecontent.decode(guessed_encoding) except (LookupError, UnicodeDecodeError): # VISCII: lookup LOGGER.warning("wrong encoding detected: %s", guessed_encoding) - htmltext = None else: break # return original content if nothing else succeeded return htmltext or str(filecontent, encoding="utf-8", errors="replace") -def decode_response(response: Any) -> str: +def decode_response(response: urllib3.response.HTTPResponse | bytes) -> str: """Read the urllib3 object corresponding to the server response, then try to guess its encoding and decode it to return a unicode string""" # urllib3 response object / bytes switch - if isinstance(response, urllib3.response.HTTPResponse) or hasattr(response, "data"): + if isinstance(response, urllib3.response.HTTPResponse): resp_content = response.data else: resp_content = response return decode_file(resp_content) -def fetch_url(url: str) -> Optional[str]: +def fetch_url(url: str) -> str | None: """Fetches page using urllib3 and decodes the response. Args: @@ -149,7 +135,7 @@ def fetch_url(url: str) -> Optional[str]: try: # read by streaming chunks (stream=True, iter_content=xx) # so we can stop downloading as soon as MAX_FILE_SIZE is reached - response = HTTP_POOL.request("GET", url, timeout=30) # type: ignore + response = HTTP_POOL.request("GET", url, timeout=30) except Exception as err: LOGGER.error("download error: %s %s", url, err) # sys.exc_info()[0] else: @@ -175,7 +161,7 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str: firstline, _, rest = htmlstring.partition("\n") htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest # other issue with malformed documents: check first three lines - for i, line in enumerate(iter(htmlstring.splitlines())): + for i, line in enumerate(htmlstring.splitlines()): if ""): htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1) break @@ -184,17 +170,16 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str: return htmlstring -def fromstring_bytes(htmlobject: str) -> Optional[HtmlElement]: +def fromstring_bytes(htmlobject: str) -> HtmlElement | None: "Try to pass bytes to LXML parser." - tree = None try: - tree = fromstring(htmlobject.encode("utf8"), parser=HTML_PARSER) + return fromstring(htmlobject.encode("utf8"), parser=HTML_PARSER) except Exception as err: LOGGER.error("lxml parser bytestring %s", err) - return tree + return None -def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElement]: +def load_html(htmlobject: bytes | str | HtmlElement) -> HtmlElement | None: """Load object given as input and validate its type (accepted: lxml.html tree, bytestring and string) """ @@ -203,7 +188,7 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen return htmlobject # do not accept any other type after this point if not isinstance(htmlobject, (bytes, str)): - raise TypeError("incompatible input type: %s", type(htmlobject)) + raise TypeError(f"incompatible input type: {type(htmlobject)}") # the string is a URL, download it if ( isinstance(htmlobject, str) @@ -211,10 +196,11 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen and " " not in htmlobject ): LOGGER.debug("URL detected, downloading: %s", htmlobject) - htmlobject = fetch_url(htmlobject) # type: ignore[assignment] + downloaded = fetch_url(htmlobject) # log the error and quit - if htmlobject is None: - raise ValueError("URL couldn't be processed: %s", htmlobject) + if downloaded is None: + raise ValueError(f"URL couldn't be processed: {htmlobject}") + htmlobject = downloaded # start processing tree = None # try to guess encoding and decode file: if None then keep original @@ -246,15 +232,20 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen return tree -def clean_html(tree: HtmlElement, elemlist: List[str]) -> HtmlElement: +def clean_html(tree: HtmlElement, elemlist: list[str]) -> HtmlElement: "Delete selected elements." - for element in tree.iter(elemlist): # type: ignore[call-overload] - parent = element.getparent() - if parent is not None: - parent.remove(element) + for element in tree.iter(elemlist): + # drop_tree() keeps the element's tail text (a date may sit right after a + # cleaned media element); fall back to remove() if it is unavailable + try: + element.drop_tree() + except AttributeError: # pragma: no cover + parent = element.getparent() + if parent is not None: + parent.remove(element) return tree def trim_text(string: str) -> str: "Remove superfluous space and normalize remaining space." - return " ".join(string.split()).strip() + return " ".join(string.split()) diff --git a/htmldate/validators.py b/htmldate/validators.py index c29fbad0..d8c5462d 100644 --- a/htmldate/validators.py +++ b/htmldate/validators.py @@ -4,24 +4,22 @@ """ import logging +import re from collections import Counter from datetime import datetime from functools import lru_cache -from time import mktime -from typing import Match, Optional, Pattern, Union, Counter as Counter_Type from .settings import CACHE_SIZE, MIN_DATE from .utils import Extractor - LOGGER = logging.getLogger(__name__) LOGGER.debug("minimum date setting: %s", MIN_DATE) @lru_cache(maxsize=CACHE_SIZE) def is_valid_date( - date_input: Optional[Union[datetime, str]], + date_input: datetime | str | None, outputformat: str, earliest: datetime, latest: datetime, @@ -58,16 +56,18 @@ def is_valid_date( def validate_and_convert( - date_input: Optional[Union[datetime, str]], + date_input: datetime | None, outputformat: str, earliest: datetime, latest: datetime, -) -> Optional[str]: +) -> str | None: "Robust validation and conversion for plausible dates." - if is_valid_date(date_input, outputformat, earliest, latest): + if date_input is not None and is_valid_date( + date_input, outputformat, earliest, latest + ): try: LOGGER.debug("custom parse result: %s", date_input) - return date_input.strftime(outputformat) # type: ignore + return date_input.strftime(outputformat) except ValueError as err: # pragma: no cover LOGGER.error("value error during conversion: %s %s", date_input, err) return None @@ -83,8 +83,8 @@ def is_valid_format(outputformat: str) -> bool: except (TypeError, ValueError) as err: LOGGER.error("wrong output format or type: %s %s", outputformat, err) return False - # test in abstracto (could be the only test) - if not isinstance(outputformat, str) or "%" not in outputformat: + # a format without any directive cannot produce a date + if "%" not in outputformat: LOGGER.error("malformed output format: %s", outputformat) return False return True @@ -93,14 +93,15 @@ def is_valid_format(outputformat: str) -> bool: def plausible_year_filter( htmlstring: str, *, - pattern: Pattern[str], - yearpat: Pattern[str], + pattern: re.Pattern[str], + yearpat: re.Pattern[str], earliest: datetime, latest: datetime, incomplete: bool = False, -) -> Counter_Type[str]: +) -> Counter[str]: """Filter the date patterns to find plausible years only""" occurrences = Counter(pattern.findall(htmlstring)) # slow! + min_year, max_year = earliest.year, latest.year for item in list(occurrences): # prevent RuntimeError year_match = yearpat.search(item) @@ -116,7 +117,7 @@ def plausible_year_filter( century = "19" if lastdigits[0] == "9" else "20" potential_year = int(century + lastdigits) - if not earliest.year <= potential_year <= latest.year: + if not min_year <= potential_year <= max_year: LOGGER.debug("no potential year: %s", item) del occurrences[item] @@ -126,7 +127,7 @@ def plausible_year_filter( def compare_values(reference: int, attempt: str, options: Extractor) -> int: """Compare the date expression to a reference""" try: - timestamp = int(mktime(datetime.strptime(attempt, options.format).timetuple())) + timestamp = int(datetime.strptime(attempt, options.format).timestamp()) except Exception as err: LOGGER.debug("datetime.strptime exception: %s for string %s", err, attempt) return reference @@ -139,14 +140,13 @@ def compare_values(reference: int, attempt: str, options: Extractor) -> int: @lru_cache(maxsize=CACHE_SIZE) def filter_ymd_candidate( - bestmatch: Match[str], - pattern: Pattern[str], - original_date: bool, + bestmatch: re.Match[str], + pattern: re.Pattern[str], copyear: int, outputformat: str, min_date: datetime, max_date: datetime, -) -> Optional[str]: +) -> str | None: """Filter free text candidates in the YMD format""" if bestmatch is not None: pagedate = "-".join([bestmatch[1], bestmatch[2], bestmatch[3]]) @@ -155,15 +155,6 @@ def filter_ymd_candidate( ): LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate) return convert_date(pagedate, "%Y-%m-%d", outputformat) - ## TODO: test and improve - # if original_date is True: - # if copyear == 0 or int(bestmatch[1]) <= copyear: - # LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate) - # return convert_date(pagedate, '%Y-%m-%d', outputformat) - # else: - # if copyear == 0 or int(bestmatch[1]) >= copyear: - # LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate) - # return convert_date(pagedate, '%Y-%m-%d', outputformat) return None @@ -180,7 +171,7 @@ def convert_date(datestring: str, inputformat: str, outputformat: str) -> str: return dateobject.strftime(outputformat) -def check_extracted_reference(reference: int, options: Extractor) -> Optional[str]: +def check_extracted_reference(reference: int, options: Extractor) -> str | None: """Test if the extracted reference date can be returned""" if reference > 0: dateobject = datetime.fromtimestamp(reference) @@ -192,25 +183,29 @@ def check_extracted_reference(reference: int, options: Extractor) -> Optional[st return None -def check_date_input( - date_object: Optional[Union[datetime, str]], default: datetime -) -> datetime: +def check_date_input(date_object: datetime | str | None, default: datetime) -> datetime: "Check if the input is a usable datetime or ISO date string, return default otherwise" if isinstance(date_object, datetime): return date_object if isinstance(date_object, str): try: - return datetime.fromisoformat(date_object) # type: ignore[attr-defined] + return datetime.fromisoformat(date_object) except ValueError: LOGGER.warning("invalid datetime string: %s", date_object) return default # no input or error thrown -def get_min_date(min_date: Optional[Union[datetime, str]]) -> datetime: +def get_min_date(min_date: datetime | str | None) -> datetime: """Validates the minimum date and/or defaults to earliest plausible date""" return check_date_input(min_date, MIN_DATE) -def get_max_date(max_date: Optional[Union[datetime, str]]) -> datetime: - """Validates the maximum date and/or defaults to latest plausible date""" - return check_date_input(max_date, datetime.now()) +def get_max_date(max_date: datetime | str | None) -> datetime: + """Validates the maximum date and/or defaults to the end of the current day. + A day-granular default stays stable across calls (unlike datetime.now()), + which lets the date-validation caches be reused from one document to the + next in batch processing, and accepts dates published earlier the same day.""" + end_of_today = datetime.now().replace( + hour=23, minute=59, second=59, microsecond=999999 + ) + return check_date_input(max_date, end_of_today) diff --git a/pyproject.toml b/pyproject.toml index a6504ec6..428bc284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ description = "Fast and robust extraction of original and updated publication da readme = "README.md" license = { text = "Apache 2.0" } dynamic = ["version"] -requires-python = ">=3.8" +requires-python = ">=3.10" authors = [ {name = "Adrien Barbaresi", email = "adrien.barbaresi@gmail.com"} ] @@ -37,8 +37,6 @@ classifiers = [ "Operating System :: POSIX :: Linux", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -52,9 +50,7 @@ classifiers = [ dependencies = [ "charset_normalizer >= 3.4.0", "dateparser >= 1.1.2", # 1.1.3+ slower - # see tests on Github Actions - "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'", - "lxml >= 5.3.0 ; platform_system != 'Darwin' or python_version > '3.8'", + "lxml >= 5.3.0", "python-dateutil >= 2.9.0.post0", "urllib3 >= 1.26, < 3", ] @@ -99,5 +95,11 @@ all = [ "htmldate[speed]", ] +[tool.black] +target-version = ["py310"] + +[tool.mypy] +warn_unused_ignores = true + [tool.pytest.ini_options] testpaths = "tests/*test*.py" diff --git a/setup.py b/setup.py deleted file mode 100644 index 8f91d615..00000000 --- a/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Seamlessly extract the date of web pages based on URL, header or body. -http://github.com/adbar/htmldate -""" - -import sys - -from setuptools import setup - - -# add argument to compile with mypyc -if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc": - sys.argv.pop(1) - USE_MYPYC = True - from mypyc.build import mypycify - - ext_modules = mypycify( - [ - "htmldate/__init__.py", - "htmldate/core.py", - "htmldate/extractors.py", - "htmldate/meta.py", - "htmldate/settings.py", - "htmldate/utils.py", - "htmldate/validators.py", - ], - opt_level="3", - multi_file=True, - ) -else: - ext_modules = [] - - -setup( - # mypyc or not - ext_modules=ext_modules, -) diff --git a/tests/evaluation.py b/tests/evaluation.py index 2f1f844d..394125dd 100644 --- a/tests/evaluation.py +++ b/tests/evaluation.py @@ -11,16 +11,23 @@ except ImportError: from charset_normalizer import detect -from articleDateExtractor import extractArticlePublishedDate -from date_guesser import guess_date -from goose3 import Goose -from newspaper import Article -from newspaper.article import ArticleDownloadState -from newsplease import NewsPlease - from htmldate import find_date from htmldate.validators import convert_date +# Optional third-party libraries, only needed for the full benchmark +# (i.e. comparison.py *without* --small). Guard the imports so the +# htmldate-only run works without these heavy/legacy packages installed. +try: + from articleDateExtractor import extractArticlePublishedDate + from date_guesser import guess_date + from goose3 import Goose + from newspaper import Article + from newspaper.article import ArticleDownloadState + from newsplease import NewsPlease +except ImportError: + extractArticlePublishedDate = guess_date = Goose = None + Article = ArticleDownloadState = NewsPlease = None + TEST_DIR = os.path.abspath(os.path.dirname(__file__)) # list the jsons containing the pages here @@ -32,7 +39,7 @@ with open(evalpath, "r", encoding="utf-8") as f: EVAL_PAGES.update(json.load(f)) -G = Goose() +G = Goose() if Goose is not None else None def load_document(filename): diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py index c646114f..ecc56c4a 100644 --- a/tests/realworld_tests.py +++ b/tests/realworld_tests.py @@ -768,9 +768,8 @@ def test_cli(): "Test the command-line interface" # third test: Linux and MacOS only if os.name != "nt": - testargs = [""] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = [] + args = parse_args(testargs) sys.stdin = open( os.path.join(TEST_DIR, "cache", "befifty.montauk.html"), "r", diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 533bb32c..00ae20de 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1515,7 +1515,6 @@ def test_idiosyncrasies(): def test_parser(): """test argument parsing for the command-line interface""" testargs = [ - "-f", "-v", "--original", "-max", @@ -1523,26 +1522,23 @@ def test_parser(): "-u", "https://www.example.org", ] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) assert args.fast is True assert args.original is True assert args.verbose is True assert args.maxdate == "2015-12-31" assert args.URL == "https://www.example.org" - testargs = ["-f", "-min", "2015-12-31"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = ["-min", "2015-12-31"] + args = parse_args(testargs) assert args.fast is True assert args.original is False assert args.verbose is False assert args.mindate == "2015-12-31" # version f = io.StringIO() - testargs = ["", "--version"] + testargs = ["--version"] with pytest.raises(SystemExit) as e, redirect_stdout(f): - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) assert e.type == SystemExit and e.value.code == 0 assert re.match( r"Htmldate [0-9]\.[0-9]+\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue() @@ -1552,8 +1548,7 @@ def test_parser(): def test_cli(): "Test the command-line interface" testargs = ["--original"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) assert cli_examine(None, args) is None assert cli_examine(" ", args) is None @@ -1608,16 +1603,14 @@ def test_cli(): ) # first test - testargs = ["", "-u", "123", "-v"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = ["-u", "123", "-v"] + args = parse_args(testargs) with pytest.raises(SystemExit) as err: process_args(args) assert err.type == SystemExit # meaningful test - testargs = ["", "-u", "https://httpbun.com/html"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = ["-u", "https://httpbun.com/html"] + args = parse_args(testargs) f = io.StringIO() with redirect_stdout(f): process_args(args) @@ -1638,8 +1631,7 @@ def test_download(): main() testargs = ["--original"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) url = "https://httpbin.org/status/200" teststring = fetch_url(url) From a066bd4702ada47cea094a46241916d3331968ef Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Sat, 30 May 2026 15:07:02 +0200 Subject: [PATCH 2/2] fix issues and use ruff --- .github/workflows/tests.yml | 10 +++------- CONTRIBUTING.md | 2 +- docs/index.rst | 6 +++--- htmldate/utils.py | 2 +- pyproject.toml | 15 +++++++++++---- tests/realworld_tests.py | 1 - tests/unit_tests.py | 6 +++--- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 24cd7629..bb7750ca 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -58,17 +58,13 @@ jobs: run: python -m pip install --upgrade -e ".[dev]" # tests - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with ruff + run: ruff check . - name: Code format and type checking if: ${{ matrix.python-version == env.REF_PY_VERSION }} run: | - black --check --diff htmldate + ruff format --check htmldate mypy -p htmldate - name: Install full dependencies diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8fd3507b..4eb1e9e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -25,7 +25,7 @@ and if there are no errors. 2. Run the tests and code quality tools: - Tests with `pytest` - Type checking with `mypy` on the directory: `mypy htmldate/` - - Code formatting with `black` on the directory as well + - Linting with `ruff check .` and formatting with `ruff format htmldate/` For further questions you can use [GitHub issues](https://github.com/adbar/htmldate/issues) or [E-Mail](https://adrien.barbaresi.eu/). diff --git a/docs/index.rst b/docs/index.rst index 5aa83b02..16b85ead 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,9 +21,9 @@ Htmldate: Find the Publication Date of Web Pages :target: https://doi.org/10.21105/joss.02439 :alt: JOSS article reference DOI: 10.21105/joss.02439 -.. image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/psf/black - :alt: Code style: black +.. image:: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json + :target: https://github.com/astral-sh/ruff + :alt: Ruff | diff --git a/htmldate/utils.py b/htmldate/utils.py index bd95f906..90382d53 100644 --- a/htmldate/utils.py +++ b/htmldate/utils.py @@ -13,7 +13,7 @@ # CChardet is faster and can be more accurate try: - from cchardet import detect as cchardet_detect # type: ignore[import-untyped] + from cchardet import detect as cchardet_detect except ImportError: cchardet_detect = None from charset_normalizer import from_bytes diff --git a/pyproject.toml b/pyproject.toml index 428bc284..86bc72e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,11 +75,10 @@ htmldate = "htmldate.cli:main" # Development extras [project.optional-dependencies] dev = [ - "black", - "flake8", "mypy", "pytest", "pytest-cov", + "ruff", "types-dateparser", "types-python-dateutil", "types-lxml", @@ -95,11 +94,19 @@ all = [ "htmldate[speed]", ] -[tool.black] -target-version = ["py310"] +[tool.ruff] +target-version = "py310" [tool.mypy] warn_unused_ignores = true +[[tool.mypy.overrides]] +# faust-cchardet (optional "speed" extra) ships no type stubs and is absent +# during the type-checking step, so silence its import here. This behaves the +# same whether the extra is installed (import-untyped) or not (import-not-found), +# unlike an inline ignore tied to a single error code. +module = ["cchardet"] +ignore_missing_imports = true + [tool.pytest.ini_options] testpaths = "tests/*test*.py" diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py index ecc56c4a..0e0ee129 100644 --- a/tests/realworld_tests.py +++ b/tests/realworld_tests.py @@ -10,7 +10,6 @@ import sys from contextlib import redirect_stdout -from unittest.mock import patch import pytest diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 00ae20de..92b37d32 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1538,8 +1538,8 @@ def test_parser(): f = io.StringIO() testargs = ["--version"] with pytest.raises(SystemExit) as e, redirect_stdout(f): - args = parse_args(testargs) - assert e.type == SystemExit and e.value.code == 0 + parse_args(testargs) + assert e.type is SystemExit and e.value.code == 0 assert re.match( r"Htmldate [0-9]\.[0-9]+\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue() ) @@ -1607,7 +1607,7 @@ def test_cli(): args = parse_args(testargs) with pytest.raises(SystemExit) as err: process_args(args) - assert err.type == SystemExit + assert err.type is SystemExit # meaningful test testargs = ["-u", "https://httpbun.com/html"] args = parse_args(testargs)