From db9f645439b56d71d5c3875d30bbdd0412d5b06d Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Sat, 30 May 2026 14:45:48 +0200 Subject: [PATCH 1/3] maintenance: update setup, harden code, add tests --- .pre-commit-config.yaml | 27 ------ CONTRIBUTING.md | 1 - htmldate/cli.py | 17 ++-- htmldate/core.py | 191 ++++++++++++++++++--------------------- htmldate/extractors.py | 56 ++++++------ htmldate/meta.py | 1 - htmldate/utils.py | 91 +++++++++---------- htmldate/validators.py | 69 +++++++------- pyproject.toml | 14 +-- setup.py | 37 -------- tests/evaluation.py | 23 +++-- tests/realworld_tests.py | 5 +- tests/unit_tests.py | 30 +++--- 13 files changed, 229 insertions(+), 333 deletions(-) delete mode 100644 .pre-commit-config.yaml delete mode 100644 setup.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 011dd200..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,27 +0,0 @@ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - - id: check-yaml - - id: end-of-file-fixer - - id: trailing-whitespace - - - repo: https://github.com/psf/black - rev: 24.3.0 - hooks: - - id: black - - #- repo: https://github.com/PyCQA/flake8 - # rev: 7.0.0 - # hooks: - # - id: flake8 - - #- repo: https://github.com/pycqa/isort - # rev: 5.13.2 - # hooks: - # - id: isort - - #- repo: https://github.com/pre-commit/mirrors-mypy - # rev: v1.9.0 - # hooks: - # - id: mypy diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f2287ed..8fd3507b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,7 +26,6 @@ and if there are no errors. - Tests with `pytest` - Type checking with `mypy` on the directory: `mypy htmldate/` - Code formatting with `black` on the directory as well - - Optional: install `pre-commit` to use the corresponding commit hooks For further questions you can use [GitHub issues](https://github.com/adbar/htmldate/issues) or [E-Mail](https://adrien.barbaresi.eu/). diff --git a/htmldate/cli.py b/htmldate/cli.py index 4b59d6cf..abba2e13 100644 --- a/htmldate/cli.py +++ b/htmldate/cli.py @@ -7,7 +7,6 @@ import sys from platform import python_version -from typing import Any, Optional, Union from lxml.html import HtmlElement @@ -17,12 +16,12 @@ def cli_examine( - htmlstring: Union[str, HtmlElement], - args: Any, -) -> Optional[str]: + htmlstring: str | HtmlElement | None, + args: argparse.Namespace, +) -> str | None: """Generic safeguards and triggers""" # safety check - if is_wrong_document(htmlstring): + if htmlstring is None or is_wrong_document(htmlstring): sys.stderr.write("# ERROR: document is empty or too large\n") return None return find_date( @@ -35,7 +34,7 @@ def cli_examine( ) -def parse_args(args: Any) -> Any: +def parse_args(args: list[str]) -> argparse.Namespace: """Define parser for command-line arguments""" argsparser = argparse.ArgumentParser() argsparser.add_argument( @@ -67,10 +66,10 @@ def parse_args(args: Any) -> Any: action="version", version=f"Htmldate {__version__} - Python {python_version()}", ) - return argsparser.parse_args() + return argsparser.parse_args(args) -def process_args(args: Any) -> None: +def process_args(args: argparse.Namespace) -> None: """Process the arguments passed on the command-line.""" # verbosity if args.verbose: @@ -98,7 +97,7 @@ def process_args(args: Any) -> None: with open(args.inputfile, mode="r", encoding="utf-8") as inputfile: for line in inputfile: htmltext = fetch_url(line.strip()) - result = cli_examine(htmltext, args) # type: ignore[arg-type] + result = cli_examine(htmltext, args) sys.stdout.write(f"{line.strip()}\t{result or 'None'}\n") diff --git a/htmldate/core.py b/htmldate/core.py index 98ba9188..f40489d6 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -7,10 +7,10 @@ import re from collections import Counter +from collections.abc import Callable from copy import deepcopy from datetime import datetime from functools import lru_cache, partial -from typing import Match, Optional, Pattern, Union, Counter as Counter_Type from lxml.html import HtmlElement, tostring @@ -68,7 +68,6 @@ validate_and_convert, ) - LOGGER = logging.getLogger(__name__) @@ -199,7 +198,7 @@ def logstring(element: HtmlElement) -> str: def examine_text( text: str, options: Extractor, -) -> Optional[str]: +) -> str | None: "Prepare text and try to extract a date." text = trim_text(text) @@ -216,7 +215,7 @@ def examine_date_elements( tree: HtmlElement, expression: str, options: Extractor, -) -> Optional[str]: +) -> str | None: """Check HTML elements one by one for date expressions""" elements = tree.xpath(expression) if not elements or len(elements) > MAX_POSSIBLE_CANDIDATES: @@ -235,7 +234,7 @@ def examine_date_elements( def examine_header( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """ Parse header elements to find date cues @@ -353,11 +352,11 @@ def examine_header( def select_candidate( - occurrences: Counter_Type[str], - catch: Pattern[str], - yearpat: Pattern[str], + occurrences: Counter[str], + catch: re.Pattern[str], + yearpat: re.Pattern[str], options: Extractor, -) -> Optional[Match[str]]: +) -> re.Match[str] | None: """Select a candidate among the most frequent matches""" if not occurrences or len(occurrences) > MAX_POSSIBLE_CANDIDATES: return None @@ -381,12 +380,8 @@ def select_candidate( if year_match: years.append(year_match[1]) - validation = [ - is_valid_date( - datetime(int(year), 1, 1), "%Y", earliest=options.min, latest=options.max - ) - for year in years - ] + min_year, max_year = options.min.year, options.max.year + validation = [min_year <= int(year) <= max_year for year in years] # safety net: plausibility if all(validation): @@ -409,11 +404,11 @@ def select_candidate( def search_pattern( htmlstring: str, - pattern: Pattern[str], - catch: Pattern[str], - yearpat: Pattern[str], + pattern: re.Pattern[str], + catch: re.Pattern[str], + yearpat: re.Pattern[str], options: Extractor, -) -> Optional[Match[str]]: +) -> re.Match[str] | None: """Chained candidate filtering and selection""" candidates = plausible_year_filter( htmlstring, @@ -443,7 +438,7 @@ def compare_reference( def examine_abbr_elements( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Scan the page for abbr elements and check if their content contains an eligible date""" elements = tree.findall(".//abbr") if 0 < len(elements) < MAX_POSSIBLE_CANDIDATES: @@ -500,7 +495,7 @@ def examine_abbr_elements( def examine_time_elements( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Scan the page for time elements and check if their content contains an eligible date""" elements = tree.findall(".//time") if 0 < len(elements) < MAX_POSSIBLE_CANDIDATES: @@ -521,16 +516,17 @@ def examine_time_elements( LOGGER.debug("shortcut for time pubdate found: %s", datetime_attr) # shortcuts: class attribute elif "class" in elem.attrib: + class_attr = elem.get("class", "") if options.original and ( - elem.get("class", "").startswith("entry-date") - or elem.get("class", "").startswith("entry-time") + class_attr.startswith("entry-date") + or class_attr.startswith("entry-time") ): shortcut_flag = True LOGGER.debug( "shortcut for time/datetime found: %s", datetime_attr ) # updated time - elif not options.original and elem.get("class") == "updated": + elif not options.original and class_attr == "updated": shortcut_flag = True LOGGER.debug( "shortcut for updated time/datetime found: %s", @@ -562,7 +558,7 @@ def examine_time_elements( return None -def normalize_match(match: Optional[Match[str]]) -> str: +def normalize_match(match: re.Match[str] | None) -> str: """Normalize string output by adding "0" if necessary, and optionally expand the year from two to four digits.""" day, month, year = (g.zfill(2) for g in match.groups() if g) # type: ignore[union-attr] @@ -571,7 +567,44 @@ def normalize_match(match: Optional[Match[str]]) -> str: return f"{year}-{month}-{day}" -def search_page(htmlstring: str, options: Extractor) -> Optional[str]: +def normalize_two_comp(item: str) -> str: + """Normalize a MM-YYYY style match into a YYYY-MM-01 string.""" + match = TWO_COMP_REGEX.match(item) + month = match[1].zfill(2) # type: ignore[index] + return "-".join([match[2], month, "01"]) # type: ignore[index] + + +def search_normalized( + htmlstring: str, + pattern: re.Pattern[str], + yearpat: re.Pattern[str], + normalizer: Callable[[str], str], + copyear: int, + options: Extractor, + *, + incomplete: bool = False, +) -> str | None: + """Filter plausible years, normalize each candidate to the YMD format, then + select the best match and validate it (shared candidate-selection pipeline).""" + candidates = plausible_year_filter( + htmlstring, + pattern=pattern, + yearpat=yearpat, + earliest=options.min, + latest=options.max, + incomplete=incomplete, + ) + # revert DD-MM-YYYY patterns before sorting + normalized = Counter( + {normalizer(item): count for item, count in candidates.items()} + ) + bestmatch = select_candidate(normalized, YMD_PATTERN, YMD_YEAR, options) + return filter_ymd_candidate( + bestmatch, pattern, copyear, options.format, options.min, options.max + ) + + +def search_page(htmlstring: str, options: Extractor) -> str | None: """ Opportunistically search the HTML text for common text patterns @@ -619,7 +652,6 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: result = filter_ymd_candidate( bestmatch, patterns[0], - options.original, copyear, options.format, options.min, @@ -629,30 +661,13 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: return result # YYYY-MM-DD/DD-MM-YYYY - candidates = plausible_year_filter( + result = search_normalized( htmlstring, - pattern=SELECT_YMD_PATTERN, - yearpat=SELECT_YMD_YEAR, - earliest=options.min, - latest=options.max, - ) - # revert DD-MM-YYYY patterns before sorting - replacement = {} - for item in candidates: - match = THREE_COMP_REGEX_A.match(item) - candidate = normalize_match(match) - replacement[candidate] = candidates[item] - candidates = Counter(replacement) - # select - bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) - result = filter_ymd_candidate( - bestmatch, SELECT_YMD_PATTERN, - options.original, + SELECT_YMD_YEAR, + lambda item: normalize_match(THREE_COMP_REGEX_A.match(item)), copyear, - options.format, - options.min, - options.max, + options, ) if result is not None: return result @@ -668,7 +683,6 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: result = filter_ymd_candidate( bestmatch, DATESTRINGS_PATTERN, - options.original, copyear, options.format, options.min, @@ -678,30 +692,14 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: return result # DD?/MM?/YY - candidates = plausible_year_filter( + result = search_normalized( htmlstring, - pattern=SLASHES_PATTERN, - yearpat=SLASHES_YEAR, - earliest=options.min, - latest=options.max, - incomplete=True, - ) - # revert DD-MM-YYYY patterns before sorting - replacement = {} - for item in candidates: - match = THREE_COMP_REGEX_B.match(item) - candidate = normalize_match(match) - replacement[candidate] = candidates[item] - candidates = Counter(replacement) - bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) - result = filter_ymd_candidate( - bestmatch, SLASHES_PATTERN, - options.original, + SLASHES_YEAR, + lambda item: normalize_match(THREE_COMP_REGEX_B.match(item)), copyear, - options.format, - options.min, - options.max, + options, + incomplete=True, ) if result is not None: return result @@ -732,44 +730,24 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: return result # 2 components, second option - candidates = plausible_year_filter( + result = search_normalized( htmlstring, - pattern=MMYYYY_PATTERN, - yearpat=MMYYYY_YEAR, - earliest=options.min, - latest=options.max, - incomplete=options.original, - ) - # revert DD-MM-YYYY patterns before sorting - replacement = {} - for item in candidates: - match = TWO_COMP_REGEX.match(item) - month = match[1] # type: ignore[index] - if len(month) == 1: - month = f"0{month}" - candidate = "-".join([match[2], month, "01"]) # type: ignore[index] - replacement[candidate] = candidates[item] - candidates = Counter(replacement) - # select - bestmatch = select_candidate(candidates, YMD_PATTERN, YMD_YEAR, options) - result = filter_ymd_candidate( - bestmatch, MMYYYY_PATTERN, - options.original, + MMYYYY_YEAR, + normalize_two_comp, copyear, - options.format, - options.min, - options.max, + options, + incomplete=options.original, ) if result is not None: return result # try full-blown text regex on all HTML? - dateobject = regex_parse(htmlstring) # type: ignore[assignment] + text_date = regex_parse(htmlstring) # todo: find all candidates and disambiguate? - if copyear == 0 or (dateobject and dateobject.year >= copyear): + if copyear == 0 or (text_date and text_date.year >= copyear): result = validate_and_convert( - dateobject, options.format, earliest=options.min, latest=options.max + text_date, options.format, earliest=options.min, latest=options.max ) if result is not None: return result @@ -806,16 +784,16 @@ def search_page(htmlstring: str, options: Extractor) -> Optional[str]: def find_date( - htmlobject: Union[bytes, str, HtmlElement], + htmlobject: bytes | str | HtmlElement, extensive_search: bool = True, original_date: bool = False, outputformat: str = "%Y-%m-%d", - url: Optional[str] = None, + url: str | None = None, verbose: bool = False, - min_date: Optional[Union[datetime, str]] = None, - max_date: Optional[Union[datetime, str]] = None, + min_date: datetime | str | None = None, + max_date: datetime | str | None = None, deferred_url_extractor: bool = False, -) -> Optional[str]: +) -> str | None: """ Extract dates from HTML documents using markup analysis and text patterns @@ -878,7 +856,6 @@ def find_date( # find_date.extensive_search = extensive_search # URL - url_result = None if url is None: # probe for canonical links urlelem = tree.find('.//link[@rel="canonical"]') @@ -909,9 +886,13 @@ def find_date( return abbr_result # first, prune tree + # only copy the tree if the caller passed one in: when we parsed it ourselves + # (string/bytes/URL input) we own it and can clean it in place, avoiding a + # costly deepcopy of the whole document + pruning_tree = deepcopy(tree) if isinstance(htmlobject, HtmlElement) else tree try: search_tree, discarded = discard_unwanted( - clean_html(deepcopy(tree), CLEANING_LIST) + clean_html(pruning_tree, CLEANING_LIST) ) # rare LXML error: no NULL bytes or control characters except ValueError: # pragma: no cover diff --git a/htmldate/extractors.py b/htmldate/extractors.py index 9435c876..01ad9025 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -8,10 +8,9 @@ from datetime import datetime from functools import lru_cache -from typing import List, Optional, Pattern, Tuple # coverage for date parsing -from dateparser import DateDataParser # type: ignore # third-party, slow +from dateparser import DateDataParser # type: ignore[attr-defined] # third-party, slow from dateutil.parser import parse as dateutil_parse @@ -23,7 +22,6 @@ from .utils import Extractor, trim_text from .validators import convert_date, is_valid_date, validate_and_convert - LOGGER = logging.getLogger(__name__) EXTERNAL_PARSER = DateDataParser( @@ -120,9 +118,7 @@ rf"""(?P{REGEX_MONTHS})\s (?P{DAY_RE})(?:st|nd|rd|th)?,? (?P{YEAR_RE})| (?P{DAY_RE})(?:st|nd|rd|th|\.)? (?:of )? -(?P{REGEX_MONTHS})[,.]? (?P{YEAR_RE})""".replace( - "\n", "" - ), +(?P{REGEX_MONTHS})[,.]? (?P{YEAR_RE})""".replace("\n", ""), re.I, ) @@ -213,7 +209,7 @@ SIMPLE_PATTERN = re.compile(rf"(? Tuple[HtmlElement, List[HtmlElement]]: +def discard_unwanted(tree: HtmlElement) -> tuple[HtmlElement, list[HtmlElement]]: """Delete unwanted sections of an HTML document and return them as a list""" my_discarded = [] for subtree in DISCARD_EXPRESSIONS(tree): @@ -223,9 +219,9 @@ def discard_unwanted(tree: HtmlElement) -> Tuple[HtmlElement, List[HtmlElement]] def extract_url_date( - testurl: Optional[str], + testurl: str | None, options: Extractor, -) -> Optional[str]: +) -> str | None: """Extract the date out of an URL string complying with the Y-M-D format""" if testurl is not None: match = COMPLETE_URL.search(testurl) @@ -233,10 +229,9 @@ def extract_url_date( LOGGER.debug("found date in URL: %s", match[0]) try: dateobject = datetime(int(match[1]), int(match[2]), int(match[3])) - if is_valid_date( + return validate_and_convert( dateobject, options.format, earliest=options.min, latest=options.max - ): - return dateobject.strftime(options.format) + ) except ValueError as err: # pragma: no cover LOGGER.debug("conversion error: %s %s", match[0], err) return None @@ -249,12 +244,12 @@ def correct_year(year: int) -> int: return year -def try_swap_values(day: int, month: int) -> Tuple[int, int]: +def try_swap_values(day: int, month: int) -> tuple[int, int]: """Swap day and month values if it seems feasible.""" return (month, day) if month > 12 and day <= 12 else (day, month) -def regex_parse(string: str) -> Optional[datetime]: +def regex_parse(string: str) -> datetime | None: """Try full-text parse for date elements using a series of regular expressions with particular emphasis on English, French, German and Turkish""" # https://github.com/vi3k6i5/flashtext ? @@ -285,7 +280,7 @@ def regex_parse(string: str) -> Optional[datetime]: def custom_parse( string: str, outputformat: str, min_date: datetime, max_date: datetime -) -> Optional[str]: +) -> str | None: """Try to bypass the slow dateparser""" LOGGER.debug("custom parse test: %s", string) @@ -303,7 +298,7 @@ def custom_parse( # b. much faster than extensive parsing else: try: - candidate = datetime.fromisoformat(string) # type: ignore[attr-defined] + candidate = datetime.fromisoformat(string) except ValueError: LOGGER.debug("not an ISO date string: %s", string) try: @@ -383,7 +378,7 @@ def custom_parse( ) -def external_date_parser(string: str, outputformat: str) -> Optional[str]: +def external_date_parser(string: str, outputformat: str) -> str | None: """Use dateutil parser or dateparser module according to system settings""" LOGGER.debug("send to external parser: %s", string) try: @@ -393,17 +388,17 @@ def external_date_parser(string: str, outputformat: str) -> Optional[str]: target = None LOGGER.error("external parser error: %s %s", string, err) # issue with data type - return datetime.strftime(target, outputformat) if target else None + return target.strftime(outputformat) if target else None @lru_cache(maxsize=CACHE_SIZE) def try_date_expr( - string: Optional[str], + string: str | None, outputformat: str, extensive_search: bool, min_date: datetime, max_date: datetime, -) -> Optional[str]: +) -> str | None: """Use a series of heuristics and rules to parse a potential date expression""" if not string: return None @@ -440,7 +435,7 @@ def try_date_expr( def img_search( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Skim through image elements""" element = tree.find('.//meta[@property="og:image"][@content]') if element is not None: @@ -453,9 +448,9 @@ def img_search( def pattern_search( text: str, - date_pattern: Pattern[str], + date_pattern: re.Pattern[str], options: Extractor, -) -> Optional[str]: +) -> str | None: "Look for date expressions using a regular expression on a string of text." match = date_pattern.search(text) if match and is_valid_date( @@ -469,7 +464,7 @@ def pattern_search( def json_search( tree: HtmlElement, options: Extractor, -) -> Optional[str]: +) -> str | None: """Look for JSON time patterns in JSON sections of the tree""" # determine pattern json_pattern = JSON_PUBLISHED if options.original else JSON_MODIFIED @@ -479,14 +474,16 @@ def json_search( ): if not elem.text or '"date' not in elem.text: continue - return pattern_search(elem.text, json_pattern, options) + result = pattern_search(elem.text, json_pattern, options) + if result is not None: + return result return None def idiosyncrasies_search( htmlstring: str, options: Extractor, -) -> Optional[str]: +) -> str | None: """Look for author-written dates throughout the web page""" match = TEXT_PATTERNS.search(htmlstring) # EN+DE+TR if match: @@ -499,10 +496,9 @@ def idiosyncrasies_search( day, month = try_swap_values(int(parts[0]), int(parts[1])) year = correct_year(int(parts[2])) candidate = datetime(year, month, day) - if is_valid_date( - candidate, "%Y-%m-%d", earliest=options.min, latest=options.max - ): - return candidate.strftime(options.format) # type: ignore[union-attr] + return validate_and_convert( + candidate, options.format, earliest=options.min, latest=options.max + ) except (IndexError, ValueError): LOGGER.debug("cannot process idiosyncrasies: %s", match[0]) diff --git a/htmldate/meta.py b/htmldate/meta.py index c5faf6df..87a34a99 100644 --- a/htmldate/meta.py +++ b/htmldate/meta.py @@ -8,7 +8,6 @@ from .extractors import try_date_expr from .validators import filter_ymd_candidate, is_valid_date, is_valid_format - LOGGER = logging.getLogger(__name__) diff --git a/htmldate/utils.py b/htmldate/utils.py index 890d7fea..bd95f906 100644 --- a/htmldate/utils.py +++ b/htmldate/utils.py @@ -6,15 +6,14 @@ import logging import re +from dataclasses import dataclass from datetime import datetime -from typing import Any, List, Optional, Set, Union import urllib3 - # CChardet is faster and can be more accurate try: - from cchardet import detect as cchardet_detect # type: ignore + from cchardet import detect as cchardet_detect # type: ignore[import-untyped] except ImportError: cchardet_detect = None from charset_normalizer import from_bytes @@ -23,10 +22,9 @@ from .settings import MAX_FILE_SIZE - LOGGER = logging.getLogger(__name__) -UNICODE_ALIASES: Set[str] = {"utf-8", "utf_8"} +UNICODE_ALIASES: set[str] = {"utf-8", "utf_8"} urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) RETRY_STRATEGY = urllib3.util.Retry( @@ -44,32 +42,21 @@ FAULTY_HTML = re.compile(r"(", re.I) +# eq=False keeps identity-based hashing so instances stay usable as lru_cache keys +@dataclass(slots=True, eq=False) class Extractor: "Defines a class to store all extraction options." - __slots__ = ["extensive", "format", "max", "min", "original"] - - # consider dataclasses for Python 3.7+ - def __init__( - self, - extensive_search: bool, - max_date: datetime, - min_date: datetime, - original_date: bool, - outputformat: str, - ) -> None: - self.extensive: bool = extensive_search - self.format: str = outputformat - self.max: datetime = max_date - self.min: datetime = min_date - self.original: bool = original_date - - -def is_wrong_document(data: Any) -> bool: + extensive: bool + max: datetime + min: datetime + original: bool + format: str + + +def is_wrong_document(data: str | bytes | HtmlElement | None) -> bool: "Check if the input object is suitable to be processed." - if not data or len(data) > MAX_FILE_SIZE: - return True - return False + return not data or len(data) > MAX_FILE_SIZE def isutf8(data: bytes) -> bool: @@ -81,7 +68,7 @@ def isutf8(data: bytes) -> bool: return True -def detect_encoding(bytesobject: bytes) -> List[str]: +def detect_encoding(bytesobject: bytes) -> list[str]: """Read all input or first chunk and return a list of encodings""" # alternatives: https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py # unicode-test @@ -103,7 +90,7 @@ def detect_encoding(bytesobject: bytes) -> List[str]: return [g for g in guesses if g not in UNICODE_ALIASES] -def decode_file(filecontent: Union[bytes, str]) -> str: +def decode_file(filecontent: bytes | str) -> str: """Guess bytestring encoding and try to decode to Unicode string. Resort to destructive conversion otherwise.""" # init @@ -116,25 +103,24 @@ def decode_file(filecontent: Union[bytes, str]) -> str: htmltext = filecontent.decode(guessed_encoding) except (LookupError, UnicodeDecodeError): # VISCII: lookup LOGGER.warning("wrong encoding detected: %s", guessed_encoding) - htmltext = None else: break # return original content if nothing else succeeded return htmltext or str(filecontent, encoding="utf-8", errors="replace") -def decode_response(response: Any) -> str: +def decode_response(response: urllib3.response.HTTPResponse | bytes) -> str: """Read the urllib3 object corresponding to the server response, then try to guess its encoding and decode it to return a unicode string""" # urllib3 response object / bytes switch - if isinstance(response, urllib3.response.HTTPResponse) or hasattr(response, "data"): + if isinstance(response, urllib3.response.HTTPResponse): resp_content = response.data else: resp_content = response return decode_file(resp_content) -def fetch_url(url: str) -> Optional[str]: +def fetch_url(url: str) -> str | None: """Fetches page using urllib3 and decodes the response. Args: @@ -149,7 +135,7 @@ def fetch_url(url: str) -> Optional[str]: try: # read by streaming chunks (stream=True, iter_content=xx) # so we can stop downloading as soon as MAX_FILE_SIZE is reached - response = HTTP_POOL.request("GET", url, timeout=30) # type: ignore + response = HTTP_POOL.request("GET", url, timeout=30) except Exception as err: LOGGER.error("download error: %s %s", url, err) # sys.exc_info()[0] else: @@ -175,7 +161,7 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str: firstline, _, rest = htmlstring.partition("\n") htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest # other issue with malformed documents: check first three lines - for i, line in enumerate(iter(htmlstring.splitlines())): + for i, line in enumerate(htmlstring.splitlines()): if ""): htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1) break @@ -184,17 +170,16 @@ def repair_faulty_html(htmlstring: str, beginning: str) -> str: return htmlstring -def fromstring_bytes(htmlobject: str) -> Optional[HtmlElement]: +def fromstring_bytes(htmlobject: str) -> HtmlElement | None: "Try to pass bytes to LXML parser." - tree = None try: - tree = fromstring(htmlobject.encode("utf8"), parser=HTML_PARSER) + return fromstring(htmlobject.encode("utf8"), parser=HTML_PARSER) except Exception as err: LOGGER.error("lxml parser bytestring %s", err) - return tree + return None -def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElement]: +def load_html(htmlobject: bytes | str | HtmlElement) -> HtmlElement | None: """Load object given as input and validate its type (accepted: lxml.html tree, bytestring and string) """ @@ -203,7 +188,7 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen return htmlobject # do not accept any other type after this point if not isinstance(htmlobject, (bytes, str)): - raise TypeError("incompatible input type: %s", type(htmlobject)) + raise TypeError(f"incompatible input type: {type(htmlobject)}") # the string is a URL, download it if ( isinstance(htmlobject, str) @@ -211,10 +196,11 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen and " " not in htmlobject ): LOGGER.debug("URL detected, downloading: %s", htmlobject) - htmlobject = fetch_url(htmlobject) # type: ignore[assignment] + downloaded = fetch_url(htmlobject) # log the error and quit - if htmlobject is None: - raise ValueError("URL couldn't be processed: %s", htmlobject) + if downloaded is None: + raise ValueError(f"URL couldn't be processed: {htmlobject}") + htmlobject = downloaded # start processing tree = None # try to guess encoding and decode file: if None then keep original @@ -246,15 +232,20 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen return tree -def clean_html(tree: HtmlElement, elemlist: List[str]) -> HtmlElement: +def clean_html(tree: HtmlElement, elemlist: list[str]) -> HtmlElement: "Delete selected elements." - for element in tree.iter(elemlist): # type: ignore[call-overload] - parent = element.getparent() - if parent is not None: - parent.remove(element) + for element in tree.iter(elemlist): + # drop_tree() keeps the element's tail text (a date may sit right after a + # cleaned media element); fall back to remove() if it is unavailable + try: + element.drop_tree() + except AttributeError: # pragma: no cover + parent = element.getparent() + if parent is not None: + parent.remove(element) return tree def trim_text(string: str) -> str: "Remove superfluous space and normalize remaining space." - return " ".join(string.split()).strip() + return " ".join(string.split()) diff --git a/htmldate/validators.py b/htmldate/validators.py index c29fbad0..d8c5462d 100644 --- a/htmldate/validators.py +++ b/htmldate/validators.py @@ -4,24 +4,22 @@ """ import logging +import re from collections import Counter from datetime import datetime from functools import lru_cache -from time import mktime -from typing import Match, Optional, Pattern, Union, Counter as Counter_Type from .settings import CACHE_SIZE, MIN_DATE from .utils import Extractor - LOGGER = logging.getLogger(__name__) LOGGER.debug("minimum date setting: %s", MIN_DATE) @lru_cache(maxsize=CACHE_SIZE) def is_valid_date( - date_input: Optional[Union[datetime, str]], + date_input: datetime | str | None, outputformat: str, earliest: datetime, latest: datetime, @@ -58,16 +56,18 @@ def is_valid_date( def validate_and_convert( - date_input: Optional[Union[datetime, str]], + date_input: datetime | None, outputformat: str, earliest: datetime, latest: datetime, -) -> Optional[str]: +) -> str | None: "Robust validation and conversion for plausible dates." - if is_valid_date(date_input, outputformat, earliest, latest): + if date_input is not None and is_valid_date( + date_input, outputformat, earliest, latest + ): try: LOGGER.debug("custom parse result: %s", date_input) - return date_input.strftime(outputformat) # type: ignore + return date_input.strftime(outputformat) except ValueError as err: # pragma: no cover LOGGER.error("value error during conversion: %s %s", date_input, err) return None @@ -83,8 +83,8 @@ def is_valid_format(outputformat: str) -> bool: except (TypeError, ValueError) as err: LOGGER.error("wrong output format or type: %s %s", outputformat, err) return False - # test in abstracto (could be the only test) - if not isinstance(outputformat, str) or "%" not in outputformat: + # a format without any directive cannot produce a date + if "%" not in outputformat: LOGGER.error("malformed output format: %s", outputformat) return False return True @@ -93,14 +93,15 @@ def is_valid_format(outputformat: str) -> bool: def plausible_year_filter( htmlstring: str, *, - pattern: Pattern[str], - yearpat: Pattern[str], + pattern: re.Pattern[str], + yearpat: re.Pattern[str], earliest: datetime, latest: datetime, incomplete: bool = False, -) -> Counter_Type[str]: +) -> Counter[str]: """Filter the date patterns to find plausible years only""" occurrences = Counter(pattern.findall(htmlstring)) # slow! + min_year, max_year = earliest.year, latest.year for item in list(occurrences): # prevent RuntimeError year_match = yearpat.search(item) @@ -116,7 +117,7 @@ def plausible_year_filter( century = "19" if lastdigits[0] == "9" else "20" potential_year = int(century + lastdigits) - if not earliest.year <= potential_year <= latest.year: + if not min_year <= potential_year <= max_year: LOGGER.debug("no potential year: %s", item) del occurrences[item] @@ -126,7 +127,7 @@ def plausible_year_filter( def compare_values(reference: int, attempt: str, options: Extractor) -> int: """Compare the date expression to a reference""" try: - timestamp = int(mktime(datetime.strptime(attempt, options.format).timetuple())) + timestamp = int(datetime.strptime(attempt, options.format).timestamp()) except Exception as err: LOGGER.debug("datetime.strptime exception: %s for string %s", err, attempt) return reference @@ -139,14 +140,13 @@ def compare_values(reference: int, attempt: str, options: Extractor) -> int: @lru_cache(maxsize=CACHE_SIZE) def filter_ymd_candidate( - bestmatch: Match[str], - pattern: Pattern[str], - original_date: bool, + bestmatch: re.Match[str], + pattern: re.Pattern[str], copyear: int, outputformat: str, min_date: datetime, max_date: datetime, -) -> Optional[str]: +) -> str | None: """Filter free text candidates in the YMD format""" if bestmatch is not None: pagedate = "-".join([bestmatch[1], bestmatch[2], bestmatch[3]]) @@ -155,15 +155,6 @@ def filter_ymd_candidate( ): LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate) return convert_date(pagedate, "%Y-%m-%d", outputformat) - ## TODO: test and improve - # if original_date is True: - # if copyear == 0 or int(bestmatch[1]) <= copyear: - # LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate) - # return convert_date(pagedate, '%Y-%m-%d', outputformat) - # else: - # if copyear == 0 or int(bestmatch[1]) >= copyear: - # LOGGER.debug('date found for pattern "%s": %s', pattern, pagedate) - # return convert_date(pagedate, '%Y-%m-%d', outputformat) return None @@ -180,7 +171,7 @@ def convert_date(datestring: str, inputformat: str, outputformat: str) -> str: return dateobject.strftime(outputformat) -def check_extracted_reference(reference: int, options: Extractor) -> Optional[str]: +def check_extracted_reference(reference: int, options: Extractor) -> str | None: """Test if the extracted reference date can be returned""" if reference > 0: dateobject = datetime.fromtimestamp(reference) @@ -192,25 +183,29 @@ def check_extracted_reference(reference: int, options: Extractor) -> Optional[st return None -def check_date_input( - date_object: Optional[Union[datetime, str]], default: datetime -) -> datetime: +def check_date_input(date_object: datetime | str | None, default: datetime) -> datetime: "Check if the input is a usable datetime or ISO date string, return default otherwise" if isinstance(date_object, datetime): return date_object if isinstance(date_object, str): try: - return datetime.fromisoformat(date_object) # type: ignore[attr-defined] + return datetime.fromisoformat(date_object) except ValueError: LOGGER.warning("invalid datetime string: %s", date_object) return default # no input or error thrown -def get_min_date(min_date: Optional[Union[datetime, str]]) -> datetime: +def get_min_date(min_date: datetime | str | None) -> datetime: """Validates the minimum date and/or defaults to earliest plausible date""" return check_date_input(min_date, MIN_DATE) -def get_max_date(max_date: Optional[Union[datetime, str]]) -> datetime: - """Validates the maximum date and/or defaults to latest plausible date""" - return check_date_input(max_date, datetime.now()) +def get_max_date(max_date: datetime | str | None) -> datetime: + """Validates the maximum date and/or defaults to the end of the current day. + A day-granular default stays stable across calls (unlike datetime.now()), + which lets the date-validation caches be reused from one document to the + next in batch processing, and accepts dates published earlier the same day.""" + end_of_today = datetime.now().replace( + hour=23, minute=59, second=59, microsecond=999999 + ) + return check_date_input(max_date, end_of_today) diff --git a/pyproject.toml b/pyproject.toml index a6504ec6..428bc284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ description = "Fast and robust extraction of original and updated publication da readme = "README.md" license = { text = "Apache 2.0" } dynamic = ["version"] -requires-python = ">=3.8" +requires-python = ">=3.10" authors = [ {name = "Adrien Barbaresi", email = "adrien.barbaresi@gmail.com"} ] @@ -37,8 +37,6 @@ classifiers = [ "Operating System :: POSIX :: Linux", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -52,9 +50,7 @@ classifiers = [ dependencies = [ "charset_normalizer >= 3.4.0", "dateparser >= 1.1.2", # 1.1.3+ slower - # see tests on Github Actions - "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'", - "lxml >= 5.3.0 ; platform_system != 'Darwin' or python_version > '3.8'", + "lxml >= 5.3.0", "python-dateutil >= 2.9.0.post0", "urllib3 >= 1.26, < 3", ] @@ -99,5 +95,11 @@ all = [ "htmldate[speed]", ] +[tool.black] +target-version = ["py310"] + +[tool.mypy] +warn_unused_ignores = true + [tool.pytest.ini_options] testpaths = "tests/*test*.py" diff --git a/setup.py b/setup.py deleted file mode 100644 index 8f91d615..00000000 --- a/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Seamlessly extract the date of web pages based on URL, header or body. -http://github.com/adbar/htmldate -""" - -import sys - -from setuptools import setup - - -# add argument to compile with mypyc -if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc": - sys.argv.pop(1) - USE_MYPYC = True - from mypyc.build import mypycify - - ext_modules = mypycify( - [ - "htmldate/__init__.py", - "htmldate/core.py", - "htmldate/extractors.py", - "htmldate/meta.py", - "htmldate/settings.py", - "htmldate/utils.py", - "htmldate/validators.py", - ], - opt_level="3", - multi_file=True, - ) -else: - ext_modules = [] - - -setup( - # mypyc or not - ext_modules=ext_modules, -) diff --git a/tests/evaluation.py b/tests/evaluation.py index 2f1f844d..394125dd 100644 --- a/tests/evaluation.py +++ b/tests/evaluation.py @@ -11,16 +11,23 @@ except ImportError: from charset_normalizer import detect -from articleDateExtractor import extractArticlePublishedDate -from date_guesser import guess_date -from goose3 import Goose -from newspaper import Article -from newspaper.article import ArticleDownloadState -from newsplease import NewsPlease - from htmldate import find_date from htmldate.validators import convert_date +# Optional third-party libraries, only needed for the full benchmark +# (i.e. comparison.py *without* --small). Guard the imports so the +# htmldate-only run works without these heavy/legacy packages installed. +try: + from articleDateExtractor import extractArticlePublishedDate + from date_guesser import guess_date + from goose3 import Goose + from newspaper import Article + from newspaper.article import ArticleDownloadState + from newsplease import NewsPlease +except ImportError: + extractArticlePublishedDate = guess_date = Goose = None + Article = ArticleDownloadState = NewsPlease = None + TEST_DIR = os.path.abspath(os.path.dirname(__file__)) # list the jsons containing the pages here @@ -32,7 +39,7 @@ with open(evalpath, "r", encoding="utf-8") as f: EVAL_PAGES.update(json.load(f)) -G = Goose() +G = Goose() if Goose is not None else None def load_document(filename): diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py index c646114f..ecc56c4a 100644 --- a/tests/realworld_tests.py +++ b/tests/realworld_tests.py @@ -768,9 +768,8 @@ def test_cli(): "Test the command-line interface" # third test: Linux and MacOS only if os.name != "nt": - testargs = [""] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = [] + args = parse_args(testargs) sys.stdin = open( os.path.join(TEST_DIR, "cache", "befifty.montauk.html"), "r", diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 533bb32c..00ae20de 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1515,7 +1515,6 @@ def test_idiosyncrasies(): def test_parser(): """test argument parsing for the command-line interface""" testargs = [ - "-f", "-v", "--original", "-max", @@ -1523,26 +1522,23 @@ def test_parser(): "-u", "https://www.example.org", ] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) assert args.fast is True assert args.original is True assert args.verbose is True assert args.maxdate == "2015-12-31" assert args.URL == "https://www.example.org" - testargs = ["-f", "-min", "2015-12-31"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = ["-min", "2015-12-31"] + args = parse_args(testargs) assert args.fast is True assert args.original is False assert args.verbose is False assert args.mindate == "2015-12-31" # version f = io.StringIO() - testargs = ["", "--version"] + testargs = ["--version"] with pytest.raises(SystemExit) as e, redirect_stdout(f): - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) assert e.type == SystemExit and e.value.code == 0 assert re.match( r"Htmldate [0-9]\.[0-9]+\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue() @@ -1552,8 +1548,7 @@ def test_parser(): def test_cli(): "Test the command-line interface" testargs = ["--original"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) assert cli_examine(None, args) is None assert cli_examine(" ", args) is None @@ -1608,16 +1603,14 @@ def test_cli(): ) # first test - testargs = ["", "-u", "123", "-v"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = ["-u", "123", "-v"] + args = parse_args(testargs) with pytest.raises(SystemExit) as err: process_args(args) assert err.type == SystemExit # meaningful test - testargs = ["", "-u", "https://httpbun.com/html"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + testargs = ["-u", "https://httpbun.com/html"] + args = parse_args(testargs) f = io.StringIO() with redirect_stdout(f): process_args(args) @@ -1638,8 +1631,7 @@ def test_download(): main() testargs = ["--original"] - with patch.object(sys, "argv", testargs): - args = parse_args(testargs) + args = parse_args(testargs) url = "https://httpbin.org/status/200" teststring = fetch_url(url) From a066bd4702ada47cea094a46241916d3331968ef Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Sat, 30 May 2026 15:07:02 +0200 Subject: [PATCH 2/3] fix issues and use ruff --- .github/workflows/tests.yml | 10 +++------- CONTRIBUTING.md | 2 +- docs/index.rst | 6 +++--- htmldate/utils.py | 2 +- pyproject.toml | 15 +++++++++++---- tests/realworld_tests.py | 1 - tests/unit_tests.py | 6 +++--- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 24cd7629..bb7750ca 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -58,17 +58,13 @@ jobs: run: python -m pip install --upgrade -e ".[dev]" # tests - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with ruff + run: ruff check . - name: Code format and type checking if: ${{ matrix.python-version == env.REF_PY_VERSION }} run: | - black --check --diff htmldate + ruff format --check htmldate mypy -p htmldate - name: Install full dependencies diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8fd3507b..4eb1e9e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -25,7 +25,7 @@ and if there are no errors. 2. Run the tests and code quality tools: - Tests with `pytest` - Type checking with `mypy` on the directory: `mypy htmldate/` - - Code formatting with `black` on the directory as well + - Linting with `ruff check .` and formatting with `ruff format htmldate/` For further questions you can use [GitHub issues](https://github.com/adbar/htmldate/issues) or [E-Mail](https://adrien.barbaresi.eu/). diff --git a/docs/index.rst b/docs/index.rst index 5aa83b02..16b85ead 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,9 +21,9 @@ Htmldate: Find the Publication Date of Web Pages :target: https://doi.org/10.21105/joss.02439 :alt: JOSS article reference DOI: 10.21105/joss.02439 -.. image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/psf/black - :alt: Code style: black +.. image:: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json + :target: https://github.com/astral-sh/ruff + :alt: Ruff | diff --git a/htmldate/utils.py b/htmldate/utils.py index bd95f906..90382d53 100644 --- a/htmldate/utils.py +++ b/htmldate/utils.py @@ -13,7 +13,7 @@ # CChardet is faster and can be more accurate try: - from cchardet import detect as cchardet_detect # type: ignore[import-untyped] + from cchardet import detect as cchardet_detect except ImportError: cchardet_detect = None from charset_normalizer import from_bytes diff --git a/pyproject.toml b/pyproject.toml index 428bc284..86bc72e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,11 +75,10 @@ htmldate = "htmldate.cli:main" # Development extras [project.optional-dependencies] dev = [ - "black", - "flake8", "mypy", "pytest", "pytest-cov", + "ruff", "types-dateparser", "types-python-dateutil", "types-lxml", @@ -95,11 +94,19 @@ all = [ "htmldate[speed]", ] -[tool.black] -target-version = ["py310"] +[tool.ruff] +target-version = "py310" [tool.mypy] warn_unused_ignores = true +[[tool.mypy.overrides]] +# faust-cchardet (optional "speed" extra) ships no type stubs and is absent +# during the type-checking step, so silence its import here. This behaves the +# same whether the extra is installed (import-untyped) or not (import-not-found), +# unlike an inline ignore tied to a single error code. +module = ["cchardet"] +ignore_missing_imports = true + [tool.pytest.ini_options] testpaths = "tests/*test*.py" diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py index ecc56c4a..0e0ee129 100644 --- a/tests/realworld_tests.py +++ b/tests/realworld_tests.py @@ -10,7 +10,6 @@ import sys from contextlib import redirect_stdout -from unittest.mock import patch import pytest diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 00ae20de..92b37d32 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1538,8 +1538,8 @@ def test_parser(): f = io.StringIO() testargs = ["--version"] with pytest.raises(SystemExit) as e, redirect_stdout(f): - args = parse_args(testargs) - assert e.type == SystemExit and e.value.code == 0 + parse_args(testargs) + assert e.type is SystemExit and e.value.code == 0 assert re.match( r"Htmldate [0-9]\.[0-9]+\.[0-9] - Python [0-9]\.[0-9]+\.[0-9]", f.getvalue() ) @@ -1607,7 +1607,7 @@ def test_cli(): args = parse_args(testargs) with pytest.raises(SystemExit) as err: process_args(args) - assert err.type == SystemExit + assert err.type is SystemExit # meaningful test testargs = ["-u", "https://httpbun.com/html"] args = parse_args(testargs) From 483f76911c481576f684b292d423e7a49dd9dc8d Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 1 Jun 2026 18:54:29 +0200 Subject: [PATCH 3/3] fix: update eval and docs, check code robustness --- .readthedocs.yaml | 8 +++++-- CHANGELOG.md | 6 +++++ README.md | 23 +++++++++--------- docs/conf.py | 2 +- docs/evaluation.rst | 23 ++++++++++++++++-- docs/index.rst | 26 +++++++-------------- docs/options.rst | 12 ++++------ docs/requirements.txt | 5 ++-- htmldate/__init__.py | 2 +- htmldate/cli.py | 4 ++-- htmldate/core.py | 26 ++++++++------------- htmldate/extractors.py | 21 ++++------------- htmldate/settings.py | 4 ++++ htmldate/utils.py | 18 +++++++-------- htmldate/validators.py | 10 ++++++-- tests/comparison.py | 37 ++++++++++------------------- tests/eval-requirements.txt | 15 +++++++----- tests/eval_default.json | 32 ++++++++++++------------- tests/eval_mediacloud_2020.json | 2 +- tests/evaluation.py | 41 ++++++++++++++++++++++----------- tests/unit_tests.py | 5 ++-- 21 files changed, 168 insertions(+), 154 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index f6a12ce2..d2386598 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,9 +6,9 @@ version: 2 # Set the OS, Python version and other tools you might need build: - os: ubuntu-22.04 + os: ubuntu-24.04 tools: - python: "3.11" + python: "3.13" # You can also specify other tool versions: # nodejs: "20" # rust: "1.70" @@ -33,3 +33,7 @@ sphinx: python: install: - requirements: docs/requirements.txt + # install the checked-out source so autodoc and the version reflect this + # branch/tag rather than the released package from PyPI + - method: pip + path: . diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c527d90..61662294 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ## Changelog +## 1.10.0 +- maintenance: modernize typing, packaging and code +- evaluation: review and correct benchmark ground-truth labels, update and speed up alternatives +- performance: stable day-granular cache key and reduced copying +- fixes: preserve tails in element cleaning + ## 1.9.4 - maintenance: remove LXML version constraint (#184) diff --git a/README.md b/README.md index 7e9ea895..3ed5a84a 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ $ htmldate -u http://blog.python.org/2016/12/python-360-is-now-available.html YMD](https://en.wikipedia.org/wiki/ISO_8601)). - Detection of both original and updated dates. - Multilingual. -- Compatible with all recent versions of Python. +- Compatible with Python 3.10 and later. ### How it works @@ -77,17 +77,17 @@ Finally, the output is validated and converted to the chosen format. ## Performance -1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10) +1000 web pages containing identifiable dates (as of 2026-06-01 on Python 3.13) | Python Package | Precision | Recall | Accuracy | F-Score | Time | | -------------- | --------- | ------ | -------- | ------- | ---- | -| articleDateExtractor 0.20 | 0.803 | 0.734 | 0.622 | 0.767 | 5x | -| date_guesser 2.1.4 | 0.781 | 0.600 | 0.514 | 0.679 | 18x | -| goose3 3.1.17 | 0.869 | 0.532 | 0.493 | 0.660 | 15x | -| htmldate\[all\] 1.6.0 (fast) | **0.883** | 0.924 | 0.823 | 0.903 | **1x** | -| htmldate\[all\] 1.6.0 (extensive) | 0.870 | **0.993** | **0.865** | **0.928** | 1.7x | -| newspaper3k 0.2.8 | 0.769 | 0.667 | 0.556 | 0.715 | 15x | -| news-please 1.5.35 | 0.801 | 0.768 | 0.645 | 0.784 | 34x | +| articleDateExtractor 0.20 | 0.846 | 0.745 | 0.656 | 0.792 | 3x | +| date_guesser 2.1.4 | 0.832 | 0.611 | 0.544 | 0.705 | 11x | +| goose3 3.1.21 | **0.930** | 0.568 | 0.545 | 0.706 | 14x | +| htmldate\[all\] 1.10.0 (fast) | 0.924 | 0.927 | 0.861 | 0.925 | **1x** | +| htmldate\[all\] 1.10.0 (extensive) | 0.908 | **0.993** | **0.903** | **0.949** | 1.8x | +| newspaper4k 0.9.5 | 0.912 | 0.728 | 0.680 | 0.810 | 2.5x | +| news-please 1.6.16 | 0.845 | 0.777 | 0.680 | 0.810 | 29x | For the complete results and explanations see [evaluation page](https://htmldate.readthedocs.io/en/latest/evaluation.html). @@ -95,13 +95,14 @@ page](https://htmldate.readthedocs.io/en/latest/evaluation.html). ## Installation Htmldate is tested on Linux, macOS and Windows systems, it is compatible -with Python 3.8 upwards. It can notably be installed with `pip` (`pip3` +with Python 3.10 upwards. It can notably be installed with `pip` (`pip3` where applicable) from the PyPI package repository: - `pip install htmldate` - (optionally) `pip install htmldate[speed]` -The last version to support Python 3.6 and 3.7 is `htmldate==1.8.1`. +The last version to support Python 3.6 and 3.7 is `htmldate==1.8.1`; for +Python 3.8 and 3.9 use the `1.9.x` series. ## Documentation diff --git a/docs/conf.py b/docs/conf.py index d4856e55..4089bcfb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,7 @@ # -- Project information ----------------------------------------------------- project = 'htmldate' -copyright = '2023, Adrien Barbaresi' +copyright = '2017-2026, Adrien Barbaresi' author = 'Adrien Barbaresi' # -- General configuration --------------------------------------------------- diff --git a/docs/evaluation.rst b/docs/evaluation.rst index f485d4ac..47d88bf2 100644 --- a/docs/evaluation.rst +++ b/docs/evaluation.rst @@ -18,7 +18,7 @@ There are comparable software solutions in Python, the following date extraction - `date_guesser `_ extracts publication dates from a web pages along with an accuracy measure (not used here), - `goose3 `_ can extract information for embedded content, - `htmldate `_ is the software package described here, it is designed to extract original and updated publication dates of web pages, -- `newspaper `_ is mostly geared towards newspaper texts, +- `newspaper4k `_ (the maintained successor of newspaper3k) is mostly geared towards newspaper texts, - `news-please `_ is a news crawler that extracts structured information. Two alternative packages are not tested here but could be used in addition: @@ -36,7 +36,7 @@ Description **Time**: the execution time cannot be easily compared in all cases as some solutions perform a whole series of operations which are irrelevant to this task. -**Errors:** *goose3*'s output isn't always meaningful and/or in a standardized format, these cases were discarded. *news-please* seems to have trouble with some encodings (e.g. in Chinese), in which case it leads to an exception. +**Errors:** *goose3*'s output isn't always meaningful and/or in a standardized format, these cases were discarded. Results @@ -45,6 +45,23 @@ Results The results below show that **date extraction is not a completely solved task** but one for which extractors have to resort to heuristics and guesses. The figures documenting recall and accuracy capture the real-world performance of the tools as the absence of a date output impacts the result. +================================ ========= ========= ========= ========= ======= +1000 web pages containing identifiable dates (as of 2026-06-01 on Python 3.13) +-------------------------------------------------------------------------------- +Python Package Precision Recall Accuracy F-Score Time +================================ ========= ========= ========= ========= ======= +articleDateExtractor 0.20 0.846 0.745 0.656 0.792 3x +date_guesser 2.1.4 0.832 0.611 0.544 0.705 11x +goose3 3.1.21 **0.930** 0.568 0.545 0.706 14x +htmldate[all] 1.10.0 (fast) 0.924 0.927 0.861 0.925 **1x** +htmldate[all] 1.10.0 (extensive) 0.908 **0.993** **0.903** **0.949** 1.8x +newspaper4k 0.9.5 0.912 0.728 0.680 0.810 2.5x +news-please 1.6.16 0.845 0.777 0.680 0.810 29x +================================ ========= ========= ========= ========= ======= + +This run uses a reviewed version of the ground-truth labels (publication-date corrections) and the maintained *newspaper4k* fork in place of the now-unmaintained *newspaper3k*. + + =============================== ========= ========= ========= ========= ======= 1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10) ------------------------------------------------------------------------------- @@ -62,6 +79,8 @@ news-please 1.5.35 0.801 0.768 0.645 0.784 34x Additional data for new pages in English collected by the `Data Culture Group `_ at Northeastern University. +The discussion below refers to the most recent run (top table), measured against a reviewed version of the publication-date labels. + Precision describes if the dates given as output are correct: *goose3* fares well precision-wise but it fails to extract dates in a large majority of cases (poor recall). The difference in accuracy between *date_guesser* and *newspaper* is consistent with tests described on the `website of the former `_. It turns out that *htmldate* performs better than the other solutions overall. It is also noticeably faster than the strictly comparable packages (*articleDateExtractor* and most certainly *date_guesser*). Despite being measured on a sample, **the higher accuracy and faster processing time are highly significant**. Especially for smaller news outlets, websites and blogs, as well as pages written in languages other than English (in this case mostly but not exclusively German), *htmldate* greatly extends date extraction coverage without sacrificing precision. diff --git a/docs/index.rst b/docs/index.rst index 16b85ead..ba9d2132 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -80,7 +80,7 @@ Features - URLs, HTML files, or HTML trees are given as input (includes batch processing) - Output as string in any date format (defaults to `ISO 8601 YMD `_) - Detection of both original and updated dates -- Compatible with all recent versions of Python +- Compatible with Python 3.10 and later ``htmldate`` can examine markup and text. It provides the following ways to date an HTML document: @@ -94,7 +94,7 @@ Features The output is thoroughly verified in terms of plausibility and adequateness. If a valid date has been found the library outputs a date string corresponding to either the last update or the original publishing statement (the default), in the desired format. -Markup-based extraction is multilingual by nature, text-based refinements for better coverage currently support German, English and Turkish. +Markup-based extraction is multilingual by nature, text-based refinements for better coverage currently support English, French, German, Indonesian and Turkish. Installation @@ -103,16 +103,16 @@ Installation Main package ~~~~~~~~~~~~ -This Python package is tested on Linux, macOS and Windows systems; it is compatible with Python 3.8 upwards. It is available on the package repository `PyPI `_ and can notably be installed with ``pip`` or ``pipenv``: +This Python package is tested on Linux, macOS and Windows systems; it is compatible with Python 3.10 upwards. It is available on the package repository `PyPI `_ and can notably be installed with ``pip`` or ``pipenv``: .. code-block:: bash - $ pip install htmldate # pip3 install on systems where both Python 2 and 3 are installed + $ pip install htmldate $ pip install --upgrade htmldate # to make sure you have the latest version $ pip install git+https://github.com/adbar/htmldate.git # latest available code (see build status above) -The last version to support Python 3.6 and 3.7 is ``htmldate==1.8.1``. +The last version to support Python 3.6 and 3.7 is ``htmldate==1.8.1``; for Python 3.8 and 3.9 use the ``1.9.x`` series. Optional @@ -131,16 +131,6 @@ The ``dateparser`` package is noticeably slower in its latest versions, version *For infos on dependency management of Python packages see* `this discussion thread `_. -Experimental -~~~~~~~~~~~~ - -Experimental compilation with ``mypyc``, as using pre-compiled library may shorten processing speed: - -1. Install ``mypy``: ``pip3 install mypy`` -2. Compile the package: ``python setup.py --use-mypyc bdist_wheel`` -3. Use the newly created wheel: ``pip3 install dist/...`` - - With Python ----------- @@ -162,7 +152,7 @@ In case the web page features easily readable metadata in the header, the extrac .. code-block:: python >>> find_date('https://creativecommons.org/about/') - '2017-08-11' # has been updated since + '2017-08-11' # may change >>> find_date('https://creativecommons.org/about/', extensive_search=False) >>> @@ -189,7 +179,7 @@ Change the output to a format known to Python's ``datetime`` module, the default .. code-block:: python >>> find_date('https://www.gnu.org/licenses/gpl-3.0.en.html', outputformat='%d %B %Y') - '18 November 2016' # may have changed since + '18 November 2016' # may change Original vs. updated dates @@ -200,7 +190,7 @@ Although the time delta between original publication and "last modified" info is .. code-block:: python >>> find_date('https://netzpolitik.org/2016/die-cider-connection-abmahnungen-gegen-nutzer-von-creative-commons-bildern/', original_date=True) # modified behavior - '2016-06-23' + '2016-06-23' # may change For more information see `options page `_. diff --git a/docs/options.rst b/docs/options.rst index 71c90f46..a2754545 100644 --- a/docs/options.rst +++ b/docs/options.rst @@ -27,15 +27,15 @@ An external module can be used for download, as described in versions anterior t >>> import requests >>> r = requests.get('https://creativecommons.org/about/') >>> find_date(r.text) - '2017-11-28' # may have changed since + '2017-11-28' # may change # using htmldate's own fetch_url function >>> from htmldate.utils import fetch_url >>> htmldoc = fetch_url('https://blog.wikimedia.org/2018/06/28/interactive-maps-now-in-your-language/') >>> find_date(htmldoc) - '2018-06-28' + '2018-06-28' # may change # or simply >>> find_date('https://blog.wikimedia.org/2018/06/28/interactive-maps-now-in-your-language/') # URL detected - '2018-06-28' + '2018-06-28' # may change Date format @@ -46,7 +46,7 @@ Change the output to a format known to Python's ``datetime`` module, the default .. code-block:: python >>> find_date('https://www.gnu.org/licenses/gpl-3.0.en.html', outputformat='%d %B %Y') - '18 November 2016' # may have changed since + '18 November 2016' # may change >>> find_date('http://blog.python.org/2016/12/python-360-is-now-available.html', outputformat='%Y-%m-%dT%H:%M:%S%z') '2016-12-23T05:11:00-0500' @@ -62,7 +62,7 @@ Although the time delta between the original publication and the "last modified" .. code-block:: python >>> find_date('https://netzpolitik.org/2016/die-cider-connection-abmahnungen-gegen-nutzer-von-creative-commons-bildern/') # default setting - '2019-06-24' + '2019-06-24' # may change >>> find_date('https://netzpolitik.org/2016/die-cider-connection-abmahnungen-gegen-nutzer-von-creative-commons-bildern/', original_date=True) # modified behavior '2016-06-23' @@ -77,8 +77,6 @@ See ``settings.py`` file: :show-inheritance: :undoc-members: -The module can then be re-compiled locally to apply changes to the settings. - Clearing caches ~~~~~~~~~~~~~~~ diff --git a/docs/requirements.txt b/docs/requirements.txt index 8d0cbee9..b02618bd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,3 @@ # version required -sphinx>=8.1.3 -# without version specifier -htmldate +sphinx>=9.1.0 +# htmldate itself is installed from the repo root (see .readthedocs.yaml) diff --git a/htmldate/__init__.py b/htmldate/__init__.py index 19678528..c8b0023c 100644 --- a/htmldate/__init__.py +++ b/htmldate/__init__.py @@ -7,7 +7,7 @@ __author__ = "Adrien Barbaresi" __license__ = "Apache-2.0" __copyright__ = "Copyright 2017-present, Adrien Barbaresi" -__version__ = "1.9.4" +__version__ = "1.10.0" import logging diff --git a/htmldate/cli.py b/htmldate/cli.py index abba2e13..ac58f6c3 100644 --- a/htmldate/cli.py +++ b/htmldate/cli.py @@ -81,13 +81,13 @@ def process_args(args: argparse.Namespace) -> None: if args.URL: htmlstring = fetch_url(args.URL) if htmlstring is None: - sys.exit(f"No data for URL: {args.URL}" + "\n") + sys.exit(f"No data for URL: {args.URL}\n") # unicode check else: try: htmlstring = sys.stdin.read() except UnicodeDecodeError as err: - sys.exit(f"Wrong buffer encoding: {str(err)}" + "\n") + sys.exit(f"Wrong buffer encoding: {err}\n") result = cli_examine(htmlstring, args) if result is not None: sys.stdout.write(result + "\n") diff --git a/htmldate/core.py b/htmldate/core.py index f40489d6..e7baf346 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -28,8 +28,6 @@ FAST_PREPEND, SLOW_PREPEND, FREE_TEXT_EXPRESSIONS, - MAX_SEGMENT_LEN, - MIN_SEGMENT_LEN, YEAR_PATTERN, YMD_PATTERN, COPYRIGHT_PATTERN, @@ -54,11 +52,18 @@ THREE_COMP_REGEX_B, TWO_COMP_REGEX, ) -from .settings import CACHE_SIZE, CLEANING_LIST, MAX_POSSIBLE_CANDIDATES +from .settings import ( + CACHE_SIZE, + CLEANING_LIST, + MAX_POSSIBLE_CANDIDATES, + MAX_SEGMENT_LEN, + MIN_SEGMENT_LEN, +) from .utils import Extractor, clean_html, load_html, trim_text from .validators import ( check_extracted_reference, compare_values, + correct_year, filter_ymd_candidate, get_min_date, get_max_date, @@ -563,7 +568,7 @@ def normalize_match(match: re.Match[str] | None) -> str: and optionally expand the year from two to four digits.""" day, month, year = (g.zfill(2) for g in match.groups() if g) # type: ignore[union-attr] if len(year) == 2: - year = f"19{year}" if year[0] == "9" else f"20{year}" + year = str(correct_year(int(year))) return f"{year}-{month}-{day}" @@ -852,8 +857,6 @@ def find_date( original_date, outputformat, ) - # unclear what this line is for and it impedes type checking: - # find_date.extensive_search = extensive_search # URL if url is None: @@ -891,9 +894,7 @@ def find_date( # costly deepcopy of the whole document pruning_tree = deepcopy(tree) if isinstance(htmlobject, HtmlElement) else tree try: - search_tree, discarded = discard_unwanted( - clean_html(pruning_tree, CLEANING_LIST) - ) + search_tree = discard_unwanted(clean_html(pruning_tree, CLEANING_LIST)) # rare LXML error: no NULL bytes or control characters except ValueError: # pragma: no cover search_tree = tree @@ -923,13 +924,6 @@ def find_date( if result is not None: return result - # TODO: decide on this - # search in discarded parts (e.g. archive.org-banner) - # for subtree in discarded: - # dateresult = examine_date_elements(subtree, DATE_EXPRESSIONS, options) - # if dateresult is not None: - # return dateresult - # robust conversion to string try: htmlstring = tostring(search_tree, pretty_print=False, encoding="unicode") diff --git a/htmldate/extractors.py b/htmldate/extractors.py index 01ad9025..9e5efd87 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -18,9 +18,9 @@ from lxml.html import HtmlElement # own -from .settings import CACHE_SIZE +from .settings import CACHE_SIZE, MAX_SEGMENT_LEN from .utils import Extractor, trim_text -from .validators import convert_date, is_valid_date, validate_and_convert +from .validators import convert_date, correct_year, is_valid_date, validate_and_convert LOGGER = logging.getLogger(__name__) @@ -80,8 +80,6 @@ # or contains(@id, 'lastmod') or contains(@class, 'updated') FREE_TEXT_EXPRESSIONS = XPath(FAST_PREPEND + "/text()") -MIN_SEGMENT_LEN = 6 -MAX_SEGMENT_LEN = 52 # discard parts of the webpage # archive.org banner inserts @@ -209,13 +207,11 @@ SIMPLE_PATTERN = re.compile(rf"(? tuple[HtmlElement, list[HtmlElement]]: - """Delete unwanted sections of an HTML document and return them as a list""" - my_discarded = [] +def discard_unwanted(tree: HtmlElement) -> HtmlElement: + """Delete unwanted sections of an HTML document.""" for subtree in DISCARD_EXPRESSIONS(tree): - my_discarded.append(subtree) subtree.getparent().remove(subtree) - return tree, my_discarded + return tree def extract_url_date( @@ -237,13 +233,6 @@ def extract_url_date( return None -def correct_year(year: int) -> int: - """Adapt year from YY to YYYY format""" - if year < 100: - year += 1900 if year >= 90 else 2000 - return year - - def try_swap_values(day: int, month: int) -> tuple[int, int]: """Swap day and month values if it seems feasible.""" return (month, day) if month > 12 and day <= 12 else (day, month) diff --git a/htmldate/settings.py b/htmldate/settings.py index 2f1aa3d4..5e8fc1ab 100644 --- a/htmldate/settings.py +++ b/htmldate/settings.py @@ -18,6 +18,10 @@ # set an upper limit to the number of candidates MAX_POSSIBLE_CANDIDATES: int = 1000 +# Text segment length bounds (in characters) for date extraction +MIN_SEGMENT_LEN: int = 6 +MAX_SEGMENT_LEN: int = 52 + CLEANING_LIST = [ "applet", "audio", diff --git a/htmldate/utils.py b/htmldate/utils.py index 90382d53..10855f8e 100644 --- a/htmldate/utils.py +++ b/htmldate/utils.py @@ -8,6 +8,7 @@ from dataclasses import dataclass from datetime import datetime +from typing import Any import urllib3 @@ -109,15 +110,14 @@ def decode_file(filecontent: bytes | str) -> str: return htmltext or str(filecontent, encoding="utf-8", errors="replace") -def decode_response(response: urllib3.response.HTTPResponse | bytes) -> str: - """Read the urllib3 object corresponding to the server response, then - try to guess its encoding and decode it to return a unicode string""" - # urllib3 response object / bytes switch - if isinstance(response, urllib3.response.HTTPResponse): - resp_content = response.data - else: - resp_content = response - return decode_file(resp_content) +def decode_response(response: Any) -> str: + """Read the data from a response object exposing the body via ``.data`` + (e.g. urllib3 or a compatible response) or from a bytestring, then guess + its encoding and decode it to return a unicode string.""" + # accept any response-like object exposing the body via .data, or raw bytes; + # .data may be None, so guard before decoding + data = response.data if hasattr(response, "data") else response + return decode_file(data) if data else "" def fetch_url(url: str) -> str | None: diff --git a/htmldate/validators.py b/htmldate/validators.py index d8c5462d..a04062d2 100644 --- a/htmldate/validators.py +++ b/htmldate/validators.py @@ -90,6 +90,13 @@ def is_valid_format(outputformat: str) -> bool: return True +def correct_year(year: int) -> int: + """Adapt year from YY to YYYY format""" + if year < 100: + year += 1900 if year >= 90 else 2000 + return year + + def plausible_year_filter( htmlstring: str, *, @@ -114,8 +121,7 @@ def plausible_year_filter( if not incomplete: potential_year = int(lastdigits) else: - century = "19" if lastdigits[0] == "9" else "20" - potential_year = int(century + lastdigits) + potential_year = correct_year(int(lastdigits)) if not min_year <= potential_year <= max_year: LOGGER.debug("no potential year: %s", item) diff --git a/tests/comparison.py b/tests/comparison.py index a3de094c..69460a6f 100644 --- a/tests/comparison.py +++ b/tests/comparison.py @@ -4,8 +4,6 @@ import argparse import contextlib -import json -import os import sys import time @@ -15,6 +13,7 @@ from evaluation import ( + EVAL_PAGES, evaluate_result, load_document, run_htmldate_extensive, @@ -27,17 +26,6 @@ ) -TEST_DIR = os.path.abspath(os.path.dirname(__file__)) -# list the jsons containing the pages here -eval_paths = ["eval_mediacloud_2020.json", "eval_default.json"] -# load the pages here -EVAL_PAGES = {} -for each in eval_paths: - evalpath = os.path.join(TEST_DIR, each) - with open(evalpath, "r", encoding="utf-8") as f: - EVAL_PAGES.update(json.load(f)) - - PARSER = argparse.ArgumentParser(description="Run the evaluation") PARSER.add_argument( "--small", @@ -63,20 +51,19 @@ FUNC_DICT = { "htmldate_extensive": run_htmldate_extensive, "htmldate_fast": run_htmldate_fast, - **{ - key: func - for key, func in [ - ("newspaper", run_newspaper), - ("newsplease", run_newsplease), - ("articledateextractor", run_articledateextractor), - ("date_guesser", run_dateguesser), - ("goose", run_goose), - ] - if not ARGS.small - }, } +if not ARGS.small: + FUNC_DICT.update( + { + "newspaper": run_newspaper, + "newsplease": run_newsplease, + "articledateextractor": run_articledateextractor, + "date_guesser": run_dateguesser, + "goose": run_goose, + } + ) -RESULTS_DICT = {key: TEMPLATE_DICT.copy() for key, value in FUNC_DICT.items()} +RESULTS_DICT = {key: TEMPLATE_DICT.copy() for key in FUNC_DICT} def calculate_scores(name, mydict): diff --git a/tests/eval-requirements.txt b/tests/eval-requirements.txt index cc552d17..00492a8d 100644 --- a/tests/eval-requirements.txt +++ b/tests/eval-requirements.txt @@ -1,13 +1,16 @@ # package -htmldate>=1.9.2 +htmldate>=1.10.0 # alternatives articleDateExtractor==0.20 date_guesser==2.1.4 -goose3==3.1.19 -newspaper3k==0.2.8 -news-please==1.6.13 +goose3==3.1.21 +# newspaper4k succeeds the unmaintained newspaper3k. Extras = tokenizers the +# corpus needs (via news-please): nltk, tinysegmenter (ja), jieba (zh). +# Also requires NLTK data: python -m nltk.downloader punkt_tab stopwords +newspaper4k[nlp,ja,zh]==0.9.5 +news-please==1.6.16 # helpers -tabulate==0.9.0 -tqdm==4.67.0 +tabulate==0.10.0 +tqdm==4.67.3 diff --git a/tests/eval_default.json b/tests/eval_default.json index ef77e503..b4600526 100644 --- a/tests/eval_default.json +++ b/tests/eval_default.json @@ -525,11 +525,11 @@ }, "https://www.uusisuomi.fi/uutiset/sanna-marin-tapasi-angela-merkelin-myos-saksa-haluaa-pitaa-kiinni-maataloustuista-meidan-nakemyksiamme-suurimpana-nettomaksajana-ei-ole-otettu-riittavasti-huomioon/b29c11d3-9590-4045-8e2c-a568f9f24617": { "file": "uusisuomi.fi.angela.html", - "date": "2019-02-19" + "date": "2020-02-19" }, "https://yle.fi/uutiset/3-11212601": { "file": "yle.fi.3-11212601.html", - "date": "2019-02-19" + "date": "2020-02-19" }, "https://www.tofugu.com/travel/dezuka-suisan/": { "file": "tofugu.com.dezuka-suisan.html", @@ -713,7 +713,7 @@ }, "https://zahlenzauberin.wordpress.com/2012/08/22/was-zum-horen-in-den-ferien/": { "file": "zahlenzauberin.wordpress.com.ferien.html", - "date": "2010-08-22" + "date": "2012-08-22" }, "https://www.deutschlandfunk.de/die-zukunft-der-arbeit-wir-dekorieren-auf-der-titanic-die.911.de.html?dram:article_id=385022": { "file": "deutschlandfunk.de.titanic.html", @@ -877,7 +877,7 @@ }, "https://www.theplanetarypress.com/2020/01/management-of-intact-forestlands-by-indigenous-peoples-key-to-protecting-climate/": { "file": "theplanetarypress.com.forestlands.html", - "date": "2020-01-19" + "date": "2020-01-17" }, "https://wikimediafoundation.org/news/2020/01/15/access-to-wikipedia-restored-in-turkey-after-more-than-two-and-a-half-years/": { "file": "wikimediafoundation.org.turkey.html", @@ -937,7 +937,7 @@ }, "https://www.tomshardware.com/uk/news/where-and-how-to-buy-rtx-3080-3090-3070": { "file": "tomshardware.com.rtx.html", - "date": "2020-11-02" + "date": "2020-11-04" }, "https://stardewvalleywiki.com/Penny": { "file": "stardewvalleywiki.com.penny.html", @@ -985,7 +985,7 @@ }, "https://diem25.org/the-eus-green-deal-isnt-enough-save-from-climate-catastrophe/": { "file": "diem25.org.climate.html", - "date": "2020-12-12" + "date": "2020-10-12" }, "https://www.economist.com/open-future/2018/06/18/why-collaborative-thinking-beats-individual-smarts": { "file": "economist.com.thinking.html", @@ -1041,7 +1041,7 @@ }, "https://mywakenews.wordpress.com/2016/07/09/nwo-psyop-unitedwestrike-radio-marathon/": { "file": "mywakenews.wordpress.com.psyop.html", - "date": "2016-06-09" + "date": "2016-07-09" }, "https://web.archive.org/web/20130307194448/the-pain.net/2008/05/silkroad-roc-mountain-quests-und-npcs.html": { "file": "archive.org.the-pain.net.silkroad.html", @@ -1373,7 +1373,7 @@ }, "https://berkutschi.com/de/front/news/10759-marius-lindvik-gewinnt-in-willingen": { "file": "berkutschi.com-willingen.html", - "date": "2022-01-31" + "date": "2022-01-30" }, "https://www.berliner-feuerwehr.de/aktuelles/nachrichten/feuerwehr-und-katastrophenschutz-ehrenzeichen-verliehen-3896/": { "file": "berliner-feuerwehr.de-Ehrenzeichen.html", @@ -1661,7 +1661,7 @@ }, "https://www.ekiba.de/detail/nachricht-seite/id/35204-trauern-digital-am-ewigkeitssonntag/?default=true": { "file": "ekiba.de-trauer.html", - "date": "2021-12-12" + "date": "2021-11-12" }, "https://emacspeak.blogspot.com/2019/10/meta-programming-in-emacs-using.html": { "file": "emacspeak.blogspot.com.meta.html", @@ -1825,7 +1825,7 @@ }, "https://www.handwerksblatt.de/themen-specials/coronaschutz-im-betrieb/2g-3g-was-gilt-beim-friseurbesuch": { "file": "handwerksblatt.de-Friseurbesuch.html", - "date": "2022-01-01" + "date": "2022-01-14" }, "https://www.haus.de/bauen/vorsatzschalung-33656": { "file": "haus.de-Vorsatzschallung.html", @@ -2121,7 +2121,7 @@ }, "https://redtri.com/best-jokes-for-kids/slide/1": { "file": "redtri.com.jokes.html", - "date": "2020-11-03" + "date": "2021-09-19" }, "https://www.refinery29.com/de-de/vreni-frost-instagram-werbung-abmahnung": { "file": "refiner29.com-Verni.html", @@ -2169,7 +2169,7 @@ }, "https://www.selbst.de/wurmkiste-39572.html": { "file": "selbst.de-wurmkiste.html", - "date": "2022-01-22" + "date": "2021-02-22" }, "https://www.siegessaeule.de/magazin/p%C3%A4dophilie-als-politisches-machtinstrument/": { "file": "siegessaeule.de-Machtinstrument.html", @@ -2237,7 +2237,7 @@ }, "https://www.tennismagazin.de/news/zverev-zieht-ins-viertelfinale-von-montpellier-ein/": { "file": "tennismagazin.de-viertelfinale.html", - "date": "2022-02-04" + "date": "2022-02-03" }, "https://www.tennisnet.com/news/diese-ymers-zwei-ueberraschungen-an-einem-tag": { "file": "tennisnet.com-ueberraschungen.html", @@ -2265,7 +2265,7 @@ }, "https://www.tierwelt.ch/news/natur-umwelt/immer-mehr-modemarken-werden-pelzfrei-so-erkennen-sie-echtpelz-im-laden": { "file": "tierwelt.ch-plez.html", - "date": "2022-02-02" + "date": "2022-02-01" }, "https://www.tonight.de/unterhaltung/promis/daniela-buechner-danni-und-ennesto-monte-trennen-sich-arschloch_114240.html": { "file": "tonight.de-Arschloch.html", @@ -2325,7 +2325,7 @@ }, "https://www.wochenblatt.com/landwirtschaft/agrarpolitik/heinen-esser-offen-fuer-existenzgruendungspraemie-12810183.html": { "file": "wochenblatt.com-Heinen-Essen.html", - "date": "2022-02-21" + "date": "2022-01-21" }, "https://www.wolfgangmichal.de/2017/06/07/publizistische-sorgfaltspflicht-statt-netzwerkdurchsetzungsgesetz/": { "file": "wolfgangmichal.de.sorgfaltspflicht.html", @@ -3045,7 +3045,7 @@ }, "https://popkultur.de/homosexuelle-schauspieler/": { "file": "Popkultur.de-Schauspieler.html", - "date": "2023-05-06" + "date": "2023-06-05" }, "https://presse-augsburg.de/augsburger-verkehrs-und-tarifverbund-avv-erhoeht-die-oepnv-preise-deutlich/909665/": { "file": "presse-ausburg.de-Tarifverbund.html", diff --git a/tests/eval_mediacloud_2020.json b/tests/eval_mediacloud_2020.json index 1b84a29a..94166423 100644 --- a/tests/eval_mediacloud_2020.json +++ b/tests/eval_mediacloud_2020.json @@ -1 +1 @@ -{"https://zaxid.net/news/showNews.do?nastupnogo_tizhnya_na_ukrayinu_chekayut_anomalna_speka_ta_grozi&objectId=1503302": {"file": "1628285861.html", "date": "2020-06-07"}, "https://24tv.ua/yak-venediktova-zahishhala-nardepa-vid-slugi-narodu_n1419177": {"file": "1716869064.html", "date": "2020-09-21"}, "https://www.mynet.com/samsunda-sobadan-zehirlenen-2-cocuk-hastanelik-oldu-110106661445": {"file": "1783862633.html", "date": "2020-11-30"}, "http://www.detaykibris.com/yikilan-binalarkurtarma-calismalari-izmirden-goruntuler-2196g.htm": {"file": "1754856212.html", "date": "2020-10-30"}, "https://www.hatawtabloid.com/2020/02/18/aktres-sunod-sunuran-sa-aktor-bf/": {"file": "1523761669.html", "date": "2020-02-18"}, "http://auto-door16814.ttblogs.com/2028064/%E0%B8%9C-%E0%B8%9C%E0%B8%A5-%E0%B8%95-%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%88%E0%B8%B3%E0%B8%AB%E0%B8%99-%E0%B8%B2%E0%B8%A2%E0%B9%82%E0%B8%8B-%E0%B8%AD-%E0%B8%95%E0%B8%AA%E0%B8%B2%E0%B8%AB%E0%B8%81%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%97-%E0%B8%81%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B9%80%E0%B8%A0%E0%B8%97-%E0%B8%97-%E0%B8%81%E0%B8%8A%E0%B8%99-%E0%B8%94": {"file": "1799801548.html", "date": NaN}, "https://www.hbl.fi/artikel/bbc-premier-league-omstart-17-juli/": {"file": "1619092544.html", "date": "2020-05-28"}, "https://avaz.ba/kantoni/republika-srpska/604722/generalni-direktor-eprs-pozitivan-na-koronavirus": {"file": "1750355638.html", "date": "2020-10-26"}, "https://www.koha.net/kronike-e-zeze/233165/arrestohen-dy-persona-ne-peje-per-organizim-te-lojerave-te-fatit/": {"file": "1682093320.html", "date": "2020-08-13"}, "http://www.standard.al/2020/04/15/dite-zie-shenohen-1-mije-e-438-viktima-nga-covid-19-ne-24-oret-e-fundit-ne-france/": {"file": "1579090372.html", "date": "2020-04-15"}, "https://oren.mk.ru/social/2020/02/18/orenburgskim-chinovnikam-kupyat-eshhe-pyat-avtomobiley.html": {"file": "1523821409.html", "date": "2020-02-18"}, "https://www.inform.kz/ru/polnost-yu-obespechit-region-myasom-pticy-namereny-v-sko_a3725092": {"file": "1784458132.html", "date": "2020-12-01"}, "https://www.ukrinform.ru/rubric-kyiv/3103218-policia-napravila-delo-minera-stolicnogo-metro-v-sud.html": {"file": "1716561772.html", "date": NaN}, "https://rzn.mk.ru/social/2020/01/13/v-ryazani-nagradili-luchshikh-zhurnalistov.html": {"file": "1493663910.html", "date": "2020-01-13"}, "https://www.mk.ru/politics/2020/09/23/prichinoy-taynoy-inauguracii-lukashenko-stal-styd.html": {"file": "1719330438.html", "date": "2020-09-23"}, "https://tass.ru/moskva/8836033": {"file": "1647262025.html", "date": "2020-06-28"}, "https://www.mk.ru/social/2020/12/29/prisyazhnye-opravdali-muzhchinu-kotoryy-obvinyalsya-v-ubiystve-aktivista-v-serpukhove.html": {"file": "1810559448.html", "date": "2020-12-29"}, "https://aif.ru/politics/world/otrezat_golovu_vo_imya_proroka_terakt_vo_francii_obnazhil_starye_problemy": {"file": "1742998174.html", "date": "2020-10-19"}, "https://www.unn.com.ua/ru/news/1906059-parlament-moldovi-z-podachi-dodona-obmezhiv-povnovazhennya-sandu-sche-do-yiyi-inavguratsiyi": {"file": "1786902045.html", "date": "2020-12-03"}, "https://news.yam.md/ro/story/10939686": {"file": "1716324024.html", "date": NaN}, "https://www.noticiaexata.com.br/artigo/brasileiro-goias-supera-palmeiras-com-gol-no-apagar-das-luzes": {"file": "1777110076.html", "date": "2020-11-22"}, "http://feedproxy.google.com/~r/PublicoRSS/~3/iyen1-bXCVc/covid19-trump-ja-nao-corre-risco-infectar-terceiros-medico-casa-branca-1934765": {"file": "1735359864.html", "date": "2020-10-11"}, "https://economia.ig.com.br/2020-11-26/agronegocio-quer-salvar-relacao-com-a-china-apos-acusacoes-de-eduardo-bolsonaro.html": {"file": "1781093249.html", "date": "2020-11-26"}, "https://noticias.uol.com.br/ultimas-noticias/reuters/2020/03/14/sancoes-dos-eua-dificultam-severamente-luta-do-ira-contra-coronavirus-diz-rouhani.htm": {"file": "1548435131.html", "date": "2020-03-14"}, "https://opapel.com/quintal-dentro-do-ape-tres-dicas-para-integrar-a-varanda-a-sala-de-estar/": {"file": "1756260927.html", "date": "2020-11-01"}, "https://www.uol.com.br/esporte/futebol/ultimas-noticias/2020/05/21/witzel-ignora-decreto-e-diz-que-volta-e-de-responsabilidade-dos-clubes.htm": {"file": "1613345316.html", "date": "2020-05-21"}, "https://www.otempo.com.br/diversao/benjamin-moser-fala-sobre-a-autora-que-o-conquistou-aos-19-anos-1.2423262": {"file": "1793766917.html", "date": "2020-12-10"}, "https://revistaforum.com.br/brasil/tuca-almeida-do-the-voice-kids-morre-baleado/": {"file": "1594316187.html", "date": "2020-05-01"}, "https://g1.globo.com/ro/rondonia/noticia/2020/03/08/estudante-faz-vaquinha-virtual-para-tratamento-de-pitbull-abandonado-com-deficiencia.ghtml": {"file": "1542250510.html", "date": "2020-03-08"}, "https://www.uol.com.br/carros/videos/2020/06/23/comprador-capota-vw-polo-retirando-carro-da-concessionaria-assista.htm": {"file": "1643070775.html", "date": "2020-06-23"}, "https://trojmiasto.wyborcza.pl/trojmiasto/7,35612,25651465,jest-nowy-prezes-grupy-lotos-ma-doswiadczenie-na-kierowniczych.html": {"file": "1511056469.html", "date": "2020-01-31"}, "https://tvn24.pl/swiat/koronawirus-w-chile-po-zniesieniu-obostrzen-tlumy-ludzi-w-sklepach-i-dlugie-kolejki-4668753?source=rss": {"file": "1686918031.html", "date": "2020-08-19"}, "http://alarmeringen.nl/zuid-holland/haaglanden/den-haag/34192126/p2000-ambulance-met-spoed-naar-hoge-zand-in-den-haag.html?utm_source=rss&utm_medium=nederland&utm_campaign=sharing": {"file": "1733850206.html", "date": "2020-10-09"}, "https://news.google.com/__i/rss/rd/articles/CBMibWh0dHBzOi8vd3d3Lm51Lm5sL2Zvcm11bGUtMS82MDgwMDIxL3ZlcnN0YXBwZW4tbm90ZWVydC16ZXNkZS10aWpkLWluLWRlcmRlLXRyYWluaW5nLXJ1c3Npc2NoZS1ncmFuZC1wcml4Lmh0bWzSAWxodHRwczovL3d3dy5udS5ubC9mb3JtdWxlLTEvNjA4MDAyMS92ZXJzdGFwcGVuLW5vdGVlcnQtemVzZGUtdGlqZC1pbi1kZXJkZS10cmFpbmluZy1ydXNzaXNjaGUtZ3JhbmQtcHJpeC5hbXA?oc=5": {"file": "1721884119.html", "date": "2020-06-14"}, "https://www.lrt.lt/naujienos/verslas/4/1261660/finansu-ministerija-vidaus-rinkoje-pasiskolino-30-mln-euru": {"file": "1750014643.html", "date": "2020-10-26"}, "https://www.noz.de/lokales/westerkappeln/artikel/1994799/kindergottesdienst-in-velpe-macht-teilnehmern-und-betreuerinnen-spass": {"file": "1518961308.html", "date": "2020-02-11"}, "http://m.daejonilbo.com/mnews.asp?pk_no=1412361": {"file": "1538026996.html", "date": "2020-03-04"}, "https://news.chosun.com/site/data/html_dir/2020/08/27/2020082704301.html": {"file": "1694328145.html", "date": "2020-08-27"}, "https://www.youtube.com/watch?v=Lfw9H63g0OQ": {"file": "1577247412.html", "date": "2020-04-13"}, "https://news.biglobe.ne.jp/entertainment/0215/ori_200215_7568645051.html": {"file": "1521513101.html", "date": NaN}, "https://prtimes.jp/main/html/rd/p/000000232.000009812.html": {"file": "1662376572.html", "date": "2020-07-15"}, "https://prtimes.jp/main/html/rd/p/000000088.000029713.html": {"file": "1649432346.html", "date": "2020-07-01"}, "http://oshiete.goo.ne.jp/qa/11763760.html": {"file": "1660114516.html", "date": "2020-07-13"}, "https://blog.goo.ne.jp/umaichi_news/e/52743e20f825567d4e9889be58ec06b9": {"file": "1660064054.html", "date": "2020-07-12"}, "https://blog.goo.ne.jp/jgccg115/e/5d8c31a659b95cc18a43ad75d152e80f": {"file": "1651753632.html", "date": "2020-07-03"}, "https://www.israelhayom.co.il/article/791661": {"file": "1685121845.html", "date": "2020-08-17"}, "https://www.lagazzettadelmezzogiorno.it/news/mondo/1265473/california-certifica-voto-biden-oltre-quorum-270-elettori.html": {"file": "1789111868.html", "date": "2020-12-06"}, "https://www.edilportale.com/news/2020/09/informatica/quando-la-stampante-rende-piu-smart-il-lavoro-del-progettista_78527_10.html": {"file": "1720037955.html", "date": "2020-09-24"}, "https://www.ilmattino.it/primopiano/sanita/isolamento_gli_urologi_uomini_la_pigrizia_danneggia_la_prostata_in_casa_allenatevi_cosi-5180477.html": {"file": "1582946828.html", "date": "2020-04-19"}, "http://www.ansa.it/sito/notizie/sport/calcio/2020/07/28/ghersini-dirige-cagliari-juve-massimi-lazio-brescia_c708a8fb-c2d2-4a9a-b0f3-05b8cc98389d.html": {"file": "1671960661.html", "date": "2020-07-28"}, "https://www.tribunnews.com/pendidikan/2020/05/04/jawaban-soal-apa-dampak-negatif-jika-menunda-pekerjaan-belajar-dari-rumah-sma-di-tvri": {"file": "1595907782.html", "date": "2020-05-04"}, "https://a1plus.am/hy/article/378866": {"file": "1711609490.html", "date": "2020-09-15"}, "https://news.am/arm/news/615163.html": {"file": "1778318045.html", "date": "2020-11-24"}, "https://www.sonline.hu/orszag-vilag/sokan-visszaallitanak-a-tortenelmi-magyarorszagot-2866608/": {"file": "1625266707.html", "date": "2020-06-04"}, "https://www.baon.hu/eletstilus/gyogyulas-utan-is-kiserheti-kronikus-faradtsag-es-poszttraumas-stressz-a-koronavirust-2928554/": {"file": "1686705395.html", "date": "2020-08-18"}, "https://hindi.business-standard.com//storypage.php?autono=172475": {"file": "1725202846.html", "date": "2020-09-29"}, "https://www.amarujala.com/uttar-pradesh/varanasi/gahu-city-news-vns5205298178?utm_source=rssfeed&utm_medium=Referral&utm_campaign=rssfeed": {"file": "1579139187.html", "date": "2020-04-16"}, "https://hindi.business-standard.com//storypage.php?autono=166607": {"file": "1521292139.html", "date": "2020-02-14"}, "https://www.jagran.com/uttar-pradesh/allahabad-city-21109565.html": {"file": "1782160236.html", "date": "2020-11-28"}, "https://www.divyabhaskar.co.in/local/gujarat/vadodara/news/chhetu-patel-a-resident-of-the-united-states-died-due-to-corona-wife-under-treatment-127088788.html": {"file": "1565998355.html", "date": NaN}, "https://www.divyabhaskar.co.in/local/gujarat/rajkot/news/people-who-are-scared-of-corona-call-and-say-i-see-corona-in-my-hand-and-foot-127064846.html": {"file": "1561643340.html", "date": NaN}, "https://www.lexpress.fr/actualites/1/actualite/angleterre-manchester-united-rate-la-marche-arsenal-revit_2141546.html": {"file": "1808011241.html", "date": "2020-12-26"}, "http://ici.radio-canada.ca/nouvelle/1728875/transport-scolaire-ottawa-mi-septembre-covid": {"file": "1691900941.html", "date": "2020-08-24"}, "https://www.vosgesmatin.fr/edition-la-plaine/2020/03/23/incendie-dans-une-maison-cinq-personnes-relogees": {"file": "1556056486.html", "date": "2020-03-23"}, "https://www.guineenews.org/colonel-barry-accuse-pour-vol-aggrave-vers-la-projection-de-la-video-de-toute-la-verite/": {"file": "1759902081.html", "date": "2020-11-04"}, "https://www.sudinfo.be/id300074/article/2020-12-24/il-brise-le-couvre-feu-et-est-surpris-au-volant-23h30-mellet-je-me-fiche-pas-mal": {"file": "1806102460.html", "date": "2020-12-24"}, "https://actu.fr/societe/coronavirus/solidarite-centre-hospitalier-cote-basque-lance-appel-dons-entreprises-particuliers_32590953.html": {"file": "1560424903.html", "date": "2020-03-27"}, "http://www.republicoftogo.com//Toutes-les-rubriques/Sport/Le-championnat-d-Afrique-des-Nations-n-aura-pas-lieu": {"file": "1551001635.html", "date": "2020-03-17"}, "https://yle.fi/uutiset/3-11523428?origin=rss": {"file": "1699747711.html", "date": "2020-09-02"}, "https://www.khabaronline.ir/news/1353889/\u062a\u062d\u0644\u06cc\u0644-\u0631\u0648\u0632\u0646\u0627\u0645\u0647-\u0627\u0635\u0648\u0644\u06af\u0631\u0627-\u0627\u0632-\u062f\u0639\u0648\u062a-\u0627\u0635\u0644\u0627\u062d-\u0637\u0644\u0628\u0627\u0646-\u0628\u0647-\u062d\u0636\u0648\u0631-\u0645\u0631\u062f\u0645-\u062f\u0631-\u0627\u0646\u062a\u062e\u0627\u0628\u0627\u062a": {"file": "1523806243.html", "date": "2020-02-18"}, "https://www.yjc.ir/fa/news/7349926/\u0627\u0632-\u06a9\u0634\u0641-\u06f7\u06f2-\u062f\u0633\u062a\u06af\u0627\u0647-\u0645\u0648\u062a\u0648\u0631-\u0642\u0627\u0686\u0627\u0642-\u062f\u0631-\u0645\u0647\u0631\u06cc\u0632-\u062a\u0627-\u062f\u0633\u062a\u06af\u06cc\u0631\u06cc-\u0633\u0627\u0631\u0642-\u06f1\u06f0\u06f0-\u0645\u06cc\u0644\u06cc\u0648\u0646-\u0631\u06cc\u0627\u0644\u06cc-\u0637\u0644\u0627\u062c\u0627\u062a-\u0645\u0646\u0632\u0644-\u062f\u0631-\u0628\u0627\u0641\u0642": {"file": "1602557452.html", "date": "2020-05-10"}, "http://www.aryanews.com/News/120200622120908039/\u0648\u0631\u0648\u062f-50-\u0647\u0632\u0627\u0631-\u0645\u06cc\u0644\u06cc\u0627\u0631\u062f-\u062a\u0648\u0645\u0627\u0646-\u0646\u0642\u062f\u06cc\u0646\u06af\u06cc-\u0628\u0647-\u0628\u0648\u0631\u0633-\u062f\u0631-3-\u0645\u0627\u0647": {"file": "1641304459.html", "date": "2020-07-02"}, "https://laprensafl.com/2020/02/17/tenemos-silvia-pinal-para-rato-alejandra-guzman-habla-del-estado-de-salud-de-su-mama/": {"file": "1523780090.html", "date": "2020-02-17"}, "https://listindiario.com/el-deporte/2020/12/22/649384/los-grandes-ligas-en-la-lidom": {"file": "1804096113.html", "date": "2020-12-22"}, "http://bohemia.cu/nacionales/2020/03/adoptan-medidas-organizativas-en-la-habana-para-la-venta-de-alimentos/": {"file": "1561086382.html", "date": "2020-03-27"}, "https://www.diariolibre.com/actualidad/internacional/con-silencio-y-partidos-fantasma-se-reanuda-futbol-aleman-EI18893744": {"file": "1608578024.html", "date": "2020-05-16"}, "http://feedproxy.google.com/~r/NoticiaAlDia/~3/_k8dJ5xnwYw/": {"file": "1579141678.html", "date": "2020-04-15"}, "https://www.eldia.com/nota/2020-4-15-16-5-0-en-ruta-36-y-520-activan-protocolo-de-emergencia-en-un-colectivo-de-la-linea-oeste-la-ciudad": {"file": "1579111612.html", "date": "2020-04-15"}, "https://www.lainformacion.com/mundo/opositores-partidarios-lukashenko-culminan-dias-tension-marchas/2812881/": {"file": "1684898837.html", "date": "2020-08-16"}, "https://www.noticierodigital.com/2020/10/borrell-no-aplazar-las-parlamentarias-empeorara-la-situacion-en-venezuela/": {"file": "1731743891.html", "date": "2020-10-07"}, "https://www.la-prensa.com.mx/republica/decomisa-aduana-de-tijuana-mas-de-730-mil-dolares-en-efectivo-5050444.html": {"file": "1566936994.html", "date": "2020-04-02"}, "https://junin24.com/194420/tres-muertos-en-un-choque-frontal-en-ruta-188.html": {"file": "1506220874.html", "date": "2020-01-27"}, "http://www.radionacional.com.ar/intendente-de-pilar-encontramos-obras-paralizadas-y-calles-derrumbadas/": {"file": "1493765064.html", "date": "2020-01-13"}, "https://www.farodevigo.es/deportes/2020/07/05/andres-iniesta-recuerdos-son-magicos/2309843.html?utm_source=rss": {"file": "1653186688.html", "date": "2020-05-07"}, "http://www.andaluciainformacion.es/andalucia/895957/imbroda-revela-que-padecio-y-supero-el-coronavirus-el-pasado-marzo/": {"file": "1598244597.html", "date": "2020-06-05"}, "https://www.elsoldesanjuandelrio.com.mx/local/pescadores-gestionaran-crias-de-peces-5807069.html": {"file": "1721965295.html", "date": "2020-09-25"}, "https://www.elsoldemazatlan.com.mx/finanzas/precio-del-petroleo-mexicano-cae-a-un-minimo-de-18-anos-4982125.html": {"file": "1551294044.html", "date": "2020-03-17"}, "http://www.telepinar.cu/licenciados-en-educacion-primaria-en-consolacion-del-sur-fotos-y-video/": {"file": "1670377569.html", "date": "2020-07-23"}, "https://larazon.pe/faenon-de-toledo-y-grana-le-costo-s-1400-millones-al-estado-peruano/": {"file": "1718257353.html", "date": "2020-09-22"}, "https://diariodelsur.com.co/noticias/deportes/f%C3%BAtbol/el-primero-en-hablar-sorprendente-despedida-de-juan-guillerm-647581": {"file": "1789664619.html", "date": "2020-12-06"}, "http://www.soychile.cl/Puerto-Montt/Deportes/2020/08/24/670291/Congresos-y-seminarios-sobre-actividad-fisica-y-salud-se-transmitiran-desde-Puerto-Montt.aspx": {"file": "1691780201.html", "date": "2020-08-24"}, "https://www.lavozdelafrontera.com.mx/gossip/luis-miguel-y-jose-jose-entre-la-musica-que-sono-en-la-pandemia-plataformas-digitales-coronavirus-covid-19-5821245.html": {"file": "1724245424.html", "date": "2020-09-29"}, "http://www.radionacional.com.ar/comunidad-regional-de-calamuchita-rechazo-la-idea-de-una-capsula-turistica/": {"file": "1730863222.html", "date": "2020-06-10"}, "https://boingboing.net/2020/12/12/this-deep-funk-hanukkah-song-is-a-holiday-classic-in-the-making.html": {"file": "1795915731.html", "date": "2020-12-12"}, "https://www.washingtonpost.com/politics/federal-workers-are-returning-to-the-office-some-members-of-congress-say-they-shouldnt-be/2020/07/08/c3d22ec8-c151-11ea-b4f6-cb39cd8940fb_story.html": {"file": "1676872902.html", "date": "2020-07-09"}, "http://www.marketwatch.com/news/story.asp?guid=%7B49E8785A-F1C7-11EA-B8AA-ECF03EAB1839%7D&siteid=rss&rss=1": {"file": "1705296354.html", "date": "2020-09-08"}, "https://abc7ny.com/traffic/penn-station-to-close-overnight-for-cleaning/6144149/": {"file": "1594754655.html", "date": "2020-05-01"}, "http://feeds.mashable.com/~r/Mashable/~3/9SVJRKMUwTI/": {"file": "1526526251.html", "date": "2020-02-20"}, "https://www.seattlepi.com/sports/article/Tiz-the-Law-draws-No-17-post-as-3-5-Kentucky-15530833.php": {"file": "1699121936.html", "date": NaN}, "https://twitter.com/Reuters/status/1281836879789404160/photo/1": {"file": "1676646261.html", "date": "2020-07-11"}, "https://kesq.com/news/2020/05/14/mayor-of-coachella-explains-citys-decision-to-continue-requiring-face-coverings/": {"file": "1606865668.html", "date": "2020-05-14"}, "https://tucson.com/news/national/college-football-player-arrested-on-murder-charge-in-georgia/article_c7e4b901-9d60-5895-a288-73911df10bd3.html": {"file": "1725250200.html", "date": NaN}, "http://feeds.bizjournals.com/~r/industry_12/~3/_rJ5SC99V8E/after-two-weeks-chef-says-oggies.html": {"file": "1685765130.html", "date": "2020-08-17"}, "https://www.oann.com/protesters-gather-at-paris-theater-to-confront-macron-over-pension-reform/": {"file": "1498311133.html", "date": "2020-01-18"}, "https://timesofindia.indiatimes.com/india/farmers-protests-continue-for-eleventh-day-top-developments/articleshow/79591842.cms": {"file": "1789437552.html", "date": "2020-12-06"}, "https://kdvr.com/news/auroras-violent-crime-rate-ranks-3rd-out-of-colorados-ten-largest-cities/": {"file": "1731257347.html", "date": "2020-10-06"}, "https://www.breakingsoup.com/south-park-characters-fill-empty-seats-at-denver-broncos-games/": {"file": "1732261760.html", "date": NaN}, "https://www.hutchnews.com/ZZ/news/20201123/latest-germanys-curevac-signs-contract-for-new-vaccine?rssfeed=true": {"file": "1777898598.html", "date": "2020-11-23"}, "https://www.news18.com/news/business/rbi-prescribes-five-pillared-approach-guard-against-cybersecurity-threats-for-urban-co-op-banks-2906047.html": {"file": "1720285564.html", "date": "2020-08-24"}, "https://www.stuff.co.nz/national/crime/300145150/three-men-charged-for-alleged-bank-card-skimming-at-auckland-hospitals.html": {"file": "1753142980.html", "date": "2020-10-29"}, "https://economictimes.indiatimes.com/markets/stocks/news/share-market-update-psu-bank-shares-gain-canara-bank-rises-1br/articleshow/73540940.cms": {"file": "1502375798.html", "date": "2020-01-23"}, "http://rnanews.com/young-leaders-from-canada-fiji-pakistan-uganda-win-commonwealth-youth-awards-2020/": {"file": "1545738097.html", "date": "2020-03-11"}, "https://www.seattletimes.com/nation-world/the-quiet-hand-of-conservative-groups-in-the-anti-lockdown-protests/": {"file": "1587149671.html", "date": "2020-04-21"}, "https://au.news.yahoo.com/the-two-aussie-covid-measures-that-could-never-work-in-the-us-222249752.html": {"file": "1753036930.html", "date": "2020-10-28"}, "https://www.dailymail.co.uk/sport/football/article-8297417/Man-Utd-ace-Dean-Henderson-morally-right-finish-season-Sheff-Utd-Wilder.html?ns_mchannel=rss&ns_campaign=1490&ito=1490": {"file": "1599860291.html", "date": "2020-05-07"}, "https://www.google.com/imgres?imgurl=https://i.ebayimg.com/images/g/fjoAAOSwyGZaRXKK/s-l300.jpg&imgrefurl=https://www.ebay.com/itm/Retirement-Gift-Ideas-Retired-Definition-Funny-Retirement-Coffee-Mug-Tea-Cup-/132449557566&tbnid=QBu8niz350w2PM&vet=1&docid=CTb7OqAkXHkPUM&w=300&h=265&itg=1&q=retirement+definition&hl=en-US&source=sh/x/im": {"file": "1564637733.html", "date": NaN}, "https://www.inquirer.com/news/nation-world/us-state-department-blocks-lawsuit-by-american-imprisoned-tortured-in-egypt-20200718.html": {"file": "1666031133.html", "date": "2020-07-18"}, "https://www.slobodnaevropa.org/a/30657941.html": {"file": "1641303898.html", "date": "2020-06-22"}, "https://www.laprensalatina.com/uncertain-future-for-britains-essential-workers-after-brexit/": {"file": "1632107283.html", "date": "2020-06-11"}, "https://www.monroenews.com/ZZ/news/20200516/italy-seeks-to-boost-tourism-by-opening-borders-june-3?rssfeed=true": {"file": "1608581255.html", "date": "2020-05-16"}, "https://www.malaymail.com/news/sports/2020/04/08/2022-world-athletics-championships-set-for-july-15-24/1854874": {"file": "1572542522.html", "date": "2020-04-08"}, "http://city.udn.com/67926/6950016?ch=rss_ugccitynewpost": {"file": "1580240354.html", "date": "2020-04-17"}, "https://www.moneycontrol.com/news/business/goldman-sachs-says-india\u2019s-fy21-gdp-may-plummet-tomulti-decade-low16bleakest-forecast-so-far_13654421.html": {"file": "1572317591.html", "date": NaN}, "https://wiki.d-addicts.com/index.php?title=Park_Ye_Jin&diff=591343&oldid=588001": {"file": "1571322029.html", "date": "2020-04-07"}, "https://www.forbes.com/sites/marlamilling/2020/05/15/drunkorexia-on-the-rise-among-female-university-students/": {"file": "1608572427.html", "date": "2020-05-15"}, "https://thefrontierpost.com/two-newborns-die-for-want-of-oxygen-at-bhakkar-hospital/": {"file": "1535415671.html", "date": NaN}, "https://www.theargus.co.uk/news/18701511.woman-hurt-hit-car-station-street-eastbourne/?ref=rss": {"file": "1703831611.html", "date": "2020-09-06"}, "https://chicago.suntimes.com/2020/6/24/21302329/trump-judges-nominee-federal-senate": {"file": "1643960841.html", "date": "2020-06-24"}, "https://www.dln.com/newcorporations/details/ref_index/438057": {"file": "1687351353.html", "date": NaN}, "https://www.engadget.com/amazon-luxury-stores-fashion-140141502.html": {"file": "1711803974.html", "date": "2020-09-15"}, "https://www.philstar.com/showbiz/2020/11/27/2059828/abs-cbn-nagsalita-na-sa-paglayas-ni-bea": {"file": "1781170231.html", "date": "2020-11-27"}, "https://www.kut.org/post/local-attorney-andy-brown-will-be-democratic-nominee-county-judge": {"file": "1708999888.html", "date": "2020-08-16"}, "https://sanfrancisco.cbslocal.com/2020/09/16/55th-acm-awards-winners-list/": {"file": "1713311346.html", "date": "2020-08-16"}, "https://semissourinews.com/stories/544335905-total-oasdi-disabled-beneficiaries-in-missouri-zip-63848-remains-the-same-in-2019": {"file": "1693504805.html", "date": "2020-08-26"}, "https://zitrod.com/business/we-must-do-more-what-ceos-like-tim-cook-jamie-dimon-larry-fink-say-about-racial-inequality-protests/": {"file": "1627298010.html", "date": "2020-06-01"}, "http://feeds.bizjournals.com/~r/industry_20/~3/Imon3NQaB8c/shutting-down-tampa-bay-construction-during.html": {"file": "1566929327.html", "date": "2020-04-02"}, "http://rssfeeds.usatoday.com/~/620721596/0/usatoday-newstopstories~Hurricanes-in-a-pandemic-Absolutely-thats-our-nightmare-scenario/": {"file": "1566930342.html", "date": "2020-04-02"}, "https://www.recordonline.com/news/20200316/rockland-to-declare-local-state-of-emergency-on-monday?rssfeed=true": {"file": "1549555386.html", "date": "2020-03-16"}, "https://hypixel.net/threads/what-whered-it-go.2675645/": {"file": "1552005464.html", "date": "2020-03-18"}, "http://nationalpost.com/pmn/health-pmn/frances-macron-condemns-unilateral-border-control-measures-over-coronavirus": {"file": "1549542851.html", "date": "2020-03-16"}, "https://www.news18.com/news/india/suspended-aap-councillor-tahir-hussain-arrested-in-delhi-court-over-ib-staffers-murder-2538473.html": {"file": "1549579884.html", "date": "2020-03-16"}, "http://www.asiapacificstar.com/news/263700449/australian-megablaze-brought-under-control": {"file": "1493757507.html", "date": "2020-01-13"}, "https://www.realestate.com.au/news/live-in-your-own-jurassic-park-at-this-multimilliondollar-kenthurst-estate/?rsf=syn:news:nca:news:spa:strap": {"file": "1500330653.html", "date": "2020-01-22"}, "https://carnegieendowment.org/chinafinancialmarkets/79641": {"file": "1487058344.html", "date": "2019-08-06"}, "http://feeds.reuters.com/~r/reuters/businessNews/~3/UjOBluJTi0o/volkswagens-skoda-auto-2019-deliveries-dip-to-1-24-million-cars-due-to-weaker-sales-in-china-idUSKBN1ZC1DA": {"file": "1493638362.html", "date": NaN}, "https://www.96fm.ie/": {"file": "1630687900.html", "date": NaN}, "https://www.mirror.co.uk/sport/football/transfer-news/arsenal-set-pierre-emerick-aubameyang-22002407": {"file": "1602142656.html", "date": "2020-05-09"}, "https://www.businesstimes.com.sg/companies-markets/s232m-fair-value-loss-pushes-sph-into-the-red-for-first-time": {"file": "1738268651.html", "date": "2020-10-14"}, "https://nckansasnews.com/stories/567912132-mark-dings-donates-2-800-to-tracey-robert-mann-s-campaign-committee-in-september": {"file": "1793453954.html", "date": "2020-12-08"}, "https://whnt.com/news/don-trump-jr-tests-positive-for-coronavirus/": {"file": "1775625776.html", "date": "2020-11-20"}, "https://www.hindustantimes.com/india-news/odisha-artist-spreads-awareness-on-coronavirus-with-wall-paintings/story-zMoh0EOYcRzfnhXu6NBPnM.html": {"file": "1589455043.html", "date": "2020-04-26"}, "https://www.jstor.org/stable/2669240?origin=crossref": {"file": "1587353145.html", "date": NaN}, "https://azraelsmerryland.blogspot.com/2020/07/consumers-elevate-appeal-to-president.html": {"file": "1653190016.html", "date": "2020-07-05"}, "https://kiow.com/2020/10/26/absentee-ballots-are-slow-to-return/": {"file": "1749876905.html", "date": "2020-10-26"}, "https://www.zimeye.net/2020/03/23/coronavirus-doctors-threaten-to-down-tools-due-to-govt-unpreparedness/": {"file": "1556637845.html", "date": "2020-03-23"}, "https://www.registerguard.com/news/20200413/second-suspect-in-shooting-turns-himself-in?rssfeed=true": {"file": "1577551432.html", "date": "2020-04-13"}, "https://www.thestar.com/news/world/us/2020/03/23/ap-exclusive-allen-has-new-publisher-memoir-out-monday.html": {"file": "1556635225.html", "date": "2020-03-23"}, "https://www.urdupoint.com/en/world/russian-prime-minister-mikhail-mishustin-says-906868.html": {"file": "1592133028.html", "date": "2020-04-29"}, "https://www.couriermail.com.au/news/national/98yearold-wwii-veteran-beats-covid19-receives-ovation-from-hospital-staff/video/ca6ce285879e2291307f3fc8148670aa": {"file": "1588233953.html", "date": NaN}, "https://www.eastbaytimes.com/2020/04/24/joe-biden-predicts-trump-will-try-to-delay-elections/": {"file": "1588026728.html", "date": "2020-04-24"}, "https://www.eastbaytimes.com/2020/04/24/coronavirus-how-these-bay-area-travelers-got-stranded-in-bolivia/": {"file": "1588027676.html", "date": "2020-04-24"}, "https://sputniknews.com/radio_the_critical_hour/202004281079127690-some-us-states-begin-lifting-lockdowns-vp-pence-defiantly-tours-clinic-unmasked/": {"file": "1592175395.html", "date": "2020-04-28"}, "https://www.baltimoresun.com/maryland/howard/cng-ho-permits-public-hearing-20200817-hntmbtcnhfgwjlv7vompxm4nmu-story.html#ed=rss_www.baltimoresun.com/arcio/rss/category/latest/": {"file": "1685640081.html", "date": "2020-08-17"}, "https://globalnews.ca/news/6620622/syria-turkey-strikes-conflict/": {"file": "1536545428.html", "date": "2020-03-06"}, "http://rssfeeds.detroitnews.com/~/619980292/0/detroit/home~Men-chased-her-shot-her-at-front-door-now-reward-offered-for-slaying-suspects/": {"file": "1551271741.html", "date": "2020-03-17"}, "https://timesofindia.indiatimes.com/sports/cricket/ipl/live-blog/ipl-2020-live-cricket-score-chennai-super-kings-vs-sunrisers-hyderabad-match-14-dubai/liveblog/78447140.cms": {"file": "1727473717.html", "date": "2020-10-03"}, "https://timesofindia.indiatimes.com/city/bhubaneswar/odisha-reports-first-covid-19-death-72-year-old-man-from-bhubaneswar-dies/articleshow/75026800.cms": {"file": "1571147129.html", "date": "2020-04-07"}, "https://thewest.com.au/business/public-companies/caeneus-charges-up-its-exploration-tool-kit-at-mallina-c-1317422": {"file": "1711399465.html", "date": "2020-09-15"}, "http://optimussearch.com.ph/2020/06/02/no-membership-required-best-and-free-online-dating-websites-in-los-angeles/": {"file": "1677010256.html", "date": NaN}, "https://www.businesstoday.in/current/economy-politics/coronavirus-in-bihar-record-749-cases-in-24-hours-patna-other-districts-announce-lockdown-from-july-10/story/409327.html": {"file": "1656725247.html", "date": "2020-07-09"}, "https://theweek.com/speedreads/887020/trump-visited-trumpowned-golf-course-nearly-24-percent-days-2019": {"file": "1485102321.html", "date": "2020-01-02"}, "https://globalnews.ca/news/7278919/kamala-harris-fact-check-us-vice-president/": {"file": "1684405920.html", "date": "2020-09-15"}, "https://www.baltimoresun.com/opinion/columnists/zurawik/bs-ed-zontv-media-year-20201223-cnvrlhkhnrbihcxx6wxcxt2b7y-story.html#ed=rss_www.baltimoresun.com/arcio/rss/category/latest/": {"file": "1805697156.html", "date": "2020-12-23"}, "https://www.sfgate.com/news/article/New-anthology-collects-dozens-of-poems-about-15250468.php": {"file": "1598537220.html", "date": "2020-05-06"}, "https://bizwest.com/2020/03/11/loft-clothing-store-at-twenty-ninth-street-in-boulder-to-close/loft/": {"file": "1545286960.html", "date": "2020-03-11"}, "https://www.news.az/news/azerbaijan-launches-counteroffensive-to-restore-its-territorial-integrity-pakistani-envoy": {"file": "1791336003.html", "date": "2020-10-13"}, "https://upton.wickedlocal.com/news/20200924/battle-in-congress-to-replace-ruth-bader-ginsburg-is-dashing-hopes-for-covid-19-stimulus-package?rssfeed=true": {"file": "1720539463.html", "date": "2020-09-24"}, "http://www.haniotika-nea.gr/ton-epiasan-tin-ora-poy-prospathoyse-na-klepsei-aytokinita/": {"file": "1687581723.html", "date": "2020-09-24"}, "https://www.nzz.ch/international/neue-us-sanktionen-erschuettern-die-syrische-wirtschaft-ld.1560586": {"file": "1635095370.html", "date": "2020-06-17"}, "https://www.infranken.de/ueberregional/boulevard/kultur/tatort-aus-muenchen-30-jahre-leitmayr-und-batic-art-5139057": {"file": "1808293409.html", "date": "2020-12-27"}, "https://www.presseportal.de/blaulicht/pm/43526/4791887": {"file": "1798244877.html", "date": "2020-12-15"}, "https://sn.dk/Erhverv/Forstaerket-haab-om-737-MAX-erstatning-loefter-Norwegian-aktie/artikel/900368?rss": {"file": "1484494308.html", "date": "2020-01-02"}, "https://sn.dk/Danmark/Coronavirus-rammer-trafikken-i-Danmark/artikel/922379?rss": {"file": "1544178583.html", "date": "2020-03-10"}, "https://www.novinky.cz/vase-zpravy/clanek/janovicka-knihovna-zve-na-vystavu-o-nebezpecnem-zivobyti-prevadecu-na-sumave-40311673": {"file": "1509819104.html", "date": "2020-01-30"}, "https://www.elvallenc.cat/societat/vallsconfinat-continua-amb-mes-novetats/": {"file": "1556626357.html", "date": "2020-03-23"}, "https://bn.wikipedia.org/w/index.php?title=Jim_Higgs&diff=3986629&oldid=0": {"file": "1522651525.html", "date": "2020-02-18"}, "https://www.actualno.com/haskovo/obshtina-haskovo-osiguri-komputri-i-tableti-na-deca-v-socialni-obshtejitija-news_1509983.html": {"file": "1740081165.html", "date": "2020-10-15"}, "https://vratza.com/obshtina-b-vratsa-b-specheli-proekt-za-izgrazhdaneto-na-dopalnitelen-korpus-na/": {"file": "1766087391.html", "date": NaN}, "https://www.youm7.com/story/2020/8/24/\u0648\u0632\u064a\u0631-\u0627\u0644\u0631\u0649-\u064a\u0634\u0647\u062f-\u062a\u0648\u0642\u064a\u0639-\u0639\u0642\u062f-\u062f\u0631\u0627\u0633\u0629-\u062a\u062d\u062f\u064a\u062f-\u0627\u0644\u0633\u062d\u0628-\u0627\u0644\u0622\u0645\u0646-\u0644\u0644\u062e\u0632\u0627\u0646\u0627\u062a/4943459": {"file": "1691196862.html", "date": "2020-09-24"}, "https://www.alyaum.com/articles/6291787/\u0627\u0644\u0642\u0627\u0631\u0627\u062a-\u0627\u0644\u0633\u0628\u0639/\u0637\u0647\u0631\u0627\u0646-\u062a\u062f\u0641\u0646-\u0632\u0627\u062f\u0629-\u0648\u062a\u062a\u0647\u0645-\u0627\u0644\u0645\u0639\u0627\u0631\u0636\u0629-\u0627\u0644\u0625\u064a\u0631\u0627\u0646\u064a\u0629-\u0628\u0627\u063a\u062a\u064a\u0627\u0644\u0647": {"file": "1784190729.html", "date": "2020-01-12"}, "https://www.albayan.ae/across-the-uae/news-and-reports/2020-06-03-1.3874341": {"file": "1623715966.html", "date": "2020-06-03"}, "https://akhbarelyom.com/news/newdetails/3126898/1/36-\u0639\u0627\u0645\u064b\u0627..-\u0633\u0631-\u0631\u062d\u064a\u0644-\u0646\u0639\u064a\u0645\u0629-\u0639\u0627\u0643\u0641-\u0641\u064a-\u0633\u0646-\u0645\u0628\u0643\u0631": {"file": "1731695567.html", "date": "2020-10-07"}, "https://www.almadenahnews.com/article/825076-%D8%A3%D9%85%D9%8A%D8%B1%D9%83%D8%A7-50-%D8%A7%D9%84%D9%81%D8%A7-%D8%AD%D8%B5%D9%8A%D9%84%D8%A9-%D8%A7%D9%84%D9%88%D9%81%D9%8A%D8%A7%D8%AA-%D8%A8%D8%B3%D8%A8%D8%A8-%D9%81%D9%8A%D8%B1%D9%88%D8%B3-%D9%83%D9%88%D8%B1%D9%88%D9%86%D8%A7": {"file": "1588107198.html", "date": "2020-04-24"}, "https://arabic.sputniknews.com/arab_world/202003171044893040-%D9%85%D8%B5%D8%B1-%D8%AA%D8%B3%D8%AC%D9%84-30-%D8%A5%D8%B5%D8%A7%D8%A8%D8%A9-%D8%AC%D8%AF%D9%8A%D8%AF%D8%A9-%D9%88%D8%AD%D8%A7%D9%84%D8%AA%D9%8A-%D9%88%D9%81%D8%A7%D8%A9-%D8%A8%D9%81%D9%8A%D8%B1%D9%88%D8%B3-%D9%83%D9%88%D8%B1%D9%88%D9%86%D8%A7/": {"file": "1550981615.html", "date": "2020-03-17"}, "https://elbaladtv.net/%d8%aa%d8%b1%d9%83%d9%89-%d8%a2%d9%84-%d8%a7%d9%84%d8%b4%d9%8a%d8%ae-%d8%a8%d8%b9%d8%af-%d8%a5%d8%b5%d8%a7%d8%a8%d8%a9-%d9%8a%d8%b3%d8%b1%d8%a7-%d8%a8%d9%83%d9%88%d8%b1%d9%88%d9%86%d8%a7-%d9%8a%d8%a7/": {"file": "1806793639.html", "date": "2020-12-25"}, "https://www.beirutobserver.com/2020/11/2338761/": {"file": "1764731404.html", "date": "2020-11-09"}, "https://money.udn.com/money/story/5603/4909425": {"file": "1728970162.html", "date": "2020-10-04"}, "http://www.upmedia.mg/news_info.php?SerialNo=83141": {"file": "1546021647.html", "date": "2020-03-12"}, "https://news.ltn.com.tw/news/business/breakingnews/3119452": {"file": "1564888577.html", "date": "2020-04-01"}, "https://news.sina.com.tw/article/20200121/34046328.html": {"file": "1500260110.html", "date": "2020-01-21"}} \ No newline at end of file +{"https://zaxid.net/news/showNews.do?nastupnogo_tizhnya_na_ukrayinu_chekayut_anomalna_speka_ta_grozi&objectId=1503302": {"file": "1628285861.html", "date": "2020-06-07"}, "https://24tv.ua/yak-venediktova-zahishhala-nardepa-vid-slugi-narodu_n1419177": {"file": "1716869064.html", "date": "2020-09-21"}, "https://www.mynet.com/samsunda-sobadan-zehirlenen-2-cocuk-hastanelik-oldu-110106661445": {"file": "1783862633.html", "date": "2020-11-30"}, "http://www.detaykibris.com/yikilan-binalarkurtarma-calismalari-izmirden-goruntuler-2196g.htm": {"file": "1754856212.html", "date": "2020-10-30"}, "https://www.hatawtabloid.com/2020/02/18/aktres-sunod-sunuran-sa-aktor-bf/": {"file": "1523761669.html", "date": "2020-02-18"}, "http://auto-door16814.ttblogs.com/2028064/%E0%B8%9C-%E0%B8%9C%E0%B8%A5-%E0%B8%95-%E0%B9%81%E0%B8%A5%E0%B8%B0%E0%B8%88%E0%B8%B3%E0%B8%AB%E0%B8%99-%E0%B8%B2%E0%B8%A2%E0%B9%82%E0%B8%8B-%E0%B8%AD-%E0%B8%95%E0%B8%AA%E0%B8%B2%E0%B8%AB%E0%B8%81%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%97-%E0%B8%81%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B9%80%E0%B8%A0%E0%B8%97-%E0%B8%97-%E0%B8%81%E0%B8%8A%E0%B8%99-%E0%B8%94": {"file": "1799801548.html", "date": NaN}, "https://www.hbl.fi/artikel/bbc-premier-league-omstart-17-juli/": {"file": "1619092544.html", "date": "2020-05-28"}, "https://avaz.ba/kantoni/republika-srpska/604722/generalni-direktor-eprs-pozitivan-na-koronavirus": {"file": "1750355638.html", "date": "2020-10-26"}, "https://www.koha.net/kronike-e-zeze/233165/arrestohen-dy-persona-ne-peje-per-organizim-te-lojerave-te-fatit/": {"file": "1682093320.html", "date": "2020-08-13"}, "http://www.standard.al/2020/04/15/dite-zie-shenohen-1-mije-e-438-viktima-nga-covid-19-ne-24-oret-e-fundit-ne-france/": {"file": "1579090372.html", "date": "2020-04-15"}, "https://oren.mk.ru/social/2020/02/18/orenburgskim-chinovnikam-kupyat-eshhe-pyat-avtomobiley.html": {"file": "1523821409.html", "date": "2020-02-18"}, "https://www.inform.kz/ru/polnost-yu-obespechit-region-myasom-pticy-namereny-v-sko_a3725092": {"file": "1784458132.html", "date": "2020-12-01"}, "https://www.ukrinform.ru/rubric-kyiv/3103218-policia-napravila-delo-minera-stolicnogo-metro-v-sud.html": {"file": "1716561772.html", "date": "2020-09-20"}, "https://rzn.mk.ru/social/2020/01/13/v-ryazani-nagradili-luchshikh-zhurnalistov.html": {"file": "1493663910.html", "date": "2020-01-13"}, "https://www.mk.ru/politics/2020/09/23/prichinoy-taynoy-inauguracii-lukashenko-stal-styd.html": {"file": "1719330438.html", "date": "2020-09-23"}, "https://tass.ru/moskva/8836033": {"file": "1647262025.html", "date": "2020-06-28"}, "https://www.mk.ru/social/2020/12/29/prisyazhnye-opravdali-muzhchinu-kotoryy-obvinyalsya-v-ubiystve-aktivista-v-serpukhove.html": {"file": "1810559448.html", "date": "2020-12-29"}, "https://aif.ru/politics/world/otrezat_golovu_vo_imya_proroka_terakt_vo_francii_obnazhil_starye_problemy": {"file": "1742998174.html", "date": "2020-10-19"}, "https://www.unn.com.ua/ru/news/1906059-parlament-moldovi-z-podachi-dodona-obmezhiv-povnovazhennya-sandu-sche-do-yiyi-inavguratsiyi": {"file": "1786902045.html", "date": "2020-12-03"}, "https://news.yam.md/ro/story/10939686": {"file": "1716324024.html", "date": "2020-09-20"}, "https://www.noticiaexata.com.br/artigo/brasileiro-goias-supera-palmeiras-com-gol-no-apagar-das-luzes": {"file": "1777110076.html", "date": "2020-11-22"}, "http://feedproxy.google.com/~r/PublicoRSS/~3/iyen1-bXCVc/covid19-trump-ja-nao-corre-risco-infectar-terceiros-medico-casa-branca-1934765": {"file": "1735359864.html", "date": "2020-10-11"}, "https://economia.ig.com.br/2020-11-26/agronegocio-quer-salvar-relacao-com-a-china-apos-acusacoes-de-eduardo-bolsonaro.html": {"file": "1781093249.html", "date": "2020-11-26"}, "https://noticias.uol.com.br/ultimas-noticias/reuters/2020/03/14/sancoes-dos-eua-dificultam-severamente-luta-do-ira-contra-coronavirus-diz-rouhani.htm": {"file": "1548435131.html", "date": "2020-03-14"}, "https://opapel.com/quintal-dentro-do-ape-tres-dicas-para-integrar-a-varanda-a-sala-de-estar/": {"file": "1756260927.html", "date": "2020-11-01"}, "https://www.uol.com.br/esporte/futebol/ultimas-noticias/2020/05/21/witzel-ignora-decreto-e-diz-que-volta-e-de-responsabilidade-dos-clubes.htm": {"file": "1613345316.html", "date": "2020-05-21"}, "https://www.otempo.com.br/diversao/benjamin-moser-fala-sobre-a-autora-que-o-conquistou-aos-19-anos-1.2423262": {"file": "1793766917.html", "date": "2020-12-10"}, "https://revistaforum.com.br/brasil/tuca-almeida-do-the-voice-kids-morre-baleado/": {"file": "1594316187.html", "date": "2020-05-01"}, "https://g1.globo.com/ro/rondonia/noticia/2020/03/08/estudante-faz-vaquinha-virtual-para-tratamento-de-pitbull-abandonado-com-deficiencia.ghtml": {"file": "1542250510.html", "date": "2020-03-08"}, "https://www.uol.com.br/carros/videos/2020/06/23/comprador-capota-vw-polo-retirando-carro-da-concessionaria-assista.htm": {"file": "1643070775.html", "date": "2020-06-23"}, "https://trojmiasto.wyborcza.pl/trojmiasto/7,35612,25651465,jest-nowy-prezes-grupy-lotos-ma-doswiadczenie-na-kierowniczych.html": {"file": "1511056469.html", "date": "2020-01-31"}, "https://tvn24.pl/swiat/koronawirus-w-chile-po-zniesieniu-obostrzen-tlumy-ludzi-w-sklepach-i-dlugie-kolejki-4668753?source=rss": {"file": "1686918031.html", "date": "2020-08-19"}, "http://alarmeringen.nl/zuid-holland/haaglanden/den-haag/34192126/p2000-ambulance-met-spoed-naar-hoge-zand-in-den-haag.html?utm_source=rss&utm_medium=nederland&utm_campaign=sharing": {"file": "1733850206.html", "date": "2020-10-09"}, "https://news.google.com/__i/rss/rd/articles/CBMibWh0dHBzOi8vd3d3Lm51Lm5sL2Zvcm11bGUtMS82MDgwMDIxL3ZlcnN0YXBwZW4tbm90ZWVydC16ZXNkZS10aWpkLWluLWRlcmRlLXRyYWluaW5nLXJ1c3Npc2NoZS1ncmFuZC1wcml4Lmh0bWzSAWxodHRwczovL3d3dy5udS5ubC9mb3JtdWxlLTEvNjA4MDAyMS92ZXJzdGFwcGVuLW5vdGVlcnQtemVzZGUtdGlqZC1pbi1kZXJkZS10cmFpbmluZy1ydXNzaXNjaGUtZ3JhbmQtcHJpeC5hbXA?oc=5": {"file": "1721884119.html", "date": "2020-09-26"}, "https://www.lrt.lt/naujienos/verslas/4/1261660/finansu-ministerija-vidaus-rinkoje-pasiskolino-30-mln-euru": {"file": "1750014643.html", "date": "2020-10-26"}, "https://www.noz.de/lokales/westerkappeln/artikel/1994799/kindergottesdienst-in-velpe-macht-teilnehmern-und-betreuerinnen-spass": {"file": "1518961308.html", "date": "2020-02-11"}, "http://m.daejonilbo.com/mnews.asp?pk_no=1412361": {"file": "1538026996.html", "date": "2020-03-04"}, "https://news.chosun.com/site/data/html_dir/2020/08/27/2020082704301.html": {"file": "1694328145.html", "date": "2020-08-27"}, "https://www.youtube.com/watch?v=Lfw9H63g0OQ": {"file": "1577247412.html", "date": "2020-04-13"}, "https://news.biglobe.ne.jp/entertainment/0215/ori_200215_7568645051.html": {"file": "1521513101.html", "date": "2020-02-15"}, "https://prtimes.jp/main/html/rd/p/000000232.000009812.html": {"file": "1662376572.html", "date": "2020-07-15"}, "https://prtimes.jp/main/html/rd/p/000000088.000029713.html": {"file": "1649432346.html", "date": "2020-07-01"}, "http://oshiete.goo.ne.jp/qa/11763760.html": {"file": "1660114516.html", "date": "2020-07-13"}, "https://blog.goo.ne.jp/umaichi_news/e/52743e20f825567d4e9889be58ec06b9": {"file": "1660064054.html", "date": "2020-07-12"}, "https://blog.goo.ne.jp/jgccg115/e/5d8c31a659b95cc18a43ad75d152e80f": {"file": "1651753632.html", "date": "2020-07-03"}, "https://www.israelhayom.co.il/article/791661": {"file": "1685121845.html", "date": "2020-08-17"}, "https://www.lagazzettadelmezzogiorno.it/news/mondo/1265473/california-certifica-voto-biden-oltre-quorum-270-elettori.html": {"file": "1789111868.html", "date": "2020-12-06"}, "https://www.edilportale.com/news/2020/09/informatica/quando-la-stampante-rende-piu-smart-il-lavoro-del-progettista_78527_10.html": {"file": "1720037955.html", "date": "2020-09-24"}, "https://www.ilmattino.it/primopiano/sanita/isolamento_gli_urologi_uomini_la_pigrizia_danneggia_la_prostata_in_casa_allenatevi_cosi-5180477.html": {"file": "1582946828.html", "date": "2020-04-19"}, "http://www.ansa.it/sito/notizie/sport/calcio/2020/07/28/ghersini-dirige-cagliari-juve-massimi-lazio-brescia_c708a8fb-c2d2-4a9a-b0f3-05b8cc98389d.html": {"file": "1671960661.html", "date": "2020-07-28"}, "https://www.tribunnews.com/pendidikan/2020/05/04/jawaban-soal-apa-dampak-negatif-jika-menunda-pekerjaan-belajar-dari-rumah-sma-di-tvri": {"file": "1595907782.html", "date": "2020-05-04"}, "https://a1plus.am/hy/article/378866": {"file": "1711609490.html", "date": "2020-09-15"}, "https://news.am/arm/news/615163.html": {"file": "1778318045.html", "date": "2020-11-24"}, "https://www.sonline.hu/orszag-vilag/sokan-visszaallitanak-a-tortenelmi-magyarorszagot-2866608/": {"file": "1625266707.html", "date": "2020-06-04"}, "https://www.baon.hu/eletstilus/gyogyulas-utan-is-kiserheti-kronikus-faradtsag-es-poszttraumas-stressz-a-koronavirust-2928554/": {"file": "1686705395.html", "date": "2020-08-18"}, "https://hindi.business-standard.com//storypage.php?autono=172475": {"file": "1725202846.html", "date": "2020-09-29"}, "https://www.amarujala.com/uttar-pradesh/varanasi/gahu-city-news-vns5205298178?utm_source=rssfeed&utm_medium=Referral&utm_campaign=rssfeed": {"file": "1579139187.html", "date": "2020-04-16"}, "https://hindi.business-standard.com//storypage.php?autono=166607": {"file": "1521292139.html", "date": "2020-02-14"}, "https://www.jagran.com/uttar-pradesh/allahabad-city-21109565.html": {"file": "1782160236.html", "date": "2020-11-28"}, "https://www.divyabhaskar.co.in/local/gujarat/vadodara/news/chhetu-patel-a-resident-of-the-united-states-died-due-to-corona-wife-under-treatment-127088788.html": {"file": "1565998355.html", "date": "2020-04-02"}, "https://www.divyabhaskar.co.in/local/gujarat/rajkot/news/people-who-are-scared-of-corona-call-and-say-i-see-corona-in-my-hand-and-foot-127064846.html": {"file": "1561643340.html", "date": "2020-03-28"}, "https://www.lexpress.fr/actualites/1/actualite/angleterre-manchester-united-rate-la-marche-arsenal-revit_2141546.html": {"file": "1808011241.html", "date": "2020-12-26"}, "http://ici.radio-canada.ca/nouvelle/1728875/transport-scolaire-ottawa-mi-septembre-covid": {"file": "1691900941.html", "date": "2020-08-24"}, "https://www.vosgesmatin.fr/edition-la-plaine/2020/03/23/incendie-dans-une-maison-cinq-personnes-relogees": {"file": "1556056486.html", "date": "2020-03-23"}, "https://www.guineenews.org/colonel-barry-accuse-pour-vol-aggrave-vers-la-projection-de-la-video-de-toute-la-verite/": {"file": "1759902081.html", "date": "2020-11-04"}, "https://www.sudinfo.be/id300074/article/2020-12-24/il-brise-le-couvre-feu-et-est-surpris-au-volant-23h30-mellet-je-me-fiche-pas-mal": {"file": "1806102460.html", "date": "2020-12-24"}, "https://actu.fr/societe/coronavirus/solidarite-centre-hospitalier-cote-basque-lance-appel-dons-entreprises-particuliers_32590953.html": {"file": "1560424903.html", "date": "2020-03-27"}, "http://www.republicoftogo.com//Toutes-les-rubriques/Sport/Le-championnat-d-Afrique-des-Nations-n-aura-pas-lieu": {"file": "1551001635.html", "date": "2020-03-17"}, "https://yle.fi/uutiset/3-11523428?origin=rss": {"file": "1699747711.html", "date": "2020-09-02"}, "https://www.khabaronline.ir/news/1353889/\u062a\u062d\u0644\u06cc\u0644-\u0631\u0648\u0632\u0646\u0627\u0645\u0647-\u0627\u0635\u0648\u0644\u06af\u0631\u0627-\u0627\u0632-\u062f\u0639\u0648\u062a-\u0627\u0635\u0644\u0627\u062d-\u0637\u0644\u0628\u0627\u0646-\u0628\u0647-\u062d\u0636\u0648\u0631-\u0645\u0631\u062f\u0645-\u062f\u0631-\u0627\u0646\u062a\u062e\u0627\u0628\u0627\u062a": {"file": "1523806243.html", "date": "2020-02-18"}, "https://www.yjc.ir/fa/news/7349926/\u0627\u0632-\u06a9\u0634\u0641-\u06f7\u06f2-\u062f\u0633\u062a\u06af\u0627\u0647-\u0645\u0648\u062a\u0648\u0631-\u0642\u0627\u0686\u0627\u0642-\u062f\u0631-\u0645\u0647\u0631\u06cc\u0632-\u062a\u0627-\u062f\u0633\u062a\u06af\u06cc\u0631\u06cc-\u0633\u0627\u0631\u0642-\u06f1\u06f0\u06f0-\u0645\u06cc\u0644\u06cc\u0648\u0646-\u0631\u06cc\u0627\u0644\u06cc-\u0637\u0644\u0627\u062c\u0627\u062a-\u0645\u0646\u0632\u0644-\u062f\u0631-\u0628\u0627\u0641\u0642": {"file": "1602557452.html", "date": "2020-05-10"}, "http://www.aryanews.com/News/120200622120908039/\u0648\u0631\u0648\u062f-50-\u0647\u0632\u0627\u0631-\u0645\u06cc\u0644\u06cc\u0627\u0631\u062f-\u062a\u0648\u0645\u0627\u0646-\u0646\u0642\u062f\u06cc\u0646\u06af\u06cc-\u0628\u0647-\u0628\u0648\u0631\u0633-\u062f\u0631-3-\u0645\u0627\u0647": {"file": "1641304459.html", "date": "2020-07-02"}, "https://laprensafl.com/2020/02/17/tenemos-silvia-pinal-para-rato-alejandra-guzman-habla-del-estado-de-salud-de-su-mama/": {"file": "1523780090.html", "date": "2020-02-17"}, "https://listindiario.com/el-deporte/2020/12/22/649384/los-grandes-ligas-en-la-lidom": {"file": "1804096113.html", "date": "2020-12-22"}, "http://bohemia.cu/nacionales/2020/03/adoptan-medidas-organizativas-en-la-habana-para-la-venta-de-alimentos/": {"file": "1561086382.html", "date": "2020-03-27"}, "https://www.diariolibre.com/actualidad/internacional/con-silencio-y-partidos-fantasma-se-reanuda-futbol-aleman-EI18893744": {"file": "1608578024.html", "date": "2020-05-16"}, "http://feedproxy.google.com/~r/NoticiaAlDia/~3/_k8dJ5xnwYw/": {"file": "1579141678.html", "date": "2020-04-15"}, "https://www.eldia.com/nota/2020-4-15-16-5-0-en-ruta-36-y-520-activan-protocolo-de-emergencia-en-un-colectivo-de-la-linea-oeste-la-ciudad": {"file": "1579111612.html", "date": "2020-04-15"}, "https://www.lainformacion.com/mundo/opositores-partidarios-lukashenko-culminan-dias-tension-marchas/2812881/": {"file": "1684898837.html", "date": "2020-08-16"}, "https://www.noticierodigital.com/2020/10/borrell-no-aplazar-las-parlamentarias-empeorara-la-situacion-en-venezuela/": {"file": "1731743891.html", "date": "2020-10-07"}, "https://www.la-prensa.com.mx/republica/decomisa-aduana-de-tijuana-mas-de-730-mil-dolares-en-efectivo-5050444.html": {"file": "1566936994.html", "date": "2020-04-02"}, "https://junin24.com/194420/tres-muertos-en-un-choque-frontal-en-ruta-188.html": {"file": "1506220874.html", "date": "2020-01-27"}, "http://www.radionacional.com.ar/intendente-de-pilar-encontramos-obras-paralizadas-y-calles-derrumbadas/": {"file": "1493765064.html", "date": "2020-01-13"}, "https://www.farodevigo.es/deportes/2020/07/05/andres-iniesta-recuerdos-son-magicos/2309843.html?utm_source=rss": {"file": "1653186688.html", "date": "2020-07-05"}, "http://www.andaluciainformacion.es/andalucia/895957/imbroda-revela-que-padecio-y-supero-el-coronavirus-el-pasado-marzo/": {"file": "1598244597.html", "date": "2020-05-06"}, "https://www.elsoldesanjuandelrio.com.mx/local/pescadores-gestionaran-crias-de-peces-5807069.html": {"file": "1721965295.html", "date": "2020-09-25"}, "https://www.elsoldemazatlan.com.mx/finanzas/precio-del-petroleo-mexicano-cae-a-un-minimo-de-18-anos-4982125.html": {"file": "1551294044.html", "date": "2020-03-17"}, "http://www.telepinar.cu/licenciados-en-educacion-primaria-en-consolacion-del-sur-fotos-y-video/": {"file": "1670377569.html", "date": "2020-07-23"}, "https://larazon.pe/faenon-de-toledo-y-grana-le-costo-s-1400-millones-al-estado-peruano/": {"file": "1718257353.html", "date": "2020-09-22"}, "https://diariodelsur.com.co/noticias/deportes/f%C3%BAtbol/el-primero-en-hablar-sorprendente-despedida-de-juan-guillerm-647581": {"file": "1789664619.html", "date": "2020-12-06"}, "http://www.soychile.cl/Puerto-Montt/Deportes/2020/08/24/670291/Congresos-y-seminarios-sobre-actividad-fisica-y-salud-se-transmitiran-desde-Puerto-Montt.aspx": {"file": "1691780201.html", "date": "2020-08-24"}, "https://www.lavozdelafrontera.com.mx/gossip/luis-miguel-y-jose-jose-entre-la-musica-que-sono-en-la-pandemia-plataformas-digitales-coronavirus-covid-19-5821245.html": {"file": "1724245424.html", "date": "2020-09-29"}, "http://www.radionacional.com.ar/comunidad-regional-de-calamuchita-rechazo-la-idea-de-una-capsula-turistica/": {"file": "1730863222.html", "date": "2020-10-06"}, "https://boingboing.net/2020/12/12/this-deep-funk-hanukkah-song-is-a-holiday-classic-in-the-making.html": {"file": "1795915731.html", "date": "2020-12-12"}, "https://www.washingtonpost.com/politics/federal-workers-are-returning-to-the-office-some-members-of-congress-say-they-shouldnt-be/2020/07/08/c3d22ec8-c151-11ea-b4f6-cb39cd8940fb_story.html": {"file": "1676872902.html", "date": "2020-07-09"}, "http://www.marketwatch.com/news/story.asp?guid=%7B49E8785A-F1C7-11EA-B8AA-ECF03EAB1839%7D&siteid=rss&rss=1": {"file": "1705296354.html", "date": "2020-09-08"}, "https://abc7ny.com/traffic/penn-station-to-close-overnight-for-cleaning/6144149/": {"file": "1594754655.html", "date": "2020-05-01"}, "http://feeds.mashable.com/~r/Mashable/~3/9SVJRKMUwTI/": {"file": "1526526251.html", "date": "2020-02-20"}, "https://www.seattlepi.com/sports/article/Tiz-the-Law-draws-No-17-post-as-3-5-Kentucky-15530833.php": {"file": "1699121936.html", "date": "2020-09-01"}, "https://twitter.com/Reuters/status/1281836879789404160/photo/1": {"file": "1676646261.html", "date": "2020-07-11"}, "https://kesq.com/news/2020/05/14/mayor-of-coachella-explains-citys-decision-to-continue-requiring-face-coverings/": {"file": "1606865668.html", "date": "2020-05-14"}, "https://tucson.com/news/national/college-football-player-arrested-on-murder-charge-in-georgia/article_c7e4b901-9d60-5895-a288-73911df10bd3.html": {"file": "1725250200.html", "date": "2020-09-30"}, "http://feeds.bizjournals.com/~r/industry_12/~3/_rJ5SC99V8E/after-two-weeks-chef-says-oggies.html": {"file": "1685765130.html", "date": "2020-08-17"}, "https://www.oann.com/protesters-gather-at-paris-theater-to-confront-macron-over-pension-reform/": {"file": "1498311133.html", "date": "2020-01-18"}, "https://timesofindia.indiatimes.com/india/farmers-protests-continue-for-eleventh-day-top-developments/articleshow/79591842.cms": {"file": "1789437552.html", "date": "2020-12-06"}, "https://kdvr.com/news/auroras-violent-crime-rate-ranks-3rd-out-of-colorados-ten-largest-cities/": {"file": "1731257347.html", "date": "2020-10-06"}, "https://www.breakingsoup.com/south-park-characters-fill-empty-seats-at-denver-broncos-games/": {"file": "1732261760.html", "date": "2020-09-28"}, "https://www.hutchnews.com/ZZ/news/20201123/latest-germanys-curevac-signs-contract-for-new-vaccine?rssfeed=true": {"file": "1777898598.html", "date": "2020-11-23"}, "https://www.news18.com/news/business/rbi-prescribes-five-pillared-approach-guard-against-cybersecurity-threats-for-urban-co-op-banks-2906047.html": {"file": "1720285564.html", "date": "2020-09-24"}, "https://www.stuff.co.nz/national/crime/300145150/three-men-charged-for-alleged-bank-card-skimming-at-auckland-hospitals.html": {"file": "1753142980.html", "date": "2020-10-29"}, "https://economictimes.indiatimes.com/markets/stocks/news/share-market-update-psu-bank-shares-gain-canara-bank-rises-1br/articleshow/73540940.cms": {"file": "1502375798.html", "date": "2020-01-23"}, "http://rnanews.com/young-leaders-from-canada-fiji-pakistan-uganda-win-commonwealth-youth-awards-2020/": {"file": "1545738097.html", "date": "2020-03-11"}, "https://www.seattletimes.com/nation-world/the-quiet-hand-of-conservative-groups-in-the-anti-lockdown-protests/": {"file": "1587149671.html", "date": "2020-04-21"}, "https://au.news.yahoo.com/the-two-aussie-covid-measures-that-could-never-work-in-the-us-222249752.html": {"file": "1753036930.html", "date": "2020-10-28"}, "https://www.dailymail.co.uk/sport/football/article-8297417/Man-Utd-ace-Dean-Henderson-morally-right-finish-season-Sheff-Utd-Wilder.html?ns_mchannel=rss&ns_campaign=1490&ito=1490": {"file": "1599860291.html", "date": "2020-05-07"}, "https://www.google.com/imgres?imgurl=https://i.ebayimg.com/images/g/fjoAAOSwyGZaRXKK/s-l300.jpg&imgrefurl=https://www.ebay.com/itm/Retirement-Gift-Ideas-Retired-Definition-Funny-Retirement-Coffee-Mug-Tea-Cup-/132449557566&tbnid=QBu8niz350w2PM&vet=1&docid=CTb7OqAkXHkPUM&w=300&h=265&itg=1&q=retirement+definition&hl=en-US&source=sh/x/im": {"file": "1564637733.html", "date": NaN}, "https://www.inquirer.com/news/nation-world/us-state-department-blocks-lawsuit-by-american-imprisoned-tortured-in-egypt-20200718.html": {"file": "1666031133.html", "date": "2020-07-18"}, "https://www.slobodnaevropa.org/a/30657941.html": {"file": "1641303898.html", "date": "2020-06-22"}, "https://www.laprensalatina.com/uncertain-future-for-britains-essential-workers-after-brexit/": {"file": "1632107283.html", "date": "2020-06-11"}, "https://www.monroenews.com/ZZ/news/20200516/italy-seeks-to-boost-tourism-by-opening-borders-june-3?rssfeed=true": {"file": "1608581255.html", "date": "2020-05-16"}, "https://www.malaymail.com/news/sports/2020/04/08/2022-world-athletics-championships-set-for-july-15-24/1854874": {"file": "1572542522.html", "date": "2020-04-08"}, "http://city.udn.com/67926/6950016?ch=rss_ugccitynewpost": {"file": "1580240354.html", "date": "2020-04-17"}, "https://www.moneycontrol.com/news/business/goldman-sachs-says-india\u2019s-fy21-gdp-may-plummet-tomulti-decade-low16bleakest-forecast-so-far_13654421.html": {"file": "1572317591.html", "date": "2020-04-08"}, "https://wiki.d-addicts.com/index.php?title=Park_Ye_Jin&diff=591343&oldid=588001": {"file": "1571322029.html", "date": "2020-04-07"}, "https://www.forbes.com/sites/marlamilling/2020/05/15/drunkorexia-on-the-rise-among-female-university-students/": {"file": "1608572427.html", "date": "2020-05-15"}, "https://thefrontierpost.com/two-newborns-die-for-want-of-oxygen-at-bhakkar-hospital/": {"file": "1535415671.html", "date": "2020-03-01"}, "https://www.theargus.co.uk/news/18701511.woman-hurt-hit-car-station-street-eastbourne/?ref=rss": {"file": "1703831611.html", "date": "2020-09-06"}, "https://chicago.suntimes.com/2020/6/24/21302329/trump-judges-nominee-federal-senate": {"file": "1643960841.html", "date": "2020-06-24"}, "https://www.dln.com/newcorporations/details/ref_index/438057": {"file": "1687351353.html", "date": NaN}, "https://www.engadget.com/amazon-luxury-stores-fashion-140141502.html": {"file": "1711803974.html", "date": "2020-09-15"}, "https://www.philstar.com/showbiz/2020/11/27/2059828/abs-cbn-nagsalita-na-sa-paglayas-ni-bea": {"file": "1781170231.html", "date": "2020-11-27"}, "https://www.kut.org/post/local-attorney-andy-brown-will-be-democratic-nominee-county-judge": {"file": "1708999888.html", "date": "2020-08-16"}, "https://sanfrancisco.cbslocal.com/2020/09/16/55th-acm-awards-winners-list/": {"file": "1713311346.html", "date": "2020-09-16"}, "https://semissourinews.com/stories/544335905-total-oasdi-disabled-beneficiaries-in-missouri-zip-63848-remains-the-same-in-2019": {"file": "1693504805.html", "date": "2020-08-26"}, "https://zitrod.com/business/we-must-do-more-what-ceos-like-tim-cook-jamie-dimon-larry-fink-say-about-racial-inequality-protests/": {"file": "1627298010.html", "date": "2020-06-01"}, "http://feeds.bizjournals.com/~r/industry_20/~3/Imon3NQaB8c/shutting-down-tampa-bay-construction-during.html": {"file": "1566929327.html", "date": "2020-04-02"}, "http://rssfeeds.usatoday.com/~/620721596/0/usatoday-newstopstories~Hurricanes-in-a-pandemic-Absolutely-thats-our-nightmare-scenario/": {"file": "1566930342.html", "date": "2020-04-02"}, "https://www.recordonline.com/news/20200316/rockland-to-declare-local-state-of-emergency-on-monday?rssfeed=true": {"file": "1549555386.html", "date": "2020-03-16"}, "https://hypixel.net/threads/what-whered-it-go.2675645/": {"file": "1552005464.html", "date": "2020-03-18"}, "http://nationalpost.com/pmn/health-pmn/frances-macron-condemns-unilateral-border-control-measures-over-coronavirus": {"file": "1549542851.html", "date": "2020-03-16"}, "https://www.news18.com/news/india/suspended-aap-councillor-tahir-hussain-arrested-in-delhi-court-over-ib-staffers-murder-2538473.html": {"file": "1549579884.html", "date": "2020-03-16"}, "http://www.asiapacificstar.com/news/263700449/australian-megablaze-brought-under-control": {"file": "1493757507.html", "date": "2020-01-13"}, "https://www.realestate.com.au/news/live-in-your-own-jurassic-park-at-this-multimilliondollar-kenthurst-estate/?rsf=syn:news:nca:news:spa:strap": {"file": "1500330653.html", "date": "2020-01-22"}, "https://carnegieendowment.org/chinafinancialmarkets/79641": {"file": "1487058344.html", "date": "2019-08-06"}, "http://feeds.reuters.com/~r/reuters/businessNews/~3/UjOBluJTi0o/volkswagens-skoda-auto-2019-deliveries-dip-to-1-24-million-cars-due-to-weaker-sales-in-china-idUSKBN1ZC1DA": {"file": "1493638362.html", "date": "2020-01-13"}, "https://www.96fm.ie/": {"file": "1630687900.html", "date": NaN}, "https://www.mirror.co.uk/sport/football/transfer-news/arsenal-set-pierre-emerick-aubameyang-22002407": {"file": "1602142656.html", "date": "2020-05-09"}, "https://www.businesstimes.com.sg/companies-markets/s232m-fair-value-loss-pushes-sph-into-the-red-for-first-time": {"file": "1738268651.html", "date": "2020-10-14"}, "https://nckansasnews.com/stories/567912132-mark-dings-donates-2-800-to-tracey-robert-mann-s-campaign-committee-in-september": {"file": "1793453954.html", "date": "2020-12-08"}, "https://whnt.com/news/don-trump-jr-tests-positive-for-coronavirus/": {"file": "1775625776.html", "date": "2020-11-20"}, "https://www.hindustantimes.com/india-news/odisha-artist-spreads-awareness-on-coronavirus-with-wall-paintings/story-zMoh0EOYcRzfnhXu6NBPnM.html": {"file": "1589455043.html", "date": "2020-04-26"}, "https://www.jstor.org/stable/2669240?origin=crossref": {"file": "1587353145.html", "date": NaN}, "https://azraelsmerryland.blogspot.com/2020/07/consumers-elevate-appeal-to-president.html": {"file": "1653190016.html", "date": "2020-07-05"}, "https://kiow.com/2020/10/26/absentee-ballots-are-slow-to-return/": {"file": "1749876905.html", "date": "2020-10-26"}, "https://www.zimeye.net/2020/03/23/coronavirus-doctors-threaten-to-down-tools-due-to-govt-unpreparedness/": {"file": "1556637845.html", "date": "2020-03-23"}, "https://www.registerguard.com/news/20200413/second-suspect-in-shooting-turns-himself-in?rssfeed=true": {"file": "1577551432.html", "date": "2020-04-13"}, "https://www.thestar.com/news/world/us/2020/03/23/ap-exclusive-allen-has-new-publisher-memoir-out-monday.html": {"file": "1556635225.html", "date": "2020-03-23"}, "https://www.urdupoint.com/en/world/russian-prime-minister-mikhail-mishustin-says-906868.html": {"file": "1592133028.html", "date": "2020-04-29"}, "https://www.couriermail.com.au/news/national/98yearold-wwii-veteran-beats-covid19-receives-ovation-from-hospital-staff/video/ca6ce285879e2291307f3fc8148670aa": {"file": "1588233953.html", "date": "2020-04-24"}, "https://www.eastbaytimes.com/2020/04/24/joe-biden-predicts-trump-will-try-to-delay-elections/": {"file": "1588026728.html", "date": "2020-04-24"}, "https://www.eastbaytimes.com/2020/04/24/coronavirus-how-these-bay-area-travelers-got-stranded-in-bolivia/": {"file": "1588027676.html", "date": "2020-04-24"}, "https://sputniknews.com/radio_the_critical_hour/202004281079127690-some-us-states-begin-lifting-lockdowns-vp-pence-defiantly-tours-clinic-unmasked/": {"file": "1592175395.html", "date": "2020-04-28"}, "https://www.baltimoresun.com/maryland/howard/cng-ho-permits-public-hearing-20200817-hntmbtcnhfgwjlv7vompxm4nmu-story.html#ed=rss_www.baltimoresun.com/arcio/rss/category/latest/": {"file": "1685640081.html", "date": "2020-08-17"}, "https://globalnews.ca/news/6620622/syria-turkey-strikes-conflict/": {"file": "1536545428.html", "date": "2020-03-02"}, "http://rssfeeds.detroitnews.com/~/619980292/0/detroit/home~Men-chased-her-shot-her-at-front-door-now-reward-offered-for-slaying-suspects/": {"file": "1551271741.html", "date": "2020-03-17"}, "https://timesofindia.indiatimes.com/sports/cricket/ipl/live-blog/ipl-2020-live-cricket-score-chennai-super-kings-vs-sunrisers-hyderabad-match-14-dubai/liveblog/78447140.cms": {"file": "1727473717.html", "date": "2020-10-03"}, "https://timesofindia.indiatimes.com/city/bhubaneswar/odisha-reports-first-covid-19-death-72-year-old-man-from-bhubaneswar-dies/articleshow/75026800.cms": {"file": "1571147129.html", "date": "2020-04-07"}, "https://thewest.com.au/business/public-companies/caeneus-charges-up-its-exploration-tool-kit-at-mallina-c-1317422": {"file": "1711399465.html", "date": "2020-09-15"}, "http://optimussearch.com.ph/2020/06/02/no-membership-required-best-and-free-online-dating-websites-in-los-angeles/": {"file": "1677010256.html", "date": "2020-06-02"}, "https://www.businesstoday.in/current/economy-politics/coronavirus-in-bihar-record-749-cases-in-24-hours-patna-other-districts-announce-lockdown-from-july-10/story/409327.html": {"file": "1656725247.html", "date": "2020-07-09"}, "https://theweek.com/speedreads/887020/trump-visited-trumpowned-golf-course-nearly-24-percent-days-2019": {"file": "1485102321.html", "date": "2020-01-02"}, "https://globalnews.ca/news/7278919/kamala-harris-fact-check-us-vice-president/": {"file": "1684405920.html", "date": "2020-08-15"}, "https://www.baltimoresun.com/opinion/columnists/zurawik/bs-ed-zontv-media-year-20201223-cnvrlhkhnrbihcxx6wxcxt2b7y-story.html#ed=rss_www.baltimoresun.com/arcio/rss/category/latest/": {"file": "1805697156.html", "date": "2020-12-23"}, "https://www.sfgate.com/news/article/New-anthology-collects-dozens-of-poems-about-15250468.php": {"file": "1598537220.html", "date": "2020-05-06"}, "https://bizwest.com/2020/03/11/loft-clothing-store-at-twenty-ninth-street-in-boulder-to-close/loft/": {"file": "1545286960.html", "date": "2020-03-11"}, "https://www.news.az/news/azerbaijan-launches-counteroffensive-to-restore-its-territorial-integrity-pakistani-envoy": {"file": "1791336003.html", "date": "2020-10-13"}, "https://upton.wickedlocal.com/news/20200924/battle-in-congress-to-replace-ruth-bader-ginsburg-is-dashing-hopes-for-covid-19-stimulus-package?rssfeed=true": {"file": "1720539463.html", "date": "2020-09-24"}, "http://www.haniotika-nea.gr/ton-epiasan-tin-ora-poy-prospathoyse-na-klepsei-aytokinita/": {"file": "1687581723.html", "date": "2020-08-19"}, "https://www.nzz.ch/international/neue-us-sanktionen-erschuettern-die-syrische-wirtschaft-ld.1560586": {"file": "1635095370.html", "date": "2020-06-15"}, "https://www.infranken.de/ueberregional/boulevard/kultur/tatort-aus-muenchen-30-jahre-leitmayr-und-batic-art-5139057": {"file": "1808293409.html", "date": "2020-12-27"}, "https://www.presseportal.de/blaulicht/pm/43526/4791887": {"file": "1798244877.html", "date": "2020-12-15"}, "https://sn.dk/Erhverv/Forstaerket-haab-om-737-MAX-erstatning-loefter-Norwegian-aktie/artikel/900368?rss": {"file": "1484494308.html", "date": "2020-01-02"}, "https://sn.dk/Danmark/Coronavirus-rammer-trafikken-i-Danmark/artikel/922379?rss": {"file": "1544178583.html", "date": "2020-03-10"}, "https://www.novinky.cz/vase-zpravy/clanek/janovicka-knihovna-zve-na-vystavu-o-nebezpecnem-zivobyti-prevadecu-na-sumave-40311673": {"file": "1509819104.html", "date": "2020-01-30"}, "https://www.elvallenc.cat/societat/vallsconfinat-continua-amb-mes-novetats/": {"file": "1556626357.html", "date": "2020-03-23"}, "https://bn.wikipedia.org/w/index.php?title=Jim_Higgs&diff=3986629&oldid=0": {"file": "1522651525.html", "date": "2020-02-18"}, "https://www.actualno.com/haskovo/obshtina-haskovo-osiguri-komputri-i-tableti-na-deca-v-socialni-obshtejitija-news_1509983.html": {"file": "1740081165.html", "date": "2020-10-15"}, "https://vratza.com/obshtina-b-vratsa-b-specheli-proekt-za-izgrazhdaneto-na-dopalnitelen-korpus-na/": {"file": "1766087391.html", "date": "2020-11-10"}, "https://www.youm7.com/story/2020/8/24/\u0648\u0632\u064a\u0631-\u0627\u0644\u0631\u0649-\u064a\u0634\u0647\u062f-\u062a\u0648\u0642\u064a\u0639-\u0639\u0642\u062f-\u062f\u0631\u0627\u0633\u0629-\u062a\u062d\u062f\u064a\u062f-\u0627\u0644\u0633\u062d\u0628-\u0627\u0644\u0622\u0645\u0646-\u0644\u0644\u062e\u0632\u0627\u0646\u0627\u062a/4943459": {"file": "1691196862.html", "date": "2020-08-24"}, "https://www.alyaum.com/articles/6291787/\u0627\u0644\u0642\u0627\u0631\u0627\u062a-\u0627\u0644\u0633\u0628\u0639/\u0637\u0647\u0631\u0627\u0646-\u062a\u062f\u0641\u0646-\u0632\u0627\u062f\u0629-\u0648\u062a\u062a\u0647\u0645-\u0627\u0644\u0645\u0639\u0627\u0631\u0636\u0629-\u0627\u0644\u0625\u064a\u0631\u0627\u0646\u064a\u0629-\u0628\u0627\u063a\u062a\u064a\u0627\u0644\u0647": {"file": "1784190729.html", "date": "2020-12-01"}, "https://www.albayan.ae/across-the-uae/news-and-reports/2020-06-03-1.3874341": {"file": "1623715966.html", "date": "2020-06-03"}, "https://akhbarelyom.com/news/newdetails/3126898/1/36-\u0639\u0627\u0645\u064b\u0627..-\u0633\u0631-\u0631\u062d\u064a\u0644-\u0646\u0639\u064a\u0645\u0629-\u0639\u0627\u0643\u0641-\u0641\u064a-\u0633\u0646-\u0645\u0628\u0643\u0631": {"file": "1731695567.html", "date": "2020-10-07"}, "https://www.almadenahnews.com/article/825076-%D8%A3%D9%85%D9%8A%D8%B1%D9%83%D8%A7-50-%D8%A7%D9%84%D9%81%D8%A7-%D8%AD%D8%B5%D9%8A%D9%84%D8%A9-%D8%A7%D9%84%D9%88%D9%81%D9%8A%D8%A7%D8%AA-%D8%A8%D8%B3%D8%A8%D8%A8-%D9%81%D9%8A%D8%B1%D9%88%D8%B3-%D9%83%D9%88%D8%B1%D9%88%D9%86%D8%A7": {"file": "1588107198.html", "date": "2020-04-24"}, "https://arabic.sputniknews.com/arab_world/202003171044893040-%D9%85%D8%B5%D8%B1-%D8%AA%D8%B3%D8%AC%D9%84-30-%D8%A5%D8%B5%D8%A7%D8%A8%D8%A9-%D8%AC%D8%AF%D9%8A%D8%AF%D8%A9-%D9%88%D8%AD%D8%A7%D9%84%D8%AA%D9%8A-%D9%88%D9%81%D8%A7%D8%A9-%D8%A8%D9%81%D9%8A%D8%B1%D9%88%D8%B3-%D9%83%D9%88%D8%B1%D9%88%D9%86%D8%A7/": {"file": "1550981615.html", "date": "2020-03-17"}, "https://elbaladtv.net/%d8%aa%d8%b1%d9%83%d9%89-%d8%a2%d9%84-%d8%a7%d9%84%d8%b4%d9%8a%d8%ae-%d8%a8%d8%b9%d8%af-%d8%a5%d8%b5%d8%a7%d8%a8%d8%a9-%d9%8a%d8%b3%d8%b1%d8%a7-%d8%a8%d9%83%d9%88%d8%b1%d9%88%d9%86%d8%a7-%d9%8a%d8%a7/": {"file": "1806793639.html", "date": "2020-12-25"}, "https://www.beirutobserver.com/2020/11/2338761/": {"file": "1764731404.html", "date": "2020-11-09"}, "https://money.udn.com/money/story/5603/4909425": {"file": "1728970162.html", "date": "2020-10-04"}, "http://www.upmedia.mg/news_info.php?SerialNo=83141": {"file": "1546021647.html", "date": "2020-03-12"}, "https://news.ltn.com.tw/news/business/breakingnews/3119452": {"file": "1564888577.html", "date": "2020-04-01"}, "https://news.sina.com.tw/article/20200121/34046328.html": {"file": "1500260110.html", "date": "2020-01-21"}} \ No newline at end of file diff --git a/tests/evaluation.py b/tests/evaluation.py index 394125dd..7ecb5fb1 100644 --- a/tests/evaluation.py +++ b/tests/evaluation.py @@ -21,12 +21,11 @@ from articleDateExtractor import extractArticlePublishedDate from date_guesser import guess_date from goose3 import Goose - from newspaper import Article - from newspaper.article import ArticleDownloadState + from newspaper import Article, parsers from newsplease import NewsPlease except ImportError: extractArticlePublishedDate = guess_date = Goose = None - Article = ArticleDownloadState = NewsPlease = None + Article = parsers = NewsPlease = None TEST_DIR = os.path.abspath(os.path.dirname(__file__)) @@ -82,24 +81,38 @@ def run_htmldate_fast(htmlstring): def run_newspaper(htmlstring): - """try with the newspaper module""" - # throws error on the eval_default dataset + """try with the newspaper module (newspaper4k) + + Only the publication date is needed, so we run newspaper's cheap metadata + pass (``get_publishing_date``) and stop before ``calculate_best_node`` and + the rest of ``parse()``. This skips the per-language NLP tokenizers (much + faster, and avoids the optional language-data dependencies) and yields the + exact same date. Note: feeding the HTML via ``download(input_html=...)`` is + the correct newspaper4k entry point -- the older ``Article(html)`` hack + raised ``UnicodeEncodeError`` on non-ASCII pages, silently counting them as + misses. ``extractor.get_publishing_date`` is semi-internal: pin newspaper4k. + """ try: - myarticle = Article(htmlstring) - myarticle.html = htmlstring - myarticle.download_state = ArticleDownloadState.SUCCESS - myarticle.parse() + article = Article(url="") + article.download(input_html=htmlstring) + article.doc = parsers.fromstring(article.html) + if article.doc is None: + return None + publish_date = article.extractor.get_publishing_date(article.url, article.doc) except (UnicodeDecodeError, UnicodeEncodeError): return None - if myarticle.publish_date is None or myarticle.publish_date == "": - return None - return str(myarticle.publish_date)[0:10] + return str(publish_date)[0:10] if publish_date else None def run_newsplease(htmlstring): - """try with newsplease""" + """try with newsplease + + ``fetch_images=False`` skips image downloading/processing (and uses the + no-images newspaper extractor internally); the publication date is + unaffected and the call is ~2.5x faster. + """ try: - article = NewsPlease.from_html(htmlstring, url=None) + article = NewsPlease.from_html(htmlstring, url=None, fetch_images=False) if article.date_publish is None: return None return convert_date(article.date_publish, "%Y-%m-%d %H:%M:%S", "%Y-%m-%d") diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 92b37d32..cd472a6c 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -153,12 +153,13 @@ def test_sanity(): assert is_valid_format("ABC") is False assert is_valid_format(123) is False assert is_valid_format(("a", "b")) is False - _, discarded = discard_unwanted( + tree = discard_unwanted( html.fromstring( '
000
AAA
' ) ) - assert len(discarded) == 1 + assert tree.find('.//div[@id="wm-ipp"]') is None # archive.org banner removed + assert "AAA" in tree.text_content() # real content kept # reset caches: examine_date_elements used above old_values = try_date_expr.cache_info() reset_caches()