From 3128e3019238da5f2a2f486241e937f8849ffc0f Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 13:59:04 -0500 Subject: [PATCH 01/11] Migrate build system from setup.py to pyproject.toml - Create pyproject.toml with setuptools build backend, project metadata, dependencies (nltk, breadability; docopt removed for argparse migration), optional deps (LSA, LexRank, dev), console scripts, and tool configs (pytest, ruff, mypy) - Update classifiers to Python 3.10-3.13 and Development Status 4 - Beta - Remove version-suffixed entry points (sumy-X.Y, sumy_eval-X.Y) - Delete setup.py, setup.cfg, and MANIFEST.in - Update Makefile with modern pytest target, remove old publish/bump targets - Update .gitignore with patterns for build artifacts, testing, linting, and type checking caches --- .gitignore | 21 ++++++++-- MANIFEST.in | 4 -- Makefile | 17 ++------ pyproject.toml | 86 ++++++++++++++++++++++++++++++++++++++++ setup.cfg | 14 ------- setup.py | 105 ------------------------------------------------- 6 files changed, 107 insertions(+), 140 deletions(-) delete mode 100644 MANIFEST.in create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore index dc8535c9..1685a3a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,27 @@ # Python __pycache__/ *.py[co] -*.egg-info +*.egg-info/ +*.egg -# tests +# Build artifacts +dist/ +build/ +*.whl + +# Testing .coverage +.pytest_cache/ +.venv/ +venv/ +*.pyc +*.pyo + +# Linting/Type checking +.mypy_cache/ +.ruff_cache/ -# working folders +# Working folders /experiments/ # IDE files diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 5fb97913..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include README.rst -include LICENSE.rst -include CHANGELOG.rst -recursive-include sumy/data * diff --git a/Makefile b/Makefile index f697f4f1..3bbf5c93 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,9 @@ PYTHON=python 
-VERSION=patch - -.PHONY=test publish bump clean +.PHONY=test clean test: - py.test-2.6 && py.test-3.2 && py.test-2.7 && py.test-3.3 && py.test-3.4 - -publish: test - ${PYTHON} setup.py register sdist bdist_wheel - twine upload dist/* - -bump: test - bumpversion ${VERSION} --config-file setup.cfg - git rm .bumpversion.cfg - git commit --amend + pytest clean: - rm -rf .bumpversion.cfg .coverage dist build + rm -rf .coverage dist build *.egg-info diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..9e79ea32 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,86 @@ +[build-system] +requires = ["setuptools>=68.0", "setuptools-scm"] +build-backend = "setuptools.build_meta" + +[project] +name = "sumy" +version = "0.3.0" +description = "Module for automatic summarization of text documents and HTML pages." +authors = [{name = "Michal Belica", email = "miso.belica@gmail.com"}] +license = {text = "Apache License, Version 2.0"} +readme = "README.rst" +requires-python = ">=3.10" +keywords = [ + "data mining", + "automatic summarization", + "data reduction", + "web-data extraction", + "NLP", + "natural language processing", + "latent semantic analysis", + "LSA", + "TextRank", + "LexRank", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: Czech", + "Natural Language :: Slovak", + "Natural Language :: English", + "Natural Language :: German", + "Natural Language :: French", + "Topic :: Education", + "Topic :: Internet", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Text Processing :: Filters", + "Topic :: Text Processing :: Linguistic", + "Topic :: Text Processing :: Markup :: HTML", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python 
:: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", +] +dependencies = [ + "nltk>=3.9.1", + "breadability>=0.1.20", +] + +[project.optional-dependencies] +LSA = ["numpy"] +LexRank = ["numpy"] +dev = ["pytest>=8.0", "pytest-cov>=5.0", "pytest-xdist", "ruff", "mypy"] + +[project.scripts] +sumy = "sumy.__main__:main" +sumy_eval = "sumy.evaluation.__main__:main" + +[project.urls] +Homepage = "https://github.com/miso-belica/sumy" + +[tool.setuptools.packages.find] +include = ["sumy*"] + +[tool.setuptools.package-data] +sumy = ["data/stopwords/*.txt"] + +[tool.pytest.ini_options] +addopts = "--quiet --tb=short --color=yes --cov=sumy --cov-report=term-missing --no-cov-on-fail" + +[tool.ruff] +line-length = 160 +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "W", "I"] + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 9b070636..00000000 --- a/setup.cfg +++ /dev/null @@ -1,14 +0,0 @@ -[pytest] -addopts = --quiet --tb=short --color=yes --cov=sumy --cov-report=term-missing --no-cov-on-fail - -[pep8] -max-line-length = 160 - -[wheel] -universal=1 - -[bumpversion] -current_version=0.3.0 -files=setup.py setup.cfg sumy/__init__.py CHANGELOG.rst -commit=True -tag=True diff --git a/setup.py b/setup.py deleted file mode 100644 index 84f76805..00000000 --- a/setup.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- - -try: - from setuptools import setup -except ImportError: - from distutils.core import setup - -import sys - - -VERSION_SUFFIX = "%d.%d" % sys.version_info[:2] - - -with open("README.rst") as readme: - with open("CHANGELOG.rst") as changelog: - long_description = readme.read() + "\n\n" + changelog.read() - - -setup( - name="sumy", - version="0.3.0", - description="Module for automatic summarization of text documents and HTML 
pages.", - long_description=long_description, - author="Michal Belica", - author_email="miso.belica@gmail.com", - url="https://github.com/miso-belica/sumy", - license="Apache License, Version 2.0", - keywords=[ - "data mining", - "automatic summarization", - "data reduction", - "web-data extraction", - "NLP", - "natural language processing", - "latent semantic analysis", - "LSA", - "TextRank", - "LexRank", - ], - install_requires=[ - "docopt>=0.6.1,<0.7", - "breadability>=0.1.20", - "nltk>=3.0.2", - ], - tests_require=[ - "pytest", - "pytest-cov", - "pytest-watch", - ], - extras_require={ - "LSA": ["numpy"], - "LexRank": ["numpy"], - }, - packages=[ - "sumy", - "sumy.evaluation", - "sumy.models", - "sumy.models.dom", - "sumy.nlp", - "sumy.nlp.stemmers", - "sumy.parsers", - "sumy.summarizers", - ], - package_data={"sumy": [ - "data/stopwords/*.txt", - ]}, - entry_points={ - "console_scripts": [ - "sumy = sumy.__main__:main", - "sumy-%s = sumy.__main__:main" % VERSION_SUFFIX, - "sumy_eval = sumy.evaluation.__main__:main", - "sumy_eval-%s = sumy.evaluation.__main__:main" % VERSION_SUFFIX, - ] - }, - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "License :: OSI Approved :: Apache Software License", - - "Natural Language :: Czech", - "Natural Language :: Slovak", - "Natural Language :: English", - "Natural Language :: German", - "Natural Language :: French", - - "Topic :: Education", - "Topic :: Internet", - "Topic :: Scientific/Engineering :: Information Analysis", - "Topic :: Text Processing :: Filters", - "Topic :: Text Processing :: Linguistic", - "Topic :: Text Processing :: Markup :: HTML", - - "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - 
"Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: Implementation :: CPython", - ], -) From cb3c5c9e3b1374b014c3ab035f1bfb9a53504c20 Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:13:19 -0500 Subject: [PATCH 02/11] Remove Python 2 compatibility layer and modernize all source files - Delete sumy/_compat.py (Python 2/3 compatibility shim) - Remove all '# -*- coding: utf8 -*-' lines from every source file - Remove all 'from __future__ import' lines from every source file - Replace _compat imports with Python 3 native equivalents: - to_unicode/to_string -> str() or remove where unnecessary - unicode -> str - string_types -> str - unicode_compatible decorator -> direct __str__ method - ffilter -> itertools.filterfalse - Counter -> collections.Counter - urllib -> urllib.request - PY3 branching -> direct Python 3 code - Modernize class syntax: class Foo(object) -> class Foo - Modernize super calls: super(Class, self) -> super() - Convert % string formatting to f-strings where appropriate - Update collections.Sequence -> collections.abc.Sequence in models/tf.py - Keep custom cached_property (required for __slots__ classes) with explanatory comment --- sumy/__init__.py | 6 -- sumy/__main__.py | 28 +++---- sumy/_compat.py | 109 ------------------------- sumy/evaluation/__init__.py | 8 +- sumy/evaluation/__main__.py | 27 +++--- sumy/evaluation/content_based.py | 9 +- sumy/evaluation/coselection.py | 6 -- sumy/evaluation/rouge.py | 60 ++++++-------- sumy/models/__init__.py | 6 -- sumy/models/dom/__init__.py | 5 -- sumy/models/dom/_document.py | 13 +-- sumy/models/dom/_paragraph.py | 16 +--- sumy/models/dom/_sentence.py | 18 +--- sumy/models/tf.py | 21 ++--- sumy/nlp/stemmers/__init__.py | 13 +-- sumy/nlp/stemmers/czech.py | 14 +--- sumy/nlp/tokenizers.py | 19 ++--- sumy/parsers/__init__.py | 5 -- sumy/parsers/html.py | 10 +-- sumy/parsers/parser.py | 8 +- 
sumy/parsers/plaintext.py | 10 +-- sumy/summarizers/__init__.py | 5 -- sumy/summarizers/_summarizer.py | 11 +-- sumy/summarizers/edmundson.py | 7 +- sumy/summarizers/edmundson_cue.py | 7 +- sumy/summarizers/edmundson_key.py | 9 +- sumy/summarizers/edmundson_location.py | 12 +-- sumy/summarizers/edmundson_title.py | 12 +-- sumy/summarizers/kl.py | 19 ++--- sumy/summarizers/lex_rank.py | 11 +-- sumy/summarizers/lsa.py | 9 +- sumy/summarizers/luhn.py | 5 -- sumy/summarizers/random.py | 5 -- sumy/summarizers/sum_basic.py | 20 ++--- sumy/summarizers/text_rank.py | 5 -- sumy/utils.py | 26 +++--- 36 files changed, 131 insertions(+), 443 deletions(-) delete mode 100644 sumy/_compat.py diff --git a/sumy/__init__.py b/sumy/__init__.py index 1ca3a0b2..d5506b54 100644 --- a/sumy/__init__.py +++ b/sumy/__init__.py @@ -1,8 +1,2 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - - __author__ = "Michal Belica" __version__ = "0.3.0" diff --git a/sumy/__main__.py b/sumy/__main__.py index 07f1bb31..e9078203 100644 --- a/sumy/__main__.py +++ b/sumy/__main__.py @@ -1,5 +1,3 @@ -# -*- coding: utf8 -*- - """ Sumy - automatic text summarizer. @@ -24,15 +22,13 @@ """ -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import sys +from urllib import request as urllib_request + from docopt import docopt from . 
import __version__ from .utils import ItemsCount, get_stop_words, read_stop_words -from ._compat import urllib, to_string, to_unicode, to_bytes, PY3 from .nlp.tokenizers import Tokenizer from .parsers.html import HtmlParser from .parsers.plaintext import PlaintextParser @@ -46,7 +42,7 @@ from .nlp.stemmers import Stemmer HEADERS = { - "User-Agent": "Sumy (Automatic text summarizer) Version/%s" % __version__, + "User-Agent": f"Sumy (Automatic text summarizer) Version/{__version__}", } PARSERS = { "html": HtmlParser, @@ -65,14 +61,11 @@ def main(args=None): - args = docopt(to_string(__doc__), args, version=__version__) + args = docopt(__doc__, args, version=__version__) summarizer, parser, items_count = handle_arguments(args) for sentence in summarizer(parser.document, items_count): - if PY3: - print(to_unicode(sentence)) - else: - print(to_bytes(sentence)) + print(str(sentence)) return 0 @@ -80,15 +73,14 @@ def main(args=None): def handle_arguments(args, default_input_stream=sys.stdin): document_format = args['--format'] if document_format is not None and document_format not in PARSERS: - raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % ( - ", ".join(PARSERS.keys()), - document_format, - )) + raise ValueError( + f"Unsupported format of input document. Possible values are: " + f"{', '.join(PARSERS.keys())}. 
Given: {document_format}.") if args["--url"] is not None: parser = PARSERS[document_format or "html"] - request = urllib.Request(args["--url"], headers=HEADERS) - input_stream = urllib.urlopen(request) + request = urllib_request.Request(args["--url"], headers=HEADERS) + input_stream = urllib_request.urlopen(request) elif args["--file"] is not None: parser = PARSERS[document_format or "plaintext"] input_stream = open(args["--file"], "rb") diff --git a/sumy/_compat.py b/sumy/_compat.py deleted file mode 100644 index 742204cf..00000000 --- a/sumy/_compat.py +++ /dev/null @@ -1,109 +0,0 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - -from sys import version_info - - -PY3 = version_info[0] == 3 - - -if PY3: - bytes = bytes - unicode = str -else: - bytes = str - unicode = unicode -string_types = (bytes, unicode,) - - -try: - import urllib2 as urllib -except ImportError: - from urllib import request as urllib - - -try: - from itertools import ifilterfalse as ffilter -except ImportError: - from itertools import filterfalse as ffilter - - -try: - from collections import Counter -except ImportError: - # Python < 2.7 - from itertools import groupby - - def Counter(iterable): - iterable = sorted(iterable) - return dict((key, len(tuple(group))) for key, group in groupby(iterable)) - - -def unicode_compatible(cls): - """ - Decorator for unicode compatible classes. Method ``__unicode__`` - has to be implemented to work decorator as expected. 
- """ - if PY3: - cls.__str__ = cls.__unicode__ - cls.__bytes__ = lambda self: self.__str__().encode("utf8") - else: - cls.__str__ = lambda self: self.__unicode__().encode("utf8") - - return cls - - -def to_string(object): - return to_unicode(object) if PY3 else to_bytes(object) - - -def to_bytes(object): - if isinstance(object, bytes): - return object - elif isinstance(object, unicode): - return object.encode("utf8") - else: - # try encode instance to bytes - return instance_to_bytes(object) - - -def to_unicode(object): - if isinstance(object, unicode): - return object - elif isinstance(object, bytes): - return object.decode("utf8") - else: - # try decode instance to unicode - return instance_to_unicode(object) - - -def instance_to_bytes(instance): - if PY3: - if hasattr(instance, "__bytes__"): - return bytes(instance) - elif hasattr(instance, "__str__"): - return unicode(instance).encode("utf8") - else: - if hasattr(instance, "__str__"): - return bytes(instance) - elif hasattr(instance, "__unicode__"): - return unicode(instance).encode("utf8") - - return to_bytes(repr(instance)) - - -def instance_to_unicode(instance): - if PY3: - if hasattr(instance, "__str__"): - return unicode(instance) - elif hasattr(instance, "__bytes__"): - return bytes(instance).decode("utf8") - else: - if hasattr(instance, "__unicode__"): - return unicode(instance) - elif hasattr(instance, "__str__"): - return bytes(instance).decode("utf8") - - return to_unicode(repr(instance)) diff --git a/sumy/evaluation/__init__.py b/sumy/evaluation/__init__.py index a60fe9c1..32e98567 100644 --- a/sumy/evaluation/__init__.py +++ b/sumy/evaluation/__init__.py @@ -1,9 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - - from .coselection import f_score, precision, recall from .content_based import cosine_similarity, unit_overlap -from .rouge import rouge_n, rouge_1, rouge_2, rouge_l_sentence_level, 
rouge_l_summary_level +from .rouge import rouge_n, rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level diff --git a/sumy/evaluation/__main__.py b/sumy/evaluation/__main__.py index 871be3fd..1d326524 100644 --- a/sumy/evaluation/__main__.py +++ b/sumy/evaluation/__main__.py @@ -1,5 +1,3 @@ -# -*- coding: utf8 -*- - """ Sumy - evaluation of automatic text summary. @@ -23,17 +21,15 @@ """ -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import sys from itertools import chain +from urllib import request as urllib_request + from docopt import docopt from .. import __version__ from ..utils import ItemsCount, get_stop_words from ..models import TfDocumentModel -from .._compat import urllib, to_string from ..nlp.tokenizers import Tokenizer from ..parsers.html import HtmlParser from ..parsers.plaintext import PlaintextParser @@ -47,11 +43,11 @@ from ..summarizers.kl import KLSummarizer from ..nlp.stemmers import Stemmer from . import precision, recall, f_score, cosine_similarity, unit_overlap -from . import rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level +from . 
import rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level HEADERS = { - "User-Agent": "Sumy (Automatic text summarizer) Version/%s" % __version__, + "User-Agent": f"Sumy (Automatic text summarizer) Version/{__version__}", } PARSERS = { "html": HtmlParser, @@ -159,7 +155,7 @@ def evaluate_unit_overlap(evaluated_sentences, reference_sentences): def main(args=None): - args = docopt(to_string(__doc__), args, version=__version__) + args = docopt(__doc__, args, version=__version__) summarizer, document, items_count, reference_summary = handle_arguments(args) evaluated_sentences = summarizer(document, items_count) @@ -172,24 +168,23 @@ def main(args=None): result = evaluate(evaluated_sentences, document.sentences) else: result = evaluate(evaluated_sentences, reference_sentences) - print("%s: %f" % (name, result)) + print(f"{name}: {result:f}") def handle_arguments(args): document_format = args["--format"] if document_format is not None and document_format not in PARSERS: - raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % ( - ", ".join(PARSERS.keys()), - document_format, - )) + raise ValueError( + f"Unsupported format of input document. Possible values are: " + f"{', '.join(PARSERS.keys())}. 
Given: {document_format}.") parser = PARSERS["plaintext"] input_stream = sys.stdin if args["--url"] is not None: parser = PARSERS["html"] - request = urllib.Request(args["--url"], headers=HEADERS) - input_stream = urllib.urlopen(request) + request = urllib_request.Request(args["--url"], headers=HEADERS) + input_stream = urllib_request.urlopen(request) elif args["--file"] is not None: parser = PARSERS.get(document_format, PlaintextParser) input_stream = open(args["--file"], "rb") diff --git a/sumy/evaluation/content_based.py b/sumy/evaluation/content_based.py index f8e0ac74..a8e130f0 100644 --- a/sumy/evaluation/content_based.py +++ b/sumy/evaluation/content_based.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from ..models import TfDocumentModel as TfModel @@ -27,8 +22,8 @@ def cosine_similarity(evaluated_model, reference_model): denominator = evaluated_model.magnitude * reference_model.magnitude if denominator == 0.0: - raise ValueError("Document model can't be empty. Given %r & %r" % ( - evaluated_model, reference_model)) + raise ValueError( + f"Document model can't be empty. Given {evaluated_model!r} & {reference_model!r}") return numerator / denominator diff --git a/sumy/evaluation/coselection.py b/sumy/evaluation/coselection.py index f785b375..b7c84170 100644 --- a/sumy/evaluation/coselection.py +++ b/sumy/evaluation/coselection.py @@ -1,9 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - - def f_score(evaluated_sentences, reference_sentences, weight=1.0): """ Computation of F-Score measure. 
It is computed as diff --git a/sumy/evaluation/rouge.py b/sumy/evaluation/rouge.py index 9f2c50db..56add83b 100644 --- a/sumy/evaluation/rouge.py +++ b/sumy/evaluation/rouge.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from ..models.dom import Sentence @@ -18,7 +13,7 @@ def _get_ngrams(n, text): def _split_into_words(sentences): fullTextWords = [] for s in sentences: - if not isinstance(s, Sentence): + if not isinstance(s, Sentence): raise (ValueError("Object in collection must be of type Sentence")) fullTextWords.extend(s.words) return fullTextWords @@ -41,7 +36,7 @@ def _len_lcs(x, y): Returns the length of the Longest Common Subsequence between sequences x and y. Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - + :param x: sequence of words :param y: sequence of words :returns integer: Length of LCS between x and y @@ -105,17 +100,17 @@ def rouge_n(evaluated_sentences, reference_sentences, n=2): Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ papers/rouge-working-note-v1.3.1.pdf - :param evaluated_sentences: + :param evaluated_sentences: The sentences that have been picked by the summarizer :param reference_sentences: The sentences from the referene set :param n: Size of ngram. Defaults to 2. - :returns: + :returns: float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means exactly the same. :raises ValueError: raises exception if a param has len <= 0 """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: raise (ValueError("Collections must contain at least 1 sentence.")) evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) @@ -133,11 +128,11 @@ def rouge_1(evaluated_sentences, reference_sentences): ''' Rouge-N where N=1. This is a commonly used metric. 
- :param evaluated_sentences: + :param evaluated_sentences: The sentences that have been picked by the summarizer :param reference_sentences: The sentences from the referene set - :returns: + :returns: float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means exactly the same. ''' @@ -148,11 +143,11 @@ def rouge_2(evaluated_sentences, reference_sentences): ''' Rouge-N where N=2. This is a commonly used metric. - :param evaluated_sentences: + :param evaluated_sentences: The sentences that have been picked by the summarizer :param reference_sentences: The sentences from the referene set - :returns: + :returns: float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means exactly the same. ''' @@ -164,16 +159,16 @@ def _f_lcs(llcs, m, n): Computes the LCS-based F-measure score Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf - + :param llcs: Length of LCS - :param m: number of words in reference summary + :param m: number of words in reference summary :param n: number of words in candidate summary :returns float: LCS-based F-measure score ''' r_lcs = llcs / m p_lcs = llcs / n beta = p_lcs / r_lcs - num = (1 + (beta ** 2)) * r_lcs * p_lcs + num = (1 + (beta ** 2)) * r_lcs * p_lcs denom = r_lcs + ((beta ** 2) * p_lcs) return num / denom @@ -183,7 +178,7 @@ def rouge_l_sentence_level(evaluated_sentences, reference_sentences): Computes ROUGE-L (sentence level) of two text collections of sentences. 
http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf - + Calculated according to: R_lcs = LCS(X,Y)/m P_lcs = LCS(X,Y)/n @@ -195,14 +190,14 @@ def rouge_l_sentence_level(evaluated_sentences, reference_sentences): m = length of reference summary n = length of candidate summary - :param evaluated_sentences: + :param evaluated_sentences: The sentences that have been picked by the summarizer :param reference_sentences: The sentences from the referene set :returns float: F_lcs :raises ValueError: raises exception if a param has len <= 0 """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: raise (ValueError("Collections must contain at least 1 sentence.")) reference_words = _split_into_words(reference_sentences) evaluated_words = _split_into_words(evaluated_sentences) @@ -214,27 +209,27 @@ def rouge_l_sentence_level(evaluated_sentences, reference_sentences): def _union_lcs(evaluated_sentences, reference_sentence): ''' - Returns LCS_u(r_i, C) which is the LCS score of the union longest common subsequence - between reference sentence ri and candidate summary C. For example, if - r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and - c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is - “w1 w2” and the longest common subsequence of r_i and c2 is “w1 w3 w5”. The - union longest common subsequence of r_i, c1, and c2 is “w1 w2 w3 w5” and + Returns LCS_u(r_i, C) which is the LCS score of the union longest common subsequence + between reference sentence ri and candidate summary C. For example, if + r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and + c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is + "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". 
The + union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and LCS_u(r_i, C) = 4/5. - :param evaluated_sentences: + :param evaluated_sentences: The sentences that have been picked by the summarizer :param reference_sentence: One of the sentences in the reference summaries :returns float: LCS_u(r_i, C) :raises ValueError: raises exception if a param has len <= 0 ''' - if len(evaluated_sentences) <= 0: + if len(evaluated_sentences) <= 0: raise (ValueError("Collections must contain at least 1 sentence.")) lcs_union = set() reference_words = _split_into_words([reference_sentence]) - combined_lcs_length = 0 + combined_lcs_length = 0 for eval_s in evaluated_sentences: evaluated_words = _split_into_words([eval_s]) lcs = set(_recon_lcs(reference_words, evaluated_words)) @@ -264,19 +259,19 @@ def rouge_l_summary_level(evaluated_sentences, reference_sentences): m = number of words in reference summary n = number of words in candidate summary - :param evaluated_sentences: + :param evaluated_sentences: The sentences that have been picked by the summarizer :param reference_sentences: The sentences from the referene set :returns float: F_lcs :raises ValueError: raises exception if a param has len <= 0 """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: raise (ValueError("Collections must contain at least 1 sentence.")) # total number of words in reference sentences m = len(_split_into_words(reference_sentences)) - + # total number of words in evaluated sentences n = len(_split_into_words(evaluated_sentences)) @@ -284,4 +279,3 @@ def rouge_l_summary_level(evaluated_sentences, reference_sentences): for ref_s in reference_sentences: union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, ref_s) return _f_lcs(union_lcs_sum_across_all_references, m, n) - diff --git a/sumy/models/__init__.py b/sumy/models/__init__.py index f1584535..3c80f326 100644 --- 
a/sumy/models/__init__.py +++ b/sumy/models/__init__.py @@ -1,7 +1 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - - from .tf import TfDocumentModel diff --git a/sumy/models/dom/__init__.py b/sumy/models/dom/__init__.py index 5bfe104b..a68f9ff0 100644 --- a/sumy/models/dom/__init__.py +++ b/sumy/models/dom/__init__.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from ._document import ObjectDocumentModel from ._paragraph import Paragraph from ._sentence import Sentence diff --git a/sumy/models/dom/_document.py b/sumy/models/dom/_document.py index 4420b945..e011965c 100644 --- a/sumy/models/dom/_document.py +++ b/sumy/models/dom/_document.py @@ -1,15 +1,8 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from itertools import chain from ...utils import cached_property -from ..._compat import unicode_compatible -@unicode_compatible -class ObjectDocumentModel(object): +class ObjectDocumentModel: def __init__(self, paragraphs): self._paragraphs = tuple(paragraphs) @@ -32,8 +25,8 @@ def words(self): words = (p.words for p in self._paragraphs) return tuple(chain(*words)) - def __unicode__(self): - return "" % len(self.paragraphs) + def __str__(self): + return f"" def __repr__(self): return self.__str__() diff --git a/sumy/models/dom/_paragraph.py b/sumy/models/dom/_paragraph.py index 8cab505b..ab58f41b 100644 --- a/sumy/models/dom/_paragraph.py +++ b/sumy/models/dom/_paragraph.py @@ -1,16 +1,9 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from itertools import chain -from ..._compat import unicode_compatible from ...utils import cached_property from ._sentence import Sentence -@unicode_compatible 
-class Paragraph(object): +class Paragraph: __slots__ = ( "_sentences", "_cached_property_sentences", @@ -38,11 +31,8 @@ def headings(self): def words(self): return tuple(chain(*(s.words for s in self._sentences))) - def __unicode__(self): - return "" % ( - len(self.headings), - len(self.sentences), - ) + def __str__(self): + return f"" def __repr__(self): return self.__str__() diff --git a/sumy/models/dom/_sentence.py b/sumy/models/dom/_sentence.py index 6d2ea405..a8962959 100644 --- a/sumy/models/dom/_sentence.py +++ b/sumy/models/dom/_sentence.py @@ -1,18 +1,11 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from ...utils import cached_property -from ..._compat import to_unicode, to_string, unicode_compatible -@unicode_compatible -class Sentence(object): +class Sentence: __slots__ = ("_text", "_cached_property_words", "_tokenizer", "_is_heading",) def __init__(self, text, tokenizer, is_heading=False): - self._text = to_unicode(text).strip() + self._text = str(text).strip() self._tokenizer = tokenizer self._is_heading = bool(is_heading) @@ -34,11 +27,8 @@ def __ne__(self, sentence): def __hash__(self): return hash((self._is_heading, self._text)) - def __unicode__(self): + def __str__(self): return self._text def __repr__(self): - return to_string("<%s: %s>") % ( - "Heading" if self._is_heading else "Sentence", - self.__str__() - ) + return f"<{'Heading' if self._is_heading else 'Sentence'}: {self.__str__()}>" diff --git a/sumy/models/tf.py b/sumy/models/tf.py index 6fd913b7..b399da94 100644 --- a/sumy/models/tf.py +++ b/sumy/models/tf.py @@ -1,28 +1,23 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import math from pprint import pformat -from collections import Sequence -from .._compat import to_unicode, unicode, string_types, Counter +from collections import Counter +from 
collections.abc import Sequence -class TfDocumentModel(object): +class TfDocumentModel: """Term-Frequency document model (term = word).""" def __init__(self, words, tokenizer=None): - if isinstance(words, string_types) and tokenizer is None: + if isinstance(words, str) and tokenizer is None: raise ValueError( "Tokenizer has to be given if ``words`` is not a sequence.") - elif isinstance(words, string_types): - words = tokenizer.to_words(to_unicode(words)) + elif isinstance(words, str): + words = tokenizer.to_words(str(words)) elif not isinstance(words, Sequence): raise ValueError( "Parameter ``words`` has to be sequence or string with tokenizer given.") - self._terms = Counter(map(unicode.lower, words)) + self._terms = Counter(map(str.lower, words)) self._max_frequency = max(self._terms.values()) if self._terms else 1 @property @@ -85,4 +80,4 @@ def normalized_term_frequency(self, term, smooth=0.0): return smooth + (1.0 - smooth)*frequency def __repr__(self): - return "" % pformat(self._terms) + return f"" diff --git a/sumy/nlp/stemmers/__init__.py b/sumy/nlp/stemmers/__init__.py index 621d01a4..3a2f7fc5 100644 --- a/sumy/nlp/stemmers/__init__.py +++ b/sumy/nlp/stemmers/__init__.py @@ -1,21 +1,14 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import nltk.stem.snowball as nltk_stemmers_module from .czech import stem_word as czech_stemmer -from ..._compat import to_unicode - def null_stemmer(object): "Converts given object to unicode with lower letters." - return to_unicode(object).lower() + return str(object).lower() -class Stemmer(object): +class Stemmer: def __init__(self, language): self._stemmer = null_stemmer if language.lower() in ('czech', 'slovak'): @@ -25,7 +18,7 @@ def __init__(self, language): try: stemmer_class = getattr(nltk_stemmers_module, stemmer_classname) except AttributeError: - raise LookupError("Stemmer is not available for language %s." 
% language) + raise LookupError(f"Stemmer is not available for language {language}.") self._stemmer = stemmer_class().stem def __call__(self, word): diff --git a/sumy/nlp/stemmers/czech.py b/sumy/nlp/stemmers/czech.py index d3be1720..5ec3d337 100644 --- a/sumy/nlp/stemmers/czech.py +++ b/sumy/nlp/stemmers/czech.py @@ -1,5 +1,3 @@ -# -*- coding: utf8 -*- - """ Czech stemmer Copyright © 2010 Luís Gomes . @@ -11,21 +9,17 @@ czech_stemmer.py light|aggressive """ -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import re import sys from warnings import warn -from ..._compat import unicode WORD_PATTERN = re.compile(r"^\w+$", re.UNICODE) def stem_word(word, aggressive=False): - if not isinstance(word, unicode): + if not isinstance(word, str): word = word.decode("utf8") if not WORD_PATTERN.match(word): @@ -196,9 +190,9 @@ def _palatalize(word): if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ("light", "aggressive"): - sys.exit(__doc__.encode("utf8")) + sys.exit(__doc__) aggressive_stemming = bool(sys.argv[1] == "aggressive") for line in sys.stdin: - words = tuple(w.decode("utf8") + " " + stem_word(w, aggressive_stemming) for w in line.split()) - print(*map(lambda s: s.encode("utf8"), words)) + words = tuple(f"{w} {stem_word(w, aggressive_stemming)}" for w in line.split()) + print(*words) diff --git a/sumy/nlp/tokenizers.py b/sumy/nlp/tokenizers.py index 77aacef6..a7d1e073 100644 --- a/sumy/nlp/tokenizers.py +++ b/sumy/nlp/tokenizers.py @@ -1,15 +1,8 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import re import nltk -from .._compat import to_string, to_unicode, unicode - -class Tokenizer(object): +class Tokenizer: """Language dependent tokenizer of text document.""" _WORD_PATTERN = re.compile(r"^[^\W\d_]+$", re.UNICODE) @@ -17,7 +10,7 @@ class Tokenizer(object): LANGUAGE_ALIASES = { "slovak": "czech", 
} - + # improve tokenizer by adding specific abbreviations it has issues with # note the final point in these items must not be included LANGUAGE_EXTRA_ABREVS = { @@ -36,17 +29,17 @@ def language(self): return self._language def _sentence_tokenizer(self, language): - path = to_string("tokenizers/punkt/%s.pickle") % to_string(language) + path = f"tokenizers/punkt/{language}.pickle" return nltk.data.load(path) def to_sentences(self, paragraph): extra_abbreviations = self.LANGUAGE_EXTRA_ABREVS.get(self._language, []) self._sentence_tokenizer._params.abbrev_types.update(extra_abbreviations) - sentences = self._sentence_tokenizer.tokenize(to_unicode(paragraph)) - return tuple(map(unicode.strip, sentences)) + sentences = self._sentence_tokenizer.tokenize(str(paragraph)) + return tuple(map(str.strip, sentences)) def to_words(self, sentence): - words = nltk.word_tokenize(to_unicode(sentence)) + words = nltk.word_tokenize(str(sentence)) return tuple(filter(self._is_word, words)) def _is_word(self, word): diff --git a/sumy/parsers/__init__.py b/sumy/parsers/__init__.py index 90bc3d44..28ceeeb7 100644 --- a/sumy/parsers/__init__.py +++ b/sumy/parsers/__init__.py @@ -1,6 +1 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from .parser import DocumentParser diff --git a/sumy/parsers/html.py b/sumy/parsers/html.py index a139e3e0..2aa00350 100644 --- a/sumy/parsers/html.py +++ b/sumy/parsers/html.py @@ -1,10 +1,6 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +from urllib import request as urllib_request from breadability.readable import Article -from .._compat import urllib from ..utils import cached_property from ..models.dom import Sentence, Paragraph, ObjectDocumentModel from .parser import DocumentParser @@ -32,14 +28,14 @@ def from_file(cls, file_path, url, tokenizer): @classmethod def from_url(cls, 
url, tokenizer): - response = urllib.urlopen(url) + response = urllib_request.urlopen(url) data = response.read() response.close() return cls(data, tokenizer, url) def __init__(self, html_content, tokenizer, url=None): - super(HtmlParser, self).__init__(tokenizer) + super().__init__(tokenizer) self._article = Article(html_content, url) @cached_property diff --git a/sumy/parsers/parser.py b/sumy/parsers/parser.py index fd6037c5..447c3ff7 100644 --- a/sumy/parsers/parser.py +++ b/sumy/parsers/parser.py @@ -1,10 +1,4 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - - -class DocumentParser(object): +class DocumentParser: """Abstract parser of input format into DOM.""" SIGNIFICANT_WORDS = ( diff --git a/sumy/parsers/plaintext.py b/sumy/parsers/plaintext.py index e6822e26..d436f09a 100644 --- a/sumy/parsers/plaintext.py +++ b/sumy/parsers/plaintext.py @@ -1,9 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - -from .._compat import to_unicode from ..utils import cached_property from ..models.dom import Sentence, Paragraph, ObjectDocumentModel from .parser import DocumentParser @@ -20,8 +14,8 @@ def from_file(cls, file_path, tokenizer): return cls(file.read(), tokenizer) def __init__(self, text, tokenizer): - super(PlaintextParser, self).__init__(tokenizer) - self._text = to_unicode(text).strip() + super().__init__(tokenizer) + self._text = str(text).strip() @cached_property def significant_words(self): diff --git a/sumy/summarizers/__init__.py b/sumy/summarizers/__init__.py index 4e0750e1..a1bcb2a9 100644 --- a/sumy/summarizers/__init__.py +++ b/sumy/summarizers/__init__.py @@ -1,6 +1 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from ._summarizer import AbstractSummarizer diff --git 
a/sumy/summarizers/_summarizer.py b/sumy/summarizers/_summarizer.py index e15b497c..f3f4737f 100644 --- a/sumy/summarizers/_summarizer.py +++ b/sumy/summarizers/_summarizer.py @@ -1,20 +1,13 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - - from collections import namedtuple from operator import attrgetter from ..utils import ItemsCount -from .._compat import to_unicode from ..nlp.stemmers import null_stemmer SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",)) -class AbstractSummarizer(object): +class AbstractSummarizer: def __init__(self, stemmer=null_stemmer): if not callable(stemmer): raise ValueError("Stemmer has to be a callable object") @@ -28,7 +21,7 @@ def stem_word(self, word): return self._stemmer(self.normalize_word(word)) def normalize_word(self, word): - return to_unicode(word).lower() + return str(word).lower() def _get_best_sentences(self, sentences, count, rating, *args, **kwargs): rate = rating diff --git a/sumy/summarizers/edmundson.py b/sumy/summarizers/edmundson.py index dad164ac..61eec5b7 100644 --- a/sumy/summarizers/edmundson.py +++ b/sumy/summarizers/edmundson.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from collections import defaultdict from ..nlp.stemmers import null_stemmer from ._summarizer import AbstractSummarizer @@ -22,7 +17,7 @@ class EdmundsonSummarizer(AbstractSummarizer): def __init__(self, stemmer=null_stemmer, cue_weight=1.0, key_weight=0.0, title_weight=1.0, location_weight=1.0): - super(EdmundsonSummarizer, self).__init__(stemmer) + super().__init__(stemmer) self._ensure_correct_weights(cue_weight, key_weight, title_weight, location_weight) diff --git a/sumy/summarizers/edmundson_cue.py b/sumy/summarizers/edmundson_cue.py index 103d8641..c124bfde 100644 --- a/sumy/summarizers/edmundson_cue.py +++ 
b/sumy/summarizers/edmundson_cue.py @@ -1,14 +1,9 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from ._summarizer import AbstractSummarizer class EdmundsonCueMethod(AbstractSummarizer): def __init__(self, stemmer, bonus_words, stigma_words): - super(EdmundsonCueMethod, self).__init__(stemmer) + super().__init__(stemmer) self._bonus_words = bonus_words self._stigma_words = stigma_words diff --git a/sumy/summarizers/edmundson_key.py b/sumy/summarizers/edmundson_key.py index 9f1cb3aa..23ff282b 100644 --- a/sumy/summarizers/edmundson_key.py +++ b/sumy/summarizers/edmundson_key.py @@ -1,15 +1,10 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - -from .._compat import Counter +from collections import Counter from ._summarizer import AbstractSummarizer class EdmundsonKeyMethod(AbstractSummarizer): def __init__(self, stemmer, bonus_words): - super(EdmundsonKeyMethod, self).__init__(stemmer) + super().__init__(stemmer) self._bonus_words = bonus_words def __call__(self, document, sentences_count, weight): diff --git a/sumy/summarizers/edmundson_location.py b/sumy/summarizers/edmundson_location.py index 406597f5..ff6c439b 100644 --- a/sumy/summarizers/edmundson_location.py +++ b/sumy/summarizers/edmundson_location.py @@ -1,17 +1,11 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - -from itertools import chain +from itertools import chain, filterfalse from operator import attrgetter -from .._compat import ffilter from ._summarizer import AbstractSummarizer class EdmundsonLocationMethod(AbstractSummarizer): def __init__(self, stemmer, null_words): - super(EdmundsonLocationMethod, self).__init__(stemmer) + super().__init__(stemmer) self._null_words = null_words def __call__(self, document, sentences_count, w_h, 
w_p1, w_p2, w_s1, w_s2): @@ -26,7 +20,7 @@ def _compute_significant_words(self, document): significant_words = chain(*map(attrgetter("words"), headings)) significant_words = map(self.stem_word, significant_words) - significant_words = ffilter(self._is_null_word, significant_words) + significant_words = filterfalse(self._is_null_word, significant_words) return frozenset(significant_words) diff --git a/sumy/summarizers/edmundson_title.py b/sumy/summarizers/edmundson_title.py index c4779200..fbdfb57a 100644 --- a/sumy/summarizers/edmundson_title.py +++ b/sumy/summarizers/edmundson_title.py @@ -1,17 +1,11 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from operator import attrgetter -from itertools import chain -from .._compat import ffilter +from itertools import chain, filterfalse from ._summarizer import AbstractSummarizer class EdmundsonTitleMethod(AbstractSummarizer): def __init__(self, stemmer, null_words): - super(EdmundsonTitleMethod, self).__init__(stemmer) + super().__init__(stemmer) self._null_words = null_words def __call__(self, document, sentences_count): @@ -26,7 +20,7 @@ def _compute_significant_words(self, document): significant_words = chain(*heading_words) significant_words = map(self.stem_word, significant_words) - significant_words = ffilter(self._is_null_word, significant_words) + significant_words = filterfalse(self._is_null_word, significant_words) return frozenset(significant_words) diff --git a/sumy/summarizers/kl.py b/sumy/summarizers/kl.py index f6f5a01d..38e42976 100644 --- a/sumy/summarizers/kl.py +++ b/sumy/summarizers/kl.py @@ -1,7 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals import math from ._summarizer import AbstractSummarizer @@ -10,7 +6,7 @@ class KLSummarizer(AbstractSummarizer): """ - Method that greedily adds sentences to a summary so long as it 
decreases the + Method that greedily adds sentences to a summary so long as it decreases the KL Divergence. Source: http://www.aclweb.org/anthology/N09-1041 """ @@ -29,7 +25,7 @@ def _get_all_words_in_doc(self, sentences): return [w for s in sentences for w in s.words] def _get_content_words_in_sentence(self, sentence): - normalized_words = self._normalize_words(sentence.words) + normalized_words = self._normalize_words(sentence.words) normalized_content_words = self._filter_out_stop_words(normalized_words) return normalized_content_words @@ -50,7 +46,7 @@ def _get_all_content_words_in_doc(self, sentences): content_words = self._filter_out_stop_words(all_words) normalized_content_words = self._normalize_words(content_words) return normalized_content_words - + def _compute_tf(self, sentences): ''' Computes the normalized term frequency as explained in http://www.tfidf.com/ @@ -74,7 +70,7 @@ def _joint_freq(self, word_list_1, word_list_2): # adds in the counts of the second list for k in wc2: - if k in joint: + if k in joint: joint[k] += wc2[k] else: joint[k] = wc2[k] @@ -111,15 +107,15 @@ def _compute_ratings(self, sentences): # get all content words once for efficiency sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences] - + # Removes one sentence per iteration by adding to summary while len(sentences_list) > 0: # will store all the kls values for this pass kls = [] - + # converts summary to word list summary_as_word_list = self._get_all_words_in_doc(summary) - + for s in sentences_as_words: # calculates the joint frequency through combining the word lists joint_freq = self._joint_freq(s, summary_as_word_list) @@ -137,4 +133,3 @@ def _compute_ratings(self, sentences): ratings[best_sentence] = -1 * len(ratings) return ratings - diff --git a/sumy/summarizers/lex_rank.py b/sumy/summarizers/lex_rank.py index a4bd436f..c6966b67 100644 --- a/sumy/summarizers/lex_rank.py +++ b/sumy/summarizers/lex_rank.py @@ -1,9 +1,5 @@ -# -*- coding: utf8 -*- 
- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import math +from collections import Counter try: import numpy @@ -11,7 +7,6 @@ numpy = None from ._summarizer import AbstractSummarizer -from .._compat import Counter class LexRankSummarizer(AbstractSummarizer): @@ -87,9 +82,9 @@ def _compute_idf(sentences): def _create_matrix(self, sentences, threshold, tf_metrics, idf_metrics): """ - Creates matrix of shape |sentences|×|sentences|. + Creates matrix of shape |sentences|x|sentences|. """ - # create matrix |sentences|×|sentences| filled with zeroes + # create matrix |sentences|x|sentences| filled with zeroes sentences_count = len(sentences) matrix = numpy.zeros((sentences_count, sentences_count)) degrees = numpy.zeros((sentences_count, )) diff --git a/sumy/summarizers/lsa.py b/sumy/summarizers/lsa.py index 2a29a90a..f9ec6d60 100644 --- a/sumy/summarizers/lsa.py +++ b/sumy/summarizers/lsa.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import math from warnings import warn @@ -61,7 +56,7 @@ def _create_dictionary(self, document): def _create_matrix(self, document, dictionary): """ - Creates matrix of shape |unique words|×|sentences| where cells + Creates matrix of shape |unique words|x|sentences| where cells contains number of occurences of words (rows) in senteces (cols). 
""" sentences = document.sentences @@ -75,7 +70,7 @@ def _create_matrix(self, document, dictionary): ) warn(message % (words_count, sentences_count)) - # create matrix |unique words|×|sentences| filled with zeroes + # create matrix |unique words|x|sentences| filled with zeroes matrix = numpy.zeros((words_count, sentences_count)) for col, sentence in enumerate(sentences): for word in map(self.stem_word, sentence.words): diff --git a/sumy/summarizers/luhn.py b/sumy/summarizers/luhn.py index e16750eb..99a209c3 100644 --- a/sumy/summarizers/luhn.py +++ b/sumy/summarizers/luhn.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - from ..models import TfDocumentModel from ._summarizer import AbstractSummarizer diff --git a/sumy/summarizers/random.py b/sumy/summarizers/random.py index badb4390..d4d43187 100644 --- a/sumy/summarizers/random.py +++ b/sumy/summarizers/random.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import random from ._summarizer import AbstractSummarizer diff --git a/sumy/summarizers/sum_basic.py b/sumy/summarizers/sum_basic.py index fce3d81c..0916054f 100644 --- a/sumy/summarizers/sum_basic.py +++ b/sumy/summarizers/sum_basic.py @@ -1,7 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals import math from ._summarizer import AbstractSummarizer @@ -10,7 +6,7 @@ class SumBasicSummarizer(AbstractSummarizer): """ - SumBasic: a frequency-based summarization system that adjusts word frequencies as + SumBasic: a frequency-based summarization system that adjusts word frequencies as sentences are extracted. 
Source: http://www.cis.upenn.edu/~nenkova/papers/ipm.pdf @@ -25,7 +21,7 @@ def _get_all_words_in_doc(self, sentences): return [w for s in sentences for w in s.words] def _get_content_words_in_sentence(self, sentence): - normalized_words = self._normalize_words(sentence.words) + normalized_words = self._normalize_words(sentence.words) normalized_content_words = self._filter_out_stop_words(normalized_words) return normalized_content_words @@ -63,7 +59,7 @@ def _compute_average_probability_of_words(self, word_freq_in_doc, content_words_ word_freq_sum = sum([word_freq_in_doc[w] for w in content_words_in_sentence]) word_freq_avg = word_freq_sum / content_words_count return word_freq_avg - else: + else: return 0 def _update_tf(self, word_freq, words_to_update): @@ -71,29 +67,27 @@ def _update_tf(self, word_freq, words_to_update): word_freq[w] *= word_freq[w] return word_freq - def _find_index_of_best_sentence(self, word_freq, sentences_as_words): min_possible_freq = -1 max_value = min_possible_freq best_sentence_index = 0 for i, words in enumerate(sentences_as_words): word_freq_avg = self._compute_average_probability_of_words(word_freq, words) - if (word_freq_avg > max_value): + if (word_freq_avg > max_value): max_value = word_freq_avg best_sentence_index = i return best_sentence_index - def _compute_ratings(self, sentences): word_freq = self._compute_tf(sentences) ratings = {} - + # make it a list so that it can be modified sentences_list = list(sentences) # get all content words once for efficiency sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences] - + # Removes one sentence per iteration by adding to summary while len(sentences_list) > 0: best_sentence_index = self._find_index_of_best_sentence(word_freq, sentences_as_words) @@ -106,4 +100,4 @@ def _compute_ratings(self, sentences): best_sentence_words = sentences_as_words.pop(best_sentence_index) self._update_tf(word_freq, best_sentence_words) - return ratings \ No newline at end of file 
+ return ratings diff --git a/sumy/summarizers/text_rank.py b/sumy/summarizers/text_rank.py index 488bd24e..66fd905c 100644 --- a/sumy/summarizers/text_rank.py +++ b/sumy/summarizers/text_rank.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import math from itertools import combinations diff --git a/sumy/utils.py b/sumy/utils.py index bd6cdef1..d1bb7f21 100644 --- a/sumy/utils.py +++ b/sumy/utils.py @@ -1,13 +1,7 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import sys from functools import wraps from os.path import dirname, abspath, join, exists -from ._compat import to_string, to_unicode, string_types def cached_property(getter): @@ -15,6 +9,10 @@ def cached_property(getter): Decorator that converts a method into memoized property. The decorator works as expected only for classes with attribute '__dict__' and immutable properties. + + NOTE: We keep this custom implementation instead of using + functools.cached_property because functools.cached_property + does NOT work with __slots__-based classes (Sentence, Paragraph, etc.). """ @wraps(getter) def decorator(self): @@ -31,27 +29,27 @@ def decorator(self): def expand_resource_path(path): directory = dirname(sys.modules["sumy"].__file__) directory = abspath(directory) - return join(directory, to_string("data"), to_string(path)) + return join(directory, "data", str(path)) def get_stop_words(language): - path = expand_resource_path("stopwords/%s.txt" % language) + path = expand_resource_path(f"stopwords/{language}.txt") if not exists(path): - raise LookupError("Stop-words are not available for language %s." 
% language) + raise LookupError(f"Stop-words are not available for language {language}.") return read_stop_words(path) def read_stop_words(filename): with open(filename, "rb") as open_file: - return frozenset(to_unicode(w.rstrip()) for w in open_file.readlines()) + return frozenset(w.decode("utf-8").rstrip() for w in open_file.readlines()) -class ItemsCount(object): +class ItemsCount: def __init__(self, value): self._value = value def __call__(self, sequence): - if isinstance(self._value, string_types): + if isinstance(self._value, str): if self._value.endswith("%"): total_count = len(sequence) percentage = int(self._value[:-1]) @@ -63,7 +61,7 @@ def __call__(self, sequence): elif isinstance(self._value, (int, float)): return sequence[:int(self._value)] else: - ValueError("Unsuported value of items count '%s'." % self._value) + ValueError(f"Unsuported value of items count '{self._value}'.") def __repr__(self): - return to_string("" % self._value) + return f"" From 8b47c6c82fddb327d68f6c2ef235cc0f35b3e32b Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:14:48 -0500 Subject: [PATCH 03/11] Update NLTK tokenizer to use punkt_tab resource path Replace deprecated tokenizers/punkt/{lang}.pickle path with the new tokenizers/punkt_tab/{lang}.pickle path required by NLTK 3.9+. The _params.abbrev_types API remains compatible. 
--- sumy/nlp/tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sumy/nlp/tokenizers.py b/sumy/nlp/tokenizers.py index a7d1e073..3fcafd0d 100644 --- a/sumy/nlp/tokenizers.py +++ b/sumy/nlp/tokenizers.py @@ -29,7 +29,7 @@ def language(self): return self._language def _sentence_tokenizer(self, language): - path = f"tokenizers/punkt/{language}.pickle" + path = f"tokenizers/punkt_tab/{language}.pickle" return nltk.data.load(path) def to_sentences(self, paragraph): From c8e0c960bced7f6b80358d3c83dbebbb4780711a Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:16:53 -0500 Subject: [PATCH 04/11] Migrate both CLI entry points from docopt to argparse Replace docopt with argparse in sumy/__main__.py and sumy/evaluation/__main__.py. Both CLIs now use a single positional argument with choices for summarizer selection, preserving the existing `sumy <method> [options]` interface. Key changes: - Remove docopt imports and docstring-as-usage-spec blocks - Add argparse.ArgumentParser with explicit argument definitions - Access parsed args as namespace attributes instead of dict keys - Look up summarizer via AVAILABLE_METHODS[args.method] instead of iterating and checking boolean flags - Evaluation CLI includes 'random' in method choices and accepts a positional reference_summary argument --- sumy/__main__.py | 100 ++++++++++++++++++++++-------------- sumy/evaluation/__main__.py | 100 +++++++++++++++++++++--------------- 2 files changed, 122 insertions(+), 78 deletions(-) diff --git a/sumy/__main__.py b/sumy/__main__.py index e9078203..68815e9a 100644 --- a/sumy/__main__.py +++ b/sumy/__main__.py @@ -1,32 +1,10 @@ -""" -Sumy - automatic text summarizer.
- -Usage: - sumy (luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] [--stopwords=] [--format=] - sumy (luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] [--stopwords=] [--format=] --url= - sumy (luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] [--stopwords=] [--format=] --file= - sumy --version - sumy --help - -Options: - --length= Length of summarized text. It may be count of sentences - or percentage of input text. [default: 20%] - --language= Natural language of summarized text. [default: english] - --stopwords= Path to a file containing a list of stopwords. One word per line in UTF-8 encoding. - If it's not provided default list of stop-words is used according to chosen language. - --format= Format of input document. Possible values: html, plaintext - --url= URL address of the web page to summarize. - --file= Path to the text file to summarize. - --version Displays current application version. - --help Displays this text. - -""" +"""Sumy - automatic text summarizer.""" +import argparse import sys from urllib import request as urllib_request -from docopt import docopt from . import __version__ from .utils import ItemsCount, get_stop_words, read_stop_words from .nlp.tokenizers import Tokenizer @@ -61,38 +39,84 @@ def main(args=None): - args = docopt(__doc__, args, version=__version__) - summarizer, parser, items_count = handle_arguments(args) - - for sentence in summarizer(parser.document, items_count): + parser = argparse.ArgumentParser( + prog="sumy", + description="Automatic text summarizer.", + ) + parser.add_argument( + "method", + choices=["luhn", "edmundson", "lsa", "text-rank", "lex-rank", "sum-basic", "kl"], + help="Summarization method to use.", + ) + parser.add_argument( + "--length", + default="20%", + help="Length of summarized text. It may be count of sentences or percentage of input text. 
(default: 20%%)", + ) + parser.add_argument( + "--language", + default="english", + help="Natural language of summarized text. (default: english)", + ) + parser.add_argument( + "--stopwords", + help="Path to a file containing a list of stopwords. One word per line in UTF-8 encoding. " + "If not provided, the default list of stop-words is used according to the chosen language.", + ) + parser.add_argument( + "--format", + choices=["html", "plaintext"], + default=None, + dest="format", + help="Format of input document. Possible values: html, plaintext.", + ) + parser.add_argument( + "--url", + help="URL address of the web page to summarize.", + ) + parser.add_argument( + "--file", + dest="file", + help="Path to the text file to summarize.", + ) + parser.add_argument( + "--version", + action="version", + version=f"%(prog)s {__version__}", + ) + + parsed_args = parser.parse_args(args) + summarizer, document_parser, items_count = handle_arguments(parsed_args) + + for sentence in summarizer(document_parser.document, items_count): print(str(sentence)) return 0 def handle_arguments(args, default_input_stream=sys.stdin): - document_format = args['--format'] + document_format = args.format if document_format is not None and document_format not in PARSERS: raise ValueError( f"Unsupported format of input document. Possible values are: " f"{', '.join(PARSERS.keys())}. 
Given: {document_format}.") - if args["--url"] is not None: + if args.url is not None: parser = PARSERS[document_format or "html"] - request = urllib_request.Request(args["--url"], headers=HEADERS) + request = urllib_request.Request(args.url, headers=HEADERS) input_stream = urllib_request.urlopen(request) - elif args["--file"] is not None: + elif args.file is not None: parser = PARSERS[document_format or "plaintext"] - input_stream = open(args["--file"], "rb") + input_stream = open(args.file, "rb") else: parser = PARSERS[document_format or "plaintext"] input_stream = default_input_stream - items_count = ItemsCount(args["--length"]) + items_count = ItemsCount(args.length) - language = args["--language"] - if args['--stopwords']: - stop_words = read_stop_words(args['--stopwords']) + language = args.language + if args.stopwords: + stop_words = read_stop_words(args.stopwords) else: stop_words = get_stop_words(language) @@ -102,7 +126,7 @@ def handle_arguments(args, default_input_stream=sys.stdin): stemmer = Stemmer(language) - summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items() if args[name]) + summarizer_class = AVAILABLE_METHODS[args.method] summarizer = build_summarizer(summarizer_class, stop_words, stemmer, parser) return summarizer, parser, items_count diff --git a/sumy/evaluation/__main__.py b/sumy/evaluation/__main__.py index 1d326524..7a19170d 100644 --- a/sumy/evaluation/__main__.py +++ b/sumy/evaluation/__main__.py @@ -1,32 +1,11 @@ -""" -Sumy - evaluation of automatic text summary. - -Usage: - sumy_eval (random | luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] - sumy_eval (random | luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] --url= - sumy_eval (random | luhn | edmundson | lsa | text-rank | lex-rank | sum-basic | kl) [--length=] [--language=] --file= --format= - sumy_eval --version - sumy_eval --help - -Options: - Path to the file with reference summary. 
- --url= URL address of summarizied message. - --file= Path to file with summarizied text. - --format= Format of input file. [default: plaintext] - --length= Length of summarizied text. It may be count of sentences - or percentage of input text. [default: 20%] - --language= Natural language of summarizied text. [default: english] - --version Displays version of application. - --help Displays this text. - -""" +"""Sumy - evaluation of automatic text summary.""" +import argparse import sys from itertools import chain from urllib import request as urllib_request -from docopt import docopt from .. import __version__ from ..utils import ItemsCount, get_stop_words from ..models import TfDocumentModel @@ -155,12 +134,57 @@ def evaluate_unit_overlap(evaluated_sentences, reference_sentences): def main(args=None): - args = docopt(__doc__, args, version=__version__) - summarizer, document, items_count, reference_summary = handle_arguments(args) + parser = argparse.ArgumentParser( + prog="sumy_eval", + description="Evaluation of automatic text summary.", + ) + parser.add_argument( + "method", + choices=["random", "luhn", "edmundson", "lsa", "text-rank", "lex-rank", "sum-basic", "kl"], + help="Summarization method to evaluate.", + ) + parser.add_argument( + "reference_summary", + help="Path to the file with reference summary.", + ) + parser.add_argument( + "--length", + default="20%", + help="Length of summarized text. It may be count of sentences or percentage of input text. (default: 20%%)", + ) + parser.add_argument( + "--language", + default="english", + help="Natural language of summarized text. (default: english)", + ) + parser.add_argument( + "--url", + help="URL address of the web page to summarize.", + ) + parser.add_argument( + "--file", + dest="file", + help="Path to the text file to summarize.", + ) + parser.add_argument( + "--format", + choices=["html", "plaintext"], + default="plaintext", + dest="format", + help="Format of input document. 
Possible values: html, plaintext. (default: plaintext)", + ) + parser.add_argument( + "--version", + action="version", + version=f"%(prog)s {__version__}", + ) + + parsed_args = parser.parse_args(args) + summarizer, document, items_count, reference_summary = handle_arguments(parsed_args) evaluated_sentences = summarizer(document, items_count) reference_document = PlaintextParser.from_string(reference_summary, - Tokenizer(args["--language"])) + Tokenizer(parsed_args.language)) reference_sentences = reference_document.document.sentences for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS: @@ -172,7 +196,7 @@ def main(args=None): def handle_arguments(args): - document_format = args["--format"] + document_format = args.format if document_format is not None and document_format not in PARSERS: raise ValueError( f"Unsupported format of input document. Possible values are: " @@ -181,30 +205,26 @@ def handle_arguments(args): parser = PARSERS["plaintext"] input_stream = sys.stdin - if args["--url"] is not None: + if args.url is not None: parser = PARSERS["html"] - request = urllib_request.Request(args["--url"], headers=HEADERS) + request = urllib_request.Request(args.url, headers=HEADERS) input_stream = urllib_request.urlopen(request) - elif args["--file"] is not None: + elif args.file is not None: parser = PARSERS.get(document_format, PlaintextParser) - input_stream = open(args["--file"], "rb") + input_stream = open(args.file, "rb") - summarizer_builder = AVAILABLE_METHODS["luhn"] - for method, builder in AVAILABLE_METHODS.items(): - if args[method]: - summarizer_builder = builder - break + summarizer_builder = AVAILABLE_METHODS[args.method] - items_count = ItemsCount(args["--length"]) + items_count = ItemsCount(args.length) - parser = parser(input_stream.read(), Tokenizer(args["--language"])) + parser = parser(input_stream.read(), Tokenizer(args.language)) if input_stream is not sys.stdin: input_stream.close() - with open(args[""], "rb") as file: + with 
open(args.reference_summary, "rb") as file: reference_summmary = file.read().decode("utf8") - return summarizer_builder(parser, args["--language"]), parser.document, items_count, reference_summmary + return summarizer_builder(parser, args.language), parser.document, items_count, reference_summmary if __name__ == "__main__": From f6c97c5c78d8e87e3bbb1bf70a82490d532393ef Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:18:45 -0500 Subject: [PATCH 05/11] Fix PlaintextParser to properly decode bytes input When files are opened in binary mode ("rb"), input_stream.read() returns bytes. The old to_unicode() from _compat handled this, but the modernized str(text) produces "b'...'" instead of decoding. Add explicit bytes decoding in PlaintextParser.__init__. --- sumy/parsers/plaintext.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sumy/parsers/plaintext.py b/sumy/parsers/plaintext.py index d436f09a..1b4b74b3 100644 --- a/sumy/parsers/plaintext.py +++ b/sumy/parsers/plaintext.py @@ -15,7 +15,9 @@ def from_file(cls, file_path, tokenizer): def __init__(self, text, tokenizer): super().__init__(tokenizer) - self._text = str(text).strip() + if isinstance(text, bytes): + text = text.decode("utf-8") + self._text = text.strip() @cached_property def significant_words(self): From 980948587beca51579617ba8bc64058649cb2849 Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:25:48 -0500 Subject: [PATCH 06/11] Update all test files for modernized codebase - Delete test_compat.py and test_unicode_compatible_class.py (tested deleted _compat module) - Rewrite test_main.py to test argparse-based CLI instead of docopt - Update tests/utils.py: remove _compat imports, use plain str and bytes decode - Remove # -*- coding: utf8 -*- and from __future__ imports from all test files - Replace all to_unicode() calls with str() across 9 test files - Remove all sumy._compat imports from test files --- 
tests/test_evaluation.py | 5 - tests/test_main.py | 121 ++++++++++-------- tests/test_models/test_dom.py | 14 +- tests/test_models/test_tf.py | 5 - tests/test_parsers.py | 14 +- tests/test_stemmers.py | 5 - tests/test_summarizers/test_edmundson.py | 88 ++++++------- tests/test_summarizers/test_kl.py | 6 - tests/test_summarizers/test_lex_rank.py | 5 - tests/test_summarizers/test_lsa.py | 12 +- tests/test_summarizers/test_luhn.py | 40 +++--- tests/test_summarizers/test_random.py | 14 +- tests/test_summarizers/test_sum_basic.py | 6 - tests/test_summarizers/test_text_rank.py | 12 +- tests/test_tokenizers.py | 5 - tests/test_utils/test_compat.py | 87 ------------- .../test_unicode_compatible_class.py | 55 -------- tests/test_utils/test_utils.py | 5 - tests/utils.py | 16 +-- 19 files changed, 150 insertions(+), 365 deletions(-) delete mode 100644 tests/test_utils/test_compat.py delete mode 100644 tests/test_utils/test_unicode_compatible_class.py diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 2dc8e836..94e04d71 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.nlp.tokenizers import Tokenizer diff --git a/tests/test_main.py b/tests/test_main.py index 27161147..6dd60d8c 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,55 +1,76 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest +import argparse +import sys +import pytest -from docopt import docopt, DocoptExit -from sumy.__main__ import __doc__ as main_doc -from sumy.__main__ import handle_arguments, to_string, __version__ -from .utils import StringIO +from sumy.__main__ import main, handle_arguments, AVAILABLE_METHODS +from sumy import __version__ +from io import StringIO class 
TestMain(unittest.TestCase): - DEFAULT_ARGS = { - '--file': None, - '--format': None, - '--help': False, - '--language': 'english', - '--length': '20%', - '--stopwords': None, - '--url': None, - '--version': False, - 'edmundson': False, - 'lex-rank': False, - 'lsa': True, - 'luhn': False, - 'text-rank': False, - 'sum-basic': False, - 'kl': False, - } - - def test_ok_args(self): - docopt(to_string(main_doc), 'luhn --url=URL --format=FORMAT'.split(), version=__version__) - - def test_args_none(self): - self.assertRaises(DocoptExit, docopt, to_string(main_doc), None, version=__version__) - - def test_args_just_command(self): - args = docopt(to_string(main_doc), ['lsa'], version=__version__) - self.assertEqual(self.DEFAULT_ARGS, args) - - def test_args_two_commands(self): - self.assertRaises(DocoptExit, docopt, to_string(main_doc), 'lsa luhn'.split(), version=__version__) - - def test_args_url_and_file(self): - self.assertRaises(DocoptExit, docopt, to_string(main_doc), 'lsa --url=URL --file=FILE'.split(), version=__version__) - - def test_handle_default_arguments(self): - handle_arguments(self.DEFAULT_ARGS, default_input_stream=StringIO("Whatever.")) - - def test_handle_wrong_format(self): - wrong_args = self.DEFAULT_ARGS.copy() - wrong_args.update({'--url': 'URL', '--format': 'text'}) - self.assertRaises(ValueError, handle_arguments, wrong_args, default_input_stream=StringIO("Whatever.")) + def test_main_with_valid_method(self): + """Test that main() runs without error for a valid method with file input.""" + # Use a test file that exists + exit_code = main(["luhn", "--file", "tests/data/snippets/prevko.txt", "--language", "czech", "--length", "3"]) + assert exit_code == 0 + + def test_main_no_args_exits(self): + """Test that calling main with no args raises SystemExit (argparse requires method).""" + with pytest.raises(SystemExit): + main([]) + + def test_main_invalid_method_exits(self): + """Test that an invalid method name raises SystemExit.""" + with 
pytest.raises(SystemExit): + main(["invalid_method"]) + + def test_main_version(self): + """Test that --version raises SystemExit (argparse behavior).""" + with pytest.raises(SystemExit) as exc_info: + main(["--version"]) + assert exc_info.value.code == 0 + + def test_handle_arguments_default_input(self): + """Test handle_arguments with default input (stdin).""" + args = argparse.Namespace( + method="lsa", + url=None, + file=None, + format=None, + length="20%", + language="english", + stopwords=None, + ) + summarizer, parser, items_count = handle_arguments(args, default_input_stream=StringIO("This is a test sentence. And another one.")) + assert summarizer is not None + assert parser is not None + + def test_handle_arguments_wrong_format(self): + """Test that handle_arguments raises ValueError for invalid format.""" + args = argparse.Namespace( + method="lsa", + url="http://example.com", + file=None, + format="text", + length="20%", + language="english", + stopwords=None, + ) + with pytest.raises(ValueError): + handle_arguments(args, default_input_stream=StringIO("Whatever.")) + + def test_handle_arguments_with_file(self): + """Test handle_arguments with --file argument.""" + args = argparse.Namespace( + method="luhn", + url=None, + file="tests/data/snippets/prevko.txt", + format=None, + length="20%", + language="czech", + stopwords=None, + ) + summarizer, parser, items_count = handle_arguments(args) + assert summarizer is not None + assert parser is not None diff --git a/tests/test_models/test_dom.py b/tests/test_models/test_dom.py index 53b383fd..3cdcb9ba 100644 --- a/tests/test_models/test_dom.py +++ b/tests/test_models/test_dom.py @@ -1,11 +1,5 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest -from sumy._compat import to_unicode from sumy.nlp.tokenizers import Tokenizer from sumy.models.dom import Paragraph, Sentence from ..utils import build_document, 
build_document_from_string @@ -44,7 +38,7 @@ def test_headings(self): """) self.assertEqual(len(document.headings), 1) - self.assertEqual(to_unicode(document.headings[0]), "Nová myšlenka") + self.assertEqual(str(document.headings[0]), "Nová myšlenka") def test_sentences(self): document = build_document_from_string(""" @@ -56,11 +50,11 @@ def test_sentences(self): """) self.assertEqual(len(document.sentences), 3) - self.assertEqual(to_unicode(document.sentences[0]), + self.assertEqual(str(document.sentences[0]), "Nějaký muž šel kolem naší zahrady") - self.assertEqual(to_unicode(document.sentences[1]), + self.assertEqual(str(document.sentences[1]), "Nějaký jiný muž šel kolem vaší zahrady") - self.assertEqual(to_unicode(document.sentences[2]), + self.assertEqual(str(document.sentences[2]), "Už už abych taky šel") def test_only_instances_of_sentence_allowed(self): diff --git a/tests/test_models/test_tf.py b/tests/test_models/test_tf.py index 34257820..863330c8 100644 --- a/tests/test_models/test_tf.py +++ b/tests/test_models/test_tf.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.nlp.tokenizers import Tokenizer diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 739446cc..10c26045 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -1,11 +1,5 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest -from sumy._compat import to_unicode from sumy.parsers.plaintext import PlaintextParser from sumy.parsers.html import HtmlParser from sumy.nlp.tokenizers import Tokenizer @@ -86,15 +80,15 @@ def test_annotated_text(self): self.assertEqual(len(document.paragraphs[0].headings), 1) self.assertEqual(len(document.paragraphs[0].sentences), 1) - self.assertEqual(to_unicode(document.paragraphs[0].headings[0]), + 
self.assertEqual(str(document.paragraphs[0].headings[0]), "Toto je nadpis prvej úrovne") - self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]), + self.assertEqual(str(document.paragraphs[0].sentences[0]), "Toto je prvý odstavec a to je fajn.") self.assertEqual(len(document.paragraphs[1].headings), 0) self.assertEqual(len(document.paragraphs[1].sentences), 2) - self.assertEqual(to_unicode(document.paragraphs[1].sentences[0]), + self.assertEqual(str(document.paragraphs[1].sentences[0]), "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.") - self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]), + self.assertEqual(str(document.paragraphs[1].sentences[1]), "Aj súbory majú predsa city.") diff --git a/tests/test_stemmers.py b/tests/test_stemmers.py index 789e7644..f70af65a 100644 --- a/tests/test_stemmers.py +++ b/tests/test_stemmers.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.nlp.stemmers import null_stemmer, Stemmer diff --git a/tests/test_summarizers/test_edmundson.py b/tests/test_summarizers/test_edmundson.py index 8dc0ccbf..0051a80d 100644 --- a/tests/test_summarizers/test_edmundson.py +++ b/tests/test_summarizers/test_edmundson.py @@ -1,12 +1,6 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.summarizers.edmundson import EdmundsonSummarizer -from sumy._compat import to_unicode from ..utils import build_document, build_document_from_string @@ -66,9 +60,9 @@ def test_mixed_cue_key(self): sentences = summarizer(document, 2) self.assertEqual(len(sentences), 2) - self.assertEqual(to_unicode(sentences[0]), + self.assertEqual(str(sentences[0]), "Because I am sentence I like words") - self.assertEqual(to_unicode(sentences[1]), + self.assertEqual(str(sentences[1]), "Here is the winner because 
contains words like cool and heading") def test_cue_with_no_words(self): @@ -108,8 +102,8 @@ def test_cue_letters_case(self): sentences = summarizer.cue_method(document, 2) self.assertEqual(len(sentences), 2) - self.assertEqual(to_unicode(sentences[0]), "x x x x") - self.assertEqual(to_unicode(sentences[1]), "W W W W") + self.assertEqual(str(sentences[0]), "x x x x") + self.assertEqual(str(sentences[1]), "W W W W") def test_cue_1(self): document = build_document( @@ -135,13 +129,13 @@ def test_cue_2(self): sentences = summarizer.cue_method(document, 10) self.assertEqual(len(sentences), 2) - self.assertEqual(to_unicode(sentences[0]), + self.assertEqual(str(sentences[0]), "ba bb bc bb unknown ľščťžýáíé sb sc sb") - self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach") + self.assertEqual(str(sentences[1]), "Pepek likes spinach") sentences = summarizer.cue_method(document, 1) self.assertEqual(len(sentences), 1) - self.assertEqual(to_unicode(sentences[0]), + self.assertEqual(str(sentences[0]), "ba bb bc bb unknown ľščťžýáíé sb sc sb") def test_cue_3(self): @@ -166,12 +160,12 @@ def test_cue_3(self): sentences = summarizer.cue_method(document, 5) self.assertEqual(len(sentences), 5) - self.assertEqual(to_unicode(sentences[0]), ("ba "*10).strip()) - self.assertEqual(to_unicode(sentences[1]), ("bb "*10).strip()) - self.assertEqual(to_unicode(sentences[2]), "bb bc ba") - self.assertEqual(to_unicode(sentences[3]), + self.assertEqual(str(sentences[0]), ("ba "*10).strip()) + self.assertEqual(str(sentences[1]), ("bb "*10).strip()) + self.assertEqual(str(sentences[2]), "bb bc ba") + self.assertEqual(str(sentences[3]), "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc") - self.assertEqual(to_unicode(sentences[4]), ("ba n "*10).strip()) + self.assertEqual(str(sentences[4]), ("ba n "*10).strip()) def test_key_empty(self): summarizer = EdmundsonSummarizer() @@ -195,9 +189,9 @@ def test_key_no_bonus_words_in_document(self): sentences = summarizer.key_method(document, 10) 
self.assertEqual(len(sentences), 3) - self.assertEqual(to_unicode(sentences[0]), "wa wb wc wd") - self.assertEqual(to_unicode(sentences[1]), "I like music") - self.assertEqual(to_unicode(sentences[2]), + self.assertEqual(str(sentences[0]), "wa wb wc wd") + self.assertEqual(str(sentences[1]), "I like music") + self.assertEqual(str(sentences[2]), "This is test sentence with some extra words") def test_key_1(self): @@ -210,7 +204,7 @@ def test_key_1(self): sentences = summarizer.key_method(document, 1) self.assertEqual(len(sentences), 1) - self.assertEqual(to_unicode(sentences[0]), + self.assertEqual(str(sentences[0]), "This is test sentence with some extra words and bonus") def test_key_2(self): @@ -223,8 +217,8 @@ def test_key_2(self): sentences = summarizer.key_method(document, 2) self.assertEqual(len(sentences), 2) - self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom") - self.assertEqual(to_unicode(sentences[1]), + self.assertEqual(str(sentences[0]), "Om nom nom nom nom") + self.assertEqual(str(sentences[1]), "This is bonus test sentence with some extra words and bonus") def test_key_3(self): @@ -237,15 +231,15 @@ def test_key_3(self): sentences = summarizer.key_method(document, 3) self.assertEqual(len(sentences), 3) - self.assertEqual(to_unicode(sentences[0]), "wa wa wa") - self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa") - self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa") + self.assertEqual(str(sentences[0]), "wa wa wa") + self.assertEqual(str(sentences[1]), "wa wa wa wa") + self.assertEqual(str(sentences[2]), "wa Wa Wa Wa wa") sentences = summarizer.key_method(document, 3, weight=0) self.assertEqual(len(sentences), 3) - self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa") - self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa") - self.assertEqual(to_unicode(sentences[2]), "x X x X") + self.assertEqual(str(sentences[0]), "wa wa wa wa") + self.assertEqual(str(sentences[1]), "wa Wa Wa Wa wa") + 
self.assertEqual(str(sentences[2]), "x X x X") def test_title_method_with_empty_document(self): summarizer = EdmundsonSummarizer() @@ -270,9 +264,9 @@ def test_title_method_without_title(self): sentences = summarizer.title_method(document, 10) self.assertEqual(len(sentences), 3) - self.assertEqual(to_unicode(sentences[0]), "This is sentence") - self.assertEqual(to_unicode(sentences[1]), "This is another one") - self.assertEqual(to_unicode(sentences[2]), "And some next sentence but no heading") + self.assertEqual(str(sentences[0]), "This is sentence") + self.assertEqual(str(sentences[1]), "This is another one") + self.assertEqual(str(sentences[2]), "And some next sentence but no heading") def test_title_method_1(self): document = build_document_from_string(""" @@ -290,7 +284,7 @@ def test_title_method_1(self): sentences = summarizer.title_method(document, 1) self.assertEqual(len(sentences), 1) - self.assertEqual(to_unicode(sentences[0]), + self.assertEqual(str(sentences[0]), "Here is the winner because contains words like cool and heading") def test_title_method_2(self): @@ -309,9 +303,9 @@ def test_title_method_2(self): sentences = summarizer.title_method(document, 2) self.assertEqual(len(sentences), 2) - self.assertEqual(to_unicode(sentences[0]), + self.assertEqual(str(sentences[0]), "This is next paragraph because of blank line above") - self.assertEqual(to_unicode(sentences[1]), + self.assertEqual(str(sentences[1]), "Here is the winner because contains words like cool and heading") def test_title_method_3(self): @@ -330,11 +324,11 @@ def test_title_method_3(self): sentences = summarizer.title_method(document, 3) self.assertEqual(len(sentences), 3) - self.assertEqual(to_unicode(sentences[0]), + self.assertEqual(str(sentences[0]), "Because I am sentence I like words") - self.assertEqual(to_unicode(sentences[1]), + self.assertEqual(str(sentences[1]), "This is next paragraph because of blank line above") - self.assertEqual(to_unicode(sentences[2]), + 
self.assertEqual(str(sentences[2]), "Here is the winner because contains words like cool and heading") def test_location_method_with_empty_document(self): @@ -369,10 +363,10 @@ def test_location_method_1(self): sentences = summarizer.location_method(document, 4) self.assertEqual(len(sentences), 4) - self.assertEqual(to_unicode(sentences[0]), "ha = 1 + 1 + 1 = 3") - self.assertEqual(to_unicode(sentences[1]), "ha hb = 2 + 1 + 1 = 4") - self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 1 = 5") - self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 1 = 4") + self.assertEqual(str(sentences[0]), "ha = 1 + 1 + 1 = 3") + self.assertEqual(str(sentences[1]), "ha hb = 2 + 1 + 1 = 4") + self.assertEqual(str(sentences[2]), "hb hc hd = 3 + 1 + 1 = 5") + self.assertEqual(str(sentences[3]), "ha hb = 2 + 1 + 1 = 4") def test_location_method_2(self): document = build_document_from_string(""" @@ -395,7 +389,7 @@ def test_location_method_2(self): sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0) self.assertEqual(len(sentences), 4) - self.assertEqual(to_unicode(sentences[0]), "ha hb = 2 + 1 + 0 = 3") - self.assertEqual(to_unicode(sentences[1]), "ha hb ha = 3") - self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4") - self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 0 = 3") + self.assertEqual(str(sentences[0]), "ha hb = 2 + 1 + 0 = 3") + self.assertEqual(str(sentences[1]), "ha hb ha = 3") + self.assertEqual(str(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4") + self.assertEqual(str(sentences[3]), "ha hb = 2 + 1 + 0 = 3") diff --git a/tests/test_summarizers/test_kl.py b/tests/test_summarizers/test_kl.py index ec0c78e4..1991b5ee 100644 --- a/tests/test_summarizers/test_kl.py +++ b/tests/test_summarizers/test_kl.py @@ -1,13 +1,7 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.models.dom._sentence import Sentence from 
sumy.summarizers.kl import KLSummarizer -from sumy._compat import to_unicode from ..utils import build_document, build_document_from_string from sumy.nlp.tokenizers import Tokenizer diff --git a/tests/test_summarizers/test_lex_rank.py b/tests/test_summarizers/test_lex_rank.py index 0b842a3f..2ff57ee4 100644 --- a/tests/test_summarizers/test_lex_rank.py +++ b/tests/test_summarizers/test_lex_rank.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import math import unittest import sumy.summarizers.lex_rank as lex_rank_module diff --git a/tests/test_summarizers/test_lsa.py b/tests/test_summarizers/test_lsa.py index 2ae432f0..2f3b0ec6 100644 --- a/tests/test_summarizers/test_lsa.py +++ b/tests/test_summarizers/test_lsa.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest import pytest import sumy.summarizers.lsa as lsa_module @@ -12,7 +7,6 @@ from sumy.nlp.tokenizers import Tokenizer from sumy.nlp.stemmers import Stemmer from sumy.utils import get_stop_words -from sumy._compat import to_unicode from ..utils import build_document, load_resource @@ -55,7 +49,7 @@ def test_single_sentence(self): sentences = summarizer(document, 10) self.assertEqual(len(sentences), 1) - self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like") + self.assertEqual(str(sentences[0]), "I am the sentence you like") def test_document(self): document = build_document( @@ -69,8 +63,8 @@ def test_document(self): sentences = summarizer(document, 2) self.assertEqual(len(sentences), 2) - self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like") - self.assertEqual(to_unicode(sentences[1]), "This sentence is better than that above") + self.assertEqual(str(sentences[0]), "I am the sentence you like") + self.assertEqual(str(sentences[1]), "This sentence is 
better than that above") def test_real_example(self): """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti""" diff --git a/tests/test_summarizers/test_luhn.py b/tests/test_summarizers/test_luhn.py index 3f985d63..53cb6722 100644 --- a/tests/test_summarizers/test_luhn.py +++ b/tests/test_summarizers/test_luhn.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.summarizers.luhn import LuhnSummarizer @@ -10,7 +5,6 @@ from sumy.nlp.tokenizers import Tokenizer from sumy.nlp.stemmers.czech import stem_word from sumy.utils import get_stop_words -from sumy._compat import to_unicode from ..utils import build_document, build_sentence @@ -37,8 +31,8 @@ def test_two_sentences(self): returned = summarizer(document, 10) self.assertEqual(len(returned), 2) - self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta") - self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra") + self.assertEqual(str(returned[0]), "Já jsem 1. věta") + self.assertEqual(str(returned[1]), "A já ta 2. vítězná výhra") def test_two_sentences_but_one_winner(self): document = build_document(( @@ -50,7 +44,7 @@ def test_two_sentences_but_one_winner(self): returned = summarizer(document, 1) self.assertEqual(len(returned), 1) - self.assertEqual(to_unicode(returned[0]), "A já ta 2. vítězná věta") + self.assertEqual(str(returned[0]), "A já ta 2. 
vítězná věta") def test_three_sentences(self): document = build_document(( @@ -63,18 +57,18 @@ def test_three_sentences(self): returned = summarizer(document, 1) self.assertEqual(len(returned), 1) - self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb") + self.assertEqual(str(returned[0]), "wb s wb s wb s s s s s s s s s wb") returned = summarizer(document, 2) self.assertEqual(len(returned), 2) - self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb") - self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc") + self.assertEqual(str(returned[0]), "wb s wb s wb s s s s s s s s s wb") + self.assertEqual(str(returned[1]), "wc s s wc s s wc") returned = summarizer(document, 3) self.assertEqual(len(returned), 3) - self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa") - self.assertEqual(to_unicode(returned[1]), "wb s wb s wb s s s s s s s s s wb") - self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc") + self.assertEqual(str(returned[0]), "wa s s s wa s s s wa") + self.assertEqual(str(returned[1]), "wb s wb s wb s s s s s s s s s wb") + self.assertEqual(str(returned[2]), "wc s s wc s s wc") def test_various_words_with_significant_percentage(self): document = build_document(( @@ -90,18 +84,18 @@ def test_various_words_with_significant_percentage(self): returned = summarizer(document, 1) self.assertEqual(len(returned), 1) - self.assertEqual(to_unicode(returned[0]), "6 e e e e e") + self.assertEqual(str(returned[0]), "6 e e e e e") returned = summarizer(document, 2) self.assertEqual(len(returned), 2) - self.assertEqual(to_unicode(returned[0]), "5 z z z z") - self.assertEqual(to_unicode(returned[1]), "6 e e e e e") + self.assertEqual(str(returned[0]), "5 z z z z") + self.assertEqual(str(returned[1]), "6 e e e e e") returned = summarizer(document, 3) self.assertEqual(len(returned), 3) - self.assertEqual(to_unicode(returned[0]), "3 c c c") - self.assertEqual(to_unicode(returned[1]), "5 z z z z") - 
self.assertEqual(to_unicode(returned[2]), "6 e e e e e") + self.assertEqual(str(returned[0]), "3 c c c") + self.assertEqual(str(returned[1]), "5 z z z z") + self.assertEqual(str(returned[2]), "6 e e e e e") def test_real_example(self): parser = PlaintextParser.from_string( @@ -118,9 +112,9 @@ def test_real_example(self): returned = summarizer(parser.document, 2) self.assertEqual(len(returned), 2) - self.assertEqual(to_unicode(returned[0]), + self.assertEqual(str(returned[0]), "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.") - self.assertEqual(to_unicode(returned[1]), + self.assertEqual(str(returned[1]), "Připadal si, že je mezi malými dětmi a realizoval se tím, " "že si ve třídě o rok mladších dětí budoval vedoucí pozici.") diff --git a/tests/test_summarizers/test_random.py b/tests/test_summarizers/test_random.py index c803b5ca..57066538 100644 --- a/tests/test_summarizers/test_random.py +++ b/tests/test_summarizers/test_random.py @@ -1,12 +1,6 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.summarizers.random import RandomSummarizer -from sumy._compat import to_unicode from ..utils import build_document, build_document_from_string @@ -26,7 +20,7 @@ def test_less_sentences_than_requested(self): sentences = summarizer(document, 10) self.assertEqual(len(sentences), 1) - self.assertEqual(to_unicode(sentences[0]), "This is only one sentence.") + self.assertEqual(str(sentences[0]), "This is only one sentence.") def test_sentences_in_right_order(self): document = build_document_from_string(""" @@ -39,9 +33,9 @@ def test_sentences_in_right_order(self): sentences = summarizer(document, 4) self.assertEqual(len(sentences), 3) - self.assertEqual(to_unicode(sentences[0]), "First sentence.") - self.assertEqual(to_unicode(sentences[1]), "Second sentence.") - self.assertEqual(to_unicode(sentences[2]), "Third sentence.") + 
self.assertEqual(str(sentences[0]), "First sentence.") + self.assertEqual(str(sentences[1]), "Second sentence.") + self.assertEqual(str(sentences[2]), "Third sentence.") def test_more_sentences_than_requested(self): document = build_document_from_string(""" diff --git a/tests/test_summarizers/test_sum_basic.py b/tests/test_summarizers/test_sum_basic.py index c14db883..be426c24 100644 --- a/tests/test_summarizers/test_sum_basic.py +++ b/tests/test_summarizers/test_sum_basic.py @@ -1,13 +1,7 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.models.dom._sentence import Sentence from sumy.summarizers.sum_basic import SumBasicSummarizer -from sumy._compat import to_unicode from ..utils import build_document, build_document_from_string from sumy.nlp.tokenizers import Tokenizer diff --git a/tests/test_summarizers/test_text_rank.py b/tests/test_summarizers/test_text_rank.py index b24c8fc3..153e7dc8 100644 --- a/tests/test_summarizers/test_text_rank.py +++ b/tests/test_summarizers/test_text_rank.py @@ -1,13 +1,7 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.summarizers.text_rank import TextRankSummarizer from sumy.nlp.stemmers import Stemmer -from sumy._compat import to_unicode from ..utils import build_document @@ -34,8 +28,8 @@ def test_two_sentences(self): returned = summarizer(document, 10) self.assertEqual(len(returned), 2) - self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence") - self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize") + self.assertEqual(str(returned[0]), "I am that 1. sentence") + self.assertEqual(str(returned[1]), "And I am 2. 
winning prize") def test_stop_words_correctly_removed(self): summarizer = TextRankSummarizer() @@ -76,7 +70,7 @@ def test_three_sentences_but_second_winner(self): returned = summarizer(document, 1) self.assertEqual(len(returned), 1) - self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence") + self.assertEqual(str(returned[0]), "And I am 2. sentence - winning sentence") def test_sentences_rating(self): document = build_document([ diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py index d08b8cc1..4a250819 100644 --- a/tests/test_tokenizers.py +++ b/tests/test_tokenizers.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.nlp.tokenizers import Tokenizer diff --git a/tests/test_utils/test_compat.py b/tests/test_utils/test_compat.py deleted file mode 100644 index 7455e212..00000000 --- a/tests/test_utils/test_compat.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - -import unittest -import pytest - -from sumy import _compat as py3k - - -BYTES_STRING = "ľščťžáýíééäúňô €đ€Ł¤".encode("utf8") -UNICODE_STRING = "ľščťžáýíééäúňô €đ€Ł¤" - - -class TestPy3k(unittest.TestCase): - def assertStringsEqual(self, str1, str2, *args): - self.assertEqual(type(str1), type(str2), *args) - self.assertEqual(str1, str2, *args) - - def test_bytes_to_bytes(self): - returned = py3k.to_bytes(BYTES_STRING) - self.assertStringsEqual(BYTES_STRING, returned) - - def test_unicode_to_bytes(self): - returned = py3k.to_bytes(UNICODE_STRING) - self.assertStringsEqual(BYTES_STRING, returned) - - def test_str_object_to_bytes(self): - value = UNICODE_STRING if py3k.PY3 else BYTES_STRING - instance = self.__build_test_instance("__str__", value) - - returned = py3k.to_bytes(instance) - self.assertStringsEqual(BYTES_STRING, 
returned) - - def test_unicode_object_to_bytes(self): - if not py3k.PY3: - pytest.skip("Py2 object has `__str__` method called 1st") - - instance = self.__build_test_instance("__str__", UNICODE_STRING) - - returned = py3k.to_bytes(instance) - self.assertStringsEqual(BYTES_STRING, returned) - - def test_repr_object_to_bytes(self): - value = UNICODE_STRING if py3k.PY3 else BYTES_STRING - instance = self.__build_test_instance("__repr__", value) - - returned = py3k.to_bytes(instance) - self.assertStringsEqual(BYTES_STRING, returned) - - def test_data_to_unicode(self): - returned = py3k.to_unicode(BYTES_STRING) - self.assertStringsEqual(UNICODE_STRING, returned) - - def test_unicode_to_unicode(self): - returned = py3k.to_unicode(UNICODE_STRING) - self.assertStringsEqual(UNICODE_STRING, returned) - - def test_str_object_to_unicode(self): - value = UNICODE_STRING if py3k.PY3 else BYTES_STRING - instance = self.__build_test_instance("__str__", value) - - returned = py3k.to_unicode(instance) - self.assertStringsEqual(UNICODE_STRING, returned) - - def test_unicode_object_to_unicode(self): - method = "__str__" if py3k.PY3 else "__unicode__" - instance = self.__build_test_instance(method, UNICODE_STRING) - - returned = py3k.to_unicode(instance) - self.assertStringsEqual(UNICODE_STRING, returned) - - def test_repr_object_to_unicode(self): - value = UNICODE_STRING if py3k.PY3 else BYTES_STRING - instance = self.__build_test_instance("__repr__", value) - - returned = py3k.to_unicode(instance) - self.assertStringsEqual(UNICODE_STRING, returned) - - def __build_test_instance(self, tested_method, value): - class Object(object): - def __init__(self, value): - self.value = value - - setattr(Object, tested_method, lambda self: self.value) - return Object(value) diff --git a/tests/test_utils/test_unicode_compatible_class.py b/tests/test_utils/test_unicode_compatible_class.py deleted file mode 100644 index dfe7db49..00000000 --- a/tests/test_utils/test_unicode_compatible_class.py +++ 
/dev/null @@ -1,55 +0,0 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - -import unittest -import pytest - -from sumy import _compat as compat - - -BYTES_STRING = "ľščťžáýíééäúňô €đ€Ł¤".encode("utf8") -UNICODE_STRING = "ľščťžáýíééäúňô €đ€Ł¤" -NATIVE_STRING = compat.to_string(UNICODE_STRING) - - -@compat.unicode_compatible -class O(object): - def __unicode__(self): - return UNICODE_STRING - - -class TestObject(unittest.TestCase): - def setUp(self): - self.o = O() - - def assertStringsEqual(self, str1, str2, *args): - self.assertEqual(type(str1), type(str2), *args) - self.assertEqual(str1, str2, *args) - - def test_native_bytes(self): - if not compat.PY3: - pytest.skip("Python 2 doesn't support method `__bytes__`") - - returned = bytes(self.o) - self.assertStringsEqual(BYTES_STRING, returned) - - def test_native_unicode(self): - if compat.PY3: - pytest.skip("Python 3 doesn't support method `__unicode__`") - - returned = unicode(self.o) - self.assertStringsEqual(UNICODE_STRING, returned) - - def test_to_bytes(self): - returned = compat.to_bytes(self.o) - self.assertStringsEqual(BYTES_STRING, returned) - - def test_to_string(self): - returned = compat.to_string(self.o) - self.assertStringsEqual(NATIVE_STRING, returned) - - def test_to_unicode(self): - returned = compat.to_unicode(self.o) - self.assertStringsEqual(UNICODE_STRING, returned) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 3824185c..81dbf406 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -1,8 +1,3 @@ -# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - import unittest from sumy.utils import get_stop_words, read_stop_words, ItemsCount diff --git a/tests/utils.py b/tests/utils.py index 4e3d68a7..7e22d046 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,16 +1,6 @@ 
-# -*- coding: utf8 -*- - -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals - -try: - from StringIO import StringIO -except ImportError: - from io import StringIO - +from io import StringIO from os.path import dirname, join, abspath from sumy.nlp.tokenizers import Tokenizer -from sumy._compat import to_string, to_unicode from sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence @@ -18,13 +8,13 @@ def expand_resource_path(path): - return join(abspath(dirname(__file__)), to_string("data"), to_string(path)) + return join(abspath(dirname(__file__)), "data", path) def load_resource(path): path = expand_resource_path(path) with open(path, "rb") as file: - return to_unicode(file.read()) + return file.read().decode("utf-8") def build_document(*sets_of_sentences): From a382e52e057abb3d2c1f2aa00ec438caf91f75b6 Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:38:04 -0500 Subject: [PATCH 07/11] Convert remaining % string formatting to f-string in lsa.py Missed during the bulk modernization pass. Converts the warning message in _create_matrix from % formatting to an f-string. --- sumy/summarizers/lsa.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sumy/summarizers/lsa.py b/sumy/summarizers/lsa.py index f9ec6d60..cd889e50 100644 --- a/sumy/summarizers/lsa.py +++ b/sumy/summarizers/lsa.py @@ -64,11 +64,10 @@ def _create_matrix(self, document, dictionary): words_count = len(dictionary) sentences_count = len(sentences) if words_count < sentences_count: - message = ( - "Number of words (%d) is lower than number of sentences (%d). " + warn( + f"Number of words ({words_count}) is lower than number of sentences ({sentences_count}). " "LSA algorithm may not work properly." 
) - warn(message % (words_count, sentences_count)) # create matrix |unique words|x|sentences| filled with zeroes matrix = numpy.zeros((words_count, sentences_count)) From d4fe8be8c70b49c157ccb6111b670f4fba983745 Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:51:10 -0500 Subject: [PATCH 08/11] Fix KL summarizer KeyError and update tokenizer test for punkt_tab MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KL summarizer's _compute_ratings method used raw (unnormalized) words from the summary when computing joint frequencies, but compared them against the normalized word frequency dictionary. This caused a KeyError when a capitalized word appeared in the summary but only its lowercase form existed in the term frequency map. Fixed by using _get_all_content_words_in_doc (which normalizes and filters stop words) instead of _get_all_words_in_doc for the summary word list. Updated test_tokenize_sentence expected tuple to include 'but' — the NLTK punkt_tab tokenizer correctly extracts 'but' from 'but..' as a word token, whereas the legacy punkt pickle tokenizer did not. 
--- sumy/summarizers/kl.py | 4 ++-- tests/test_tokenizers.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sumy/summarizers/kl.py b/sumy/summarizers/kl.py index 38e42976..aad7714e 100644 --- a/sumy/summarizers/kl.py +++ b/sumy/summarizers/kl.py @@ -113,8 +113,8 @@ def _compute_ratings(self, sentences): # will store all the kls values for this pass kls = [] - # converts summary to word list - summary_as_word_list = self._get_all_words_in_doc(summary) + # converts summary to normalized content word list + summary_as_word_list = self._get_all_content_words_in_doc(summary) for s in sentences_as_words: # calculates the joint frequency through combining the word lists diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py index 4a250819..d705c927 100644 --- a/tests/test_tokenizers.py +++ b/tests/test_tokenizers.py @@ -33,7 +33,7 @@ def test_tokenize_sentence(self): expected = ( "I", "am", "a", "very", "nice", "sentence", - "with", "comma", + "with", "comma", "but", ) self.assertEqual(expected, words) From 2f920bbb6d12def13e2e56fcfaebbc256d555104 Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 14:58:45 -0500 Subject: [PATCH 09/11] Fix all ruff lint errors and apply consistent formatting - Fix F401 re-exports in __init__.py files using explicit `as symbol` syntax (evaluation, models, models/dom, parsers, summarizers) - Fix W191/E101 tab indentation in evaluation/rouge.py (converted to spaces) - Fix E731 lambda assignment in summarizers/_summarizer.py (use def instead) - Fix E701 multiple statements on one line in summarizers/kl.py - Fix F841 unused variables in tests (test_evaluation.py, test_kl.py) - Fix I001 unsorted imports via ruff --fix - Apply ruff format across all source and test files --- sumy/__main__.py | 19 +- sumy/evaluation/__init__.py | 23 +- sumy/evaluation/__main__.py | 28 +- sumy/evaluation/content_based.py | 12 +- sumy/evaluation/coselection.py | 2 +- sumy/evaluation/rouge.py | 478 
++++++++++++----------- sumy/models/__init__.py | 2 +- sumy/models/dom/__init__.py | 6 +- sumy/models/dom/_document.py | 1 + sumy/models/dom/_paragraph.py | 1 + sumy/models/dom/_sentence.py | 7 +- sumy/models/tf.py | 15 +- sumy/nlp/stemmers/__init__.py | 4 +- sumy/nlp/stemmers/czech.py | 35 +- sumy/nlp/tokenizers.py | 5 +- sumy/parsers/__init__.py | 2 +- sumy/parsers/html.py | 10 +- sumy/parsers/plaintext.py | 2 +- sumy/summarizers/__init__.py | 2 +- sumy/summarizers/_summarizer.py | 20 +- sumy/summarizers/edmundson.py | 15 +- sumy/summarizers/edmundson_cue.py | 13 +- sumy/summarizers/edmundson_key.py | 10 +- sumy/summarizers/edmundson_location.py | 4 +- sumy/summarizers/edmundson_title.py | 9 +- sumy/summarizers/kl.py | 18 +- sumy/summarizers/lex_rank.py | 9 +- sumy/summarizers/lsa.py | 23 +- sumy/summarizers/luhn.py | 9 +- sumy/summarizers/sum_basic.py | 11 +- sumy/summarizers/text_rank.py | 4 +- sumy/utils.py | 10 +- tests/test_evaluation.py | 118 +++--- tests/test_main.py | 9 +- tests/test_models/test_dom.py | 34 +- tests/test_models/test_tf.py | 22 +- tests/test_parsers.py | 31 +- tests/test_stemmers.py | 11 +- tests/test_summarizers/test_edmundson.py | 312 ++++++++++----- tests/test_summarizers/test_kl.py | 50 ++- tests/test_summarizers/test_lex_rank.py | 99 +++-- tests/test_summarizers/test_lsa.py | 67 ++-- tests/test_summarizers/test_luhn.py | 186 ++++++--- tests/test_summarizers/test_random.py | 1 + tests/test_summarizers/test_sum_basic.py | 48 ++- tests/test_summarizers/test_text_rank.py | 54 ++- tests/test_tokenizers.py | 11 +- tests/test_utils/test_utils.py | 3 +- tests/utils.py | 7 +- 49 files changed, 1055 insertions(+), 817 deletions(-) diff --git a/sumy/__main__.py b/sumy/__main__.py index 68815e9a..bef17999 100644 --- a/sumy/__main__.py +++ b/sumy/__main__.py @@ -2,22 +2,21 @@ import argparse import sys - from urllib import request as urllib_request from . 
import __version__ -from .utils import ItemsCount, get_stop_words, read_stop_words +from .nlp.stemmers import Stemmer from .nlp.tokenizers import Tokenizer from .parsers.html import HtmlParser from .parsers.plaintext import PlaintextParser -from .summarizers.luhn import LuhnSummarizer from .summarizers.edmundson import EdmundsonSummarizer -from .summarizers.lsa import LsaSummarizer -from .summarizers.text_rank import TextRankSummarizer +from .summarizers.kl import KLSummarizer from .summarizers.lex_rank import LexRankSummarizer +from .summarizers.lsa import LsaSummarizer +from .summarizers.luhn import LuhnSummarizer from .summarizers.sum_basic import SumBasicSummarizer -from .summarizers.kl import KLSummarizer -from .nlp.stemmers import Stemmer +from .summarizers.text_rank import TextRankSummarizer +from .utils import ItemsCount, get_stop_words, read_stop_words HEADERS = { "User-Agent": f"Sumy (Automatic text summarizer) Version/{__version__}", @@ -61,7 +60,7 @@ def main(args=None): parser.add_argument( "--stopwords", help="Path to a file containing a list of stopwords. One word per line in UTF-8 encoding. " - "If not provided, the default list of stop-words is used according to the chosen language.", + "If not provided, the default list of stop-words is used according to the chosen language.", ) parser.add_argument( "--format", @@ -97,9 +96,7 @@ def main(args=None): def handle_arguments(args, default_input_stream=sys.stdin): document_format = args.format if document_format is not None and document_format not in PARSERS: - raise ValueError( - f"Unsupported format of input document. Possible values are: " - f"{', '.join(PARSERS.keys())}. Given: {document_format}.") + raise ValueError(f"Unsupported format of input document. Possible values are: {', '.join(PARSERS.keys())}. 
Given: {document_format}.") if args.url is not None: parser = PARSERS[document_format or "html"] diff --git a/sumy/evaluation/__init__.py b/sumy/evaluation/__init__.py index 32e98567..93265e19 100644 --- a/sumy/evaluation/__init__.py +++ b/sumy/evaluation/__init__.py @@ -1,3 +1,20 @@ -from .coselection import f_score, precision, recall -from .content_based import cosine_similarity, unit_overlap -from .rouge import rouge_n, rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level +from .content_based import cosine_similarity as cosine_similarity +from .content_based import unit_overlap as unit_overlap +from .coselection import f_score as f_score +from .coselection import precision as precision +from .coselection import recall as recall +from .rouge import ( + rouge_1 as rouge_1, +) +from .rouge import ( + rouge_2 as rouge_2, +) +from .rouge import ( + rouge_l_sentence_level as rouge_l_sentence_level, +) +from .rouge import ( + rouge_l_summary_level as rouge_l_summary_level, +) +from .rouge import ( + rouge_n as rouge_n, +) diff --git a/sumy/evaluation/__main__.py b/sumy/evaluation/__main__.py index 7a19170d..460a4d9a 100644 --- a/sumy/evaluation/__main__.py +++ b/sumy/evaluation/__main__.py @@ -2,28 +2,25 @@ import argparse import sys - from itertools import chain from urllib import request as urllib_request from .. 
import __version__ -from ..utils import ItemsCount, get_stop_words from ..models import TfDocumentModel +from ..nlp.stemmers import Stemmer from ..nlp.tokenizers import Tokenizer from ..parsers.html import HtmlParser from ..parsers.plaintext import PlaintextParser -from ..summarizers.random import RandomSummarizer -from ..summarizers.luhn import LuhnSummarizer from ..summarizers.edmundson import EdmundsonSummarizer -from ..summarizers.lsa import LsaSummarizer -from ..summarizers.text_rank import TextRankSummarizer +from ..summarizers.kl import KLSummarizer from ..summarizers.lex_rank import LexRankSummarizer +from ..summarizers.lsa import LsaSummarizer +from ..summarizers.luhn import LuhnSummarizer +from ..summarizers.random import RandomSummarizer from ..summarizers.sum_basic import SumBasicSummarizer -from ..summarizers.kl import KLSummarizer -from ..nlp.stemmers import Stemmer -from . import precision, recall, f_score, cosine_similarity, unit_overlap -from . import rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level - +from ..summarizers.text_rank import TextRankSummarizer +from ..utils import ItemsCount, get_stop_words +from . 
import cosine_similarity, f_score, precision, recall, rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level, unit_overlap HEADERS = { "User-Agent": f"Sumy (Automatic text summarizer) Version/{__version__}", @@ -129,7 +126,7 @@ def evaluate_unit_overlap(evaluated_sentences, reference_sentences): ("Rouge-1", False, rouge_1), ("Rouge-2", False, rouge_2), ("Rouge-L (Sentence Level)", False, rouge_l_sentence_level), - ("Rouge-L (Summary Level)", False, rouge_l_summary_level) + ("Rouge-L (Summary Level)", False, rouge_l_summary_level), ) @@ -183,8 +180,7 @@ def main(args=None): summarizer, document, items_count, reference_summary = handle_arguments(parsed_args) evaluated_sentences = summarizer(document, items_count) - reference_document = PlaintextParser.from_string(reference_summary, - Tokenizer(parsed_args.language)) + reference_document = PlaintextParser.from_string(reference_summary, Tokenizer(parsed_args.language)) reference_sentences = reference_document.document.sentences for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS: @@ -198,9 +194,7 @@ def main(args=None): def handle_arguments(args): document_format = args.format if document_format is not None and document_format not in PARSERS: - raise ValueError( - f"Unsupported format of input document. Possible values are: " - f"{', '.join(PARSERS.keys())}. Given: {document_format}.") + raise ValueError(f"Unsupported format of input document. Possible values are: {', '.join(PARSERS.keys())}. Given: {document_format}.") parser = PARSERS["plaintext"] input_stream = sys.stdin diff --git a/sumy/evaluation/content_based.py b/sumy/evaluation/content_based.py index a8e130f0..1986049b 100644 --- a/sumy/evaluation/content_based.py +++ b/sumy/evaluation/content_based.py @@ -11,8 +11,7 @@ def cosine_similarity(evaluated_model, reference_model): exactly the same. 
""" if not (isinstance(evaluated_model, TfModel) and isinstance(reference_model, TfModel)): - raise ValueError( - "Arguments has to be instances of 'sumy.models.TfDocumentModel'") + raise ValueError("Arguments has to be instances of 'sumy.models.TfDocumentModel'") terms = frozenset(evaluated_model.terms) | frozenset(reference_model.terms) @@ -22,8 +21,7 @@ def cosine_similarity(evaluated_model, reference_model): denominator = evaluated_model.magnitude * reference_model.magnitude if denominator == 0.0: - raise ValueError( - f"Document model can't be empty. Given {evaluated_model!r} & {reference_model!r}") + raise ValueError(f"Document model can't be empty. Given {evaluated_model!r} & {reference_model!r}") return numerator / denominator @@ -38,15 +36,13 @@ def unit_overlap(evaluated_model, reference_model): exactly the same. """ if not (isinstance(evaluated_model, TfModel) and isinstance(reference_model, TfModel)): - raise ValueError( - "Arguments has to be instances of 'sumy.models.TfDocumentModel'") + raise ValueError("Arguments has to be instances of 'sumy.models.TfDocumentModel'") terms1 = frozenset(evaluated_model.terms) terms2 = frozenset(reference_model.terms) if not terms1 and not terms2: - raise ValueError( - "Documents can't be empty. Please pass the valid documents.") + raise ValueError("Documents can't be empty. 
Please pass the valid documents.") common_terms_count = len(terms1 & terms2) return common_terms_count / (len(terms1) + len(terms2) - common_terms_count) diff --git a/sumy/evaluation/coselection.py b/sumy/evaluation/coselection.py index b7c84170..9166a0c3 100644 --- a/sumy/evaluation/coselection.py +++ b/sumy/evaluation/coselection.py @@ -21,7 +21,7 @@ def f_score(evaluated_sentences, reference_sentences, weight=1.0): p = precision(evaluated_sentences, reference_sentences) r = recall(evaluated_sentences, reference_sentences) - weight **= 2 # weight = weight^2 + weight **= 2 # weight = weight^2 denominator = weight * p + r if denominator == 0.0: return 0.0 diff --git a/sumy/evaluation/rouge.py b/sumy/evaluation/rouge.py index 56add83b..231344de 100644 --- a/sumy/evaluation/rouge.py +++ b/sumy/evaluation/rouge.py @@ -2,280 +2,282 @@ def _get_ngrams(n, text): - ngram_set = set() - text_length = len(text) - max_index_ngram_start = text_length - n - for i in range (max_index_ngram_start + 1): - ngram_set.add(tuple(text[i:i+n])) - return ngram_set + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i : i + n])) + return ngram_set def _split_into_words(sentences): - fullTextWords = [] - for s in sentences: - if not isinstance(s, Sentence): - raise (ValueError("Object in collection must be of type Sentence")) - fullTextWords.extend(s.words) - return fullTextWords + fullTextWords = [] + for s in sentences: + if not isinstance(s, Sentence): + raise (ValueError("Object in collection must be of type Sentence")) + fullTextWords.extend(s.words) + return fullTextWords def _get_word_ngrams(n, sentences): - assert (len(sentences) > 0) - assert (n > 0) + assert len(sentences) > 0 + assert n > 0 - words = _split_into_words(sentences) - return _get_ngrams(n, words) + words = _split_into_words(sentences) + return _get_ngrams(n, words) def _get_index_of_lcs(x, y): - return len(x), 
len(y) + return len(x), len(y) def _len_lcs(x, y): - ''' - Returns the length of the Longest Common Subsequence between sequences x - and y. - Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - - :param x: sequence of words - :param y: sequence of words - :returns integer: Length of LCS between x and y - ''' - table = _lcs(x, y) - n, m = _get_index_of_lcs(x, y) - return table[n, m] - - -def _lcs (x, y): - ''' - Computes the length of the longest common subsequence (lcs) between two - strings. The implementation below uses a DP programming algorithm and runs - in O(nm) time where n = len(x) and m = len(y). - Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - - :param x: collection of words - :param y: collection of words - :returns table: dictionary of coord and len lcs - ''' - n, m = _get_index_of_lcs(x, y) - table = dict() - for i in range(n + 1): - for j in range (m + 1): - if i == 0 or j == 0: - table[i, j] = 0 - elif x[i-1] == y[j-1]: - table[i, j] = table[i-1, j-1] + 1 - else: - table[i, j] = max(table[i-1, j], table[i, j-1]) - return table + """ + Returns the length of the Longest Common Subsequence between sequences x + and y. + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + :param x: sequence of words + :param y: sequence of words + :returns integer: Length of LCS between x and y + """ + table = _lcs(x, y) + n, m = _get_index_of_lcs(x, y) + return table[n, m] + + +def _lcs(x, y): + """ + Computes the length of the longest common subsequence (lcs) between two + strings. The implementation below uses a DP programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). 
+ Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + :param x: collection of words + :param y: collection of words + :returns table: dictionary of coord and len lcs + """ + n, m = _get_index_of_lcs(x, y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table def _recon_lcs(x, y): - ''' - Returns the Longest Subsequence between x and y. - Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - - :param x: sequence of words - :param y: sequence of words - :returns sequence: LCS of x and y - ''' - i, j = _get_index_of_lcs(x, y) - table = _lcs(x, y) - def _recon (i, j): - if i == 0 or j == 0: - return [] - elif x[i-1] == y[j-1]: - return _recon(i-1, j-1) + [(x[i-1], i)] - elif table[i-1, j] > table[i, j-1]: - return _recon(i-1, j) - else: - return _recon(i, j-1) - recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) - return recon_tuple + """ + Returns the Longest Subsequence between x and y. + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + :param x: sequence of words + :param y: sequence of words + :returns sequence: LCS of x and y + """ + i, j = _get_index_of_lcs(x, y) + table = _lcs(x, y) + + def _recon(i, j): + if i == 0 or j == 0: + return [] + elif x[i - 1] == y[j - 1]: + return _recon(i - 1, j - 1) + [(x[i - 1], i)] + elif table[i - 1, j] > table[i, j - 1]: + return _recon(i - 1, j) + else: + return _recon(i, j - 1) + + recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) + return recon_tuple def rouge_n(evaluated_sentences, reference_sentences, n=2): - """ - Computes ROUGE-N of two text collections of sentences. 
- Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ - papers/rouge-working-note-v1.3.1.pdf - - :param evaluated_sentences: - The sentences that have been picked by the summarizer - :param reference_sentences: - The sentences from the referene set - :param n: Size of ngram. Defaults to 2. - :returns: - float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means - exactly the same. - :raises ValueError: raises exception if a param has len <= 0 - """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise (ValueError("Collections must contain at least 1 sentence.")) - - evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) - reference_ngrams = _get_word_ngrams(n, reference_sentences) - reference_count = len(reference_ngrams) - - # Gets the overlapping ngrams between evaluated and reference - overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) - overlapping_count = len(overlapping_ngrams) - - return overlapping_count / reference_count + """ + Computes ROUGE-N of two text collections of sentences. + Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :param n: Size of ngram. Defaults to 2. + :returns: + float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means + exactly the same. 
+ :raises ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) + + return overlapping_count / reference_count def rouge_1(evaluated_sentences, reference_sentences): - ''' - Rouge-N where N=1. This is a commonly used metric. + """ + Rouge-N where N=1. This is a commonly used metric. - :param evaluated_sentences: - The sentences that have been picked by the summarizer - :param reference_sentences: - The sentences from the referene set - :returns: - float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means - exactly the same. - ''' - return rouge_n(evaluated_sentences, reference_sentences, 1) + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns: + float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means + exactly the same. + """ + return rouge_n(evaluated_sentences, reference_sentences, 1) def rouge_2(evaluated_sentences, reference_sentences): - ''' - Rouge-N where N=2. This is a commonly used metric. + """ + Rouge-N where N=2. This is a commonly used metric. - :param evaluated_sentences: - The sentences that have been picked by the summarizer - :param reference_sentences: - The sentences from the referene set - :returns: - float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means - exactly the same. 
- ''' - return rouge_n(evaluated_sentences, reference_sentences, 2) + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns: + float 0 <= ROUGE-N <= 1, where 0 means no overlap and 1 means + exactly the same. + """ + return rouge_n(evaluated_sentences, reference_sentences, 2) def _f_lcs(llcs, m, n): - ''' - Computes the LCS-based F-measure score - Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ - rouge-working-note-v1.3.1.pdf - - :param llcs: Length of LCS - :param m: number of words in reference summary - :param n: number of words in candidate summary - :returns float: LCS-based F-measure score - ''' - r_lcs = llcs / m - p_lcs = llcs / n - beta = p_lcs / r_lcs - num = (1 + (beta ** 2)) * r_lcs * p_lcs - denom = r_lcs + ((beta ** 2) * p_lcs) - return num / denom + """ + Computes the LCS-based F-measure score + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + :param llcs: Length of LCS + :param m: number of words in reference summary + :param n: number of words in candidate summary + :returns float: LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / r_lcs + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + return num / denom def rouge_l_sentence_level(evaluated_sentences, reference_sentences): - """ - Computes ROUGE-L (sentence level) of two text collections of sentences. 
- http://research.microsoft.com/en-us/um/people/cyl/download/papers/ - rouge-working-note-v1.3.1.pdf - - Calculated according to: - R_lcs = LCS(X,Y)/m - P_lcs = LCS(X,Y)/n - F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) - - where: - X = reference summary - Y = Candidate summary - m = length of reference summary - n = length of candidate summary - - :param evaluated_sentences: - The sentences that have been picked by the summarizer - :param reference_sentences: - The sentences from the referene set - :returns float: F_lcs - :raises ValueError: raises exception if a param has len <= 0 - """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise (ValueError("Collections must contain at least 1 sentence.")) - reference_words = _split_into_words(reference_sentences) - evaluated_words = _split_into_words(evaluated_sentences) - m = len(reference_words) - n = len(evaluated_words) - lcs = _len_lcs(evaluated_words, reference_words) - return _f_lcs(lcs, m, n) + """ + Computes ROUGE-L (sentence level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns float: F_lcs + :raises ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + reference_words = _split_into_words(reference_sentences) + evaluated_words = _split_into_words(evaluated_sentences) + m = len(reference_words) + n = len(evaluated_words) + lcs = _len_lcs(evaluated_words, reference_words) + return _f_lcs(lcs, m, n) def _union_lcs(evaluated_sentences, reference_sentence): - ''' - Returns LCS_u(r_i, C) which is the LCS score of the union longest common subsequence - between reference sentence ri and candidate summary C. For example, if - r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and - c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is - "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The - union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and - LCS_u(r_i, C) = 4/5. 
- - :param evaluated_sentences: - The sentences that have been picked by the summarizer - :param reference_sentence: - One of the sentences in the reference summaries - :returns float: LCS_u(r_i, C) - :raises ValueError: raises exception if a param has len <= 0 - ''' - if len(evaluated_sentences) <= 0: - raise (ValueError("Collections must contain at least 1 sentence.")) - - lcs_union = set() - reference_words = _split_into_words([reference_sentence]) - combined_lcs_length = 0 - for eval_s in evaluated_sentences: - evaluated_words = _split_into_words([eval_s]) - lcs = set(_recon_lcs(reference_words, evaluated_words)) - combined_lcs_length += len(lcs) - lcs_union = lcs_union.union(lcs) - - union_lcs_count = len(lcs_union) - union_lcs_value = union_lcs_count / combined_lcs_length - return union_lcs_value + """ + Returns LCS_u(r_i, C) which is the LCS score of the union longest common subsequence + between reference sentence ri and candidate summary C. For example, if + r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and + c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is + "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The + union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and + LCS_u(r_i, C) = 4/5. 
+ + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentence: + One of the sentences in the reference summaries + :returns float: LCS_u(r_i, C) + :raises ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + + lcs_union = set() + reference_words = _split_into_words([reference_sentence]) + combined_lcs_length = 0 + for eval_s in evaluated_sentences: + evaluated_words = _split_into_words([eval_s]) + lcs = set(_recon_lcs(reference_words, evaluated_words)) + combined_lcs_length += len(lcs) + lcs_union = lcs_union.union(lcs) + + union_lcs_count = len(lcs_union) + union_lcs_value = union_lcs_count / combined_lcs_length + return union_lcs_value def rouge_l_summary_level(evaluated_sentences, reference_sentences): - """ - Computes ROUGE-L (summary level) of two text collections of sentences. - http://research.microsoft.com/en-us/um/people/cyl/download/papers/ - rouge-working-note-v1.3.1.pdf - - Calculated according to: - R_lcs = SUM(1, u)[LCS(r_i,C)]/m - P_lcs = SUM(1, u)[LCS(r_i,C)]/n - F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) - - where: - SUM(i,u) = SUM from i through u - u = number of sentences in reference summary - C = Candidate summary made up of v sentences - m = number of words in reference summary - n = number of words in candidate summary - - :param evaluated_sentences: - The sentences that have been picked by the summarizer - :param reference_sentences: - The sentences from the referene set - :returns float: F_lcs - :raises ValueError: raises exception if a param has len <= 0 - """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise (ValueError("Collections must contain at least 1 sentence.")) - - # total number of words in reference sentences - m = len(_split_into_words(reference_sentences)) - - # total number of words in evaluated sentences - n = 
len(_split_into_words(evaluated_sentences)) - - union_lcs_sum_across_all_references = 0 - for ref_s in reference_sentences: - union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, ref_s) - return _f_lcs(union_lcs_sum_across_all_references, m, n) + """ + Computes ROUGE-L (summary level) of two text collections of sentences. + http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = SUM(1, u)[LCS(r_i,C)]/m + P_lcs = SUM(1, u)[LCS(r_i,C)]/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + SUM(i,u) = SUM from i through u + u = number of sentences in reference summary + C = Candidate summary made up of v sentences + m = number of words in reference summary + n = number of words in candidate summary + + :param evaluated_sentences: + The sentences that have been picked by the summarizer + :param reference_sentences: + The sentences from the referene set + :returns float: F_lcs + :raises ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise (ValueError("Collections must contain at least 1 sentence.")) + + # total number of words in reference sentences + m = len(_split_into_words(reference_sentences)) + + # total number of words in evaluated sentences + n = len(_split_into_words(evaluated_sentences)) + + union_lcs_sum_across_all_references = 0 + for ref_s in reference_sentences: + union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, ref_s) + return _f_lcs(union_lcs_sum_across_all_references, m, n) diff --git a/sumy/models/__init__.py b/sumy/models/__init__.py index 3c80f326..bdf747ea 100644 --- a/sumy/models/__init__.py +++ b/sumy/models/__init__.py @@ -1 +1 @@ -from .tf import TfDocumentModel +from .tf import TfDocumentModel as TfDocumentModel diff --git a/sumy/models/dom/__init__.py b/sumy/models/dom/__init__.py index a68f9ff0..5b7573e5 100644 --- 
a/sumy/models/dom/__init__.py +++ b/sumy/models/dom/__init__.py @@ -1,3 +1,3 @@ -from ._document import ObjectDocumentModel -from ._paragraph import Paragraph -from ._sentence import Sentence +from ._document import ObjectDocumentModel as ObjectDocumentModel +from ._paragraph import Paragraph as Paragraph +from ._sentence import Sentence as Sentence diff --git a/sumy/models/dom/_document.py b/sumy/models/dom/_document.py index e011965c..fd0de68a 100644 --- a/sumy/models/dom/_document.py +++ b/sumy/models/dom/_document.py @@ -1,4 +1,5 @@ from itertools import chain + from ...utils import cached_property diff --git a/sumy/models/dom/_paragraph.py b/sumy/models/dom/_paragraph.py index ab58f41b..e568d7b9 100644 --- a/sumy/models/dom/_paragraph.py +++ b/sumy/models/dom/_paragraph.py @@ -1,4 +1,5 @@ from itertools import chain + from ...utils import cached_property from ._sentence import Sentence diff --git a/sumy/models/dom/_sentence.py b/sumy/models/dom/_sentence.py index a8962959..49910d73 100644 --- a/sumy/models/dom/_sentence.py +++ b/sumy/models/dom/_sentence.py @@ -2,7 +2,12 @@ class Sentence: - __slots__ = ("_text", "_cached_property_words", "_tokenizer", "_is_heading",) + __slots__ = ( + "_text", + "_cached_property_words", + "_tokenizer", + "_is_heading", + ) def __init__(self, text, tokenizer, is_heading=False): self._text = str(text).strip() diff --git a/sumy/models/tf.py b/sumy/models/tf.py index b399da94..367e30b3 100644 --- a/sumy/models/tf.py +++ b/sumy/models/tf.py @@ -1,21 +1,19 @@ import math - -from pprint import pformat from collections import Counter from collections.abc import Sequence +from pprint import pformat class TfDocumentModel: """Term-Frequency document model (term = word).""" + def __init__(self, words, tokenizer=None): if isinstance(words, str) and tokenizer is None: - raise ValueError( - "Tokenizer has to be given if ``words`` is not a sequence.") + raise ValueError("Tokenizer has to be given if ``words`` is not a sequence.") elif 
isinstance(words, str): words = tokenizer.to_words(str(words)) elif not isinstance(words, Sequence): - raise ValueError( - "Parameter ``words`` has to be sequence or string with tokenizer given.") + raise ValueError("Parameter ``words`` has to be sequence or string with tokenizer given.") self._terms = Counter(map(str.lower, words)) self._max_frequency = max(self._terms.values()) if self._terms else 1 @@ -49,8 +47,7 @@ def most_frequent_terms(self, count=0): elif count > 0: return terms[:count] else: - raise ValueError( - "Only non-negative values are allowed for count of terms.") + raise ValueError("Only non-negative values are allowed for count of terms.") def term_frequency(self, term): """ @@ -77,7 +74,7 @@ def normalized_term_frequency(self, term, smooth=0.0): and 1 the most frequent term in document. """ frequency = self.term_frequency(term) / self._max_frequency - return smooth + (1.0 - smooth)*frequency + return smooth + (1.0 - smooth) * frequency def __repr__(self): return f"" diff --git a/sumy/nlp/stemmers/__init__.py b/sumy/nlp/stemmers/__init__.py index 3a2f7fc5..d2bab141 100644 --- a/sumy/nlp/stemmers/__init__.py +++ b/sumy/nlp/stemmers/__init__.py @@ -11,10 +11,10 @@ def null_stemmer(object): class Stemmer: def __init__(self, language): self._stemmer = null_stemmer - if language.lower() in ('czech', 'slovak'): + if language.lower() in ("czech", "slovak"): self._stemmer = czech_stemmer return - stemmer_classname = language.capitalize() + 'Stemmer' + stemmer_classname = language.capitalize() + "Stemmer" try: stemmer_class = getattr(nltk_stemmers_module, stemmer_classname) except AttributeError: diff --git a/sumy/nlp/stemmers/czech.py b/sumy/nlp/stemmers/czech.py index 5ec3d337..a17aba38 100644 --- a/sumy/nlp/stemmers/czech.py +++ b/sumy/nlp/stemmers/czech.py @@ -11,10 +11,8 @@ import re import sys - from warnings import warn - WORD_PATTERN = re.compile(r"^\w+$", re.UNICODE) @@ -58,11 +56,9 @@ def _remove_case(word): return word[:-4] if len(word) > 5: - 
if word[-3:] in ("ech", "ich", "ích", "ého", "ěmi", "emi", "ému", - "ete", "eti", "iho", "ího", "ími", "imu"): + if word[-3:] in ("ech", "ich", "ích", "ého", "ěmi", "emi", "ému", "ete", "eti", "iho", "ího", "ími", "imu"): return _palatalize(word[:-2]) - if word[-3:] in ("ách", "ata", "aty", "ých", "ama", "ami", - "ové", "ovi", "ými"): + if word[-3:] in ("ách", "ata", "aty", "ých", "ama", "ami", "ové", "ovi", "ými"): return word[:-3] if len(word) > 4: @@ -102,18 +98,14 @@ def _remove_diminutive(word): if len(word) > 7 and word.endswith("oušek"): return word[:-5] if len(word) > 6: - if word[-4:] in ("eček", "éček", "iček", "íček", "enek", "ének", - "inek", "ínek"): + if word[-4:] in ("eček", "éček", "iček", "íček", "enek", "ének", "inek", "ínek"): return _palatalize(word[:-3]) - if word[-4:] in ("áček", "aček", "oček", "uček", "anek", "onek", - "unek", "ánek"): + if word[-4:] in ("áček", "aček", "oček", "uček", "anek", "onek", "unek", "ánek"): return _palatalize(word[:-4]) if len(word) > 5: - if word[-3:] in ("ečk", "éčk", "ičk", "íčk", "enk", "énk", - "ink", "ínk"): + if word[-3:] in ("ečk", "éčk", "ičk", "íčk", "enk", "énk", "ink", "ínk"): return _palatalize(word[:-3]) - if word[-3:] in ("áčk", "ačk", "očk", "učk", "ank", "onk", - "unk", "átk", "ánk", "ušk"): + if word[-3:] in ("áčk", "ačk", "očk", "učk", "ank", "onk", "unk", "átk", "ánk", "ušk"): return word[:-3] if len(word) > 4: if word[-2:] in ("ek", "ék", "ík", "ik"): @@ -144,8 +136,7 @@ def _remove_derivational(word): if word[-5:] in ("ovisk", "ovstv", "ovišt", "ovník"): return word[:-5] if len(word) > 6: - if word[-4:] in ("ásek", "loun", "nost", "teln", "ovec", "ovík", - "ovtv", "ovin", "štin"): + if word[-4:] in ("ásek", "loun", "nost", "teln", "ovec", "ovík", "ovtv", "ovin", "štin"): return word[:-4] if word[-4:] in ("enic", "inec", "itel"): return _palatalize(word[:-3]) @@ -154,18 +145,14 @@ def _remove_derivational(word): return word[:-3] if word[-3:] in ("ěnk", "ián", "ist", "isk", "išt", "itb", 
"írn"): return _palatalize(word[:-2]) - if word[-3:] in ("och", "ost", "ovn", "oun", "out", "ouš", - "ušk", "kyn", "čan", "kář", "néř", "ník", - "ctv", "stv"): + if word[-3:] in ("och", "ost", "ovn", "oun", "out", "ouš", "ušk", "kyn", "čan", "kář", "néř", "ník", "ctv", "stv"): return word[:-3] if len(word) > 4: if word[-2:] in ("áč", "ač", "án", "an", "ář", "as"): return word[:-2] - if word[-2:] in ("ec", "en", "ěn", "éř", "íř", "ic", "in", "ín", - "it", "iv"): + if word[-2:] in ("ec", "en", "ěn", "éř", "íř", "ic", "in", "ín", "it", "iv"): return _palatalize(word[:-1]) - if word[-2:] in ("ob", "ot", "ov", "oň", "ul", "yn", "čk", "čn", - "dl", "nk", "tv", "tk", "vk"): + if word[-2:] in ("ob", "ot", "ov", "oň", "ul", "yn", "čk", "čn", "dl", "nk", "tv", "tk", "vk"): return word[:-2] if len(word) > 3 and word[-1] in "cčklnt": return word[:-1] @@ -188,7 +175,7 @@ def _palatalize(word): return word[:-1] -if __name__ == '__main__': +if __name__ == "__main__": if len(sys.argv) != 2 or sys.argv[1] not in ("light", "aggressive"): sys.exit(__doc__) diff --git a/sumy/nlp/tokenizers.py b/sumy/nlp/tokenizers.py index 3fcafd0d..82ec0be2 100644 --- a/sumy/nlp/tokenizers.py +++ b/sumy/nlp/tokenizers.py @@ -1,4 +1,5 @@ import re + import nltk @@ -14,8 +15,8 @@ class Tokenizer: # improve tokenizer by adding specific abbreviations it has issues with # note the final point in these items must not be included LANGUAGE_EXTRA_ABREVS = { - "english": ['e.g', 'al', 'i.e'], - "german": ['al', 'z.B', 'Inc','engl','z. B', 'vgl', 'lat', 'bzw', 'S'], + "english": ["e.g", "al", "i.e"], + "german": ["al", "z.B", "Inc", "engl", "z. 
B", "vgl", "lat", "bzw", "S"], } def __init__(self, language): diff --git a/sumy/parsers/__init__.py b/sumy/parsers/__init__.py index 28ceeeb7..e286a67d 100644 --- a/sumy/parsers/__init__.py +++ b/sumy/parsers/__init__.py @@ -1 +1 @@ -from .parser import DocumentParser +from .parser import DocumentParser as DocumentParser diff --git a/sumy/parsers/html.py b/sumy/parsers/html.py index 2aa00350..4fa9010b 100644 --- a/sumy/parsers/html.py +++ b/sumy/parsers/html.py @@ -1,8 +1,9 @@ from urllib import request as urllib_request from breadability.readable import Article + +from ..models.dom import ObjectDocumentModel, Paragraph, Sentence from ..utils import cached_property -from ..models.dom import Sentence, Paragraph, ObjectDocumentModel from .parser import DocumentParser @@ -10,8 +11,11 @@ class HtmlParser(DocumentParser): """Parser of text from HTML format into DOM.""" SIGNIFICANT_TAGS = ( - "h1", "h2", "h3", - "b", "strong", + "h1", + "h2", + "h3", + "b", + "strong", "big", "dfn", "em", diff --git a/sumy/parsers/plaintext.py b/sumy/parsers/plaintext.py index 1b4b74b3..70b2ddf3 100644 --- a/sumy/parsers/plaintext.py +++ b/sumy/parsers/plaintext.py @@ -1,5 +1,5 @@ +from ..models.dom import ObjectDocumentModel, Paragraph, Sentence from ..utils import cached_property -from ..models.dom import Sentence, Paragraph, ObjectDocumentModel from .parser import DocumentParser diff --git a/sumy/summarizers/__init__.py b/sumy/summarizers/__init__.py index a1bcb2a9..8f327957 100644 --- a/sumy/summarizers/__init__.py +++ b/sumy/summarizers/__init__.py @@ -1 +1 @@ -from ._summarizer import AbstractSummarizer +from ._summarizer import AbstractSummarizer as AbstractSummarizer diff --git a/sumy/summarizers/_summarizer.py b/sumy/summarizers/_summarizer.py index f3f4737f..3ee43c64 100644 --- a/sumy/summarizers/_summarizer.py +++ b/sumy/summarizers/_summarizer.py @@ -1,10 +1,17 @@ from collections import namedtuple from operator import attrgetter -from ..utils import ItemsCount -from 
..nlp.stemmers import null_stemmer +from ..nlp.stemmers import null_stemmer +from ..utils import ItemsCount -SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rating",)) +SentenceInfo = namedtuple( + "SentenceInfo", + ( + "sentence", + "order", + "rating", + ), +) class AbstractSummarizer: @@ -27,10 +34,11 @@ def _get_best_sentences(self, sentences, count, rating, *args, **kwargs): rate = rating if isinstance(rating, dict): assert not args and not kwargs - rate = lambda s: rating[s] - infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) - for o, s in enumerate(sentences)) + def rate(s): + return rating[s] + + infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences)) # sort sentences by rating in descending order infos = sorted(infos, key=attrgetter("rating"), reverse=True) diff --git a/sumy/summarizers/edmundson.py b/sumy/summarizers/edmundson.py index 61eec5b7..fcace9f8 100644 --- a/sumy/summarizers/edmundson.py +++ b/sumy/summarizers/edmundson.py @@ -1,11 +1,11 @@ from collections import defaultdict + from ..nlp.stemmers import null_stemmer from ._summarizer import AbstractSummarizer from .edmundson_cue import EdmundsonCueMethod from .edmundson_key import EdmundsonKeyMethod -from .edmundson_title import EdmundsonTitleMethod from .edmundson_location import EdmundsonLocationMethod - +from .edmundson_title import EdmundsonTitleMethod _EMPTY_SET = frozenset() @@ -15,12 +15,10 @@ class EdmundsonSummarizer(AbstractSummarizer): _stigma_words = _EMPTY_SET _null_words = _EMPTY_SET - def __init__(self, stemmer=null_stemmer, cue_weight=1.0, key_weight=0.0, - title_weight=1.0, location_weight=1.0): + def __init__(self, stemmer=null_stemmer, cue_weight=1.0, key_weight=0.0, title_weight=1.0, location_weight=1.0): super().__init__(stemmer) - self._ensure_correct_weights(cue_weight, key_weight, title_weight, - location_weight) + self._ensure_correct_weights(cue_weight, key_weight, title_weight, location_weight) self._cue_weight = 
float(cue_weight) self._key_weight = float(key_weight) @@ -84,8 +82,7 @@ def _update_ratings(self, ratings, new_ratings): def cue_method(self, document, sentences_count, bunus_word_value=1, stigma_word_value=1): summarization_method = self._build_cue_method_instance() - return summarization_method(document, sentences_count, bunus_word_value, - stigma_word_value) + return summarization_method(document, sentences_count, bunus_word_value, stigma_word_value) def _build_cue_method_instance(self): self.__check_bonus_words() @@ -100,7 +97,7 @@ def key_method(self, document, sentences_count, weight=0.5): def _build_key_method_instance(self): self.__check_bonus_words() - return EdmundsonKeyMethod(self._stemmer, self._bonus_words) + return EdmundsonKeyMethod(self._stemmer, self._bonus_words) def title_method(self, document, sentences_count): summarization_method = self._build_title_method_instance() diff --git a/sumy/summarizers/edmundson_cue.py b/sumy/summarizers/edmundson_cue.py index c124bfde..1c9dc6c9 100644 --- a/sumy/summarizers/edmundson_cue.py +++ b/sumy/summarizers/edmundson_cue.py @@ -8,9 +8,7 @@ def __init__(self, stemmer, bonus_words, stigma_words): self._stigma_words = stigma_words def __call__(self, document, sentences_count, bunus_word_weight, stigma_word_weight): - return self._get_best_sentences(document.sentences, - sentences_count, self._rate_sentence, bunus_word_weight, - stigma_word_weight) + return self._get_best_sentences(document.sentences, sentences_count, self._rate_sentence, bunus_word_weight, stigma_word_weight) def _rate_sentence(self, sentence, bunus_word_weight, stigma_word_weight): # count number of bonus/stigma words in sentece @@ -18,8 +16,8 @@ def _rate_sentence(self, sentence, bunus_word_weight, stigma_word_weight): bonus_words_count, stigma_words_count = self._count_words(words) # compute positive & negative rating - bonus_rating = bonus_words_count*bunus_word_weight - stigma_rating = stigma_words_count*stigma_word_weight + bonus_rating = 
bonus_words_count * bunus_word_weight + stigma_rating = stigma_words_count * stigma_word_weight # rating of sentence is (positive - negative) rating return bonus_rating - stigma_rating @@ -38,7 +36,7 @@ def _count_words(self, words): for word in words: if word in self._bonus_words: - bonus_words_count +=1 + bonus_words_count += 1 if word in self._stigma_words: stigma_words_count += 1 @@ -47,7 +45,6 @@ def _count_words(self, words): def rate_sentences(self, document, bunus_word_weight=1, stigma_word_weight=1): rated_sentences = {} for sentence in document.sentences: - rated_sentences[sentence] = self._rate_sentence(sentence, - bunus_word_weight, stigma_word_weight) + rated_sentences[sentence] = self._rate_sentence(sentence, bunus_word_weight, stigma_word_weight) return rated_sentences diff --git a/sumy/summarizers/edmundson_key.py b/sumy/summarizers/edmundson_key.py index 23ff282b..6ca66a0b 100644 --- a/sumy/summarizers/edmundson_key.py +++ b/sumy/summarizers/edmundson_key.py @@ -1,4 +1,5 @@ from collections import Counter + from ._summarizer import AbstractSummarizer @@ -10,8 +11,7 @@ def __init__(self, stemmer, bonus_words): def __call__(self, document, sentences_count, weight): significant_words = self._compute_significant_words(document, weight) - return self._get_best_sentences(document.sentences, - sentences_count, self._rate_sentence, significant_words) + return self._get_best_sentences(document.sentences, sentences_count, self._rate_sentence, significant_words) def _compute_significant_words(self, document, weight): # keep only stems contained in bonus words @@ -28,8 +28,7 @@ def _compute_significant_words(self, document, weight): # return only words greater than weight max_word_frequency = max(word_frequencies) - return tuple(word for word, frequency in word_counts.items() - if frequency/max_word_frequency > weight) + return tuple(word for word, frequency in word_counts.items() if frequency / max_word_frequency > weight) def _is_bonus_word(self, word): 
return word in self._bonus_words @@ -43,7 +42,6 @@ def rate_sentences(self, document, weight=0.5): rated_sentences = {} for sentence in document.sentences: - rated_sentences[sentence] = self._rate_sentence(sentence, - significant_words) + rated_sentences[sentence] = self._rate_sentence(sentence, significant_words) return rated_sentences diff --git a/sumy/summarizers/edmundson_location.py b/sumy/summarizers/edmundson_location.py index ff6c439b..a01bf5f5 100644 --- a/sumy/summarizers/edmundson_location.py +++ b/sumy/summarizers/edmundson_location.py @@ -1,5 +1,6 @@ from itertools import chain, filterfalse from operator import attrgetter + from ._summarizer import AbstractSummarizer @@ -10,8 +11,7 @@ def __init__(self, stemmer, null_words): def __call__(self, document, sentences_count, w_h, w_p1, w_p2, w_s1, w_s2): significant_words = self._compute_significant_words(document) - ratings = self._rate_sentences(document, significant_words, w_h, w_p1, - w_p2, w_s1, w_s2) + ratings = self._rate_sentences(document, significant_words, w_h, w_p1, w_p2, w_s1, w_s2) return self._get_best_sentences(document.sentences, sentences_count, ratings) diff --git a/sumy/summarizers/edmundson_title.py b/sumy/summarizers/edmundson_title.py index fbdfb57a..9a87d0a9 100644 --- a/sumy/summarizers/edmundson_title.py +++ b/sumy/summarizers/edmundson_title.py @@ -1,5 +1,6 @@ -from operator import attrgetter from itertools import chain, filterfalse +from operator import attrgetter + from ._summarizer import AbstractSummarizer @@ -12,8 +13,7 @@ def __call__(self, document, sentences_count): sentences = document.sentences significant_words = self._compute_significant_words(document) - return self._get_best_sentences(sentences, sentences_count, - self._rate_sentence, significant_words) + return self._get_best_sentences(sentences, sentences_count, self._rate_sentence, significant_words) def _compute_significant_words(self, document): heading_words = map(attrgetter("words"), document.headings) @@ 
-36,7 +36,6 @@ def rate_sentences(self, document): rated_sentences = {} for sentence in document.sentences: - rated_sentences[sentence] = self._rate_sentence(sentence, - significant_words) + rated_sentences[sentence] = self._rate_sentence(sentence, significant_words) return rated_sentences diff --git a/sumy/summarizers/kl.py b/sumy/summarizers/kl.py index aad7714e..10e5a6ed 100644 --- a/sumy/summarizers/kl.py +++ b/sumy/summarizers/kl.py @@ -1,7 +1,6 @@ import math from ._summarizer import AbstractSummarizer -from ..utils import get_stop_words class KLSummarizer(AbstractSummarizer): @@ -48,9 +47,9 @@ def _get_all_content_words_in_doc(self, sentences): return normalized_content_words def _compute_tf(self, sentences): - ''' + """ Computes the normalized term frequency as explained in http://www.tfidf.com/ - ''' + """ content_words = self._get_all_content_words_in_doc(sentences) content_words_count = len(content_words) content_words_freq = self._compute_word_freq(content_words) @@ -72,7 +71,8 @@ def _joint_freq(self, word_list_1, word_list_2): for k in wc2: if k in joint: joint[k] += wc2[k] - else: joint[k] = wc2[k] + else: + joint[k] = wc2[k] # divides total counts by the combined length for k in joint: @@ -81,19 +81,19 @@ def _joint_freq(self, word_list_1, word_list_2): return joint def _kl_divergence(self, summary_freq, doc_freq): - ''' + """ Note: Could import scipy.stats and use scipy.stats.entropy(doc_freq, summary_freq) but this gives equivalent value without the import - ''' + """ sum_val = 0 for w in summary_freq: sum_val += doc_freq[w] * math.log(doc_freq[w] / summary_freq[w]) return sum_val def _find_index_of_best_sentence(self, kls): - ''' + """ the best sentence is the one with the smallest kl_divergence - ''' + """ indexToRemove = kls.index(min(kls)) return indexToRemove @@ -130,6 +130,6 @@ def _compute_ratings(self, sentences): summary.append(best_sentence) # value is the iteration in which it was removed multiplied by -1 so that the first sentences 
removed (the most important) have highest values - ratings[best_sentence] = -1 * len(ratings) + ratings[best_sentence] = -1 * len(ratings) return ratings diff --git a/sumy/summarizers/lex_rank.py b/sumy/summarizers/lex_rank.py index c6966b67..fbe16498 100644 --- a/sumy/summarizers/lex_rank.py +++ b/sumy/summarizers/lex_rank.py @@ -14,6 +14,7 @@ class LexRankSummarizer(AbstractSummarizer): LexRank: Graph-based Centrality as Salience in Text Summarization Source: http://tangra.si.umich.edu/~radev/lexrank/lexrank.pdf """ + threshold = 0.1 epsilon = 0.1 _stop_words = frozenset() @@ -87,7 +88,7 @@ def _create_matrix(self, sentences, threshold, tf_metrics, idf_metrics): # create matrix |sentences|x|sentences| filled with zeroes sentences_count = len(sentences) matrix = numpy.zeros((sentences_count, sentences_count)) - degrees = numpy.zeros((sentences_count, )) + degrees = numpy.zeros((sentences_count,)) for row, (sentence1, tf1) in enumerate(zip(sentences, tf_metrics)): for col, (sentence2, tf2) in enumerate(zip(sentences, tf_metrics)): @@ -114,10 +115,10 @@ def _compute_cosine(sentence1, sentence2, tf1, tf2, idf_metrics): numerator = 0.0 for term in common_words: - numerator += tf1[term]*tf2[term] * idf_metrics[term]**2 + numerator += tf1[term] * tf2[term] * idf_metrics[term] ** 2 - denominator1 = sum((tf1[t]*idf_metrics[t])**2 for t in sentence1) - denominator2 = sum((tf2[t]*idf_metrics[t])**2 for t in sentence2) + denominator1 = sum((tf1[t] * idf_metrics[t]) ** 2 for t in sentence1) + denominator2 = sum((tf2[t] * idf_metrics[t]) ** 2 for t in sentence2) if denominator1 > 0 and denominator2 > 0: return numerator / (math.sqrt(denominator1) * math.sqrt(denominator2)) diff --git a/sumy/summarizers/lsa.py b/sumy/summarizers/lsa.py index cd889e50..76ac2c9c 100644 --- a/sumy/summarizers/lsa.py +++ b/sumy/summarizers/lsa.py @@ -1,5 +1,4 @@ import math - from warnings import warn try: @@ -16,7 +15,7 @@ class LsaSummarizer(AbstractSummarizer): MIN_DIMENSIONS = 3 - 
REDUCTION_RATIO = 1/1 + REDUCTION_RATIO = 1 / 1 _stop_words = frozenset() @property @@ -40,8 +39,7 @@ def __call__(self, document, sentences_count): u, sigma, v = singular_value_decomposition(matrix, full_matrices=False) ranks = iter(self._compute_ranks(sigma, v)) - return self._get_best_sentences(document.sentences, sentences_count, - lambda s: next(ranks)) + return self._get_best_sentences(document.sentences, sentences_count, lambda s: next(ranks)) def _ensure_dependecies_installed(self): if numpy is None: @@ -64,10 +62,7 @@ def _create_matrix(self, document, dictionary): words_count = len(dictionary) sentences_count = len(sentences) if words_count < sentences_count: - warn( - f"Number of words ({words_count}) is lower than number of sentences ({sentences_count}). " - "LSA algorithm may not work properly." - ) + warn(f"Number of words ({words_count}) is lower than number of sentences ({sentences_count}). LSA algorithm may not work properly.") # create matrix |unique words|x|sentences| filled with zeroes matrix = numpy.zeros((words_count, sentences_count)) @@ -94,23 +89,21 @@ def _compute_term_frequency(self, matrix, smooth=0.4): for col in range(cols): max_word_frequency = max_word_frequencies[col] if max_word_frequency != 0: - frequency = matrix[row, col]/max_word_frequency - matrix[row, col] = smooth + (1.0 - smooth)*frequency + frequency = matrix[row, col] / max_word_frequency + matrix[row, col] = smooth + (1.0 - smooth) * frequency return matrix def _compute_ranks(self, sigma, v_matrix): assert len(sigma) == v_matrix.shape[0], "Matrices should be multiplicable" - dimensions = max(LsaSummarizer.MIN_DIMENSIONS, - int(len(sigma)*LsaSummarizer.REDUCTION_RATIO)) - powered_sigma = tuple(s**2 if i < dimensions else 0.0 - for i, s in enumerate(sigma)) + dimensions = max(LsaSummarizer.MIN_DIMENSIONS, int(len(sigma) * LsaSummarizer.REDUCTION_RATIO)) + powered_sigma = tuple(s**2 if i < dimensions else 0.0 for i, s in enumerate(sigma)) ranks = [] # iterate over columns 
of matrix (rows of transposed matrix) for column_vector in v_matrix.T: - rank = sum(s*v**2 for s, v in zip(powered_sigma, column_vector)) + rank = sum(s * v**2 for s, v in zip(powered_sigma, column_vector)) ranks.append(math.sqrt(rank)) return ranks diff --git a/sumy/summarizers/luhn.py b/sumy/summarizers/luhn.py index 99a209c3..d6dcfaae 100644 --- a/sumy/summarizers/luhn.py +++ b/sumy/summarizers/luhn.py @@ -18,8 +18,7 @@ def stop_words(self, words): def __call__(self, document, sentences_count): words = self._get_significant_words(document.words) - return self._get_best_sentences(document.sentences, - sentences_count, self.rate_sentence, words) + return self._get_best_sentences(document.sentences, sentences_count, self.rate_sentence, words) def _get_significant_words(self, words): words = map(self.normalize_word, words) @@ -40,7 +39,7 @@ def rate_sentence(self, sentence, significant_stems): def _get_chunk_ratings(self, sentence, significant_stems): chunks = [] - NONSIGNIFICANT_CHUNK = [0]*self.max_gap_size + NONSIGNIFICANT_CHUNK = [0] * self.max_gap_size in_chunk = False for order, word in enumerate(sentence.words): @@ -55,7 +54,7 @@ def _get_chunk_ratings(self, sentence, significant_stems): chunks[-1].append(is_significant_word) # end of chunk - if chunks and chunks[-1][-self.max_gap_size:] == NONSIGNIFICANT_CHUNK: + if chunks and chunks[-1][-self.max_gap_size :] == NONSIGNIFICANT_CHUNK: in_chunk = False return tuple(map(self._get_chunk_rating, chunks)) @@ -77,4 +76,4 @@ def __remove_trailing_zeros(self, collection): while index >= 0 and collection[index] == 0: index -= 1 - return collection[:index + 1] + return collection[: index + 1] diff --git a/sumy/summarizers/sum_basic.py b/sumy/summarizers/sum_basic.py index 0916054f..07007010 100644 --- a/sumy/summarizers/sum_basic.py +++ b/sumy/summarizers/sum_basic.py @@ -1,7 +1,4 @@ -import math - from ._summarizer import AbstractSummarizer -from ..utils import get_stop_words class 
SumBasicSummarizer(AbstractSummarizer): @@ -44,9 +41,9 @@ def _get_all_content_words_in_doc(self, sentences): return normalized_content_words def _compute_tf(self, sentences): - ''' + """ Computes the normalized term frequency as explained in http://www.tfidf.com/ - ''' + """ content_words = self._get_all_content_words_in_doc(sentences) content_words_count = len(content_words) content_words_freq = self._compute_word_freq(content_words) @@ -73,7 +70,7 @@ def _find_index_of_best_sentence(self, word_freq, sentences_as_words): best_sentence_index = 0 for i, words in enumerate(sentences_as_words): word_freq_avg = self._compute_average_probability_of_words(word_freq, words) - if (word_freq_avg > max_value): + if word_freq_avg > max_value: max_value = word_freq_avg best_sentence_index = i return best_sentence_index @@ -94,7 +91,7 @@ def _compute_ratings(self, sentences): best_sentence = sentences_list.pop(best_sentence_index) # value is the iteration in which it was removed multiplied by -1 so that the first sentences removed (the most important) have highest values - ratings[best_sentence] = -1 * len(ratings) + ratings[best_sentence] = -1 * len(ratings) # update probabilities best_sentence_words = sentences_as_words.pop(best_sentence_index) diff --git a/sumy/summarizers/text_rank.py b/sumy/summarizers/text_rank.py index 66fd905c..33dfbd47 100644 --- a/sumy/summarizers/text_rank.py +++ b/sumy/summarizers/text_rank.py @@ -1,7 +1,7 @@ import math - -from itertools import combinations from collections import defaultdict +from itertools import combinations + from ._summarizer import AbstractSummarizer diff --git a/sumy/utils.py b/sumy/utils.py index d1bb7f21..ab813f1c 100644 --- a/sumy/utils.py +++ b/sumy/utils.py @@ -1,7 +1,6 @@ import sys - from functools import wraps -from os.path import dirname, abspath, join, exists +from os.path import abspath, dirname, exists, join def cached_property(getter): @@ -14,6 +13,7 @@ def cached_property(getter): functools.cached_property 
because functools.cached_property does NOT work with __slots__-based classes (Sentence, Paragraph, etc.). """ + @wraps(getter) def decorator(self): key = "_cached_property_" + getter.__name__ @@ -54,12 +54,12 @@ def __call__(self, sequence): total_count = len(sequence) percentage = int(self._value[:-1]) # at least one sentence should be choosen - count = max(1, total_count*percentage // 100) + count = max(1, total_count * percentage // 100) return sequence[:count] else: - return sequence[:int(self._value)] + return sequence[: int(self._value)] elif isinstance(self._value, (int, float)): - return sequence[:int(self._value)] + return sequence[: int(self._value)] else: ValueError(f"Unsuported value of items count '{self._value}'.") diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 94e04d71..c1723eb2 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,13 +1,11 @@ import unittest +from sumy.evaluation import cosine_similarity, f_score, precision, recall, rouge_l_sentence_level, rouge_l_summary_level, rouge_n, unit_overlap +from sumy.evaluation.rouge import _get_ngrams, _get_word_ngrams, _len_lcs, _recon_lcs, _split_into_words, _union_lcs +from sumy.models import TfDocumentModel from sumy.nlp.tokenizers import Tokenizer from sumy.parsers.plaintext import PlaintextParser -from sumy.models.dom._sentence import Sentence -from sumy.models import TfDocumentModel -from sumy.evaluation import precision, recall, f_score -from sumy.evaluation import cosine_similarity, unit_overlap -from sumy.evaluation import rouge_n, rouge_l_sentence_level, rouge_l_summary_level -from sumy.evaluation.rouge import _get_ngrams, _split_into_words, _get_word_ngrams, _len_lcs, _recon_lcs, _union_lcs + class TestCoselectionEvaluation(unittest.TestCase): def test_precision_empty_evaluated(self): @@ -38,10 +36,10 @@ def test_precision_equals(self): self.assertAlmostEqual(result, 1.0) def test_recall_empty_evaluated(self): - self.assertRaises(ValueError, 
recall, (), ("s1", "s2", "s3", "s4", "s5")) + self.assertRaises(ValueError, recall, (), ("s1", "s2", "s3", "s4", "s5")) def test_recall_empty_reference(self): - self.assertRaises(ValueError, recall, ("s1", "s2", "s3", "s4", "s5"), ()) + self.assertRaises(ValueError, recall, ("s1", "s2", "s3", "s4", "s5"), ()) def test_recall_no_match(self): result = recall(("s1", "s2", "s3", "s4", "s5"), ("s6", "s7", "s8")) @@ -78,12 +76,12 @@ def test_basic_f_score_no_match(self): def test_basic_f_score_reference_smaller(self): result = f_score(("s1", "s2", "s3", "s4", "s5"), ("s1",)) - self.assertAlmostEqual(result, 1/3) + self.assertAlmostEqual(result, 1 / 3) def test_basic_f_score_evaluated_smaller(self): result = f_score(("s1",), ("s1", "s2", "s3", "s4", "s5")) - self.assertAlmostEqual(result, 1/3) + self.assertAlmostEqual(result, 1 / 3) def test_basic_f_score_equals(self): sentences = ("s1", "s2", "s3", "s4", "s5") @@ -95,8 +93,8 @@ def test_f_score_1(self): sentences = (("s1",), ("s1", "s2", "s3", "s4", "s5")) result = f_score(*sentences, weight=2.0) - p = 1/1 - r = 1/5 + p = 1 / 1 + r = 1 / 5 # ( (W^2 + 1) * P * R ) / ( W^2 * P + R ) expected = (5 * p * r) / (4 * p + r) @@ -106,8 +104,8 @@ def test_f_score_2(self): sentences = (("s1", "s3", "s6"), ("s1", "s2", "s3", "s4", "s5")) result = f_score(*sentences, weight=0.5) - p = 2/3 - r = 2/5 + p = 2 / 3 + r = 2 / 5 # ( (W^2 + 1) * P * R ) / ( W^2 * P + R ) expected = (1.25 * p * r) / (0.25 * p + r) @@ -140,19 +138,15 @@ def test_cosine_exact_match(self): def test_cosine_no_match(self): tokenizer = Tokenizer("czech") - model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", - tokenizer) - model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", - tokenizer) + model1 = TfDocumentModel("Toto je moja veta. 
To sa nedá poprieť!", tokenizer) + model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", tokenizer) self.assertAlmostEqual(cosine_similarity(model1, model2), 0.0) def test_cosine_half_match(self): tokenizer = Tokenizer("czech") - model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", - tokenizer) - model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", - tokenizer) + model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", tokenizer) + model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", tokenizer) self.assertAlmostEqual(cosine_similarity(model1, model2), 0.5) @@ -178,63 +172,50 @@ def test_unit_overlap_exact_match(self): def test_unit_overlap_no_match(self): tokenizer = Tokenizer("czech") - model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", - tokenizer) - model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", - tokenizer) + model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", tokenizer) + model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", tokenizer) self.assertAlmostEqual(unit_overlap(model1, model2), 0.0) def test_unit_overlap_half_match(self): tokenizer = Tokenizer("czech") - model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", - tokenizer) - model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", - tokenizer) + model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", tokenizer) + model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", tokenizer) - self.assertAlmostEqual(unit_overlap(model1, model2), 1/3) + self.assertAlmostEqual(unit_overlap(model1, model2), 1 / 3) class TestRougeEvaluation(unittest.TestCase): def test_get_ngrams(self): self.assertTrue(not _get_ngrams(3, "")) - correct_ngrams = [("t", "e"), ("e", "s"), ("s", "t"), - ("t", "i"), ("i", "n"), ("n", "g")] + correct_ngrams = [("t", "e"), ("e", "s"), ("s", "t"), ("t", "i"), ("i", "n"), ("n", "g")] found_ngrams = _get_ngrams(2, "testing") 
self.assertEqual(len(correct_ngrams), len(found_ngrams)) for ngram in correct_ngrams: - self.assertTrue(ngram in found_ngrams) + self.assertTrue(ngram in found_ngrams) def test_split_into_words(self): - sentences1 = PlaintextParser.from_string("One, two two. Two. Three.", - Tokenizer("english")).document.sentences - self.assertEqual(["One", "two", "two", "Two", "Three"], - _split_into_words(sentences1)) - - sentences2 = PlaintextParser.from_string("two two. Two. Three.", - Tokenizer("english")).document.sentences - self.assertEqual(["two", "two", "Two", "Three"], - _split_into_words(sentences2)) + sentences1 = PlaintextParser.from_string("One, two two. Two. Three.", Tokenizer("english")).document.sentences + self.assertEqual(["One", "two", "two", "Two", "Three"], _split_into_words(sentences1)) + + sentences2 = PlaintextParser.from_string("two two. Two. Three.", Tokenizer("english")).document.sentences + self.assertEqual(["two", "two", "Two", "Three"], _split_into_words(sentences2)) def test_get_word_ngrams(self): - sentences = PlaintextParser.from_string("This is a test.", - Tokenizer("english")).document.sentences + sentences = PlaintextParser.from_string("This is a test.", Tokenizer("english")).document.sentences correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")] found_ngrams = _get_word_ngrams(2, sentences) for ngram in correct_ngrams: - self.assertTrue(ngram in found_ngrams) + self.assertTrue(ngram in found_ngrams) def test_len_lcs(self): self.assertEqual(_len_lcs("1234", "1224533324"), 4) self.assertEqual(_len_lcs("thisisatest", "testing123testing"), 7) - def test_recon_lcs(self): self.assertEqual(_recon_lcs("1234", "1224533324"), ("1", "2", "3", "4")) - self.assertEqual(_recon_lcs("thisisatest", "testing123testing"), - ("t", "s", "i", "t", "e", "s", "t")) - + self.assertEqual(_recon_lcs("thisisatest", "testing123testing"), ("t", "s", "i", "t", "e", "s", "t")) def test_rouge_n(self): candidate_text = "pulses may ease schizophrenic voices" @@ -243,22 
+224,21 @@ def test_rouge_n(self): reference1_text = "magnetic pulse series sent through brain may ease schizophrenic voices" reference1 = PlaintextParser(reference1_text, Tokenizer("english")).document.sentences - reference2_text = "yale finds magnetic stimulation some relief to schizophrenics imaginary voices"; + reference2_text = "yale finds magnetic stimulation some relief to schizophrenics imaginary voices" - reference2 = PlaintextParser.from_string(reference2_text, - Tokenizer("english")).document.sentences + reference2 = PlaintextParser.from_string(reference2_text, Tokenizer("english")).document.sentences - self.assertAlmostEqual(rouge_n(candidate, reference1, 1), 4/10) - self.assertAlmostEqual(rouge_n(candidate, reference2, 1), 1/10) + self.assertAlmostEqual(rouge_n(candidate, reference1, 1), 4 / 10) + self.assertAlmostEqual(rouge_n(candidate, reference2, 1), 1 / 10) - self.assertAlmostEqual(rouge_n(candidate, reference1, 2), 3/9) - self.assertAlmostEqual(rouge_n(candidate, reference2, 2), 0/9) + self.assertAlmostEqual(rouge_n(candidate, reference1, 2), 3 / 9) + self.assertAlmostEqual(rouge_n(candidate, reference2, 2), 0 / 9) - self.assertAlmostEqual(rouge_n(candidate, reference1, 3), 2/8) - self.assertAlmostEqual(rouge_n(candidate, reference2, 3), 0/8) + self.assertAlmostEqual(rouge_n(candidate, reference1, 3), 2 / 8) + self.assertAlmostEqual(rouge_n(candidate, reference2, 3), 0 / 8) - self.assertAlmostEqual(rouge_n(candidate, reference1, 4), 1/7) - self.assertAlmostEqual(rouge_n(candidate, reference2, 4), 0/7) + self.assertAlmostEqual(rouge_n(candidate, reference1, 4), 1 / 7) + self.assertAlmostEqual(rouge_n(candidate, reference2, 4), 0 / 7) # These tests will apply when multiple reference summaries can be input # self.assertAlmostEqual(rouge_n(candidate, [reference1, reference2], 1), 5/20) @@ -266,24 +246,22 @@ def test_rouge_n(self): # self.assertAlmostEqual(rouge_n(candidate, [reference1, reference2], 3), 2/16) # 
self.assertAlmostEqual(rouge_n(candidate, [reference1, reference2], 4), 1/14) - def test_rouge_l_sentence_level(self): reference_text = "police killed the gunman" reference = PlaintextParser(reference_text, Tokenizer("english")).document.sentences candidate1_text = "police kill the gunman" candidate1 = PlaintextParser(candidate1_text, Tokenizer("english")).document.sentences - + candidate2_text = "the gunman kill police" candidate2 = PlaintextParser(candidate2_text, Tokenizer("english")).document.sentences - + candidate3_text = "the gunman police killed" - candidate3 = PlaintextParser(candidate3_text, Tokenizer("english")).document.sentences - - self.assertAlmostEqual(rouge_l_sentence_level(candidate1, reference), 3/4) - self.assertAlmostEqual(rouge_l_sentence_level(candidate2, reference), 2/4) - self.assertAlmostEqual(rouge_l_sentence_level(candidate2, reference), 2/4) + _candidate3 = PlaintextParser(candidate3_text, Tokenizer("english")).document.sentences + self.assertAlmostEqual(rouge_l_sentence_level(candidate1, reference), 3 / 4) + self.assertAlmostEqual(rouge_l_sentence_level(candidate2, reference), 2 / 4) + self.assertAlmostEqual(rouge_l_sentence_level(candidate2, reference), 2 / 4) def test_union_lcs(self): reference_text = "one two three four five" @@ -292,7 +270,7 @@ def test_union_lcs(self): candidate_text = "one two six seven eight. one three eight nine five." candidates = PlaintextParser(candidate_text, Tokenizer("english")).document.sentences - self.assertAlmostEqual(_union_lcs(candidates, reference[0]), 4/5) + self.assertAlmostEqual(_union_lcs(candidates, reference[0]), 4 / 5) def test_rouge_l_summary_level(self): reference_text = "one two three four five. one two three four five." 
diff --git a/tests/test_main.py b/tests/test_main.py index 6dd60d8c..bfa7b749 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,11 +1,10 @@ -import unittest import argparse -import sys +import unittest +from io import StringIO + import pytest -from sumy.__main__ import main, handle_arguments, AVAILABLE_METHODS -from sumy import __version__ -from io import StringIO +from sumy.__main__ import handle_arguments, main class TestMain(unittest.TestCase): diff --git a/tests/test_models/test_dom.py b/tests/test_models/test_dom.py index 3cdcb9ba..a6ec9b0e 100644 --- a/tests/test_models/test_dom.py +++ b/tests/test_models/test_dom.py @@ -1,31 +1,23 @@ import unittest -from sumy.nlp.tokenizers import Tokenizer from sumy.models.dom import Paragraph, Sentence +from sumy.nlp.tokenizers import Tokenizer + from ..utils import build_document, build_document_from_string class TestDocument(unittest.TestCase): def test_unique_words(self): document = build_document( - ("Nějaký muž šel kolem naší zahrady", "Nějaký muž šel kolem vaší zahrady",), + ( + "Nějaký muž šel kolem naší zahrady", + "Nějaký muž šel kolem vaší zahrady", + ), ("Už už abych taky šel",), ) returned = tuple(sorted(frozenset(document.words))) - expected = ( - "Nějaký", - "Už", - "abych", - "kolem", - "muž", - "naší", - "taky", - "už", - "vaší", - "zahrady", - "šel" - ) + expected = ("Nějaký", "Už", "abych", "kolem", "muž", "naší", "taky", "už", "vaší", "zahrady", "šel") self.assertEqual(expected, returned) def test_headings(self): @@ -50,12 +42,9 @@ def test_sentences(self): """) self.assertEqual(len(document.sentences), 3) - self.assertEqual(str(document.sentences[0]), - "Nějaký muž šel kolem naší zahrady") - self.assertEqual(str(document.sentences[1]), - "Nějaký jiný muž šel kolem vaší zahrady") - self.assertEqual(str(document.sentences[2]), - "Už už abych taky šel") + self.assertEqual(str(document.sentences[0]), "Nějaký muž šel kolem naší zahrady") + self.assertEqual(str(document.sentences[1]), "Nějaký 
jiný muž šel kolem vaší zahrady") + self.assertEqual(str(document.sentences[2]), "Už už abych taky šel") def test_only_instances_of_sentence_allowed(self): document = build_document_from_string(""" @@ -66,8 +55,7 @@ def test_only_instances_of_sentence_allowed(self): Už už abych taky šel """) - self.assertRaises(TypeError, Paragraph, - list(document.sentences) + ["Last sentence"]) + self.assertRaises(TypeError, Paragraph, list(document.sentences) + ["Last sentence"]) def test_sentences_equal(self): sentence1 = Sentence("", Tokenizer("czech")) diff --git a/tests/test_models/test_tf.py b/tests/test_models/test_tf.py index 863330c8..a7937409 100644 --- a/tests/test_models/test_tf.py +++ b/tests/test_models/test_tf.py @@ -1,7 +1,7 @@ import unittest -from sumy.nlp.tokenizers import Tokenizer from sumy.models import TfDocumentModel +from sumy.nlp.tokenizers import Tokenizer class TestTfModel(unittest.TestCase): @@ -81,11 +81,11 @@ def test_normalized_words_frequencies(self): words = "a b c d e c b d c e e d e d e".split() model = TfDocumentModel(tuple(words)) - self.assertAlmostEqual(model.normalized_term_frequency("a"), 1/5) - self.assertAlmostEqual(model.normalized_term_frequency("b"), 2/5) - self.assertAlmostEqual(model.normalized_term_frequency("c"), 3/5) - self.assertAlmostEqual(model.normalized_term_frequency("d"), 4/5) - self.assertAlmostEqual(model.normalized_term_frequency("e"), 5/5) + self.assertAlmostEqual(model.normalized_term_frequency("a"), 1 / 5) + self.assertAlmostEqual(model.normalized_term_frequency("b"), 2 / 5) + self.assertAlmostEqual(model.normalized_term_frequency("c"), 3 / 5) + self.assertAlmostEqual(model.normalized_term_frequency("d"), 4 / 5) + self.assertAlmostEqual(model.normalized_term_frequency("e"), 5 / 5) self.assertAlmostEqual(model.normalized_term_frequency("z"), 0.0) self.assertEqual(model.most_frequent_terms(), ("e", "d", "c", "b", "a")) @@ -94,11 +94,11 @@ def test_normalized_words_frequencies_with_smoothing_term(self): words = "a b c 
d e c b d c e e d e d e".split() model = TfDocumentModel(tuple(words)) - self.assertAlmostEqual(model.normalized_term_frequency("a", 0.5), 0.5 + 1/10) - self.assertAlmostEqual(model.normalized_term_frequency("b", 0.5), 0.5 + 2/10) - self.assertAlmostEqual(model.normalized_term_frequency("c", 0.5), 0.5 + 3/10) - self.assertAlmostEqual(model.normalized_term_frequency("d", 0.5), 0.5 + 4/10) - self.assertAlmostEqual(model.normalized_term_frequency("e", 0.5), 0.5 + 5/10) + self.assertAlmostEqual(model.normalized_term_frequency("a", 0.5), 0.5 + 1 / 10) + self.assertAlmostEqual(model.normalized_term_frequency("b", 0.5), 0.5 + 2 / 10) + self.assertAlmostEqual(model.normalized_term_frequency("c", 0.5), 0.5 + 3 / 10) + self.assertAlmostEqual(model.normalized_term_frequency("d", 0.5), 0.5 + 4 / 10) + self.assertAlmostEqual(model.normalized_term_frequency("e", 0.5), 0.5 + 5 / 10) self.assertAlmostEqual(model.normalized_term_frequency("z", 0.5), 0.5) self.assertEqual(model.most_frequent_terms(), ("e", "d", "c", "b", "a")) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 10c26045..f99aeb04 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -1,14 +1,16 @@ import unittest -from sumy.parsers.plaintext import PlaintextParser -from sumy.parsers.html import HtmlParser from sumy.nlp.tokenizers import Tokenizer +from sumy.parsers.html import HtmlParser +from sumy.parsers.plaintext import PlaintextParser + from .utils import expand_resource_path class TestParser(unittest.TestCase): def test_parse_plaintext(self): - parser = PlaintextParser.from_string(""" + parser = PlaintextParser.from_string( + """ Ako sa máš? Ja dobre! A ty? No mohlo to byť aj lepšie!!! Ale pohodička. @@ -16,7 +18,9 @@ def test_parse_plaintext(self): TOTO JE AKOŽE NADPIS A toto je text pod ním, ktorý je textový. A tak ďalej... 
- """, Tokenizer("czech")) + """, + Tokenizer("czech"), + ) document = parser.document @@ -29,7 +33,8 @@ def test_parse_plaintext(self): self.assertEqual(len(document.paragraphs[1].sentences), 2) def test_parse_plaintext_long(self): - parser = PlaintextParser.from_string(""" + parser = PlaintextParser.from_string( + """ Ako sa máš? Ja dobre! A ty? No mohlo to byť aj lepšie!!! Ale pohodička. @@ -45,7 +50,9 @@ def test_parse_plaintext_long(self): A tak este dalej! - """, Tokenizer("czech")) + """, + Tokenizer("czech"), + ) document = parser.document @@ -80,15 +87,11 @@ def test_annotated_text(self): self.assertEqual(len(document.paragraphs[0].headings), 1) self.assertEqual(len(document.paragraphs[0].sentences), 1) - self.assertEqual(str(document.paragraphs[0].headings[0]), - "Toto je nadpis prvej úrovne") - self.assertEqual(str(document.paragraphs[0].sentences[0]), - "Toto je prvý odstavec a to je fajn.") + self.assertEqual(str(document.paragraphs[0].headings[0]), "Toto je nadpis prvej úrovne") + self.assertEqual(str(document.paragraphs[0].sentences[0]), "Toto je prvý odstavec a to je fajn.") self.assertEqual(len(document.paragraphs[1].headings), 0) self.assertEqual(len(document.paragraphs[1].sentences), 2) - self.assertEqual(str(document.paragraphs[1].sentences[0]), - "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.") - self.assertEqual(str(document.paragraphs[1].sentences[1]), - "Aj súbory majú predsa city.") + self.assertEqual(str(document.paragraphs[1].sentences[0]), "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.") + self.assertEqual(str(document.paragraphs[1].sentences[1]), "Aj súbory majú predsa city.") diff --git a/tests/test_stemmers.py b/tests/test_stemmers.py index f70af65a..384cd3d1 100644 --- a/tests/test_stemmers.py +++ b/tests/test_stemmers.py @@ -1,10 +1,11 @@ import unittest -from sumy.nlp.stemmers import null_stemmer, Stemmer +from sumy.nlp.stemmers import Stemmer, null_stemmer class TestStemmers(unittest.TestCase): 
"""Simple tests to make sure all stemmers share the same API.""" + def test_missing_stemmer_language(self): self.assertRaises(LookupError, Stemmer, "klingon") @@ -12,19 +13,19 @@ def test_null_stemmer(self): self.assertEqual("ľščťžýáíé", null_stemmer("ľŠčŤžÝáÍé")) def test_english_stemmer(self): - english_stemmer = Stemmer('english') + english_stemmer = Stemmer("english") self.assertEqual("beauti", english_stemmer("beautiful")) def test_german_stemmer(self): - german_stemmer = Stemmer('german') + german_stemmer = Stemmer("german") self.assertEqual("sterb", german_stemmer("sterben")) def test_czech_stemmer(self): - czech_stemmer = Stemmer('czech') + czech_stemmer = Stemmer("czech") self.assertEqual("pěkn", czech_stemmer("pěkný")) def test_french_stemmer(self): - french_stemmer = Stemmer('czech') + french_stemmer = Stemmer("czech") self.assertEqual("jol", french_stemmer("jolies")) def test_slovak_stemmer(self): diff --git a/tests/test_summarizers/test_edmundson.py b/tests/test_summarizers/test_edmundson.py index 0051a80d..2c73a4af 100644 --- a/tests/test_summarizers/test_edmundson.py +++ b/tests/test_summarizers/test_edmundson.py @@ -1,6 +1,7 @@ import unittest from sumy.summarizers.edmundson import EdmundsonSummarizer + from ..utils import build_document, build_document_from_string @@ -10,7 +11,13 @@ def test_bonus_words_property(self): self.assertEqual(summarizer.bonus_words, frozenset()) - words = ("word", "another", "and", "some", "next",) + words = ( + "word", + "another", + "and", + "some", + "next", + ) summarizer.bonus_words = words self.assertTrue(isinstance(summarizer.bonus_words, frozenset)) self.assertEqual(summarizer.bonus_words, frozenset(words)) @@ -20,7 +27,13 @@ def test_stigma_words_property(self): self.assertEqual(summarizer.stigma_words, frozenset()) - words = ("word", "another", "and", "some", "next",) + words = ( + "word", + "another", + "and", + "some", + "next", + ) summarizer.stigma_words = words 
self.assertTrue(isinstance(summarizer.stigma_words, frozenset)) self.assertEqual(summarizer.stigma_words, frozenset(words)) @@ -30,14 +43,19 @@ def test_null_words_property(self): self.assertEqual(summarizer.null_words, frozenset()) - words = ("word", "another", "and", "some", "next",) + words = ( + "word", + "another", + "and", + "some", + "next", + ) summarizer.null_words = words self.assertTrue(isinstance(summarizer.null_words, frozenset)) self.assertEqual(summarizer.null_words, frozenset(words)) def test_empty_document(self): - summarizer = EdmundsonSummarizer(cue_weight=0, key_weight=0, - title_weight=0, location_weight=0) + summarizer = EdmundsonSummarizer(cue_weight=0, key_weight=0, title_weight=0, location_weight=0) sentences = summarizer(build_document(), 10) self.assertEqual(len(sentences), 0) @@ -53,17 +71,20 @@ def test_mixed_cue_key(self): Here is the winner because contains words like cool and heading """) - summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1, - title_weight=0, location_weight=0) + summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1, title_weight=0, location_weight=0) summarizer.bonus_words = ("cool", "heading", "sentence", "words", "like", "because") - summarizer.stigma_words = ("this", "is", "I", "am", "and",) + summarizer.stigma_words = ( + "this", + "is", + "I", + "am", + "and", + ) sentences = summarizer(document, 2) self.assertEqual(len(sentences), 2) - self.assertEqual(str(sentences[0]), - "Because I am sentence I like words") - self.assertEqual(str(sentences[1]), - "Here is the winner because contains words like cool and heading") + self.assertEqual(str(sentences[0]), "Because I am sentence I like words") + self.assertEqual(str(sentences[1]), "Here is the winner because contains words like cool and heading") def test_cue_with_no_words(self): summarizer = EdmundsonSummarizer() @@ -72,32 +93,57 @@ def test_cue_with_no_words(self): def test_cue_with_no_stigma_words(self): summarizer = EdmundsonSummarizer() - 
summarizer.bonus_words = ("great", "very", "beautiful",) + summarizer.bonus_words = ( + "great", + "very", + "beautiful", + ) self.assertRaises(ValueError, summarizer.cue_method, build_document(), 10) def test_cue_with_no_bonus_words(self): summarizer = EdmundsonSummarizer() - summarizer.stigma_words = ("useless", "bad", "spinach",) + summarizer.stigma_words = ( + "useless", + "bad", + "spinach", + ) self.assertRaises(ValueError, summarizer.cue_method, build_document(), 10) def test_cue_empty(self): summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("ba", "bb", "bc",) - summarizer.stigma_words = ("sa", "sb", "sc",) + summarizer.bonus_words = ( + "ba", + "bb", + "bc", + ) + summarizer.stigma_words = ( + "sa", + "sb", + "sc", + ) sentences = summarizer.cue_method(build_document(), 10) self.assertEqual(len(sentences), 0) def test_cue_letters_case(self): document = build_document( - ("X X X", "x x x x",), - ("w w w", "W W W W",) + ( + "X X X", + "x x x x", + ), + ( + "w w w", + "W W W W", + ), ) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("X", "w",) + summarizer.bonus_words = ( + "X", + "w", + ) summarizer.stigma_words = ("stigma",) sentences = summarizer.cue_method(document, 2) @@ -106,70 +152,90 @@ def test_cue_letters_case(self): self.assertEqual(str(sentences[1]), "W W W W") def test_cue_1(self): - document = build_document( - ("ba bb bc bb unknown ľščťžýáíé sb sc sb",) - ) + document = build_document(("ba bb bc bb unknown ľščťžýáíé sb sc sb",)) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("ba", "bb", "bc",) - summarizer.stigma_words = ("sa", "sb", "sc",) + summarizer.bonus_words = ( + "ba", + "bb", + "bc", + ) + summarizer.stigma_words = ( + "sa", + "sb", + "sc", + ) sentences = summarizer.cue_method(document, 10) self.assertEqual(len(sentences), 1) def test_cue_2(self): - document = build_document( - ("ba bb bc bb unknown ľščťžýáíé sb sc sb",), - ("Pepek likes spinach",) - ) + document = build_document(("ba bb bc 
bb unknown ľščťžýáíé sb sc sb",), ("Pepek likes spinach",)) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("ba", "bb", "bc",) - summarizer.stigma_words = ("sa", "sb", "sc",) + summarizer.bonus_words = ( + "ba", + "bb", + "bc", + ) + summarizer.stigma_words = ( + "sa", + "sb", + "sc", + ) sentences = summarizer.cue_method(document, 10) self.assertEqual(len(sentences), 2) - self.assertEqual(str(sentences[0]), - "ba bb bc bb unknown ľščťžýáíé sb sc sb") + self.assertEqual(str(sentences[0]), "ba bb bc bb unknown ľščťžýáíé sb sc sb") self.assertEqual(str(sentences[1]), "Pepek likes spinach") sentences = summarizer.cue_method(document, 1) self.assertEqual(len(sentences), 1) - self.assertEqual(str(sentences[0]), - "ba bb bc bb unknown ľščťžýáíé sb sc sb") + self.assertEqual(str(sentences[0]), "ba bb bc bb unknown ľščťžýáíé sb sc sb") def test_cue_3(self): document = build_document( ( - "ba "*10, - "bb "*10, - " sa"*8 + " bb"*10, + "ba " * 10, + "bb " * 10, + " sa" * 8 + " bb" * 10, "bb bc ba", ), (), ( - "babbbc "*10, - "na nb nc nd sa" + " bc"*10, - " ba n"*10, - ) + "babbbc " * 10, + "na nb nc nd sa" + " bc" * 10, + " ba n" * 10, + ), ) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("ba", "bb", "bc",) - summarizer.stigma_words = ("sa", "sb", "sc",) + summarizer.bonus_words = ( + "ba", + "bb", + "bc", + ) + summarizer.stigma_words = ( + "sa", + "sb", + "sc", + ) sentences = summarizer.cue_method(document, 5) self.assertEqual(len(sentences), 5) - self.assertEqual(str(sentences[0]), ("ba "*10).strip()) - self.assertEqual(str(sentences[1]), ("bb "*10).strip()) + self.assertEqual(str(sentences[0]), ("ba " * 10).strip()) + self.assertEqual(str(sentences[1]), ("bb " * 10).strip()) self.assertEqual(str(sentences[2]), "bb bc ba") - self.assertEqual(str(sentences[3]), - "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc") - self.assertEqual(str(sentences[4]), ("ba n "*10).strip()) + self.assertEqual(str(sentences[3]), "na nb nc nd sa bc bc bc bc bc bc 
bc bc bc bc") + self.assertEqual(str(sentences[4]), ("ba n " * 10).strip()) def test_key_empty(self): summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("ba", "bb", "bc",) + summarizer.bonus_words = ( + "ba", + "bb", + "bc", + ) sentences = summarizer.key_method(build_document(), 10) self.assertEqual(len(sentences), 0) @@ -181,53 +247,81 @@ def test_key_without_bonus_words(self): def test_key_no_bonus_words_in_document(self): document = build_document( - ("wa wb wc wd", "I like music",), - ("This is test sentence with some extra words",) + ( + "wa wb wc wd", + "I like music", + ), + ("This is test sentence with some extra words",), ) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("ba", "bb", "bc", "bonus",) + summarizer.bonus_words = ( + "ba", + "bb", + "bc", + "bonus", + ) sentences = summarizer.key_method(document, 10) self.assertEqual(len(sentences), 3) self.assertEqual(str(sentences[0]), "wa wb wc wd") self.assertEqual(str(sentences[1]), "I like music") - self.assertEqual(str(sentences[2]), - "This is test sentence with some extra words") + self.assertEqual(str(sentences[2]), "This is test sentence with some extra words") def test_key_1(self): document = build_document( - ("wa wb wc wd", "I like music",), - ("This is test sentence with some extra words and bonus",) + ( + "wa wb wc wd", + "I like music", + ), + ("This is test sentence with some extra words and bonus",), ) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("ba", "bb", "bc", "bonus",) + summarizer.bonus_words = ( + "ba", + "bb", + "bc", + "bonus", + ) sentences = summarizer.key_method(document, 1) self.assertEqual(len(sentences), 1) - self.assertEqual(str(sentences[0]), - "This is test sentence with some extra words and bonus") + self.assertEqual(str(sentences[0]), "This is test sentence with some extra words and bonus") def test_key_2(self): document = build_document( - ("Om nom nom nom nom", "Sure I summarize it, with bonus",), - ("This is bonus test 
sentence with some extra words and bonus",) + ( + "Om nom nom nom nom", + "Sure I summarize it, with bonus", + ), + ("This is bonus test sentence with some extra words and bonus",), ) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("nom", "bonus",) + summarizer.bonus_words = ( + "nom", + "bonus", + ) sentences = summarizer.key_method(document, 2) self.assertEqual(len(sentences), 2) self.assertEqual(str(sentences[0]), "Om nom nom nom nom") - self.assertEqual(str(sentences[1]), - "This is bonus test sentence with some extra words and bonus") + self.assertEqual(str(sentences[1]), "This is bonus test sentence with some extra words and bonus") def test_key_3(self): document = build_document( - ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",), - ("x X x X",) + ( + "wa", + "wa wa", + "wa wa wa", + "wa wa wa wa", + "wa Wa Wa Wa wa", + ), + ("x X x X",), ) summarizer = EdmundsonSummarizer() - summarizer.bonus_words = ("wa", "X",) + summarizer.bonus_words = ( + "wa", + "X", + ) sentences = summarizer.key_method(document, 3) self.assertEqual(len(sentences), 3) @@ -243,7 +337,11 @@ def test_key_3(self): def test_title_method_with_empty_document(self): summarizer = EdmundsonSummarizer() - summarizer.null_words = ("ba", "bb", "bc",) + summarizer.null_words = ( + "ba", + "bb", + "bc", + ) sentences = summarizer.title_method(build_document(), 10) self.assertEqual(len(sentences), 0) @@ -255,12 +353,20 @@ def test_title_method_without_null_words(self): def test_title_method_without_title(self): document = build_document( - ("This is sentence", "This is another one",), - ("And some next sentence but no heading",) + ( + "This is sentence", + "This is another one", + ), + ("And some next sentence but no heading",), ) summarizer = EdmundsonSummarizer() - summarizer.null_words = ("this", "is", "some", "and",) + summarizer.null_words = ( + "this", + "is", + "some", + "and", + ) sentences = summarizer.title_method(document, 10) self.assertEqual(len(sentences), 
3) @@ -280,12 +386,17 @@ def test_title_method_1(self): """) summarizer = EdmundsonSummarizer() - summarizer.null_words = ("this", "is", "I", "am", "and",) + summarizer.null_words = ( + "this", + "is", + "I", + "am", + "and", + ) sentences = summarizer.title_method(document, 1) self.assertEqual(len(sentences), 1) - self.assertEqual(str(sentences[0]), - "Here is the winner because contains words like cool and heading") + self.assertEqual(str(sentences[0]), "Here is the winner because contains words like cool and heading") def test_title_method_2(self): document = build_document_from_string(""" @@ -299,14 +410,18 @@ def test_title_method_2(self): """) summarizer = EdmundsonSummarizer() - summarizer.null_words = ("this", "is", "I", "am", "and",) + summarizer.null_words = ( + "this", + "is", + "I", + "am", + "and", + ) sentences = summarizer.title_method(document, 2) self.assertEqual(len(sentences), 2) - self.assertEqual(str(sentences[0]), - "This is next paragraph because of blank line above") - self.assertEqual(str(sentences[1]), - "Here is the winner because contains words like cool and heading") + self.assertEqual(str(sentences[0]), "This is next paragraph because of blank line above") + self.assertEqual(str(sentences[1]), "Here is the winner because contains words like cool and heading") def test_title_method_3(self): document = build_document_from_string(""" @@ -320,20 +435,27 @@ def test_title_method_3(self): """) summarizer = EdmundsonSummarizer() - summarizer.null_words = ("this", "is", "I", "am", "and",) + summarizer.null_words = ( + "this", + "is", + "I", + "am", + "and", + ) sentences = summarizer.title_method(document, 3) self.assertEqual(len(sentences), 3) - self.assertEqual(str(sentences[0]), - "Because I am sentence I like words") - self.assertEqual(str(sentences[1]), - "This is next paragraph because of blank line above") - self.assertEqual(str(sentences[2]), - "Here is the winner because contains words like cool and heading") + 
self.assertEqual(str(sentences[0]), "Because I am sentence I like words") + self.assertEqual(str(sentences[1]), "This is next paragraph because of blank line above") + self.assertEqual(str(sentences[2]), "Here is the winner because contains words like cool and heading") def test_location_method_with_empty_document(self): summarizer = EdmundsonSummarizer() - summarizer.null_words = ("na", "nb", "nc",) + summarizer.null_words = ( + "na", + "nb", + "nc", + ) sentences = summarizer.location_method(build_document(), 10) self.assertEqual(len(sentences), 0) @@ -359,7 +481,13 @@ def test_location_method_1(self): """) summarizer = EdmundsonSummarizer() - summarizer.null_words = ("na", "nb", "nc", "nd", "ne",) + summarizer.null_words = ( + "na", + "nb", + "nc", + "nd", + "ne", + ) sentences = summarizer.location_method(document, 4) self.assertEqual(len(sentences), 4) @@ -385,7 +513,13 @@ def test_location_method_2(self): """) summarizer = EdmundsonSummarizer() - summarizer.null_words = ("na", "nb", "nc", "nd", "ne",) + summarizer.null_words = ( + "na", + "nb", + "nc", + "nd", + "ne", + ) sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0) self.assertEqual(len(sentences), 4) diff --git a/tests/test_summarizers/test_kl.py b/tests/test_summarizers/test_kl.py index 1991b5ee..247a3b0c 100644 --- a/tests/test_summarizers/test_kl.py +++ b/tests/test_summarizers/test_kl.py @@ -1,9 +1,10 @@ import unittest from sumy.models.dom._sentence import Sentence -from sumy.summarizers.kl import KLSummarizer -from ..utils import build_document, build_document_from_string from sumy.nlp.tokenizers import Tokenizer +from sumy.summarizers.kl import KLSummarizer + +from ..utils import build_document class TestKL(unittest.TestCase): @@ -32,16 +33,16 @@ def test_single_sentence(self): self.assertEqual(len(returned), 1) def test_compute_word_freq(self): - + summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS) - + words = ["one", "two", "three", "four"] freq = 
summarizer._compute_word_freq(words) self.assertEqual(freq.get("one", 0), 1) self.assertEqual(freq.get("two", 0), 1) self.assertEqual(freq.get("three", 0), 1) self.assertEqual(freq.get("four", 0), 1) - + words = ["one", "one", "two", "two"] freq = summarizer._compute_word_freq(words) self.assertEqual(freq.get("one", 0), 2) @@ -53,45 +54,42 @@ def test_joint_freq(self): w1 = ["one", "two", "three", "four"] w2 = ["one", "two", "three", "four"] freq = summarizer._joint_freq(w1, w2) - self.assertEqual(freq["one"], 1.0/4) - self.assertEqual(freq["two"], 1.0/4) - self.assertEqual(freq["three"], 1.0/4) - self.assertEqual(freq["four"], 1.0/4) + self.assertEqual(freq["one"], 1.0 / 4) + self.assertEqual(freq["two"], 1.0 / 4) + self.assertEqual(freq["three"], 1.0 / 4) + self.assertEqual(freq["four"], 1.0 / 4) w1 = ["one", "two", "three", "four"] w2 = ["one", "one", "three", "five"] freq = summarizer._joint_freq(w1, w2) - self.assertEqual(freq["one"], 3.0/8) - self.assertEqual(freq["two"], 1.0/8) - self.assertEqual(freq["three"], 1.0/4) - self.assertEqual(freq["four"], 1.0/8) - self.assertEqual(freq["five"], 1.0/8) + self.assertEqual(freq["one"], 3.0 / 8) + self.assertEqual(freq["two"], 1.0 / 8) + self.assertEqual(freq["three"], 1.0 / 4) + self.assertEqual(freq["four"], 1.0 / 8) + self.assertEqual(freq["five"], 1.0 / 8) def test_kl_divergence(self): summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS) EPS = 0.00001 + w1 = {"one": 0.35, "two": 0.5, "three": 0.15} + w2 = {"one": 1.0 / 3.0, "two": 1.0 / 3.0, "three": 1.0 / 3.0} - w1 = {"one":.35, "two":.5, "three":.15} - w2 = {"one":1.0/3.0, "two":1.0/3.0, "three":1.0/3.0} - - w1_ = [.35, .5, .15] - w2_ = [1.0/3.0, 1.0/3.0, 1.0/3.0] + _w1_list = [0.35, 0.5, 0.15] + _w2_list = [1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0] # This value comes from scipy.stats.entropy(w2_, w1_) # Note: the order of params is different kl_correct = 0.11475080798005841 self.assertTrue(abs(summarizer._kl_divergence(w1, w2) - kl_correct < EPS)) - w1 = 
{"one":.1, "two":.2, "three":.7} - w2 = {"one":.2, "two":.4, "three":.4} + w1 = {"one": 0.1, "two": 0.2, "three": 0.7} + w2 = {"one": 0.2, "two": 0.4, "three": 0.4} + + _w1 = [0.1, 0.2, 0.7] + _w2 = [0.2, 0.4, 0.4] - w1_ = [.1, .2, .7] - w2_ = [.2, .4, .4] - # This value comes from scipy.stats.entropy(w2_, w1_) # Note: the order of params is different kl_correct = 0.1920419931617981 self.assertTrue(abs(summarizer._kl_divergence(w1, w2) - kl_correct) < EPS) - - diff --git a/tests/test_summarizers/test_lex_rank.py b/tests/test_summarizers/test_lex_rank.py index 2ff57ee4..f1046ac3 100644 --- a/tests/test_summarizers/test_lex_rank.py +++ b/tests/test_summarizers/test_lex_rank.py @@ -1,12 +1,13 @@ import math import unittest -import sumy.summarizers.lex_rank as lex_rank_module -from sumy.summarizers.lex_rank import LexRankSummarizer -from sumy.parsers.plaintext import PlaintextParser +import sumy.summarizers.lex_rank as lex_rank_module from sumy.nlp.stemmers.czech import stem_word from sumy.nlp.tokenizers import Tokenizer +from sumy.parsers.plaintext import PlaintextParser +from sumy.summarizers.lex_rank import LexRankSummarizer from sumy.utils import get_stop_words + from ..utils import build_document, load_resource @@ -31,8 +32,8 @@ def test_tf_metrics(self): metrics = summarizer._compute_tf(sentences) expected = [ - {"this": 1/2, "is": 1/2, "simple": 1/2, "sentence": 1.0}, - {"this": 1/3, "is": 2/3, "yes": 1/3, "simple": 1/3, "sentence": 1/3, "too": 1.0}, + {"this": 1 / 2, "is": 1 / 2, "simple": 1 / 2, "sentence": 1.0}, + {"this": 1 / 3, "is": 2 / 3, "yes": 1 / 3, "simple": 1 / 3, "sentence": 1 / 3, "too": 1.0}, ] self.assertEqual(expected, metrics) @@ -40,28 +41,57 @@ def test_idf_metrics(self): summarizer = LexRankSummarizer() sentences = [ - ("this", "sentence", "is", "simple", "sentence",), - ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too",), - ("not", "every", "sentence", "makes", "me", "happy",), + ( + "this", + "sentence", + "is", + 
"simple", + "sentence", + ), + ( + "this", + "is", + "simple", + "sentence", + "yes", + "is", + "too", + "too", + "too", + ), + ( + "not", + "every", + "sentence", + "makes", + "me", + "happy", + ), ("yes",), (), - ("every", "day", "is", "happy", "day",), + ( + "every", + "day", + "is", + "happy", + "day", + ), ] metrics = summarizer._compute_idf(sentences) expected = { - "this": math.log(6/3), - "is": math.log(6/4), - "yes": math.log(6/3), - "simple": math.log(6/3), - "sentence": math.log(6/4), - "too": math.log(6/2), - "not": math.log(6/2), - "every": math.log(6/3), - "makes": math.log(6/2), - "me": math.log(6/2), - "happy": math.log(6/3), - "day": math.log(6/2), + "this": math.log(6 / 3), + "is": math.log(6 / 4), + "yes": math.log(6 / 3), + "simple": math.log(6 / 3), + "sentence": math.log(6 / 4), + "too": math.log(6 / 2), + "not": math.log(6 / 2), + "every": math.log(6 / 3), + "makes": math.log(6 / 2), + "me": math.log(6 / 2), + "happy": math.log(6 / 3), + "day": math.log(6 / 2), } assert expected == metrics @@ -69,21 +99,21 @@ def test_modified_cosine_computation(self): summarizer = LexRankSummarizer() sentence1 = ["this", "sentence", "is", "simple", "sentence"] - tf1 = {"this": 1/2, "sentence": 1.0, "is": 1/2, "simple": 1/2} + tf1 = {"this": 1 / 2, "sentence": 1.0, "is": 1 / 2, "simple": 1 / 2} sentence2 = ["this", "is", "simple", "sentence", "yes", "is", "too", "too"] - tf2 = {"this": 1/2, "is": 1.0, "simple": 1/2, "sentence": 1/2, "yes": 1/2, "too": 1.0} + tf2 = {"this": 1 / 2, "is": 1.0, "simple": 1 / 2, "sentence": 1 / 2, "yes": 1 / 2, "too": 1.0} idf = { - "this": 2/2, - "sentence": 2/2, - "is": 2/2, - "simple": 2/2, - "yes": 2/1, - "too": 2/1, + "this": 2 / 2, + "sentence": 2 / 2, + "is": 2 / 2, + "simple": 2 / 2, + "yes": 2 / 1, + "too": 2 / 1, } - numerator = sum(tf1[t]*tf2[t]*idf[t]**2 for t in ["this", "sentence", "is", "simple"]) - denominator1 = math.sqrt(sum((tf1[t]*idf[t])**2 for t in sentence1)) - denominator2 = math.sqrt(sum((tf2[t]*idf[t])**2 
for t in sentence2)) + numerator = sum(tf1[t] * tf2[t] * idf[t] ** 2 for t in ["this", "sentence", "is", "simple"]) + denominator1 = math.sqrt(sum((tf1[t] * idf[t]) ** 2 for t in sentence1)) + denominator2 = math.sqrt(sum((tf2[t] * idf[t]) ** 2 for t in sentence2)) expected = numerator / (denominator1 * denominator2) cosine = summarizer._compute_cosine(sentence1, sentence2, tf1, tf2, idf) @@ -91,10 +121,7 @@ def test_modified_cosine_computation(self): def test_article_example(self): """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti""" - parser = PlaintextParser.from_string( - load_resource("articles/prevko_cz_1.txt"), - Tokenizer("czech") - ) + parser = PlaintextParser.from_string(load_resource("articles/prevko_cz_1.txt"), Tokenizer("czech")) summarizer = LexRankSummarizer(stem_word) summarizer.stop_words = get_stop_words("czech") diff --git a/tests/test_summarizers/test_lsa.py b/tests/test_summarizers/test_lsa.py index 2f3b0ec6..8ea83c43 100644 --- a/tests/test_summarizers/test_lsa.py +++ b/tests/test_summarizers/test_lsa.py @@ -1,12 +1,14 @@ import unittest + import pytest -import sumy.summarizers.lsa as lsa_module -from sumy.summarizers.lsa import LsaSummarizer -from sumy.parsers.plaintext import PlaintextParser -from sumy.nlp.tokenizers import Tokenizer +import sumy.summarizers.lsa as lsa_module from sumy.nlp.stemmers import Stemmer +from sumy.nlp.tokenizers import Tokenizer +from sumy.parsers.plaintext import PlaintextParser +from sumy.summarizers.lsa import LsaSummarizer from sumy.utils import get_stop_words + from ..utils import build_document, load_resource @@ -26,9 +28,18 @@ def test_dictionary_without_stop_words(self): summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"] document = build_document( - ("stop halt shut hmmm", "Stop Halt Shut Hmmm",), - ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",), - ("Some relevant sentence", "Some moRe releVant sentEnce",), + ( + "stop halt shut hmmm", + "Stop Halt Shut Hmmm", + ), + ( + "StOp HaLt ShUt 
HmMm", + "STOP HALT SHUT HMMM", + ), + ( + "Some relevant sentence", + "Some moRe releVant sentEnce", + ), ) expected = frozenset(["some", "more", "relevant", "sentence"]) @@ -45,7 +56,11 @@ def test_empty_document(self): def test_single_sentence(self): document = build_document(("I am the sentence you like",)) summarizer = LsaSummarizer() - summarizer.stopwords = ("I", "am", "the",) + summarizer.stopwords = ( + "I", + "am", + "the", + ) sentences = summarizer(document, 10) self.assertEqual(len(sentences), 1) @@ -53,12 +68,27 @@ def test_single_sentence(self): def test_document(self): document = build_document( - ("I am the sentence you like", "Do you like me too",), - ("This sentence is better than that above", "Are you kidding me",) + ( + "I am the sentence you like", + "Do you like me too", + ), + ( + "This sentence is better than that above", + "Are you kidding me", + ), ) summarizer = LsaSummarizer() summarizer.stopwords = ( - "I", "am", "the", "you", "are", "me", "is", "than", "that", "this", + "I", + "am", + "the", + "you", + "are", + "me", + "is", + "than", + "that", + "this", ) sentences = summarizer(document, 2) @@ -68,10 +98,7 @@ def test_document(self): def test_real_example(self): """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti""" - parser = PlaintextParser.from_string( - load_resource("snippets/prevko.txt"), - Tokenizer("czech") - ) + parser = PlaintextParser.from_string(load_resource("snippets/prevko.txt"), Tokenizer("czech")) summarizer = LsaSummarizer(Stemmer("czech")) summarizer.stop_words = get_stop_words("czech") @@ -80,10 +107,7 @@ def test_real_example(self): def test_article_example(self): """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti""" - parser = PlaintextParser.from_string( - load_resource("articles/prevko_cz_1.txt"), - Tokenizer("czech") - ) + parser = PlaintextParser.from_string(load_resource("articles/prevko_cz_1.txt"), Tokenizer("czech")) summarizer = LsaSummarizer(Stemmer("czech")) summarizer.stop_words = 
get_stop_words("czech") @@ -94,10 +118,7 @@ def test_issue_5_svd_converges(self): """Source: https://github.com/miso-belica/sumy/issues/5""" pytest.skip("Can't reproduce the issue.") - parser = PlaintextParser.from_string( - load_resource("articles/svd_converges.txt"), - Tokenizer("english") - ) + parser = PlaintextParser.from_string(load_resource("articles/svd_converges.txt"), Tokenizer("english")) summarizer = LsaSummarizer(Stemmer("english")) summarizer.stop_words = get_stop_words("english") diff --git a/tests/test_summarizers/test_luhn.py b/tests/test_summarizers/test_luhn.py index 53cb6722..00a515fa 100644 --- a/tests/test_summarizers/test_luhn.py +++ b/tests/test_summarizers/test_luhn.py @@ -1,10 +1,11 @@ import unittest -from sumy.summarizers.luhn import LuhnSummarizer -from sumy.parsers.plaintext import PlaintextParser -from sumy.nlp.tokenizers import Tokenizer from sumy.nlp.stemmers.czech import stem_word +from sumy.nlp.tokenizers import Tokenizer +from sumy.parsers.plaintext import PlaintextParser +from sumy.summarizers.luhn import LuhnSummarizer from sumy.utils import get_stop_words + from ..utils import build_document, build_sentence @@ -19,7 +20,10 @@ def test_empty_document(self): def test_single_sentence(self): document = build_document(("Já jsem jedna věta",)) summarizer = LuhnSummarizer() - summarizer.stop_words = ("já", "jsem",) + summarizer.stop_words = ( + "já", + "jsem", + ) returned = summarizer(document, 10) self.assertEqual(len(returned), 1) @@ -27,7 +31,12 @@ def test_single_sentence(self): def test_two_sentences(self): document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra")) summarizer = LuhnSummarizer() - summarizer.stop_words = ("já", "jsem", "a", "ta",) + summarizer.stop_words = ( + "já", + "jsem", + "a", + "ta", + ) returned = summarizer(document, 10) self.assertEqual(len(returned), 2) @@ -35,23 +44,27 @@ def test_two_sentences(self): self.assertEqual(str(returned[1]), "A já ta 2. 
vítězná výhra") def test_two_sentences_but_one_winner(self): - document = build_document(( - "Já jsem 1. vítězná ta věta", - "A já ta 2. vítězná věta" - )) + document = build_document(("Já jsem 1. vítězná ta věta", "A já ta 2. vítězná věta")) summarizer = LuhnSummarizer() - summarizer.stop_words = ("já", "jsem", "a", "ta",) + summarizer.stop_words = ( + "já", + "jsem", + "a", + "ta", + ) returned = summarizer(document, 1) self.assertEqual(len(returned), 1) self.assertEqual(str(returned[0]), "A já ta 2. vítězná věta") def test_three_sentences(self): - document = build_document(( - "wa s s s wa s s s wa", - "wb s wb s wb s s s s s s s s s wb", - "wc s s wc s s wc", - )) + document = build_document( + ( + "wa s s s wa s s s wa", + "wb s wb s wb s s s s s s s s s wb", + "wc s s wc s s wc", + ) + ) summarizer = LuhnSummarizer() summarizer.stop_words = ("s",) @@ -71,14 +84,16 @@ def test_three_sentences(self): self.assertEqual(str(returned[2]), "wc s s wc s s wc") def test_various_words_with_significant_percentage(self): - document = build_document(( - "1 a", - "2 b b", - "3 c c c", - "4 d d d", - "5 z z z z", - "6 e e e e e", - )) + document = build_document( + ( + "1 a", + "2 b b", + "3 c c c", + "4 d d d", + "5 z z z z", + "6 e e e e e", + ) + ) summarizer = LuhnSummarizer() summarizer.stop_words = ("1", "2", "3", "4", "5", "6") @@ -105,45 +120,74 @@ def test_real_example(self): "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě " "o rok mladších dětí budoval vedoucí pozici. " "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.", - Tokenizer("czech") + Tokenizer("czech"), ) summarizer = LuhnSummarizer(stem_word) summarizer.stop_words = get_stop_words("czech") returned = summarizer(parser.document, 2) self.assertEqual(len(returned), 2) - self.assertEqual(str(returned[0]), - "Jednalo se o případ chlapce v 6. 
třídě, který měl problémy s učením.") - self.assertEqual(str(returned[1]), - "Připadal si, že je mezi malými dětmi a realizoval se tím, " - "že si ve třídě o rok mladších dětí budoval vedoucí pozici.") + self.assertEqual(str(returned[0]), "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.") + self.assertEqual( + str(returned[1]), "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici." + ) class TestSentenceRating(unittest.TestCase): def setUp(self): self.summarizer = LuhnSummarizer() - self.sentence = build_sentence( - "Nějaký muž šel kolem naší zahrady a žil pěkný život samotáře") + self.sentence = build_sentence("Nějaký muž šel kolem naší zahrady a žil pěkný život samotáře") def test_significant_words(self): - self.summarizer.significant_percentage = 1/5 - words = self.summarizer._get_significant_words(( - "wa", "wb", "wc", "wd", "we", "wf", "wg", "wh", "wi", "wj", - "wa", "wb", - )) + self.summarizer.significant_percentage = 1 / 5 + words = self.summarizer._get_significant_words( + ( + "wa", + "wb", + "wc", + "wd", + "we", + "wf", + "wg", + "wh", + "wi", + "wj", + "wa", + "wb", + ) + ) self.assertEqual(tuple(sorted(words)), ("wa", "wb")) def test_stop_words_not_in_significant_words(self): self.summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"] - words = self.summarizer._get_significant_words([ - "stop", "Stop", "StOp", "STOP", - "halt", "Halt", "HaLt", "HALT", - "shut", "Shut", "ShUt", "SHUT", - "hmmm", "Hmmm", "HmMm", "HMMM", - "some", "relevant", "word", - "some", "more", "relevant", "word", - ]) + words = self.summarizer._get_significant_words( + [ + "stop", + "Stop", + "StOp", + "STOP", + "halt", + "Halt", + "HaLt", + "HALT", + "shut", + "Shut", + "ShUt", + "SHUT", + "hmmm", + "Hmmm", + "HmMm", + "HMMM", + "some", + "relevant", + "word", + "some", + "more", + "relevant", + "word", + ] + ) self.assertEqual(tuple(sorted(words)), ("relevant", "some", "word")) @@ -164,49 
+208,77 @@ def test_single_word_at_end(self): self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0) def test_two_chunks_too_far(self): - significant_stems = ("šel", "žil",) + significant_stems = ( + "šel", + "žil", + ) self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 0) def test_two_chunks_at_begin(self): - significant_stems = ("muž", "šel",) + significant_stems = ( + "muž", + "šel", + ) self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 2) def test_two_chunks_before_end(self): - significant_stems = ("pěkný", "život",) + significant_stems = ( + "pěkný", + "život", + ) self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 2) def test_two_chunks_at_end(self): - significant_stems = ("pěkný", "samotáře",) - self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 4/3) + significant_stems = ( + "pěkný", + "samotáře", + ) + self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 4 / 3) def test_three_chunks_at_begin(self): - significant_stems = ("nějaký", "muž", "šel",) + significant_stems = ( + "nějaký", + "muž", + "šel", + ) self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 3) def test_three_chunks_at_end(self): - significant_stems = ("pěkný", "život", "samotáře",) + significant_stems = ( + "pěkný", + "život", + "samotáře", + ) self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 3) def test_three_chunks_with_gaps(self): - significant_stems = ("muž", "šel", "zahrady",) - self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9/5) + significant_stems = ( + "muž", + "šel", + "zahrady", + ) + self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9 / 5) def test_chunks_with_user_gap(self): self.summarizer.max_gap_size = 6 - significant_stems = ("muž", "šel", "pěkný",) - 
self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9/8) + significant_stems = ( + "muž", + "šel", + "pěkný", + ) + self.assertEqual(self.summarizer.rate_sentence(self.sentence, significant_stems), 9 / 8) def test_three_chunks_with_1_gap(self): sentence = build_sentence("w s w s w") significant_stems = ("w",) - self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9/5) + self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9 / 5) def test_three_chunks_with_2_gap(self): sentence = build_sentence("w s s w s s w") significant_stems = ("w",) - self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9/7) + self.assertEqual(self.summarizer.rate_sentence(sentence, significant_stems), 9 / 7) def test_three_chunks_with_3_gap(self): sentence = build_sentence("w s s s w s s s w") diff --git a/tests/test_summarizers/test_random.py b/tests/test_summarizers/test_random.py index 57066538..2cfbf5b5 100644 --- a/tests/test_summarizers/test_random.py +++ b/tests/test_summarizers/test_random.py @@ -1,6 +1,7 @@ import unittest from sumy.summarizers.random import RandomSummarizer + from ..utils import build_document, build_document_from_string diff --git a/tests/test_summarizers/test_sum_basic.py b/tests/test_summarizers/test_sum_basic.py index be426c24..f762a409 100644 --- a/tests/test_summarizers/test_sum_basic.py +++ b/tests/test_summarizers/test_sum_basic.py @@ -1,10 +1,10 @@ import unittest from sumy.models.dom._sentence import Sentence -from sumy.summarizers.sum_basic import SumBasicSummarizer -from ..utils import build_document, build_document_from_string from sumy.nlp.tokenizers import Tokenizer +from sumy.summarizers.sum_basic import SumBasicSummarizer +from ..utils import build_document class TestSumBasic(unittest.TestCase): @@ -28,7 +28,7 @@ def test_single_sentence(self): s = Sentence("I am one slightly longer sentence.", Tokenizer("english")) document = build_document([s]) 
summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS) - + returned = summarizer(document, 10) self.assertEqual(len(returned), 1) @@ -49,17 +49,16 @@ def test_filter_out_stop_words(self): words_correctly_filtered = ["dog", "went", "on", "a", "walk"] self.assertEqual(words_filtered, words_correctly_filtered) - def test_compute_word_freq(self): summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS) - + words = ["one", "two", "three", "four"] freq = summarizer._compute_word_freq(words) self.assertEqual(freq.get("one", 0), 1) self.assertEqual(freq.get("two", 0), 1) self.assertEqual(freq.get("three", 0), 1) self.assertEqual(freq.get("four", 0), 1) - + words = ["one", "one", "two", "two"] freq = summarizer._compute_word_freq(words) self.assertEqual(freq.get("one", 0), 2) @@ -79,32 +78,31 @@ def test_get_all_content_words_in_doc(self): content_words_correct = {"one": 2, "two": 2, "three": 2} self.assertEqual(content_words_freq, content_words_correct) - def test_compute_tf(self): summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS) s0 = Sentence("kicking soccer balls.", Tokenizer("english")) s1 = Sentence("eating chicken dumplings.", Tokenizer("english")) document = build_document([s0, s1]) freq = summarizer._compute_tf(document.sentences) - self.assertEqual(freq["kicking"], 1/6) - self.assertEqual(freq["soccer"], 1/6) - self.assertEqual(freq["balls"], 1/6) - self.assertEqual(freq["eating"], 1/6) - self.assertEqual(freq["chicken"], 1/6) - self.assertEqual(freq["dumplings"], 1/6) + self.assertEqual(freq["kicking"], 1 / 6) + self.assertEqual(freq["soccer"], 1 / 6) + self.assertEqual(freq["balls"], 1 / 6) + self.assertEqual(freq["eating"], 1 / 6) + self.assertEqual(freq["chicken"], 1 / 6) + self.assertEqual(freq["dumplings"], 1 / 6) document = build_document([s0, s0, s1]) freq = summarizer._compute_tf(document.sentences) - self.assertEqual(freq["kicking"], 2/9) - self.assertEqual(freq["soccer"], 2/9) - self.assertEqual(freq["balls"], 2/9) - 
self.assertEqual(freq["eating"], 1/9) - self.assertEqual(freq["chicken"], 1/9) - self.assertEqual(freq["dumplings"], 1/9) + self.assertEqual(freq["kicking"], 2 / 9) + self.assertEqual(freq["soccer"], 2 / 9) + self.assertEqual(freq["balls"], 2 / 9) + self.assertEqual(freq["eating"], 1 / 9) + self.assertEqual(freq["chicken"], 1 / 9) + self.assertEqual(freq["dumplings"], 1 / 9) def test_compute_average_probability_of_words(self): summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS) - word_freq = {"one": 1/6, "two": 2/6, "three": 3/6} + word_freq = {"one": 1 / 6, "two": 2 / 6, "three": 3 / 6} s0 = [] s1 = ["one"] s2 = ["two", "three"] @@ -112,10 +110,9 @@ def test_compute_average_probability_of_words(self): EPS = 0.0001 self.assertTrue(abs(summarizer._compute_average_probability_of_words(word_freq, s0) - 0) < EPS) - self.assertTrue(abs(summarizer._compute_average_probability_of_words(word_freq, s1) - 1/6) < EPS) - self.assertTrue(abs(summarizer._compute_average_probability_of_words(word_freq, s2) - 5/12) < EPS) - self.assertTrue(abs(summarizer._compute_average_probability_of_words(word_freq, s3) - 8/18) < EPS) - + self.assertTrue(abs(summarizer._compute_average_probability_of_words(word_freq, s1) - 1 / 6) < EPS) + self.assertTrue(abs(summarizer._compute_average_probability_of_words(word_freq, s2) - 5 / 12) < EPS) + self.assertTrue(abs(summarizer._compute_average_probability_of_words(word_freq, s3) - 8 / 18) < EPS) def test_compute_ratings(self): summarizer = self._build_summarizer(self.EMPTY_STOP_WORDS) @@ -130,8 +127,7 @@ def test_compute_ratings(self): self.assertEqual(ratings[s1], -2) self.assertEqual(ratings[s2], -1) - - # Due to the frequency discounting, after finding sentence s0, + # Due to the frequency discounting, after finding sentence s0, # s2 should come before s1 since only two of its words get discounted # rather than all 3 of s1's s0 = Sentence("one two three", Tokenizer("english")) diff --git a/tests/test_summarizers/test_text_rank.py 
b/tests/test_summarizers/test_text_rank.py index 153e7dc8..767bf549 100644 --- a/tests/test_summarizers/test_text_rank.py +++ b/tests/test_summarizers/test_text_rank.py @@ -1,7 +1,8 @@ import unittest -from sumy.summarizers.text_rank import TextRankSummarizer from sumy.nlp.stemmers import Stemmer +from sumy.summarizers.text_rank import TextRankSummarizer + from ..utils import build_document @@ -16,7 +17,10 @@ def test_empty_document(self): def test_single_sentence(self): document = build_document(("I am one sentence",)) summarizer = TextRankSummarizer() - summarizer.stop_words = ("I", "am",) + summarizer.stop_words = ( + "I", + "am", + ) returned = summarizer(document, 10) self.assertEqual(len(returned), 1) @@ -24,7 +28,12 @@ def test_single_sentence(self): def test_two_sentences(self): document = build_document(("I am that 1. sentence", "And I am 2. winning prize")) summarizer = TextRankSummarizer() - summarizer.stop_words = ("I", "am", "and", "that",) + summarizer.stop_words = ( + "I", + "am", + "and", + "that", + ) returned = summarizer(document, 10) self.assertEqual(len(returned), 2) @@ -36,9 +45,18 @@ def test_stop_words_correctly_removed(self): summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"] document = build_document( - ("stop halt shut hmmm", "Stop Halt Shut Hmmm",), - ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",), - ("Some relevant sentence", "Some moRe releVant sentEnce",), + ( + "stop halt shut hmmm", + "Stop Halt Shut Hmmm", + ), + ( + "StOp HaLt ShUt HmMm", + "STOP HALT SHUT HMMM", + ), + ( + "Some relevant sentence", + "Some moRe releVant sentEnce", + ), ) sentences = document.sentences @@ -60,11 +78,13 @@ def test_stop_words_correctly_removed(self): self.assertEqual(expected, returned) def test_three_sentences_but_second_winner(self): - document = build_document([ - "I am that 1. sentence", - "And I am 2. sentence - winning sentence", - "And I am 3. sentence - winner is my 2nd name", - ]) + document = build_document( + [ + "I am that 1. 
sentence", + "And I am 2. sentence - winning sentence", + "And I am 3. sentence - winner is my 2nd name", + ] + ) summarizer = TextRankSummarizer() summarizer.stop_words = ["I", "am", "and", "that"] @@ -73,11 +93,13 @@ def test_three_sentences_but_second_winner(self): self.assertEqual(str(returned[0]), "And I am 2. sentence - winning sentence") def test_sentences_rating(self): - document = build_document([ - "a c e g", - "a b c d e f g", - "b d f", - ]) + document = build_document( + [ + "a c e g", + "a b c d e f g", + "b d f", + ] + ) summarizer = TextRankSummarizer() summarizer.stop_words = ["I", "am", "and", "that"] diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py index d705c927..a03dd5ab 100644 --- a/tests/test_tokenizers.py +++ b/tests/test_tokenizers.py @@ -32,8 +32,15 @@ def test_tokenize_sentence(self): words = tokenizer.to_words("I am a very nice sentence with comma, but..") expected = ( - "I", "am", "a", "very", "nice", "sentence", - "with", "comma", "but", + "I", + "am", + "a", + "very", + "nice", + "sentence", + "with", + "comma", + "but", ) self.assertEqual(expected, words) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 81dbf406..3125a02d 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -1,6 +1,7 @@ import unittest -from sumy.utils import get_stop_words, read_stop_words, ItemsCount +from sumy.utils import ItemsCount, get_stop_words, read_stop_words + from ..utils import expand_resource_path diff --git a/tests/utils.py b/tests/utils.py index 7e22d046..872f9720 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,8 +1,7 @@ -from io import StringIO -from os.path import dirname, join, abspath -from sumy.nlp.tokenizers import Tokenizer -from sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence +from os.path import abspath, dirname, join +from sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence +from sumy.nlp.tokenizers import Tokenizer _TOKENIZER 
= Tokenizer("czech") From 0d4e85c72e8ba4e7ae051ea07f82ea07561a09a7 Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 15:02:11 -0500 Subject: [PATCH 10/11] Fix all mypy typecheck errors across the sumy package - Add ignore_missing_imports=true to [tool.mypy] config for untyped third-party libraries (nltk, breadability) - Add frozenset[str] type annotations to _stop_words class attributes in text_rank.py, luhn.py, lsa.py, and lex_rank.py - Add frozenset[str] type annotation to _EMPTY_SET in edmundson.py - Fix numpy/svd optional import typing in lsa.py and lex_rank.py by declaring module-level `Any` annotations before try/except blocks --- pyproject.toml | 1 + sumy/summarizers/edmundson.py | 2 +- sumy/summarizers/lex_rank.py | 4 +++- sumy/summarizers/lsa.py | 5 ++++- sumy/summarizers/luhn.py | 2 +- sumy/summarizers/text_rank.py | 2 +- 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9e79ea32..5e747d4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,3 +84,4 @@ select = ["E", "F", "W", "I"] python_version = "3.10" warn_return_any = true warn_unused_configs = true +ignore_missing_imports = true diff --git a/sumy/summarizers/edmundson.py b/sumy/summarizers/edmundson.py index fcace9f8..602ff91a 100644 --- a/sumy/summarizers/edmundson.py +++ b/sumy/summarizers/edmundson.py @@ -7,7 +7,7 @@ from .edmundson_location import EdmundsonLocationMethod from .edmundson_title import EdmundsonTitleMethod -_EMPTY_SET = frozenset() +_EMPTY_SET: frozenset[str] = frozenset() class EdmundsonSummarizer(AbstractSummarizer): diff --git a/sumy/summarizers/lex_rank.py b/sumy/summarizers/lex_rank.py index fbe16498..56d5a79d 100644 --- a/sumy/summarizers/lex_rank.py +++ b/sumy/summarizers/lex_rank.py @@ -1,6 +1,8 @@ import math from collections import Counter +from typing import Any +numpy: Any try: import numpy except ImportError: @@ -17,7 +19,7 @@ class LexRankSummarizer(AbstractSummarizer): threshold = 0.1 
epsilon = 0.1 - _stop_words = frozenset() + _stop_words: frozenset[str] = frozenset() @property def stop_words(self): diff --git a/sumy/summarizers/lsa.py b/sumy/summarizers/lsa.py index 76ac2c9c..95d042b7 100644 --- a/sumy/summarizers/lsa.py +++ b/sumy/summarizers/lsa.py @@ -1,11 +1,14 @@ import math +from typing import Any from warnings import warn +numpy: Any try: import numpy except ImportError: numpy = None +singular_value_decomposition: Any try: from numpy.linalg import svd as singular_value_decomposition except ImportError: @@ -16,7 +19,7 @@ class LsaSummarizer(AbstractSummarizer): MIN_DIMENSIONS = 3 REDUCTION_RATIO = 1 / 1 - _stop_words = frozenset() + _stop_words: frozenset[str] = frozenset() @property def stop_words(self): diff --git a/sumy/summarizers/luhn.py b/sumy/summarizers/luhn.py index d6dcfaae..460a09a2 100644 --- a/sumy/summarizers/luhn.py +++ b/sumy/summarizers/luhn.py @@ -6,7 +6,7 @@ class LuhnSummarizer(AbstractSummarizer): max_gap_size = 4 # TODO: better recognition of significant words (automatic) significant_percentage = 1 - _stop_words = frozenset() + _stop_words: frozenset[str] = frozenset() @property def stop_words(self): diff --git a/sumy/summarizers/text_rank.py b/sumy/summarizers/text_rank.py index 33dfbd47..2461fa81 100644 --- a/sumy/summarizers/text_rank.py +++ b/sumy/summarizers/text_rank.py @@ -8,7 +8,7 @@ class TextRankSummarizer(AbstractSummarizer): """Source: https://github.com/adamfabish/Reduction""" - _stop_words = frozenset() + _stop_words: frozenset[str] = frozenset() @property def stop_words(self): From 0c4789101d704db820a9bc30096561f3baeab70a Mon Sep 17 00:00:00 2001 From: "mcode-bot@modelcode.ai" Date: Thu, 11 Jun 2026 15:05:42 -0500 Subject: [PATCH 11/11] Fix E402 lint errors in lsa.py and lex_rank.py import ordering Move the relative import of AbstractSummarizer above the numpy try/except blocks so all standard imports precede the optional import guards. 
This resolves ruff E402 (module-level import not at top of file) introduced by the mypy type annotation fix. --- sumy/summarizers/lex_rank.py | 4 ++-- sumy/summarizers/lsa.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sumy/summarizers/lex_rank.py b/sumy/summarizers/lex_rank.py index 56d5a79d..ff970f8e 100644 --- a/sumy/summarizers/lex_rank.py +++ b/sumy/summarizers/lex_rank.py @@ -2,14 +2,14 @@ from collections import Counter from typing import Any +from ._summarizer import AbstractSummarizer + numpy: Any try: import numpy except ImportError: numpy = None -from ._summarizer import AbstractSummarizer - class LexRankSummarizer(AbstractSummarizer): """ diff --git a/sumy/summarizers/lsa.py b/sumy/summarizers/lsa.py index 95d042b7..89bec519 100644 --- a/sumy/summarizers/lsa.py +++ b/sumy/summarizers/lsa.py @@ -2,6 +2,8 @@ from typing import Any from warnings import warn +from ._summarizer import AbstractSummarizer + numpy: Any try: import numpy @@ -13,7 +15,6 @@ from numpy.linalg import svd as singular_value_decomposition except ImportError: singular_value_decomposition = None -from ._summarizer import AbstractSummarizer class LsaSummarizer(AbstractSummarizer):