From a74e5ffae5d53187bc2b18eb1b98c8826489c3ab Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Tue, 10 Mar 2026 20:40:01 +0100 Subject: [PATCH 1/2] done linting --- CHANGELOG.md | 2 + RELEASING.md | 4 + setup.cfg | 2 - src/pyigt/__init__.py | 3 + src/pyigt/__main__.py | 5 +- src/pyigt/cli_util.py | 10 +- src/pyigt/commands/ls.py | 8 +- src/pyigt/commands/stats.py | 9 +- src/pyigt/graid.py | 269 ++++++++++++-------- src/pyigt/igt.py | 475 ++++++++++++++++++++---------------- src/pyigt/index.html | 102 -------- src/pyigt/lgrmorphemes.py | 129 ++++++---- src/pyigt/util.py | 40 ++- tests/test_igt.py | 17 +- tests/test_util.py | 7 + 15 files changed, 584 insertions(+), 498 deletions(-) delete mode 100644 src/pyigt/index.html create mode 100644 tests/test_util.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a629755..c61dbb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ ## unreleased - Support parsing GRAID annotations. +- Drop py3.8 compat. +- Removed the `Corpus.write_app` method. ## [2.2.0] - 2025-01-15 diff --git a/RELEASING.md b/RELEASING.md index a88bade..e8a3872 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -12,6 +12,10 @@ tox -r ```shell flake8 src ``` +- Make sure pylint passes with a score of 10: +```shell +pylint src +``` - Make sure docs can be created: ```shell diff --git a/setup.cfg b/setup.cfg index aa64500..8ac56a4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,12 +34,10 @@ package_dir = = src python_requires = >=3.8 install_requires = - attrs csvw clldutils pycldf segments>=2.0.0 - tabulate include_package_data = True [options.packages.find] diff --git a/src/pyigt/__init__.py b/src/pyigt/__init__.py index 767ea1c..3c5b689 100644 --- a/src/pyigt/__init__.py +++ b/src/pyigt/__init__.py @@ -1,3 +1,6 @@ +""" +Functionality to read and write interlinear glossed text. 
+""" from .igt import Corpus, IGT, LGRConformance, Example # noqa: F401 from .lgrmorphemes import GlossedWord, GlossedMorpheme # noqa: F401 diff --git a/src/pyigt/__main__.py b/src/pyigt/__main__.py index ed503de..b40bf5d 100644 --- a/src/pyigt/__main__.py +++ b/src/pyigt/__main__.py @@ -1,3 +1,6 @@ +""" +CLI for the pyigt package. +""" import sys import contextlib @@ -7,7 +10,7 @@ import pyigt.commands -def main(args=None, catch_all=False, parsed_args=None): +def main(args=None, catch_all=False, parsed_args=None): # pylint: disable=C0116 parser, subparsers = get_parser_and_subparsers('igt') register_subcommands(subparsers, pyigt.commands) diff --git a/src/pyigt/cli_util.py b/src/pyigt/cli_util.py index 5ff6f47..fb157f7 100644 --- a/src/pyigt/cli_util.py +++ b/src/pyigt/cli_util.py @@ -1,11 +1,16 @@ +""" +Helpers for the `pyigt` CLI. +""" import sys +import argparse from clldutils.clilib import PathType from pyigt import Corpus -def add_corpus(parser): +def add_corpus(parser: argparse.ArgumentParser): + """Add an argument to specify a CLDF Dataset.""" parser.add_argument( 'dataset', type=PathType(type='file', must_exist=False), @@ -13,7 +18,8 @@ def add_corpus(parser): "as CSV file or '-' to read from .") -def get_corpus(args): +def get_corpus(args: argparse.Namespace) -> Corpus: + """Retrieve a Corpus according to the input from the CLI. 
Works in tandem with `add_corpus`.""" if args.dataset.name == '-': return Corpus.from_stream(sys.stdin) return Corpus.from_path(args.dataset) diff --git a/src/pyigt/commands/ls.py b/src/pyigt/commands/ls.py index c9922f4..7e10ba3 100644 --- a/src/pyigt/commands/ls.py +++ b/src/pyigt/commands/ls.py @@ -6,7 +6,7 @@ from pyigt.cli_util import add_corpus, get_corpus -def register(parser): +def register(parser): # pylint: disable=C0116 add_corpus(parser) parser.add_argument( 'filter', @@ -23,7 +23,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 corpus = get_corpus(args) filters = [f.split('=', maxsplit=1) for f in args.filter] @@ -42,9 +42,9 @@ def match(igt, c, p): for igt in corpus: if (not filters) or all(match(igt, c, p) for c, p in filters): - print('Example {0}:'.format(igt.id)) + print(f'Example {igt.id}:') print(igt) print() if corpus.fname: - print('IGT corpus at {0}'.format(corpus.fname)) + print(f'IGT corpus at {corpus.fname}') diff --git a/src/pyigt/commands/stats.py b/src/pyigt/commands/stats.py index 9c5027b..29457cd 100644 --- a/src/pyigt/commands/stats.py +++ b/src/pyigt/commands/stats.py @@ -1,20 +1,21 @@ """ Describe the IGTs in a CLDF dataset """ -from clldutils.clilib import Table +from clldutils.clilib import Table, add_format from pyigt.cli_util import add_corpus, get_corpus -def register(parser): +def register(parser): # pylint: disable=C0116 add_corpus(parser) + add_format(parser) parser.add_argument('--verbose', action='store_true', default=False) -def run(args): +def run(args): # pylint: disable=C0116 corpus = get_corpus(args) - with Table('type', 'count') as t: + with Table(args, 'type', 'count') as t: e, w, m = corpus.get_stats() t.append(['example', e]) t.append(['word', w]) diff --git a/src/pyigt/graid.py b/src/pyigt/graid.py index 81a78d2..199f394 100644 --- a/src/pyigt/graid.py +++ b/src/pyigt/graid.py @@ -4,42 +4,43 @@ 
https://multicast.aspra.uni-bamberg.de/data/pubs/graid/Haig+Schnell2014_GRAID-manual_v7.0.pdf """ import re -import typing +from typing import Optional, Union, Protocol, Literal, Any import itertools import collections +from collections.abc import Iterable, Generator import dataclasses __all__ = ['Referent', 'Boundary', 'Predicate', 'Symbol', 'GRAID'] -SymbolDict = typing.Dict[typing.Union[typing.Tuple[str, str], str], str] +SymbolDict = dict[Union[tuple[str, str], str], str] -class Gloss(typing.Protocol): # pragma: no cover +class Gloss(Protocol): # pragma: no cover """ Classes passed to GRAID as `other_glosses` must implement this protocol. I.e. implement a classmethod `from_annotation`, which returns an instance of the class if the annotation matches the pattern or `None` otherwise. """ @classmethod - def from_annotation(cls, annotation: str, parser: "GRAID" = None) -> typing.Optional["Gloss"]: + def from_annotation(cls, annotation: str, parser: "GRAID" = None) -> Optional["Gloss"]: """ :return: `None` to signal that the annotation was not parsed, `Gloss` instance otherwise. """ - ... # pragma: no cover def __str__(self) -> str: """ The full gloss, re-assembled (and possibly normalized) or as passed to `from_annotation`. """ - ... - def describe(self, parser: "GRAID" = None) -> typing.Dict[str, str]: + def describe(self, parser: "GRAID" = None) -> dict[str, str]: # pylint: disable=C0116 ... -def update_symbols(symbols: SymbolDict, - d: SymbolDict, - attaches: typing.Union[typing.Literal['left'], typing.Literal['right']] = None): +def update_symbols( + symbols: SymbolDict, + d: SymbolDict, + attaches: Union[Literal['left'], Literal['right']] = None, +) -> None: """ Utility function to update GRAID symbol `dict`s. 
""" @@ -50,23 +51,23 @@ def update_symbols(symbols: SymbolDict, assert all( isinstance(k, str) or (k[1] if attaches == 'left' else k[0]) in symbols for k in symbols), ( - 'Core component of composite symbol is not a generic symbol: {}'.format(symbols)) + f'Core component of composite symbol is not a generic symbol: {symbols}') else: symbols.update(d) -def re_or(items: typing.Iterable[str]) -> str: +def re_or(items: Iterable[str]) -> str: """ Concatenate strings in as regular expression pattern matching any of them. """ return r'|'.join(re.escape(item) for item in items if isinstance(item, str)) -class GRAID: +class GRAID: # pylint: disable=R0902 """ The GRAID 7.0 specification. """ - def __init__(self, + def __init__(self, # pylint: disable=R0913,R0914,R0917 form_glosses: SymbolDict = None, form_gloss_specifiers: SymbolDict = None, referent_properties: SymbolDict = None, @@ -74,9 +75,9 @@ def __init__(self, syntactic_function_specifiers: SymbolDict = None, predicate_glosses: SymbolDict = None, clause_boundary_symbols: SymbolDict = None, - subconstituent_symbols: typing.Dict[str, typing.Tuple[str, list]] = None, + subconstituent_symbols: dict[str, tuple[str, list]] = None, other_symbols: SymbolDict = None, - other_glosses: typing.Optional[typing.List[Gloss]] = None, + other_glosses: Optional[list[Gloss]] = None, with_cross_index=False): """ Almost all lists of symbols specified by the GRAID standard may be extended with new, @@ -207,33 +208,35 @@ def __init__(self, if with_cross_index: self.other_glosses.append(CrossIndex) - def iter_expressions(self, s) -> typing.Generator[str, None, None]: + def iter_expressions(self, s: str) -> Generator[str, None, None]: # pylint: disable=C0116 sep = None - for item in itertools.dropwhile( - lambda ss: not ss, re.split(r'({})'.format(re_or(self.morpheme_separators)), s)): + pattern = r'({})'.format(re_or(self.morpheme_separators)) # pylint: disable=C0209 + for item in itertools.dropwhile(lambda ss: not ss, re.split(pattern, s)): 
if item in self.morpheme_separators: sep = item else: assert item - yield '{}{}'.format(sep if sep else '', item) + yield f"{sep if sep else ''}{item}" sep = None - assert not sep, 'Trailing morpheme separator in gloss: {}'.format(s) + assert not sep, f'Trailing morpheme separator in gloss: {s}' - def __call__(self, gloss: str) \ - -> typing.List[typing.Union[Gloss, "Boundary", "Symbol", "Predicate", "Referent"]]: + def __call__( + self, + gloss: str, + ) -> list[Union[Gloss, "Boundary", "Symbol", "Predicate", "Referent"]]: """ Call a GRAID object to parse a full-word GRAID annotation. """ return [self.parse_expression(exp) for exp in self.iter_expressions(gloss.strip())] - def parse_expression(self, expression): + def parse_expression(self, expression): # pylint: disable=C0116 for cls in self.other_glosses + [Symbol, Boundary, Predicate, Referent]: obj = cls.from_annotation(expression, self) if obj: return obj - raise ValueError('Could not parse expression: {}'.format(expression)) # pragma: no cover + raise ValueError(f'Could not parse expression: {expression}') # pragma: no cover - def parse_function(self, function, predicate=False): + def parse_function(self, function, predicate=False): # pylint: disable=C0116 kw = {} function = function.split('_') if predicate: @@ -264,14 +267,19 @@ def parse_function(self, function, predicate=False): @dataclasses.dataclass -class Symbol: +class Symbol: # pylint: disable=C0115 symbol: str morpheme_separator: str = None def __str__(self): - return '{}{}'.format(self.morpheme_separator or '', self.symbol) + return f"{self.morpheme_separator or ''}{self.symbol}" - def describe(self, parser: GRAID = None): + def describe(self, parser: GRAID = None) -> collections.OrderedDict[str, Any]: + """ + >>> s = Symbol.from_annotation('#nc') + >>> s.describe() + OrderedDict({'#nc': 'boundary, not considered'}) + """ parser = parser or GRAID() res = collections.OrderedDict() if self.morpheme_separator: @@ -280,27 +288,44 @@ def describe(self, 
parser: GRAID = None): return res @classmethod - def from_annotation(cls, ann, parser) -> typing.Optional["Symbol"]: + def from_annotation( # pylint: disable=C0116 + cls, + ann: str, + parser: GRAID = None, + ) -> Optional["Symbol"]: parser = parser or GRAID() kw = {} if any(ann.startswith(sep) for sep in parser.morpheme_separators): kw['morpheme_separator'], ann = ann[:1], ann[1:] if ann in parser.other_symbols: return cls(symbol=ann, **kw) + return None @dataclasses.dataclass -class Boundary: +class Boundary: # pylint: disable=R0902 + """A GRAID boundary object.""" boundary_type: str clause_type: str = None ds: bool = False neg: bool = False property: str = None function: str = None - function_qualifiers: typing.List[str] = dataclasses.field(default_factory=list) - qualifiers: typing.List[str] = dataclasses.field(default_factory=list) + function_qualifiers: list[str] = dataclasses.field(default_factory=list) + qualifiers: list[str] = dataclasses.field(default_factory=list) - def describe(self, parser: GRAID = None): + def describe(self, parser: GRAID = None) -> dict[str, str]: + """ + >>> bnd = Boundary.from_annotation('#ds_cc.neg:p') + >>> for k, v in bnd.describe().items(): + ... print(f'{k}:\t{v}') + ... 
+ #: boundary of dependent clause, inserted at left edge, further specified + ds: direct speech + cc: complement clause + neg: negative polarity + p: transitive object + """ parser = parser or GRAID() res = collections.OrderedDict() res[self.boundary_type] = parser.boundary_markers[self.boundary_type] @@ -322,13 +347,18 @@ def describe(self, parser: GRAID = None): return res @classmethod - def from_annotation(cls, annotation: str, parser=None) -> typing.Optional["Boundary"]: + def from_annotation( # pylint: disable=R0912 + cls, + annotation: str, + parser: GRAID = None, + ) -> Optional["Boundary"]: + """Initialize a boundary instance from an annotation string.""" parser = parser or GRAID() for marker in parser.boundary_markers: if annotation.startswith(marker): break else: - return + return None parser = parser or DEFAULT_PARSER kw = {'qualifiers': [], 'boundary_type': marker, 'function_qualifiers': []} @@ -364,36 +394,46 @@ def from_annotation(cls, annotation: str, parser=None) -> typing.Optional["Bound return cls(**kw) def __str__(self): - return '{}{}{}{}{}'.format( - self.boundary_type, - '_'.join((['ds'] if self.ds else []) + # noqa: W504 - ([self.clause_type] if self.clause_type else []) + # noqa: W504 - self.qualifiers), - '.neg' if self.neg else ('.' + self.property if self.property else ''), - ':{}'.format(self.function) if self.function else '', - ''.join('_' + fq for fq in self.function_qualifiers), - ) + spec = ['ds'] if self.ds else [] + spec.extend([self.clause_type] if self.clause_type else []) + spec.extend(self.qualifiers) + neg = '.neg' if self.neg else ('.' 
+ self.property if self.property else '') + func = f':{self.function}' if self.function else '' + fqual = ''.join('_' + fq for fq in self.function_qualifiers) + return f'{self.boundary_type}{"_".join(spec)}{neg}{func}{fqual}' @dataclasses.dataclass class Expression: + """A GRAID expression.""" form_gloss: str = None function: str = None morpheme_separator: str = None # -, = may be leading or trailing! - form_qualifiers: typing.List[str] = dataclasses.field(default_factory=list) - function_qualifiers: typing.List[str] = dataclasses.field(default_factory=list) + form_qualifiers: list[str] = dataclasses.field(default_factory=list) + function_qualifiers: list[str] = dataclasses.field(default_factory=list) @dataclasses.dataclass class Predicate(Expression): + """ + A GRAID object. + """ def __str__(self): res = self.morpheme_separator or '' res += '_'.join(self.form_qualifiers + [self.form_gloss]) if self.function: - res += ':{}'.format('_'.join([self.function] + self.function_qualifiers)) + res += f":{'_'.join([self.function] + self.function_qualifiers)}" return res def describe(self, parser: GRAID = None): + """ + >>> pred = Predicate.from_annotation('v:pred') + >>> for k, v in pred.describe().items(): + ... print(f'{k}:\t{v}') + ... + v: verb or verb complex (cf. 
Section 2.5.1) + pred: predicative function + """ parser = parser or GRAID() res = collections.OrderedDict() if (self.morpheme_separator, self.form_gloss) in parser.predicate_glosses: @@ -405,7 +445,7 @@ def describe(self, parser: GRAID = None): res[self.form_gloss] = parser.predicate_glosses[self.form_gloss] if self.form_qualifiers: - res['{}_{}'.format(self.form_qualifiers[0], self.form_gloss)] = ( + res[f'{self.form_qualifiers[0]}_{self.form_gloss}'] = ( parser.predicate_glosses[(self.form_qualifiers[0], self.form_gloss)]) if self.function: @@ -415,7 +455,7 @@ def describe(self, parser: GRAID = None): return res @classmethod - def from_annotation(cls, annotation: str, parser=None) -> typing.Optional["Predicate"]: + def from_annotation(cls, annotation: str, parser=None) -> Optional["Predicate"]: """ 1. check morpheme separator 2. split off function, separated by : @@ -433,10 +473,10 @@ def from_annotation(cls, annotation: str, parser=None) -> typing.Optional["Predi kw['morpheme_separator'], ann = ann[:1], ann[1:] ann = ann.split('_') if ann[0] in parser.subconstituent_markers: - return + return None if not (ann[-1] in parser.predicate_glosses or tuple(ann[-2:]) in parser.predicate_glosses): # Don't raise an error, because this may still be parsed as valid Referent! - return + return None # Now we know it's supposed to be a predicate. So parsing problems mean raising ValueError. 
if function: @@ -454,9 +494,10 @@ def from_annotation(cls, annotation: str, parser=None) -> typing.Optional["Predi @dataclasses.dataclass class Referent(Expression): + """A GRAID object.""" property: str = None subconstituent: str = None - subconstituent_qualifiers: typing.List[str] = dataclasses.field(default_factory=list) + subconstituent_qualifiers: list[str] = dataclasses.field(default_factory=list) def __str__(self): res = self.morpheme_separator or '' @@ -465,12 +506,22 @@ def __str__(self): self.form_qualifiers + # noqa: W504 ([self.form_gloss] if self.form_gloss else [])) if self.property: - res += '.{}'.format(self.property) + res += f'.{self.property}' if self.function: - res += ':{}'.format('_'.join([self.function] + self.function_qualifiers)) + res += f":{'_'.join([self.function] + self.function_qualifiers)}" return res - def describe(self, parser: GRAID = None): + def describe(self, parser: GRAID = None) -> dict[str, str]: + """ + >>> ref = Referent.from_annotation('rn_refl_pro.h:poss') + >>> for k, v in ref.describe().items(): + ... print(f'{k}:\t{v}') + ... + rn: NP-internal subconstituent occurring to the right of NP head + pro: free pronoun in full form + refl: reflexive or reciprocal pronoun, cf. 
Section 4.2 + poss: possessor + """ parser = parser or GRAID() res = collections.OrderedDict() if (self.morpheme_separator, self.form_gloss) in parser.form_glosses: @@ -488,21 +539,13 @@ def describe(self, parser: GRAID = None): if self.form_gloss: res[self.form_gloss] = parser.form_glosses[self.form_gloss] - for i, q in enumerate(reversed(self.form_qualifiers)): - if i == 0: - if (q, self.form_gloss) in parser.form_glosses: - res['{}_{}'.format(q, self.form_gloss)] = ( - parser.form_glosses[(q, self.form_gloss)]) - else: - res[q] = parser.form_glosses.get(q, parser.form_gloss_specifiers.get(q)) - else: - res[q] = parser.form_glosses.get(q, parser.form_gloss_specifiers.get(q)) + res.update(dict(self._describe_form_qualifiers(parser))) start = 0 if self.function: if (self.function_qualifiers and # noqa: W504 (self.function, self.function_qualifiers[0]) in parser.syntactic_functions): - res['{}_{}'.format(self.function, self.function_qualifiers[0])] = ( + res[f'{self.function}_{self.function_qualifiers[0]}'] = ( parser.syntactic_functions)[(self.function, self.function_qualifiers[0])] start = 1 else: @@ -512,6 +555,16 @@ def describe(self, parser: GRAID = None): res[q] = parser.syntactic_function_specifiers[q] return res + def _describe_form_qualifiers(self, parser) -> Generator[tuple[str, Any]]: + for i, q in enumerate(reversed(self.form_qualifiers)): + if i == 0: + if (q, self.form_gloss) in parser.form_glosses: + yield f'{q}_{self.form_gloss}', parser.form_glosses[(q, self.form_gloss)] + else: + yield q, parser.form_glosses.get(q, parser.form_gloss_specifiers.get(q)) + else: + yield q, parser.form_glosses.get(q, parser.form_gloss_specifiers.get(q)) + @classmethod def from_annotation(cls, annotation: str, parser=None) -> "Referent": """ @@ -533,41 +586,54 @@ def from_annotation(cls, annotation: str, parser=None) -> "Referent": elif any(ann.startswith(scm + '_') for scm in parser.subconstituent_markers): kw['subconstituent'], _, ann = ann.partition('_') if 
kw.get('subconstituent') and kw['subconstituent'] in parser.subconstituent_symbols: - kw['subconstituent_qualifiers'] = [] - # Consume subconstituent_symbols from the left - pattern = re.compile( - r'(?P<sym>{})(_|$)'.format( - re_or(parser.subconstituent_symbols[kw['subconstituent']]))) - m = pattern.match(ann) - while m: - kw['subconstituent_qualifiers'].append(m.group('sym')) - ann = ann[m.end():] - m = pattern.match(ann) + ann = cls._parse_subconstituent(parser, ann, kw) ann, _, function = ann.partition(":") if function: kw.update(parser.parse_function(function)) - ann, _, property = ann.partition(".") - if property: - if property not in parser.referent_properties: + ann, _, prop = ann.partition(".") + if prop: + if prop not in parser.referent_properties: raise ValueError(annotation) - kw['property'] = property + kw['property'] = prop if ann: - ann = ann.split("_") - if not (ann[-1] in parser.form_glosses or tuple(ann[-2:]) in parser.form_glosses): - raise ValueError(annotation) - kw['form_gloss'] = ann.pop() - kw['form_qualifiers'] = [] - if ann: - if (ann[-1], kw['form_gloss']) in parser.form_glosses: - kw['form_qualifiers'].append(ann.pop()) - for a in ann: - if a in parser.form_gloss_specifiers or a in parser.form_glosses: - kw['form_qualifiers'].insert(0, a) - else: - raise ValueError(annotation) + try: + cls._parse_form(parser, ann, kw) + except ValueError: + raise ValueError(annotation) # pylint: disable=W0707 + return cls(**kw) + @staticmethod + def _parse_form(parser, ann, kw): + ann = ann.split("_") + if not (ann[-1] in parser.form_glosses or tuple(ann[-2:]) in parser.form_glosses): + raise ValueError() + kw['form_gloss'] = ann.pop() + kw['form_qualifiers'] = [] + if ann: + if (ann[-1], kw['form_gloss']) in parser.form_glosses: + kw['form_qualifiers'].append(ann.pop()) + for a in ann: + if a in parser.form_gloss_specifiers or a in parser.form_glosses: + kw['form_qualifiers'].insert(0, a) + else: + raise ValueError() + + @staticmethod + def 
_parse_subconstituent(parser, ann, kw): + kw['subconstituent_qualifiers'] = [] + # Consume subconstituent_symbols from the left + pattern = re.compile( + r'(?P<sym>{})(_|$)'.format( # pylint: disable=C0209 + re_or(parser.subconstituent_symbols[kw['subconstituent']]))) + m = pattern.match(ann) + while m: + kw['subconstituent_qualifiers'].append(m.group('sym')) + ann = ann[m.end():] + m = pattern.match(ann) + return ann + @dataclasses.dataclass class CrossIndex: @@ -581,19 +647,25 @@ class CrossIndex: morpheme_separator: str = None def __str__(self): - return '{}{}pro_{}_{}'.format( + return '{}{}pro_{}_{}'.format( # pylint: disable=C0209 self.morpheme_separator or '', self.subconstituent_marker + '_' if self.subconstituent_marker else '', self.referent_property, self.function, ) - def describe(self, parser: GRAID = None) -> typing.Dict[str, str]: - parser = parser or GRAID() + def describe(self, parser: GRAID = None) -> dict[str, str]: # pylint: disable=W0613,C0116 return {'symbol': str(self)} @classmethod - def from_annotation(cls, ann, parser: GRAID = None) -> typing.Optional["CrossIndex"]: + def from_annotation(cls, ann, parser: GRAID = None) -> Optional["CrossIndex"]: + """ + >>> ci = CrossIndex.from_annotation('-rn_pro_1_a') + >>> ci.referent_property + '1' + >>> ci.function + 'a' + """ parser = parser or GRAID() kw = {} if any(ann.startswith(sep) for sep in parser.morpheme_separators): @@ -602,9 +674,10 @@ def from_annotation(cls, ann, parser: GRAID = None) -> typing.Optional["CrossInd if ann.startswith(scm + '_'): kw['subconstituent_marker'], ann = scm, ann[len(scm) + 1:] m = re.fullmatch( - r'pro_(?P<rp>{})_(?P<f>{})'.format( + r'pro_(?P<rp>{})_(?P<f>{})'.format( # pylint: disable=C0209 re_or(parser.referent_properties), re_or(parser.syntactic_functions)), ann) if m: kw['referent_property'], kw['function'] = m.group('rp'), m.group('f') return cls(**kw) + return None # pragma: no cover diff --git a/src/pyigt/igt.py b/src/pyigt/igt.py index afa39e6..022645d 100644 --- 
a/src/pyigt/igt.py +++ b/src/pyigt/igt.py @@ -1,20 +1,22 @@ +""" +Functionality related to interlinear glossed text. +""" +import functools import re import enum -import json import types -import shutil -import typing +from typing import Optional, Union, Literal, Callable, Any import pathlib import tempfile import itertools import collections +from collections.abc import Iterable, Generator +import dataclasses import unicodedata -from tabulate import tabulate import segments -import attr from csvw.dsv import UnicodeWriter, reader -from csvw.metadata import Link +from csvw.metadata import Link, TableGroup from pycldf import Dataset from pycldf import orm import pycldf @@ -24,17 +26,18 @@ except ImportError: # pragma: no cover lingpy = False -from pyigt.util import expand_standard_abbr +from pyigt.util import expand_standard_abbr, align from pyigt.lgrmorphemes import ( GlossedWord, split_morphemes, remove_morpheme_separators, GlossedMorpheme ) -__all__ = ['IGT', 'Corpus', 'LGRConformance', 'Example'] +__all__ = ['IGT', 'Corpus', 'LGRConformance', 'Example', 'LingPySettings'] NON_OVERT_ELEMENT = '∅' +ConcordanceType = Literal["grammar", "lexicon", "form"] -def with_lingpy(): +def with_lingpy(): # pylint: disable=C0116 if not lingpy: raise ValueError('pyigt must be installed with lingpy support for this functionality! ' 'Run `pip install pyigt[lingpy]`') @@ -69,8 +72,8 @@ def parse_phrase(p): return p -@attr.s -class IGT(object): +@dataclasses.dataclass +class IGT: # pylint: disable=R0902 """ The main trait of IGT is the alignment of words and glosses. Thus, we are mostly interested in the two aligned "lines": the analyzed text and the glosses, rather than trying to support @@ -124,22 +127,19 @@ class IGT(object): >>> igt[0].glossed_morphemes # we extract as many glossed morphemes as possible ... 
[] """ - phrase = attr.ib( - validator=attr.validators.instance_of(list), - converter=parse_phrase, - ) - gloss = attr.ib( - validator=attr.validators.instance_of(list), - converter=lambda g: g.split() if isinstance(g, str) else g, - ) - id = attr.ib(default=None) - properties = attr.ib(validator=attr.validators.instance_of(dict), default=attr.Factory(dict)) - language = attr.ib(default=None) - translation = attr.ib(default=None) - abbrs = attr.ib(validator=attr.validators.instance_of(dict), default=attr.Factory(dict)) - strict = attr.ib(default=False) - - def __attrs_post_init__(self): + phrase: list[str] + gloss: list[str] + id: Optional[str] = None + properties: dict = dataclasses.field(default_factory=dict) + language: str = None + translation: str = None + abbrs: dict = dataclasses.field(default_factory=dict) + strict: bool = False + + def __post_init__(self): + self.phrase = parse_phrase(self.phrase) + self.gloss = self.gloss.split() if isinstance(self.gloss, str) else self.gloss + if self.translation: p = re.compile(r'\((?P((\s*,\s*)?[A-Z][A-Z0-9]*\s*=\s*[^,)]+)+)\)') abbrs = p.search(self.translation) @@ -163,11 +163,11 @@ def __iter__(self): yield from self.glossed_words @property - def glossed_words(self) -> typing.List[GlossedWord]: + def glossed_words(self) -> list[GlossedWord]: # pylint: disable=C0116 return [GlossedWord(w, g, strict=self.strict) for w, g in zip(self.phrase, self.gloss)] @property - def prosodic_words(self) -> typing.List[GlossedWord]: + def prosodic_words(self) -> list[GlossedWord]: """ Interpret an IGT's phrase prosodically, i.e. @@ -195,7 +195,7 @@ def prosodic_words(self) -> typing.List[GlossedWord]: return res @property - def morphosyntactic_words(self) -> typing.List[GlossedWord]: + def morphosyntactic_words(self) -> list[GlossedWord]: """ Interpret an IGT's phrase morphosyntactically, i.e. 
@@ -262,6 +262,7 @@ def as_morphosyntactic(self): @property def gloss_abbrs(self) -> collections.OrderedDict: + """The gloss abbreviations used in the IGT.""" res = collections.OrderedDict() for gw in self.glossed_words: for gm in gw: @@ -280,21 +281,21 @@ def __str__(self): A plain text representation of the IGT, to be viewed with a monospaced font to make alignments work. """ - return '{0}\n{1}{2}'.format( - self.primary_text, - tabulate([self.gloss], self.phrase, tablefmt='plain'), - '\n‘{}’'.format(self.translation) if self.translation else '', - ) + aligned = align(self.gloss, self.phrase) + translation = f'\n‘{self.translation}’' if self.translation else '' + return f'{self.primary_text}\n{aligned}{translation}' - def pprint(self): + def pprint(self): # pylint: disable=C0116 abbrs = [(k, v) for k, v in self.gloss_abbrs.items() if v] if abbrs: mlen = max(len(a[0]) for a in abbrs) - abbrs = ''.join('\n {} = {}'.format(k.ljust(mlen), v) for k, v in abbrs) - print('{}{}'.format(self, abbrs or '')) + abbrs = ''.join(f'\n {k.ljust(mlen)} = {v}' for k, v in abbrs) + print(f"{self}{abbrs or ''}") - def __getitem__(self, i: typing.Union[int, typing.Tuple[int, typing.Union[int, slice]]]) \ - -> typing.Union[typing.List, GlossedWord, GlossedMorpheme]: + def __getitem__( + self, + i: Union[int, tuple[int, Union[int, slice]]], + ) -> Union[list, GlossedWord, GlossedMorpheme]: """ Provide access to `GlossedWord` or `GlossedMorpheme` (s) by zero-based index. 
@@ -335,7 +336,7 @@ def conformance(self) -> LGRConformance: return LGRConformance.WORD_ALIGNED return LGRConformance.UNALIGNED - def is_valid(self, strict: bool = False) -> bool: + def is_valid(self, strict: bool = False) -> bool: # pylint: disable=C0116 try: self.check(strict=strict) return True @@ -357,16 +358,16 @@ def check(self, strict: bool = False, verbose: bool = False): for i, (m, g) in enumerate(zip(self.phrase, self.gloss)): try: GlossedWord(m, g, strict=True) - except ValueError: + except ValueError as e: if verbose: print(self.phrase[i]) print(self.gloss[i]) raise ValueError( 'Rule 2 violated: Number of morphemes does not match number of morpheme ' - 'glosses!') + 'glosses!') from e @property - def phrase_text(self) -> str: + def phrase_text(self) -> str: # pylint: disable=C0116 return ' '.join([w or '' for w in self.phrase]) @property @@ -382,11 +383,11 @@ def primary_text(self) -> str: return remove_morpheme_separators(self.phrase_text) @property - def gloss_text(self) -> str: + def gloss_text(self) -> str: # pylint: disable=C0116 return ' '.join(self.gloss) -class Example(orm.Example): +class Example(orm.Example): # pylint: disable=R0903 """ A custom object class to use with `pycldf.orm `_ @@ -407,10 +408,11 @@ class Example(orm.Example): """ @property def igt(self) -> IGT: - tr = "'{}'".format(self.cldf.translatedText) + """Create an IGT instance based on the data of the example row.""" + tr = f"'{self.cldf.translatedText}'" try: if self.cldf.comment: - tr += ' ({})'.format(self.cldf.comment) + tr += f' ({self.cldf.comment})' except AttributeError: # pragma: no cover pass return IGT( @@ -427,7 +429,91 @@ def _clean_lexical_concept(s): return s.replace('†', '').strip() -class Corpus(object): +@dataclasses.dataclass +class MorphemeContext: + """The context of a morpheme in a corpus, i.e. 
the word it appears in and the IGT.""" + igt: IGT + word: GlossedWord + morpheme: GlossedMorpheme + + @property + def concepts(self) -> list[tuple[str, str]]: + """ + Categorized concepts appearing in the morpheme. + """ + res = list(itertools.zip_longest(self.morpheme.lexical_concepts, [], fillvalue='lexicon')) + res.extend( + list(itertools.zip_longest(self.morpheme.grammatical_concepts, [], fillvalue='grammar')) + ) + return res + + +@dataclasses.dataclass(frozen=True) +class MorphemeReference: + """Morphemes in a corpus are identified by three numbers.""" + igt_index: int + word_index: int + morpheme_index: int + + def __str__(self): + return f'{self.igt_index}:{self.word_index}:{self.morpheme_index}' + + def resolve(self, corpus) -> MorphemeContext: + """Resolve the indices against the corpus.""" + return MorphemeContext( + corpus[self.igt_index], + corpus[self.igt_index, self.word_index], + corpus[self]) + + +@dataclasses.dataclass(frozen=True) +class Concordance: + """A concordance maps glosses or forms to lists of occurrences.""" + grammar: dict[str, list[MorphemeReference]] + lexicon: dict[str, list[MorphemeReference]] + form: dict[str, list[MorphemeReference]] + + @classmethod + def from_igts(cls, igts: collections.OrderedDict[Union[str, int], IGT]): + """Create a concordance for the morphemes in IGT items.""" + grammar = collections.defaultdict(list) + lexicon = collections.defaultdict(list) + form = collections.defaultdict(list) + for idx, igt in igts.items(): + if not igt.is_valid(strict=True): # We ignore non-morpheme-aligned IGTs. 
+ continue + for i, gw in enumerate(igt): + for j, gm in enumerate(gw): + if not gm.form: + continue + + ref = MorphemeReference(idx, i, j) + for g in gm.grammatical_concepts: + grammar[g].append(ref) + lexicon[' // '.join(gm.lexical_concepts)].append(ref) + form[gm.form].append(ref) + return cls(grammar, lexicon, form) + + +@dataclasses.dataclass(frozen=True) +class LingPySettings: + """Container for settings related to initializing a LingPy wordlist.""" + ref: str = 'crossid' + lexstat: bool = True + threshold: float = 0.4 + + def get_wordlist(self, d: collections.OrderedDict[int, list[Any]]): # pylint: disable=C0116 + if self.lexstat: + wl = with_lingpy().LexStat(d) + wl.cluster(method='sca', threshold=self.threshold, ref=self.ref) + else: + wl = with_lingpy().Wordlist(d) + wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]]) + wl.renumber('cog', self.ref) + return wl + + +class Corpus: """ A Corpus is an immutable, ordered list of `IGT` instances. @@ -436,34 +522,17 @@ class Corpus(object): :ivar monolingual: Flag signaling whether the corpus is monolingual or contains `IGT` from \ different object languages. """ - def __init__(self, igts: typing.Iterable[IGT], fname=None, clean_lexical_concept=None): + def __init__(self, igts: Iterable[IGT], fname=None, clean_lexical_concept=None): self.clean_lexical_concept = clean_lexical_concept or _clean_lexical_concept self.fname = fname self._igts = collections.OrderedDict([(igt.id or n, igt) for n, igt in enumerate(igts)]) - self._concordance = dict( - grammar=collections.defaultdict(list), - lexicon=collections.defaultdict(list), - form=collections.defaultdict(list), - ) # Since changing the IGTs in the corpus is not allowed, we can compute concordances right # away. - for idx, igt in self._igts.items(): - if not igt.is_valid(strict=True): # We ignore non-morpheme-aligned IGTs. 
- continue - for i, gw in enumerate(igt): - for j, gm in enumerate(gw): - if not gm.form: - continue - - ref = (idx, i, j) - for g in gm.grammatical_concepts: - self._concordance['grammar'][g].append(ref) - self._concordance['lexicon'][' // '.join(gm.lexical_concepts)].append(ref) - self._concordance['form'][gm.form].append(ref) + self._concordance = Concordance.from_igts(self._igts) self.monolingual = len(set(igt.language for igt in self._igts.values())) == 1 @property - def grammar(self) -> typing.Dict[str, typing.List[typing.Tuple[int, int, int]]]: + def grammar(self) -> dict[str, list[MorphemeReference]]: """ Maps grammatical concepts to lists of occurrences. @@ -477,10 +546,10 @@ def grammar(self) -> typing.Dict[str, typing.List[typing.Tuple[int, int, int]]]: [[], []] """ - return self._concordance['grammar'] + return self._concordance.grammar @property - def lexicon(self) -> typing.Dict[str, typing.List[typing.Tuple[int, int, int]]]: + def lexicon(self) -> dict[str, list[MorphemeReference]]: """ Maps lexical concepts to lists of occurrences. @@ -493,10 +562,10 @@ def lexicon(self) -> typing.Dict[str, typing.List[typing.Tuple[int, int, int]]]: >>> [c[ref] for ref in c.lexicon['Sohn']] [] """ - return self._concordance['lexicon'] + return self._concordance.lexicon @property - def form(self) -> typing.Dict[str, typing.List[typing.Tuple[int, int, int]]]: + def form(self) -> dict[str, list[MorphemeReference]]: """ Maps grammatical concepts to lists of occurrences. 
@@ -509,10 +578,10 @@ def form(self) -> typing.Dict[str, typing.List[typing.Tuple[int, int, int]]]: >>> [k for k in c.form] ['ni', 'c', 'chihui', 'lia', 'in', 'no', 'piltzin', 'ce', 'calli'] """ - return self._concordance['form'] + return self._concordance.form @staticmethod - def get_column_names(cldf: Dataset) -> types.SimpleNamespace: + def get_column_names(cldf: Dataset) -> types.SimpleNamespace: # pylint: disable=C0116 # We lookup local column names by ontology term: lookup = [ ('id', 'id'), @@ -533,25 +602,28 @@ def from_cldf(cls, cldf: Dataset) -> 'Corpus': :param cldf: a `pycldf.Dataset` instance. :param spec: a `CorpusSpec` instance, specifying how to interpret markup in the corpus. """ + def fix_tab(s): + if s and '\\t' in s[0]: + return s[0].split('\\t') + return s + cols = cls.get_column_names(cldf) igts = [ IGT( id=igt[cols.id], - gloss=igt[cols.gloss], - phrase=igt[cols.phrase], + gloss=fix_tab(igt[cols.gloss]), + phrase=fix_tab(igt[cols.phrase]), language=igt.get(cols.language), translation=igt.get(cols.translation), properties=igt, ) for igt in cldf['ExampleTable']] - return cls( - igts, - fname=cldf.tablegroup._fname.parent / str(cldf['ExampleTable'].url)) + d = cldf.tablegroup._fname.parent # pylint: disable=W0212 + return cls(igts, fname=d / str(cldf['ExampleTable'].url)) @classmethod - def from_stream(cls, stream) -> 'Corpus': - from csvw.metadata import TableGroup - cldf = Dataset(TableGroup(fname=pathlib.Path('tmp.json'))) + def from_stream(cls, stream) -> 'Corpus': # pylint: disable=C0116 + cldf = Dataset(TableGroup()) cldf.add_component('ExampleTable') cols = cls.get_column_names(cldf) @@ -567,7 +639,7 @@ def from_stream(cls, stream) -> 'Corpus': return cls(igts) @classmethod - def from_path(cls, path: typing.Union[str, pathlib.Path]) -> 'Corpus': + def from_path(cls, path: Union[str, pathlib.Path]) -> 'Corpus': """ Instantiate a corpus from a file path. 
@@ -586,7 +658,7 @@ def from_path(cls, path: typing.Union[str, pathlib.Path]) -> 'Corpus': break ds = Dataset.from_metadata( pathlib.Path(pycldf.__file__).parent / 'modules' / 'Generic-metadata.json') - ds.tablegroup._fname = path.parent / 'cldf-metadata.json' + ds.tablegroup._fname = path.parent / 'cldf-metadata.json' # pylint: disable=W0212 t = ds.add_component('ExampleTable') t.url = Link(path.name) default_cols = [col.name for col in t.tableSchema.columns] @@ -600,23 +672,36 @@ def __len__(self): def __iter__(self): return iter(self._igts.values()) - def __getitem__(self, item): + def __getitem__(self, item) -> Union[IGT, GlossedWord, GlossedMorpheme]: + if isinstance(item, MorphemeReference): + item = dataclasses.astuple(item) if not isinstance(item, tuple): return self._igts[item] if item in self._igts else list(self._igts.values())[item] if len(item) == 2: return self._igts[item[0]][item[1]] return self[item[0]][tuple(item[1:])] - def get_stats(self): + def get_stats(self) -> tuple[int, int, int]: # pylint: disable=C0116 return ( len(self), sum(len(igt) for igt in self), sum(sum(len(w) for w in igt) for igt in self)) - def get_lgr_conformance_stats(self): + def get_lgr_conformance_stats(self) -> dict[LGRConformance, int]: # pylint: disable=C0116 return collections.Counter([igt.conformance for igt in self]) - def write_concordance(self, ctype: str, filename=None): + def _write_csv(self, filename, rows): + with UnicodeWriter(filename, delimiter='\t') as w: + w.writerows(rows) + + if not filename: + print(w.read().decode('utf8')) + + def write_concordance( + self, + ctype: ConcordanceType, + filename: Optional[Union[str, pathlib.Path]] = None, + ): """ :param ctype: `lexicon` or `grammar` or `form`. """ @@ -625,49 +710,43 @@ def write_concordance(self, ctype: str, filename=None): for ref in refs: # We want one row per unique (form, language, concept, gloss). 
if ctype == 'form': - gloss = str(self[ref].gloss) - conc[c, gloss, gloss, self[ref[0]].language].append(ref) + key = [c, str(self[ref].gloss), str(self[ref].gloss)] else: - conc[ - self[ref].form, - self.clean_lexical_concept(c), - c, - self[ref[0]].language].append(ref) - - with UnicodeWriter(filename, delimiter='\t') as w: - h = ['ID', 'FORM', 'GLOSS', 'GLOSS_IN_SOURCE', 'OCCURRENCE', 'REF'] + key = [self[ref].form, self.clean_lexical_concept(c), c] + conc[tuple(key + [self[ref.igt_index].language])].append(ref) + + rows = [] + h = ['ID', 'FORM', 'GLOSS', 'GLOSS_IN_SOURCE', 'OCCURRENCE', 'REF'] + if not self.monolingual: + h.insert(1, 'LANGUAGE_ID') + rows.append(h) + # We order the rows by descending frequency: + for i, (k, refs) in enumerate( + sorted(conc.items(), key=lambda x: (-len(x[1]), x[0])), start=1): + c = [i, k[0], k[1], k[2], len(refs), ' '.join(str(ref) for ref in refs)] if not self.monolingual: - h.insert(1, 'LANGUAGE_ID') - w.writerow(h) - # We order the rows by descending frequency: - for i, (k, refs) in enumerate( - sorted(conc.items(), key=lambda x: (-len(x[1]), x[0])), start=1): - c = [ - i, - k[0], - k[1], - k[2], - len(refs), - ' '.join(['{}:{}:{}'.format(*ref) for ref in refs])] - if not self.monolingual: - c.insert(1, k[3]) - w.writerow(c) + c.insert(1, k[3]) + rows.append(c) - if not filename: - print(w.read().decode('utf8')) + self._write_csv(filename, rows) - def write_concepts(self, ctype, filename=None): + def write_concepts( + self, + ctype: ConcordanceType, + filename: Optional[Union[str, pathlib.Path]] = None, + ): """ :param ctype: `lexicon` or `grammar`. 
""" def form(ref): - return self[ref].form if self.monolingual else '{}: {}'.format( - self[ref[0]].language, self[ref].form) + if self.monolingual: + return self[ref].form + return f'{self[ref.igt_index].language}: {self[ref].form}' conc = [] for c, refs in getattr(self, ctype).items(): if c: - igt = self[refs[0][0]] + igt = self[refs[0].igt_index] conc.append([ self.clean_lexical_concept(c), len(refs), @@ -677,19 +756,17 @@ def form(ref): igt.gloss_text, ]) - with UnicodeWriter(filename, delimiter='\t') as w: - w.writerow( - ['ID', 'ENGLISH', 'OCCURRENCE', 'CONCEPT_IN_SOURCE', 'FORMS', 'PHRASE', 'GLOSS']) - for i, row in enumerate(sorted(conc, key=lambda x: -x[1]), start=1): - w.writerow([i] + row) - if not filename: - print(w.read().decode('utf8')) + rows = [['ID', 'ENGLISH', 'OCCURRENCE', 'CONCEPT_IN_SOURCE', 'FORMS', 'PHRASE', 'GLOSS']] + for i, row in enumerate(sorted(conc, key=lambda x: -x[1]), start=1): + rows.append([i] + row) + self._write_csv(filename, rows) def check_glosses(self, level=2): + """Check alignment of glosses on word and morpheme level.""" count = 1 for idx, igt in self._igts.items(): if not igt.is_valid() and level >= 1: - print('[{0} : first level {1}]'.format(idx, count)) + print(f'[{idx} : first level {count}]') print(igt.phrase) print(igt.gloss) print('---') @@ -699,71 +776,53 @@ def check_glosses(self, level=2): try: GlossedWord(w, m, strict=True) except ValueError: - print('[{0}:{1} : second level {2}]'.format(idx, i, count)) + print(f'[{idx}:{i} : second level {count}]') print(w) print(m) print('---') count += 1 - def get_wordlist( + def _iter_wordlist_items( self, - doculect='base', - profile=False, - ref='crossid', - lexstat=True, - threshold=0.4): - """ - Return a classical wordlist from the data. 
- """ - if profile: - profile = segments.Tokenizer(profile) - tokenize = lambda x: profile('^' + x + '$', column='IPA').split() # noqa: E731 - else: - tokenize = with_lingpy().ipa2tokens - - D = { - 0: [ - 'doculect', - 'concept', - 'concept_in_source', - 'concept_type', - 'form', - 'tokens', - 'occurrences', - 'word_forms', - 'gloss_forms', - 'phrase_example', - 'gloss_example', - 'references', - ] - } + doculect, + tokenize: Callable[[str], list[str]], + ) -> Generator[tuple[int, list[Any]], None, None]: + yield 0, [ + 'doculect', + 'concept', + 'concept_in_source', + 'concept_type', + 'form', + 'tokens', + 'occurrences', + 'word_forms', + 'gloss_forms', + 'phrase_example', + 'gloss_example', + 'references', + ] idx = 1 # Iterate over unique (cleaned concept, form, language, gloss) tuples. i = 0 for form, refs in self.form.items(): for (lid, gloss), morphrefs in itertools.groupby( - sorted(refs, key=lambda r: (self[r[0]].language, str(self[r].gloss))), - lambda r: (self[r[0]].language, str(self[r].gloss)) + sorted(refs, key=lambda r: (self[r.igt_index].language, str(self[r].gloss))), + lambda r: (self[r.igt_index].language, str(self[r].gloss)) ): morphrefs = list(morphrefs) - gm = self[morphrefs[0]] - gw = self[morphrefs[0][:2]] - igt = self[morphrefs[0][0]] + morphctx = morphrefs[0].resolve(self) i += 1 - concepts = \ - list(itertools.zip_longest(gm.lexical_concepts, [], fillvalue='lexicon')) + \ - list(itertools.zip_longest(gm.grammatical_concepts, [], fillvalue='grammar')) - for concept, ctype in concepts: + for concept, ctype in morphctx.concepts: concept = self.clean_lexical_concept(concept) tokens = tokenize(form) # check tokens try: with_lingpy().tokens2class(tokens, 'sca') check = True - except: # noqa: E722, # pragma: no cover + except: # noqa: E722, # pragma: no cover # pylint: disable=W0702 check = False if concept.strip() and check: - D[idx] = [ + yield idx, [ doculect if self.monolingual else lid, concept, gloss, @@ -771,24 +830,36 @@ def 
get_wordlist( form, tokens, len(morphrefs), - ' '.join(m.form for m in gw), - ' '.join(m.gloss for m in gw), - igt.phrase_text, - igt.gloss_text, - ' '.join('{}:{}:{}'.format(*ref) for ref in morphrefs)] + ' '.join(m.form for m in morphctx.word), + ' '.join(m.gloss for m in morphctx.word), + morphctx.igt.phrase_text, + morphctx.igt.gloss_text, + ' '.join(str(ref) for ref in morphrefs)] idx += 1 else: - print('[!] Problem with "{0}" / [{1}] [{2}] / {3} {4} {5}'.format( - concept, form, tokens, *morphrefs[0])) - wl = with_lingpy().Wordlist(D) + print( + f'[!] Problem with "{concept}" / [{form}] [{tokens}] / {morphrefs[0]}') - if lexstat: - wl = with_lingpy().LexStat(D) - wl.cluster(method='sca', threshold=threshold, ref=ref) - else: - wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]]) - wl.renumber('cog', ref) - return wl + def get_wordlist( + self, + doculect: str = 'base', + profile: Optional[Union[str, pathlib.Path, segments.Profile]] = None, + lingpy_settings: LingPySettings = LingPySettings(), + ): + """ + Return a classical wordlist from the data. 
+ """ + if profile: + profile = segments.Tokenizer(profile) + + def tokenize(profile, x): + if profile: + return profile('^' + x + '$', column='IPA').split() # noqa: E731 + return with_lingpy().ipa2tokens(x) + + d = collections.OrderedDict( + self._iter_wordlist_items(doculect, functools.partial(tokenize, profile))) + return lingpy_settings.get_wordlist(d) def get_profile(self, clts=None, filename=None) -> segments.Profile: """ @@ -799,7 +870,7 @@ def get_profile(self, clts=None, filename=None) -> segments.Profile: """ clts = clts.bipa if clts else None - D = {0: ['doculect', 'concept', 'ipa']} + D = {0: ['doculect', 'concept', 'ipa']} # pylint: disable=C0103 for i, key in enumerate(self.form, start=1): D[i] = ['dummy', str(self[self.form[key][0]].gloss), key] wordlist = with_lingpy().basic.wordlist.Wordlist(D) @@ -821,33 +892,3 @@ def get_profile(self, clts=None, filename=None) -> segments.Profile: if not filename: p.unlink() return res - - def write_app(self, dest='app'): - # idxs must be in index 2 of wordlist, form 0, and concept 1 - # concordance 0 is phrase, 1 is gloss - - wordlist = self.get_wordlist() - WL, CN = collections.OrderedDict(), collections.OrderedDict() - for idx, form, concept, refs in wordlist.iter_rows('form', 'concept', 'references'): - WL[idx] = [ - form, - concept, - [[int(y) for y in x.split(':')] for x in refs.split()], - wordlist[idx, 'tokens'], - ] - - for line in WL[idx][2]: - igt = self[str(line[0])] - CN[line[0]] = [ - igt.phrase, - igt.gloss, - ] - # FIXME: must add additional IGT data from ExampleTable row! 
- dest = pathlib.Path(dest) - assert dest.is_dir() - with dest.joinpath('script.js').open('w', encoding='utf8') as f: - f.write('var WORDLIST = ' + json.dumps(WL, indent=2) + ';\n') - f.write('var CONC = ' + json.dumps(CN, indent=2) + ';\n') - index = dest / 'index.html' - if not index.exists(): - shutil.copy(str(pathlib.Path(__file__).parent.joinpath('index.html')), str(index)) diff --git a/src/pyigt/index.html b/src/pyigt/index.html deleted file mode 100644 index 3e2a020..0000000 --- a/src/pyigt/index.html +++ /dev/null @@ -1,102 +0,0 @@ - - - CONCORDANCE BROWSER - - - - - - - - - -

CONCORDANCE BROWSER

- -
- - -
-
- - - diff --git a/src/pyigt/lgrmorphemes.py b/src/pyigt/lgrmorphemes.py index c2544d6..da70ba2 100644 --- a/src/pyigt/lgrmorphemes.py +++ b/src/pyigt/lgrmorphemes.py @@ -13,10 +13,10 @@ """ import re import itertools -import typing +from typing import Optional +import dataclasses import unicodedata - -import attr +from collections.abc import Generator from pyigt.util import is_standard_abbr, is_generic_abbr @@ -36,11 +36,14 @@ ] -def split_morphemes(s): - return re.split('({})'.format('|'.join(re.escape(c) for c in MORPHEME_SEPARATORS)), s or '') +def split_morphemes(s: str) -> list[str]: + """Split string into morphemes.""" + pattern = f"({'|'.join(re.escape(c) for c in MORPHEME_SEPARATORS)})" + return re.split(pattern, s or '') -def remove_morpheme_separators(s): +def remove_morpheme_separators(s: str) -> str: + """Remove all characters listed as morpheme separators from string.""" return ''.join(ss for ss in split_morphemes(s) if ss not in MORPHEME_SEPARATORS) @@ -54,24 +57,24 @@ class GlossElement(str): end = None in_gloss_only = True - def __init__(self, s): + def __init__(self, _): self.prev = None self.next = None def __repr__(self): - return '<{} "{}">'.format( - self.__class__.__name__, self.encode('ascii', 'replace').decode()) + text = self.encode('ascii', 'replace').decode() + return f'<{self.__class__.__name__} "{text}">' @property - def is_agentlike_argument(self): + def is_agentlike_argument(self) -> bool: # pylint: disable=C0116 return isinstance(self.next, PatientlikeArgument) @property - def is_standard_abbreviation(self): + def is_standard_abbreviation(self) -> bool: # pylint: disable=C0116 return is_standard_abbr(self) @property - def is_category_label(self): + def is_category_label(self) -> bool: # pylint: disable=C0116 return is_generic_abbr(self) @@ -157,7 +160,7 @@ def __str__(self): return s @staticmethod - def _iter_gloss_elements(s, type_): + def _iter_gloss_elements(s, type_) -> Generator[GlossElement, None, None]: classes = 
{GlossElement.start: GlossElement} if type_ == 'gloss' else {} for cls in GlossElement.__subclasses__(): if (not cls.in_gloss_only) or type_ == 'gloss': @@ -187,7 +190,8 @@ def _iter_gloss_elements(s, type_): yield cls(e) @classmethod - def from_morpheme(cls, s, type_): + def from_morpheme(cls, s: str, type_) -> 'GlossElements': + """Instantiate gloss elements from a string.""" res, prev = [], None for ge in GlossElements._iter_gloss_elements(s, type_): if prev: @@ -204,18 +208,29 @@ class Morpheme(str): """ sep = '-' - def __init__(self, s): + def __init__(self, _): self.type = None def __repr__(self): - return '<{} "{}">'.format(self.__class__.__name__, self.encode('ascii', 'replace').decode()) + morph = self.encode('ascii', 'replace').decode() + return f'<{self.__class__.__name__} "{morph}">' @property - def elements(self): + def elements(self) -> list[GlossElement]: + """ + >>> m = Morpheme('ac') + >>> m.elements + [, , ] + """ return GlossElements.from_morpheme(str(self), self.type) @property - def form_and_infixes(self): + def form_and_infixes(self) -> tuple[str, list[str]]: + """ + >>> m = Morpheme('ac') + >>> m.form_and_infixes + ('ac', ['b']) + """ form, infixes = '', [] for ge in self.elements: if isinstance(ge, Infix): @@ -225,8 +240,8 @@ def form_and_infixes(self): return form, infixes -@attr.s(repr=False) -class GlossedMorpheme(object): +@dataclasses.dataclass +class GlossedMorpheme: """ A (morpheme, gloss) pair. @@ -236,21 +251,23 @@ class GlossedMorpheme(object): :ivar prev: Points to the previous `GlossedMorpheme` in a word, or `None`. :ivar next: Points to the next `GlossedMorpheme` in a word, or `None`. 
""" - morpheme = attr.ib() - gloss = attr.ib() - sep = attr.ib() - prev = attr.ib(default=None, eq=False) - next = attr.ib(default=None, eq=False) + morpheme: Morpheme + gloss: Morpheme + sep: str + prev: Optional['GlossedMorpheme'] = None + next: Optional['GlossedMorpheme'] = None - def __attrs_post_init__(self): + def __post_init__(self): self.morpheme = Morpheme(self.morpheme) self.morpheme.type = 'word' self.gloss = Morpheme(self.gloss) self.gloss.type = 'gloss' + def __eq__(self, other): + return self.morpheme == other.morpheme and self.gloss == other.gloss + def __repr__(self): - return '<{} morpheme={} gloss={}>'.format( - self.__class__.__name__, self.morpheme, self.gloss) + return f'<{self.__class__.__name__} morpheme={self.morpheme} gloss={self.gloss}>' @property def form(self) -> str: @@ -269,15 +286,17 @@ def form(self) -> str: unicodedata.category(c) not in {'Po', 'Pf', 'Ps', 'Pd', 'Pe', 'Pi', 'Sm'}) @property - def first(self): + def first(self) -> bool: + """Whether the morpheme is the first in the word.""" return not bool(self.prev) @property - def last(self): + def last(self) -> bool: + """Whether the morpheme is the last in the word.""" return not bool(self.next) @property - def grammatical_concepts(self) -> typing.List[str]: + def grammatical_concepts(self) -> list[str]: """ Grammatical concepts, referenced with category labels according to Rule 3, used in morpheme gloss. @@ -297,7 +316,7 @@ def grammatical_concepts(self) -> typing.List[str]: return list(self._glosses('grammatical')) @property - def lexical_concepts(self) -> typing.List[str]: + def lexical_concepts(self) -> list[str]: """ Gloss elements not recognized as category labels are interpreted as lexical concepts. @@ -331,26 +350,24 @@ def _glosses(self, type_): yield s.replace('_', ' ') -@attr.s(repr=False) -class GlossedWord(object): +@dataclasses.dataclass +class GlossedWord: """ A (word, gloss) pair, corresponding to two aligned items from IGT according to LGR. 
Provides list-like access to its :class:`GlossedMorpheme` s. """ - word = attr.ib() - gloss = attr.ib() - glossed_morphemes = attr.ib(default=attr.Factory(list), eq=False) - strict = attr.ib(default=False, eq=False) + word: str + gloss: str + glossed_morphemes: list[GlossedMorpheme] = dataclasses.field(default_factory=list) + strict: bool = False - def __attrs_post_init__(self): + def __post_init__(self): mm, gg = split_morphemes(self.word), split_morphemes(self.gloss) if len(mm) != len(gg): if self.strict: - raise ValueError( - 'Morpheme separator mismatch: {} :: {}'.format(self.word, self.gloss)) - else: - self.is_valid = False + raise ValueError(f'Morpheme separator mismatch: {self.word} :: {self.gloss}') + self.is_valid = False sep, prev = None, None for m, g in zip(mm, gg): if not m and not g: @@ -359,10 +376,9 @@ def __attrs_post_init__(self): if m != g: if self.strict: raise ValueError( - 'Morpheme separator mismatch: {} :: {}'.format(self.word, self.gloss)) - else: - self.is_valid = False - break + f'Morpheme separator mismatch: {self.word} :: {self.gloss}') + self.is_valid = False + break sep = m else: assert m and g, (mm, g) @@ -373,8 +389,11 @@ def __attrs_post_init__(self): gm.prev = prev prev = gm + def __eq__(self, other): + return self.glossed_morphemes == other.glossed_morphemes + def __repr__(self): - return '<{} word={} gloss={}>'.format(self.__class__.__name__, self.word, self.gloss) + return f'<{self.__class__.__name__} word={self.word} gloss={self.gloss}>' def __iter__(self): return iter(self.glossed_morphemes) @@ -400,11 +419,21 @@ def form(self) -> str: return ''.join(gm.form for gm in self) @property - def word_from_morphemes(self): + def word_from_morphemes(self) -> str: + """ + >>> gw = GlossedWord('a-word', 'a.DU-gloss') + >>> gw.word_from_morphemes + 'a-word' + """ return ''.join(itertools.chain( *[(gm.sep if gm.prev else '', str(gm.morpheme.elements)) for gm in self])) @property - def gloss_from_morphemes(self): + def 
gloss_from_morphemes(self) -> str: + """ + >>> gw = GlossedWord('a-word', 'a.DU-gloss') + >>> gw.gloss_from_morphemes + 'a.DU-gloss' + """ return ''.join(itertools.chain( *[(gm.sep if gm.prev else '', str(gm.gloss.elements)) for gm in self])) diff --git a/src/pyigt/util.py b/src/pyigt/util.py index b8e46d7..7208629 100644 --- a/src/pyigt/util.py +++ b/src/pyigt/util.py @@ -1,25 +1,57 @@ +""" +Utility functions. +""" import re +import itertools from clldutils.lgr import ABBRS, PERSONS, pattern -__all__ = ['is_standard_abbr', 'expand_standard_abbr', 'is_generic_abbr'] +__all__ = ['is_standard_abbr', 'expand_standard_abbr', 'is_generic_abbr', 'align'] STANDARD_ABBR_PATTERN = pattern() GENERIC_ABBR_PATTERN = re.compile('^([A-Z][A-Z0-9]*|([1-3](DL|PL|SG|DU))|[1-3]/[1-3])$') -def is_generic_abbr(label): +def align(seq1, seq2) -> str: + """Align the words in seq1 and seq2.""" + line1, line2 = [], [] + for w1, w2 in itertools.zip_longest(seq1, seq2, fillvalue=''): + w1 = w1.strip() + w2 = w2.strip() + maxlen = max((len(w1), len(w2))) + line1.append(w1.ljust(maxlen)) + line2.append(w2.ljust(maxlen)) + return '\n'.join([' '.join(line1), ' '.join(line2)]) + + +def is_generic_abbr(label: str) -> bool: + """ + >>> is_generic_abbr('ABC') + True + >>> is_generic_abbr('abc') + False + """ return bool((label in ABBRS) or GENERIC_ABBR_PATTERN.match(label)) -def is_standard_abbr(label): +def is_standard_abbr(label: str) -> bool: + """ + >>> is_standard_abbr('ABC') + False + >>> is_standard_abbr('DU') + True + """ match = STANDARD_ABBR_PATTERN.fullmatch(label) if match: return not bool(match.group('pre')) return False -def expand_standard_abbr(label): +def expand_standard_abbr(label: str) -> str: + """ + >>> expand_standard_abbr('DU') + 'dual' + """ match = STANDARD_ABBR_PATTERN.fullmatch(label) if match and not match.group('pre'): res = '' diff --git a/tests/test_igt.py b/tests/test_igt.py index bfd7b4d..8dc4715 100644 --- a/tests/test_igt.py +++ b/tests/test_igt.py @@ -168,20 
+168,15 @@ def test_get_wordlist(corpus, tmpdir, capsys, mocker): assert profile_path.exists() corpus.get_wordlist(profile=profile) - _ = corpus.get_wordlist(lexstat=False, profile=corpus.get_profile()) + _ = corpus.get_wordlist( + lingpy_settings=LingPySettings(lexstat=False), + profile=corpus.get_profile()) mocker.patch('pyigt.igt.lingpy', None) with pytest.raises(ValueError): _ = corpus.get_wordlist() -def test_write_app(corpus, tmpdir): - dest = pathlib.Path(str(tmpdir)) - corpus.write_app(dest=dest) - assert dest.joinpath('script.js').exists() - assert dest.joinpath('index.html').exists() - - def test_multilingual(multilingual_dataset, capsys): corpus = Corpus.from_cldf(multilingual_dataset) assert not corpus.monolingual @@ -194,9 +189,3 @@ def test_multilingual(multilingual_dataset, capsys): corpus.write_concepts('grammar') out, _ = capsys.readouterr() assert 'macu1259: ' in out - - -def test_pkg_data(): - import pyigt - - assert pathlib.Path(pyigt.__file__).parent.joinpath('index.html').exists() diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..de77b41 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,7 @@ +from pyigt.util import align + + +def test_align(): + assert align(['a', 'abc', '1'], ['123', 'a', 'x']) == """\ +a abc 1 +123 a x""" \ No newline at end of file From 30463566808c1301f5c489362d513d1465cb160c Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Wed, 25 Mar 2026 12:50:33 +0100 Subject: [PATCH 2/2] update CI --- .github/workflows/python-package.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b0595ee..634450f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,14 +12,14 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", 3.11, 3.12] + python-version: ["3.10", 3.11, 3.12, 3.13, 3.14] steps: - - uses: actions/checkout@v4 + 
- uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies