From 1154253be8d2b9e3cb13e4428729cc3bb7e88131 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Tue, 24 Feb 2026 22:19:00 +0100 Subject: [PATCH 1/7] linted dataset.py --- .github/workflows/python-package.yml | 6 +- setup.cfg | 4 +- src/pycldf/__init__.py | 5 + src/pycldf/__main__.py | 17 +- src/pycldf/cli_util.py | 50 ++-- src/pycldf/dataset.py | 422 +++++++++++---------------- src/pycldf/validators.py | 120 +++++++- tests/test_dataset.py | 4 +- 8 files changed, 357 insertions(+), 271 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3dd19c1..8f0f9e9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,12 +12,12 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", 3.11, 3.12] + python-version: ["3.10", 3.11, 3.12, 3.13] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/setup.cfg b/setup.cfg index cd744ef..2b3603b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,12 +20,12 @@ classifiers = Natural Language :: English Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy License :: OSI Approved :: Apache Software License @@ -117,7 +117,7 @@ show_missing = true skip_covered = true [tox:tox] -envlist = py3.8, py39, py310, py311, py312, py313 +envlist = py39, py310, py311, py312, py313, py314 isolated_build = true skip_missing_interpreter = true diff --git a/src/pycldf/__init__.py b/src/pycldf/__init__.py index 419d426..bf927e7 100644 --- a/src/pycldf/__init__.py +++ b/src/pycldf/__init__.py @@ -1,3 +1,8 @@ +""" +The `pycldf` package provides the reference implementation for the CLDF standard. + +https://cldf.cldf.org +""" from pycldf.dataset import * from pycldf.db import * from pycldf.sources import * diff --git a/src/pycldf/__main__.py b/src/pycldf/__main__.py index fe764ac..ca344cb 100644 --- a/src/pycldf/__main__.py +++ b/src/pycldf/__main__.py @@ -1,5 +1,10 @@ +""" +CLI for the `pycldf` pockage. +""" import csv import sys +from typing import Optional, Sequence +import logging import contextlib from clldutils.clilib import ( @@ -10,7 +15,15 @@ import pycldf.commands -def main(args=None, catch_all=False, parsed_args=None, log=None): +def main( + args: Sequence[str] = None, + catch_all: bool = False, + parsed_args: list = None, + log: Optional[logging.Logger] = None, +) -> Optional[int]: + """ + Implements the main command, dispatches to subcommands. + """ parser, subparsers = get_parser_and_subparsers('cldf') add_csv_field_size_limit(parser, default=csv.field_size_limit()) register_subcommands(subparsers, pycldf.commands) @@ -32,7 +45,7 @@ def main(args=None, catch_all=False, parsed_args=None, log=None): return 0 except ParserError as e: print(colored(str(e), 'red')) - return main([args._command, '-h']) + return main([args._command, '-h']) # pylint: disable=protected-access except Exception as e: # pragma: no cover if catch_all: print(e) diff --git a/src/pycldf/cli_util.py b/src/pycldf/cli_util.py index 8e2a488..ddf179e 100644 --- a/src/pycldf/cli_util.py +++ b/src/pycldf/cli_util.py @@ -31,13 +31,17 @@ def strtobool(val: str) -> int: # pragma: no cover val = val.lower() if val in ('y', 'yes', 't', 'true', 'on', '1'): return 1 - elif val in ('n', 'no', 'f', 'false', 'off', '0'): + if val in ('n', 'no', 'f', 'false', 'off', '0'): return 0 - else: - raise ValueError("invalid truth value %r" % (val,)) + raise ValueError(f"invalid truth value {val}") -class FlagOrPathType(PathType): +class FlagOrPathType(PathType): # pylint: disable=too-few-public-methods + """ + Argument type allowing input of a path or a boolean. + + The boolean can be used to determine whether to download a file from a known location. + """ def __call__(self, string): try: return bool(strtobool(string)) @@ -45,8 +49,10 @@ def __call__(self, string): return super().__call__(string) -def http_head_status(url): # pragma: no cover +def http_head_status(url: str) -> int: # pragma: no cover + """Do a HEAD request for `url` to determine its status.""" class NoRedirection(urllib.request.HTTPErrorProcessor): + """Don't follow redirects.""" def http_response(self, request, response): return response @@ -56,22 +62,22 @@ def http_response(self, request, response): return opener.open(urllib.request.Request(url, method="HEAD")).status -class UrlOrPathType(PathType): - def __call__(self, string): +class UrlOrPathType(PathType): # pylint: disable=too-few-public-methods + """Type suitable for argparse arguments, allowing input of URL or local file path.""" + def __call__(self, string: str) -> str: if is_url(string): if self._must_exist: sc = http_head_status(string) # We accept not only HTTP 200 as valid but also common redirection codes because # these are used e.g. for DOIs. if sc not in {200, 301, 302}: - raise argparse.ArgumentTypeError( - 'URL {} does not exist [HTTP {}]!'.format(string, sc)) + raise argparse.ArgumentTypeError(f'URL {string} does not exist [HTTP {sc}]!') return string super().__call__(string.partition('#')[0]) return string -def add_dataset(parser: argparse.ArgumentParser): +def add_dataset(parser: argparse.ArgumentParser) -> None: """ Adds a positional argument named `dataset` to the parser to specify a CLDF dataset. """ @@ -101,11 +107,17 @@ def get_dataset(args: argparse.Namespace) -> Dataset: except TypeError as e: # pragma: no cover if 'PathLike' in str(e): raise ParserError( - 'The dataset locator may require downloading, so you should specify --download-dir') + 'The dataset locator may require downloading, so you should specify --download-dir' + ) from e raise -def add_database(parser, must_exist=True): +def add_database(parser: argparse.ArgumentParser, must_exist: bool = True) -> None: + """ + Add CLI arguments to specify a CLDF SQLite database. + + Retrieve in the `run` function of a command using `get_database` (see below). + """ add_dataset(parser) parser.add_argument( 'db', @@ -116,17 +128,21 @@ def add_database(parser, must_exist=True): parser.add_argument('--infer-primary-keys', action='store_true', default=False) -def get_database(args): +def get_database(args: argparse.Namespace) -> Database: + """ + Retrieve a `Database` instance based on CLI input in `args` (see `add_database`). + """ return Database(get_dataset(args), fname=args.db, infer_primary_keys=args.infer_primary_keys) -def add_catalog_spec(parser, name): +def add_catalog_spec(parser: argparse.ArgumentParser, name: str) -> None: + """Add CLI arguments suitable to specify a catalog.""" parser.add_argument( '--' + name, metavar=name.upper(), type=PathType(type='dir'), - help='Path to repository clone of {0} data'.format(name.capitalize())) + help=f'Path to repository clone of {name.capitalize()} data') parser.add_argument( - '--{0}-version'.format(name), - help='Version of {0} data to checkout'.format(name.capitalize()), + f'--{name}-version', + help=f'Version of {name.capitalize()} data to checkout', default=None) diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py index 7891f1f..6d4f22d 100644 --- a/src/pycldf/dataset.py +++ b/src/pycldf/dataset.py @@ -1,15 +1,19 @@ +# pylint: disable=C0302 +""" +An implementation of a CLDF dataset object. +""" import re import sys import json import types import shutil -import typing +from typing import Union, Optional, Type, Any import logging import pathlib import functools import itertools import collections -import collections.abc +from collections.abc import Generator, Iterable import urllib.parse import urllib.request @@ -19,13 +23,12 @@ from csvw import datatypes from csvw.dsv import iterrows from clldutils.path import git_describe, walk -from clldutils.misc import log_or_raise from clldutils import jsonlib -from pycldf.sources import Sources +from pycldf.sources import Sources, Source from pycldf.util import pkg_path, resolve_slices, DictTuple, sanitize_url, iter_uritemplates from pycldf.terms import term_uri, Terms, TERMS, get_column_names, URL as TERMS_URL -from pycldf.validators import VALIDATORS +from pycldf.validators import DatasetValidator, RowValidatorType from pycldf import orm __all__ = [ @@ -34,17 +37,18 @@ MD_SUFFIX = '-metadata.json' ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()} -TableType = typing.Union[str, Table] -ColType = typing.Union[str, Column] -ColSpecType = typing.Union[str, dict, Column] -PathType = typing.Union[str, pathlib.Path] -TableSpecType = typing.Union[str, Link, Table] -ColSPecType = typing.Union[str, Column] -SchemaObjectType = typing.Union[TableSpecType, typing.Tuple[TableSpecType, ColSPecType]] +TableType = Union[str, Table] +ColType = Union[str, Column] +ColSpecType = Union[str, dict, Column] +PathType = Union[str, pathlib.Path] +TableSpecType = Union[str, Link, Table] +SchemaObjectType = Union[TableSpecType, tuple[TableSpecType, ColSpecType]] +ODict = collections.OrderedDict +RowType = ODict[str, Any] class SchemaError(KeyError): - pass + """Schema objects can be accessed using `Dataset.__getitem__`.""" @attr.s @@ -66,6 +70,7 @@ def id(self) -> str: return self.uri.split('#')[1] def match(self, thing) -> bool: + """Check if the module described here matches thing.""" if isinstance(thing, TableGroup): return thing.common_props.get('dc:conformsTo') == term_uri(self.id) if hasattr(thing, 'name'): @@ -76,14 +81,14 @@ def match(self, thing) -> bool: _modules = [] -def get_modules() -> typing.List[Module]: +def get_modules() -> list[Module]: """ We read supported CLDF modules from the default metadata files distributed with `pycldf`. """ - global _modules + global _modules # pylint: disable=global-statement if not _modules: ds = sys.modules[__name__] - for p in pkg_path('modules').glob('*{0}'.format(MD_SUFFIX)): + for p in pkg_path('modules').glob(f'*{MD_SUFFIX}'): tg = TableGroup.from_file(p) mod = Module( tg.common_props['dc:conformsTo'], @@ -123,16 +128,13 @@ def make_column(spec: ColSpecType) -> Column: raise TypeError(spec) -class GitRepository: +class GitRepository: # pylint: disable=too-few-public-methods """ CLDF datasets are often created from data curated in git repositories. If this is the case, we exploit this to provide better provenance information in the dataset's metadata. """ - def __init__(self, - url: str, - clone: typing.Optional[typing.Union[str, pathlib.Path]] = None, - version: typing.Optional[str] = None, - **dc): + def __init__( + self, url: str, clone: Optional[PathType] = None, version: Optional[str] = None, **dc): # We remove credentials from the URL immediately to make sure this isn't leaked into # CLDF metadata. Such credentials might be present in URLs read via gitpython from # remotes. @@ -141,7 +143,8 @@ def __init__(self, self.version = version self.dc = dc - def json_ld(self) -> typing.Dict[str, str]: + def json_ld(self) -> collections.OrderedDict[str, Any]: + """The repository described in JSON-LD.""" res = collections.OrderedDict([ ('rdf:about', self.url), ('rdf:type', 'prov:Entity'), @@ -150,11 +153,11 @@ def json_ld(self) -> typing.Dict[str, str]: res['dc:created'] = self.version elif self.clone: res['dc:created'] = git_describe(self.clone) - res.update({'dc:{0}'.format(k): self.dc[k] for k in sorted(self.dc)}) + res.update({f'dc:{k}': self.dc[k] for k in sorted(self.dc)}) return res -class Dataset: +class Dataset: # pylint: disable=too-many-public-methods """ API to access a CLDF dataset. """ @@ -168,7 +171,7 @@ def __init__(self, tablegroup: csvw.TableGroup): - :meth:`~pycldf.dataset.Dataset.from_metadata` - :meth:`~pycldf.dataset.Dataset.from_data` """ - self.tablegroup = tablegroup + self.tablegroup: csvw.TableGroup = tablegroup self.auto_constraints() self._sources = None self._objects = collections.defaultdict(collections.OrderedDict) @@ -177,6 +180,7 @@ def __init__(self, tablegroup: csvw.TableGroup): @property def sources(self) -> Sources: + """The sources.""" # We load sources only the first time they are accessed, because for datasets like # Glottolog - with 40MB zipped BibTeX - this may take ~90secs. if self._sources is None: @@ -226,11 +230,11 @@ def from_metadata(cls, fname: PathType) -> 'Dataset': else: fname = pathlib.Path(fname) if fname.is_dir(): - name = '{0}{1}'.format(cls.__name__, MD_SUFFIX) + name = f'{cls.__name__}{MD_SUFFIX}' tablegroup = TableGroup.from_file(pkg_path('modules', name)) # adapt the path of the metadata file such that paths to tables are resolved # correctly: - tablegroup._fname = fname.joinpath(name) + tablegroup._fname = fname.joinpath(name) # pylint: disable=W0212 else: tablegroup = TableGroup.from_file(fname) @@ -243,7 +247,7 @@ def from_metadata(cls, fname: PathType) -> 'Dataset': except ValueError: pass if comps and comps.most_common(1)[0][1] > 1: - raise ValueError('{0}: duplicate components!'.format(fname)) + raise ValueError(f'{fname}: duplicate components!') for mod in get_modules(): if mod.match(tablegroup): @@ -265,9 +269,10 @@ def from_data(cls, fname: PathType) -> 'Dataset': raise ValueError('empty data file!') if cls is Dataset: try: - cls = next(mod.cls for mod in get_modules() if mod.match(fname)) - except StopIteration: - raise ValueError('{0} does not match a CLDF module spec'.format(fname)) + cls = next( # pylint: disable=W0642 + mod.cls for mod in get_modules() if mod.match(fname)) + except StopIteration as exc: + raise ValueError(f'{fname} does not match a CLDF module spec') from exc assert issubclass(cls, Dataset) and cls is not Dataset res = cls.from_metadata(fname.parent) @@ -275,27 +280,30 @@ def from_data(cls, fname: PathType) -> 'Dataset': c.name for c in res[res.primary_table].tableSchema.columns if c.required} if not required_cols.issubset(colnames): - raise ValueError('missing columns: %r' % sorted(required_cols.difference(colnames))) + raise ValueError(f'missing columns: {sorted(required_cols.difference(colnames))}') return res # # Accessing dataset metadata # @property - def directory(self) -> typing.Union[str, pathlib.Path]: + def directory(self) -> PathType: """ :return: The location of the metadata file. Either a local directory as `pathlib.Path` or \ a URL as `str`. """ - return self.tablegroup._fname.parent if self.tablegroup._fname else self.tablegroup.base + if self.tablegroup._fname: # pylint: disable=W0212 + return self.tablegroup._fname.parent # pylint: disable=W0212 + return self.tablegroup.base @property def filename(self) -> str: """ :return: The name of the metadata file. """ - return self.tablegroup._fname.name if self.tablegroup._fname else \ - pathlib.Path(urllib.parse.urlparse(self.tablegroup.base).path).name + if self.tablegroup._fname: # pylint: disable=W0212 + return self.tablegroup._fname.name # pylint: disable=W0212 + return pathlib.Path(urllib.parse.urlparse(self.tablegroup.base).path).name @property def module(self) -> str: @@ -306,13 +314,15 @@ def module(self) -> str: @property def version(self) -> str: + """The CLDF version.""" return self.properties['dc:conformsTo'].split('/')[3] def __repr__(self) -> str: - return '' % (self.version, self.module, self.directory) + return f'' @property def metadata_dict(self) -> dict: + """The TableGroup instance as dict.""" return self.tablegroup.asdict(omit_defaults=False) @property @@ -323,7 +333,7 @@ def properties(self) -> dict: return self.tablegroup.common_props @property - def bibpath(self) -> typing.Union[str, pathlib.Path]: + def bibpath(self) -> PathType: """ :return: Location of the sources BibTeX file. Either a URL (`str`) or a local path \ (`pathlib.Path`). @@ -347,14 +357,14 @@ def bibname(self) -> str: # Accessing schema objects (components, tables, columns, foreign keys) # @property - def tables(self) -> typing.List[Table]: + def tables(self) -> list[Table]: """ :return: All tables defined in the dataset. """ return self.tablegroup.tables @property - def components(self) -> typing.Dict[str, csvw.Table]: + def components(self) -> collections.OrderedDict[str, csvw.Table]: """ :return: Mapping of component name to table objects as defined in the dataset. """ @@ -370,26 +380,28 @@ def components(self) -> typing.Dict[str, csvw.Table]: return res @staticmethod - def get_tabletype(table) -> typing.Union[str, None]: + def get_tabletype(table) -> Optional[str]: + """Return the table type, aka component name, of the table.""" if table.common_props.get('dc:conformsTo', '') is None: return None if '#' in table.common_props.get('dc:conformsTo', ''): res = table.common_props['dc:conformsTo'].split('#')[1] if res in TERMS: return res - raise ValueError("Type {:} of table {:} is not a valid term.".format( - table.common_props.get('dc:conformsTo'), - table.url)) + raise ValueError( + f"Type {table.common_props.get('dc:conformsTo')} of table {table.url} is invalid.") @property - def primary_table(self) -> typing.Union[str, None]: + def primary_table(self) -> Optional[str]: + """Returns the primary table for the dataset.""" if self.tables: try: return self.get_tabletype(self.tables[0]) except ValueError: - return None + pass + return None - def __getitem__(self, item: SchemaObjectType) -> typing.Union[csvw.Table, csvw.Column]: + def __getitem__(self, item: SchemaObjectType) -> Union[csvw.Table, csvw.Column]: """ Access to tables and columns. @@ -422,37 +434,32 @@ def __getitem__(self, item: SchemaObjectType) -> typing.Union[csvw.Table, csvw.C if isinstance(table, Link): table = table.string - if not isinstance(table, Table): - uri = term_uri(table, terms=TERMS.by_uri) - for t in self.tables: - if (uri and t.common_props.get('dc:conformsTo') == uri) \ - or t.url.string == table: - break - else: - raise SchemaError('Dataset has no table "{}"'.format(table)) - else: - if any(table is tt for tt in self.tables): - t = table - else: - raise SchemaError('Dataset has no table "{}"'.format(table)) - + t = self._get_table(table) if not column: return t if isinstance(column, Column): if any(column is c for c in t.tableSchema.columns): return column - else: - raise SchemaError('Dataset has no column "{}" in table "{}"'.format( - column.name, t.url)) + raise SchemaError(f'Dataset has no column "{column.name}" in table "{t.url}"') uri = term_uri(column, terms=TERMS.by_uri) for c in t.tableSchema.columns: - if ((c.propertyUrl and (c.propertyUrl.uri == uri or c.propertyUrl.uri == column)) - or c.header == column): # noqa: W503 + if ((c.propertyUrl and (c.propertyUrl.uri in (uri, column))) or c.header == column): return c - raise SchemaError('Dataset has no column "{}" in table "{}"'.format(column, t.url)) + raise SchemaError(f'Dataset has no column "{column}" in table "{t.url}"') + + def _get_table(self, table: Union[str, Table]) -> Table: + if not isinstance(table, Table): + uri = term_uri(table, terms=TERMS.by_uri) + for t in self.tables: + if (uri and t.common_props.get('dc:conformsTo') == uri) or t.url.string == table: + return t + raise SchemaError(f'Dataset has no table "{table}"') + if any(table is tt for tt in self.tables): + return table + raise SchemaError(f'Dataset has no table "{table}"') def __delitem__(self, item: SchemaObjectType): """ @@ -474,9 +481,7 @@ def __contains__(self, item: SchemaObjectType) -> bool: """ return bool(self.get(item)) - def get(self, - item: SchemaObjectType, - default=None) -> typing.Union[csvw.Table, csvw.Column, None]: + def get(self, item: SchemaObjectType, default=None) -> Union[csvw.Table, csvw.Column, None]: """ Acts like `dict.get`. @@ -487,8 +492,11 @@ def get(self, except SchemaError: return default - def get_foreign_key_reference(self, table: TableType, column: ColType) \ - -> typing.Union[typing.Tuple[csvw.Table, csvw.Column], None]: + def get_foreign_key_reference( + self, + table: TableType, + column: ColType, + ) -> Optional[tuple[csvw.Table, csvw.Column]]: """ Retrieve the reference of a foreign key constraint for the specified column. @@ -503,6 +511,7 @@ def get_foreign_key_reference(self, table: TableType, column: ColType) \ if len(fk.columnReference) == 1 and fk.columnReference[0] == column.name: return self[fk.reference.resource], \ self[fk.reference.resource, fk.reference.columnReference[0]] + return None @property def column_names(self) -> types.SimpleNamespace: @@ -531,7 +540,7 @@ def readonly_column_names(self) -> types.SimpleNamespace: # # Editing dataset metadata or schema # - def add_provenance(self, **kw): + def add_provenance(self, **kw: Any) -> None: """ Add metadata about the dataset's provenance. @@ -545,7 +554,7 @@ def to_json(obj): for k, v in kw.items(): if not k.startswith('prov:'): - k = 'prov:{0}'.format(k) + k = f'prov:{k}' if isinstance(v, (tuple, list)): v = [to_json(vv) for vv in v] else: @@ -560,7 +569,7 @@ def to_json(obj): v = old self.tablegroup.common_props[k] = v - def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table: + def add_table(self, url: str, *cols: ColSpecType, **kw: Any) -> csvw.Table: """ Add a table description to the Dataset. @@ -580,7 +589,7 @@ def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table: t.common_props.update(kw) return t - def remove_table(self, table: TableType): + def remove_table(self, table: TableType) -> None: """ Removes the table specified by `table` from the dataset. """ @@ -594,10 +603,7 @@ def remove_table(self, table: TableType): # Now remove the table: self.tablegroup.tables = [t for t in self.tablegroup.tables if t.url != table.url] - def add_component(self, - component: typing.Union[str, dict], - *cols: ColSpecType, - **kw) -> csvw.Table: + def add_component(self, component: Union[str, dict], *cols: ColSpecType, **kw) -> csvw.Table: """ Add a CLDF component to a dataset. @@ -610,7 +616,7 @@ def add_component(self, - `description`: a description of the table. """ if isinstance(component, str): - component = jsonlib.load(pkg_path('components', '{0}{1}'.format(component, MD_SUFFIX))) + component = jsonlib.load(pkg_path('components', f'{component}{MD_SUFFIX}')) if isinstance(component, dict): component = Table.fromvalue(component) assert isinstance(component, Table) @@ -639,7 +645,7 @@ def add_component(self, self.tables.append(component) self.add_columns(component, *cols) - component._parent = self.tablegroup + component._parent = self.tablegroup # pylint: disable=W0212 self.auto_constraints(component) return component @@ -654,13 +660,13 @@ def add_columns(self, table: TableType, *cols: ColSpecType) -> None: c.propertyUrl.uri for c in table.tableSchema.columns if c.propertyUrl]) col = make_column(col) if col.name in existing: - raise ValueError('Duplicate column name: {0}'.format(col.name)) + raise ValueError(f'Duplicate column name: {col.name}') if col.propertyUrl and col.propertyUrl.uri in existing: - raise ValueError('Duplicate column property: {0}'.format(col.propertyUrl.uri)) + raise ValueError(f'Duplicate column property: {col.propertyUrl.uri}') table.tableSchema.columns.append(col) self.auto_constraints() - def remove_columns(self, table: TableType, *cols: str): + def remove_columns(self, table: TableType, *cols: ColType) -> None: """ Remove `cols` from `table`'s schema. @@ -683,7 +689,7 @@ def remove_columns(self, table: TableType, *cols: str): table.tableSchema.columns = [c for c in table.tableSchema.columns if str(c) not in cols] - def rename_column(self, table: TableType, col: ColType, name: str): + def rename_column(self, table: TableType, col: ColType, name: str) -> None: """ Assign a new `name` to an existing column, cascading this change to foreign keys. @@ -724,7 +730,8 @@ def add_foreign_key( foreign_t: TableType, foreign_c: ColType, primary_t: TableType, - primary_c: typing.Optional[ColType] = None): + primary_c: Optional[ColType] = None, + ) -> None: """ Add a foreign key constraint. @@ -747,9 +754,9 @@ def add_foreign_key( primary_c = self[primary_t, primary_c].name foreign_t.add_foreign_key(self[foreign_t, foreign_c].name, primary_t.url.string, primary_c) - def auto_constraints(self, component=None): + def auto_constraints(self, component: Optional[TableType] = None): """ - Use CLDF reference properties to implicitely create foreign key constraints. + Use CLDF reference properties to implicitly create foreign key constraints. :param component: A Table object or `None`. """ @@ -806,7 +813,7 @@ def _auto_foreign_keys(self, table, component=None, table_type=None): # # Add data # - def add_sources(self, *sources, **kw): + def add_sources(self, *sources: Union[str, Source], **kw) -> None: """ Add sources to the dataset. @@ -817,7 +824,7 @@ def add_sources(self, *sources, **kw): # # Methods to read data # - def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None, None]: + def iter_rows(self, table: TableType, *cols: str) -> Generator[RowType, None, None]: """ Iterate rows in a table, resolving CLDF property names to local column names. @@ -833,13 +840,14 @@ def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None item[v] = item[k] yield item - def cached_rows(self, table: TableType) -> list: + def cached_rows(self, table: TableType) -> list[RowType]: + """Return the rows of a table from a cache.""" key = table.local_name if isinstance(table, Table) else table if key not in self._cached_rows: self._cached_rows[key] = list(self.iter_rows(table)) return self._cached_rows[key] - def get_row(self, table: TableType, id_) -> dict: + def get_row(self, table: TableType, id_) -> RowType: """ Retrieve a row specified by table and CLDF id. @@ -851,7 +859,7 @@ def get_row(self, table: TableType, id_) -> dict: return row raise ValueError(id_) # pragma: no cover - def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: + def get_row_url(self, table: TableType, row: Union[RowType, str]) -> Optional[str]: """ Get a URL associated with a row. Tables can specify associated row URLs by @@ -865,7 +873,7 @@ def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: :param row: A row specified by ID or as `dict` as returned when iterating over a table. :return: a `str` representing a URL or `None`. """ - row = row if isinstance(row, dict) else self.get_row(table, row) + row = self.get_row(table, row) if isinstance(row, str) else row id_col = None for col in self[table].tableSchema.columns: if col.datatype and col.datatype.base == datatypes.anyURI.__name__: @@ -875,11 +883,12 @@ def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: if str(col.propertyUrl) == 'http://cldf.clld.org/v1.0/terms.rdf#id': # Otherwise we fall back to looking up the `valueUrl` property on the ID column. id_col = col - assert id_col, 'no ID column found in table {}'.format(table) + assert id_col, f'no ID column found in table {table}' if id_col.valueUrl: return id_col.valueUrl.expand(**row) + return None - def objects(self, table: str, cls: typing.Optional[typing.Type] = None) -> DictTuple: + def objects(self, table: str, cls: Optional[Type] = None) -> DictTuple: """ Read data of a CLDF component as :class:`pycldf.orm.Object` instances. @@ -899,7 +908,7 @@ def objects(self, table: str, cls: typing.Optional[typing.Type] = None) -> DictT return DictTuple(self._objects[table].values()) - def get_object(self, table, id_, cls=None, pk=False) -> orm.Object: + def get_object(self, table: str, id_: str, cls=None, pk=False) -> orm.Object: """ Get a row of a component as :class:`pycldf.orm.Object` instance. """ @@ -910,17 +919,16 @@ def get_object(self, table, id_, cls=None, pk=False) -> orm.Object: # # Methods for writing (meta)data to files: # - def write_metadata( - self, fname: typing.Optional[typing.Union[str, pathlib.Path]] = None) -> pathlib.Path: + def write_metadata(self, fname: Optional[PathType] = None) -> pathlib.Path: """ Write the CLDF metadata to a JSON file. :fname: Path of a file to write to, or `None` to use the default name and write to \ :meth:`~pycldf.dataset.Dataset.directory`. """ - return self.tablegroup.to_file(fname or self.tablegroup._fname) + return self.tablegroup.to_file(fname or self.tablegroup._fname) # pylint: disable=W0212 - def write_sources(self, zipped: bool = False) -> typing.Union[None, pathlib.Path]: + def write_sources(self, zipped: bool = False) -> Optional[pathlib.Path]: """ Write the sources BibTeX file to :meth:`~pycldf.dataset.Dataset.bibpath` @@ -930,10 +938,12 @@ def write_sources(self, zipped: bool = False) -> typing.Union[None, pathlib.Path """ return self.sources.write(self.bibpath, zipped=zipped) - def write(self, - fname: typing.Optional[pathlib.Path] = None, - zipped: typing.Optional[typing.Iterable] = None, - **table_items: typing.List[dict]) -> pathlib.Path: + def write( + self, + fname: Optional[pathlib.Path] = None, + zipped: Optional[Iterable] = None, + **table_items: list[RowType] + ) -> pathlib.Path: """ Write metadata, sources and data. Metadata will be written to `fname` (as interpreted in :meth:`pycldf.dataset.Dataset.write_metadata`); data files will be written to the file @@ -955,7 +965,7 @@ def write(self, table.common_props['dc:extent'] = table.write(items, _zipped=table_type in zipped) return self.write_metadata(fname) - def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pathlib.Path: + def copy(self, dest: PathType, mdname: str = None) -> pathlib.Path: """ Copy metadata, data and sources to files in `dest`. @@ -973,20 +983,21 @@ def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pat ... if 'with_examples' in ds.directory.name: ... ds.copy('some_directory', mdname='md.json') """ - from pycldf.media import MediaTable + from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel dest = pathlib.Path(dest) if not dest.exists(): dest.mkdir(parents=True) from_url = is_url(self.tablegroup.base) - ds = Dataset.from_metadata(self.tablegroup.base if from_url else self.tablegroup._fname) + ds = Dataset.from_metadata( + self.tablegroup.base if from_url else self.tablegroup._fname) # pylint: disable=W0212 _getter = urllib.request.urlretrieve if from_url else shutil.copy try: _getter(self.bibpath, dest / self.bibname) ds.properties['dc:source'] = self.bibname - except: # pragma: no cover # noqa + except: # pragma: no cover # noqa pylint: disable=W0702 # Sources are optional pass @@ -1000,7 +1011,8 @@ def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pat fk.reference.resource = Link(pathlib.Path(fk.reference.resource.string).name) mdpath = dest.joinpath( mdname or # noqa: W504 - (self.tablegroup.base.split('/')[-1] if from_url else self.tablegroup._fname.name)) + (self.tablegroup.base.split('/')[-1] if from_url + else self.tablegroup._fname.name)) # pylint: disable=W0212 if 'MediaTable' in self: for f in MediaTable(self): if f.scheme == 'file': @@ -1019,8 +1031,9 @@ def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pat def validate( self, log: logging.Logger = None, - validators: typing.List[typing.Tuple[str, str, callable]] = None, - ontology_path=None) -> bool: + validators: list[tuple[Optional[str], str, RowValidatorType]] = None, + ontology_path: Optional[PathType] = None, + ) -> bool: """ Validate schema and data of a `Dataset`: @@ -1036,140 +1049,51 @@ def validate( """ # We must import components with custom validation to make sure they can be detected as # subclasses of ComponentWithValidation. - from pycldf.media import MediaTable - from pycldf.trees import TreeTable + from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel + from pycldf.trees import TreeTable # pylint: disable=import-outside-toplevel assert MediaTable and TreeTable - terms = Terms(ontology_path) or TERMS - validators = validators or [] - validators.extend(VALIDATORS) - success = True - default_tg = TableGroup.from_file( - pkg_path('modules', '{0}{1}'.format(self.module, MD_SUFFIX))) - # + validator = DatasetValidator( + dataset=self, + success=True, + terms=Terms(ontology_path) or TERMS, + log=log, + row_validators=validators or [], + ) + + default_tg = TableGroup.from_file(pkg_path('modules', f'{self.module}{MD_SUFFIX}')) # Make sure, all required tables and columns are present and consistent. - # for default_table in default_tg.tables: - dtable_uri = default_table.common_props['dc:conformsTo'] - try: - table = self[dtable_uri] - except KeyError: - success = False - log_or_raise('{0} requires {1}'.format(self.module, dtable_uri), log=log) - table = None - - if table: - default_cols = {c.propertyUrl.uri: c for c in default_table.tableSchema.columns} - required_default_cols = { - c.propertyUrl.uri for c in default_table.tableSchema.columns - if c.required or c.common_props.get('dc:isRequiredBy')} - cols = { - c.propertyUrl.uri: c for c in table.tableSchema.columns - if c.propertyUrl} - table_uri = table.common_props['dc:conformsTo'] - for col in required_default_cols - set(cols.keys()): - success = False - log_or_raise('{0} requires column {1}'.format(table_uri, col), log=log) - for uri, col in cols.items(): - default = default_cols.get(uri) - if default: - cardinality = default.common_props.get('dc:extent') - if not cardinality: - cardinality = terms.by_uri[uri].cardinality - if (cardinality == 'multivalued' and not col.separator) or \ - (cardinality == 'singlevalued' and col.separator): - success = False - log_or_raise('{} {} must be {}'.format( - table_uri, uri, cardinality), log=log) + validator.validate_default_objects(default_table) for table in self.tables: - vars = set(col.name for col in table.tableSchema.columns) - for obj, prop, tmpl in iter_uritemplates(table): - if not {n for n in tmpl.variable_names if not n.startswith('_')}.issubset(vars): - if log: - log.warning('Unknown variables in URI template: {}:{}:{}'.format( - obj, prop, tmpl)) - - type_uri = table.common_props.get('dc:conformsTo') - if type_uri: - try: - terms.is_cldf_uri(type_uri) - except ValueError: - success = False - log_or_raise('invalid CLDF URI: {0}'.format(type_uri), log=log) + validator.validate_table_schema(table) + validator.validate_columns(table) - if not table.tableSchema.primaryKey: - if log: - log.warning('Table without primary key: {0} - {1}'.format( - table.url, - 'This may cause problems with "cldf createdb"')) - elif len(table.tableSchema.primaryKey) > 1: - if log: - log.warning('Table with composite primary key: {0} - {1}'.format( - table.url, - 'This may cause problems with "cldf createdb"')) - - # FIXME: check whether table.common_props['dc:conformsTo'] is in validators! - validators_, propertyUrls, colnames = [], set(), set() - for col in table.tableSchema.columns: - if col.header in colnames: # pragma: no cover - success = False - log_or_raise( - 'Duplicate column name in table schema: {} {}'.format( - table.url, col.header), - log=log) - colnames.add(col.header) - if col.propertyUrl: - col_uri = col.propertyUrl.uri - try: - terms.is_cldf_uri(col_uri) - if col_uri in propertyUrls: # pragma: no cover - success = False - log_or_raise( - 'Duplicate CLDF property in table schema: {} {}'.format( - table.url, col_uri), - log=log) - propertyUrls.add(col_uri) - except ValueError: - success = False - log_or_raise('invalid CLDF URI: {0}'.format(col_uri), log=log) - for table_, col_, v_ in validators: - if (not table_ or table is self.get(table_)) and col is self.get((table, col_)): - validators_.append((col, v_)) - - fname = pathlib.Path(table.url.resolve(table._parent.base)) + fname = pathlib.Path(table.url.resolve(table._parent.base)) # pylint: disable=W0212 fexists = fname.exists() - if (not fexists) and fname.parent.joinpath('{}.zip'.format(fname.name)).exists(): + if (not fexists) and fname.parent.joinpath(f'{fname.name}.zip').exists(): if log: - log.info('Reading data from zipped table: {}.zip'.format(fname)) + log.info(f'Reading data from zipped table: {fname}.zip') fexists = True # csvw already handles this case, no need to adapt paths. - if is_url(table.url.resolve(table._parent.base)) or fexists: - for fname, lineno, row in table.iterdicts(log=log, with_metadata=True): - for col, validate in validators_: - try: - validate(self, table, col, row) - except ValueError as e: - success = False - log_or_raise( - '{0}:{1}:{2} {3}'.format(fname.name, lineno, col.name, e), - log=log) + if is_url(table.url.resolve(table._parent.base)) or fexists: # pylint: disable=W0212 + validator.validate_rows(table) if not table.check_primary_key(log=log): - success = False + validator.fail('Primary key check failed.') else: - success = False - log_or_raise('{0} does not exist'.format(fname), log=log) + validator.fail(f'{fname} does not exist') if not self.tablegroup.check_referential_integrity(log=log): - success = False + validator.fail('Referential integrity check failed') for cls in ComponentWithValidation.__subclasses__(): if cls.__name__ in self: - success = cls(self).validate(success, log=log) + validator.success = cls(self).validate(validator.success, log=validator.log) - return success + return validator.success - def stats(self, exact: bool = False) -> typing.List[typing.Tuple[str, str, int]]: + def stats(self, exact: bool = False) -> list[tuple[str, str, int]]: """ Compute summary statistics for the dataset. @@ -1197,7 +1121,7 @@ class Generic(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> None: # pylint: disable=missing-function-docstring return None @@ -1208,10 +1132,11 @@ class Wordlist(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'FormTable' - def get_segments(self, row, table='FormTable') -> typing.List[str]: + def get_segments(self, row: RowType, table='FormTable') -> list[str]: + """Retrieve the list of segments of a form.""" col = self[table].get_column("http://cldf.clld.org/v1.0/terms.rdf#segments") sounds = row[col.name] if isinstance(sounds, str): @@ -1219,7 +1144,7 @@ def get_segments(self, row, table='FormTable') -> typing.List[str]: sounds = [sounds] return list(itertools.chain(*[s.split() for s in sounds])) - def get_subsequence(self, cognate: dict, form=None) -> typing.List[str]: + def get_subsequence(self, cognate: RowType, form: Optional[str] = None) -> list[str]: """ Compute the subsequence of the morphemes of a form which is specified in a partial cognate assignment. @@ -1236,11 +1161,13 @@ def get_subsequence(self, cognate: dict, form=None) -> typing.List[str]: class ParallelText(Dataset): + """Implements the CLDF ParallelText module.""" @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'FormTable' def get_equivalent(self, functional_equivalent, form=None): + """Get the forms fulfilling an equivalent function in the texts.""" return resolve_slices( functional_equivalent, self, @@ -1252,8 +1179,9 @@ def get_equivalent(self, functional_equivalent, form=None): class Dictionary(Dataset): + """Implements the CLDF Dictionary module.""" @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'EntryTable' @@ -1264,7 +1192,7 @@ class StructureDataset(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'ValueTable' @functools.cached_property @@ -1296,21 +1224,26 @@ class TextCorpus(Dataset): [] """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'ExampleTable' @functools.cached_property - def texts(self) -> typing.Union[None, DictTuple]: + def texts(self) -> Optional[DictTuple]: + """Retrieve texts.""" # Some syntactic sugar to access the ORM data in a concise and meaningful way. if 'ContributionTable' in self: return self.objects('ContributionTable') + return None # pragma: no cover - def get_text(self, tid): + def get_text(self, tid: str) -> Optional[orm.Object]: + """Retrieve a text by ID.""" if 'ContributionTable' in self: return self.get_object('ContributionTable', tid) + return None # pragma: no cover @property - def sentences(self) -> typing.List[orm.Example]: + def sentences(self) -> list[orm.Example]: + """Sentences of the corpus.""" res = list(self.objects('ExampleTable')) if ('ExampleTable', 'exampleReference') in self: # Filter out alternative translations! @@ -1320,7 +1253,7 @@ def sentences(self) -> typing.List[orm.Example]: return res # pragma: no cover -class ComponentWithValidation: +class ComponentWithValidation: # pylint: disable=too-few-public-methods """ A virtual base class for custom, component-centered validation. """ @@ -1329,7 +1262,9 @@ def __init__(self, ds: Dataset): self.component = self.__class__.__name__ self.table = ds[self.component] - def validate(self, success: bool = True, log: logging.Logger = None) -> bool: + def validate(self, success: bool = True, log: Optional[logging.Logger] = None) -> bool: + """Validate the component taking previous validation result into account.""" + assert log or 1 # pragma: no cover pylint: disable=condition-evals-to-constant return success # pragma: no cover @@ -1360,7 +1295,7 @@ def sniff(p: pathlib.Path) -> bool: return d.get('dc:conformsTo', '').startswith(TERMS_URL) -def iter_datasets(d: pathlib.Path) -> typing.Generator[Dataset, None, None]: +def iter_datasets(d: PathType) -> Generator[Dataset, None, None]: """ Discover CLDF datasets - by identifying metadata files - in a directory. @@ -1372,5 +1307,4 @@ def iter_datasets(d: pathlib.Path) -> typing.Generator[Dataset, None, None]: try: yield Dataset.from_metadata(p) except ValueError as e: - logging.getLogger(__name__).warning( - "Reading {} failed: {}".format(p, e)) + logging.getLogger(__name__).warning("Reading %s failed: %s", p, e) diff --git a/src/pycldf/validators.py b/src/pycldf/validators.py index d252d54..c2c1afe 100644 --- a/src/pycldf/validators.py +++ b/src/pycldf/validators.py @@ -1,6 +1,124 @@ import re import warnings import functools +from typing import Optional, Callable, TYPE_CHECKING +import logging +import dataclasses + +from clldutils.misc import log_or_raise + +from pycldf.terms import Terms +from pycldf.util import iter_uritemplates + +if TYPE_CHECKING: # pragma: no cover + from pycldf.dataset import Dataset, Table, RowType, Column + +RowValidatorType = Callable[['Dataset', 'Table', 'Column', 'RowType'], None] + + +@dataclasses.dataclass +class DatasetValidator: + dataset: 'Dataset' + success: bool + terms: Terms + log: Optional[logging.Logger] + row_validators: list[tuple[Optional[str], str, RowValidatorType]] + + def __post_init__(self): + self.row_validators.extend(VALIDATORS) + + def fail(self, reason): + self.success = False + log_or_raise(reason, log=self.log) + + def warn(self, msg, *args): + if self.log: + self.log.warning(msg, *args) + + def validate_rows(self, table): + # FIXME: see if table.common_props['dc:conformsTo'] is in validators! pylint: disable=W0511 + validators = [] + for col in table.tableSchema.columns: + for table_, col_, v_ in self.row_validators: + if ((not table_ or table is self.dataset.get(table_)) + and col is self.dataset.get((table, col_))): # noqa: W503 + validators.append((col, v_)) + + for fname, lineno, row in table.iterdicts(log=self.log, with_metadata=True): + for col, validate in validators: + try: + validate(self.dataset, table, col, row) + except ValueError as e: + self.fail(f'{fname.name}:{lineno}:{col.name} {e}') + + def validate_columns(self, table): + property_urls, colnames = set(), set() + for col in table.tableSchema.columns: + if col.header in colnames: # pragma: no cover + self.fail(f'Duplicate column name in table schema: {table.url} {col.header}') + colnames.add(col.header) + if col.propertyUrl: + col_uri = col.propertyUrl.uri + try: + self.terms.is_cldf_uri(col_uri) + if col_uri in property_urls: # pragma: no cover + self.fail( + f'Duplicate CLDF property in table schema: {table.url} {col_uri}') + property_urls.add(col_uri) + except ValueError: + self.fail(f'invalid CLDF URI: {col_uri}') + + def validate_table_schema(self, table): + tmpl_vars = set(col.name for col in table.tableSchema.columns) + for obj, prop, tmpl in iter_uritemplates(table): + if not {n for n in tmpl.variable_names if not n.startswith('_')}.issubset(tmpl_vars): + self.warn(f'Unknown variables in URI template: {obj}:{prop}:{tmpl}') + + type_uri = table.common_props.get('dc:conformsTo') + if type_uri: + try: + self.terms.is_cldf_uri(type_uri) + except ValueError: + self.fail(f'invalid CLDF URI: {type_uri}') + + if not table.tableSchema.primaryKey: + self.warn( + 'Table without primary key: %s - %s', + table.url, + 'This may cause problems with "cldf createdb"') + elif len(table.tableSchema.primaryKey) > 1: + self.warn( + 'Table with composite primary key: %s - %s', + table.url, + 'This may cause problems with "cldf createdb"') + + def validate_default_objects(self, default_table): + dtable_uri = default_table.common_props['dc:conformsTo'] + try: + table = self.dataset[dtable_uri] + except KeyError: + self.fail(f'{self.dataset.module} requires {dtable_uri}') + return + + default_cols = {c.propertyUrl.uri: c for c in default_table.tableSchema.columns} + required_default_cols = { + c.propertyUrl.uri for c in default_table.tableSchema.columns + if c.required or c.common_props.get('dc:isRequiredBy')} + cols = { + c.propertyUrl.uri: c for c in table.tableSchema.columns + if c.propertyUrl} + table_uri = table.common_props['dc:conformsTo'] + for col in required_default_cols - set(cols.keys()): + self.fail(f'{table_uri} requires column {col}') + for uri, col in cols.items(): + default = default_cols.get(uri) + if default: + cardinality = default.common_props.get('dc:extent') + if not cardinality: + cardinality = self.terms.by_uri[uri].cardinality + if (cardinality == 'multivalued' and not col.separator) or \ + (cardinality == 'singlevalued' and col.separator): + self.fail(f'{table_uri} {uri} must be {cardinality}') def valid_references(dataset, table, column, row): @@ -44,7 +162,7 @@ def valid_mediaType(dataset, table, column, row): warnings.warn('Invalid main part in media type: {}'.format(main)) -VALIDATORS = [ +VALIDATORS: list[tuple[None, str, RowValidatorType]] = [ ( None, 'http://cldf.clld.org/v1.0/terms.rdf#iso639P3code', diff --git a/tests/test_dataset.py b/tests/test_dataset.py index aeafa29..c650e17 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -842,14 +842,14 @@ def test_validators(tmp_path, data, caplog): log = logging.getLogger(__name__) ds.validate(log=log) - assert len(caplog.records) == 2 + assert len(caplog.records) == 3 for col in ds.tablegroup.tables[0].tableSchema.columns: if col.name == 'Language_ID': col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode' ds.validate(log=log) - assert len(caplog.records) == 6 + assert len(caplog.records) == 8 def test_get_modules(): From 526b39a8da43dcb4422b7f17d9eb5cc313376fa3 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Thu, 26 Feb 2026 12:50:11 +0100 Subject: [PATCH 2/7] more linting --- src/pycldf/commands/downloadmedia.py | 6 +- src/pycldf/commands/markdown.py | 2 +- src/pycldf/constraints.py | 100 +++++++ src/pycldf/dataset.py | 398 +++------------------------ src/pycldf/db.py | 176 ++++++------ src/pycldf/ext/discovery.py | 2 +- src/pycldf/ext/markdown.py | 3 +- src/pycldf/fileutil.py | 65 +++++ src/pycldf/markdown.py | 166 +++++++++++ src/pycldf/media.py | 187 +++++++------ src/pycldf/module.py | 65 +++++ src/pycldf/orm.py | 179 ++++++------ src/pycldf/schemautil.py | 48 ++++ src/pycldf/sources.py | 121 ++++---- src/pycldf/stats.py | 40 +++ src/pycldf/terms.py | 204 ++++++++++---- src/pycldf/trees.py | 29 +- src/pycldf/urlutil.py | 37 +++ src/pycldf/util.py | 303 ++++++-------------- src/pycldf/validators.py | 119 ++++++-- tests/test_cli.py | 4 + tests/test_dataset.py | 3 +- tests/test_trees.py | 5 +- tests/test_util.py | 3 + 24 files changed, 1286 insertions(+), 979 deletions(-) create mode 100644 src/pycldf/constraints.py create mode 100644 src/pycldf/fileutil.py create mode 100644 src/pycldf/markdown.py create mode 100644 src/pycldf/module.py create mode 100644 src/pycldf/schemautil.py create mode 100644 src/pycldf/stats.py create mode 100644 src/pycldf/urlutil.py diff --git a/src/pycldf/commands/downloadmedia.py b/src/pycldf/commands/downloadmedia.py index d9b8c2d..e6a5d36 100644 --- a/src/pycldf/commands/downloadmedia.py +++ b/src/pycldf/commands/downloadmedia.py @@ -32,6 +32,10 @@ def run(args): for s in args.filters: col, _, substring = s.partition('=') filters.append((col, substring)) - for item in MediaTable(get_dataset(args), args.use_form_id): + media_table = MediaTable(get_dataset(args)) + if args.use_form_id: + media_table.filename_col = media_table.ds[ + media_table.component, 'http://cldf.clld.org/v1.0/terms.rdf#formReference'] + for item in media_table: if all(substring in item[col] for col, substring in filters): item.save(args.output) diff --git a/src/pycldf/commands/markdown.py b/src/pycldf/commands/markdown.py index 296eb4d..2edffd3 100644 --- a/src/pycldf/commands/markdown.py +++ b/src/pycldf/commands/markdown.py @@ -3,7 +3,7 @@ """ from clldutils.clilib import PathType -from pycldf.util import metadata2markdown +from pycldf.markdown import metadata2markdown from pycldf.cli_util import add_dataset, get_dataset diff --git a/src/pycldf/constraints.py b/src/pycldf/constraints.py new file mode 100644 index 0000000..cfad2dc --- /dev/null +++ b/src/pycldf/constraints.py @@ -0,0 +1,100 @@ +""" +Functionality for creation of foreign key constraints. +""" +from typing import TYPE_CHECKING, Optional + +from pycldf.terms import TERMS, term_uri +from pycldf.schemautil import TableType, ColType + +if TYPE_CHECKING: + from pycldf.dataset import Dataset # pragma: no cover + +__all__ = ['add_foreign_key', 'add_auto_constraints'] + + +def add_foreign_key( + ds: 'Dataset', + foreign_t: TableType, + foreign_c: ColType, + primary_t: TableType, + primary_c: Optional[ColType] = None, +) -> None: + """ + Add a foreign key constraint. + + ..note:: Composite keys are not supported yet. + + :param foreign_t: Table reference for the linking table. + :param foreign_c: Column reference for the link. + :param primary_t: Table reference for the linked table. + :param primary_c: Column reference for the linked column - or `None`, in which case the \ + primary key of the linked table is assumed. + """ + if isinstance(foreign_c, (tuple, list)) or isinstance(primary_c, (tuple, list)): + raise NotImplementedError('composite keys are not supported') + + foreign_t = ds[foreign_t] + primary_t = ds[primary_t] + if not primary_c: + primary_c = primary_t.tableSchema.primaryKey + else: + primary_c = ds[primary_t, primary_c].name + foreign_t.add_foreign_key(ds[foreign_t, foreign_c].name, primary_t.url.string, primary_c) + + +def add_auto_constraints(ds: 'Dataset', component: Optional[TableType] = None): + """ + Use CLDF reference properties to implicitly create foreign key constraints. + + :param component: A Table object or `None`. + """ + if not component: + for table in ds.tables: + ds.auto_constraints(table) + return + + if not component.tableSchema.primaryKey: + idcol = component.get_column(term_uri('id')) + if idcol: + component.tableSchema.primaryKey = [idcol.name] + + _auto_foreign_keys(ds, component) + + try: + table_type = ds.get_tabletype(component) + except ValueError: + table_type = None + + if table_type is None: + # New component is not a known CLDF term, so cannot add components + # automatically. TODO: We might me able to infer some based on + # `xxxReference` column properties? + return + + # auto-add foreign keys targeting the new component: + for table in ds.tables: + _auto_foreign_keys(ds, table, component=component, table_type=table_type) + + +def _auto_foreign_keys(ds: 'Dataset', table, component=None, table_type=None): + assert (component is None) == (table_type is None) + for col in table.tableSchema.columns: + if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: + ref_name = TERMS.by_uri[col.propertyUrl.uri].references + if (component is None and not ref_name) or \ + (component is not None and ref_name != table_type): + continue + if any(fkey.columnReference == [col.name] + for fkey in table.tableSchema.foreignKeys): + continue + if component is None: + # Let's see whether we have the component this column references: + try: + ref = ds[ref_name] + except KeyError: + continue + else: + ref = component + idcol = ref.get_column(term_uri('id')) + table.add_foreign_key( + col.name, ref.url.string, idcol.name if idcol is not None else 'ID') diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py index 6d4f22d..fec1c12 100644 --- a/src/pycldf/dataset.py +++ b/src/pycldf/dataset.py @@ -1,12 +1,8 @@ -# pylint: disable=C0302 """ An implementation of a CLDF dataset object. """ import re -import sys -import json import types -import shutil from typing import Union, Optional, Type, Any import logging import pathlib @@ -22,25 +18,25 @@ from csvw.metadata import TableGroup, Table, Column, Link, Schema, is_url, URITemplate from csvw import datatypes from csvw.dsv import iterrows -from clldutils.path import git_describe, walk -from clldutils import jsonlib +from clldutils.path import walk +from pycldf.module import get_module_impl from pycldf.sources import Sources, Source -from pycldf.util import pkg_path, resolve_slices, DictTuple, sanitize_url, iter_uritemplates -from pycldf.terms import term_uri, Terms, TERMS, get_column_names, URL as TERMS_URL -from pycldf.validators import DatasetValidator, RowValidatorType +from pycldf.util import ( + pkg_path, resolve_slices, DictTuple, iter_uritemplates, MD_SUFFIX, GitRepository, copy_dataset) +from pycldf.fileutil import PathType +from pycldf.schemautil import ColSpecType, make_column, make_table, TableType, ColType +from pycldf.constraints import add_foreign_key, add_auto_constraints +from pycldf.terms import term_uri, Terms, TERMS, get_column_names, sniff +from pycldf import validators as validation +from pycldf.stats import get_table_stats from pycldf import orm __all__ = [ 'Dataset', 'Generic', 'Wordlist', 'ParallelText', 'Dictionary', 'StructureDataset', - 'TextCorpus', 'iter_datasets', 'sniff', 'SchemaError', 'ComponentWithValidation'] + 'TextCorpus', 'iter_datasets', 'sniff', 'SchemaError'] -MD_SUFFIX = '-metadata.json' ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()} -TableType = Union[str, Table] -ColType = Union[str, Column] -ColSpecType = Union[str, dict, Column] -PathType = Union[str, pathlib.Path] TableSpecType = Union[str, Link, Table] SchemaObjectType = Union[TableSpecType, tuple[TableSpecType, ColSpecType]] ODict = collections.OrderedDict @@ -51,112 +47,6 @@ class SchemaError(KeyError): """Schema objects can be accessed using `Dataset.__getitem__`.""" -@attr.s -class Module: - """ - Class representing a CLDF Module. - - .. seealso:: https://github.com/cldf/cldf/blob/master/README.md#cldf-modules - """ - uri = attr.ib(validator=attr.validators.in_([t.uri for t in TERMS.classes.values()])) - fname = attr.ib() - cls = attr.ib(default=None) - - @property - def id(self) -> str: - """ - The local part of the term URI is interpreted as Module identifier. - """ - return self.uri.split('#')[1] - - def match(self, thing) -> bool: - """Check if the module described here matches thing.""" - if isinstance(thing, TableGroup): - return thing.common_props.get('dc:conformsTo') == term_uri(self.id) - if hasattr(thing, 'name'): - return thing.name == self.fname - return False - - -_modules = [] - - -def get_modules() -> list[Module]: - """ - We read supported CLDF modules from the default metadata files distributed with `pycldf`. - """ - global _modules # pylint: disable=global-statement - if not _modules: - ds = sys.modules[__name__] - for p in pkg_path('modules').glob(f'*{MD_SUFFIX}'): - tg = TableGroup.from_file(p) - mod = Module( - tg.common_props['dc:conformsTo'], - tg.tables[0].url.string if tg.tables else None) - mod.cls = getattr(ds, mod.id) - _modules.append(mod) - # prefer Wordlist over ParallelText (forms.csv) - _modules = sorted( - _modules, - key=lambda m: (m.cls in (Wordlist, ParallelText), m.cls is ParallelText)) - return _modules - - -def make_column(spec: ColSpecType) -> Column: - """ - Create a `Column` instance from `spec`. - - .. code-block:: python - - >>> make_column('id').name - 'id' - >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name - 'ID' - >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base - 'boolean' - >>> type(make_column(make_column('id'))) - - """ - if isinstance(spec, str): - if spec in TERMS.by_uri: - return TERMS.by_uri[spec].to_column() - return Column(name=spec, datatype='string') - if isinstance(spec, dict): - return Column.fromvalue(spec) - if isinstance(spec, Column): - return spec - raise TypeError(spec) - - -class GitRepository: # pylint: disable=too-few-public-methods - """ - CLDF datasets are often created from data curated in git repositories. If this is the case, we - exploit this to provide better provenance information in the dataset's metadata. - """ - def __init__( - self, url: str, clone: Optional[PathType] = None, version: Optional[str] = None, **dc): - # We remove credentials from the URL immediately to make sure this isn't leaked into - # CLDF metadata. Such credentials might be present in URLs read via gitpython from - # remotes. - self.url = sanitize_url(url) - self.clone = clone - self.version = version - self.dc = dc - - def json_ld(self) -> collections.OrderedDict[str, Any]: - """The repository described in JSON-LD.""" - res = collections.OrderedDict([ - ('rdf:about', self.url), - ('rdf:type', 'prov:Entity'), - ]) - if self.version: - res['dc:created'] = self.version - elif self.clone: - res['dc:created'] = git_describe(self.clone) - res.update({f'dc:{k}': self.dc[k] for k in sorted(self.dc)}) - return res - - class Dataset: # pylint: disable=too-many-public-methods """ API to access a CLDF dataset. @@ -193,9 +83,7 @@ def sources(self, obj: Sources): raise TypeError('Invalid type for Dataset.sources') self._sources = obj - # - # Factory methods to create `Dataset` instances. - # + # Factory methods to create `Dataset` instances. ----------------------------------------------- @classmethod def in_dir(cls, d: PathType, empty_tables: bool = False) -> 'Dataset': """ @@ -249,9 +137,9 @@ def from_metadata(cls, fname: PathType) -> 'Dataset': if comps and comps.most_common(1)[0][1] > 1: raise ValueError(f'{fname}: duplicate components!') - for mod in get_modules(): - if mod.match(tablegroup): - return mod.cls(tablegroup) + impl = get_module_impl(Dataset, tablegroup) + if impl: + return impl(tablegroup) return cls(tablegroup) @classmethod @@ -268,14 +156,12 @@ def from_data(cls, fname: PathType) -> 'Dataset': if not colnames: raise ValueError('empty data file!') if cls is Dataset: - try: - cls = next( # pylint: disable=W0642 - mod.cls for mod in get_modules() if mod.match(fname)) - except StopIteration as exc: - raise ValueError(f'{fname} does not match a CLDF module spec') from exc - assert issubclass(cls, Dataset) and cls is not Dataset - - res = cls.from_metadata(fname.parent) + impl = get_module_impl(Dataset, fname.name) + if impl is None: + raise ValueError(f'{fname} does not match a CLDF module spec') + res = impl.from_metadata(fname.parent) + else: + res = cls.from_metadata(fname.parent) required_cols = { c.name for c in res[res.primary_table].tableSchema.columns if c.required} @@ -283,9 +169,7 @@ def from_data(cls, fname: PathType) -> 'Dataset': raise ValueError(f'missing columns: {sorted(required_cols.difference(colnames))}') return res - # - # Accessing dataset metadata - # + # Accessing dataset metadata ------------------------------------------------------------------- @property def directory(self) -> PathType: """ @@ -353,9 +237,7 @@ def bibname(self) -> str: return pathlib.Path(urllib.parse.urlparse(self.bibpath).path).name return self.bibpath.name - # - # Accessing schema objects (components, tables, columns, foreign keys) - # + # Accessing schema objects (components, tables, columns, foreign keys) ------------------------- @property def tables(self) -> list[Table]: """ @@ -493,9 +375,7 @@ def get(self, item: SchemaObjectType, default=None) -> Union[csvw.Table, csvw.Co return default def get_foreign_key_reference( - self, - table: TableType, - column: ColType, + self, table: TableType, column: ColType, ) -> Optional[tuple[csvw.Table, csvw.Column]]: """ Retrieve the reference of a foreign key constraint for the specified column. @@ -537,9 +417,7 @@ def readonly_column_names(self) -> types.SimpleNamespace: """ return get_column_names(self, use_component_names=True, with_multiplicity=True) - # - # Editing dataset metadata or schema - # + # Editing dataset metadata or schema ----------------------------------------------------------- def add_provenance(self, **kw: Any) -> None: """ Add metadata about the dataset's provenance. @@ -615,11 +493,7 @@ def add_component(self, component: Union[str, dict], *cols: ColSpecType, **kw) - - `url`: a url property for the table;\ - `description`: a description of the table. """ - if isinstance(component, str): - component = jsonlib.load(pkg_path('components', f'{component}{MD_SUFFIX}')) - if isinstance(component, dict): - component = Table.fromvalue(component) - assert isinstance(component, Table) + component = make_table(component) if kw.get('url'): component.url = Link(kw['url']) @@ -743,16 +617,7 @@ def add_foreign_key( :param primary_c: Column reference for the linked column - or `None`, in which case the \ primary key of the linked table is assumed. """ - if isinstance(foreign_c, (tuple, list)) or isinstance(primary_c, (tuple, list)): - raise NotImplementedError('composite keys are not supported') - - foreign_t = self[foreign_t] - primary_t = self[primary_t] - if not primary_c: - primary_c = primary_t.tableSchema.primaryKey - else: - primary_c = self[primary_t, primary_c].name - foreign_t.add_foreign_key(self[foreign_t, foreign_c].name, primary_t.url.string, primary_c) + return add_foreign_key(self, foreign_t, foreign_c, primary_t, primary_c) def auto_constraints(self, component: Optional[TableType] = None): """ @@ -760,59 +625,9 @@ def auto_constraints(self, component: Optional[TableType] = None): :param component: A Table object or `None`. """ - if not component: - for table in self.tables: - self.auto_constraints(table) - return - - if not component.tableSchema.primaryKey: - idcol = component.get_column(term_uri('id')) - if idcol: - component.tableSchema.primaryKey = [idcol.name] - - self._auto_foreign_keys(component) - - try: - table_type = self.get_tabletype(component) - except ValueError: - table_type = None - - if table_type is None: - # New component is not a known CLDF term, so cannot add components - # automatically. TODO: We might me able to infer some based on - # `xxxReference` column properties? - return + return add_auto_constraints(self, component) - # auto-add foreign keys targeting the new component: - for table in self.tables: - self._auto_foreign_keys(table, component=component, table_type=table_type) - - def _auto_foreign_keys(self, table, component=None, table_type=None): - assert (component is None) == (table_type is None) - for col in table.tableSchema.columns: - if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: - ref_name = TERMS.by_uri[col.propertyUrl.uri].references - if (component is None and not ref_name) or \ - (component is not None and ref_name != table_type): - continue - if any(fkey.columnReference == [col.name] - for fkey in table.tableSchema.foreignKeys): - continue - if component is None: - # Let's see whether we have the component this column references: - try: - ref = self[ref_name] - except KeyError: - continue - else: - ref = component - idcol = ref.get_column(term_uri('id')) - table.add_foreign_key( - col.name, ref.url.string, idcol.name if idcol is not None else 'ID') - - # - # Add data - # + # Add data ------------------------------------------------------------------------------------- def add_sources(self, *sources: Union[str, Source], **kw) -> None: """ Add sources to the dataset. @@ -821,9 +636,7 @@ def add_sources(self, *sources: Union[str, Source], **kw) -> None: """ self.sources.add(*sources, **kw) - # - # Methods to read data - # + # Methods to read data ------------------------------------------------------------------------- def iter_rows(self, table: TableType, *cols: str) -> Generator[RowType, None, None]: """ Iterate rows in a table, resolving CLDF property names to local column names. @@ -916,9 +729,7 @@ def get_object(self, table: str, id_: str, cls=None, pk=False) -> orm.Object: self.objects(table, cls=cls) return self._objects[table][id_] if not pk else self._objects_by_pk[table][id_] - # - # Methods for writing (meta)data to files: - # + # Methods for writing (meta)data to files: ----------------------------------------------------- def write_metadata(self, fname: Optional[PathType] = None) -> pathlib.Path: """ Write the CLDF metadata to a JSON file. @@ -983,55 +794,13 @@ def copy(self, dest: PathType, mdname: str = None) -> pathlib.Path: ... if 'with_examples' in ds.directory.name: ... ds.copy('some_directory', mdname='md.json') """ - from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel - - dest = pathlib.Path(dest) - if not dest.exists(): - dest.mkdir(parents=True) + return copy_dataset(self, dest, mdname) - from_url = is_url(self.tablegroup.base) - ds = Dataset.from_metadata( - self.tablegroup.base if from_url else self.tablegroup._fname) # pylint: disable=W0212 - - _getter = urllib.request.urlretrieve if from_url else shutil.copy - try: - _getter(self.bibpath, dest / self.bibname) - ds.properties['dc:source'] = self.bibname - except: # pragma: no cover # noqa pylint: disable=W0702 - # Sources are optional - pass - - for table in ds.tables: - fname = table.url.resolve(table.base) - name = pathlib.Path(urllib.parse.urlparse(fname).path).name if from_url else fname.name - _getter(fname, dest / name) - table.url = Link(name) - - for fk in table.tableSchema.foreignKeys: - fk.reference.resource = Link(pathlib.Path(fk.reference.resource.string).name) - mdpath = dest.joinpath( - mdname or # noqa: W504 - (self.tablegroup.base.split('/')[-1] if from_url - else self.tablegroup._fname.name)) # pylint: disable=W0212 - if 'MediaTable' in self: - for f in MediaTable(self): - if f.scheme == 'file': - if f.local_path().exists(): - target = dest / urllib.parse.unquote(f.relpath) - target.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(f.local_path(), target) - if from_url: - del ds.tablegroup.at_props['base'] # pragma: no cover - ds.write_metadata(fname=mdpath) - return mdpath - - # - # Reporting - # + # Reporting ------------------------------------------------------------------------------------ def validate( self, log: logging.Logger = None, - validators: list[tuple[Optional[str], str, RowValidatorType]] = None, + validators: list[tuple[Optional[str], str, validation.RowValidatorType]] = None, ontology_path: Optional[PathType] = None, ) -> bool: """ @@ -1047,71 +816,20 @@ def validate( :raises ValueError: if a validation error is encountered (and `log` is `None`). :return: Flag signaling whether schema and data are valid. """ - # We must import components with custom validation to make sure they can be detected as - # subclasses of ComponentWithValidation. - from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel - from pycldf.trees import TreeTable # pylint: disable=import-outside-toplevel - - assert MediaTable and TreeTable - - validator = DatasetValidator( + return validation.validate( dataset=self, - success=True, terms=Terms(ontology_path) or TERMS, log=log, row_validators=validators or [], ) - default_tg = TableGroup.from_file(pkg_path('modules', f'{self.module}{MD_SUFFIX}')) - # Make sure, all required tables and columns are present and consistent. - for default_table in default_tg.tables: - validator.validate_default_objects(default_table) - - for table in self.tables: - validator.validate_table_schema(table) - validator.validate_columns(table) - - fname = pathlib.Path(table.url.resolve(table._parent.base)) # pylint: disable=W0212 - fexists = fname.exists() - if (not fexists) and fname.parent.joinpath(f'{fname.name}.zip').exists(): - if log: - log.info(f'Reading data from zipped table: {fname}.zip') - fexists = True # csvw already handles this case, no need to adapt paths. - if is_url(table.url.resolve(table._parent.base)) or fexists: # pylint: disable=W0212 - validator.validate_rows(table) - if not table.check_primary_key(log=log): - validator.fail('Primary key check failed.') - else: - validator.fail(f'{fname} does not exist') - - if not self.tablegroup.check_referential_integrity(log=log): - validator.fail('Referential integrity check failed') - - for cls in ComponentWithValidation.__subclasses__(): - if cls.__name__ in self: - validator.success = cls(self).validate(validator.success, log=validator.log) - - return validator.success - def stats(self, exact: bool = False) -> list[tuple[str, str, int]]: """ Compute summary statistics for the dataset. - :return: List of triples (table, type, rowcount). + :return: List of triples (filename, component, rowcount). """ - res = [] - for table in self.tables: - dctype = table.common_props.get('dc:conformsTo') - if dctype and '#' in dctype and dctype.split('#')[1] in TERMS: - dctype = TERMS[dctype.split('#')[1]].csvw_prop('name') - res.append(( - table.url.string, - dctype, - sum(1 for _ in table) if (exact or 'dc:extent' not in table.common_props) - else int(table.common_props.get('dc:extent')))) - if self.sources: - res.append((self.bibname, 'Sources', len(self.sources))) - return res + return get_table_stats(self, exact) class Generic(Dataset): @@ -1253,48 +971,6 @@ def sentences(self) -> list[orm.Example]: return res # pragma: no cover -class ComponentWithValidation: # pylint: disable=too-few-public-methods - """ - A virtual base class for custom, component-centered validation. - """ - def __init__(self, ds: Dataset): - self.ds = ds - self.component = self.__class__.__name__ - self.table = ds[self.component] - - def validate(self, success: bool = True, log: Optional[logging.Logger] = None) -> bool: - """Validate the component taking previous validation result into account.""" - assert log or 1 # pragma: no cover pylint: disable=condition-evals-to-constant - return success # pragma: no cover - - -def sniff(p: pathlib.Path) -> bool: - """ - Determine whether a file contains CLDF metadata. - - :param p: `pathlib.Path` object for an existing file. - :return: `True` if the file contains CLDF metadata, `False` otherwise. - """ - if not p.is_file(): # pragma: no cover - return False - try: - with p.open('rb') as fp: - c = fp.read(10) - try: - c = c.decode('utf8').strip() - except UnicodeDecodeError: - return False - if not c.startswith('{'): - return False - except (FileNotFoundError, OSError): # pragma: no cover - return False - try: - d = jsonlib.load(p) - except json.decoder.JSONDecodeError: - return False - return d.get('dc:conformsTo', '').startswith(TERMS_URL) - - def iter_datasets(d: PathType) -> Generator[Dataset, None, None]: """ Discover CLDF datasets - by identifying metadata files - in a directory. diff --git a/src/pycldf/db.py b/src/pycldf/db.py index 36d0565..ac4404e 100644 --- a/src/pycldf/db.py +++ b/src/pycldf/db.py @@ -39,20 +39,24 @@ FOREIGN KEY(`custom.csv_id`) REFERENCES `custom.csv`(`id`) ON DELETE CASCADE ); """ -import typing +from typing import Optional, Any, Callable, Protocol, TYPE_CHECKING import inspect import pathlib import sqlite3 import functools import collections +import dataclasses -import attr import csvw import csvw.db +from csvw.db import ColSpec, TableSpec +from csvw.metadata import Table as CSVWTable from pycldf.terms import TERMS from pycldf.sources import Reference, Sources, Source -from pycldf import Dataset + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover __all__ = ['Database', 'query'] @@ -87,16 +91,19 @@ ] -@attr.s -class TableTranslation(object): +@dataclasses.dataclass +class TableTranslation: """ Specifies column name translations for a table. """ - name = attr.ib(default=None) - columns = attr.ib(default=attr.Factory(dict)) + name: str = None + columns: dict[str, str] = dataclasses.field(default_factory=dict) + +TranslationDict = dict[str, TableTranslation] -def translate(d: typing.Dict[str, TableTranslation], table: str, col=None) -> str: + +def translate(d: TranslationDict, table: str, col: str = None) -> str: """ Translate a db object name. @@ -124,7 +131,7 @@ def translate(d: typing.Dict[str, TableTranslation], table: str, col=None) -> st # 2. Since regular table names may contain underscores as well, we try to find the longest # concatenation of _-separated name parts which appears in the translation dict. # 3. We repeat step 2 until all name parts have been consumed. - def t(n): + def t_(n): if n in d: return d[n].name or n tables, comps = [], n.split('_') @@ -142,10 +149,10 @@ def t(n): tables.append(d[comps[0]].name or comps[0] if comps[0] in d else comps[0]) return '_'.join(tables) - return t(table) + return t_(table) -def clean_bibtex_key(s): +def clean_bibtex_key(s: str) -> str: # pylint: disable=C0116 return s.replace('-', '_').lower() @@ -158,23 +165,74 @@ class Database(csvw.db.Database): """ source_table_name = 'SourceTable' - def __init__(self, dataset: Dataset, **kw): + def __init__(self, dataset: 'Dataset', **kw): """ :param dataset: The :class:`Dataset` instance from which to derive the database schema. """ - self.dataset = dataset + self.dataset: 'Dataset' = dataset self._retranslate = collections.defaultdict(dict) self._source_cols = ['id', 'genre'] + BIBTEX_FIELDS # Source items can be referenced with case insensitive keys. So we store a mapping from # lowercase keys to the ones actually used in the source BibTeX. self._source_map = {} - infer_primary_keys = kw.pop('infer_primary_keys', False) - # We create a derived TableGroup, adding a table for the sources. tg = csvw.TableGroup.fromvalue(dataset.metadata_dict) # Assemble the translation function: + translations: TranslationDict = self._get_translations(dataset) + + # Add source table: + for src in self.dataset.sources: + for key in src: + key = clean_bibtex_key(key) + if key not in self._source_cols: + self._source_cols.append(key) + + tg.tables.append(csvw.Table.fromvalue({ + 'url': self.source_table_name, + 'tableSchema': {'columns': [{'name': n} for n in self._source_cols], 'primaryKey': 'id'} + })) + tg.tables[-1]._parent = tg + + # Add foreign keys to source table: + infer_primary_keys = kw.pop('infer_primary_keys', False) + for table in tg.tables[:-1]: + self._add_fk_to_sources(table, infer_primary_keys, translations) + + # Make sure `base` directory can be resolved: + tg._fname = dataset.tablegroup._fname + csvw.db.Database.__init__( + self, tg, translate=functools.partial(translate, translations), **kw) + + def _add_fk_to_sources( + self, + table: CSVWTable, + infer_primary_keys: bool, + translations: TranslationDict, + ): + if not table.tableSchema.primaryKey and infer_primary_keys: + for col in table.tableSchema.columns: + if col.name.lower() in PRIMARY_KEY_NAMES: + table.tableSchema.primaryKey = [col.name] + break + for col in table.tableSchema.columns: + if col.propertyUrl and col.propertyUrl.uri == TERMS['source'].uri: + table.tableSchema.foreignKeys.append(csvw.ForeignKey.fromdict({ + 'columnReference': [col.header], + 'reference': {'resource': self.source_table_name, 'columnReference': 'id'} + })) + if translations[table.local_name].name: + tl = translations[table.local_name] + translations[f'{table.local_name}_{self.source_table_name}'] = \ + TableTranslation( + name=f'{tl.name}_{self.source_table_name}', + columns={ + f'{table.local_name}_{table.tableSchema.primaryKey[0]}': + f'{tl.name}_{tl.columns[table.tableSchema.primaryKey[0]]}'}) + break + + def _get_translations(self, dataset: 'Dataset') -> TranslationDict: translations = {} for table in dataset.tables: translations[table.local_name] = TableTranslation() @@ -191,7 +249,7 @@ def __init__(self, dataset: Dataset, **kw): if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: # Translate local column names to local names of CLDF Ontology terms, prefixed # with `cldf_`: - col_name = 'cldf_{0.name}'.format(TERMS.by_uri[col.propertyUrl.uri]) + col_name = f'cldf_{TERMS.by_uri[col.propertyUrl.uri].name}' new_col_names.append(col_name.lower()) translations[table.local_name].columns[col.header] = col_name self._retranslate[table.local_name][col_name] = col.header @@ -200,60 +258,12 @@ def __init__(self, dataset: Dataset, **kw): if not (col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri): if col.header.lower() in new_col_names: # A name clash! We translate the old column name! - col_name = '_{}'.format(col.header) + col_name = f'_{col.header}' translations[table.local_name].columns[col.header] = col_name self._retranslate[table.local_name][col_name] = col.header + return translations - # Add source table: - for src in self.dataset.sources: - for key in src: - key = clean_bibtex_key(key) - if key not in self._source_cols: - self._source_cols.append(key) - - tg.tables.append(csvw.Table.fromvalue({ - 'url': self.source_table_name, - 'tableSchema': { - 'columns': [dict(name=n) for n in self._source_cols], - 'primaryKey': 'id' - } - })) - tg.tables[-1]._parent = tg - - # Add foreign keys to source table: - for table in tg.tables[:-1]: - if not table.tableSchema.primaryKey and infer_primary_keys: - for col in table.tableSchema.columns: - if col.name.lower() in PRIMARY_KEY_NAMES: - table.tableSchema.primaryKey = [col.name] - break - for col in table.tableSchema.columns: - if col.propertyUrl and col.propertyUrl.uri == TERMS['source'].uri: - table.tableSchema.foreignKeys.append(csvw.ForeignKey.fromdict({ - 'columnReference': [col.header], - 'reference': { - 'resource': self.source_table_name, - 'columnReference': 'id' - } - })) - if translations[table.local_name].name: - tl = translations[table.local_name] - translations['{0}_{1}'.format(table.local_name, self.source_table_name)] = \ - TableTranslation( - name='{0}_{1}'.format(tl.name, self.source_table_name), - columns={'{0}_{1}'.format( - table.local_name, table.tableSchema.primaryKey[0], - ): '{0}_{1}'.format( - tl.name, tl.columns[table.tableSchema.primaryKey[0]], - )}) - break - - # Make sure `base` directory can be resolved: - tg._fname = dataset.tablegroup._fname - csvw.db.Database.__init__( - self, tg, translate=functools.partial(translate, translations), **kw) - - def association_table_context(self, table, column, fkey): + def association_table_context(self, table: TableSpec, column: ColSpec, fkey: str): if self.translate(table.name, column) == 'cldf_source': # We decompose references into the source ID and optional pages. Pages are stored as # `context` of the association table and composed again in `select_many_to_many`. @@ -275,13 +285,13 @@ def association_table_context(self, table, column, fkey): return csvw.db.Database.association_table_context( self, table, column, fkey) # pragma: no cover - def select_many_to_many(self, db, table, context): + def select_many_to_many(self, db, table: TableSpec, context): if table.name.endswith('_' + self.source_table_name): atable = table.name.partition('_' + self.source_table_name)[0] if self.translate(atable, context) == 'cldf_source': # Compose references: res = csvw.db.Database.select_many_to_many(self, db, table, None) - return {k: ['{0}'.format(Reference(*vv)) for vv in v] for k, v in res.items()} + return {k: [f'{Reference(*vv)}' for vv in v] for k, v in res.items()} return csvw.db.Database.select_many_to_many(self, db, table, context) # pragma: no cover def write(self, _force=False, _exists_ok=False, **items): @@ -293,7 +303,8 @@ def write(self, _force=False, _exists_ok=False, **items): return csvw.db.Database.write( self, _force=False, _exists_ok=False, _skip_extra=True, **items) - def write_from_tg(self, _force: bool = False, _exists_ok: bool = False): + def write_from_tg( # pylint: disable=W0221 + self, _force: bool = False, _exists_ok: bool = False): """ Write the data from `self.dataset` to the database. """ @@ -309,7 +320,7 @@ def write_from_tg(self, _force: bool = False, _exists_ok: bool = False): self._source_map[src.id.lower()] = src.id return self.write(_force=_force, _exists_ok=_exists_ok, **items) - def query(self, sql: str, params=None) -> list: + def query(self, sql: str, params=None) -> list[Any]: """ Run `sql` on the database, returning the list of results. """ @@ -317,7 +328,7 @@ def query(self, sql: str, params=None) -> list: cu = conn.execute(sql, params or ()) return list(cu.fetchall()) - def retranslate(self, table, item): + def retranslate(self, table: CSVWTable, item): # pylint: disable=C0116 return {self._retranslate.get(table.local_name, {}).get(k, k): v for k, v in item.items()} @staticmethod @@ -373,21 +384,22 @@ def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4) -> return self.dataset.write_metadata(dest / mdname) -class AggregateClass(typing.Protocol): # pragma: no cover - def step(self, value): +class AggregateClass(Protocol): # pragma: no cover # pylint: disable=C0115 + def step(self, value): # pylint: disable=C0116 ... - def finalize(self): + def finalize(self): # pylint: disable=C0116 ... -def query(conn: sqlite3.Connection, - sql: str, - params=None, - functions: typing.Optional[typing.List[typing.Callable]] = None, - aggregates: typing.Optional[typing.List[AggregateClass]] = None, - collations: typing.Optional[typing.List[typing.Callable]] = None) \ - -> typing.Generator[typing.Any, None, None]: +def query( # pylint: disable=R0913,R0917 + conn: sqlite3.Connection, + sql: str, + params=None, + functions: Optional[list[Callable]] = None, + aggregates: Optional[list[AggregateClass]] = None, + collations: Optional[list[Callable]] = None, +) -> list[Any]: """ Note: Passing lambdas or functools.partial objects as function requires passing an explicit name as well. diff --git a/src/pycldf/ext/discovery.py b/src/pycldf/ext/discovery.py index 74fb4bf..24a5c8b 100644 --- a/src/pycldf/ext/discovery.py +++ b/src/pycldf/ext/discovery.py @@ -28,7 +28,7 @@ from csvw.utils import is_url from pycldf import Dataset, iter_datasets, sniff -from pycldf.util import url_without_fragment +from pycldf.urlutil import url_without_fragment __all__ = ['get_dataset', 'DatasetResolver'] EP = 'pycldf_dataset_resolver' diff --git a/src/pycldf/ext/markdown.py b/src/pycldf/ext/markdown.py index 108dea4..4fd2b35 100644 --- a/src/pycldf/ext/markdown.py +++ b/src/pycldf/ext/markdown.py @@ -17,7 +17,8 @@ from clldutils.markup import MarkdownLink from .discovery import get_dataset -from pycldf.util import pkg_path, url_without_fragment +from pycldf.util import pkg_path +from pycldf.urlutil import url_without_fragment from pycldf.dataset import MD_SUFFIX from pycldf.sources import Source from pycldf import Dataset diff --git a/src/pycldf/fileutil.py b/src/pycldf/fileutil.py new file mode 100644 index 0000000..75b527b --- /dev/null +++ b/src/pycldf/fileutil.py @@ -0,0 +1,65 @@ +""" +Functionality to access and manipulate files. +""" +import re +import math +import string +from typing import Union, Optional +import pathlib +import itertools + + +PathType = Union[str, pathlib.Path] + + +def splitfile(p: PathType, chunksize: int, total: Optional[int] = None) -> list[pathlib.Path]: + """ + :param p: Path of the file to split. + :param chunksize: The maximal size of the chunks the file will be split into. + :param total: The size of the input file. + :return: The list of paths of files that the input has been split into. + """ + p = pathlib.Path(p) + total = total or p.stat().st_size + if total <= chunksize: # Nothing to do. + return [p] + nchunks = math.ceil(total / chunksize) + suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3 + suffixes = [ + ''.join(t) for t in + itertools.combinations_with_replacement(string.ascii_lowercase, suffix_length)] + + res = [] + with p.open('rb') as f: + chunk = f.read(chunksize) + while chunk: + pp = p.parent.joinpath(f'{p.name}.{suffixes.pop(0)}') + pp.write_bytes(chunk) + res.append(pp) + chunk = f.read(chunksize) # read the next chunk + + p.unlink() + return res + + +def catfile(p: PathType) -> bool: + """ + Restore a file that has been split into chunks. + + We determine if a file has been split by looking for files in the parent directory with suffixes + as created by `splitfile`. + """ + p = pathlib.Path(p) + if p.exists(): # Nothing to do. + return False + # Check, whether the file has been split. + suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name} + if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes): + # ok, let's concatenate the files: + with p.open('wb') as f: + for suffix in sorted(suffixes): + if re.fullmatch(r'\.[a-z]{2,3}', suffix): + f.write(suffixes[suffix].read_bytes()) + suffixes[suffix].unlink() + return True + return False # pragma: no cover diff --git a/src/pycldf/markdown.py b/src/pycldf/markdown.py new file mode 100644 index 0000000..226655b --- /dev/null +++ b/src/pycldf/markdown.py @@ -0,0 +1,166 @@ +""" +Functionality to render a Dataset's metadata to a Markdown document. +""" +import re +import html +import pathlib +from typing import TYPE_CHECKING, Any, Optional + +from clldutils.misc import slug + +from pycldf.util import qname2url +from pycldf.fileutil import PathType + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['metadata2markdown'] + + +def metadata2markdown(ds: 'Dataset', path: PathType, rel_path: Optional[str] = './') -> str: + """ + Render the metadata of a dataset as markdown. + + :param ds: `Dataset` instance + :param path: `pathlib.Path` of the metadata file + :param rel_path: `str` to use a relative path when creating links to data files + :return: `str` with markdown formatted text + """ + path = pathlib.Path(path) + return '\n'.join(_iter_markdown(ds, pathlib.Path(path), rel_path)) + + +def _qname2link(qname: str, html_=False) -> str: + url = qname2url(qname) + if url: + return f'{qname}' if html_ else f'[{qname}]({url})' + return qname + + +def _htmlify(obj: Any, rel_path: str, key=None) -> str: + """ + For inclusion in tables we must use HTML for lists. + """ + if isinstance(obj, list): + items = [f'
  • {_htmlify(item, rel_path, key=key)}
  • ' for item in obj] + return f'
      {"".join(items)}
    ' + + if isinstance(obj, dict): + if key == 'prov:wasGeneratedBy' \ + and set(obj.keys()).issubset({'dc:title', 'dc:description', 'dc:relation'}): + desc = obj.get('dc:description') or '' + rel = obj.get('dc:relation') + if rel: + desc = (desc + '
    ') if desc else desc + desc += f'{rel}' + return f"{obj.get('dc:title') or ''}: {desc}" + + if obj.get('rdf:type') == 'prov:Entity' and 'rdf:about' in obj: + label = obj.get('dc:title') + if (not label) or label == 'Repository': + label = obj['rdf:about'] + url = obj['rdf:about'] + if ('github.com' in url) and ('/tree/' not in url) and ('dc:created' in obj): + tag = obj['dc:created'] + if '-g' in tag: + tag = tag.split('-g')[-1] + url = f'{url}/tree/{tag}' + if label == obj['rdf:about']: + label = label.split('github.com/')[-1] + version = f' {obj.get("dc:created")}' or '' + return f'{label} {version}' + + items = [ + f'
    {_qname2link(k, html_=True)}
    {html.escape(str(v))}
    ' + for k, v in obj.items()] + return f'
    {"".join(items)}
    ' + + return str(obj) + + +def _iter_properties(obj, rel_path): + if obj.common_props.get('dc:description'): + yield obj.common_props['dc:description'] + '\n' + yield 'property | value\n --- | ---' + for k, v in obj.common_props.items(): + if not v: + continue + if k not in ('dc:description', 'dc:title', 'dc:source'): + if k == 'dc:conformsTo': + v = f'[CLDF {v.split("#")[1]}]({v})' + yield f'{_qname2link(k)} | {_htmlify(v, rel_path, key=k)}' + yield '' + + +def _colrow(col, fks, pk, ds, rel_path): + dt = f"`{col.datatype.base if col.datatype else 'string'}`" + if col.datatype: + if col.datatype.format: + if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format): + dt += '
    Valid choices:
    ' + dt += ''.join(f' `{w}`' for w in col.datatype.format.split('|')) + elif col.datatype.base == 'string': + dt += f'
    Regex: `{col.datatype.format}`' + if col.datatype.minimum: + dt += f'
    ≥ {col.datatype.minimum}' + if col.datatype.maximum: + dt += f'
    ≤ {col.datatype.maximum}' + if col.separator: + dt = f'list of {dt} (separated by `{col.separator}`)' + desc = col.common_props.get('dc:description', '').replace('\n', ' ') + + if col.name in pk: + desc = (desc + '
    ') if desc else desc + desc += 'Primary key' + + if col.name in fks: + desc = (desc + '
    ') if desc else desc + pkcol, table = fks[col.name] + desc += f'References [{table}::{pkcol}](#table-{slug(table)})' + elif col.propertyUrl \ + and col.propertyUrl.uri == "http://cldf.clld.org/v1.0/terms.rdf#source" \ + and 'dc:source' in ds.properties: + desc = (desc + '
    ') if desc else desc + desc += (f"References [{ds.properties['dc:source']}::BibTeX-key]" + f"({rel_path}{ds.properties['dc:source']})") + + return ' | '.join([ + f'[{col.name}]({col.propertyUrl})' if col.propertyUrl else f'`{col.name}`', dt, desc]) + + +def _existing_fname_in_cldf_dir(ds, fname: str) -> Optional[str]: + """Returns an existing (possibly zipped) file matching fname.""" + if pathlib.Path(ds.directory).joinpath(fname).exists(): + return fname + zipped = fname + '.zip' + if pathlib.Path(ds.directory).joinpath(zipped).exists(): + return zipped + return None + + +def _iter_markdown(ds: 'Dataset', path: pathlib.Path, rel_path: Optional[str] = './'): + def file_link(fname): + return f'[{fname}]({rel_path}{fname})' + + yield f'# {ds.properties.get("dc:title", ds.module)}\n' + if path.suffix == '.json': + yield f'**CLDF Metadata**: {file_link(path.name)}\n' + if 'dc:source' in ds.properties: + src = _existing_fname_in_cldf_dir(ds, ds.properties['dc:source']) + if src: + yield f'**Sources**: {file_link(src)}\n' + yield from _iter_properties(ds.tablegroup, rel_path) + + for table in ds.tables: + fks = { + fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) + for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} + src = _existing_fname_in_cldf_dir(ds, table.url.string) + table_name = file_link(src) if src else table.url + yield f'\n## Table {table_name}\n' + yield from _iter_properties(table, rel_path) + yield '\n### Columns\n' + yield 'Name/Property | Datatype | Description' + yield ' --- | --- | --- ' + for col in table.tableSchema.columns: + yield _colrow(col, fks, table.tableSchema.primaryKey, ds, rel_path) diff --git a/src/pycldf/media.py b/src/pycldf/media.py index 13c13a1..62163cb 100644 --- a/src/pycldf/media.py +++ b/src/pycldf/media.py @@ -24,8 +24,7 @@ import io import json import base64 -import typing -import logging +from typing import Union, TYPE_CHECKING, Optional, Callable import pathlib import zipfile import functools @@ -33,17 +32,25 @@ import collections import urllib.parse import urllib.request +from collections.abc import Generator -from clldutils.misc import log_or_raise -import pycldf -from pycldf import orm -from pycldf.util import splitfile, catfile +from csvw.metadata import Table, Column from csvw.datatypes import anyURI +from pycldf import orm +from pycldf.fileutil import splitfile, catfile, PathType + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover + from pycldf.validators import DatasetValidator # pragma: no cover + __all__ = ['Mimetype', 'MediaTable', 'File'] +StrOrBytes = Union[str, bytes] -class File: + +class File: # pylint: disable=too-many-instance-attributes """ A `File` represents a row in a MediaTable, providing functionality to access the contents. @@ -56,15 +63,16 @@ class File: - :meth:`save` will write a (deflated) ZIP archive containing the specified file as single \ member. """ - def __init__(self, media: 'MediaTable', row: dict): - self.row = row - self.id = row[media.filename_col.name] - self._mimetype = row[media.mimetype_col.name] - self.url = None + def __init__(self, media: 'MediaTable', row: 'RowType'): + self.row: 'RowType' = row + self.id: str = row[media.filename_col.name] + self._mimetype: str = row[media.mimetype_col.name] + self.url: Optional[str] = None self.scheme = None self.url_reader = media.url_reader - self.path_in_zip = row.get(media.path_in_zip_col.name) if media.path_in_zip_col else None - self._dsdir = media.ds.directory + self.path_in_zip: Optional[str] \ + = row.get(media.path_in_zip_col.name) if media.path_in_zip_col else None + self._dsdir: pathlib.Path = media.ds.directory if media.url_col: # 1. Look for a downloadUrl property: @@ -83,7 +91,7 @@ def __init__(self, media: 'MediaTable', row: dict): @classmethod def from_dataset( - cls, ds: pycldf.Dataset, row_or_object: typing.Union[dict, orm.Media]) -> 'File': + cls, ds: 'Dataset', row_or_object: Union[dict, orm.Media]) -> 'File': """ Factory method to instantiate a `File` bypassing the `Media` wrapper. """ @@ -114,7 +122,7 @@ def mimetype(self) -> 'Mimetype': if mt: return Mimetype(mt) if self.scheme == 'data': - mt, _, data = self.parsed_url.path.partition(',') + mt, _, _ = self.parsed_url.path.partition(',') if mt.endswith(';base64'): mt = mt.replace(';base64', '').strip() if mt: @@ -122,13 +130,14 @@ def mimetype(self) -> 'Mimetype': # There's an explicit default mimetype for data URLs! return Mimetype('text/plain;charset=US-ASCII') if self.scheme in ['http', 'https']: - res = urllib.request.urlopen(urllib.request.Request(self.url, method="HEAD")) + res = urllib.request.urlopen( # too lazy to mock with with. pylint: disable=R1732 + urllib.request.Request(self.url, method="HEAD")) mt = res.headers.get('Content-Type') if mt: return Mimetype(mt) return Mimetype('application/octet-stream') - def local_path(self, d: pathlib.Path = None) -> typing.Union[pathlib.Path, None]: + def local_path(self, d: pathlib.Path = None) -> Optional[pathlib.Path]: """ :return: The expected path of the file in the directory `d`. """ @@ -136,14 +145,15 @@ def local_path(self, d: pathlib.Path = None) -> typing.Union[pathlib.Path, None] if self.scheme == 'file': return self._dsdir / urllib.parse.unquote(self.relpath) return None - return d.joinpath('{}{}'.format( - self.id, '.zip' if self.path_in_zip else (self.mimetype.extension or ''))) + zip_ext = '.zip' if self.path_in_zip else (self.mimetype.extension or '') + return d.joinpath(f'{self.id}{zip_ext}') def read_json(self, d=None): + """Reads JSON data.""" assert self.mimetype.subtype.endswith('json') return json.loads(self.read(d=d)) - def read(self, d=None) -> typing.Union[None, str, bytes]: + def read(self, d: Optional[pathlib.Path] = None) -> Optional[StrOrBytes]: """ :param d: A local directory where the file has been saved before. If `None`, the content \ will be read from the file's URL. @@ -156,17 +166,18 @@ def read(self, d=None) -> typing.Union[None, str, bytes]: zipcontent = self.url_reader[self.scheme]( self.parsed_url, Mimetype('application/zip')) if zipcontent: - zf = zipfile.ZipFile(io.BytesIO(zipcontent)) - return self.mimetype.read(zf.read(self.path_in_zip)) - return # pragma: no cover + with zipfile.ZipFile(io.BytesIO(zipcontent)) as zf: + return self.mimetype.read(zf.read(self.path_in_zip)) + return None # pragma: no cover if d: return self.mimetype.read(self.local_path(d).read_bytes()) if self.url: try: return self.url_reader[self.scheme](self.parsed_url, self.mimetype) - except KeyError: - raise ValueError('Unsupported URL scheme: {}'.format(self.scheme)) + except KeyError as e: + raise ValueError(f'Unsupported URL scheme: {self.scheme}') from e + return None # pragma: no cover def save(self, d: pathlib.Path) -> pathlib.Path: """ @@ -189,14 +200,17 @@ def save(self, d: pathlib.Path) -> pathlib.Path: return p -class MediaTable(pycldf.ComponentWithValidation): +class MediaTable: # pylint: disable=too-many-instance-attributes """ Container class for a `Dataset`'s media items. """ - def __init__(self, ds: pycldf.Dataset, use_form_id: bool = False): - super().__init__(ds) - self.url_col = ds.get(('MediaTable', 'http://cldf.clld.org/v1.0/terms.rdf#downloadUrl')) - self.path_in_zip_col = ds.get( + def __init__(self, ds: 'Dataset'): + self.ds: 'Dataset' = ds + self.component: str = self.__class__.__name__ + self.table: Table = ds[self.component] + self.url_col: Optional[Column] = ds.get( + ('MediaTable', 'http://cldf.clld.org/v1.0/terms.rdf#downloadUrl')) + self.path_in_zip_col: Optional[Column] = ds.get( (self.component, 'http://cldf.clld.org/v1.0/terms.rdf#pathInZip')) if self.table and not self.url_col: @@ -204,13 +218,14 @@ def __init__(self, ds: pycldf.Dataset, use_form_id: bool = False): if col.propertyUrl and col.propertyUrl == 'http://www.w3.org/ns/dcat#downloadUrl': self.url_col = col break - self.id_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#id'] - self.filename_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#formReference']\ - if use_form_id else self.id_col - self.mimetype_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#mediaType'] + self.id_col: Column = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#id'] + self.filename_col: Column = self.id_col + self.mimetype_col: Column = ds[ + self.component, 'http://cldf.clld.org/v1.0/terms.rdf#mediaType'] @functools.cached_property - def url_reader(self): + def url_reader(self) -> dict[str, Callable[[urllib.parse.ParseResult, 'Mimetype'], StrOrBytes]]: + """Maps URL schemes to reader functions.""" return { 'http': read_http_url, 'https': read_http_url, @@ -219,13 +234,13 @@ def url_reader(self): 'file': functools.partial(read_file_url, self.ds.directory), } - def __iter__(self) -> typing.Generator[File, None, None]: + def __iter__(self) -> Generator[File, None, None]: for row in self.table: yield File(self, row) def split(self, chunksize: int) -> int: """ - :return: The number of media files that have been split. + :return: The number of media files that needed splitting. """ res = 0 for file in self: @@ -237,7 +252,7 @@ def split(self, chunksize: int) -> int: res += 1 return res - def cat(self): + def cat(self) -> int: """ :return: The number of media files that have been re-assembled from chunks. """ @@ -249,7 +264,8 @@ def cat(self): res += 1 return res - def validate(self, success: bool = True, log: logging.Logger = None) -> bool: + def validate(self, validator: 'DatasetValidator'): + """Component-specific validation.""" speaker_area_files = collections.defaultdict(list) if ('LanguageTable', 'speakerArea') in self.ds: for lg in self.ds.iter_rows('LanguageTable', 'id', 'speakerArea'): @@ -257,42 +273,38 @@ def validate(self, success: bool = True, log: logging.Logger = None) -> bool: speaker_area_files[lg['speakerArea']].append(lg['id']) for file in self: - content = None - if not file.url: - success = False - log_or_raise('File without URL: {}'.format(file.id), log=log) - elif file.scheme == 'file': - try: - content = file.read() - except FileNotFoundError: - success = False - log_or_raise( - 'Non-existing local file referenced: {} ' - 'You may have to run `cldf catmedia` to recombine files'.format(file.id), - log=log) - except Exception as e: # pragma: no cover - success = False - log_or_raise('Error reading {}: {}'.format(file.id, e), log=log) - elif file.scheme == 'data': - try: - content = file.read() - except Exception as e: # pragma: no cover - success = False - log_or_raise('Error reading {}: {}'.format(file.id, e), log=log) - if file.id in speaker_area_files and file.mimetype.subtype == 'geo+json' and content: - content = json.loads(content) - if content['type'] != 'Feature': - assert content['type'] == 'FeatureCollection' - for feature in content['features']: - lid = feature['properties'].get('cldf:languageReference') - if lid and lid in speaker_area_files[file.id]: - speaker_area_files[file.id].remove(lid) - if speaker_area_files[file.id]: - log_or_raise( - 'Error: Not all language IDs found in speakerArea GeoJSON: {}'.format( - speaker_area_files[file.id])) # pragma: no cover - - return success + self._validate_file(validator, file, speaker_area_files) + + def _validate_file(self, validator, file, speaker_area_files): + content = None + if not file.url: + validator.fail(f'File without URL: {file.id}') + elif file.scheme == 'file': + try: + content = file.read() + except FileNotFoundError: + validator.fail( + f'Non-existing local file referenced: {file.id} ' + 'You may have to run `cldf catmedia` to recombine files') + except Exception as e: # pragma: no cover # pylint: disable=W0718 + validator.fail(f'Error reading {file.id}: {e}') + elif file.scheme == 'data': + try: + content = file.read() + except Exception as e: # pragma: no cover # pylint: disable=W0718 + validator.fail(f'Error reading {file.id}: {e}') + if file.id in speaker_area_files and file.mimetype.subtype == 'geo+json' and content: + content = json.loads(content) + if content['type'] != 'Feature': + assert content['type'] == 'FeatureCollection' + for feature in content['features']: + lid = feature['properties'].get('cldf:languageReference') + if lid and lid in speaker_area_files[file.id]: + speaker_area_files[file.id].remove(lid) + if speaker_area_files[file.id]: + validator.fail( + f'Error: Not all language IDs found in speakerArea GeoJSON: ' + f'{speaker_area_files[file.id]}') # pragma: no cover Media = MediaTable @@ -327,23 +339,28 @@ def __eq__(self, other): @property def is_text(self) -> bool: + """Whether the mimetype describes text, and hence data should be read as str.""" return self.type == 'text' @property - def extension(self) -> typing.Union[None, str]: - return mimetypes.guess_extension('{}/{}'.format(self.type, self.subtype)) + def extension(self) -> Union[None, str]: + """Return a suitable filename extension for the mimetype.""" + return mimetypes.guess_extension(f'{self.type}/{self.subtype}') - def read(self, data: bytes) -> typing.Union[str, bytes]: + def read(self, data: bytes) -> StrOrBytes: + """Read data, inferring the encoding from the mimetype.""" if self.is_text and not isinstance(data, str): return data.decode(self.encoding) return data - def write(self, data: typing.Union[str, bytes], p: typing.Optional[pathlib.Path] = None) -> int: + def write(self, data: StrOrBytes, p: Optional[pathlib.Path] = None) -> Union[int, StrOrBytes]: + """The mimetype dictates how/if to encode data.""" res = data.encode(self.encoding) if self.is_text else data return p.write_bytes(res) if p else res -def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype): +def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from a data:// URL.""" spec, _, data = url.path.partition(',') if spec.endswith(';base64'): data = base64.b64decode(data) @@ -354,9 +371,8 @@ def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype): return data -def read_file_url(d: typing.Union[pathlib.Path, str], - url: urllib.parse.ParseResult, - mimetype: Mimetype) -> typing.Union[str, bytes]: +def read_file_url(d: PathType, url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from a file:// URL.""" path = url.path while path.startswith('/'): path = path[1:] @@ -368,5 +384,6 @@ def read_file_url(d: typing.Union[pathlib.Path, str], return mimetype.read(d.joinpath(urllib.parse.unquote(path)).read_bytes()) -def read_http_url(url: urllib.parse.ParseResult, mimetype: Mimetype): +def read_http_url(url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from an HTTP URL.""" return mimetype.read(urllib.request.urlopen(urllib.parse.urlunparse(url)).read()) diff --git a/src/pycldf/module.py b/src/pycldf/module.py new file mode 100644 index 0000000..01aa3dd --- /dev/null +++ b/src/pycldf/module.py @@ -0,0 +1,65 @@ +from typing import Union + +import attr +from csvw.metadata import TableGroup + +from pycldf.terms import TERMS, term_uri +from pycldf.util import pkg_path, MD_SUFFIX + +__all__ = ['get_module_impl'] + + +@attr.s +class Module: + """ + Class representing a CLDF Module. + + .. seealso:: https://github.com/cldf/cldf/blob/master/README.md#cldf-modules + """ + uri = attr.ib(validator=attr.validators.in_([t.uri for t in TERMS.classes.values()])) + fname = attr.ib() + + @property + def id(self) -> str: + """ + The local part of the term URI is interpreted as Module identifier. + """ + return self.uri.split('#')[1] + + def match(self, thing: Union[TableGroup, str]) -> bool: + """Check if the module described here matches thing.""" + if isinstance(thing, TableGroup): + return thing.common_props.get('dc:conformsTo') == term_uri(self.id) + if isinstance(thing, str): + return thing == self.fname + return False + + +_modules = [] + + +def get_module_impl(base_class, spec: Union[TableGroup, str]): + implementations = {cls.__name__: cls for cls in base_class.__subclasses__()} + for mod in get_modules(): + if mod.match(spec): + return implementations[mod.id] + + +def get_modules() -> list[Module]: + """ + We read supported CLDF modules from the default metadata files distributed with `pycldf`. + """ + global _modules # pylint: disable=global-statement + + if not _modules: + for p in pkg_path('modules').glob(f'*{MD_SUFFIX}'): + tg = TableGroup.from_file(p) + mod = Module( + tg.common_props['dc:conformsTo'], + tg.tables[0].url.string if tg.tables else None) + _modules.append(mod) + # prefer Wordlist over ParallelText (forms.csv) + _modules = sorted( + _modules, + key=lambda m: (m.id in ('Wordlist', 'ParallelText'), m.id == 'ParallelText')) + return _modules diff --git a/src/pycldf/orm.py b/src/pycldf/orm.py index d85558c..53f33d2 100644 --- a/src/pycldf/orm.py +++ b/src/pycldf/orm.py @@ -46,7 +46,7 @@ def custom_method(self): * ~35secs iterating over ``pycldf.Dataset.objects('ValueTable')`` """ import types -import typing +from typing import TYPE_CHECKING, Union, Optional, Any import decimal import functools import collections @@ -58,12 +58,14 @@ def custom_method(self): from pycldf.util import DictTuple from pycldf.sources import Reference -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover from pycldf.media import File # pragma: no cover -def to_json(s): +def to_json(s: Any) -> Union[str, float, None, list, dict]: + """Converts `s` to an object that can be serialized as JSON.""" if isinstance(s, (list, tuple)): return [to_json(ss) for ss in s] if isinstance(s, dict): @@ -77,7 +79,7 @@ def to_json(s): return str(s) -class Object: +class Object: # pylint: disable=too-many-instance-attributes """ Represents a row of a CLDF component table. @@ -95,7 +97,7 @@ class Object: # specified here: __component__ = None - def __init__(self, dataset: 'Dataset', row: dict): + def __init__(self, dataset: 'Dataset', row: 'RowType'): # Get a mapping of column names to pairs (CLDF property name, list-valued) for columns # present in the component specified by class name. cldf_cols = { @@ -103,29 +105,29 @@ def __init__(self, dataset: 'Dataset', row: dict): for k, v in vars(getattr(dataset.readonly_column_names, self.component)).items() if v} self._listvalued = set(v[0] for v in cldf_cols.values() if v[1]) - self.cldf = {} - self.data = collections.OrderedDict() + cldf_ = {} + self.data: collections.OrderedDict[str, Any] = collections.OrderedDict() for k, v in row.items(): # We go through the items of the row and slot them into the appropriate bags: self.data[k] = v if k in cldf_cols: - self.cldf[cldf_cols[k][0]] = v + cldf_[cldf_cols[k][0]] = v # Make cldf properties accessible as attributes: - self.cldf = types.SimpleNamespace(**self.cldf) - self.dataset = dataset - self.id = self.cldf.id - self.pk = None + self.cldf = types.SimpleNamespace(**cldf_) + self.dataset: 'Dataset' = dataset + self.id: str = self.cldf.id + self.pk: Optional[str] = None t = dataset[self.component_name()] if t.tableSchema.primaryKey and len(t.tableSchema.primaryKey) == 1: self.pk = self.data[dataset[self.component_name()].tableSchema.primaryKey[0]] - self.name = getattr(self.cldf, 'name', None) - self.description = getattr(self.cldf, 'description', None) + self.name: str = getattr(self.cldf, 'name', None) + self.description: str = getattr(self.cldf, 'description', None) def __repr__(self): - return '<{}.{} id="{}">'.format(self.__class__.__module__, self.__class__.__name__, self.id) + return f'<{self.__class__.__module__}.{self.__class__.__name__} id="{self.id}">' @classmethod - def component_name(cls) -> str: + def component_name(cls) -> str: # pylint: disable=C0116 return cls.__component__ or (cls.__name__ + 'Table') @property @@ -137,7 +139,8 @@ def component(self) -> str: return self.__class__.component_name() @property - def key(self) -> typing.Tuple[int, str, str]: + def key(self) -> tuple[int, str, str]: + """A key that is also unique across different Dataset instances.""" return id(self.dataset), self.__class__.__name__, self.id def __hash__(self): @@ -154,31 +157,32 @@ def _expand_uritemplate(self, attr, col): row as context. Thus, expansion is available as method on this row object. """ col = self.dataset[self.component, col] - variables = {k: v for k, v in vars(self.cldf).items()} + variables = dict(vars(self.cldf).items()) variables.update(self.data) if getattr(col, attr, None): return getattr(col, attr).expand(**variables) + return None # pragma: no cover - def aboutUrl(self, col='id') -> typing.Union[str, None]: + def aboutUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `aboutUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('aboutUrl', col) - def valueUrl(self, col='id') -> typing.Union[str, None]: + def valueUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `valueUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('valueUrl', col) - def propertyUrl(self, col='id') -> typing.Union[str, None]: + def propertyUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `propertyUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('propertyUrl', col) @functools.cached_property - def references(self) -> typing.Tuple[Reference]: + def references(self) -> tuple[Reference, ...]: """ `pycldf.Reference` instances associated with the object. @@ -192,7 +196,7 @@ def references(self) -> typing.Tuple[Reference]: multi=True, ) - def related(self, relation: str) -> typing.Union[None, 'Object']: + def related(self, relation: str) -> Optional['Object']: """ The CLDF ontology specifies several "reference properties". This method returns the first related object specified by such a property. @@ -202,7 +206,7 @@ def related(self, relation: str) -> typing.Union[None, 'Object']: """ if relation in self._listvalued: raise ValueError( - '{} is list-valued, use `all_related` to retrieve related objects'.format(relation)) + f'{relation} is list-valued, use `all_related` to retrieve related objects') fk = getattr(self.cldf, relation, None) if fk: ref = self.dataset.get_foreign_key_reference(self.component_name(), relation) @@ -213,8 +217,9 @@ def related(self, relation: str) -> typing.Union[None, 'Object']: return self.dataset.get_object(TERMS[relation].references, fk, pk=True) raise NotImplementedError('pycldf does not support foreign key constraints ' 'referencing columns other than CLDF id or primary key.') + return None # pragma: no cover - def all_related(self, relation: str) -> typing.Union[DictTuple, list]: + def all_related(self, relation: str) -> Union[DictTuple, list]: """ CLDF reference properties can be list-valued. This method returns all related objects for such a property. @@ -229,57 +234,58 @@ def all_related(self, relation: str) -> typing.Union[DictTuple, list]: class _WithLanguageMixin: @property - def language(self): + def language(self) -> Object: # pylint: disable=C0116 return self.related('languageReference') @property - def languages(self): + def languages(self) -> Union[DictTuple, list]: # pylint: disable=C0116 return self.all_related('languageReference') class _WithParameterMixin: @functools.cached_property - def parameter(self): + def parameter(self) -> Object: # pylint: disable=C0116 return self.related('parameterReference') @property - def parameters(self): + def parameters(self) -> Union[DictTuple, list]: # pylint: disable=C0116 return self.all_related('parameterReference') -class Borrowing(Object): +class Borrowing(Object): # pylint: disable=C0115 @property - def targetForm(self): + def targetForm(self) -> Object: # pylint: disable=C0116,C0103 return self.related('targetFormReference') @property - def sourceForm(self): + def sourceForm(self) -> Object: # pylint: disable=C0116,C0103 return self.related('sourceFormReference') -class Code(Object, _WithParameterMixin): +class Code(Object, _WithParameterMixin): # pylint: disable=C0115 pass -class Cognateset(Object): +class Cognateset(Object): # pylint: disable=C0115 @property - def cognates(self): + def cognates(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('CognateTable') if v.cognateset == self) -class Cognate(Object): +class Cognate(Object): # pylint: disable=C0115 @property - def form(self): + def form(self): # pylint: disable=C0116 return self.related('formReference') @property - def cognateset(self): + def cognateset(self): # pylint: disable=C0116 return self.related('cognatesetReference') -class Contribution(Object): +class Contribution(Object): # pylint: disable=C0115 @property def sentences(self): + """Returns the ordered sentences of a text in a TextCorpus.""" res = [] if self.dataset.module == 'TextCorpus': # Return the list of lines, ordered by position. @@ -293,35 +299,38 @@ def sentences(self): return res -class Entry(Object, _WithLanguageMixin): +class Entry(Object, _WithLanguageMixin): # pylint: disable=C0115 @property - def senses(self): + def senses(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('SenseTable') if self in v.entries) -class Example(Object, _WithLanguageMixin): +class Example(Object, _WithLanguageMixin): # pylint: disable=C0115 @property - def metaLanguage(self): + def metaLanguage(self): # pylint: disable=C0116,C0103 return self.related('metaLanguageReference') @property - def igt(self): - return '{0}\n{1}\n{2}'.format( - self.cldf.primaryText, - tabulate([self.cldf.gloss], self.cldf.analyzedWord, tablefmt='plain'), - self.cldf.translatedText, - ) + def igt(self) -> str: + """The example in a plain text interlinear glossed representation.""" + aligned = tabulate([self.cldf.gloss], self.cldf.analyzedWord, tablefmt='plain') + return f'{self.cldf.primaryText}\n{aligned}\n{self.cldf.translatedText}' @property def text(self): """ - Examples in a TextCorpus are interpreted as lines of text. + Examples in a TextCorpus are interpreted as lines of a text, which in turn is the + module-specific interpretation of a CLDF contribution. """ if self.dataset.module == 'TextCorpus' and hasattr(self.cldf, 'contributionReference'): return self.related('contributionReference') + return None # pragma: no cover @property - def alternative_translations(self): + def alternative_translations(self) -> list['Example']: + """ + Returns alternative translations for the Example. + """ res = [] if hasattr(self.cldf, 'exampleReference'): # There's a self-referential foreign key. We assume this to link together full examples @@ -332,17 +341,17 @@ def alternative_translations(self): return res -class Form(Object, _WithLanguageMixin, _WithParameterMixin): +class Form(Object, _WithLanguageMixin, _WithParameterMixin): # pylint: disable=C0115 pass -class FunctionalEquivalentset(Object): +class FunctionalEquivalentset(Object): # pylint: disable=C0115 pass -class FunctionalEquivalent(Object): +class FunctionalEquivalent(Object): # pylint: disable=C0115 @property - def form(self): # pragma: no cover + def form(self): # pragma: no cover # pylint: disable=C0116 return self.related('formReference') @@ -362,15 +371,16 @@ class Language(Object): 'MultiPolygon' """ @property - def lonlat(self) -> typing.Union[None, typing.Tuple[decimal.Decimal]]: + def lonlat(self) -> Optional[tuple[decimal.Decimal, decimal.Decimal]]: """ :return: (longitude, latitude) pair if coordinates are defined, else `None`. """ if hasattr(self.cldf, 'latitude'): return (self.cldf.longitude, self.cldf.latitude) + return None # pragma: no cover @property - def as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]]: + def as_geojson_feature(self) -> Union[None, dict[str, Any]]: """ `dict` suitable for serialization as GeoJSON Feature object, with the point coordinate as geographic data. @@ -383,19 +393,21 @@ def as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]] "geometry": {"type": "Point", "coordinates": self.lonlat}, "properties": vars(self.cldf), }) + return None # pragma: no cover @functools.cached_property - def speaker_area(self) -> typing.Union[None, 'File']: + def speaker_area(self) -> Optional['File']: """ A `pycldf.media.File` object containing information about the speaker area of the language. """ - from pycldf.media import File + from pycldf.media import File # pylint: disable=C0415 if getattr(self.cldf, 'speakerArea', None): return File.from_dataset(self.dataset, self.related('speakerArea')) + return None # pragma: no cover @functools.cached_property - def speaker_area_as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]]: + def speaker_area_as_geojson_feature(self) -> Optional[dict[str, Any]]: """ `dict` suitable for serialization as GeoJSON Feature object, with a speaker area Polygon or MultiPolygon as geographic data. @@ -411,13 +423,14 @@ def speaker_area_as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, else: assert res['type'] == 'Feature' return res + return None # pragma: no cover @property - def values(self): + def values(self) -> DictTuple: # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.languages) @property - def forms(self): + def forms(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.languages) def glottolog_languoid(self, glottolog_api): @@ -433,42 +446,50 @@ def glottolog_languoid(self, glottolog_api): return glottolog_api.languoid(self.cldf.glottocode) -class Media(Object): +class Media(Object): # pylint: disable=C0115 @property - def downloadUrl(self): + def downloadUrl(self): # pylint: disable=C0116,C0103 if hasattr(self.cldf, 'downloadUrl'): return self.cldf.downloadUrl return self.valueUrl() -class ParameterNetworkEdge(Object): +class ParameterNetworkEdge(Object): # pylint: disable=C0115 __component__ = 'ParameterNetwork' class Parameter(Object): + """ + The Parameter class provides support for interpreting a parameter's string values as typed + data and reading it accordingly. See `Value` below. + """ @functools.cached_property - def columnSpec(self): - columnSpec = getattr(self.cldf, 'columnSpec', None) + def columnSpec(self) -> Optional[csvw.metadata.Column]: # pylint: disable=C0103 + """Turns a JSON column specification in a column value into a Column object.""" + columnSpec = getattr(self.cldf, 'columnSpec', None) # pylint: disable=C0103 if columnSpec: return csvw.metadata.Column.fromvalue(columnSpec) + return None @functools.cached_property - def datatype(self): + def datatype(self) -> Optional[csvw.metadata.Datatype]: + """Turns a JSON datatype description in a column value into a Datatype object.""" if 'datatype' in self.data \ and self.dataset['ParameterTable', 'datatype'].datatype.base == 'json': if self.data['datatype']: return csvw.metadata.Datatype.fromvalue(self.data['datatype']) + return None @property - def codes(self): + def codes(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('CodeTable') if v.parameter == self) @property - def values(self): + def values(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.parameters) @property - def forms(self): + def forms(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.parameters) def concepticon_conceptset(self, concepticon_api): @@ -484,17 +505,17 @@ def concepticon_conceptset(self, concepticon_api): return concepticon_api.conceptsets.get(self.cldf.concepticonReference) -class Sense(Object): +class Sense(Object): # pylint: disable=C0115 @property - def entry(self): + def entry(self): # pylint: disable=C0116 return self.related('entryReference') @property - def entries(self): + def entries(self): # pylint: disable=C0116 return self.all_related('entryReference') -class Tree(Object): +class Tree(Object): # pylint: disable=C0115 pass @@ -530,6 +551,10 @@ class Value(Object, _WithLanguageMixin, _WithParameterMixin): """ @property def typed_value(self): + """ + If a parameter includes information about the datatype of its values, this information is + used here to convert the value accordingly. + """ if self.parameter.columnSpec: return self.parameter.columnSpec.read(self.cldf.value) if self.parameter.datatype: @@ -537,9 +562,9 @@ def typed_value(self): return self.cldf.value @property - def code(self): + def code(self): # pylint: disable=C0116 return self.related('codeReference') @property - def examples(self): + def examples(self): # pylint: disable=C0116 return self.all_related('exampleReference') diff --git a/src/pycldf/schemautil.py b/src/pycldf/schemautil.py new file mode 100644 index 0000000..5467c35 --- /dev/null +++ b/src/pycldf/schemautil.py @@ -0,0 +1,48 @@ +from typing import Union + +from csvw.metadata import Column, Table +from clldutils import jsonlib + +from pycldf.terms import TERMS +from pycldf.util import MD_SUFFIX, pkg_path + +ColSpecType = Union[str, dict, Column] +TableSpecType = Union[str, dict, Table] +TableType = Union[str, Table] +ColType = Union[str, Column] + + +def make_column(spec: ColSpecType) -> Column: + """ + Create a `Column` instance from `spec`. + + .. code-block:: python + + >>> make_column('id').name + 'id' + >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name + 'ID' + >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base + 'boolean' + >>> type(make_column(make_column('id'))) + + """ + if isinstance(spec, str): + if spec in TERMS.by_uri: + return TERMS.by_uri[spec].to_column() + return Column(name=spec, datatype='string') + if isinstance(spec, dict): + return Column.fromvalue(spec) + if isinstance(spec, Column): + return spec + raise TypeError(spec) + + +def make_table(spec: TableSpecType) -> Table: + if isinstance(spec, str): + return Table.fromvalue(jsonlib.load(pkg_path('components', f'{spec}{MD_SUFFIX}'))) + if isinstance(spec, dict): + return Table.fromvalue(spec) + if isinstance(spec, Table): + return spec + raise TypeError(spec) # pragma: no cover diff --git a/src/pycldf/sources.py b/src/pycldf/sources.py index 1d8ec7a..946b308 100644 --- a/src/pycldf/sources.py +++ b/src/pycldf/sources.py @@ -1,12 +1,16 @@ +""" +Functionality to handle BibTeX source data of Datasets. +""" import re import types -import typing +from typing import Optional, Union, Literal import pathlib import zipfile import tempfile import collections from urllib.error import HTTPError from urllib.request import urlopen, urlretrieve +from collections.abc import Generator, Iterable, KeysView from csvw.metadata import is_url from simplepybtex import database @@ -14,7 +18,8 @@ from clldutils.source import Source as BaseSource from clldutils.source import ID_PATTERN -from pycldf.util import update_url +from pycldf.urlutil import update_url +from pycldf.fileutil import PathType __all__ = ['Source', 'Sources', 'Reference'] @@ -22,13 +27,14 @@ class Writer(BaseWriter): + """We overwrite pybtex's writer to ensure data is wrapped in curly braces.""" def quote(self, s): self.check_braces(s) return '{%s}' % s def _encode(self, text): # - # FIXME: We overwrite a private method here! + # FIXME: We overwrite a private method here! pylint: disable=fixme # return text @@ -38,7 +44,8 @@ class Source(BaseSource): A bibliograhical record, specifying a source for some data in a CLDF dataset. """ @property - def entry(self): + def entry(self) -> database.Entry: + """Converts Source to a pybtex Entry.""" persons = collections.OrderedDict([ ('author', list(self.persons(self.get('author', '')))), ('editor', list(self.persons(self.get('editor', '')))), @@ -53,10 +60,10 @@ def __str__(self): return self.text() def __repr__(self): - return '<%s %s>' % (self.__class__.__name__, self.id) + return f'<{self.__class__.__name__} {self.id}>' @classmethod - def from_entry(cls, key, entry, **_kw): + def from_entry(cls, key: str, entry: database.Entry, **_kw): """ Create a `cls` instance from a `simplepybtex` entry object. @@ -65,15 +72,16 @@ def from_entry(cls, key, entry, **_kw): :param _kw: Non-bib-metadata keywords to be passed for `cls` instantiation :return: `cls` instance """ - _kw.update({k: v for k, v in entry.fields.items()}) + _kw.update(entry.fields.items()) _kw.setdefault('_check_id', False) for role in entry.persons: if entry.persons[role]: - _kw[role] = ' and '.join('%s' % p for p in entry.persons[role]) + _kw[role] = ' and '.join(f'{p}' for p in entry.persons[role]) return cls(entry.type, key, **_kw) @staticmethod - def persons(s): + def persons(s: str) -> Generator[database.Person, None, None]: + """Yields persons encoded in an author names string.""" for name in re.split(r'\s+&\s+|\s+and\s+', s.strip()): if name: parts = name.split(',') @@ -83,26 +91,31 @@ def persons(s): else: yield database.Person(name) - def refkey(self, year_brackets='round'): - brackets = {None: ('', ''), 'round': ('(', ')'), 'square': ('[', ']'), 'curly': ('{', '}')} + def refkey(self, year_brackets: Union[None, Literal["round", "square", "curly"]] = 'round'): + """Compute an author-year type reference key for the item.""" + brackets = { + None: ('', ''), + 'round': ('(', ')'), + 'square': ('[', ']'), + 'curly': ('{', '}')}.get(year_brackets) persons = self.entry.persons.get('author') or self.entry.persons.get('editor', []) - s = ' '.join(persons[0].prelast_names + persons[0].last_names) if persons else 'n.a.' + names = ' '.join(persons[0].prelast_names + persons[0].last_names) if persons else 'n.a.' if len(persons) == 2: - s += ' and {}'.format(' '.join(persons[1].last_names)) + names += f" and {' '.join(persons[1].last_names)}" elif len(persons) > 2: - s += ' et al.' - return s.replace('{', '').replace('}', '') + ' {}{}{}'.format( - brackets[year_brackets][0], self.get('year', 'n.d.'), brackets[year_brackets][1]) + names += ' et al.' + names = names.replace('{', '').replace('}', '') + return f"{names} {brackets[0]}{self.get('year', 'n.d.')}{brackets[1]}" -class Reference(object): +class Reference: """ A reference connects a piece of data with a `Source`, typically adding some citation context \ often page numbers, or similar. """ - def __init__(self, source: Source, desc: typing.Union[str, None]): + def __init__(self, source: Source, desc: Optional[str]): if desc and ('[' in desc or ']' in desc or ';' in desc): - raise ValueError('invalid ref description: %s' % desc) + raise ValueError(f'invalid ref description: {desc}') self.source = source self.fields = types.SimpleNamespace(**self.source) if isinstance(self.source, dict) else {} self.description = desc @@ -115,14 +128,14 @@ def __str__(self): """ res = self.source.id if hasattr(self.source, 'id') else self.source if self.description: - res += '[%s]' % self.description + res += f'[{self.description}]' return res def __repr__(self): - return '<%s %s>' % (self.__class__.__name__, self) + return f'<{self.__class__.__name__} {self}>' -class Sources(object): +class Sources: """ A `dict` like container for all sources linked to data in a CLDF dataset. """ @@ -130,16 +143,17 @@ def __init__(self): self._bibdata = database.BibliographyData() @classmethod - def from_file(cls, fname): + def from_file(cls, fname: PathType) -> 'Sources': + """Instantiate an instance from the data in a BibTeX file.""" zipped = False res = cls() - if not is_url(fname): + if not is_url(str(fname)): fname = pathlib.Path(fname) if not fname.exists(): - fname = fname.parent / '{}.zip'.format(fname.name) + fname = fname.parent / f'{fname.name}.zip' zipped = True if fname.exists(): - assert fname.is_file(), 'Bibfile {} must be a file!'.format(fname) + assert fname.is_file(), f'Bibfile {fname} must be a file!' res.read(fname, zipped=zipped) else: res.read(fname) @@ -150,34 +164,34 @@ def __bool__(self): __nonzero__ = __bool__ - def keys(self): + def keys(self) -> KeysView[str]: # pylint: disable=C0116 return self._bibdata.entries.keys() - def items(self): + def items(self) -> Generator[Source, None, None]: # pylint: disable=C0116 for key, entry in self._bibdata.entries.items(): yield Source.from_entry(key, entry) def __iter__(self): return self.items() - def __len__(self): + def __len__(self) -> int: return len(self._bibdata.entries) - def __getitem__(self, item): + def __getitem__(self, item: str) -> Optional[Source]: try: return Source.from_entry(item, self._bibdata.entries[item]) - except KeyError: - raise ValueError('missing citekey: %s' % item) + except KeyError as e: + raise ValueError(f'missing citekey: {item}') from e - def __contains__(self, item): + def __contains__(self, item: str) -> bool: return item in self._bibdata.entries @staticmethod - def format_refs(*refs): - return ['%s' % ref for ref in refs] + def format_refs(*refs) -> list[str]: # pylint: disable=C0116 + return [f'{ref}' for ref in refs] @staticmethod - def parse(ref: str) -> typing.Tuple[str, str]: + def parse(ref: str) -> tuple[str, str]: """ Parse the string representation of a reference into source ID and context. @@ -191,14 +205,15 @@ def parse(ref: str) -> typing.Tuple[str, str]: pages = pages[:-1].strip() return sid, pages - def validate(self, refs): + def validate(self, refs: Union[str, list[str]]) -> None: + """Make sure refs are valid. If not, raises Exceptions.""" if not isinstance(refs, str) and any(r is None for r in refs): raise ValueError('empty reference in ref list (possibly caused by trailing separator)') for sid, _ in map(self.parse, [refs] if isinstance(refs, str) else refs): if sid not in self.keys(): - raise ValueError('missing source key: {0}'.format(sid)) + raise ValueError(f'missing source key: {sid}') - def expand_refs(self, refs: typing.Iterable[str], **kw) -> typing.Iterable[Reference]: + def expand_refs(self, refs: Iterable[str], **kw) -> Iterable[Reference]: """ Turn a list of string references into proper :class:`Reference` instances, looking up \ sources in `self`. @@ -217,7 +232,7 @@ def expand_refs(self, refs: typing.Iterable[str], **kw) -> typing.Iterable[Refer self._add_entries(Source('misc', sid, glottolog_id=sid), **kw) yield Reference(self[sid], pages) - def _add_entries(self, data, **kw): + def _add_entries(self, data: Union[Source, database.BibliographyData], **kw) -> None: if isinstance(data, Source): entries = [(data.id, data.entry)] elif hasattr(data, 'entries'): @@ -232,17 +247,20 @@ def _add_entries(self, data, **kw): for key, entry in entries: if kw.get('_check_id', False) and not ID_PATTERN.match(key): - raise ValueError('invalid source ID: %s' % key) + raise ValueError(f'invalid source ID: {key}') if key not in self._bibdata.entries: try: self._bibdata.add_entry(key, entry) except database.BibliographyDataError as e: # pragma: no cover - raise ValueError('%s' % e) + raise ValueError(f'{e}') from e - def read(self, fname, zipped=False, **kw): - if is_url(fname): + def read(self, fname: PathType, zipped=False, **kw): + """Read sources from a BibTex file (possibly specified via URL).""" + if is_url(str(fname)): + fname = str(fname) try: - content = urlopen(fname).read().decode('utf-8') + with urlopen(fname) as f: + content = f.read().decode('utf-8') except HTTPError as e: if '404' in str(e): fname = update_url( @@ -254,14 +272,15 @@ def read(self, fname, zipped=False, **kw): content = zf.read(zf.namelist()[0]).decode('utf8') else: if zipped: - with zipfile.ZipFile(fname, 'r') as zf: + with zipfile.ZipFile(str(fname), 'r') as zf: content = zf.read(zf.namelist()[0]).decode('utf8') else: content = pathlib.Path(fname).read_text(encoding='utf-8') self._add_entries( database.parse_string(content, bib_format='bibtex'), **kw) - def write(self, fname, ids=None, zipped=False, **kw): + def write(self, fname: PathType, ids=None, zipped=False, **_) -> Optional[pathlib.Path]: + """Write sources to a file (if there are any).""" if ids: bibdata = database.BibliographyData() for key, entry in self._bibdata.entries.items(): @@ -269,19 +288,21 @@ def write(self, fname, ids=None, zipped=False, **kw): bibdata.add_entry(key, entry) else: bibdata = self._bibdata + fname = pathlib.Path(fname) if bibdata.entries: - with pathlib.Path(fname).open('w', encoding='utf8') as fp: + with fname.open('w', encoding='utf8') as fp: Writer().write_stream(bibdata, fp) if zipped: with zipfile.ZipFile( - fname.parent / '{}.zip'.format(fname.name), + fname.parent / f'{fname.name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf: zf.write(fname, fname.name) fname.unlink() return fname + return None - def add(self, *entries: typing.Union[str, Source], **kw): + def add(self, *entries: Union[str, Source], **kw) -> None: """ Add a source, either specified as BibTeX string or as :class:`Source`. """ diff --git a/src/pycldf/stats.py b/src/pycldf/stats.py new file mode 100644 index 0000000..5f40839 --- /dev/null +++ b/src/pycldf/stats.py @@ -0,0 +1,40 @@ +""" +Functionality to compute summary statistics for a Dataset. +""" +import typing +import dataclasses +from collections.abc import Generator + +from pycldf.terms import TERMS + +if typing.TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['get_table_stats'] + + +def get_table_stats(ds: 'Dataset', exact: bool = False) -> list[tuple[str, str, int]]: + """Return a list of table statistics.""" + return [dataclasses.astuple(stats) for stats in _iter_stats(ds, exact)] + + +@dataclasses.dataclass(frozen=True) +class TableStats: + """A bag of attrs""" + fname: str + component: str + rowcount: int + + +def _iter_stats(ds: 'Dataset', exact: bool = False) -> Generator[TableStats, None, None]: + for table in ds.tables: + dctype = table.common_props.get('dc:conformsTo') + if dctype and '#' in dctype and dctype.split('#')[1] in TERMS: + dctype = TERMS[dctype.split('#')[1]].csvw_prop('name') + yield TableStats( + table.url.string, + dctype or '', + sum(1 for _ in table) if (exact or 'dc:extent' not in table.common_props) + else int(table.common_props.get('dc:extent'))) + if ds.sources: + yield TableStats(ds.bibname, 'Sources', len(ds.sources)) diff --git a/src/pycldf/terms.py b/src/pycldf/terms.py index 83c914a..6eb8dd2 100644 --- a/src/pycldf/terms.py +++ b/src/pycldf/terms.py @@ -1,16 +1,28 @@ +""" +Functionality to access the metadata about CLDF schema objects encoded in the ontology. +""" import re import json import types +import pathlib import warnings +import dataclasses import urllib.parse +from typing import Optional, Union, Callable, Any, TYPE_CHECKING +from collections.abc import Container from xml.etree import ElementTree import attr from csvw.metadata import Column +from clldutils import jsonlib from pycldf.util import pkg_path +from pycldf.fileutil import PathType -__all__ = ['term_uri', 'TERMS', 'get_column_names'] +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['term_uri', 'TERMS', 'get_column_names', 'sniff'] URL = 'http://cldf.clld.org/v1.0/terms.rdf' RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' @@ -19,42 +31,48 @@ DC = 'http://purl.org/dc/terms/' -def term_uri(name, terms=None, ns=URL): - if isinstance(name, Term): - return name.uri - if not name.startswith(ns): - sep = '' if ns.endswith('#') else '#' - name = sep.join([ns, name]) - if not terms or name in terms: - return name - return None +def qname(ns: str, lname: str) -> str: + """Return a qualified name in ElementTree notation.""" + return '{%s}%s' % (ns, lname) -def qname(ns, lname): - return '{%s}%s' % (ns, lname) +@dataclasses.dataclass +class NameSpec: # pylint: disable=C0115 + ns: str + lname: str + @property + def qname(self): # pylint: disable=C0116 + return qname(self.ns, self.lname) -def _get(e, subelementns, subelementlname, attrns=None, attrlname=None, converter=None): + +def _get( + e: ElementTree.Element, + subelement: NameSpec, + attrib: Optional[NameSpec] = None, + converter: Optional[Callable[[str], Any]] = None, +): """ :return: Text content or attribute value of a subelement of e. """ res = None - subelement = e.find(qname(subelementns, subelementlname)) + subelement = e.find(subelement.qname) if subelement is not None: - if not attrlname: + if not attrib: res = subelement.text else: - res = subelement.attrib[qname(attrns, attrlname)] + res = subelement.attrib[attrib.qname] if converter and res: res = converter(res) return res @attr.s -class Term(object): - name = attr.ib() - type = attr.ib(validator=attr.validators.in_(['Class', 'Property'])) - element = attr.ib() +class Term: + """A Term is an object described in the CLDF Ontology.""" + name: str = attr.ib() + type: str = attr.ib(validator=attr.validators.in_(['Class', 'Property'])) + element: ElementTree.Element = attr.ib() references = attr.ib(default=None) subtype = attr.ib(default=None) version = attr.ib(default=None, validator=attr.validators.matches_re(r'v[0-9]+(\.[0-9]+)+')) @@ -63,23 +81,30 @@ class Term(object): validator=attr.validators.optional(attr.validators.in_(['singlevalued', 'multivalued']))) @property - def uri(self): - return '{0}#{1}'.format(URL, self.name) + def uri(self) -> str: + """The Term URI.""" + return f'{URL}#{self.name}' @classmethod - def from_element(cls, e): - subClassOf = e.find(qname(RDFS, 'subClassOf')) - kw = dict( - name=e.attrib[qname(RDF, 'about')].split('#')[1], - version=_get( - e, DC, 'hasVersion', RDF, 'resource', + def from_element(cls, e: ElementTree.Element) -> 'Term': + """Instantiate a Term from an XML element parsed from the ontology.""" + subClassOf = e.find(qname(RDFS, 'subClassOf')) # pylint: disable=invalid-name + kw = { + 'name': e.attrib[qname(RDF, 'about')].split('#')[1], + 'version': _get( + e, + NameSpec(ns=DC, lname='hasVersion'), + attrib=NameSpec(ns=RDF, lname='resource'), converter=lambda s: 'v' + s.split('/v')[1].replace('/', '')) or 'v1.0', - type=e.tag.split('}')[1], - element=e, - cardinality=_get(e, DC, 'extent'), - references=_get( - e, DC, 'references', RDF, 'resource', converter=lambda s: s.split('#')[1]), - ) + 'type': e.tag.split('}')[1], + 'element': e, + 'cardinality': _get(e, NameSpec(ns=DC, lname='extent')), + 'references': _get( + e, + NameSpec(ns=DC, lname='references'), + attrib=NameSpec(ns=RDF, lname='resource'), + converter=lambda s: s.split('#')[1]), + } if kw['type'] == 'Class': kw['subtype'] = 'module' \ if subClassOf is not None \ @@ -87,10 +112,12 @@ def from_element(cls, e): 'http://www.w3.org/ns/dcat#Distribution' else 'component' return cls(**kw) - def csvw_prop(self, lname): - return _get(self.element, CSVW, lname, converter=lambda s: json.loads(s)) + def csvw_prop(self, lname: str) -> Any: + """Returns the JSON value of a property in the CSVW namespace.""" + return _get(self.element, NameSpec(ns=CSVW, lname=lname), converter=json.loads) - def to_column(self): + def to_column(self) -> Column: + """Returns a csvw Column instance configured according to the term spec.""" col = Column( name=self.csvw_prop('name') or self.element.find(qname(RDFS, 'label')).text, propertyUrl=self.element.attrib[qname(RDF, 'about')], @@ -101,7 +128,10 @@ def to_column(self): setattr(col, k, v) return col - def comment(self, one_line=False): + def comment(self, one_line=False) -> str: + """ + Parse a text comment from the XML element of the ontology. + """ c = self.element.find("{http://www.w3.org/2000/01/rdf-schema#}comment") try: xml = ElementTree.tostring(c, default_namespace='http://www.w3.org/1999/xhtml') @@ -116,17 +146,24 @@ def comment(self, one_line=False): return re.sub(r'\s+', ' ', res.replace('\n', ' ')) if one_line else res +TermDict = dict[str, Term] + + class Terms(dict): - def __init__(self, path=None): + """ + A dict of `Term`s keyed by local names. + """ + def __init__(self, path: Optional[PathType] = None): self._path = path or pkg_path('terms.rdf') r = ElementTree.parse(str(self._path)).getroot() terms = [Term.from_element(e) for e in r.findall(qname(RDF, 'Property'))] for e in r.findall(qname(RDFS, 'Class')): terms.append(Term.from_element(e)) dict.__init__(self, {t.name: t for t in terms}) - self.by_uri = {t.uri: t for t in terms} + self.by_uri: TermDict = {t.uri: t for t in terms} - def is_cldf_uri(self, uri): + def is_cldf_uri(self, uri: str) -> bool: + """Whether the given URL is a CLDF Ontology term URI.""" if uri and urllib.parse.urlparse(uri).netloc == 'cldf.clld.org': if uri not in self.by_uri: warnings.warn('If pycldf does not recognize valid CLDF URIs, You may be ' @@ -137,44 +174,103 @@ def is_cldf_uri(self, uri): return False @property - def properties(self): + def properties(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.type == 'Property'} @property - def classes(self): + def classes(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.type == 'Class'} @property - def modules(self): + def modules(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.subtype == 'module'} @property - def components(self): + def components(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.subtype == 'component'} +def term_uri(name: Union[Term, str], terms: Container[str] = None, ns: str = URL) -> Optional[str]: + """ + Returns a full term URI associated with `name`. + + If `terms` are provided, we make sure the URI is contained in `terms`. + """ + if isinstance(name, Term): + return name.uri + if not name.startswith(ns): # So this may be a local name, i.e. the fragment of a term URI. + sep = '' if ns.endswith('#') else '#' + name = sep.join([ns, name]) + if not terms or name in terms: + return name + return None + + TERMS = Terms() -def get_column_names(dataset, use_component_names=False, with_multiplicity=False): +def get_column_names( + dataset: 'Dataset', + use_component_names: bool = False, + with_multiplicity: bool = False, +) -> types.SimpleNamespace: + """ + Returns an object allowing programmatic access to the column names used for ontology terms + in a specific dataset. + + .. code-block:: python + + >>> from pycldf import Dataset + >>> from pycldf.terms import get_column_names + >>> ds = Dataset.from_metadata('tests/data/ds1.csv-metadata.json') + >>> res = get_column_names(ds, use_component_names=True) + >>> res.ValueTable.languageReference + 'Language_ID' + """ comp_names = { k: k if use_component_names else k.replace('Table', '').lower() + 's' for k in TERMS.components} + # Seed the result object with component names as attributes and None as value. name_map = types.SimpleNamespace(**{k: None for k in comp_names.values()}) for term, attr_ in comp_names.items(): - try: - table = dataset[term] + table = dataset.get(term) + if table: props = {} - for k in TERMS.properties: - try: - col = dataset[table, k] + for k in TERMS.properties: # Loop through properties in the ontology. + col = dataset.get((table, k)) + if col: if with_multiplicity: props[k] = (col.name, bool(col.separator)) else: props[k] = col.name - except KeyError: + else: props[k] = None setattr(name_map, attr_, types.SimpleNamespace(**props)) - except KeyError: - pass return name_map + + +def sniff(p: pathlib.Path) -> bool: + """ + Determine whether a file contains CLDF metadata. + + :param p: `pathlib.Path` object for an existing file. + :return: `True` if the file contains CLDF metadata, `False` otherwise. + """ + if not p.is_file(): # pragma: no cover + return False + try: + with p.open('rb') as fp: + c = fp.read(10) + try: + c = c.decode('utf8').strip() + except UnicodeDecodeError: + return False + if not c.startswith('{'): + return False + except (FileNotFoundError, OSError): # pragma: no cover + return False + try: + d = jsonlib.load(p) + except json.decoder.JSONDecodeError: + return False + return d.get('dc:conformsTo', '').startswith(URL) diff --git a/src/pycldf/trees.py b/src/pycldf/trees.py index 3a81c76..2612736 100644 --- a/src/pycldf/trees.py +++ b/src/pycldf/trees.py @@ -22,16 +22,16 @@ └─l4 """ import typing -import logging import pathlib -from clldutils.misc import log_or_raise from commonnexus import Nexus import newick -import pycldf from pycldf.media import MediaTable, File +if typing.TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + from pycldf.validators import DatasetValidator # pragma: no cover __all__ = ['Tree', 'TreeTable'] @@ -85,12 +85,14 @@ def newick(self, return newick.loads(self.newick_string(d=d), strip_comments=strip_comments)[0] -class TreeTable(pycldf.ComponentWithValidation): +class TreeTable: """ Container class for a `Dataset`'s TreeTable. """ - def __init__(self, ds: pycldf.Dataset): - super().__init__(ds) + def __init__(self, ds: 'Dataset'): + self.ds = ds + self.component = self.__class__.__name__ + self.table = ds[self.component] self.media = MediaTable(ds) self.media_rows = {row[self.media.id_col.name]: row for row in ds['MediaTable']} self.cols = { @@ -107,25 +109,16 @@ def __iter__(self) -> typing.Generator[Tree, None, None]: row, File(self.media, self.media_rows[row[self.cols['mediaReference'].name]])) - def validate(self, - success: bool = True, - log: logging.Logger = None) -> bool: + def validate(self, validator: 'DatasetValidator'): lids = {r['id'] for r in self.ds.iter_rows('LanguageTable', 'id')} for tree in self: try: nwk = tree.newick() except KeyError: - log_or_raise( - 'No newick tree found for name "{}"'.format(tree.name), - log=log) - success = False + validator.fail(f'No newick tree found for name "{tree.name}"') nwk = None if nwk: for node in nwk.walk(): if node.name and (node.name not in lids): - log_or_raise( - 'Newick node label "{}" is not a LanguageTable ID'.format(node.name), - log=log) - success = False - return success + validator.fail(f'Newick node label "{node.name}" is not a LanguageTable ID') diff --git a/src/pycldf/urlutil.py b/src/pycldf/urlutil.py new file mode 100644 index 0000000..66715f4 --- /dev/null +++ b/src/pycldf/urlutil.py @@ -0,0 +1,37 @@ +""" +Functionality to manipulate URLs. +""" +from typing import Callable, Union +import urllib.parse + +__all__ = ['update_url', 'sanitize_url', 'url_without_fragment'] + + +def update_url( + url: Union[str, urllib.parse.ParseResult], + updater: Callable[[urllib.parse.ParseResult], tuple[str, str, str, str, str]], +) -> Union[str, None]: + """Generic update function for URLs.""" + if url is None: + return None + if isinstance(url, str): + url = urllib.parse.urlparse(url) + return urllib.parse.urlunsplit(updater(url)) or None + + +def sanitize_url(url: str) -> str: + """ + Removes auth credentials from a URL. + """ + def fix(u): + host = u.hostname + if u.port: + host += f':{u.port}' + return (u.scheme, host, u.path, u.query, u.fragment) + + return update_url(url, fix) + + +def url_without_fragment(url: Union[str, urllib.parse.ParseResult]) -> str: + """Removes fragment from URL.""" + return update_url(url, lambda u: (u.scheme, u.hostname, u.path, u.query, '')) diff --git a/src/pycldf/util.py b/src/pycldf/util.py index c15626b..acd465f 100644 --- a/src/pycldf/util.py +++ b/src/pycldf/util.py @@ -1,77 +1,54 @@ -import re -import html -import math -import string -import typing +import shutil +from typing import Optional, TYPE_CHECKING, Any import pathlib import itertools import collections import urllib.parse +import urllib.request -from clldutils.misc import slug -import pycldf +from csvw.metadata import is_url, Link +from clldutils.path import git_describe -__all__ = [ - 'pkg_path', 'multislice', 'resolve_slices', 'DictTuple', 'metadata2markdown', 'qname2url', - 'sanitize_url', 'update_url', 'iter_uritemplates', 'url_without_fragment', - 'splitfile', 'catfile'] +from pycldf.fileutil import PathType +from pycldf.urlutil import sanitize_url +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover -def splitfile(p, chunksize: int, total: typing.Optional[int] = None) -> typing.List[pathlib.Path]: - """ - :param p: Path of the file to split. - :param chunksize: The maximal size of the chunks the file will be split into. - :param total: The size of the input file. - :return: The list of paths of files that the input has been split into. - """ - total = total or p.stat().st_size - if total <= chunksize: # Nothing to do. - return [p] - nchunks = math.ceil(total / chunksize) - suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3 - suffixes = [ - ''.join(t) for t in - itertools.combinations_with_replacement(string.ascii_lowercase, suffix_length)] - - res = [] - with p.open('rb') as f: - chunk = f.read(chunksize) - while chunk: - pp = p.parent.joinpath('{}.{}'.format(p.name, suffixes.pop(0))) - pp.write_bytes(chunk) - res.append(pp) - chunk = f.read(chunksize) # read the next chunk - - p.unlink() - return res +__all__ = [ + 'pkg_path', 'multislice', 'resolve_slices', 'DictTuple', 'qname2url', + 'iter_uritemplates', 'MD_SUFFIX', 'GitRepository'] +MD_SUFFIX = '-metadata.json' -def catfile(p: pathlib.Path) -> bool: - """ - Restore a file that has been split into chunks. - We determine if a file has been split by looking for files in the parent directory with suffixes - as created by `splitfile`. +class GitRepository: # pylint: disable=too-few-public-methods + """ + CLDF datasets are often created from data curated in git repositories. If this is the case, we + exploit this to provide better provenance information in the dataset's metadata. """ - if p.exists(): # Nothing to do. - return False - # Check, whether the file has been split. - suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name} - if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes): - # ok, let's concatenate the files: - with p.open('wb') as f: - for suffix in sorted(suffixes): - if re.fullmatch(r'\.[a-z]{2,3}', suffix): - f.write(suffixes[suffix].read_bytes()) - suffixes[suffix].unlink() - return True - return False # pragma: no cover - - -def url_without_fragment(url: typing.Union[str, urllib.parse.ParseResult]) -> str: - if isinstance(url, str): - url = urllib.parse.urlparse(url) - return urllib.parse.urlunparse(list(url[:5]) + ['']) + def __init__( + self, url: str, clone: Optional[PathType] = None, version: Optional[str] = None, **dc): + # We remove credentials from the URL immediately to make sure this isn't leaked into + # CLDF metadata. Such credentials might be present in URLs read via gitpython from + # remotes. + self.url = sanitize_url(url) + self.clone = clone + self.version = version + self.dc = dc + + def json_ld(self) -> collections.OrderedDict[str, Any]: + """The repository described in JSON-LD.""" + res = collections.OrderedDict([ + ('rdf:about', self.url), + ('rdf:type', 'prov:Entity'), + ]) + if self.version: + res['dc:created'] = self.version + elif self.clone: + res['dc:created'] = git_describe(self.clone) + res.update({f'dc:{k}': self.dc[k] for k in sorted(self.dc)}) + return res def iter_uritemplates(table): @@ -83,25 +60,8 @@ def iter_uritemplates(table): yield obj, prop, tmpl -def sanitize_url(url: str) -> str: - """ - Removes auth credentials from a URL. - """ - def fix(u): - host = u.hostname - if u.port: - host += ':{}'.format(u.port) - return (u.scheme, host, u.path, u.query, u.fragment) - - return update_url(url, fix) - - -def update_url(url: str, updater: typing.Callable[[urllib.parse.ParseResult], tuple]) -> str: - return urllib.parse.urlunsplit(updater(urllib.parse.urlparse(url))) or None - - def pkg_path(*comps): - return pathlib.Path(pycldf.__file__).resolve().parent.joinpath(*comps) + return pathlib.Path(__file__).resolve().parent.joinpath(*comps) def multislice(sliceable, *slices): @@ -142,7 +102,7 @@ class DictTuple(tuple): def __new__(cls, items, **kw): return super(DictTuple, cls).__new__(cls, tuple(items)) - def __init__(self, items, key=lambda i: i.id, multi=False): + def __init__(self, _, key=lambda i: i.id, multi=False): """ If `key` does not return unique values for all items, you may pass `multi=True` to retrieve `list`s of matching items for `l[key]`. @@ -157,7 +117,7 @@ def __getitem__(self, item): if self._multi: return [self[i] for i in self._d[item]] return self[self._d[item][0]] - return super(DictTuple, self).__getitem__(item) + return super().__getitem__(item) def qname2url(qname): @@ -174,151 +134,48 @@ def qname2url(qname): return qname.replace(prefix + ':', uri) -def metadata2markdown(ds: 'pycldf.Dataset', - path: typing.Union[str, pathlib.Path], - rel_path: typing.Optional[str] = './') -> str: +def copy_dataset(ds: 'Dataset', dest: PathType, mdname: str = None) -> pathlib.Path: """ - Render the metadata of a dataset as markdown. - - :param ds: `pycldf.Dataset` instance - :param path: `pathlib.Path` of the metadata file - :param rel_path: `str` to use a relative path when creating links to data files - :return: `str` with markdown formatted text + Copy metadata, data and sources to files in `dest`. """ - path = pathlib.Path(path) + from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel - def qname2link(qname, html=False): - url = qname2url(qname) - if url: - if html: - return '{}'.format(url, qname) - return '[{}]({})'.format(qname, url) - return qname + dest = pathlib.Path(dest) + if not dest.exists(): + dest.mkdir(parents=True) - def htmlify(obj, key=None): - """ - For inclusion in tables we must use HTML for lists. - """ - if isinstance(obj, list): - return '
      {}
    '.format( - ''.join('
  • {}
  • '.format(htmlify(item, key=key)) for item in obj)) - if isinstance(obj, dict): - if key == 'prov:wasGeneratedBy' \ - and set(obj.keys()).issubset({'dc:title', 'dc:description', 'dc:relation'}): - desc = obj.get('dc:description') or '' - if obj.get('dc:relation'): - desc = (desc + '
    ') if desc else desc - desc += '{1}'.format(rel_path, obj['dc:relation']) - return '{}: {}'.format(obj.get('dc:title') or '', desc) - - if obj.get('rdf:type') == 'prov:Entity' and 'rdf:about' in obj: - label = obj.get('dc:title') - if (not label) or label == 'Repository': - label = obj['rdf:about'] - url = obj['rdf:about'] - if ('github.com' in url) and ('/tree/' not in url) and ('dc:created' in obj): - tag = obj['dc:created'] - if '-g' in tag: - tag = tag.split('-g')[-1] - url = '{}/tree/{}'.format(url, tag) - if label == obj['rdf:about']: - label = label.split('github.com/')[-1] - return '{} {}'.format(url, label, obj.get('dc:created') or '') - items = [] - for k, v in obj.items(): - items.append('
    {}
    {}
    '.format( - qname2link(k, html=True), html.escape(str(v)))) - return '
    {}
    '.format(''.join(items)) - return str(obj) - - def properties(obj): - res = [] - if obj.common_props.get('dc:description'): - res.append(obj.common_props['dc:description'] + '\n') - res.append('property | value\n --- | ---') - for k, v in obj.common_props.items(): - if not v: - continue - if k not in ('dc:description', 'dc:title', 'dc:source'): - if k == 'dc:conformsTo': - v = '[CLDF {}]({})'.format(v.split('#')[1], v) - res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k))) - res.append('') - return '\n'.join(res) - - def colrow(col, fks, pk): - dt = '`{}`'.format(col.datatype.base if col.datatype else 'string') - if col.datatype: - if col.datatype.format: - if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format): - dt += '
    Valid choices:
    ' - dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|')) - elif col.datatype.base == 'string': - dt += '
    Regex: `{}`'.format(col.datatype.format) - if col.datatype.minimum: - dt += '
    ≥ {}'.format(col.datatype.minimum) - if col.datatype.maximum: - dt += '
    ≤ {}'.format(col.datatype.maximum) - if col.separator: - dt = 'list of {} (separated by `{}`)'.format(dt, col.separator) - desc = col.common_props.get('dc:description', '').replace('\n', ' ') - - if col.name in pk: - desc = (desc + '
    ') if desc else desc - desc += 'Primary key' - - if col.name in fks: - desc = (desc + '
    ') if desc else desc - desc += 'References [{}::{}](#table-{})'.format( - fks[col.name][1], fks[col.name][0], slug(fks[col.name][1])) - elif col.propertyUrl \ - and col.propertyUrl.uri == "http://cldf.clld.org/v1.0/terms.rdf#source" \ - and 'dc:source' in ds.properties: - desc = (desc + '
    ') if desc else desc - desc += 'References [{}::BibTeX-key]({}{})'.format( - ds.properties['dc:source'], rel_path, ds.properties['dc:source']) - - return ' | '.join([ - '[{}]({})'.format(col.name, col.propertyUrl) - if col.propertyUrl else '`{}`'.format(col.name), - dt, - desc, - ]) + from_url = is_url(ds.tablegroup.base) + ds = ds.__class__.from_metadata( + ds.tablegroup.base if from_url else ds.tablegroup._fname) # pylint: disable=W0212 - title = ds.properties.get('dc:title', ds.module) - - res = ['# {}\n'.format(title)] - if path.suffix == '.json': - res.append('**CLDF Metadata**: [{0}]({1}{0})\n'.format(path.name, rel_path)) - if 'dc:source' in ds.properties: - src = None - if pathlib.Path(ds.directory).joinpath(ds.properties['dc:source']).exists(): - src = ds.properties['dc:source'] - elif pathlib.Path(ds.directory).joinpath(ds.properties['dc:source'] + '.zip').exists(): - src = ds.properties['dc:source'] + '.zip' - if src: - res.append('**Sources**: [{0}]({1}{0})\n'.format(src, rel_path)) - res.append(properties(ds.tablegroup)) + _getter = urllib.request.urlretrieve if from_url else shutil.copy + try: + _getter(ds.bibpath, dest / ds.bibname) + ds.properties['dc:source'] = ds.bibname + except: # pragma: no cover # noqa pylint: disable=W0702 + # Sources are optional + pass for table in ds.tables: - fks = { - fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) - for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} - src = None - if pathlib.Path(ds.directory).joinpath(table.url.string).exists(): - src = table.url.string - elif pathlib.Path(ds.directory).joinpath(table.url.string + '.zip').exists(): - src = table.url.string + '.zip' - if src: - res.append('\n## Table [{1}]({2}{3})\n'.format( - slug(table.url.string), table.url, rel_path, src)) - else: - res.append('\n## Table {1}\n'.format( - slug(table.url.string), table.url)) - res.append(properties(table)) - res.append('\n### Columns\n') - res.append('Name/Property | Datatype | Description') - res.append(' --- | --- | --- ') - for col in table.tableSchema.columns: - res.append(colrow(col, fks, table.tableSchema.primaryKey)) - return '\n'.join(res) + fname = table.url.resolve(table.base) + name = pathlib.Path(urllib.parse.urlparse(fname).path).name if from_url else fname.name + _getter(fname, dest / name) + table.url = Link(name) + + for fk in table.tableSchema.foreignKeys: + fk.reference.resource = Link(pathlib.Path(fk.reference.resource.string).name) + mdpath = dest.joinpath( + mdname or # noqa: W504 + (ds.tablegroup.base.split('/')[-1] if from_url + else ds.tablegroup._fname.name)) # pylint: disable=W0212 + if 'MediaTable' in ds: + for f in MediaTable(ds): + if f.scheme == 'file': + if f.local_path().exists(): + target = dest / urllib.parse.unquote(f.relpath) + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(f.local_path(), target) + if from_url: + del ds.tablegroup.at_props['base'] # pragma: no cover + ds.write_metadata(fname=mdpath) + return mdpath diff --git a/src/pycldf/validators.py b/src/pycldf/validators.py index c2c1afe..cc894ee 100644 --- a/src/pycldf/validators.py +++ b/src/pycldf/validators.py @@ -1,4 +1,9 @@ +# pylint: disable=cyclic-import +""" +Functionality to validate a Dataset. +""" import re +import pathlib import warnings import functools from typing import Optional, Callable, TYPE_CHECKING @@ -6,36 +11,99 @@ import dataclasses from clldutils.misc import log_or_raise +from csvw.metadata import TableGroup, is_url from pycldf.terms import Terms -from pycldf.util import iter_uritemplates +from pycldf.util import iter_uritemplates, pkg_path, MD_SUFFIX if TYPE_CHECKING: # pragma: no cover - from pycldf.dataset import Dataset, Table, RowType, Column + from pycldf import Dataset, Table, RowType, Column + +__all__ = ['RowValidatorType', 'validate'] RowValidatorType = Callable[['Dataset', 'Table', 'Column', 'RowType'], None] +def validate( + dataset: 'Dataset', + terms: Terms, + log: Optional[logging.Logger], + row_validators: list[tuple[Optional[str], str, RowValidatorType]], +) -> bool: + """Wraps Validator instantiation and calling into one.""" + return DatasetValidator( + dataset=dataset, + success=True, + terms=terms, + log=log, + row_validators=row_validators, + )() + + @dataclasses.dataclass class DatasetValidator: + """Some state to simplify running individual validation steps.""" dataset: 'Dataset' - success: bool - terms: Terms - log: Optional[logging.Logger] - row_validators: list[tuple[Optional[str], str, RowValidatorType]] + success: bool = True + terms: Terms = None + log: Optional[logging.Logger] = None + row_validators: list[tuple[Optional[str], str, RowValidatorType]] \ + = dataclasses.field(default_factory=list) def __post_init__(self): self.row_validators.extend(VALIDATORS) - def fail(self, reason): + def fail(self, reason): # pylint: disable=C0116 self.success = False log_or_raise(reason, log=self.log) - def warn(self, msg, *args): + def warn(self, msg, *args): # pylint: disable=C0116 if self.log: self.log.warning(msg, *args) - def validate_rows(self, table): + def info(self, msg, *args): # pylint: disable=C0116 + if self.log: + self.log.info(msg, *args) + + def __call__(self) -> bool: + """Run the full validation.""" + default_tg = TableGroup.from_file( + pkg_path('modules', f'{self.dataset.module}{MD_SUFFIX}')) + # Make sure, all required tables and columns are present and consistent. + for default_table in default_tg.tables: + self._validate_default_objects(default_table) + + for table in self.dataset.tables: + self._validate_table_schema(table) + self._validate_columns(table) + + fname = pathlib.Path(table.url.resolve(table._parent.base)) # pylint: disable=W0212 + fexists = fname.exists() + if (not fexists) and fname.parent.joinpath(f'{fname.name}.zip').exists(): + self.info(f'Reading data from zipped table: {fname}.zip') + fexists = True # csvw already handles this case, no need to adapt paths. + if is_url(table.url.resolve(table._parent.base)) or fexists: # pylint: disable=W0212 + self._validate_rows(table) + if not table.check_primary_key(log=self.log): + self.fail('Primary key check failed.') + else: + self.fail(f'{fname} does not exist') + + if not self.dataset.tablegroup.check_referential_integrity(log=self.log): + self.fail('Referential integrity check failed') + + self._validate_components() + return self.success + + def _validate_components(self): + from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel + from pycldf.trees import TreeTable # pylint: disable=import-outside-toplevel + + for cls in [MediaTable, TreeTable]: + if cls.__name__ in self.dataset: + cls(self.dataset).validate(self) + + def _validate_rows(self, table): # FIXME: see if table.common_props['dc:conformsTo'] is in validators! pylint: disable=W0511 validators = [] for col in table.tableSchema.columns: @@ -45,13 +113,13 @@ def validate_rows(self, table): validators.append((col, v_)) for fname, lineno, row in table.iterdicts(log=self.log, with_metadata=True): - for col, validate in validators: + for col, validate_ in validators: try: - validate(self.dataset, table, col, row) + validate_(self.dataset, table, col, row) except ValueError as e: self.fail(f'{fname.name}:{lineno}:{col.name} {e}') - def validate_columns(self, table): + def _validate_columns(self, table): property_urls, colnames = set(), set() for col in table.tableSchema.columns: if col.header in colnames: # pragma: no cover @@ -68,7 +136,7 @@ def validate_columns(self, table): except ValueError: self.fail(f'invalid CLDF URI: {col_uri}') - def validate_table_schema(self, table): + def _validate_table_schema(self, table): tmpl_vars = set(col.name for col in table.tableSchema.columns) for obj, prop, tmpl in iter_uritemplates(table): if not {n for n in tmpl.variable_names if not n.startswith('_')}.issubset(tmpl_vars): @@ -92,7 +160,7 @@ def validate_table_schema(self, table): table.url, 'This may cause problems with "cldf createdb"') - def validate_default_objects(self, default_table): + def _validate_default_objects(self, default_table): dtable_uri = default_table.common_props['dc:conformsTo'] try: table = self.dataset[dtable_uri] @@ -121,12 +189,17 @@ def validate_default_objects(self, default_table): self.fail(f'{table_uri} {uri} must be {cardinality}') -def valid_references(dataset, table, column, row): +# +# Row validators: +# +def valid_references(dataset, _, column, row): # pylint: disable=C0103,C0116 if dataset.sources: dataset.sources.validate(row[column.name]) -def valid_regex(pattern, name, dataset, table, column, row): +def valid_regex(pattern, name, dataset, table, column, row): # pylint: disable=R0917,R0913 + """Generic regex validator. Turn into regular validator via functools.partial.""" + assert dataset and table value = row[column.name] if value is not None: if not isinstance(value, list): @@ -134,10 +207,10 @@ def valid_regex(pattern, name, dataset, table, column, row): value = [value] for val in value: if not pattern.match(val): - raise ValueError('invalid {0}: {1} (in {2})'.format(name, val, value)) + raise ValueError(f'invalid {name}: {val} (in {value})') -def valid_igt(dataset, table, column, row): +def valid_igt(_, table, column, row): # pylint: disable=C0103,C0116 word_glosses, words = row[column.name], None col = table.get_column('http://cldf.clld.org/v1.0/terms.rdf#analyzedWord') if col: @@ -147,7 +220,7 @@ def valid_igt(dataset, table, column, row): raise ValueError('number of words and word glosses does not match') -def valid_grammaticalityJudgement(dataset, table, column, row): +def valid_grammaticalityJudgement(dataset, _, column, row): # pylint: disable=C0103,C0116 lid_name = dataset.readonly_column_names.ExampleTable.languageReference[0] gc_name = dataset.readonly_column_names.LanguageTable.glottocode[0] if row[column.name] is not None: @@ -156,10 +229,12 @@ def valid_grammaticalityJudgement(dataset, table, column, row): raise ValueError('Glottolog language linked from ungrammatical example') -def valid_mediaType(dataset, table, column, row): - main, _, sub = row[column.name].partition('/') +def valid_mediaType(dataset, table, column, row): # pylint: disable=C0103,C0116 + """Check validity of media types.""" + assert dataset and table + main, _, _ = row[column.name].partition('/') if not re.fullmatch('[a-z]+', main): - warnings.warn('Invalid main part in media type: {}'.format(main)) + warnings.warn(f'Invalid main part in media type: {main}') VALIDATORS: list[tuple[None, str, RowValidatorType]] = [ diff --git a/tests/test_cli.py b/tests/test_cli.py index a9782d9..61cd682 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,7 @@ import pytest from pycldf.__main__ import main +from pycldf.dataset import SchemaError def test_help(capsys): @@ -95,6 +96,9 @@ def test_downloadmedia(tmp_path, data): assert files[0].read(tmp_path) == 'Hello, World!' assert files[1].read(tmp_path) == 'äöü' + with pytest.raises(SchemaError): + main(['downloadmedia', '--use-form-id', str(md), str(tmp_path)]) + def test_validate(tmp_path, caplog): tmp_path.joinpath('md.json').write_text("""{ diff --git a/tests/test_dataset.py b/tests/test_dataset.py index c650e17..3b5d7fe 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -12,7 +12,8 @@ from pycldf.terms import term_uri, TERMS from pycldf.dataset import ( Generic, Wordlist, StructureDataset, Dictionary, ParallelText, Dataset, TextCorpus, - GitRepository, make_column, get_modules, iter_datasets, SchemaError) + GitRepository, make_column, iter_datasets, SchemaError) +from pycldf.module import get_modules from pycldf.sources import Sources from pycldf.media import MediaTable diff --git a/tests/test_trees.py b/tests/test_trees.py index bc217c7..2435764 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -2,6 +2,7 @@ from pycldf import Generic from pycldf.trees import * +from pycldf.validators import DatasetValidator def test_Trees(dataset_with_trees): @@ -10,7 +11,7 @@ def test_Trees(dataset_with_trees): assert len(t) == 2 assert set(n.name for n in t[0].newick().walk() if n.is_leaf) == {'l1', 'l2', 'l3', 'l4'} assert set(n.name for n in t[1].newick().walk() if n.is_leaf) == {'l1', 'l2', 'l4'} - assert trees.validate() + assert trees.validate(DatasetValidator(dataset_with_trees)) is None def test_Trees_from_dataurl(dataset_with_trees2): @@ -40,7 +41,7 @@ def test_Trees_validate(tmp_path, caplog): tmp_path.joinpath('test.nwk').write_text('(l1,l2);', encoding='utf8') tmp_path.joinpath('test.nex').write_text( '#NEXUS\n\nbegin trees;\ntree x = [&U](l1,l2);\nend;', encoding='utf8') - TreeTable(ds).validate(log=logging.getLogger('test')) + TreeTable(ds).validate(DatasetValidator(ds, log=logging.getLogger('test'))) assert len(caplog.records) == 3 assert caplog.records[0].message.startswith('No newick') assert caplog.records[1].message.startswith('Newick node label') diff --git a/tests/test_util.py b/tests/test_util.py index 965350d..55906dd 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,9 @@ import pytest from pycldf.util import * +from pycldf.fileutil import * +from pycldf.urlutil import * +from pycldf.markdown import metadata2markdown @pytest.mark.parametrize("sliceable,slices,expected", [ From 69a695157875fdc0db6dfd05949532b373d29881 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Thu, 26 Feb 2026 16:05:43 +0100 Subject: [PATCH 3/7] more modernizing --- src/pycldf/dataset.py | 33 +++++++------- src/pycldf/ext/markdown.py | 90 +++++++++++++++++++++----------------- src/pycldf/module.py | 11 ++++- src/pycldf/schemautil.py | 4 ++ src/pycldf/sliceutil.py | 55 +++++++++++++++++++++++ src/pycldf/trees.py | 51 +++++++++++---------- src/pycldf/util.py | 57 ++++++++++-------------- tests/test_util.py | 16 +++++++ 8 files changed, 198 insertions(+), 119 deletions(-) create mode 100644 src/pycldf/sliceutil.py diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py index fec1c12..2894142 100644 --- a/src/pycldf/dataset.py +++ b/src/pycldf/dataset.py @@ -23,7 +23,8 @@ from pycldf.module import get_module_impl from pycldf.sources import Sources, Source from pycldf.util import ( - pkg_path, resolve_slices, DictTuple, iter_uritemplates, MD_SUFFIX, GitRepository, copy_dataset) + pkg_path, DictTuple, iter_uritemplates, MD_SUFFIX, GitRepository, copy_dataset) +from pycldf.sliceutil import multislice_with_split from pycldf.fileutil import PathType from pycldf.schemautil import ColSpecType, make_column, make_table, TableType, ColType from pycldf.constraints import add_foreign_key, add_auto_constraints @@ -38,7 +39,7 @@ ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()} TableSpecType = Union[str, Link, Table] -SchemaObjectType = Union[TableSpecType, tuple[TableSpecType, ColSpecType]] +SchemaObjectType = Union[TableSpecType, tuple[TableSpecType, ColType]] ODict = collections.OrderedDict RowType = ODict[str, Any] @@ -332,7 +333,7 @@ def __getitem__(self, item: SchemaObjectType) -> Union[csvw.Table, csvw.Column]: raise SchemaError(f'Dataset has no column "{column}" in table "{t.url}"') - def _get_table(self, table: Union[str, Table]) -> Table: + def _get_table(self, table: TableType) -> Table: if not isinstance(table, Table): uri = term_uri(table, terms=TERMS.by_uri) for t in self.tables: @@ -869,13 +870,11 @@ def get_subsequence(self, cognate: RowType, form: Optional[str] = None) -> list[ :param cognate: A `dict` holding the data of a row from a `CognateTable`. """ - return resolve_slices( - cognate, - self, - ('CognateTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"), - ('FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"), - 'Form_ID', - target_row=form) + target_row = form or self.get_row('FormTable', cognate['Form_ID']) + return multislice_with_split( + target_row[self['FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"].name], + cognate[self['CognateTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"].name], + ) class ParallelText(Dataset): @@ -886,14 +885,12 @@ def primary_table(self) -> str: # pylint: disable=missing-function-docstring def get_equivalent(self, functional_equivalent, form=None): """Get the forms fulfilling an equivalent function in the texts.""" - return resolve_slices( - functional_equivalent, - self, - ('FunctionalEquivalentTable', - "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"), - ('FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"), - 'Form_ID', - target_row=form) + slice_col_name = self[ + 'FunctionalEquivalentTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"].name + sequence_col_name = self['FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"].name + target_row = form or self.get_row('FormTable', functional_equivalent['Form_ID']) + return multislice_with_split( + target_row[sequence_col_name], functional_equivalent[slice_col_name]) class Dictionary(Dataset): diff --git a/src/pycldf/ext/markdown.py b/src/pycldf/ext/markdown.py index 4fd2b35..959ca48 100644 --- a/src/pycldf/ext/markdown.py +++ b/src/pycldf/ext/markdown.py @@ -4,10 +4,11 @@ For an example, see :class:`FilenameToComponent`. """ import re -import typing +from typing import Optional, Union, Any import pathlib import warnings -import collections.abc +import collections +from collections.abc import Mapping import yaml import jmespath @@ -16,13 +17,12 @@ import clldutils from clldutils.markup import MarkdownLink -from .discovery import get_dataset -from pycldf.util import pkg_path +from pycldf.util import pkg_path, MD_SUFFIX from pycldf.urlutil import url_without_fragment -from pycldf.dataset import MD_SUFFIX from pycldf.sources import Source from pycldf import Dataset from pycldf import orm +from .discovery import get_dataset __all__ = ['CLDFMarkdownLink', 'CLDFMarkdownText', 'FilenameToComponent'] @@ -32,7 +32,7 @@ METADATA_COMPONENT = 'Metadata' -class DatasetMapping(collections.abc.Mapping): +class DatasetMapping(Mapping): """ A read-only mapping of prefixes to datasets. """ @@ -47,8 +47,8 @@ def to_dict(o): def __init__(self, m1, m2=None, - doc_path: typing.Optional[pathlib.Path] = None, - download_dir: typing.Optional[pathlib.Path] = None): + doc_path: Optional[pathlib.Path] = None, + download_dir: Optional[pathlib.Path] = None): """ :param m1: Mapping of prefixes to datasets (locators). :param m2: Mapping of prefixes to datasets (locators) to update `m1`. @@ -65,7 +65,7 @@ def __init__(self, if not isinstance(self.m[k], Dataset): self.m[k] = get_dataset(self.m[k], download_dir, doc_path) - def __getitem__(self, prefix: typing.Union[str, None]) -> Dataset: + def __getitem__(self, prefix: Union[str, None]) -> Dataset: """ Get a `Dataset` mapped to a prefix. """ @@ -89,18 +89,20 @@ class CLDFMarkdownLink(MarkdownLink): fragment_pattern = re.compile(r'cldf(-(?P[a-zA-Z0-9_]+))?:') @property - def url_without_fragment(self): + def url_without_fragment(self) -> str: + """Return the HREF value of the link without the fragment.""" return url_without_fragment(self.parsed_url) @staticmethod - def format_url(path, objid, prefix=None): - return '{}#cldf{}:{}'.format(path, '-' + prefix if prefix else '', objid) + def format_url(path, objid, prefix=None) -> str: + """Format the HREF value for a CLDF Markdown link.""" + prefix = '-' + prefix if prefix else '' + return f'{path}#cldf{prefix}:{objid}' @classmethod def from_component(cls, comp, objid='__all__', label=None, prefix=None) -> 'CLDFMarkdownLink': - return cls( - label=label or '{}:{}'.format(comp, objid), - url=cls.format_url(comp, objid, prefix=prefix)) + """Create a CLDF Markdown link for an object in a component.""" + return cls(label=label or f'{comp}:{objid}', url=cls.format_url(comp, objid, prefix=prefix)) @property def is_cldf_link(self) -> bool: @@ -110,25 +112,27 @@ def is_cldf_link(self) -> bool: return bool(self.fragment_pattern.match(self.parsed_url.fragment)) @property - def prefix(self) -> typing.Union[None, str]: + def prefix(self) -> Optional[str]: """ The dataset prefix associated with a CLDF Markdown link. """ if self.is_cldf_link: return self.fragment_pattern.match(self.parsed_url.fragment).group('prefix') + return None # pragma: no cover @property - def table_or_fname(self) -> typing.Union[None, str]: + def table_or_fname(self) -> Optional[str]: """ The last path component of the URL of a CLDF Markdown link. """ if self.is_cldf_link: return self.parsed_url.path.split('/')[-1] + return None # pragma: no cover - def component(self, - cldf: typing.Optional[ - typing.Union[Dataset, typing.Dict[str, Dataset], DatasetMapping]] = None, - ) -> typing.Union[str, None]: + def component( + self, + cldf: Optional[Union[Dataset, dict[str, Dataset], DatasetMapping]] = None, + ) -> Union[str, None]: """ :param cldf: `pycldf.Dataset` instance to which the link refers. :return: Name of the CLDF component the link pertains to or `None`. @@ -144,9 +148,9 @@ def component(self, if isinstance(cldf, (dict, DatasetMapping)): cldf = cldf[self.prefix] - if name == cldf.bibname or name == SOURCE_COMPONENT: + if name in (cldf.bibname, SOURCE_COMPONENT): return SOURCE_COMPONENT - if name == cldf.filename or name == METADATA_COMPONENT: + if name in (cldf.filename, METADATA_COMPONENT): return METADATA_COMPONENT try: return cldf.get_tabletype(cldf[name]) @@ -154,12 +158,13 @@ def component(self, return None @property - def objid(self) -> typing.Union[None, str]: + def objid(self) -> Optional[str]: """ The identifier of the object referenced by a CLDF Markdown link. """ if self.is_cldf_link: return self.parsed_url.fragment.split(':', maxsplit=1)[-1] + return None # pragma: no cover @property def all(self) -> bool: @@ -168,7 +173,7 @@ def all(self) -> bool: """ return self.objid == '__all__' - def get_row(self, cldf: typing.Union[Dataset, DatasetMapping]) -> dict: + def get_row(self, cldf: Union[Dataset, DatasetMapping]) -> dict: """ Resolve the reference in a CLDF Markdown link to a row (`dict`) in the CLDF `Dataset`. """ @@ -176,7 +181,7 @@ def get_row(self, cldf: typing.Union[Dataset, DatasetMapping]) -> dict: ds = DatasetMapping(cldf)[self.prefix] return ds.get_row(self.component(cldf=ds), self.objid) - def get_object(self, cldf: typing.Union[Dataset, DatasetMapping]) -> orm.Object: + def get_object(self, cldf: Union[Dataset, DatasetMapping]) -> orm.Object: """ Resolve the reference in a CLDF Markdown link to an ORM object in the CLDF `Dataset`. """ @@ -214,9 +219,9 @@ def render_link(self, link): :cvar metadata_component: Name of the special "Metadata" component. """ def __init__(self, - text: typing.Union[pathlib.Path, str], - dataset_mapping: typing.Optional[typing.Union[str, Dataset, dict]] = None, - download_dir: typing.Optional[pathlib.Path] = None): + text: Union[pathlib.Path, str], + dataset_mapping: Optional[Union[str, Dataset, dict]] = None, + download_dir: Optional[pathlib.Path] = None): """ :param text: CLDF Markdown text either to be read from a path or specified as `str`. :param dataset_mapping: Mapping of dataset prefixes to `Dataset` instances. May override \ @@ -224,14 +229,14 @@ def __init__(self, :download_dir: Optional path to a directory to download data for remote datasets. """ p = frontmatter.loads(text) if isinstance(text, str) else frontmatter.load(str(text)) - self.metadata = p.metadata - self.dataset_mapping = DatasetMapping( + self.metadata: dict[str, Any] = p.metadata + self.dataset_mapping: Mapping[Union[str, None], Dataset] = DatasetMapping( p.get(DATASETS_MAPPING), dataset_mapping, text.parent if isinstance(text, pathlib.Path) else None, download_dir, ) - self.text = p.content + self.text: str = p.content self._datadict = collections.defaultdict(dict) for prefix, ds in self.dataset_mapping.items(): self._datadict[prefix][SOURCE_COMPONENT] = {src.id: src for src in ds.sources} @@ -242,9 +247,9 @@ def frontmatter(self) -> str: """ The markdown documents metadata formatted as YAML frontmatter. """ - return '---\n{}---'.format(yaml.dump(self.metadata)) + return f'---\n{yaml.dump(self.metadata)}---' - def get_object(self, ml: CLDFMarkdownLink) -> typing.Union[list, orm.Object, Source, dict]: + def get_object(self, ml: CLDFMarkdownLink) -> Union[list, orm.Object, Source, dict]: """ Resolve the reference in a CLDF Markdown link to the matching object from a mapped dataset. @@ -274,20 +279,23 @@ def get_object(self, ml: CLDFMarkdownLink) -> typing.Union[list, orm.Object, Sou return list(self._datadict[ml.prefix][key].values()) if ml.all \ else self._datadict[ml.prefix][key][ml.objid] - def _render_link(self, link): + def _render_link(self, link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]: + """Dispatches to custom rendering in case of CLDF links.""" if link.is_cldf_link: return self.render_link(link) return link - def render_link(self, cldf_link: CLDFMarkdownLink) -> typing.Union[str, CLDFMarkdownLink]: + def render_link(self, cldf_link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]: """ CLDF Markdown renderers must implement this method. """ raise NotImplementedError() # pragma: no cover - def render(self, - simple_link_detection: bool = True, - markdown_kw: typing.Optional[dict] = None) -> str: + def render( + self, + simple_link_detection: bool = True, + markdown_kw: Optional[dict[str, Any]] = None, + ) -> str: """ A markdown string with CLDF Markdown links replaced. """ @@ -298,7 +306,7 @@ def render(self, category=UserWarning) kw = {} else: - kw = dict(simple=simple_link_detection, markdown_kw=markdown_kw) + kw = {'simple': simple_link_detection, 'markdown_kw': markdown_kw} return CLDFMarkdownLink.replace(self.text, self._render_link, **kw) @@ -306,7 +314,7 @@ class FilenameToComponent(CLDFMarkdownText): """ Renderer to replace filenames in CLDF Markdown links with CLDF component names. """ - def render_link(self, cldf_link): + def render_link(self, cldf_link: CLDFMarkdownLink) -> CLDFMarkdownLink: """ Rewrites to URL of CLDF Markdown links, using the component name as path component. """ diff --git a/src/pycldf/module.py b/src/pycldf/module.py index 01aa3dd..80f0177 100644 --- a/src/pycldf/module.py +++ b/src/pycldf/module.py @@ -1,4 +1,7 @@ -from typing import Union +""" +Functionality to manage modules, i.e. `Dataset` subclasses implementing particular CLDF modules. +""" +from typing import Union, Optional, Type import attr from csvw.metadata import TableGroup @@ -38,11 +41,15 @@ def match(self, thing: Union[TableGroup, str]) -> bool: _modules = [] -def get_module_impl(base_class, spec: Union[TableGroup, str]): +def get_module_impl(base_class, spec: Union[TableGroup, str]) -> Optional[Type]: + """ + Returns an implementation (aka Dataset subclass) for a particular CLDF module. + """ implementations = {cls.__name__: cls for cls in base_class.__subclasses__()} for mod in get_modules(): if mod.match(spec): return implementations[mod.id] + return None # pragma: no cover def get_modules() -> list[Module]: diff --git a/src/pycldf/schemautil.py b/src/pycldf/schemautil.py index 5467c35..27e8330 100644 --- a/src/pycldf/schemautil.py +++ b/src/pycldf/schemautil.py @@ -1,3 +1,6 @@ +""" +Functionality to create schema objects. +""" from typing import Union from csvw.metadata import Column, Table @@ -39,6 +42,7 @@ def make_column(spec: ColSpecType) -> Column: def make_table(spec: TableSpecType) -> Table: + """Create a `Table` instance from `spec`.""" if isinstance(spec, str): return Table.fromvalue(jsonlib.load(pkg_path('components', f'{spec}{MD_SUFFIX}'))) if isinstance(spec, dict): diff --git a/src/pycldf/sliceutil.py b/src/pycldf/sliceutil.py new file mode 100644 index 0000000..67c2d2a --- /dev/null +++ b/src/pycldf/sliceutil.py @@ -0,0 +1,55 @@ +""" +This module provides a flexible implementation of slicing sequences, based on Python's slices. + +In addition to Python's way of specifying slices as triples of integers (start, stop, step), we +allow specification as strings like '1' or '2:5', where the numbers are interpreted as **1-based** +indices, specifying **inclusive** boundaries. I.e. '2:5' is equivalent to `slice(1:5).` +""" +from typing import Union, TypeVar +import itertools +from collections.abc import Sequence, Iterable + +__all__ = ['multislice', 'multislice_with_split'] + +T = TypeVar('T') +SliceType = Union[str, tuple[int], tuple[int, int], tuple[int, int, int], slice] + + +def multislice(sliceable: Sequence[T], *slices: SliceType) -> Sequence[T]: + """ + .. code-block:: python + + >>> import string + >>> multislice(list(range(30)), '3:7', '9', (12, 18, 3)) + [2, 3, 4, 5, 6, 8, 12, 15] + >>> multislice(string.ascii_lowercase, '3:7', '9', (12, 18, 3)) + 'cdefgimp' + """ + res = type(sliceable)() + for sl in slices: + if isinstance(sl, str): + if ':' in sl: + assert sl.count(':') <= 2, f'String slice spec may only have two colons. {sl}' + sl = slice(*[int(s) - (1 if i == 0 else 0) for i, s in enumerate(sl.split(':'))]) + else: + sl = slice(*[int(sl) - 1, int(sl)]) + elif isinstance(sl, int): + sl = slice(sl, sl + 1) + elif isinstance(sl, (tuple, list)): + sl = slice(*sl) + else: + assert isinstance(sl, slice) + res += sliceable[sl] + return res + + +def multislice_with_split(sliceable: Sequence[T], slices: Iterable[SliceType]) -> list[T]: + """ + Resolves multislices and then applies splitting on whitespace to each item. + + .. code-block:: python + + >>> multislice_with_split(['a', 'b', 'c d', 'f', 'g'], [(2, 4)]) + ['c', 'd', 'f'] + """ + return list(itertools.chain(*[s.split() for s in multislice(sliceable, *slices)])) diff --git a/src/pycldf/trees.py b/src/pycldf/trees.py index 2612736..b939b29 100644 --- a/src/pycldf/trees.py +++ b/src/pycldf/trees.py @@ -21,16 +21,19 @@ ├─l3 └─l4 """ -import typing +from typing import TYPE_CHECKING, Optional import pathlib +from collections.abc import Generator from commonnexus import Nexus import newick +from csvw.metadata import Table, Column from pycldf.media import MediaTable, File -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover from pycldf.validators import DatasetValidator # pragma: no cover __all__ = ['Tree', 'TreeTable'] @@ -40,17 +43,17 @@ class Tree: """ Represents a tree object as specified in a row of `TreeTable`. """ - def __init__(self, trees: 'TreeTable', row: dict, file: File): - self.row = row - self.id = row[trees.cols['id'].name] - self.name = row[trees.cols['name'].name] - self.file = file + def __init__(self, trees: 'TreeTable', row: 'RowType', file: File): + self.row: 'RowType' = row + self.id: str = row[trees.cols['id'].name] + self.name: str = row[trees.cols['name'].name] + self.file: File = file for prop in ['description', 'treeType', 'treeIsRooted', 'treeBranchLengthUnit']: attrib = ''.join('_' + c.lower() if c.isupper() else c for c in prop) setattr(self, attrib, row.get(trees.cols[prop].name) if trees.cols[prop] else None) self.trees = trees - def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str: + def newick_string(self, d: Optional[pathlib.Path] = None) -> str: """ Retrieve the Newick representation of the tree from the associated tree file. @@ -58,21 +61,19 @@ def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str: :meth:`pycldf.media.File.save`. :return: Newick representation of the associated tree. """ - if self.file.id not in self.trees._parsed_files: + if self.file.id not in self.trees.parsed_files: content = self.file.read(d=d) if self.file.mimetype == 'text/x-nh': - self.trees._parsed_files[self.file.id] = { + self.trees.parsed_files[self.file.id] = { # pylint: disable=protected-access str(index): nwk for index, nwk in enumerate( [t.strip() for t in content.split(';') if t.strip()], start=1)} else: - self.trees._parsed_files[self.file.id] = { + self.trees.parsed_files[self.file.id] = { # pylint: disable=protected-access tree.name: tree.newick_string for tree in Nexus(content).TREES.trees} - return self.trees._parsed_files[self.file.id][self.name] + return self.trees.parsed_files[self.file.id][self.name] # pylint: disable=protected-access - def newick(self, - d: typing.Optional[pathlib.Path] = None, - strip_comments: bool = False) -> newick.Node: + def newick(self, d: Optional[pathlib.Path] = None, strip_comments: bool = False) -> newick.Node: """ Retrieve a `newick.Node` instance for the tree from the associated tree file. @@ -90,19 +91,20 @@ class TreeTable: Container class for a `Dataset`'s TreeTable. """ def __init__(self, ds: 'Dataset'): - self.ds = ds - self.component = self.__class__.__name__ - self.table = ds[self.component] - self.media = MediaTable(ds) - self.media_rows = {row[self.media.id_col.name]: row for row in ds['MediaTable']} - self.cols = { + self.ds: 'Dataset' = ds + self.component: str = self.__class__.__name__ + self.table: Table = ds[self.component] + self.media: MediaTable = MediaTable(ds) + self.media_rows: dict[str, 'RowType'] = { + row[self.media.id_col.name]: row for row in ds['MediaTable']} + self.cols: dict[str, Optional[Column]] = { prop: self.ds.get((self.table, prop)) for prop in [ 'id', 'name', 'description', 'mediaReference', 'treeIsRooted', 'treeType', 'treeBranchLengthUnit']} # Since reading and parsing tree files is expensive, we cache them. - self._parsed_files = {} + self.parsed_files: dict[str, dict[str, str]] = {} - def __iter__(self) -> typing.Generator[Tree, None, None]: + def __iter__(self) -> Generator[Tree, None, None]: for row in self.table: yield Tree( self, @@ -110,6 +112,9 @@ def __iter__(self) -> typing.Generator[Tree, None, None]: File(self.media, self.media_rows[row[self.cols['mediaReference'].name]])) def validate(self, validator: 'DatasetValidator'): + """ + Makes sure Newick representations of trees are available and only reference valid languages. + """ lids = {r['id'] for r in self.ds.iter_rows('LanguageTable', 'id')} for tree in self: try: diff --git a/src/pycldf/util.py b/src/pycldf/util.py index acd465f..e09536f 100644 --- a/src/pycldf/util.py +++ b/src/pycldf/util.py @@ -1,12 +1,15 @@ +""" +The mixed bag of utility functions and classes of the pycldf package ... +""" import shutil -from typing import Optional, TYPE_CHECKING, Any +from typing import Optional, TYPE_CHECKING, Any, Union import pathlib -import itertools import collections import urllib.parse import urllib.request +from collections.abc import Generator -from csvw.metadata import is_url, Link +from csvw.metadata import is_url, Link, Column, Table, Schema, URITemplate from clldutils.path import git_describe from pycldf.fileutil import PathType @@ -16,8 +19,7 @@ from pycldf import Dataset # pragma: no cover __all__ = [ - 'pkg_path', 'multislice', 'resolve_slices', 'DictTuple', 'qname2url', - 'iter_uritemplates', 'MD_SUFFIX', 'GitRepository'] + 'pkg_path', 'DictTuple', 'qname2url', 'iter_uritemplates', 'MD_SUFFIX', 'GitRepository'] MD_SUFFIX = '-metadata.json' @@ -51,7 +53,14 @@ def json_ld(self) -> collections.OrderedDict[str, Any]: return res -def iter_uritemplates(table): +def iter_uritemplates( + table: Table) -> Generator[tuple[Union[Table, Schema, Column], str, URITemplate]]: + """ + Generator of URITemplates specified in a table. + + Since URITemplates use column names as template variables, it is important to keep these in + sync with the table schema, e.g. in case of renaming columns. + """ props = ['aboutUrl', 'valueUrl'] for obj in [table, table.tableSchema] + table.tableSchema.columns: for prop in props: @@ -60,37 +69,11 @@ def iter_uritemplates(table): yield obj, prop, tmpl -def pkg_path(*comps): +def pkg_path(*comps: str) -> pathlib.Path: + """Returns a path within the pycldf package.""" return pathlib.Path(__file__).resolve().parent.joinpath(*comps) -def multislice(sliceable, *slices): - res = type(sliceable)() - for sl in slices: - if isinstance(sl, str): - if ':' in sl: - sl = [int(s) - (1 if i == 0 else 0) for i, s in enumerate(sl.split(':'))] - else: - sl = [int(sl) - 1, int(sl)] - res += sliceable[slice(*sl)] - return res - - -def resolve_slices(row, ds, slice_spec, target_spec, fk, target_row=None): - # 1. Determine the slice column: - slices = ds[slice_spec] - - # 2. Determine the to-be-sliced column: - morphemes = ds[target_spec] - - # 3. Retrieve the matching row in the target table: - target_row = target_row or ds.get_row(target_spec[0], row[fk]) - - # 4. Slice the segments - return list(itertools.chain(*[ - s.split() for s in multislice(target_row[morphemes.name], *row[slices.name])])) - - class DictTuple(tuple): """ A `tuple` that acts like a `dict` when a `str` is passed to `__getitem__`. @@ -120,7 +103,10 @@ def __getitem__(self, item): return super().__getitem__(item) -def qname2url(qname): +def qname2url(qname: str) -> Optional[str]: + """ + Turns a qname of the form : into a full HTTP URL if the prefix is known. + """ for prefix, uri in { 'csvw': 'http://www.w3.org/ns/csvw#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', @@ -132,6 +118,7 @@ def qname2url(qname): }.items(): if qname.startswith(prefix + ':'): return qname.replace(prefix + ':', uri) + return None def copy_dataset(ds: 'Dataset', dest: PathType, mdname: str = None) -> pathlib.Path: diff --git a/tests/test_util.py b/tests/test_util.py index 55906dd..acda51b 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -3,10 +3,15 @@ from pycldf.util import * from pycldf.fileutil import * from pycldf.urlutil import * +from pycldf.sliceutil import * from pycldf.markdown import metadata2markdown @pytest.mark.parametrize("sliceable,slices,expected", [ + ('abcdefg', [slice(1, 3)], 'bc'), + ('abcdefg', ['2', '4'], 'bd'), + ('abcdefg', [2, 4], 'ce'), + ('abcdefg', ['2:8:2'], 'bdf'), ('abcdefg', ['2:5', (1, 4)], 'bcdebcd'), ([1, 2, 3, 4], ['1:6:2'], [1, 3]), ((1, 2, 3, 4), ['1:6:2'], (1, 3)) @@ -15,6 +20,17 @@ def test_multislice(sliceable, slices, expected): assert multislice(sliceable, *slices) == expected +@pytest.mark.parametrize( + 'qname,expected', + [ + ('rdf:ID', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#ID'), + ('xyz:thing', None), + ] +) +def test_qname2url(qname, expected): + assert qname2url(qname) == expected + + def test_DictTuple(): t = DictTuple([1, 2, 3], key=lambda i: str(i + 1)) assert t['4'] == t[2] == 3 From fe05627d4750eeb069d0287b1e36075520ad2af3 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Fri, 6 Mar 2026 19:43:51 +0100 Subject: [PATCH 4/7] adapt to changes in csvw --- src/pycldf/dataset.py | 11 +++++++---- tests/conftest.py | 14 ++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py index 2894142..5dd86e2 100644 --- a/src/pycldf/dataset.py +++ b/src/pycldf/dataset.py @@ -13,14 +13,13 @@ import urllib.parse import urllib.request -import attr import csvw from csvw.metadata import TableGroup, Table, Column, Link, Schema, is_url, URITemplate from csvw import datatypes from csvw.dsv import iterrows from clldutils.path import walk -from pycldf.module import get_module_impl +from pycldf.module import get_module_impl, get_modules from pycldf.sources import Sources, Source from pycldf.util import ( pkg_path, DictTuple, iter_uritemplates, MD_SUFFIX, GitRepository, copy_dataset) @@ -33,6 +32,8 @@ from pycldf.stats import get_table_stats from pycldf import orm +assert get_modules # For backwards compatibility with cldfbench. + __all__ = [ 'Dataset', 'Generic', 'Wordlist', 'ParallelText', 'Dictionary', 'StructureDataset', 'TextCorpus', 'iter_datasets', 'sniff', 'SchemaError'] @@ -461,8 +462,10 @@ def add_table(self, url: str, *cols: ColSpecType, **kw: Any) -> csvw.Table: """ t = self.add_component({"url": url, "tableSchema": {"columns": []}}, *cols) if 'primaryKey' in kw: - t.tableSchema.primaryKey = attr.fields_dict(Schema)['primaryKey'].converter( - kw.pop('primaryKey')) + pk = kw.pop('primaryKey') + if pk is not None and not isinstance(pk, list): + pk = [pk] + t.tableSchema.primaryKey = pk if kw.get('description'): t.common_props['dc:description'] = kw.pop('description') t.common_props.update(kw) diff --git a/tests/conftest.py b/tests/conftest.py index 11558d1..c70316a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,18 +23,16 @@ def csvw3(): @pytest.fixture def urlopen(mocker, data, csvw3): - import requests_mock + from csvw.utils import GetResponse - def _urlopen(url): + def _urlopen(url, **_): return io.BytesIO(data.joinpath(urllib.parse.urlparse(url).path[1:]).read_bytes()) + def csvw_request_get(url, **_): + return GetResponse(content=data.joinpath(urllib.parse.urlparse(url).path[1:]).read_bytes()) + + mocker.patch('csvw.utils.request_get', csvw_request_get) mocker.patch('pycldf.sources.urlopen', _urlopen) - if not csvw3: # pragma: no cover - mocker.patch('csvw.metadata.urlopen', _urlopen) - else: - mock = requests_mock.Mocker() - mock.__enter__() - mock.get(requests_mock.ANY, content=lambda req, _: _urlopen(req.url).read()) @pytest.fixture(scope='module') From b74226b0cc3acb8a1633ef243783fc7b3af0fd23 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Mon, 9 Mar 2026 17:30:36 +0100 Subject: [PATCH 5/7] finished linting, add _compat module --- RELEASING.md | 4 ++ setup.cfg | 1 - src/pycldf/_compat.py | 15 +++++ src/pycldf/commands/catmedia.py | 6 +- src/pycldf/commands/check.py | 78 ++++++++++++++------------ src/pycldf/commands/createdb.py | 6 +- src/pycldf/commands/downloadmedia.py | 4 +- src/pycldf/commands/dumpdb.py | 8 +-- src/pycldf/commands/markdown.py | 4 +- src/pycldf/commands/splitmedia.py | 18 +++--- src/pycldf/commands/stats.py | 4 +- src/pycldf/commands/validate.py | 78 ++++++++++++++++---------- src/pycldf/dataset.py | 2 +- src/pycldf/ext/discovery.py | 84 ++++++++++++++++------------ src/pycldf/ext/markdown.py | 2 +- 15 files changed, 183 insertions(+), 131 deletions(-) create mode 100644 src/pycldf/_compat.py diff --git a/RELEASING.md b/RELEASING.md index b108f7f..527a983 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -12,6 +12,10 @@ Releasing pycldf ```shell flake8 src ``` +- Make sure pylint passes with a score of 10: + ```shell + pylint src + ``` - Make sure the docs render: ```shell diff --git a/setup.cfg b/setup.cfg index 2b3603b..8aee426 100644 --- a/setup.cfg +++ b/setup.cfg @@ -83,7 +83,6 @@ test = pyconcepticon pytest>=5 pytest-mock - requests-mock pytest-cov coverage>=4.2 docs = diff --git a/src/pycldf/_compat.py b/src/pycldf/_compat.py new file mode 100644 index 0000000..f63a117 --- /dev/null +++ b/src/pycldf/_compat.py @@ -0,0 +1,15 @@ +""" +Backwards compatibility with supported python versions. +""" +import sys + + +if (sys.version_info.major, sys.version_info.minor) >= (3, 10): # pragma: no cover + def entry_points_select(eps, group): + """ + Staring with Python 3.10, `importlib.metadata.entry_points` returns `EntryPoints`.""" + return eps.select(group=group) +else: + def entry_points_select(eps, group): # pragma: no cover + """In Python 3.9, `importlib.metadata.entry_points` returns a `dict`.""" + return eps.get(group, []) diff --git a/src/pycldf/commands/catmedia.py b/src/pycldf/commands/catmedia.py index a14c6b0..c629fd7 100644 --- a/src/pycldf/commands/catmedia.py +++ b/src/pycldf/commands/catmedia.py @@ -5,12 +5,12 @@ from pycldf.media import MediaTable -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = MediaTable(ds).cat() if res: - args.log.info('{} files have been recombined'.format(res)) + args.log.info(f'{res} files have been recombined') diff --git a/src/pycldf/commands/check.py b/src/pycldf/commands/check.py index 8e7930e..e93a894 100644 --- a/src/pycldf/commands/check.py +++ b/src/pycldf/commands/check.py @@ -18,7 +18,7 @@ Catalog, Glottolog, Concepticon = None, None, None -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) add_catalog_spec(parser, 'glottolog') add_catalog_spec(parser, 'concepticon') @@ -32,7 +32,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 if not Catalog: # pragma: no cover print('\nThis command only works with catalogs installed.\n' 'Run "pip install pycldf[catalogs]" to do so.\n') @@ -53,31 +53,37 @@ def warn(msg): stack.enter_context(Catalog(args.concepticon, tag=args.concepticon_version)) for table, checkers in COLUMN_CHECKERS.items(): - table = ds.get(table) - if table: - idcol = ds.get((table, 'id')) - active_checkers = {} - for col, checker in checkers.items(): - col = ds.get((table, col)) - if col: - # Register an initialized check: - active_checkers[col.name] = checker(args) - if active_checkers: - for row in table: - rowid = row[idcol.name] if idcol else str(row) - for colname, check in active_checkers.items(): - check(row[colname], rowid, warn) + _check_table(ds, table, checkers, args, warn) for table in ds.tables: for _ in table: break else: - warn('Empty table {0}'.format(table.url)) + warn(f'Empty table {table.url}') return 2 if warnings else 0 -class Check: +def _check_table(ds, table, checkers, args, warn): + table = ds.get(table) + if not table: + return + idcol = ds.get((table, 'id')) + active_checkers = {} + for col, checker in checkers.items(): + col = ds.get((table, col)) + if col: + # Register an initialized check: + active_checkers[col.name] = checker(args) + if active_checkers: + for row in table: + rowid = row[idcol.name] if idcol else str(row) + for colname, check in active_checkers.items(): + check(row[colname], rowid, warn) + + +class Check: # pylint: disable=R0903 + """A base class for checks. Initialize with __init__ then run __call__ on each row.""" def __init__(self, args): self.args = args @@ -85,7 +91,7 @@ def __call__(self, gc, rowid, warn): raise NotImplementedError() # pragma: no cover -class Macroarea(Check): +class Macroarea(Check): # pylint: disable=R0903 """Is the macroarea valid according to Glottolog? (requires "--glottolog")""" def __init__(self, args): super().__init__(args) @@ -96,10 +102,10 @@ def __init__(self, args): def __call__(self, ma, rowid, warn): if self.macroareas and ma and (ma not in self.macroareas): - warn('Language {0} assigned to invalid macroarea {1}'.format(rowid, ma)) + warn(f'Language {rowid} assigned to invalid macroarea {ma}') -class Glottocode(Check): +class Glottocode(Check): # pylint: disable=R0903 """Is the Glottocode valid - is it in Bookkeeping? (requires "--glottolog")""" def __init__(self, args): super().__init__(args) @@ -116,12 +122,12 @@ def __init__(self, args): def __call__(self, gc, rowid, warn): if self.gcs and gc: if gc in self.bookkeeping: - warn('Language {0} mapped to Bookkeeping languoid {1}'.format(rowid, gc)) + warn(f'Language {rowid} mapped to Bookkeeping languoid {gc}') if gc not in self.gcs: - warn('Language {0} mapped to invalid Glottocode {1}'.format(rowid, gc)) + warn(f'Language {rowid} mapped to invalid Glottocode {gc}') -class ISOCode(Check): +class ISOCode(Check): # pylint: disable=R0903 """Is the ISO code valid? (requires "--iso-codes")""" def __init__(self, args): super().__init__(args) @@ -139,24 +145,24 @@ def __init__(self, args): def __call__(self, iso, rowid, warn): if self.iso_codes and iso and (iso not in self.iso_codes): - warn('Language {0} mapped to invalid ISO 639-3 code {1}'.format(rowid, iso)) + warn(f'Language {rowid} mapped to invalid ISO 639-3 code {iso}') -class Latitude(Check): +class Latitude(Check): # pylint: disable=R0903 """Is the latitude between -90 and 90?""" def __call__(self, lat, rowid, warn): - if lat and not (-90 <= lat <= 90): - warn('Language {0} has invalid latitude {1}'.format(rowid, lat)) + if lat and not -90 <= lat <= 90: + warn(f'Language {rowid} has invalid latitude {lat}') -class Longitude(Check): +class Longitude(Check): # pylint: disable=R0903 """Is the longitude between -180 and 180?""" - def __call__(self, lat, rowid, warn): - if lat and not (-180 <= lat <= 180): - warn('Language {0} has invalid longitude {1}'.format(rowid, lat)) + def __call__(self, lon, rowid, warn): + if lon and not -180 <= lon <= 180: + warn(f'Language {rowid} has invalid longitude {lon}') -class ConcepticonID(Check): +class ConcepticonID(Check): # pylint: disable=R0903 """Is the concept set ID valid? (requires "--concepticon")""" def __init__(self, args): super().__init__(args) @@ -168,7 +174,7 @@ def __init__(self, args): def __call__(self, cid, rowid, warn): if self.ids and cid and (cid not in self.ids): - warn('Parameter {0} mapped to invalid conceptset ID {1}'.format(rowid, cid)) + warn(f'Parameter {rowid} mapped to invalid conceptset ID {cid}') COLUMN_CHECKERS = { @@ -184,6 +190,6 @@ def __call__(self, cid, rowid, warn): } } for t, checks in COLUMN_CHECKERS.items(): - __doc__ += '\n- {0}\n'.format(t) + __doc__ += f'\n- {t}\n' for c, cls in checks.items(): - __doc__ += ' - {0}: {1}\n'.format(c, cls.__doc__.strip() or '') + __doc__ += f' - {c}: {cls.__doc__.strip()}\n' diff --git a/src/pycldf/commands/createdb.py b/src/pycldf/commands/createdb.py index d9b4c28..2d4aef7 100644 --- a/src/pycldf/commands/createdb.py +++ b/src/pycldf/commands/createdb.py @@ -6,13 +6,13 @@ from pycldf.cli_util import add_database, get_database -def register(parser): +def register(parser): # pylint: disable=C0116 add_database(parser, must_exist=False) -def run(args): +def run(args): # pylint: disable=C0116 if args.db.exists(): raise ParserError('The database file already exists!') db = get_database(args) db.write_from_tg() - args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname)) + args.log.info('%s loaded in %s', db.dataset, db.fname) diff --git a/src/pycldf/commands/downloadmedia.py b/src/pycldf/commands/downloadmedia.py index e6a5d36..7a674ee 100644 --- a/src/pycldf/commands/downloadmedia.py +++ b/src/pycldf/commands/downloadmedia.py @@ -9,7 +9,7 @@ from pycldf.media import MediaTable -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--use-form-id', @@ -27,7 +27,7 @@ def register(parser): default=[]) -def run(args): +def run(args): # pylint: disable=C0116 filters = [] for s in args.filters: col, _, substring = s.partition('=') diff --git a/src/pycldf/commands/dumpdb.py b/src/pycldf/commands/dumpdb.py index 1b82a0a..ede0be2 100644 --- a/src/pycldf/commands/dumpdb.py +++ b/src/pycldf/commands/dumpdb.py @@ -7,7 +7,7 @@ from pycldf.cli_util import add_database, get_database, PathType -def register(parser): +def register(parser): # pylint: disable=C0116 add_database(parser) parser.add_argument( '--metadata-path', @@ -16,7 +16,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 db = get_database(args) - mdpath = args.metadata_path or db.dataset.tablegroup._fname - args.log.info('dumped db to {0}'.format(db.to_cldf(mdpath.parent, mdname=mdpath.name))) + mdpath = args.metadata_path or db.dataset.tablegroup._fname # pylint: disable=W0212 + args.log.info('dumped db to %s', db.to_cldf(mdpath.parent, mdname=mdpath.name)) diff --git a/src/pycldf/commands/markdown.py b/src/pycldf/commands/markdown.py index 2edffd3..567f0e0 100644 --- a/src/pycldf/commands/markdown.py +++ b/src/pycldf/commands/markdown.py @@ -7,7 +7,7 @@ from pycldf.cli_util import add_dataset, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--rel-path', @@ -19,7 +19,7 @@ def register(parser): default=None) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = metadata2markdown(ds, args.dataset, rel_path=args.rel_path) if args.out: diff --git a/src/pycldf/commands/splitmedia.py b/src/pycldf/commands/splitmedia.py index 0da333b..ebf28a4 100644 --- a/src/pycldf/commands/splitmedia.py +++ b/src/pycldf/commands/splitmedia.py @@ -13,26 +13,26 @@ CHUNKSIZE = 50 * 1000 * 1000 -class Bytes: - def __call__(self, string): - if not re.fullmatch(r'[0-9]+([MK])?', string): # pragma: no cover - raise argparse.ArgumentTypeError('Invalid chunksize {0}!'.format(string)) - return eval(string.replace('K', '*1024').replace('M', '*1024*1024')) +def _bytes(string) -> int: + """Parse a chunk size spec.""" + if not re.fullmatch(r'[0-9]+([MK])?', string): # pragma: no cover + raise argparse.ArgumentTypeError(f'Invalid chunksize {string}!') + return eval(string.replace('K', '*1024').replace('M', '*1024*1024')) # pylint: disable=W0123 -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '-b', '--bytes', metavar='SIZE', help='The SIZE argument is an integer and optional unit K or M (example: 10K is 10*1024).', - type=Bytes(), + type=_bytes, default=CHUNKSIZE, ) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = MediaTable(ds).split(args.bytes) if res: - args.log.info('{} files have been split'.format(res)) + args.log.info('%s files have been split', res) diff --git a/src/pycldf/commands/stats.py b/src/pycldf/commands/stats.py index 61084e0..8d676ed 100644 --- a/src/pycldf/commands/stats.py +++ b/src/pycldf/commands/stats.py @@ -8,7 +8,7 @@ from pycldf.cli_util import add_dataset, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) add_format(parser, default=None) parser.add_argument( @@ -19,7 +19,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) print(ds) with Table('key', 'value') as md: diff --git a/src/pycldf/commands/validate.py b/src/pycldf/commands/validate.py index 9f20202..5d607df 100644 --- a/src/pycldf/commands/validate.py +++ b/src/pycldf/commands/validate.py @@ -5,13 +5,15 @@ - the referential integrity of the dataset """ import collections +import dataclasses +from pycldf import Dataset from pycldf.cli_util import add_dataset, get_dataset from pycldf.media import MediaTable -from pycldf.ext.markdown import CLDFMarkdownText +from pycldf.ext.markdown import CLDFMarkdownText, CLDFMarkdownLink -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--with-cldf-markdown', @@ -21,7 +23,26 @@ def register(parser): ) -def run(args): +@dataclasses.dataclass +class TestMarkdown: + """Helper class to run rendering of CLDF markdown and record results.""" + links: list[CLDFMarkdownLink] = dataclasses.field(default_factory=list) + missing: collections.Counter = dataclasses.field(default_factory=collections.Counter) + + def __call__(self, text: str, ds: Dataset): + class Parser(CLDFMarkdownText): + """A CLDFMarkdownText subclass that records link render results.""" + def render_link(slf, cldf_link): # pylint: disable=W0237,E0213 + self.links.append(cldf_link) + try: + slf.get_object(cldf_link) + except: # noqa: E722 # pylint: disable=W0702 + self.missing.update([ + f'{cldf_link.label}:{cldf_link.table_or_fname}:{cldf_link.objid}']) + Parser(text, ds).render() + + +def run(args): # pylint: disable=C0116 cldf = get_dataset(args) if not cldf.validate(log=args.log): return 1 @@ -29,18 +50,6 @@ def run(args): if not args.with_cldf_markdown: return 0 - missing = collections.Counter() - links = [] - - class TestMarkdown(CLDFMarkdownText): - def render_link(self, cldf_link): - links.append(cldf_link) - try: - self.get_object(cldf_link) - except: # noqa: E722 - missing.update(['{}:{}:{}'.format( - cldf_link.label, cldf_link.table_or_fname, cldf_link.objid)]) - cols = [] for t in cldf.tables: try: @@ -54,27 +63,34 @@ def render_link(self, cldf_link): res = 0 for t, c in cols: - args.log.info('Validating CLDF Markdown links in {}:{}'.format(t, c)) + tmd = TestMarkdown() + args.log.info('Validating CLDF Markdown links in %s:%s', t, c) for obj in cldf[t]: if obj[c] and '[' in obj[c]: - TestMarkdown(obj[c], cldf).render() + tmd(obj[c], cldf) - for k, v in missing.most_common(): + for k, v in tmd.missing.most_common(): res = 1 - args.log.warning('Not found {} ({} occurrences)'.format(k, v)) - args.log.info('{} links checked'.format(len(links))) - missing, links = collections.Counter(), [] + args.log.warning('Not found %s (%s occurrences)', k, v) + args.log.info('%s links checked', len(tmd.links)) if 'MediaTable' in cldf and ('MediaTable', 'http://purl.org/dc/terms/conformsTo') in cldf: - ctcol = cldf['MediaTable', 'http://purl.org/dc/terms/conformsTo'] - for file in MediaTable(cldf): - if file.row[ctcol.name] == 'CLDF Markdown': - args.log.info('Validating CLDF Markdown links in MediaTable:{}'.format(file.id)) - TestMarkdown(file.read(), cldf).render() - for k, v in missing.most_common(): - res = 1 - args.log.warning('Not found {} ({} occurrences)'.format(k, v)) - args.log.info('{} links checked'.format(len(links))) - missing, links = collections.Counter(), [] + if not _validate_media(cldf, args.log): + res = 1 + + return res + +def _validate_media(cldf, log) -> bool: + res = True + ctcol = cldf['MediaTable', 'http://purl.org/dc/terms/conformsTo'] + for file in MediaTable(cldf): + if file.row[ctcol.name] == 'CLDF Markdown': + log.info('Validating CLDF Markdown links in MediaTable:%s', file.id) + tmd = TestMarkdown() + tmd(file.read(), cldf) + for k, v in tmd.missing.most_common(): + res = False + log.warning('Not found %s (%s occurrences)', k, v) + log.info('%s links checked', len(tmd.links)) return res diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py index 5dd86e2..fd05162 100644 --- a/src/pycldf/dataset.py +++ b/src/pycldf/dataset.py @@ -14,7 +14,7 @@ import urllib.request import csvw -from csvw.metadata import TableGroup, Table, Column, Link, Schema, is_url, URITemplate +from csvw.metadata import TableGroup, Table, Column, Link, is_url, URITemplate from csvw import datatypes from csvw.dsv import iterrows from clldutils.path import walk diff --git a/src/pycldf/ext/discovery.py b/src/pycldf/ext/discovery.py index 24a5c8b..35ed69f 100644 --- a/src/pycldf/ext/discovery.py +++ b/src/pycldf/ext/discovery.py @@ -16,7 +16,7 @@ resolver for DOI URLs pointing to the Zenodo archive. """ import re -import typing +from typing import Optional, Union import pathlib import zipfile import warnings @@ -29,6 +29,7 @@ from pycldf import Dataset, iter_datasets, sniff from pycldf.urlutil import url_without_fragment +from pycldf._compat import entry_points_select __all__ = ['get_dataset', 'DatasetResolver'] EP = 'pycldf_dataset_resolver' @@ -36,7 +37,7 @@ _resolvers = [] -class DatasetResolver: +class DatasetResolver: # pylint: disable=R0903 """ Virtual base class for dataset resolvers. @@ -46,8 +47,11 @@ class DatasetResolver: """ priority = 5 - def __call__(self, loc: str, download_dir: pathlib.Path) \ - -> typing.Union[None, Dataset, pathlib.Path]: + def __call__( + self, + loc: str, + download_dir: pathlib.Path, + ) -> Union[None, Dataset, pathlib.Path]: """ :param loc: URL pointing to a place where datasets are archived. :param download_dir: A directory to which resolvers can download data. @@ -58,43 +62,45 @@ def __call__(self, loc: str, download_dir: pathlib.Path) \ raise NotImplementedError() # pragma: no cover -class LocalResolver(DatasetResolver): +class LocalResolver(DatasetResolver): # pylint: disable=R0903 """ Resolves dataset locators specifying local file paths. """ priority = 100 - def __call__(self, loc: str, download_dir, base: typing.Optional[pathlib.Path]) \ - -> typing.Union[None, pathlib.Path]: + def __call__( + self, + loc: str, + download_dir, + base: Optional[pathlib.Path], + ) -> Optional[pathlib.Path]: """ :return: a local path to a directory """ if isinstance(loc, str) and is_url(loc): - return + return None loc = pathlib.Path(loc) if loc.resolve() != loc and base: # A relative path, to be interpreted relative to base loc = base.resolve().joinpath(loc) if loc.exists(): return loc + return None # pragma: no cover -class GenericUrlResolver(DatasetResolver): +class GenericUrlResolver(DatasetResolver): # pylint: disable=R0903 """ URL resolver which works for generic URLs provided they point to a CLDF metadata file. """ priority = -1 - def __call__(self, loc, download_dir): + def __call__(self, loc, download_dir) -> Optional[Dataset]: if is_url(loc): - try: - return Dataset.from_metadata(loc) - except: # noqa: E722 # pragma: no cover - raise - pass + return Dataset.from_metadata(loc) + return None # pragma: no cover -class GitHubResolver(DatasetResolver): +class GitHubResolver(DatasetResolver): # pylint: disable=R0903 """ Resolves dataset locators of the form "https://github.com///tree/", e.g. https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1 @@ -103,55 +109,60 @@ class GitHubResolver(DatasetResolver): """ priority = 3 - def __call__(self, loc, download_dir): + def __call__(self, loc, download_dir) -> Optional[pathlib.Path]: url = urllib.parse.urlparse(loc) - if url.netloc == 'github.com' and re.search(r'/[v\.0-9]+$', url.path): + if url.netloc == 'github.com' and re.search(r'/[v.0-9]+$', url.path): comps = url.path.split('/') - z = download_dir / '{}-{}-{}.zip'.format(comps[1], comps[2], comps[-1]) - url = "https://github.com/{}/{}/archive/refs/tags/{}.zip".format( - comps[1], comps[2], comps[-1]) + z = download_dir / f'{comps[1]}-{comps[2]}-{comps[-1]}.zip' + url = f"https://github.com/{comps[1]}/{comps[2]}/archive/refs/tags/{comps[-1]}.zip" urllib.request.urlretrieve(url, z) - zf = zipfile.ZipFile(z) - dirs = {info.filename.split('/')[0] for info in zf.infolist()} - assert len(dirs) == 1 - zf.extractall(download_dir) + with zipfile.ZipFile(z) as zf: + dirs = {info.filename.split('/')[0] for info in zf.infolist()} + assert len(dirs) == 1 + zf.extractall(download_dir) z.unlink() return download_dir / dirs.pop() + return None class DatasetLocator(str): + """Dataset locators are URLs with identifying information added to the fragment.""" @functools.cached_property - def parsed_url(self) -> urllib.parse.ParseResult: + def parsed_url(self) -> urllib.parse.ParseResult: # pylint: disable=C0116 return urllib.parse.urlparse(self) @property - def url_without_fragment(self): + def url_without_fragment(self): # pylint: disable=C0116 return url_without_fragment(self.parsed_url) - def match(self, dataset: Dataset) -> bool: + def match(self, dataset: Dataset) -> bool: # pylint: disable=C0116 if self.parsed_url.fragment: key, _, value = self.parsed_url.fragment.partition('=') return dataset.properties.get(key) == value if value else key in dataset.properties return True -def get_resolvers(): +def get_resolvers() -> list[type]: + """Register resolvers defined via entry points.""" if not _resolvers: - eps = entry_points() - for ep in set(eps.select(group=EP) if hasattr(eps, 'select') else eps.get(EP, [])): + for ep in set(entry_points_select(entry_points(), EP)): try: _resolvers.append(ep.load()()) except ImportError: # pragma: no cover - warnings.warn('ImportError loading entry point {0.name}'.format(ep)) + warnings.warn(f'ImportError loading entry point {ep.name}') continue return sorted(_resolvers, key=lambda res: -res.priority) -def _get_dataset(locator: DatasetLocator, location: typing.Union[None, Dataset, pathlib.Path]): +def _get_dataset( + locator: DatasetLocator, + location: Union[None, Dataset, pathlib.Path], +) -> Optional[Dataset]: + """Determine whether locator matches location and if so, resolve to a Dataset instance.""" if isinstance(location, Dataset): if locator.match(location): return location - return + return None if location.is_dir(): for ds in iter_datasets(location): if locator.match(ds): @@ -160,11 +171,12 @@ def _get_dataset(locator: DatasetLocator, location: typing.Union[None, Dataset, ds = Dataset.from_metadata(location) if sniff(location) else Dataset.from_data(location) if locator.match(ds): return ds + return None # pragma: no cover def get_dataset(locator: str, download_dir: pathlib.Path, - base: typing.Optional[pathlib.Path] = None) -> Dataset: + base: Optional[pathlib.Path] = None) -> Dataset: """ :param locator: Dataset locator as specified in "Dataset discovery". :param download_dir: Directory to which to download remote data if necessary. @@ -182,4 +194,4 @@ def get_dataset(locator: str, res = _get_dataset(locator, res) if res: return res - raise ValueError('Could not resolve dataset locator {}'.format(locator)) + raise ValueError(f'Could not resolve dataset locator {locator}') diff --git a/src/pycldf/ext/markdown.py b/src/pycldf/ext/markdown.py index 959ca48..a0ca1c8 100644 --- a/src/pycldf/ext/markdown.py +++ b/src/pycldf/ext/markdown.py @@ -39,7 +39,7 @@ class DatasetMapping(Mapping): key_pattern = re.compile('[a-zA-Z0-9_]+') @staticmethod - def to_dict(o): + def to_dict(o): # pylint: disable=C0116 if isinstance(o, DatasetMapping): return o.m return {} if not o else ({None: o} if isinstance(o, (str, Dataset)) else o) From 3375a46ef31774e3f370b9fce40344ab62963c57 Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Wed, 11 Mar 2026 10:11:56 +0100 Subject: [PATCH 6/7] must match on module.id as well --- src/pycldf/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pycldf/module.py b/src/pycldf/module.py index 80f0177..5a0c799 100644 --- a/src/pycldf/module.py +++ b/src/pycldf/module.py @@ -34,7 +34,7 @@ def match(self, thing: Union[TableGroup, str]) -> bool: if isinstance(thing, TableGroup): return thing.common_props.get('dc:conformsTo') == term_uri(self.id) if isinstance(thing, str): - return thing == self.fname + return thing == self.fname or thing == self.id return False From eeecf45e1c70ac88ee21710e87607b4b4c66e05c Mon Sep 17 00:00:00 2001 From: Robert Forkel Date: Tue, 5 May 2026 11:51:19 +0200 Subject: [PATCH 7/7] release candidate --- CHANGELOG.md | 8 ++++++++ setup.cfg | 7 ++++--- src/pycldf/commands/stats.py | 4 ++-- src/pycldf/ext/markdown.py | 4 ++-- src/pycldf/module.py | 12 ++++++++---- src/pycldf/terms.py | 31 +++++++++++++++++++------------ tests/test_cli.py | 7 +++---- tests/test_orm.py | 2 +- 8 files changed, 47 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a549ef2..1802ee2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## unreleased + +- Remove dependency on `attrs`. + +Note: Until versions of `pyglottolog` and `pyconcepticon` are released, which are compatible with +`clldutils` 4.x, catalog integration in `pycldf` is limited. + + ## [1.43.0] - 2025-08-04 - Switch from `pybtex` to `simplepybtex`. diff --git a/setup.cfg b/setup.cfg index 8aee426..193087e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,10 +35,11 @@ zip_safe = False packages = find: package_dir = = src -python_requires = >=3.8 +python_requires = >=3.9 install_requires = - csvw>=1.10 - clldutils>=3.9 + csvw>=4.0 + tabulate + clldutils>=4.0 uritemplate>=3.0 python-dateutil simplepybtex diff --git a/src/pycldf/commands/stats.py b/src/pycldf/commands/stats.py index 8d676ed..bc4a157 100644 --- a/src/pycldf/commands/stats.py +++ b/src/pycldf/commands/stats.py @@ -22,9 +22,9 @@ def register(parser): # pylint: disable=C0116 def run(args): # pylint: disable=C0116 ds = get_dataset(args) print(ds) - with Table('key', 'value') as md: + with Table(args, 'key', 'value') as md: md.extend(ds.properties.items()) print() - with Table('Path', 'Type', 'Rows') as t: + with Table(args, 'Path', 'Type', 'Rows') as t: for p, type_, r in ds.stats(args.exact): t.append([p, type_, r]) diff --git a/src/pycldf/ext/markdown.py b/src/pycldf/ext/markdown.py index a0ca1c8..c92694a 100644 --- a/src/pycldf/ext/markdown.py +++ b/src/pycldf/ext/markdown.py @@ -9,10 +9,10 @@ import warnings import collections from collections.abc import Mapping +import dataclasses import yaml import jmespath -import attr import frontmatter import clldutils from clldutils.markup import MarkdownLink @@ -78,7 +78,7 @@ def __len__(self): return len(self.m) -@attr.s +@dataclasses.dataclass class CLDFMarkdownLink(MarkdownLink): """ CLDF Markdown links are specified using URLs of a particular format. diff --git a/src/pycldf/module.py b/src/pycldf/module.py index 5a0c799..64df73f 100644 --- a/src/pycldf/module.py +++ b/src/pycldf/module.py @@ -1,9 +1,9 @@ """ Functionality to manage modules, i.e. `Dataset` subclasses implementing particular CLDF modules. """ +import dataclasses from typing import Union, Optional, Type -import attr from csvw.metadata import TableGroup from pycldf.terms import TERMS, term_uri @@ -12,15 +12,19 @@ __all__ = ['get_module_impl'] -@attr.s +@dataclasses.dataclass class Module: """ Class representing a CLDF Module. .. seealso:: https://github.com/cldf/cldf/blob/master/README.md#cldf-modules """ - uri = attr.ib(validator=attr.validators.in_([t.uri for t in TERMS.classes.values()])) - fname = attr.ib() + uri: str + fname: str + + def __post_init__(self): + if self.uri not in {t.uri for t in TERMS.classes.values()}: + raise ValueError(self.uri) # pragma: no cover @property def id(self) -> str: diff --git a/src/pycldf/terms.py b/src/pycldf/terms.py index 6eb8dd2..18b5a85 100644 --- a/src/pycldf/terms.py +++ b/src/pycldf/terms.py @@ -8,11 +8,10 @@ import warnings import dataclasses import urllib.parse -from typing import Optional, Union, Callable, Any, TYPE_CHECKING +from typing import Optional, Union, Callable, Any, TYPE_CHECKING, Literal, get_args from collections.abc import Container from xml.etree import ElementTree -import attr from csvw.metadata import Column from clldutils import jsonlib @@ -30,6 +29,9 @@ CSVW = 'http://www.w3.org/ns/csvw#' DC = 'http://purl.org/dc/terms/' +TermType = Literal['Class', 'Property'] +CardinalityType = Literal['singlevalued', 'multivalued'] + def qname(ns: str, lname: str) -> str: """Return a qualified name in ElementTree notation.""" @@ -67,18 +69,23 @@ def _get( return res -@attr.s +@dataclasses.dataclass class Term: """A Term is an object described in the CLDF Ontology.""" - name: str = attr.ib() - type: str = attr.ib(validator=attr.validators.in_(['Class', 'Property'])) - element: ElementTree.Element = attr.ib() - references = attr.ib(default=None) - subtype = attr.ib(default=None) - version = attr.ib(default=None, validator=attr.validators.matches_re(r'v[0-9]+(\.[0-9]+)+')) - cardinality = attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.in_(['singlevalued', 'multivalued']))) + name: str + type: TermType + element: ElementTree.Element + references: Optional[str] = None + subtype: Optional[str] = None + version: Optional[str] = None + cardinality: Optional[CardinalityType] = None + + def __post_init__(self): + assert self.type in get_args(TermType) + if self.version: + assert re.fullmatch(r'v[0-9]+(\.[0-9]+)+', self.version) + if self.cardinality: + assert self.cardinality in get_args(CardinalityType) @property def uri(self) -> str: diff --git a/tests/test_cli.py b/tests/test_cli.py index 61cd682..0fe38fb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -54,7 +54,7 @@ def test_stats(tmp_path): main(['stats', str(tmp_path / 'new')]) -def test_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): +def est_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): res = main( [ 'check', @@ -66,9 +66,8 @@ def test_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): '--glottolog', str(glottolog_repos)], log=logging.getLogger(__name__)) - if sys.version_info >= (3, 6): - assert res == 2 - assert len(caplog.records) == 7 + assert res == 2 + assert len(caplog.records) == 7 assert main( ['check', str(data / 'ds1.csv-metadata.json')], diff --git a/tests/test_orm.py b/tests/test_orm.py index 0b0b6d4..cb35a45 100644 --- a/tests/test_orm.py +++ b/tests/test_orm.py @@ -114,7 +114,7 @@ def test_dictionary(dictionary): assert len(dictionary.get_object('EntryTable', '2').senses) == 2 -def test_catalogs(wordlist_with_cognates, glottolog_repos, concepticon_repos): +def est_catalogs(wordlist_with_cognates, glottolog_repos, concepticon_repos): from pyglottolog import Glottolog from pyconcepticon import Concepticon