diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3dd19c1..8f0f9e9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,12 +12,12 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", 3.11, 3.12] + python-version: ["3.10", 3.11, 3.12, 3.13] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/CHANGELOG.md b/CHANGELOG.md index d54dd30..8e88f75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## unreleased + +- Remove dependency on `attrs`. + +Note: Until versions of `pyglottolog` and `pyconcepticon` compatible with `clldutils` 4.x are +released, catalog integration in `pycldf` is limited. + + ## [1.43.1] - 2026-03-25 Pin dependencies `csvw` and `clldutils`, since these will get incompatible new major versions. diff --git a/RELEASING.md b/RELEASING.md index b108f7f..527a983 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -12,6 +12,10 @@ Releasing pycldf ```shell flake8 src ``` +- Make sure pylint passes with a score of 10: + ```shell + pylint src + ``` - Make sure the docs render: ```shell diff --git a/setup.cfg b/setup.cfg index a428afc..2a401ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,12 +20,12 @@ classifiers = Natural Language :: English Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy License :: OSI Approved :: Apache Software License @@ -35,10 +35,11 @@ zip_safe = False packages = find: package_dir = = src -python_requires = >=3.8 +python_requires = >=3.9 install_requires = - csvw<4 - clldutils<4 + csvw>=4.0 + tabulate + clldutils>=4.0 uritemplate>=3.0 python-dateutil simplepybtex @@ -83,7 +84,6 @@ test = pyconcepticon pytest>=5 pytest-mock - requests-mock pytest-cov coverage>=4.2 docs = @@ -117,7 +117,7 @@ show_missing = true skip_covered = true [tox:tox] -envlist = py3.8, py39, py310, py311, py312, py313 +envlist = py39, py310, py311, py312, py313, py314 isolated_build = true skip_missing_interpreter = true diff --git a/src/pycldf/__init__.py b/src/pycldf/__init__.py index e6c77d7..85b0266 100644 --- a/src/pycldf/__init__.py +++ b/src/pycldf/__init__.py @@ -1,3 +1,8 @@ +""" +The `pycldf` package provides the reference implementation for the CLDF standard. + +https://cldf.clld.org +""" from pycldf.dataset import * from pycldf.db import * from pycldf.sources import * diff --git a/src/pycldf/__main__.py b/src/pycldf/__main__.py index fe764ac..ca344cb 100644 --- a/src/pycldf/__main__.py +++ b/src/pycldf/__main__.py @@ -1,5 +1,10 @@ +""" +CLI for the `pycldf` package. 
+""" import csv import sys +from typing import Optional, Sequence +import logging import contextlib from clldutils.clilib import ( @@ -10,7 +15,15 @@ import pycldf.commands -def main(args=None, catch_all=False, parsed_args=None, log=None): +def main( + args: Sequence[str] = None, + catch_all: bool = False, + parsed_args: list = None, + log: Optional[logging.Logger] = None, +) -> Optional[int]: + """ + Implements the main command, dispatches to subcommands. + """ parser, subparsers = get_parser_and_subparsers('cldf') add_csv_field_size_limit(parser, default=csv.field_size_limit()) register_subcommands(subparsers, pycldf.commands) @@ -32,7 +45,7 @@ def main(args=None, catch_all=False, parsed_args=None, log=None): return 0 except ParserError as e: print(colored(str(e), 'red')) - return main([args._command, '-h']) + return main([args._command, '-h']) # pylint: disable=protected-access except Exception as e: # pragma: no cover if catch_all: print(e) diff --git a/src/pycldf/_compat.py b/src/pycldf/_compat.py new file mode 100644 index 0000000..f63a117 --- /dev/null +++ b/src/pycldf/_compat.py @@ -0,0 +1,15 @@ +""" +Backwards compatibility with supported python versions. +""" +import sys + + +if (sys.version_info.major, sys.version_info.minor) >= (3, 10): # pragma: no cover + def entry_points_select(eps, group): + """ + Staring with Python 3.10, `importlib.metadata.entry_points` returns `EntryPoints`.""" + return eps.select(group=group) +else: + def entry_points_select(eps, group): # pragma: no cover + """In Python 3.9, `importlib.metadata.entry_points` returns a `dict`.""" + return eps.get(group, []) diff --git a/src/pycldf/cli_util.py b/src/pycldf/cli_util.py index 8e2a488..ddf179e 100644 --- a/src/pycldf/cli_util.py +++ b/src/pycldf/cli_util.py @@ -31,13 +31,17 @@ def strtobool(val: str) -> int: # pragma: no cover val = val.lower() if val in ('y', 'yes', 't', 'true', 'on', '1'): return 1 - elif val in ('n', 'no', 'f', 'false', 'off', '0'): + if val in ('n', 'no', 'f', 'false', 'off', '0'): return 0 - else: - raise ValueError("invalid truth value %r" % (val,)) + raise ValueError(f"invalid truth value {val}") -class FlagOrPathType(PathType): +class FlagOrPathType(PathType): # pylint: disable=too-few-public-methods + """ + Argument type allowing input of a path or a boolean. + + The boolean can be used to determine whether to download a file from a known location. + """ def __call__(self, string): try: return bool(strtobool(string)) @@ -45,8 +49,10 @@ def __call__(self, string): return super().__call__(string) -def http_head_status(url): # pragma: no cover +def http_head_status(url: str) -> int: # pragma: no cover + """Do a HEAD request for `url` to determine its status.""" class NoRedirection(urllib.request.HTTPErrorProcessor): + """Don't follow redirects.""" def http_response(self, request, response): return response @@ -56,22 +62,22 @@ def http_response(self, request, response): return opener.open(urllib.request.Request(url, method="HEAD")).status -class UrlOrPathType(PathType): - def __call__(self, string): +class UrlOrPathType(PathType): # pylint: disable=too-few-public-methods + """Type suitable for argparse arguments, allowing input of URL or local file path.""" + def __call__(self, string: str) -> str: if is_url(string): if self._must_exist: sc = http_head_status(string) # We accept not only HTTP 200 as valid but also common redirection codes because # these are used e.g. for DOIs. 
if sc not in {200, 301, 302}: - raise argparse.ArgumentTypeError( - 'URL {} does not exist [HTTP {}]!'.format(string, sc)) + raise argparse.ArgumentTypeError(f'URL {string} does not exist [HTTP {sc}]!') return string super().__call__(string.partition('#')[0]) return string -def add_dataset(parser: argparse.ArgumentParser): +def add_dataset(parser: argparse.ArgumentParser) -> None: """ Adds a positional argument named `dataset` to the parser to specify a CLDF dataset. """ @@ -101,11 +107,17 @@ def get_dataset(args: argparse.Namespace) -> Dataset: except TypeError as e: # pragma: no cover if 'PathLike' in str(e): raise ParserError( - 'The dataset locator may require downloading, so you should specify --download-dir') + 'The dataset locator may require downloading, so you should specify --download-dir' + ) from e raise -def add_database(parser, must_exist=True): +def add_database(parser: argparse.ArgumentParser, must_exist: bool = True) -> None: + """ + Add CLI arguments to specify a CLDF SQLite database. + + Retrieve in the `run` function of a command using `get_database` (see below). + """ add_dataset(parser) parser.add_argument( 'db', @@ -116,17 +128,21 @@ def add_database(parser, must_exist=True): parser.add_argument('--infer-primary-keys', action='store_true', default=False) -def get_database(args): +def get_database(args: argparse.Namespace) -> Database: + """ + Retrieve a `Database` instance based on CLI input in `args` (see `add_database`). + """ return Database(get_dataset(args), fname=args.db, infer_primary_keys=args.infer_primary_keys) -def add_catalog_spec(parser, name): +def add_catalog_spec(parser: argparse.ArgumentParser, name: str) -> None: + """Add CLI arguments suitable to specify a catalog.""" parser.add_argument( '--' + name, metavar=name.upper(), type=PathType(type='dir'), - help='Path to repository clone of {0} data'.format(name.capitalize())) + help=f'Path to repository clone of {name.capitalize()} data') parser.add_argument( - '--{0}-version'.format(name), - help='Version of {0} data to checkout'.format(name.capitalize()), + f'--{name}-version', + help=f'Version of {name.capitalize()} data to checkout', default=None) diff --git a/src/pycldf/commands/catmedia.py b/src/pycldf/commands/catmedia.py index a14c6b0..c629fd7 100644 --- a/src/pycldf/commands/catmedia.py +++ b/src/pycldf/commands/catmedia.py @@ -5,12 +5,12 @@ from pycldf.media import MediaTable -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = MediaTable(ds).cat() if res: - args.log.info('{} files have been recombined'.format(res)) + args.log.info(f'{res} files have been recombined') diff --git a/src/pycldf/commands/check.py b/src/pycldf/commands/check.py index 8e7930e..e93a894 100644 --- a/src/pycldf/commands/check.py +++ b/src/pycldf/commands/check.py @@ -18,7 +18,7 @@ Catalog, Glottolog, Concepticon = None, None, None -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) add_catalog_spec(parser, 'glottolog') add_catalog_spec(parser, 'concepticon') @@ -32,7 +32,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 if not Catalog: # pragma: no cover print('\nThis command only works with catalogs installed.\n' 'Run "pip install pycldf[catalogs]" to do so.\n') @@ -53,31 +53,37 @@ def warn(msg): stack.enter_context(Catalog(args.concepticon, tag=args.concepticon_version)) for table, checkers in COLUMN_CHECKERS.items(): - table 
= ds.get(table) - if table: - idcol = ds.get((table, 'id')) - active_checkers = {} - for col, checker in checkers.items(): - col = ds.get((table, col)) - if col: - # Register an initialized check: - active_checkers[col.name] = checker(args) - if active_checkers: - for row in table: - rowid = row[idcol.name] if idcol else str(row) - for colname, check in active_checkers.items(): - check(row[colname], rowid, warn) + _check_table(ds, table, checkers, args, warn) for table in ds.tables: for _ in table: break else: - warn('Empty table {0}'.format(table.url)) + warn(f'Empty table {table.url}') return 2 if warnings else 0 -class Check: +def _check_table(ds, table, checkers, args, warn): + table = ds.get(table) + if not table: + return + idcol = ds.get((table, 'id')) + active_checkers = {} + for col, checker in checkers.items(): + col = ds.get((table, col)) + if col: + # Register an initialized check: + active_checkers[col.name] = checker(args) + if active_checkers: + for row in table: + rowid = row[idcol.name] if idcol else str(row) + for colname, check in active_checkers.items(): + check(row[colname], rowid, warn) + + +class Check: # pylint: disable=R0903 + """A base class for checks. Initialize with __init__ then run __call__ on each row.""" def __init__(self, args): self.args = args @@ -85,7 +91,7 @@ def __call__(self, gc, rowid, warn): raise NotImplementedError() # pragma: no cover -class Macroarea(Check): +class Macroarea(Check): # pylint: disable=R0903 """Is the macroarea valid according to Glottolog? (requires "--glottolog")""" def __init__(self, args): super().__init__(args) @@ -96,10 +102,10 @@ def __init__(self, args): def __call__(self, ma, rowid, warn): if self.macroareas and ma and (ma not in self.macroareas): - warn('Language {0} assigned to invalid macroarea {1}'.format(rowid, ma)) + warn(f'Language {rowid} assigned to invalid macroarea {ma}') -class Glottocode(Check): +class Glottocode(Check): # pylint: disable=R0903 """Is the Glottocode valid - is it in Bookkeeping? (requires "--glottolog")""" def __init__(self, args): super().__init__(args) @@ -116,12 +122,12 @@ def __init__(self, args): def __call__(self, gc, rowid, warn): if self.gcs and gc: if gc in self.bookkeeping: - warn('Language {0} mapped to Bookkeeping languoid {1}'.format(rowid, gc)) + warn(f'Language {rowid} mapped to Bookkeeping languoid {gc}') if gc not in self.gcs: - warn('Language {0} mapped to invalid Glottocode {1}'.format(rowid, gc)) + warn(f'Language {rowid} mapped to invalid Glottocode {gc}') -class ISOCode(Check): +class ISOCode(Check): # pylint: disable=R0903 """Is the ISO code valid? 
(requires "--iso-codes")""" def __init__(self, args): super().__init__(args) @@ -139,24 +145,24 @@ def __init__(self, args): def __call__(self, iso, rowid, warn): if self.iso_codes and iso and (iso not in self.iso_codes): - warn('Language {0} mapped to invalid ISO 639-3 code {1}'.format(rowid, iso)) + warn(f'Language {rowid} mapped to invalid ISO 639-3 code {iso}') -class Latitude(Check): +class Latitude(Check): # pylint: disable=R0903 """Is the latitude between -90 and 90?""" def __call__(self, lat, rowid, warn): - if lat and not (-90 <= lat <= 90): - warn('Language {0} has invalid latitude {1}'.format(rowid, lat)) + if lat and not -90 <= lat <= 90: + warn(f'Language {rowid} has invalid latitude {lat}') -class Longitude(Check): +class Longitude(Check): # pylint: disable=R0903 """Is the longitude between -180 and 180?""" - def __call__(self, lat, rowid, warn): - if lat and not (-180 <= lat <= 180): - warn('Language {0} has invalid longitude {1}'.format(rowid, lat)) + def __call__(self, lon, rowid, warn): + if lon and not -180 <= lon <= 180: + warn(f'Language {rowid} has invalid longitude {lon}') -class ConcepticonID(Check): +class ConcepticonID(Check): # pylint: disable=R0903 """Is the concept set ID valid? (requires "--concepticon")""" def __init__(self, args): super().__init__(args) @@ -168,7 +174,7 @@ def __init__(self, args): def __call__(self, cid, rowid, warn): if self.ids and cid and (cid not in self.ids): - warn('Parameter {0} mapped to invalid conceptset ID {1}'.format(rowid, cid)) + warn(f'Parameter {rowid} mapped to invalid conceptset ID {cid}') COLUMN_CHECKERS = { @@ -184,6 +190,6 @@ def __call__(self, cid, rowid, warn): } } for t, checks in COLUMN_CHECKERS.items(): - __doc__ += '\n- {0}\n'.format(t) + __doc__ += f'\n- {t}\n' for c, cls in checks.items(): - __doc__ += ' - {0}: {1}\n'.format(c, cls.__doc__.strip() or '') + __doc__ += f' - {c}: {cls.__doc__.strip()}\n' diff --git a/src/pycldf/commands/createdb.py b/src/pycldf/commands/createdb.py index d9b4c28..2d4aef7 100644 --- a/src/pycldf/commands/createdb.py +++ b/src/pycldf/commands/createdb.py @@ -6,13 +6,13 @@ from pycldf.cli_util import add_database, get_database -def register(parser): +def register(parser): # pylint: disable=C0116 add_database(parser, must_exist=False) -def run(args): +def run(args): # pylint: disable=C0116 if args.db.exists(): raise ParserError('The database file already exists!') db = get_database(args) db.write_from_tg() - args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname)) + args.log.info('%s loaded in %s', db.dataset, db.fname) diff --git a/src/pycldf/commands/downloadmedia.py b/src/pycldf/commands/downloadmedia.py index d9b8c2d..7a674ee 100644 --- a/src/pycldf/commands/downloadmedia.py +++ b/src/pycldf/commands/downloadmedia.py @@ -9,7 +9,7 @@ from pycldf.media import MediaTable -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--use-form-id', @@ -27,11 +27,15 @@ def register(parser): default=[]) -def run(args): +def run(args): # pylint: disable=C0116 filters = [] for s in args.filters: col, _, substring = s.partition('=') filters.append((col, substring)) - for item in MediaTable(get_dataset(args), args.use_form_id): + media_table = MediaTable(get_dataset(args)) + if args.use_form_id: + media_table.filename_col = media_table.ds[ + media_table.component, 'http://cldf.clld.org/v1.0/terms.rdf#formReference'] + for item in media_table: if all(substring in item[col] for col, substring in filters): item.save(args.output) diff 
--git a/src/pycldf/commands/dumpdb.py b/src/pycldf/commands/dumpdb.py index 1b82a0a..ede0be2 100644 --- a/src/pycldf/commands/dumpdb.py +++ b/src/pycldf/commands/dumpdb.py @@ -7,7 +7,7 @@ from pycldf.cli_util import add_database, get_database, PathType -def register(parser): +def register(parser): # pylint: disable=C0116 add_database(parser) parser.add_argument( '--metadata-path', @@ -16,7 +16,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 db = get_database(args) - mdpath = args.metadata_path or db.dataset.tablegroup._fname - args.log.info('dumped db to {0}'.format(db.to_cldf(mdpath.parent, mdname=mdpath.name))) + mdpath = args.metadata_path or db.dataset.tablegroup._fname # pylint: disable=W0212 + args.log.info('dumped db to %s', db.to_cldf(mdpath.parent, mdname=mdpath.name)) diff --git a/src/pycldf/commands/markdown.py b/src/pycldf/commands/markdown.py index 296eb4d..567f0e0 100644 --- a/src/pycldf/commands/markdown.py +++ b/src/pycldf/commands/markdown.py @@ -3,11 +3,11 @@ """ from clldutils.clilib import PathType -from pycldf.util import metadata2markdown +from pycldf.markdown import metadata2markdown from pycldf.cli_util import add_dataset, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--rel-path', @@ -19,7 +19,7 @@ def register(parser): default=None) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = metadata2markdown(ds, args.dataset, rel_path=args.rel_path) if args.out: diff --git a/src/pycldf/commands/splitmedia.py b/src/pycldf/commands/splitmedia.py index 0da333b..ebf28a4 100644 --- a/src/pycldf/commands/splitmedia.py +++ b/src/pycldf/commands/splitmedia.py @@ -13,26 +13,26 @@ CHUNKSIZE = 50 * 1000 * 1000 -class Bytes: - def __call__(self, string): - if not re.fullmatch(r'[0-9]+([MK])?', string): # pragma: no cover - raise argparse.ArgumentTypeError('Invalid chunksize {0}!'.format(string)) - return eval(string.replace('K', '*1024').replace('M', '*1024*1024')) +def _bytes(string) -> int: + """Parse a chunk size spec.""" + if not re.fullmatch(r'[0-9]+([MK])?', string): # pragma: no cover + raise argparse.ArgumentTypeError(f'Invalid chunksize {string}!') + return eval(string.replace('K', '*1024').replace('M', '*1024*1024')) # pylint: disable=W0123 -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '-b', '--bytes', metavar='SIZE', help='The SIZE argument is an integer and optional unit K or M (example: 10K is 10*1024).', - type=Bytes(), + type=_bytes, default=CHUNKSIZE, ) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = MediaTable(ds).split(args.bytes) if res: - args.log.info('{} files have been split'.format(res)) + args.log.info('%s files have been split', res) diff --git a/src/pycldf/commands/stats.py b/src/pycldf/commands/stats.py index 61084e0..bc4a157 100644 --- a/src/pycldf/commands/stats.py +++ b/src/pycldf/commands/stats.py @@ -8,7 +8,7 @@ from pycldf.cli_util import add_dataset, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) add_format(parser, default=None) parser.add_argument( @@ -19,12 +19,12 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) print(ds) - with Table('key', 'value') as md: + with Table(args, 'key', 'value') as md: md.extend(ds.properties.items()) print() - with Table('Path', 'Type', 'Rows') as t: + with 
Table(args, 'Path', 'Type', 'Rows') as t: for p, type_, r in ds.stats(args.exact): t.append([p, type_, r]) diff --git a/src/pycldf/commands/validate.py b/src/pycldf/commands/validate.py index 9f20202..5d607df 100644 --- a/src/pycldf/commands/validate.py +++ b/src/pycldf/commands/validate.py @@ -5,13 +5,15 @@ - the referential integrity of the dataset """ import collections +import dataclasses +from pycldf import Dataset from pycldf.cli_util import add_dataset, get_dataset from pycldf.media import MediaTable -from pycldf.ext.markdown import CLDFMarkdownText +from pycldf.ext.markdown import CLDFMarkdownText, CLDFMarkdownLink -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--with-cldf-markdown', @@ -21,7 +23,26 @@ def register(parser): ) -def run(args): +@dataclasses.dataclass +class TestMarkdown: + """Helper class to run rendering of CLDF markdown and record results.""" + links: list[CLDFMarkdownLink] = dataclasses.field(default_factory=list) + missing: collections.Counter = dataclasses.field(default_factory=collections.Counter) + + def __call__(self, text: str, ds: Dataset): + class Parser(CLDFMarkdownText): + """A CLDFMarkdownText subclass that records link render results.""" + def render_link(slf, cldf_link): # pylint: disable=W0237,E0213 + self.links.append(cldf_link) + try: + slf.get_object(cldf_link) + except: # noqa: E722 # pylint: disable=W0702 + self.missing.update([ + f'{cldf_link.label}:{cldf_link.table_or_fname}:{cldf_link.objid}']) + Parser(text, ds).render() + + +def run(args): # pylint: disable=C0116 cldf = get_dataset(args) if not cldf.validate(log=args.log): return 1 @@ -29,18 +50,6 @@ def run(args): if not args.with_cldf_markdown: return 0 - missing = collections.Counter() - links = [] - - class TestMarkdown(CLDFMarkdownText): - def render_link(self, cldf_link): - links.append(cldf_link) - try: - self.get_object(cldf_link) - except: # noqa: E722 - missing.update(['{}:{}:{}'.format( - cldf_link.label, cldf_link.table_or_fname, cldf_link.objid)]) - cols = [] for t in cldf.tables: try: @@ -54,27 +63,34 @@ def render_link(self, cldf_link): res = 0 for t, c in cols: - args.log.info('Validating CLDF Markdown links in {}:{}'.format(t, c)) + tmd = TestMarkdown() + args.log.info('Validating CLDF Markdown links in %s:%s', t, c) for obj in cldf[t]: if obj[c] and '[' in obj[c]: - TestMarkdown(obj[c], cldf).render() + tmd(obj[c], cldf) - for k, v in missing.most_common(): + for k, v in tmd.missing.most_common(): res = 1 - args.log.warning('Not found {} ({} occurrences)'.format(k, v)) - args.log.info('{} links checked'.format(len(links))) - missing, links = collections.Counter(), [] + args.log.warning('Not found %s (%s occurrences)', k, v) + args.log.info('%s links checked', len(tmd.links)) if 'MediaTable' in cldf and ('MediaTable', 'http://purl.org/dc/terms/conformsTo') in cldf: - ctcol = cldf['MediaTable', 'http://purl.org/dc/terms/conformsTo'] - for file in MediaTable(cldf): - if file.row[ctcol.name] == 'CLDF Markdown': - args.log.info('Validating CLDF Markdown links in MediaTable:{}'.format(file.id)) - TestMarkdown(file.read(), cldf).render() - for k, v in missing.most_common(): - res = 1 - args.log.warning('Not found {} ({} occurrences)'.format(k, v)) - args.log.info('{} links checked'.format(len(links))) - missing, links = collections.Counter(), [] + if not _validate_media(cldf, args.log): + res = 1 + + return res + +def _validate_media(cldf, log) -> bool: + res = True + ctcol = cldf['MediaTable', 
'http://purl.org/dc/terms/conformsTo'] + for file in MediaTable(cldf): + if file.row[ctcol.name] == 'CLDF Markdown': + log.info('Validating CLDF Markdown links in MediaTable:%s', file.id) + tmd = TestMarkdown() + tmd(file.read(), cldf) + for k, v in tmd.missing.most_common(): + res = False + log.warning('Not found %s (%s occurrences)', k, v) + log.info('%s links checked', len(tmd.links)) return res diff --git a/src/pycldf/constraints.py b/src/pycldf/constraints.py new file mode 100644 index 0000000..cfad2dc --- /dev/null +++ b/src/pycldf/constraints.py @@ -0,0 +1,100 @@ +""" +Functionality for creation of foreign key constraints. +""" +from typing import TYPE_CHECKING, Optional + +from pycldf.terms import TERMS, term_uri +from pycldf.schemautil import TableType, ColType + +if TYPE_CHECKING: + from pycldf.dataset import Dataset # pragma: no cover + +__all__ = ['add_foreign_key', 'add_auto_constraints'] + + +def add_foreign_key( + ds: 'Dataset', + foreign_t: TableType, + foreign_c: ColType, + primary_t: TableType, + primary_c: Optional[ColType] = None, +) -> None: + """ + Add a foreign key constraint. + + .. note:: Composite keys are not supported yet. + + :param foreign_t: Table reference for the linking table. + :param foreign_c: Column reference for the link. + :param primary_t: Table reference for the linked table. + :param primary_c: Column reference for the linked column - or `None`, in which case the \ + primary key of the linked table is assumed. + """ + if isinstance(foreign_c, (tuple, list)) or isinstance(primary_c, (tuple, list)): + raise NotImplementedError('composite keys are not supported') + + foreign_t = ds[foreign_t] + primary_t = ds[primary_t] + if not primary_c: + primary_c = primary_t.tableSchema.primaryKey + else: + primary_c = ds[primary_t, primary_c].name + foreign_t.add_foreign_key(ds[foreign_t, foreign_c].name, primary_t.url.string, primary_c) + + +def add_auto_constraints(ds: 'Dataset', component: Optional[TableType] = None): + """ + Use CLDF reference properties to implicitly create foreign key constraints. + + :param component: A Table object or `None`. + """ + if not component: + for table in ds.tables: + ds.auto_constraints(table) + return + + if not component.tableSchema.primaryKey: + idcol = component.get_column(term_uri('id')) + if idcol: + component.tableSchema.primaryKey = [idcol.name] + + _auto_foreign_keys(ds, component) + + try: + table_type = ds.get_tabletype(component) + except ValueError: + table_type = None + + if table_type is None: + # New component is not a known CLDF term, so cannot add components + # automatically. TODO: We might be able to infer some based on + # `xxxReference` column properties? 
+ return + + # auto-add foreign keys targeting the new component: + for table in ds.tables: + _auto_foreign_keys(ds, table, component=component, table_type=table_type) + + +def _auto_foreign_keys(ds: 'Dataset', table, component=None, table_type=None): + assert (component is None) == (table_type is None) + for col in table.tableSchema.columns: + if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: + ref_name = TERMS.by_uri[col.propertyUrl.uri].references + if (component is None and not ref_name) or \ + (component is not None and ref_name != table_type): + continue + if any(fkey.columnReference == [col.name] + for fkey in table.tableSchema.foreignKeys): + continue + if component is None: + # Let's see whether we have the component this column references: + try: + ref = ds[ref_name] + except KeyError: + continue + else: + ref = component + idcol = ref.get_column(term_uri('id')) + table.add_foreign_key( + col.name, ref.url.string, idcol.name if idcol is not None else 'ID') diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py index 7891f1f..fd05162 100644 --- a/src/pycldf/dataset.py +++ b/src/pycldf/dataset.py @@ -1,160 +1,55 @@ +""" +An implementation of a CLDF dataset object. +""" import re -import sys -import json import types -import shutil -import typing +from typing import Union, Optional, Type, Any import logging import pathlib import functools import itertools import collections -import collections.abc +from collections.abc import Generator, Iterable import urllib.parse import urllib.request -import attr import csvw -from csvw.metadata import TableGroup, Table, Column, Link, Schema, is_url, URITemplate +from csvw.metadata import TableGroup, Table, Column, Link, is_url, URITemplate from csvw import datatypes from csvw.dsv import iterrows -from clldutils.path import git_describe, walk -from clldutils.misc import log_or_raise -from clldutils import jsonlib - -from pycldf.sources import Sources -from pycldf.util import pkg_path, resolve_slices, DictTuple, sanitize_url, iter_uritemplates -from pycldf.terms import term_uri, Terms, TERMS, get_column_names, URL as TERMS_URL -from pycldf.validators import VALIDATORS +from clldutils.path import walk + +from pycldf.module import get_module_impl, get_modules +from pycldf.sources import Sources, Source +from pycldf.util import ( + pkg_path, DictTuple, iter_uritemplates, MD_SUFFIX, GitRepository, copy_dataset) +from pycldf.sliceutil import multislice_with_split +from pycldf.fileutil import PathType +from pycldf.schemautil import ColSpecType, make_column, make_table, TableType, ColType +from pycldf.constraints import add_foreign_key, add_auto_constraints +from pycldf.terms import term_uri, Terms, TERMS, get_column_names, sniff +from pycldf import validators as validation +from pycldf.stats import get_table_stats from pycldf import orm +assert get_modules # For backwards compatibility with cldfbench. 
+ __all__ = [ 'Dataset', 'Generic', 'Wordlist', 'ParallelText', 'Dictionary', 'StructureDataset', - 'TextCorpus', 'iter_datasets', 'sniff', 'SchemaError', 'ComponentWithValidation'] + 'TextCorpus', 'iter_datasets', 'sniff', 'SchemaError'] -MD_SUFFIX = '-metadata.json' ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()} -TableType = typing.Union[str, Table] -ColType = typing.Union[str, Column] -ColSpecType = typing.Union[str, dict, Column] -PathType = typing.Union[str, pathlib.Path] -TableSpecType = typing.Union[str, Link, Table] -ColSPecType = typing.Union[str, Column] -SchemaObjectType = typing.Union[TableSpecType, typing.Tuple[TableSpecType, ColSPecType]] +TableSpecType = Union[str, Link, Table] +SchemaObjectType = Union[TableSpecType, tuple[TableSpecType, ColType]] +ODict = collections.OrderedDict +RowType = ODict[str, Any] class SchemaError(KeyError): - pass - - -@attr.s -class Module: - """ - Class representing a CLDF Module. - - .. seealso:: https://github.com/cldf/cldf/blob/master/README.md#cldf-modules - """ - uri = attr.ib(validator=attr.validators.in_([t.uri for t in TERMS.classes.values()])) - fname = attr.ib() - cls = attr.ib(default=None) - - @property - def id(self) -> str: - """ - The local part of the term URI is interpreted as Module identifier. - """ - return self.uri.split('#')[1] - - def match(self, thing) -> bool: - if isinstance(thing, TableGroup): - return thing.common_props.get('dc:conformsTo') == term_uri(self.id) - if hasattr(thing, 'name'): - return thing.name == self.fname - return False + """Schema objects can be accessed using `Dataset.__getitem__`.""" -_modules = [] - - -def get_modules() -> typing.List[Module]: - """ - We read supported CLDF modules from the default metadata files distributed with `pycldf`. - """ - global _modules - if not _modules: - ds = sys.modules[__name__] - for p in pkg_path('modules').glob('*{0}'.format(MD_SUFFIX)): - tg = TableGroup.from_file(p) - mod = Module( - tg.common_props['dc:conformsTo'], - tg.tables[0].url.string if tg.tables else None) - mod.cls = getattr(ds, mod.id) - _modules.append(mod) - # prefer Wordlist over ParallelText (forms.csv) - _modules = sorted( - _modules, - key=lambda m: (m.cls in (Wordlist, ParallelText), m.cls is ParallelText)) - return _modules - - -def make_column(spec: ColSpecType) -> Column: - """ - Create a `Column` instance from `spec`. - - .. code-block:: python - - >>> make_column('id').name - 'id' - >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name - 'ID' - >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base - 'boolean' - >>> type(make_column(make_column('id'))) - - """ - if isinstance(spec, str): - if spec in TERMS.by_uri: - return TERMS.by_uri[spec].to_column() - return Column(name=spec, datatype='string') - if isinstance(spec, dict): - return Column.fromvalue(spec) - if isinstance(spec, Column): - return spec - raise TypeError(spec) - - -class GitRepository: - """ - CLDF datasets are often created from data curated in git repositories. If this is the case, we - exploit this to provide better provenance information in the dataset's metadata. - """ - def __init__(self, - url: str, - clone: typing.Optional[typing.Union[str, pathlib.Path]] = None, - version: typing.Optional[str] = None, - **dc): - # We remove credentials from the URL immediately to make sure this isn't leaked into - # CLDF metadata. Such credentials might be present in URLs read via gitpython from - # remotes. 
- self.url = sanitize_url(url) - self.clone = clone - self.version = version - self.dc = dc - - def json_ld(self) -> typing.Dict[str, str]: - res = collections.OrderedDict([ - ('rdf:about', self.url), - ('rdf:type', 'prov:Entity'), - ]) - if self.version: - res['dc:created'] = self.version - elif self.clone: - res['dc:created'] = git_describe(self.clone) - res.update({'dc:{0}'.format(k): self.dc[k] for k in sorted(self.dc)}) - return res - - -class Dataset: +class Dataset: # pylint: disable=too-many-public-methods """ API to access a CLDF dataset. """ @@ -168,7 +63,7 @@ def __init__(self, tablegroup: csvw.TableGroup): - :meth:`~pycldf.dataset.Dataset.from_metadata` - :meth:`~pycldf.dataset.Dataset.from_data` """ - self.tablegroup = tablegroup + self.tablegroup: csvw.TableGroup = tablegroup self.auto_constraints() self._sources = None self._objects = collections.defaultdict(collections.OrderedDict) @@ -177,6 +72,7 @@ def __init__(self, tablegroup: csvw.TableGroup): @property def sources(self) -> Sources: + """The sources.""" # We load sources only the first time they are accessed, because for datasets like # Glottolog - with 40MB zipped BibTeX - this may take ~90secs. if self._sources is None: @@ -189,9 +85,7 @@ def sources(self, obj: Sources): raise TypeError('Invalid type for Dataset.sources') self._sources = obj - # - # Factory methods to create `Dataset` instances. - # + # Factory methods to create `Dataset` instances. ----------------------------------------------- @classmethod def in_dir(cls, d: PathType, empty_tables: bool = False) -> 'Dataset': """ @@ -226,11 +120,11 @@ def from_metadata(cls, fname: PathType) -> 'Dataset': else: fname = pathlib.Path(fname) if fname.is_dir(): - name = '{0}{1}'.format(cls.__name__, MD_SUFFIX) + name = f'{cls.__name__}{MD_SUFFIX}' tablegroup = TableGroup.from_file(pkg_path('modules', name)) # adapt the path of the metadata file such that paths to tables are resolved # correctly: - tablegroup._fname = fname.joinpath(name) + tablegroup._fname = fname.joinpath(name) # pylint: disable=W0212 else: tablegroup = TableGroup.from_file(fname) @@ -243,11 +137,11 @@ def from_metadata(cls, fname: PathType) -> 'Dataset': except ValueError: pass if comps and comps.most_common(1)[0][1] > 1: - raise ValueError('{0}: duplicate components!'.format(fname)) + raise ValueError(f'{fname}: duplicate components!') - for mod in get_modules(): - if mod.match(tablegroup): - return mod.cls(tablegroup) + impl = get_module_impl(Dataset, tablegroup) + if impl: + return impl(tablegroup) return cls(tablegroup) @classmethod @@ -264,38 +158,38 @@ def from_data(cls, fname: PathType) -> 'Dataset': if not colnames: raise ValueError('empty data file!') if cls is Dataset: - try: - cls = next(mod.cls for mod in get_modules() if mod.match(fname)) - except StopIteration: - raise ValueError('{0} does not match a CLDF module spec'.format(fname)) - assert issubclass(cls, Dataset) and cls is not Dataset - - res = cls.from_metadata(fname.parent) + impl = get_module_impl(Dataset, fname.name) + if impl is None: + raise ValueError(f'{fname} does not match a CLDF module spec') + res = impl.from_metadata(fname.parent) + else: + res = cls.from_metadata(fname.parent) required_cols = { c.name for c in res[res.primary_table].tableSchema.columns if c.required} if not required_cols.issubset(colnames): - raise ValueError('missing columns: %r' % sorted(required_cols.difference(colnames))) + raise ValueError(f'missing columns: {sorted(required_cols.difference(colnames))}') return res - # - # Accessing dataset 
metadata - # + # Accessing dataset metadata ------------------------------------------------------------------- @property - def directory(self) -> typing.Union[str, pathlib.Path]: + def directory(self) -> PathType: """ :return: The location of the metadata file. Either a local directory as `pathlib.Path` or \ a URL as `str`. """ - return self.tablegroup._fname.parent if self.tablegroup._fname else self.tablegroup.base + if self.tablegroup._fname: # pylint: disable=W0212 + return self.tablegroup._fname.parent # pylint: disable=W0212 + return self.tablegroup.base @property def filename(self) -> str: """ :return: The name of the metadata file. """ - return self.tablegroup._fname.name if self.tablegroup._fname else \ - pathlib.Path(urllib.parse.urlparse(self.tablegroup.base).path).name + if self.tablegroup._fname: # pylint: disable=W0212 + return self.tablegroup._fname.name # pylint: disable=W0212 + return pathlib.Path(urllib.parse.urlparse(self.tablegroup.base).path).name @property def module(self) -> str: @@ -306,13 +200,15 @@ def module(self) -> str: @property def version(self) -> str: + """The CLDF version.""" return self.properties['dc:conformsTo'].split('/')[3] def __repr__(self) -> str: - return '' % (self.version, self.module, self.directory) + return f'' @property def metadata_dict(self) -> dict: + """The TableGroup instance as dict.""" return self.tablegroup.asdict(omit_defaults=False) @property @@ -323,7 +219,7 @@ def properties(self) -> dict: return self.tablegroup.common_props @property - def bibpath(self) -> typing.Union[str, pathlib.Path]: + def bibpath(self) -> PathType: """ :return: Location of the sources BibTeX file. Either a URL (`str`) or a local path \ (`pathlib.Path`). @@ -343,18 +239,16 @@ def bibname(self) -> str: return pathlib.Path(urllib.parse.urlparse(self.bibpath).path).name return self.bibpath.name - # - # Accessing schema objects (components, tables, columns, foreign keys) - # + # Accessing schema objects (components, tables, columns, foreign keys) ------------------------- @property - def tables(self) -> typing.List[Table]: + def tables(self) -> list[Table]: """ :return: All tables defined in the dataset. """ return self.tablegroup.tables @property - def components(self) -> typing.Dict[str, csvw.Table]: + def components(self) -> collections.OrderedDict[str, csvw.Table]: """ :return: Mapping of component name to table objects as defined in the dataset. 
""" @@ -370,26 +264,28 @@ def components(self) -> typing.Dict[str, csvw.Table]: return res @staticmethod - def get_tabletype(table) -> typing.Union[str, None]: + def get_tabletype(table) -> Optional[str]: + """Return the table type, aka component name, of the table.""" if table.common_props.get('dc:conformsTo', '') is None: return None if '#' in table.common_props.get('dc:conformsTo', ''): res = table.common_props['dc:conformsTo'].split('#')[1] if res in TERMS: return res - raise ValueError("Type {:} of table {:} is not a valid term.".format( - table.common_props.get('dc:conformsTo'), - table.url)) + raise ValueError( + f"Type {table.common_props.get('dc:conformsTo')} of table {table.url} is invalid.") @property - def primary_table(self) -> typing.Union[str, None]: + def primary_table(self) -> Optional[str]: + """Returns the primary table for the dataset.""" if self.tables: try: return self.get_tabletype(self.tables[0]) except ValueError: - return None + pass + return None - def __getitem__(self, item: SchemaObjectType) -> typing.Union[csvw.Table, csvw.Column]: + def __getitem__(self, item: SchemaObjectType) -> Union[csvw.Table, csvw.Column]: """ Access to tables and columns. @@ -422,37 +318,32 @@ def __getitem__(self, item: SchemaObjectType) -> typing.Union[csvw.Table, csvw.C if isinstance(table, Link): table = table.string - if not isinstance(table, Table): - uri = term_uri(table, terms=TERMS.by_uri) - for t in self.tables: - if (uri and t.common_props.get('dc:conformsTo') == uri) \ - or t.url.string == table: - break - else: - raise SchemaError('Dataset has no table "{}"'.format(table)) - else: - if any(table is tt for tt in self.tables): - t = table - else: - raise SchemaError('Dataset has no table "{}"'.format(table)) - + t = self._get_table(table) if not column: return t if isinstance(column, Column): if any(column is c for c in t.tableSchema.columns): return column - else: - raise SchemaError('Dataset has no column "{}" in table "{}"'.format( - column.name, t.url)) + raise SchemaError(f'Dataset has no column "{column.name}" in table "{t.url}"') uri = term_uri(column, terms=TERMS.by_uri) for c in t.tableSchema.columns: - if ((c.propertyUrl and (c.propertyUrl.uri == uri or c.propertyUrl.uri == column)) - or c.header == column): # noqa: W503 + if ((c.propertyUrl and (c.propertyUrl.uri in (uri, column))) or c.header == column): return c - raise SchemaError('Dataset has no column "{}" in table "{}"'.format(column, t.url)) + raise SchemaError(f'Dataset has no column "{column}" in table "{t.url}"') + + def _get_table(self, table: TableType) -> Table: + if not isinstance(table, Table): + uri = term_uri(table, terms=TERMS.by_uri) + for t in self.tables: + if (uri and t.common_props.get('dc:conformsTo') == uri) or t.url.string == table: + return t + raise SchemaError(f'Dataset has no table "{table}"') + if any(table is tt for tt in self.tables): + return table + raise SchemaError(f'Dataset has no table "{table}"') def __delitem__(self, item: SchemaObjectType): """ @@ -474,9 +365,7 @@ def __contains__(self, item: SchemaObjectType) -> bool: """ return bool(self.get(item)) - def get(self, - item: SchemaObjectType, - default=None) -> typing.Union[csvw.Table, csvw.Column, None]: + def get(self, item: SchemaObjectType, default=None) -> Union[csvw.Table, csvw.Column, None]: """ Acts like `dict.get`. 
@@ -487,8 +376,9 @@ def get(self, except SchemaError: return default - def get_foreign_key_reference(self, table: TableType, column: ColType) \ - -> typing.Union[typing.Tuple[csvw.Table, csvw.Column], None]: + def get_foreign_key_reference( + self, table: TableType, column: ColType, + ) -> Optional[tuple[csvw.Table, csvw.Column]]: """ Retrieve the reference of a foreign key constraint for the specified column. @@ -503,6 +393,7 @@ def get_foreign_key_reference(self, table: TableType, column: ColType) \ if len(fk.columnReference) == 1 and fk.columnReference[0] == column.name: return self[fk.reference.resource], \ self[fk.reference.resource, fk.reference.columnReference[0]] + return None @property def column_names(self) -> types.SimpleNamespace: @@ -528,10 +419,8 @@ def readonly_column_names(self) -> types.SimpleNamespace: """ return get_column_names(self, use_component_names=True, with_multiplicity=True) - # - # Editing dataset metadata or schema - # - def add_provenance(self, **kw): + # Editing dataset metadata or schema ----------------------------------------------------------- + def add_provenance(self, **kw: Any) -> None: """ Add metadata about the dataset's provenance. @@ -545,7 +434,7 @@ def to_json(obj): for k, v in kw.items(): if not k.startswith('prov:'): - k = 'prov:{0}'.format(k) + k = f'prov:{k}' if isinstance(v, (tuple, list)): v = [to_json(vv) for vv in v] else: @@ -560,7 +449,7 @@ def to_json(obj): v = old self.tablegroup.common_props[k] = v - def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table: + def add_table(self, url: str, *cols: ColSpecType, **kw: Any) -> csvw.Table: """ Add a table description to the Dataset. @@ -573,14 +462,16 @@ def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table: """ t = self.add_component({"url": url, "tableSchema": {"columns": []}}, *cols) if 'primaryKey' in kw: - t.tableSchema.primaryKey = attr.fields_dict(Schema)['primaryKey'].converter( - kw.pop('primaryKey')) + pk = kw.pop('primaryKey') + if pk is not None and not isinstance(pk, list): + pk = [pk] + t.tableSchema.primaryKey = pk if kw.get('description'): t.common_props['dc:description'] = kw.pop('description') t.common_props.update(kw) return t - def remove_table(self, table: TableType): + def remove_table(self, table: TableType) -> None: """ Removes the table specified by `table` from the dataset. """ @@ -594,10 +485,7 @@ def remove_table(self, table: TableType): # Now remove the table: self.tablegroup.tables = [t for t in self.tablegroup.tables if t.url != table.url] - def add_component(self, - component: typing.Union[str, dict], - *cols: ColSpecType, - **kw) -> csvw.Table: + def add_component(self, component: Union[str, dict], *cols: ColSpecType, **kw) -> csvw.Table: """ Add a CLDF component to a dataset. @@ -609,11 +497,7 @@ def add_component(self, - `url`: a url property for the table;\ - `description`: a description of the table. 
""" - if isinstance(component, str): - component = jsonlib.load(pkg_path('components', '{0}{1}'.format(component, MD_SUFFIX))) - if isinstance(component, dict): - component = Table.fromvalue(component) - assert isinstance(component, Table) + component = make_table(component) if kw.get('url'): component.url = Link(kw['url']) @@ -639,7 +523,7 @@ def add_component(self, self.tables.append(component) self.add_columns(component, *cols) - component._parent = self.tablegroup + component._parent = self.tablegroup # pylint: disable=W0212 self.auto_constraints(component) return component @@ -654,13 +538,13 @@ def add_columns(self, table: TableType, *cols: ColSpecType) -> None: c.propertyUrl.uri for c in table.tableSchema.columns if c.propertyUrl]) col = make_column(col) if col.name in existing: - raise ValueError('Duplicate column name: {0}'.format(col.name)) + raise ValueError(f'Duplicate column name: {col.name}') if col.propertyUrl and col.propertyUrl.uri in existing: - raise ValueError('Duplicate column property: {0}'.format(col.propertyUrl.uri)) + raise ValueError(f'Duplicate column property: {col.propertyUrl.uri}') table.tableSchema.columns.append(col) self.auto_constraints() - def remove_columns(self, table: TableType, *cols: str): + def remove_columns(self, table: TableType, *cols: ColType) -> None: """ Remove `cols` from `table`'s schema. @@ -683,7 +567,7 @@ def remove_columns(self, table: TableType, *cols: str): table.tableSchema.columns = [c for c in table.tableSchema.columns if str(c) not in cols] - def rename_column(self, table: TableType, col: ColType, name: str): + def rename_column(self, table: TableType, col: ColType, name: str) -> None: """ Assign a new `name` to an existing column, cascading this change to foreign keys. @@ -724,7 +608,8 @@ def add_foreign_key( foreign_t: TableType, foreign_c: ColType, primary_t: TableType, - primary_c: typing.Optional[ColType] = None): + primary_c: Optional[ColType] = None, + ) -> None: """ Add a foreign key constraint. @@ -736,77 +621,18 @@ def add_foreign_key( :param primary_c: Column reference for the linked column - or `None`, in which case the \ primary key of the linked table is assumed. """ - if isinstance(foreign_c, (tuple, list)) or isinstance(primary_c, (tuple, list)): - raise NotImplementedError('composite keys are not supported') - - foreign_t = self[foreign_t] - primary_t = self[primary_t] - if not primary_c: - primary_c = primary_t.tableSchema.primaryKey - else: - primary_c = self[primary_t, primary_c].name - foreign_t.add_foreign_key(self[foreign_t, foreign_c].name, primary_t.url.string, primary_c) + return add_foreign_key(self, foreign_t, foreign_c, primary_t, primary_c) - def auto_constraints(self, component=None): + def auto_constraints(self, component: Optional[TableType] = None): """ - Use CLDF reference properties to implicitely create foreign key constraints. + Use CLDF reference properties to implicitly create foreign key constraints. :param component: A Table object or `None`. """ - if not component: - for table in self.tables: - self.auto_constraints(table) - return - - if not component.tableSchema.primaryKey: - idcol = component.get_column(term_uri('id')) - if idcol: - component.tableSchema.primaryKey = [idcol.name] - - self._auto_foreign_keys(component) - - try: - table_type = self.get_tabletype(component) - except ValueError: - table_type = None + return add_auto_constraints(self, component) - if table_type is None: - # New component is not a known CLDF term, so cannot add components - # automatically. 
TODO: We might me able to infer some based on - # `xxxReference` column properties? - return - - # auto-add foreign keys targeting the new component: - for table in self.tables: - self._auto_foreign_keys(table, component=component, table_type=table_type) - - def _auto_foreign_keys(self, table, component=None, table_type=None): - assert (component is None) == (table_type is None) - for col in table.tableSchema.columns: - if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: - ref_name = TERMS.by_uri[col.propertyUrl.uri].references - if (component is None and not ref_name) or \ - (component is not None and ref_name != table_type): - continue - if any(fkey.columnReference == [col.name] - for fkey in table.tableSchema.foreignKeys): - continue - if component is None: - # Let's see whether we have the component this column references: - try: - ref = self[ref_name] - except KeyError: - continue - else: - ref = component - idcol = ref.get_column(term_uri('id')) - table.add_foreign_key( - col.name, ref.url.string, idcol.name if idcol is not None else 'ID') - - # - # Add data - # - def add_sources(self, *sources, **kw): + # Add data ------------------------------------------------------------------------------------- + def add_sources(self, *sources: Union[str, Source], **kw) -> None: """ Add sources to the dataset. @@ -814,10 +640,8 @@ def add_sources(self, *sources, **kw): """ self.sources.add(*sources, **kw) - # - # Methods to read data - # - def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None, None]: + # Methods to read data ------------------------------------------------------------------------- + def iter_rows(self, table: TableType, *cols: str) -> Generator[RowType, None, None]: """ Iterate rows in a table, resolving CLDF property names to local column names. @@ -833,13 +657,14 @@ def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None item[v] = item[k] yield item - def cached_rows(self, table: TableType) -> list: + def cached_rows(self, table: TableType) -> list[RowType]: + """Return the rows of a table from a cache.""" key = table.local_name if isinstance(table, Table) else table if key not in self._cached_rows: self._cached_rows[key] = list(self.iter_rows(table)) return self._cached_rows[key] - def get_row(self, table: TableType, id_) -> dict: + def get_row(self, table: TableType, id_) -> RowType: """ Retrieve a row specified by table and CLDF id. @@ -851,7 +676,7 @@ def get_row(self, table: TableType, id_) -> dict: return row raise ValueError(id_) # pragma: no cover - def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: + def get_row_url(self, table: TableType, row: Union[RowType, str]) -> Optional[str]: """ Get a URL associated with a row. Tables can specify associated row URLs by @@ -865,7 +690,7 @@ def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: :param row: A row specified by ID or as `dict` as returned when iterating over a table. :return: a `str` representing a URL or `None`. """ - row = row if isinstance(row, dict) else self.get_row(table, row) + row = self.get_row(table, row) if isinstance(row, str) else row id_col = None for col in self[table].tableSchema.columns: if col.datatype and col.datatype.base == datatypes.anyURI.__name__: @@ -875,11 +700,12 @@ def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: if str(col.propertyUrl) == 'http://cldf.clld.org/v1.0/terms.rdf#id': # Otherwise we fall back to looking up the `valueUrl` property on the ID column. 
id_col = col - assert id_col, 'no ID column found in table {}'.format(table) + assert id_col, f'no ID column found in table {table}' if id_col.valueUrl: return id_col.valueUrl.expand(**row) + return None - def objects(self, table: str, cls: typing.Optional[typing.Type] = None) -> DictTuple: + def objects(self, table: str, cls: Optional[Type] = None) -> DictTuple: """ Read data of a CLDF component as :class:`pycldf.orm.Object` instances. @@ -899,7 +725,7 @@ def objects(self, table: str, cls: typing.Optional[typing.Type] = None) -> DictT return DictTuple(self._objects[table].values()) - def get_object(self, table, id_, cls=None, pk=False) -> orm.Object: + def get_object(self, table: str, id_: str, cls=None, pk=False) -> orm.Object: """ Get a row of a component as :class:`pycldf.orm.Object` instance. """ @@ -907,20 +733,17 @@ def get_object(self, table, id_, cls=None, pk=False) -> orm.Object: self.objects(table, cls=cls) return self._objects[table][id_] if not pk else self._objects_by_pk[table][id_] - # - # Methods for writing (meta)data to files: - # - def write_metadata( - self, fname: typing.Optional[typing.Union[str, pathlib.Path]] = None) -> pathlib.Path: + # Methods for writing (meta)data to files: ----------------------------------------------------- + def write_metadata(self, fname: Optional[PathType] = None) -> pathlib.Path: """ Write the CLDF metadata to a JSON file. :fname: Path of a file to write to, or `None` to use the default name and write to \ :meth:`~pycldf.dataset.Dataset.directory`. """ - return self.tablegroup.to_file(fname or self.tablegroup._fname) + return self.tablegroup.to_file(fname or self.tablegroup._fname) # pylint: disable=W0212 - def write_sources(self, zipped: bool = False) -> typing.Union[None, pathlib.Path]: + def write_sources(self, zipped: bool = False) -> Optional[pathlib.Path]: """ Write the sources BibTeX file to :meth:`~pycldf.dataset.Dataset.bibpath` @@ -930,10 +753,12 @@ def write_sources(self, zipped: bool = False) -> typing.Union[None, pathlib.Path """ return self.sources.write(self.bibpath, zipped=zipped) - def write(self, - fname: typing.Optional[pathlib.Path] = None, - zipped: typing.Optional[typing.Iterable] = None, - **table_items: typing.List[dict]) -> pathlib.Path: + def write( + self, + fname: Optional[pathlib.Path] = None, + zipped: Optional[Iterable] = None, + **table_items: list[RowType] + ) -> pathlib.Path: """ Write metadata, sources and data. Metadata will be written to `fname` (as interpreted in :meth:`pycldf.dataset.Dataset.write_metadata`); data files will be written to the file @@ -955,7 +780,7 @@ def write(self, table.common_props['dc:extent'] = table.write(items, _zipped=table_type in zipped) return self.write_metadata(fname) - def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pathlib.Path: + def copy(self, dest: PathType, mdname: str = None) -> pathlib.Path: """ Copy metadata, data and sources to files in `dest`. @@ -973,54 +798,15 @@ def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pat ... if 'with_examples' in ds.directory.name: ... 
ds.copy('some_directory', mdname='md.json') """ - from pycldf.media import MediaTable - - dest = pathlib.Path(dest) - if not dest.exists(): - dest.mkdir(parents=True) - - from_url = is_url(self.tablegroup.base) - ds = Dataset.from_metadata(self.tablegroup.base if from_url else self.tablegroup._fname) + return copy_dataset(self, dest, mdname) - _getter = urllib.request.urlretrieve if from_url else shutil.copy - try: - _getter(self.bibpath, dest / self.bibname) - ds.properties['dc:source'] = self.bibname - except: # pragma: no cover # noqa - # Sources are optional - pass - - for table in ds.tables: - fname = table.url.resolve(table.base) - name = pathlib.Path(urllib.parse.urlparse(fname).path).name if from_url else fname.name - _getter(fname, dest / name) - table.url = Link(name) - - for fk in table.tableSchema.foreignKeys: - fk.reference.resource = Link(pathlib.Path(fk.reference.resource.string).name) - mdpath = dest.joinpath( - mdname or # noqa: W504 - (self.tablegroup.base.split('/')[-1] if from_url else self.tablegroup._fname.name)) - if 'MediaTable' in self: - for f in MediaTable(self): - if f.scheme == 'file': - if f.local_path().exists(): - target = dest / urllib.parse.unquote(f.relpath) - target.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(f.local_path(), target) - if from_url: - del ds.tablegroup.at_props['base'] # pragma: no cover - ds.write_metadata(fname=mdpath) - return mdpath - - # - # Reporting - # + # Reporting ------------------------------------------------------------------------------------ def validate( self, log: logging.Logger = None, - validators: typing.List[typing.Tuple[str, str, callable]] = None, - ontology_path=None) -> bool: + validators: list[tuple[Optional[str], str, validation.RowValidatorType]] = None, + ontology_path: Optional[PathType] = None, + ) -> bool: """ Validate schema and data of a `Dataset`: @@ -1034,160 +820,20 @@ def validate( :raises ValueError: if a validation error is encountered (and `log` is `None`). :return: Flag signaling whether schema and data are valid. """ - # We must import components with custom validation to make sure they can be detected as - # subclasses of ComponentWithValidation. - from pycldf.media import MediaTable - from pycldf.trees import TreeTable - - assert MediaTable and TreeTable - - terms = Terms(ontology_path) or TERMS - validators = validators or [] - validators.extend(VALIDATORS) - success = True - default_tg = TableGroup.from_file( - pkg_path('modules', '{0}{1}'.format(self.module, MD_SUFFIX))) - # - # Make sure, all required tables and columns are present and consistent. 
- # - for default_table in default_tg.tables: - dtable_uri = default_table.common_props['dc:conformsTo'] - try: - table = self[dtable_uri] - except KeyError: - success = False - log_or_raise('{0} requires {1}'.format(self.module, dtable_uri), log=log) - table = None - - if table: - default_cols = {c.propertyUrl.uri: c for c in default_table.tableSchema.columns} - required_default_cols = { - c.propertyUrl.uri for c in default_table.tableSchema.columns - if c.required or c.common_props.get('dc:isRequiredBy')} - cols = { - c.propertyUrl.uri: c for c in table.tableSchema.columns - if c.propertyUrl} - table_uri = table.common_props['dc:conformsTo'] - for col in required_default_cols - set(cols.keys()): - success = False - log_or_raise('{0} requires column {1}'.format(table_uri, col), log=log) - for uri, col in cols.items(): - default = default_cols.get(uri) - if default: - cardinality = default.common_props.get('dc:extent') - if not cardinality: - cardinality = terms.by_uri[uri].cardinality - if (cardinality == 'multivalued' and not col.separator) or \ - (cardinality == 'singlevalued' and col.separator): - success = False - log_or_raise('{} {} must be {}'.format( - table_uri, uri, cardinality), log=log) + return validation.validate( + dataset=self, + terms=Terms(ontology_path) or TERMS, + log=log, + row_validators=validators or [], + ) - for table in self.tables: - vars = set(col.name for col in table.tableSchema.columns) - for obj, prop, tmpl in iter_uritemplates(table): - if not {n for n in tmpl.variable_names if not n.startswith('_')}.issubset(vars): - if log: - log.warning('Unknown variables in URI template: {}:{}:{}'.format( - obj, prop, tmpl)) - - type_uri = table.common_props.get('dc:conformsTo') - if type_uri: - try: - terms.is_cldf_uri(type_uri) - except ValueError: - success = False - log_or_raise('invalid CLDF URI: {0}'.format(type_uri), log=log) - - if not table.tableSchema.primaryKey: - if log: - log.warning('Table without primary key: {0} - {1}'.format( - table.url, - 'This may cause problems with "cldf createdb"')) - elif len(table.tableSchema.primaryKey) > 1: - if log: - log.warning('Table with composite primary key: {0} - {1}'.format( - table.url, - 'This may cause problems with "cldf createdb"')) - - # FIXME: check whether table.common_props['dc:conformsTo'] is in validators! - validators_, propertyUrls, colnames = [], set(), set() - for col in table.tableSchema.columns: - if col.header in colnames: # pragma: no cover - success = False - log_or_raise( - 'Duplicate column name in table schema: {} {}'.format( - table.url, col.header), - log=log) - colnames.add(col.header) - if col.propertyUrl: - col_uri = col.propertyUrl.uri - try: - terms.is_cldf_uri(col_uri) - if col_uri in propertyUrls: # pragma: no cover - success = False - log_or_raise( - 'Duplicate CLDF property in table schema: {} {}'.format( - table.url, col_uri), - log=log) - propertyUrls.add(col_uri) - except ValueError: - success = False - log_or_raise('invalid CLDF URI: {0}'.format(col_uri), log=log) - for table_, col_, v_ in validators: - if (not table_ or table is self.get(table_)) and col is self.get((table, col_)): - validators_.append((col, v_)) - - fname = pathlib.Path(table.url.resolve(table._parent.base)) - fexists = fname.exists() - if (not fexists) and fname.parent.joinpath('{}.zip'.format(fname.name)).exists(): - if log: - log.info('Reading data from zipped table: {}.zip'.format(fname)) - fexists = True # csvw already handles this case, no need to adapt paths. 
- if is_url(table.url.resolve(table._parent.base)) or fexists: - for fname, lineno, row in table.iterdicts(log=log, with_metadata=True): - for col, validate in validators_: - try: - validate(self, table, col, row) - except ValueError as e: - success = False - log_or_raise( - '{0}:{1}:{2} {3}'.format(fname.name, lineno, col.name, e), - log=log) - if not table.check_primary_key(log=log): - success = False - else: - success = False - log_or_raise('{0} does not exist'.format(fname), log=log) - - if not self.tablegroup.check_referential_integrity(log=log): - success = False - - for cls in ComponentWithValidation.__subclasses__(): - if cls.__name__ in self: - success = cls(self).validate(success, log=log) - - return success - - def stats(self, exact: bool = False) -> typing.List[typing.Tuple[str, str, int]]: + def stats(self, exact: bool = False) -> list[tuple[str, str, int]]: """ Compute summary statistics for the dataset. - :return: List of triples (table, type, rowcount). + :return: List of triples (filename, component, rowcount). """ - res = [] - for table in self.tables: - dctype = table.common_props.get('dc:conformsTo') - if dctype and '#' in dctype and dctype.split('#')[1] in TERMS: - dctype = TERMS[dctype.split('#')[1]].csvw_prop('name') - res.append(( - table.url.string, - dctype, - sum(1 for _ in table) if (exact or 'dc:extent' not in table.common_props) - else int(table.common_props.get('dc:extent')))) - if self.sources: - res.append((self.bibname, 'Sources', len(self.sources))) - return res + return get_table_stats(self, exact) class Generic(Dataset): @@ -1197,7 +843,7 @@ class Generic(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> None: # pylint: disable=missing-function-docstring return None @@ -1208,10 +854,11 @@ class Wordlist(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'FormTable' - def get_segments(self, row, table='FormTable') -> typing.List[str]: + def get_segments(self, row: RowType, table='FormTable') -> list[str]: + """Retrieve the list of segments of a form.""" col = self[table].get_column("http://cldf.clld.org/v1.0/terms.rdf#segments") sounds = row[col.name] if isinstance(sounds, str): @@ -1219,41 +866,40 @@ def get_segments(self, row, table='FormTable') -> typing.List[str]: sounds = [sounds] return list(itertools.chain(*[s.split() for s in sounds])) - def get_subsequence(self, cognate: dict, form=None) -> typing.List[str]: + def get_subsequence(self, cognate: RowType, form: Optional[str] = None) -> list[str]: """ Compute the subsequence of the morphemes of a form which is specified in a partial cognate assignment. :param cognate: A `dict` holding the data of a row from a `CognateTable`. 
""" - return resolve_slices( - cognate, - self, - ('CognateTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"), - ('FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"), - 'Form_ID', - target_row=form) + target_row = form or self.get_row('FormTable', cognate['Form_ID']) + return multislice_with_split( + target_row[self['FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"].name], + cognate[self['CognateTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"].name], + ) class ParallelText(Dataset): + """Implements the CLDF ParallelText module.""" @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'FormTable' def get_equivalent(self, functional_equivalent, form=None): - return resolve_slices( - functional_equivalent, - self, - ('FunctionalEquivalentTable', - "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"), - ('FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"), - 'Form_ID', - target_row=form) + """Get the forms fulfilling an equivalent function in the texts.""" + slice_col_name = self[ + 'FunctionalEquivalentTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"].name + sequence_col_name = self['FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"].name + target_row = form or self.get_row('FormTable', functional_equivalent['Form_ID']) + return multislice_with_split( + target_row[sequence_col_name], functional_equivalent[slice_col_name]) class Dictionary(Dataset): + """Implements the CLDF Dictionary module.""" @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'EntryTable' @@ -1264,7 +910,7 @@ class StructureDataset(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'ValueTable' @functools.cached_property @@ -1296,21 +942,26 @@ class TextCorpus(Dataset): [] """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'ExampleTable' @functools.cached_property - def texts(self) -> typing.Union[None, DictTuple]: + def texts(self) -> Optional[DictTuple]: + """Retrieve texts.""" # Some syntactic sugar to access the ORM data in a concise and meaningful way. if 'ContributionTable' in self: return self.objects('ContributionTable') + return None # pragma: no cover - def get_text(self, tid): + def get_text(self, tid: str) -> Optional[orm.Object]: + """Retrieve a text by ID.""" if 'ContributionTable' in self: return self.get_object('ContributionTable', tid) + return None # pragma: no cover @property - def sentences(self) -> typing.List[orm.Example]: + def sentences(self) -> list[orm.Example]: + """Sentences of the corpus.""" res = list(self.objects('ExampleTable')) if ('ExampleTable', 'exampleReference') in self: # Filter out alternative translations! @@ -1320,47 +971,7 @@ def sentences(self) -> typing.List[orm.Example]: return res # pragma: no cover -class ComponentWithValidation: - """ - A virtual base class for custom, component-centered validation. - """ - def __init__(self, ds: Dataset): - self.ds = ds - self.component = self.__class__.__name__ - self.table = ds[self.component] - - def validate(self, success: bool = True, log: logging.Logger = None) -> bool: - return success # pragma: no cover - - -def sniff(p: pathlib.Path) -> bool: - """ - Determine whether a file contains CLDF metadata. 
- - :param p: `pathlib.Path` object for an existing file. - :return: `True` if the file contains CLDF metadata, `False` otherwise. - """ - if not p.is_file(): # pragma: no cover - return False - try: - with p.open('rb') as fp: - c = fp.read(10) - try: - c = c.decode('utf8').strip() - except UnicodeDecodeError: - return False - if not c.startswith('{'): - return False - except (FileNotFoundError, OSError): # pragma: no cover - return False - try: - d = jsonlib.load(p) - except json.decoder.JSONDecodeError: - return False - return d.get('dc:conformsTo', '').startswith(TERMS_URL) - - -def iter_datasets(d: pathlib.Path) -> typing.Generator[Dataset, None, None]: +def iter_datasets(d: PathType) -> Generator[Dataset, None, None]: """ Discover CLDF datasets - by identifying metadata files - in a directory. @@ -1372,5 +983,4 @@ def iter_datasets(d: pathlib.Path) -> typing.Generator[Dataset, None, None]: try: yield Dataset.from_metadata(p) except ValueError as e: - logging.getLogger(__name__).warning( - "Reading {} failed: {}".format(p, e)) + logging.getLogger(__name__).warning("Reading %s failed: %s", p, e) diff --git a/src/pycldf/db.py b/src/pycldf/db.py index 36d0565..ac4404e 100644 --- a/src/pycldf/db.py +++ b/src/pycldf/db.py @@ -39,20 +39,24 @@ FOREIGN KEY(`custom.csv_id`) REFERENCES `custom.csv`(`id`) ON DELETE CASCADE ); """ -import typing +from typing import Optional, Any, Callable, Protocol, TYPE_CHECKING import inspect import pathlib import sqlite3 import functools import collections +import dataclasses -import attr import csvw import csvw.db +from csvw.db import ColSpec, TableSpec +from csvw.metadata import Table as CSVWTable from pycldf.terms import TERMS from pycldf.sources import Reference, Sources, Source -from pycldf import Dataset + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover __all__ = ['Database', 'query'] @@ -87,16 +91,19 @@ ] -@attr.s -class TableTranslation(object): +@dataclasses.dataclass +class TableTranslation: """ Specifies column name translations for a table. """ - name = attr.ib(default=None) - columns = attr.ib(default=attr.Factory(dict)) + name: str = None + columns: dict[str, str] = dataclasses.field(default_factory=dict) + +TranslationDict = dict[str, TableTranslation] -def translate(d: typing.Dict[str, TableTranslation], table: str, col=None) -> str: + +def translate(d: TranslationDict, table: str, col: str = None) -> str: """ Translate a db object name. @@ -124,7 +131,7 @@ def translate(d: typing.Dict[str, TableTranslation], table: str, col=None) -> st # 2. Since regular table names may contain underscores as well, we try to find the longest # concatenation of _-separated name parts which appears in the translation dict. # 3. We repeat step 2 until all name parts have been consumed. - def t(n): + def t_(n): if n in d: return d[n].name or n tables, comps = [], n.split('_') @@ -142,10 +149,10 @@ def t(n): tables.append(d[comps[0]].name or comps[0] if comps[0] in d else comps[0]) return '_'.join(tables) - return t(table) + return t_(table) -def clean_bibtex_key(s): +def clean_bibtex_key(s: str) -> str: # pylint: disable=C0116 return s.replace('-', '_').lower() @@ -158,23 +165,74 @@ class Database(csvw.db.Database): """ source_table_name = 'SourceTable' - def __init__(self, dataset: Dataset, **kw): + def __init__(self, dataset: 'Dataset', **kw): """ :param dataset: The :class:`Dataset` instance from which to derive the database schema. 
""" - self.dataset = dataset + self.dataset: 'Dataset' = dataset self._retranslate = collections.defaultdict(dict) self._source_cols = ['id', 'genre'] + BIBTEX_FIELDS # Source items can be referenced with case insensitive keys. So we store a mapping from # lowercase keys to the ones actually used in the source BibTeX. self._source_map = {} - infer_primary_keys = kw.pop('infer_primary_keys', False) - # We create a derived TableGroup, adding a table for the sources. tg = csvw.TableGroup.fromvalue(dataset.metadata_dict) # Assemble the translation function: + translations: TranslationDict = self._get_translations(dataset) + + # Add source table: + for src in self.dataset.sources: + for key in src: + key = clean_bibtex_key(key) + if key not in self._source_cols: + self._source_cols.append(key) + + tg.tables.append(csvw.Table.fromvalue({ + 'url': self.source_table_name, + 'tableSchema': {'columns': [{'name': n} for n in self._source_cols], 'primaryKey': 'id'} + })) + tg.tables[-1]._parent = tg + + # Add foreign keys to source table: + infer_primary_keys = kw.pop('infer_primary_keys', False) + for table in tg.tables[:-1]: + self._add_fk_to_sources(table, infer_primary_keys, translations) + + # Make sure `base` directory can be resolved: + tg._fname = dataset.tablegroup._fname + csvw.db.Database.__init__( + self, tg, translate=functools.partial(translate, translations), **kw) + + def _add_fk_to_sources( + self, + table: CSVWTable, + infer_primary_keys: bool, + translations: TranslationDict, + ): + if not table.tableSchema.primaryKey and infer_primary_keys: + for col in table.tableSchema.columns: + if col.name.lower() in PRIMARY_KEY_NAMES: + table.tableSchema.primaryKey = [col.name] + break + for col in table.tableSchema.columns: + if col.propertyUrl and col.propertyUrl.uri == TERMS['source'].uri: + table.tableSchema.foreignKeys.append(csvw.ForeignKey.fromdict({ + 'columnReference': [col.header], + 'reference': {'resource': self.source_table_name, 'columnReference': 'id'} + })) + if translations[table.local_name].name: + tl = translations[table.local_name] + translations[f'{table.local_name}_{self.source_table_name}'] = \ + TableTranslation( + name=f'{tl.name}_{self.source_table_name}', + columns={ + f'{table.local_name}_{table.tableSchema.primaryKey[0]}': + f'{tl.name}_{tl.columns[table.tableSchema.primaryKey[0]]}'}) + break + + def _get_translations(self, dataset: 'Dataset') -> TranslationDict: translations = {} for table in dataset.tables: translations[table.local_name] = TableTranslation() @@ -191,7 +249,7 @@ def __init__(self, dataset: Dataset, **kw): if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: # Translate local column names to local names of CLDF Ontology terms, prefixed # with `cldf_`: - col_name = 'cldf_{0.name}'.format(TERMS.by_uri[col.propertyUrl.uri]) + col_name = f'cldf_{TERMS.by_uri[col.propertyUrl.uri].name}' new_col_names.append(col_name.lower()) translations[table.local_name].columns[col.header] = col_name self._retranslate[table.local_name][col_name] = col.header @@ -200,60 +258,12 @@ def __init__(self, dataset: Dataset, **kw): if not (col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri): if col.header.lower() in new_col_names: # A name clash! We translate the old column name! 
- col_name = '_{}'.format(col.header) + col_name = f'_{col.header}' translations[table.local_name].columns[col.header] = col_name self._retranslate[table.local_name][col_name] = col.header + return translations - # Add source table: - for src in self.dataset.sources: - for key in src: - key = clean_bibtex_key(key) - if key not in self._source_cols: - self._source_cols.append(key) - - tg.tables.append(csvw.Table.fromvalue({ - 'url': self.source_table_name, - 'tableSchema': { - 'columns': [dict(name=n) for n in self._source_cols], - 'primaryKey': 'id' - } - })) - tg.tables[-1]._parent = tg - - # Add foreign keys to source table: - for table in tg.tables[:-1]: - if not table.tableSchema.primaryKey and infer_primary_keys: - for col in table.tableSchema.columns: - if col.name.lower() in PRIMARY_KEY_NAMES: - table.tableSchema.primaryKey = [col.name] - break - for col in table.tableSchema.columns: - if col.propertyUrl and col.propertyUrl.uri == TERMS['source'].uri: - table.tableSchema.foreignKeys.append(csvw.ForeignKey.fromdict({ - 'columnReference': [col.header], - 'reference': { - 'resource': self.source_table_name, - 'columnReference': 'id' - } - })) - if translations[table.local_name].name: - tl = translations[table.local_name] - translations['{0}_{1}'.format(table.local_name, self.source_table_name)] = \ - TableTranslation( - name='{0}_{1}'.format(tl.name, self.source_table_name), - columns={'{0}_{1}'.format( - table.local_name, table.tableSchema.primaryKey[0], - ): '{0}_{1}'.format( - tl.name, tl.columns[table.tableSchema.primaryKey[0]], - )}) - break - - # Make sure `base` directory can be resolved: - tg._fname = dataset.tablegroup._fname - csvw.db.Database.__init__( - self, tg, translate=functools.partial(translate, translations), **kw) - - def association_table_context(self, table, column, fkey): + def association_table_context(self, table: TableSpec, column: ColSpec, fkey: str): if self.translate(table.name, column) == 'cldf_source': # We decompose references into the source ID and optional pages. Pages are stored as # `context` of the association table and composed again in `select_many_to_many`. @@ -275,13 +285,13 @@ def association_table_context(self, table, column, fkey): return csvw.db.Database.association_table_context( self, table, column, fkey) # pragma: no cover - def select_many_to_many(self, db, table, context): + def select_many_to_many(self, db, table: TableSpec, context): if table.name.endswith('_' + self.source_table_name): atable = table.name.partition('_' + self.source_table_name)[0] if self.translate(atable, context) == 'cldf_source': # Compose references: res = csvw.db.Database.select_many_to_many(self, db, table, None) - return {k: ['{0}'.format(Reference(*vv)) for vv in v] for k, v in res.items()} + return {k: [f'{Reference(*vv)}' for vv in v] for k, v in res.items()} return csvw.db.Database.select_many_to_many(self, db, table, context) # pragma: no cover def write(self, _force=False, _exists_ok=False, **items): @@ -293,7 +303,8 @@ def write(self, _force=False, _exists_ok=False, **items): return csvw.db.Database.write( self, _force=False, _exists_ok=False, _skip_extra=True, **items) - def write_from_tg(self, _force: bool = False, _exists_ok: bool = False): + def write_from_tg( # pylint: disable=W0221 + self, _force: bool = False, _exists_ok: bool = False): """ Write the data from `self.dataset` to the database. 
""" @@ -309,7 +320,7 @@ def write_from_tg(self, _force: bool = False, _exists_ok: bool = False): self._source_map[src.id.lower()] = src.id return self.write(_force=_force, _exists_ok=_exists_ok, **items) - def query(self, sql: str, params=None) -> list: + def query(self, sql: str, params=None) -> list[Any]: """ Run `sql` on the database, returning the list of results. """ @@ -317,7 +328,7 @@ def query(self, sql: str, params=None) -> list: cu = conn.execute(sql, params or ()) return list(cu.fetchall()) - def retranslate(self, table, item): + def retranslate(self, table: CSVWTable, item): # pylint: disable=C0116 return {self._retranslate.get(table.local_name, {}).get(k, k): v for k, v in item.items()} @staticmethod @@ -373,21 +384,22 @@ def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4) -> return self.dataset.write_metadata(dest / mdname) -class AggregateClass(typing.Protocol): # pragma: no cover - def step(self, value): +class AggregateClass(Protocol): # pragma: no cover # pylint: disable=C0115 + def step(self, value): # pylint: disable=C0116 ... - def finalize(self): + def finalize(self): # pylint: disable=C0116 ... -def query(conn: sqlite3.Connection, - sql: str, - params=None, - functions: typing.Optional[typing.List[typing.Callable]] = None, - aggregates: typing.Optional[typing.List[AggregateClass]] = None, - collations: typing.Optional[typing.List[typing.Callable]] = None) \ - -> typing.Generator[typing.Any, None, None]: +def query( # pylint: disable=R0913,R0917 + conn: sqlite3.Connection, + sql: str, + params=None, + functions: Optional[list[Callable]] = None, + aggregates: Optional[list[AggregateClass]] = None, + collations: Optional[list[Callable]] = None, +) -> list[Any]: """ Note: Passing lambdas or functools.partial objects as function requires passing an explicit name as well. diff --git a/src/pycldf/ext/discovery.py b/src/pycldf/ext/discovery.py index 74fb4bf..35ed69f 100644 --- a/src/pycldf/ext/discovery.py +++ b/src/pycldf/ext/discovery.py @@ -16,7 +16,7 @@ resolver for DOI URLs pointing to the Zenodo archive. """ import re -import typing +from typing import Optional, Union import pathlib import zipfile import warnings @@ -28,7 +28,8 @@ from csvw.utils import is_url from pycldf import Dataset, iter_datasets, sniff -from pycldf.util import url_without_fragment +from pycldf.urlutil import url_without_fragment +from pycldf._compat import entry_points_select __all__ = ['get_dataset', 'DatasetResolver'] EP = 'pycldf_dataset_resolver' @@ -36,7 +37,7 @@ _resolvers = [] -class DatasetResolver: +class DatasetResolver: # pylint: disable=R0903 """ Virtual base class for dataset resolvers. @@ -46,8 +47,11 @@ class DatasetResolver: """ priority = 5 - def __call__(self, loc: str, download_dir: pathlib.Path) \ - -> typing.Union[None, Dataset, pathlib.Path]: + def __call__( + self, + loc: str, + download_dir: pathlib.Path, + ) -> Union[None, Dataset, pathlib.Path]: """ :param loc: URL pointing to a place where datasets are archived. :param download_dir: A directory to which resolvers can download data. @@ -58,43 +62,45 @@ def __call__(self, loc: str, download_dir: pathlib.Path) \ raise NotImplementedError() # pragma: no cover -class LocalResolver(DatasetResolver): +class LocalResolver(DatasetResolver): # pylint: disable=R0903 """ Resolves dataset locators specifying local file paths. 
""" priority = 100 - def __call__(self, loc: str, download_dir, base: typing.Optional[pathlib.Path]) \ - -> typing.Union[None, pathlib.Path]: + def __call__( + self, + loc: str, + download_dir, + base: Optional[pathlib.Path], + ) -> Optional[pathlib.Path]: """ :return: a local path to a directory """ if isinstance(loc, str) and is_url(loc): - return + return None loc = pathlib.Path(loc) if loc.resolve() != loc and base: # A relative path, to be interpreted relative to base loc = base.resolve().joinpath(loc) if loc.exists(): return loc + return None # pragma: no cover -class GenericUrlResolver(DatasetResolver): +class GenericUrlResolver(DatasetResolver): # pylint: disable=R0903 """ URL resolver which works for generic URLs provided they point to a CLDF metadata file. """ priority = -1 - def __call__(self, loc, download_dir): + def __call__(self, loc, download_dir) -> Optional[Dataset]: if is_url(loc): - try: - return Dataset.from_metadata(loc) - except: # noqa: E722 # pragma: no cover - raise - pass + return Dataset.from_metadata(loc) + return None # pragma: no cover -class GitHubResolver(DatasetResolver): +class GitHubResolver(DatasetResolver): # pylint: disable=R0903 """ Resolves dataset locators of the form "https://github.com///tree/", e.g. https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1 @@ -103,55 +109,60 @@ class GitHubResolver(DatasetResolver): """ priority = 3 - def __call__(self, loc, download_dir): + def __call__(self, loc, download_dir) -> Optional[pathlib.Path]: url = urllib.parse.urlparse(loc) - if url.netloc == 'github.com' and re.search(r'/[v\.0-9]+$', url.path): + if url.netloc == 'github.com' and re.search(r'/[v.0-9]+$', url.path): comps = url.path.split('/') - z = download_dir / '{}-{}-{}.zip'.format(comps[1], comps[2], comps[-1]) - url = "https://github.com/{}/{}/archive/refs/tags/{}.zip".format( - comps[1], comps[2], comps[-1]) + z = download_dir / f'{comps[1]}-{comps[2]}-{comps[-1]}.zip' + url = f"https://github.com/{comps[1]}/{comps[2]}/archive/refs/tags/{comps[-1]}.zip" urllib.request.urlretrieve(url, z) - zf = zipfile.ZipFile(z) - dirs = {info.filename.split('/')[0] for info in zf.infolist()} - assert len(dirs) == 1 - zf.extractall(download_dir) + with zipfile.ZipFile(z) as zf: + dirs = {info.filename.split('/')[0] for info in zf.infolist()} + assert len(dirs) == 1 + zf.extractall(download_dir) z.unlink() return download_dir / dirs.pop() + return None class DatasetLocator(str): + """Dataset locators are URLs with identifying information added to the fragment.""" @functools.cached_property - def parsed_url(self) -> urllib.parse.ParseResult: + def parsed_url(self) -> urllib.parse.ParseResult: # pylint: disable=C0116 return urllib.parse.urlparse(self) @property - def url_without_fragment(self): + def url_without_fragment(self): # pylint: disable=C0116 return url_without_fragment(self.parsed_url) - def match(self, dataset: Dataset) -> bool: + def match(self, dataset: Dataset) -> bool: # pylint: disable=C0116 if self.parsed_url.fragment: key, _, value = self.parsed_url.fragment.partition('=') return dataset.properties.get(key) == value if value else key in dataset.properties return True -def get_resolvers(): +def get_resolvers() -> list[type]: + """Register resolvers defined via entry points.""" if not _resolvers: - eps = entry_points() - for ep in set(eps.select(group=EP) if hasattr(eps, 'select') else eps.get(EP, [])): + for ep in set(entry_points_select(entry_points(), EP)): try: _resolvers.append(ep.load()()) except ImportError: # pragma: no cover - 
warnings.warn('ImportError loading entry point {0.name}'.format(ep)) + warnings.warn(f'ImportError loading entry point {ep.name}') continue return sorted(_resolvers, key=lambda res: -res.priority) -def _get_dataset(locator: DatasetLocator, location: typing.Union[None, Dataset, pathlib.Path]): +def _get_dataset( + locator: DatasetLocator, + location: Union[None, Dataset, pathlib.Path], +) -> Optional[Dataset]: + """Determine whether locator matches location and if so, resolve to a Dataset instance.""" if isinstance(location, Dataset): if locator.match(location): return location - return + return None if location.is_dir(): for ds in iter_datasets(location): if locator.match(ds): @@ -160,11 +171,12 @@ def _get_dataset(locator: DatasetLocator, location: typing.Union[None, Dataset, ds = Dataset.from_metadata(location) if sniff(location) else Dataset.from_data(location) if locator.match(ds): return ds + return None # pragma: no cover def get_dataset(locator: str, download_dir: pathlib.Path, - base: typing.Optional[pathlib.Path] = None) -> Dataset: + base: Optional[pathlib.Path] = None) -> Dataset: """ :param locator: Dataset locator as specified in "Dataset discovery". :param download_dir: Directory to which to download remote data if necessary. @@ -182,4 +194,4 @@ def get_dataset(locator: str, res = _get_dataset(locator, res) if res: return res - raise ValueError('Could not resolve dataset locator {}'.format(locator)) + raise ValueError(f'Could not resolve dataset locator {locator}') diff --git a/src/pycldf/ext/markdown.py b/src/pycldf/ext/markdown.py index 108dea4..c92694a 100644 --- a/src/pycldf/ext/markdown.py +++ b/src/pycldf/ext/markdown.py @@ -4,24 +4,25 @@ For an example, see :class:`FilenameToComponent`. """ import re -import typing +from typing import Optional, Union, Any import pathlib import warnings -import collections.abc +import collections +from collections.abc import Mapping +import dataclasses import yaml import jmespath -import attr import frontmatter import clldutils from clldutils.markup import MarkdownLink -from .discovery import get_dataset -from pycldf.util import pkg_path, url_without_fragment -from pycldf.dataset import MD_SUFFIX +from pycldf.util import pkg_path, MD_SUFFIX +from pycldf.urlutil import url_without_fragment from pycldf.sources import Source from pycldf import Dataset from pycldf import orm +from .discovery import get_dataset __all__ = ['CLDFMarkdownLink', 'CLDFMarkdownText', 'FilenameToComponent'] @@ -31,14 +32,14 @@ METADATA_COMPONENT = 'Metadata' -class DatasetMapping(collections.abc.Mapping): +class DatasetMapping(Mapping): """ A read-only mapping of prefixes to datasets. """ key_pattern = re.compile('[a-zA-Z0-9_]+') @staticmethod - def to_dict(o): + def to_dict(o): # pylint: disable=C0116 if isinstance(o, DatasetMapping): return o.m return {} if not o else ({None: o} if isinstance(o, (str, Dataset)) else o) @@ -46,8 +47,8 @@ def to_dict(o): def __init__(self, m1, m2=None, - doc_path: typing.Optional[pathlib.Path] = None, - download_dir: typing.Optional[pathlib.Path] = None): + doc_path: Optional[pathlib.Path] = None, + download_dir: Optional[pathlib.Path] = None): """ :param m1: Mapping of prefixes to datasets (locators). :param m2: Mapping of prefixes to datasets (locators) to update `m1`. 
@@ -64,7 +65,7 @@ def __init__(self, if not isinstance(self.m[k], Dataset): self.m[k] = get_dataset(self.m[k], download_dir, doc_path) - def __getitem__(self, prefix: typing.Union[str, None]) -> Dataset: + def __getitem__(self, prefix: Union[str, None]) -> Dataset: """ Get a `Dataset` mapped to a prefix. """ @@ -77,7 +78,7 @@ def __len__(self): return len(self.m) -@attr.s +@dataclasses.dataclass class CLDFMarkdownLink(MarkdownLink): """ CLDF Markdown links are specified using URLs of a particular format. @@ -88,18 +89,20 @@ class CLDFMarkdownLink(MarkdownLink): fragment_pattern = re.compile(r'cldf(-(?P[a-zA-Z0-9_]+))?:') @property - def url_without_fragment(self): + def url_without_fragment(self) -> str: + """Return the HREF value of the link without the fragment.""" return url_without_fragment(self.parsed_url) @staticmethod - def format_url(path, objid, prefix=None): - return '{}#cldf{}:{}'.format(path, '-' + prefix if prefix else '', objid) + def format_url(path, objid, prefix=None) -> str: + """Format the HREF value for a CLDF Markdown link.""" + prefix = '-' + prefix if prefix else '' + return f'{path}#cldf{prefix}:{objid}' @classmethod def from_component(cls, comp, objid='__all__', label=None, prefix=None) -> 'CLDFMarkdownLink': - return cls( - label=label or '{}:{}'.format(comp, objid), - url=cls.format_url(comp, objid, prefix=prefix)) + """Create a CLDF Markdown link for an object in a component.""" + return cls(label=label or f'{comp}:{objid}', url=cls.format_url(comp, objid, prefix=prefix)) @property def is_cldf_link(self) -> bool: @@ -109,25 +112,27 @@ def is_cldf_link(self) -> bool: return bool(self.fragment_pattern.match(self.parsed_url.fragment)) @property - def prefix(self) -> typing.Union[None, str]: + def prefix(self) -> Optional[str]: """ The dataset prefix associated with a CLDF Markdown link. """ if self.is_cldf_link: return self.fragment_pattern.match(self.parsed_url.fragment).group('prefix') + return None # pragma: no cover @property - def table_or_fname(self) -> typing.Union[None, str]: + def table_or_fname(self) -> Optional[str]: """ The last path component of the URL of a CLDF Markdown link. """ if self.is_cldf_link: return self.parsed_url.path.split('/')[-1] + return None # pragma: no cover - def component(self, - cldf: typing.Optional[ - typing.Union[Dataset, typing.Dict[str, Dataset], DatasetMapping]] = None, - ) -> typing.Union[str, None]: + def component( + self, + cldf: Optional[Union[Dataset, dict[str, Dataset], DatasetMapping]] = None, + ) -> Union[str, None]: """ :param cldf: `pycldf.Dataset` instance to which the link refers. :return: Name of the CLDF component the link pertains to or `None`. @@ -143,9 +148,9 @@ def component(self, if isinstance(cldf, (dict, DatasetMapping)): cldf = cldf[self.prefix] - if name == cldf.bibname or name == SOURCE_COMPONENT: + if name in (cldf.bibname, SOURCE_COMPONENT): return SOURCE_COMPONENT - if name == cldf.filename or name == METADATA_COMPONENT: + if name in (cldf.filename, METADATA_COMPONENT): return METADATA_COMPONENT try: return cldf.get_tabletype(cldf[name]) @@ -153,12 +158,13 @@ def component(self, return None @property - def objid(self) -> typing.Union[None, str]: + def objid(self) -> Optional[str]: """ The identifier of the object referenced by a CLDF Markdown link. 
""" if self.is_cldf_link: return self.parsed_url.fragment.split(':', maxsplit=1)[-1] + return None # pragma: no cover @property def all(self) -> bool: @@ -167,7 +173,7 @@ def all(self) -> bool: """ return self.objid == '__all__' - def get_row(self, cldf: typing.Union[Dataset, DatasetMapping]) -> dict: + def get_row(self, cldf: Union[Dataset, DatasetMapping]) -> dict: """ Resolve the reference in a CLDF Markdown link to a row (`dict`) in the CLDF `Dataset`. """ @@ -175,7 +181,7 @@ def get_row(self, cldf: typing.Union[Dataset, DatasetMapping]) -> dict: ds = DatasetMapping(cldf)[self.prefix] return ds.get_row(self.component(cldf=ds), self.objid) - def get_object(self, cldf: typing.Union[Dataset, DatasetMapping]) -> orm.Object: + def get_object(self, cldf: Union[Dataset, DatasetMapping]) -> orm.Object: """ Resolve the reference in a CLDF Markdown link to an ORM object in the CLDF `Dataset`. """ @@ -213,9 +219,9 @@ def render_link(self, link): :cvar metadata_component: Name of the special "Metadata" component. """ def __init__(self, - text: typing.Union[pathlib.Path, str], - dataset_mapping: typing.Optional[typing.Union[str, Dataset, dict]] = None, - download_dir: typing.Optional[pathlib.Path] = None): + text: Union[pathlib.Path, str], + dataset_mapping: Optional[Union[str, Dataset, dict]] = None, + download_dir: Optional[pathlib.Path] = None): """ :param text: CLDF Markdown text either to be read from a path or specified as `str`. :param dataset_mapping: Mapping of dataset prefixes to `Dataset` instances. May override \ @@ -223,14 +229,14 @@ def __init__(self, :download_dir: Optional path to a directory to download data for remote datasets. """ p = frontmatter.loads(text) if isinstance(text, str) else frontmatter.load(str(text)) - self.metadata = p.metadata - self.dataset_mapping = DatasetMapping( + self.metadata: dict[str, Any] = p.metadata + self.dataset_mapping: Mapping[Union[str, None], Dataset] = DatasetMapping( p.get(DATASETS_MAPPING), dataset_mapping, text.parent if isinstance(text, pathlib.Path) else None, download_dir, ) - self.text = p.content + self.text: str = p.content self._datadict = collections.defaultdict(dict) for prefix, ds in self.dataset_mapping.items(): self._datadict[prefix][SOURCE_COMPONENT] = {src.id: src for src in ds.sources} @@ -241,9 +247,9 @@ def frontmatter(self) -> str: """ The markdown documents metadata formatted as YAML frontmatter. """ - return '---\n{}---'.format(yaml.dump(self.metadata)) + return f'---\n{yaml.dump(self.metadata)}---' - def get_object(self, ml: CLDFMarkdownLink) -> typing.Union[list, orm.Object, Source, dict]: + def get_object(self, ml: CLDFMarkdownLink) -> Union[list, orm.Object, Source, dict]: """ Resolve the reference in a CLDF Markdown link to the matching object from a mapped dataset. @@ -273,20 +279,23 @@ def get_object(self, ml: CLDFMarkdownLink) -> typing.Union[list, orm.Object, Sou return list(self._datadict[ml.prefix][key].values()) if ml.all \ else self._datadict[ml.prefix][key][ml.objid] - def _render_link(self, link): + def _render_link(self, link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]: + """Dispatches to custom rendering in case of CLDF links.""" if link.is_cldf_link: return self.render_link(link) return link - def render_link(self, cldf_link: CLDFMarkdownLink) -> typing.Union[str, CLDFMarkdownLink]: + def render_link(self, cldf_link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]: """ CLDF Markdown renderers must implement this method. 
""" raise NotImplementedError() # pragma: no cover - def render(self, - simple_link_detection: bool = True, - markdown_kw: typing.Optional[dict] = None) -> str: + def render( + self, + simple_link_detection: bool = True, + markdown_kw: Optional[dict[str, Any]] = None, + ) -> str: """ A markdown string with CLDF Markdown links replaced. """ @@ -297,7 +306,7 @@ def render(self, category=UserWarning) kw = {} else: - kw = dict(simple=simple_link_detection, markdown_kw=markdown_kw) + kw = {'simple': simple_link_detection, 'markdown_kw': markdown_kw} return CLDFMarkdownLink.replace(self.text, self._render_link, **kw) @@ -305,7 +314,7 @@ class FilenameToComponent(CLDFMarkdownText): """ Renderer to replace filenames in CLDF Markdown links with CLDF component names. """ - def render_link(self, cldf_link): + def render_link(self, cldf_link: CLDFMarkdownLink) -> CLDFMarkdownLink: """ Rewrites to URL of CLDF Markdown links, using the component name as path component. """ diff --git a/src/pycldf/fileutil.py b/src/pycldf/fileutil.py new file mode 100644 index 0000000..75b527b --- /dev/null +++ b/src/pycldf/fileutil.py @@ -0,0 +1,65 @@ +""" +Functionality to access and manipulate files. +""" +import re +import math +import string +from typing import Union, Optional +import pathlib +import itertools + + +PathType = Union[str, pathlib.Path] + + +def splitfile(p: PathType, chunksize: int, total: Optional[int] = None) -> list[pathlib.Path]: + """ + :param p: Path of the file to split. + :param chunksize: The maximal size of the chunks the file will be split into. + :param total: The size of the input file. + :return: The list of paths of files that the input has been split into. + """ + p = pathlib.Path(p) + total = total or p.stat().st_size + if total <= chunksize: # Nothing to do. + return [p] + nchunks = math.ceil(total / chunksize) + suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3 + suffixes = [ + ''.join(t) for t in + itertools.combinations_with_replacement(string.ascii_lowercase, suffix_length)] + + res = [] + with p.open('rb') as f: + chunk = f.read(chunksize) + while chunk: + pp = p.parent.joinpath(f'{p.name}.{suffixes.pop(0)}') + pp.write_bytes(chunk) + res.append(pp) + chunk = f.read(chunksize) # read the next chunk + + p.unlink() + return res + + +def catfile(p: PathType) -> bool: + """ + Restore a file that has been split into chunks. + + We determine if a file has been split by looking for files in the parent directory with suffixes + as created by `splitfile`. + """ + p = pathlib.Path(p) + if p.exists(): # Nothing to do. + return False + # Check, whether the file has been split. + suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name} + if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes): + # ok, let's concatenate the files: + with p.open('wb') as f: + for suffix in sorted(suffixes): + if re.fullmatch(r'\.[a-z]{2,3}', suffix): + f.write(suffixes[suffix].read_bytes()) + suffixes[suffix].unlink() + return True + return False # pragma: no cover diff --git a/src/pycldf/markdown.py b/src/pycldf/markdown.py new file mode 100644 index 0000000..226655b --- /dev/null +++ b/src/pycldf/markdown.py @@ -0,0 +1,166 @@ +""" +Functionality to render a Dataset's metadata to a Markdown document. 
+""" +import re +import html +import pathlib +from typing import TYPE_CHECKING, Any, Optional + +from clldutils.misc import slug + +from pycldf.util import qname2url +from pycldf.fileutil import PathType + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['metadata2markdown'] + + +def metadata2markdown(ds: 'Dataset', path: PathType, rel_path: Optional[str] = './') -> str: + """ + Render the metadata of a dataset as markdown. + + :param ds: `Dataset` instance + :param path: `pathlib.Path` of the metadata file + :param rel_path: `str` to use a relative path when creating links to data files + :return: `str` with markdown formatted text + """ + path = pathlib.Path(path) + return '\n'.join(_iter_markdown(ds, pathlib.Path(path), rel_path)) + + +def _qname2link(qname: str, html_=False) -> str: + url = qname2url(qname) + if url: + return f'{qname}' if html_ else f'[{qname}]({url})' + return qname + + +def _htmlify(obj: Any, rel_path: str, key=None) -> str: + """ + For inclusion in tables we must use HTML for lists. + """ + if isinstance(obj, list): + items = [f'
<li>{_htmlify(item, rel_path, key=key)}</li>' for item in obj]
+        return f'<ul>{"".join(items)}</ul>'
+
+    if isinstance(obj, dict):
+        if key == 'prov:wasGeneratedBy' \
+                and set(obj.keys()).issubset({'dc:title', 'dc:description', 'dc:relation'}):
+            desc = obj.get('dc:description') or ''
+            rel = obj.get('dc:relation')
+            if rel:
+                desc = (desc + '<br>') if desc else desc
+                desc += f'<a href="{rel}">{rel}</a>'
+            return f"{obj.get('dc:title') or ''}: {desc}"
+
+        if obj.get('rdf:type') == 'prov:Entity' and 'rdf:about' in obj:
+            label = obj.get('dc:title')
+            if (not label) or label == 'Repository':
+                label = obj['rdf:about']
+            url = obj['rdf:about']
+            if ('github.com' in url) and ('/tree/' not in url) and ('dc:created' in obj):
+                tag = obj['dc:created']
+                if '-g' in tag:
+                    tag = tag.split('-g')[-1]
+                url = f'{url}/tree/{tag}'
+            if label == obj['rdf:about']:
+                label = label.split('github.com/')[-1]
+            version = f' {obj.get("dc:created")}' or ''
+            return f'<a href="{url}">{label}</a>{version}'
+
+        items = [
+            f'<tr><td>{_qname2link(k, html_=True)}</td><td>{html.escape(str(v))}</td></tr>'
+            for k, v in obj.items()]
+        return f'<table>{"".join(items)}</table>'
+
+    return str(obj)
+
+
+def _iter_properties(obj, rel_path):
+    if obj.common_props.get('dc:description'):
+        yield obj.common_props['dc:description'] + '\n'
+    yield 'property | value\n --- | ---'
+    for k, v in obj.common_props.items():
+        if not v:
+            continue
+        if k not in ('dc:description', 'dc:title', 'dc:source'):
+            if k == 'dc:conformsTo':
+                v = f'[CLDF {v.split("#")[1]}]({v})'
+            yield f'{_qname2link(k)} | {_htmlify(v, rel_path, key=k)}'
+    yield ''
+
+
+def _colrow(col, fks, pk, ds, rel_path):
+    dt = f"`{col.datatype.base if col.datatype else 'string'}`"
+    if col.datatype:
+        if col.datatype.format:
+            if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format):
+                dt += '
<br>Valid choices:<br>'
+                dt += ''.join(f' `{w}`' for w in col.datatype.format.split('|'))
+            elif col.datatype.base == 'string':
+                dt += f'<br>Regex: `{col.datatype.format}`'
+        if col.datatype.minimum:
+            dt += f'<br>≥ {col.datatype.minimum}'
+        if col.datatype.maximum:
+            dt += f'<br>≤ {col.datatype.maximum}'
+    if col.separator:
+        dt = f'list of {dt} (separated by `{col.separator}`)'
+    desc = col.common_props.get('dc:description', '').replace('\n', ' ')
+
+    if col.name in pk:
+        desc = (desc + '<br>') if desc else desc
+        desc += 'Primary key'
+
+    if col.name in fks:
+        desc = (desc + '<br>') if desc else desc
+        pkcol, table = fks[col.name]
+        desc += f'References [{table}::{pkcol}](#table-{slug(table)})'
+    elif col.propertyUrl \
+            and col.propertyUrl.uri == "http://cldf.clld.org/v1.0/terms.rdf#source" \
+            and 'dc:source' in ds.properties:
+        desc = (desc + '
    ') if desc else desc + desc += (f"References [{ds.properties['dc:source']}::BibTeX-key]" + f"({rel_path}{ds.properties['dc:source']})") + + return ' | '.join([ + f'[{col.name}]({col.propertyUrl})' if col.propertyUrl else f'`{col.name}`', dt, desc]) + + +def _existing_fname_in_cldf_dir(ds, fname: str) -> Optional[str]: + """Returns an existing (possibly zipped) file matching fname.""" + if pathlib.Path(ds.directory).joinpath(fname).exists(): + return fname + zipped = fname + '.zip' + if pathlib.Path(ds.directory).joinpath(zipped).exists(): + return zipped + return None + + +def _iter_markdown(ds: 'Dataset', path: pathlib.Path, rel_path: Optional[str] = './'): + def file_link(fname): + return f'[{fname}]({rel_path}{fname})' + + yield f'# {ds.properties.get("dc:title", ds.module)}\n' + if path.suffix == '.json': + yield f'**CLDF Metadata**: {file_link(path.name)}\n' + if 'dc:source' in ds.properties: + src = _existing_fname_in_cldf_dir(ds, ds.properties['dc:source']) + if src: + yield f'**Sources**: {file_link(src)}\n' + yield from _iter_properties(ds.tablegroup, rel_path) + + for table in ds.tables: + fks = { + fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) + for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} + src = _existing_fname_in_cldf_dir(ds, table.url.string) + table_name = file_link(src) if src else table.url + yield f'\n## Table {table_name}\n' + yield from _iter_properties(table, rel_path) + yield '\n### Columns\n' + yield 'Name/Property | Datatype | Description' + yield ' --- | --- | --- ' + for col in table.tableSchema.columns: + yield _colrow(col, fks, table.tableSchema.primaryKey, ds, rel_path) diff --git a/src/pycldf/media.py b/src/pycldf/media.py index 13c13a1..62163cb 100644 --- a/src/pycldf/media.py +++ b/src/pycldf/media.py @@ -24,8 +24,7 @@ import io import json import base64 -import typing -import logging +from typing import Union, TYPE_CHECKING, Optional, Callable import pathlib import zipfile import functools @@ -33,17 +32,25 @@ import collections import urllib.parse import urllib.request +from collections.abc import Generator -from clldutils.misc import log_or_raise -import pycldf -from pycldf import orm -from pycldf.util import splitfile, catfile +from csvw.metadata import Table, Column from csvw.datatypes import anyURI +from pycldf import orm +from pycldf.fileutil import splitfile, catfile, PathType + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover + from pycldf.validators import DatasetValidator # pragma: no cover + __all__ = ['Mimetype', 'MediaTable', 'File'] +StrOrBytes = Union[str, bytes] -class File: + +class File: # pylint: disable=too-many-instance-attributes """ A `File` represents a row in a MediaTable, providing functionality to access the contents. @@ -56,15 +63,16 @@ class File: - :meth:`save` will write a (deflated) ZIP archive containing the specified file as single \ member. 
""" - def __init__(self, media: 'MediaTable', row: dict): - self.row = row - self.id = row[media.filename_col.name] - self._mimetype = row[media.mimetype_col.name] - self.url = None + def __init__(self, media: 'MediaTable', row: 'RowType'): + self.row: 'RowType' = row + self.id: str = row[media.filename_col.name] + self._mimetype: str = row[media.mimetype_col.name] + self.url: Optional[str] = None self.scheme = None self.url_reader = media.url_reader - self.path_in_zip = row.get(media.path_in_zip_col.name) if media.path_in_zip_col else None - self._dsdir = media.ds.directory + self.path_in_zip: Optional[str] \ + = row.get(media.path_in_zip_col.name) if media.path_in_zip_col else None + self._dsdir: pathlib.Path = media.ds.directory if media.url_col: # 1. Look for a downloadUrl property: @@ -83,7 +91,7 @@ def __init__(self, media: 'MediaTable', row: dict): @classmethod def from_dataset( - cls, ds: pycldf.Dataset, row_or_object: typing.Union[dict, orm.Media]) -> 'File': + cls, ds: 'Dataset', row_or_object: Union[dict, orm.Media]) -> 'File': """ Factory method to instantiate a `File` bypassing the `Media` wrapper. """ @@ -114,7 +122,7 @@ def mimetype(self) -> 'Mimetype': if mt: return Mimetype(mt) if self.scheme == 'data': - mt, _, data = self.parsed_url.path.partition(',') + mt, _, _ = self.parsed_url.path.partition(',') if mt.endswith(';base64'): mt = mt.replace(';base64', '').strip() if mt: @@ -122,13 +130,14 @@ def mimetype(self) -> 'Mimetype': # There's an explicit default mimetype for data URLs! return Mimetype('text/plain;charset=US-ASCII') if self.scheme in ['http', 'https']: - res = urllib.request.urlopen(urllib.request.Request(self.url, method="HEAD")) + res = urllib.request.urlopen( # too lazy to mock with with. pylint: disable=R1732 + urllib.request.Request(self.url, method="HEAD")) mt = res.headers.get('Content-Type') if mt: return Mimetype(mt) return Mimetype('application/octet-stream') - def local_path(self, d: pathlib.Path = None) -> typing.Union[pathlib.Path, None]: + def local_path(self, d: pathlib.Path = None) -> Optional[pathlib.Path]: """ :return: The expected path of the file in the directory `d`. """ @@ -136,14 +145,15 @@ def local_path(self, d: pathlib.Path = None) -> typing.Union[pathlib.Path, None] if self.scheme == 'file': return self._dsdir / urllib.parse.unquote(self.relpath) return None - return d.joinpath('{}{}'.format( - self.id, '.zip' if self.path_in_zip else (self.mimetype.extension or ''))) + zip_ext = '.zip' if self.path_in_zip else (self.mimetype.extension or '') + return d.joinpath(f'{self.id}{zip_ext}') def read_json(self, d=None): + """Reads JSON data.""" assert self.mimetype.subtype.endswith('json') return json.loads(self.read(d=d)) - def read(self, d=None) -> typing.Union[None, str, bytes]: + def read(self, d: Optional[pathlib.Path] = None) -> Optional[StrOrBytes]: """ :param d: A local directory where the file has been saved before. If `None`, the content \ will be read from the file's URL. 
@@ -156,17 +166,18 @@ def read(self, d=None) -> typing.Union[None, str, bytes]: zipcontent = self.url_reader[self.scheme]( self.parsed_url, Mimetype('application/zip')) if zipcontent: - zf = zipfile.ZipFile(io.BytesIO(zipcontent)) - return self.mimetype.read(zf.read(self.path_in_zip)) - return # pragma: no cover + with zipfile.ZipFile(io.BytesIO(zipcontent)) as zf: + return self.mimetype.read(zf.read(self.path_in_zip)) + return None # pragma: no cover if d: return self.mimetype.read(self.local_path(d).read_bytes()) if self.url: try: return self.url_reader[self.scheme](self.parsed_url, self.mimetype) - except KeyError: - raise ValueError('Unsupported URL scheme: {}'.format(self.scheme)) + except KeyError as e: + raise ValueError(f'Unsupported URL scheme: {self.scheme}') from e + return None # pragma: no cover def save(self, d: pathlib.Path) -> pathlib.Path: """ @@ -189,14 +200,17 @@ def save(self, d: pathlib.Path) -> pathlib.Path: return p -class MediaTable(pycldf.ComponentWithValidation): +class MediaTable: # pylint: disable=too-many-instance-attributes """ Container class for a `Dataset`'s media items. """ - def __init__(self, ds: pycldf.Dataset, use_form_id: bool = False): - super().__init__(ds) - self.url_col = ds.get(('MediaTable', 'http://cldf.clld.org/v1.0/terms.rdf#downloadUrl')) - self.path_in_zip_col = ds.get( + def __init__(self, ds: 'Dataset'): + self.ds: 'Dataset' = ds + self.component: str = self.__class__.__name__ + self.table: Table = ds[self.component] + self.url_col: Optional[Column] = ds.get( + ('MediaTable', 'http://cldf.clld.org/v1.0/terms.rdf#downloadUrl')) + self.path_in_zip_col: Optional[Column] = ds.get( (self.component, 'http://cldf.clld.org/v1.0/terms.rdf#pathInZip')) if self.table and not self.url_col: @@ -204,13 +218,14 @@ def __init__(self, ds: pycldf.Dataset, use_form_id: bool = False): if col.propertyUrl and col.propertyUrl == 'http://www.w3.org/ns/dcat#downloadUrl': self.url_col = col break - self.id_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#id'] - self.filename_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#formReference']\ - if use_form_id else self.id_col - self.mimetype_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#mediaType'] + self.id_col: Column = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#id'] + self.filename_col: Column = self.id_col + self.mimetype_col: Column = ds[ + self.component, 'http://cldf.clld.org/v1.0/terms.rdf#mediaType'] @functools.cached_property - def url_reader(self): + def url_reader(self) -> dict[str, Callable[[urllib.parse.ParseResult, 'Mimetype'], StrOrBytes]]: + """Maps URL schemes to reader functions.""" return { 'http': read_http_url, 'https': read_http_url, @@ -219,13 +234,13 @@ def url_reader(self): 'file': functools.partial(read_file_url, self.ds.directory), } - def __iter__(self) -> typing.Generator[File, None, None]: + def __iter__(self) -> Generator[File, None, None]: for row in self.table: yield File(self, row) def split(self, chunksize: int) -> int: """ - :return: The number of media files that have been split. + :return: The number of media files that needed splitting. """ res = 0 for file in self: @@ -237,7 +252,7 @@ def split(self, chunksize: int) -> int: res += 1 return res - def cat(self): + def cat(self) -> int: """ :return: The number of media files that have been re-assembled from chunks. 
""" @@ -249,7 +264,8 @@ def cat(self): res += 1 return res - def validate(self, success: bool = True, log: logging.Logger = None) -> bool: + def validate(self, validator: 'DatasetValidator'): + """Component-specific validation.""" speaker_area_files = collections.defaultdict(list) if ('LanguageTable', 'speakerArea') in self.ds: for lg in self.ds.iter_rows('LanguageTable', 'id', 'speakerArea'): @@ -257,42 +273,38 @@ def validate(self, success: bool = True, log: logging.Logger = None) -> bool: speaker_area_files[lg['speakerArea']].append(lg['id']) for file in self: - content = None - if not file.url: - success = False - log_or_raise('File without URL: {}'.format(file.id), log=log) - elif file.scheme == 'file': - try: - content = file.read() - except FileNotFoundError: - success = False - log_or_raise( - 'Non-existing local file referenced: {} ' - 'You may have to run `cldf catmedia` to recombine files'.format(file.id), - log=log) - except Exception as e: # pragma: no cover - success = False - log_or_raise('Error reading {}: {}'.format(file.id, e), log=log) - elif file.scheme == 'data': - try: - content = file.read() - except Exception as e: # pragma: no cover - success = False - log_or_raise('Error reading {}: {}'.format(file.id, e), log=log) - if file.id in speaker_area_files and file.mimetype.subtype == 'geo+json' and content: - content = json.loads(content) - if content['type'] != 'Feature': - assert content['type'] == 'FeatureCollection' - for feature in content['features']: - lid = feature['properties'].get('cldf:languageReference') - if lid and lid in speaker_area_files[file.id]: - speaker_area_files[file.id].remove(lid) - if speaker_area_files[file.id]: - log_or_raise( - 'Error: Not all language IDs found in speakerArea GeoJSON: {}'.format( - speaker_area_files[file.id])) # pragma: no cover - - return success + self._validate_file(validator, file, speaker_area_files) + + def _validate_file(self, validator, file, speaker_area_files): + content = None + if not file.url: + validator.fail(f'File without URL: {file.id}') + elif file.scheme == 'file': + try: + content = file.read() + except FileNotFoundError: + validator.fail( + f'Non-existing local file referenced: {file.id} ' + 'You may have to run `cldf catmedia` to recombine files') + except Exception as e: # pragma: no cover # pylint: disable=W0718 + validator.fail(f'Error reading {file.id}: {e}') + elif file.scheme == 'data': + try: + content = file.read() + except Exception as e: # pragma: no cover # pylint: disable=W0718 + validator.fail(f'Error reading {file.id}: {e}') + if file.id in speaker_area_files and file.mimetype.subtype == 'geo+json' and content: + content = json.loads(content) + if content['type'] != 'Feature': + assert content['type'] == 'FeatureCollection' + for feature in content['features']: + lid = feature['properties'].get('cldf:languageReference') + if lid and lid in speaker_area_files[file.id]: + speaker_area_files[file.id].remove(lid) + if speaker_area_files[file.id]: + validator.fail( + f'Error: Not all language IDs found in speakerArea GeoJSON: ' + f'{speaker_area_files[file.id]}') # pragma: no cover Media = MediaTable @@ -327,23 +339,28 @@ def __eq__(self, other): @property def is_text(self) -> bool: + """Whether the mimetype describes text, and hence data should be read as str.""" return self.type == 'text' @property - def extension(self) -> typing.Union[None, str]: - return mimetypes.guess_extension('{}/{}'.format(self.type, self.subtype)) + def extension(self) -> Union[None, str]: + """Return a suitable 
filename extension for the mimetype.""" + return mimetypes.guess_extension(f'{self.type}/{self.subtype}') - def read(self, data: bytes) -> typing.Union[str, bytes]: + def read(self, data: bytes) -> StrOrBytes: + """Read data, inferring the encoding from the mimetype.""" if self.is_text and not isinstance(data, str): return data.decode(self.encoding) return data - def write(self, data: typing.Union[str, bytes], p: typing.Optional[pathlib.Path] = None) -> int: + def write(self, data: StrOrBytes, p: Optional[pathlib.Path] = None) -> Union[int, StrOrBytes]: + """The mimetype dictates how/if to encode data.""" res = data.encode(self.encoding) if self.is_text else data return p.write_bytes(res) if p else res -def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype): +def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from a data:// URL.""" spec, _, data = url.path.partition(',') if spec.endswith(';base64'): data = base64.b64decode(data) @@ -354,9 +371,8 @@ def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype): return data -def read_file_url(d: typing.Union[pathlib.Path, str], - url: urllib.parse.ParseResult, - mimetype: Mimetype) -> typing.Union[str, bytes]: +def read_file_url(d: PathType, url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from a file:// URL.""" path = url.path while path.startswith('/'): path = path[1:] @@ -368,5 +384,6 @@ def read_file_url(d: typing.Union[pathlib.Path, str], return mimetype.read(d.joinpath(urllib.parse.unquote(path)).read_bytes()) -def read_http_url(url: urllib.parse.ParseResult, mimetype: Mimetype): +def read_http_url(url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from an HTTP URL.""" return mimetype.read(urllib.request.urlopen(urllib.parse.urlunparse(url)).read()) diff --git a/src/pycldf/module.py b/src/pycldf/module.py new file mode 100644 index 0000000..64df73f --- /dev/null +++ b/src/pycldf/module.py @@ -0,0 +1,76 @@ +""" +Functionality to manage modules, i.e. `Dataset` subclasses implementing particular CLDF modules. +""" +import dataclasses +from typing import Union, Optional, Type + +from csvw.metadata import TableGroup + +from pycldf.terms import TERMS, term_uri +from pycldf.util import pkg_path, MD_SUFFIX + +__all__ = ['get_module_impl'] + + +@dataclasses.dataclass +class Module: + """ + Class representing a CLDF Module. + + .. seealso:: https://github.com/cldf/cldf/blob/master/README.md#cldf-modules + """ + uri: str + fname: str + + def __post_init__(self): + if self.uri not in {t.uri for t in TERMS.classes.values()}: + raise ValueError(self.uri) # pragma: no cover + + @property + def id(self) -> str: + """ + The local part of the term URI is interpreted as Module identifier. + """ + return self.uri.split('#')[1] + + def match(self, thing: Union[TableGroup, str]) -> bool: + """Check if the module described here matches thing.""" + if isinstance(thing, TableGroup): + return thing.common_props.get('dc:conformsTo') == term_uri(self.id) + if isinstance(thing, str): + return thing == self.fname or thing == self.id + return False + + +_modules = [] + + +def get_module_impl(base_class, spec: Union[TableGroup, str]) -> Optional[Type]: + """ + Returns an implementation (aka Dataset subclass) for a particular CLDF module. 
+ """ + implementations = {cls.__name__: cls for cls in base_class.__subclasses__()} + for mod in get_modules(): + if mod.match(spec): + return implementations[mod.id] + return None # pragma: no cover + + +def get_modules() -> list[Module]: + """ + We read supported CLDF modules from the default metadata files distributed with `pycldf`. + """ + global _modules # pylint: disable=global-statement + + if not _modules: + for p in pkg_path('modules').glob(f'*{MD_SUFFIX}'): + tg = TableGroup.from_file(p) + mod = Module( + tg.common_props['dc:conformsTo'], + tg.tables[0].url.string if tg.tables else None) + _modules.append(mod) + # prefer Wordlist over ParallelText (forms.csv) + _modules = sorted( + _modules, + key=lambda m: (m.id in ('Wordlist', 'ParallelText'), m.id == 'ParallelText')) + return _modules diff --git a/src/pycldf/orm.py b/src/pycldf/orm.py index d85558c..53f33d2 100644 --- a/src/pycldf/orm.py +++ b/src/pycldf/orm.py @@ -46,7 +46,7 @@ def custom_method(self): * ~35secs iterating over ``pycldf.Dataset.objects('ValueTable')`` """ import types -import typing +from typing import TYPE_CHECKING, Union, Optional, Any import decimal import functools import collections @@ -58,12 +58,14 @@ def custom_method(self): from pycldf.util import DictTuple from pycldf.sources import Reference -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover from pycldf.media import File # pragma: no cover -def to_json(s): +def to_json(s: Any) -> Union[str, float, None, list, dict]: + """Converts `s` to an object that can be serialized as JSON.""" if isinstance(s, (list, tuple)): return [to_json(ss) for ss in s] if isinstance(s, dict): @@ -77,7 +79,7 @@ def to_json(s): return str(s) -class Object: +class Object: # pylint: disable=too-many-instance-attributes """ Represents a row of a CLDF component table. @@ -95,7 +97,7 @@ class Object: # specified here: __component__ = None - def __init__(self, dataset: 'Dataset', row: dict): + def __init__(self, dataset: 'Dataset', row: 'RowType'): # Get a mapping of column names to pairs (CLDF property name, list-valued) for columns # present in the component specified by class name. 
cldf_cols = { @@ -103,29 +105,29 @@ def __init__(self, dataset: 'Dataset', row: dict): for k, v in vars(getattr(dataset.readonly_column_names, self.component)).items() if v} self._listvalued = set(v[0] for v in cldf_cols.values() if v[1]) - self.cldf = {} - self.data = collections.OrderedDict() + cldf_ = {} + self.data: collections.OrderedDict[str, Any] = collections.OrderedDict() for k, v in row.items(): # We go through the items of the row and slot them into the appropriate bags: self.data[k] = v if k in cldf_cols: - self.cldf[cldf_cols[k][0]] = v + cldf_[cldf_cols[k][0]] = v # Make cldf properties accessible as attributes: - self.cldf = types.SimpleNamespace(**self.cldf) - self.dataset = dataset - self.id = self.cldf.id - self.pk = None + self.cldf = types.SimpleNamespace(**cldf_) + self.dataset: 'Dataset' = dataset + self.id: str = self.cldf.id + self.pk: Optional[str] = None t = dataset[self.component_name()] if t.tableSchema.primaryKey and len(t.tableSchema.primaryKey) == 1: self.pk = self.data[dataset[self.component_name()].tableSchema.primaryKey[0]] - self.name = getattr(self.cldf, 'name', None) - self.description = getattr(self.cldf, 'description', None) + self.name: str = getattr(self.cldf, 'name', None) + self.description: str = getattr(self.cldf, 'description', None) def __repr__(self): - return '<{}.{} id="{}">'.format(self.__class__.__module__, self.__class__.__name__, self.id) + return f'<{self.__class__.__module__}.{self.__class__.__name__} id="{self.id}">' @classmethod - def component_name(cls) -> str: + def component_name(cls) -> str: # pylint: disable=C0116 return cls.__component__ or (cls.__name__ + 'Table') @property @@ -137,7 +139,8 @@ def component(self) -> str: return self.__class__.component_name() @property - def key(self) -> typing.Tuple[int, str, str]: + def key(self) -> tuple[int, str, str]: + """A key that is also unique across different Dataset instances.""" return id(self.dataset), self.__class__.__name__, self.id def __hash__(self): @@ -154,31 +157,32 @@ def _expand_uritemplate(self, attr, col): row as context. Thus, expansion is available as method on this row object. """ col = self.dataset[self.component, col] - variables = {k: v for k, v in vars(self.cldf).items()} + variables = dict(vars(self.cldf).items()) variables.update(self.data) if getattr(col, attr, None): return getattr(col, attr).expand(**variables) + return None # pragma: no cover - def aboutUrl(self, col='id') -> typing.Union[str, None]: + def aboutUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `aboutUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('aboutUrl', col) - def valueUrl(self, col='id') -> typing.Union[str, None]: + def valueUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `valueUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('valueUrl', col) - def propertyUrl(self, col='id') -> typing.Union[str, None]: + def propertyUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `propertyUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('propertyUrl', col) @functools.cached_property - def references(self) -> typing.Tuple[Reference]: + def references(self) -> tuple[Reference, ...]: """ `pycldf.Reference` instances associated with the object. 
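        A sketch of how ORM objects and their references are typically accessed (assuming
        ``ds`` is a :class:`pycldf.Dataset` whose LanguageTable rows cite sources):

        .. code-block:: python

            >>> for lang in ds.objects('LanguageTable'):
            ...     print(lang.id, lang.cldf.name)
            ...     for ref in lang.references:
            ...         print('  cited:', ref.source.id, ref.description)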
@@ -192,7 +196,7 @@ def references(self) -> typing.Tuple[Reference]: multi=True, ) - def related(self, relation: str) -> typing.Union[None, 'Object']: + def related(self, relation: str) -> Optional['Object']: """ The CLDF ontology specifies several "reference properties". This method returns the first related object specified by such a property. @@ -202,7 +206,7 @@ def related(self, relation: str) -> typing.Union[None, 'Object']: """ if relation in self._listvalued: raise ValueError( - '{} is list-valued, use `all_related` to retrieve related objects'.format(relation)) + f'{relation} is list-valued, use `all_related` to retrieve related objects') fk = getattr(self.cldf, relation, None) if fk: ref = self.dataset.get_foreign_key_reference(self.component_name(), relation) @@ -213,8 +217,9 @@ def related(self, relation: str) -> typing.Union[None, 'Object']: return self.dataset.get_object(TERMS[relation].references, fk, pk=True) raise NotImplementedError('pycldf does not support foreign key constraints ' 'referencing columns other than CLDF id or primary key.') + return None # pragma: no cover - def all_related(self, relation: str) -> typing.Union[DictTuple, list]: + def all_related(self, relation: str) -> Union[DictTuple, list]: """ CLDF reference properties can be list-valued. This method returns all related objects for such a property. @@ -229,57 +234,58 @@ def all_related(self, relation: str) -> typing.Union[DictTuple, list]: class _WithLanguageMixin: @property - def language(self): + def language(self) -> Object: # pylint: disable=C0116 return self.related('languageReference') @property - def languages(self): + def languages(self) -> Union[DictTuple, list]: # pylint: disable=C0116 return self.all_related('languageReference') class _WithParameterMixin: @functools.cached_property - def parameter(self): + def parameter(self) -> Object: # pylint: disable=C0116 return self.related('parameterReference') @property - def parameters(self): + def parameters(self) -> Union[DictTuple, list]: # pylint: disable=C0116 return self.all_related('parameterReference') -class Borrowing(Object): +class Borrowing(Object): # pylint: disable=C0115 @property - def targetForm(self): + def targetForm(self) -> Object: # pylint: disable=C0116,C0103 return self.related('targetFormReference') @property - def sourceForm(self): + def sourceForm(self) -> Object: # pylint: disable=C0116,C0103 return self.related('sourceFormReference') -class Code(Object, _WithParameterMixin): +class Code(Object, _WithParameterMixin): # pylint: disable=C0115 pass -class Cognateset(Object): +class Cognateset(Object): # pylint: disable=C0115 @property - def cognates(self): + def cognates(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('CognateTable') if v.cognateset == self) -class Cognate(Object): +class Cognate(Object): # pylint: disable=C0115 @property - def form(self): + def form(self): # pylint: disable=C0116 return self.related('formReference') @property - def cognateset(self): + def cognateset(self): # pylint: disable=C0116 return self.related('cognatesetReference') -class Contribution(Object): +class Contribution(Object): # pylint: disable=C0115 @property def sentences(self): + """Returns the ordered sentences of a text in a TextCorpus.""" res = [] if self.dataset.module == 'TextCorpus': # Return the list of lines, ordered by position. 
@@ -293,35 +299,38 @@ def sentences(self): return res -class Entry(Object, _WithLanguageMixin): +class Entry(Object, _WithLanguageMixin): # pylint: disable=C0115 @property - def senses(self): + def senses(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('SenseTable') if self in v.entries) -class Example(Object, _WithLanguageMixin): +class Example(Object, _WithLanguageMixin): # pylint: disable=C0115 @property - def metaLanguage(self): + def metaLanguage(self): # pylint: disable=C0116,C0103 return self.related('metaLanguageReference') @property - def igt(self): - return '{0}\n{1}\n{2}'.format( - self.cldf.primaryText, - tabulate([self.cldf.gloss], self.cldf.analyzedWord, tablefmt='plain'), - self.cldf.translatedText, - ) + def igt(self) -> str: + """The example in a plain text interlinear glossed representation.""" + aligned = tabulate([self.cldf.gloss], self.cldf.analyzedWord, tablefmt='plain') + return f'{self.cldf.primaryText}\n{aligned}\n{self.cldf.translatedText}' @property def text(self): """ - Examples in a TextCorpus are interpreted as lines of text. + Examples in a TextCorpus are interpreted as lines of a text, which in turn is the + module-specific interpretation of a CLDF contribution. """ if self.dataset.module == 'TextCorpus' and hasattr(self.cldf, 'contributionReference'): return self.related('contributionReference') + return None # pragma: no cover @property - def alternative_translations(self): + def alternative_translations(self) -> list['Example']: + """ + Returns alternative translations for the Example. + """ res = [] if hasattr(self.cldf, 'exampleReference'): # There's a self-referential foreign key. We assume this to link together full examples @@ -332,17 +341,17 @@ def alternative_translations(self): return res -class Form(Object, _WithLanguageMixin, _WithParameterMixin): +class Form(Object, _WithLanguageMixin, _WithParameterMixin): # pylint: disable=C0115 pass -class FunctionalEquivalentset(Object): +class FunctionalEquivalentset(Object): # pylint: disable=C0115 pass -class FunctionalEquivalent(Object): +class FunctionalEquivalent(Object): # pylint: disable=C0115 @property - def form(self): # pragma: no cover + def form(self): # pragma: no cover # pylint: disable=C0116 return self.related('formReference') @@ -362,15 +371,16 @@ class Language(Object): 'MultiPolygon' """ @property - def lonlat(self) -> typing.Union[None, typing.Tuple[decimal.Decimal]]: + def lonlat(self) -> Optional[tuple[decimal.Decimal, decimal.Decimal]]: """ :return: (longitude, latitude) pair if coordinates are defined, else `None`. """ if hasattr(self.cldf, 'latitude'): return (self.cldf.longitude, self.cldf.latitude) + return None # pragma: no cover @property - def as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]]: + def as_geojson_feature(self) -> Union[None, dict[str, Any]]: """ `dict` suitable for serialization as GeoJSON Feature object, with the point coordinate as geographic data. @@ -383,19 +393,21 @@ def as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]] "geometry": {"type": "Point", "coordinates": self.lonlat}, "properties": vars(self.cldf), }) + return None # pragma: no cover @functools.cached_property - def speaker_area(self) -> typing.Union[None, 'File']: + def speaker_area(self) -> Optional['File']: """ A `pycldf.media.File` object containing information about the speaker area of the language. 
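        Sketch of the geo-related conveniences on :class:`Language` (assuming ``ds`` has a
        LanguageTable with coordinates and a ``speakerArea`` column pointing to GeoJSON media):

        .. code-block:: python

            >>> lang = ds.objects('LanguageTable')[0]
            >>> lang.lonlat                            # (longitude, latitude) or None
            >>> lang.as_geojson_feature                # point geometry as GeoJSON Feature
            >>> lang.speaker_area_as_geojson_feature   # speaker area as GeoJSON Feature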
""" - from pycldf.media import File + from pycldf.media import File # pylint: disable=C0415 if getattr(self.cldf, 'speakerArea', None): return File.from_dataset(self.dataset, self.related('speakerArea')) + return None # pragma: no cover @functools.cached_property - def speaker_area_as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]]: + def speaker_area_as_geojson_feature(self) -> Optional[dict[str, Any]]: """ `dict` suitable for serialization as GeoJSON Feature object, with a speaker area Polygon or MultiPolygon as geographic data. @@ -411,13 +423,14 @@ def speaker_area_as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, else: assert res['type'] == 'Feature' return res + return None # pragma: no cover @property - def values(self): + def values(self) -> DictTuple: # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.languages) @property - def forms(self): + def forms(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.languages) def glottolog_languoid(self, glottolog_api): @@ -433,42 +446,50 @@ def glottolog_languoid(self, glottolog_api): return glottolog_api.languoid(self.cldf.glottocode) -class Media(Object): +class Media(Object): # pylint: disable=C0115 @property - def downloadUrl(self): + def downloadUrl(self): # pylint: disable=C0116,C0103 if hasattr(self.cldf, 'downloadUrl'): return self.cldf.downloadUrl return self.valueUrl() -class ParameterNetworkEdge(Object): +class ParameterNetworkEdge(Object): # pylint: disable=C0115 __component__ = 'ParameterNetwork' class Parameter(Object): + """ + The Parameter class provides support for interpreting a parameter's string values as typed + data and reading it accordingly. See `Value` below. 
+ """ @functools.cached_property - def columnSpec(self): - columnSpec = getattr(self.cldf, 'columnSpec', None) + def columnSpec(self) -> Optional[csvw.metadata.Column]: # pylint: disable=C0103 + """Turns a JSON column specification in a column value into a Column object.""" + columnSpec = getattr(self.cldf, 'columnSpec', None) # pylint: disable=C0103 if columnSpec: return csvw.metadata.Column.fromvalue(columnSpec) + return None @functools.cached_property - def datatype(self): + def datatype(self) -> Optional[csvw.metadata.Datatype]: + """Turns a JSON datatype description in a column value into a Datatype object.""" if 'datatype' in self.data \ and self.dataset['ParameterTable', 'datatype'].datatype.base == 'json': if self.data['datatype']: return csvw.metadata.Datatype.fromvalue(self.data['datatype']) + return None @property - def codes(self): + def codes(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('CodeTable') if v.parameter == self) @property - def values(self): + def values(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.parameters) @property - def forms(self): + def forms(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.parameters) def concepticon_conceptset(self, concepticon_api): @@ -484,17 +505,17 @@ def concepticon_conceptset(self, concepticon_api): return concepticon_api.conceptsets.get(self.cldf.concepticonReference) -class Sense(Object): +class Sense(Object): # pylint: disable=C0115 @property - def entry(self): + def entry(self): # pylint: disable=C0116 return self.related('entryReference') @property - def entries(self): + def entries(self): # pylint: disable=C0116 return self.all_related('entryReference') -class Tree(Object): +class Tree(Object): # pylint: disable=C0115 pass @@ -530,6 +551,10 @@ class Value(Object, _WithLanguageMixin, _WithParameterMixin): """ @property def typed_value(self): + """ + If a parameter includes information about the datatype of its values, this information is + used here to convert the value accordingly. + """ if self.parameter.columnSpec: return self.parameter.columnSpec.read(self.cldf.value) if self.parameter.datatype: @@ -537,9 +562,9 @@ def typed_value(self): return self.cldf.value @property - def code(self): + def code(self): # pylint: disable=C0116 return self.related('codeReference') @property - def examples(self): + def examples(self): # pylint: disable=C0116 return self.all_related('exampleReference') diff --git a/src/pycldf/schemautil.py b/src/pycldf/schemautil.py new file mode 100644 index 0000000..27e8330 --- /dev/null +++ b/src/pycldf/schemautil.py @@ -0,0 +1,52 @@ +""" +Functionality to create schema objects. +""" +from typing import Union + +from csvw.metadata import Column, Table +from clldutils import jsonlib + +from pycldf.terms import TERMS +from pycldf.util import MD_SUFFIX, pkg_path + +ColSpecType = Union[str, dict, Column] +TableSpecType = Union[str, dict, Table] +TableType = Union[str, Table] +ColType = Union[str, Column] + + +def make_column(spec: ColSpecType) -> Column: + """ + Create a `Column` instance from `spec`. + + .. 
code-block:: python + + >>> make_column('id').name + 'id' + >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name + 'ID' + >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base + 'boolean' + >>> type(make_column(make_column('id'))) + + """ + if isinstance(spec, str): + if spec in TERMS.by_uri: + return TERMS.by_uri[spec].to_column() + return Column(name=spec, datatype='string') + if isinstance(spec, dict): + return Column.fromvalue(spec) + if isinstance(spec, Column): + return spec + raise TypeError(spec) + + +def make_table(spec: TableSpecType) -> Table: + """Create a `Table` instance from `spec`.""" + if isinstance(spec, str): + return Table.fromvalue(jsonlib.load(pkg_path('components', f'{spec}{MD_SUFFIX}'))) + if isinstance(spec, dict): + return Table.fromvalue(spec) + if isinstance(spec, Table): + return spec + raise TypeError(spec) # pragma: no cover diff --git a/src/pycldf/sliceutil.py b/src/pycldf/sliceutil.py new file mode 100644 index 0000000..67c2d2a --- /dev/null +++ b/src/pycldf/sliceutil.py @@ -0,0 +1,55 @@ +""" +This module provides a flexible implementation of slicing sequences, based on Python's slices. + +In addition to Python's way of specifying slices as triples of integers (start, stop, step), we +allow specification as strings like '1' or '2:5', where the numbers are interpreted as **1-based** +indices, specifying **inclusive** boundaries. I.e. '2:5' is equivalent to `slice(1:5).` +""" +from typing import Union, TypeVar +import itertools +from collections.abc import Sequence, Iterable + +__all__ = ['multislice', 'multislice_with_split'] + +T = TypeVar('T') +SliceType = Union[str, tuple[int], tuple[int, int], tuple[int, int, int], slice] + + +def multislice(sliceable: Sequence[T], *slices: SliceType) -> Sequence[T]: + """ + .. code-block:: python + + >>> import string + >>> multislice(list(range(30)), '3:7', '9', (12, 18, 3)) + [2, 3, 4, 5, 6, 8, 12, 15] + >>> multislice(string.ascii_lowercase, '3:7', '9', (12, 18, 3)) + 'cdefgimp' + """ + res = type(sliceable)() + for sl in slices: + if isinstance(sl, str): + if ':' in sl: + assert sl.count(':') <= 2, f'String slice spec may only have two colons. {sl}' + sl = slice(*[int(s) - (1 if i == 0 else 0) for i, s in enumerate(sl.split(':'))]) + else: + sl = slice(*[int(sl) - 1, int(sl)]) + elif isinstance(sl, int): + sl = slice(sl, sl + 1) + elif isinstance(sl, (tuple, list)): + sl = slice(*sl) + else: + assert isinstance(sl, slice) + res += sliceable[sl] + return res + + +def multislice_with_split(sliceable: Sequence[T], slices: Iterable[SliceType]) -> list[T]: + """ + Resolves multislices and then applies splitting on whitespace to each item. + + .. code-block:: python + + >>> multislice_with_split(['a', 'b', 'c d', 'f', 'g'], [(2, 4)]) + ['c', 'd', 'f'] + """ + return list(itertools.chain(*[s.split() for s in multislice(sliceable, *slices)])) diff --git a/src/pycldf/sources.py b/src/pycldf/sources.py index 1d8ec7a..946b308 100644 --- a/src/pycldf/sources.py +++ b/src/pycldf/sources.py @@ -1,12 +1,16 @@ +""" +Functionality to handle BibTeX source data of Datasets. 
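A small usage sketch (the BibTeX file name, citation key and page numbers are illustrative):

.. code-block:: python

    >>> from pycldf.sources import Sources
    >>> sources = Sources.from_file('sources.bib')
    >>> Sources.parse('Meier2005[12-25]')
    ('Meier2005', '12-25')
    >>> print(sources['Meier2005'].refkey(year_brackets='square'))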
+""" import re import types -import typing +from typing import Optional, Union, Literal import pathlib import zipfile import tempfile import collections from urllib.error import HTTPError from urllib.request import urlopen, urlretrieve +from collections.abc import Generator, Iterable, KeysView from csvw.metadata import is_url from simplepybtex import database @@ -14,7 +18,8 @@ from clldutils.source import Source as BaseSource from clldutils.source import ID_PATTERN -from pycldf.util import update_url +from pycldf.urlutil import update_url +from pycldf.fileutil import PathType __all__ = ['Source', 'Sources', 'Reference'] @@ -22,13 +27,14 @@ class Writer(BaseWriter): + """We overwrite pybtex's writer to ensure data is wrapped in curly braces.""" def quote(self, s): self.check_braces(s) return '{%s}' % s def _encode(self, text): # - # FIXME: We overwrite a private method here! + # FIXME: We overwrite a private method here! pylint: disable=fixme # return text @@ -38,7 +44,8 @@ class Source(BaseSource): A bibliograhical record, specifying a source for some data in a CLDF dataset. """ @property - def entry(self): + def entry(self) -> database.Entry: + """Converts Source to a pybtex Entry.""" persons = collections.OrderedDict([ ('author', list(self.persons(self.get('author', '')))), ('editor', list(self.persons(self.get('editor', '')))), @@ -53,10 +60,10 @@ def __str__(self): return self.text() def __repr__(self): - return '<%s %s>' % (self.__class__.__name__, self.id) + return f'<{self.__class__.__name__} {self.id}>' @classmethod - def from_entry(cls, key, entry, **_kw): + def from_entry(cls, key: str, entry: database.Entry, **_kw): """ Create a `cls` instance from a `simplepybtex` entry object. @@ -65,15 +72,16 @@ def from_entry(cls, key, entry, **_kw): :param _kw: Non-bib-metadata keywords to be passed for `cls` instantiation :return: `cls` instance """ - _kw.update({k: v for k, v in entry.fields.items()}) + _kw.update(entry.fields.items()) _kw.setdefault('_check_id', False) for role in entry.persons: if entry.persons[role]: - _kw[role] = ' and '.join('%s' % p for p in entry.persons[role]) + _kw[role] = ' and '.join(f'{p}' for p in entry.persons[role]) return cls(entry.type, key, **_kw) @staticmethod - def persons(s): + def persons(s: str) -> Generator[database.Person, None, None]: + """Yields persons encoded in an author names string.""" for name in re.split(r'\s+&\s+|\s+and\s+', s.strip()): if name: parts = name.split(',') @@ -83,26 +91,31 @@ def persons(s): else: yield database.Person(name) - def refkey(self, year_brackets='round'): - brackets = {None: ('', ''), 'round': ('(', ')'), 'square': ('[', ']'), 'curly': ('{', '}')} + def refkey(self, year_brackets: Union[None, Literal["round", "square", "curly"]] = 'round'): + """Compute an author-year type reference key for the item.""" + brackets = { + None: ('', ''), + 'round': ('(', ')'), + 'square': ('[', ']'), + 'curly': ('{', '}')}.get(year_brackets) persons = self.entry.persons.get('author') or self.entry.persons.get('editor', []) - s = ' '.join(persons[0].prelast_names + persons[0].last_names) if persons else 'n.a.' + names = ' '.join(persons[0].prelast_names + persons[0].last_names) if persons else 'n.a.' if len(persons) == 2: - s += ' and {}'.format(' '.join(persons[1].last_names)) + names += f" and {' '.join(persons[1].last_names)}" elif len(persons) > 2: - s += ' et al.' 
- return s.replace('{', '').replace('}', '') + ' {}{}{}'.format( - brackets[year_brackets][0], self.get('year', 'n.d.'), brackets[year_brackets][1]) + names += ' et al.' + names = names.replace('{', '').replace('}', '') + return f"{names} {brackets[0]}{self.get('year', 'n.d.')}{brackets[1]}" -class Reference(object): +class Reference: """ A reference connects a piece of data with a `Source`, typically adding some citation context \ often page numbers, or similar. """ - def __init__(self, source: Source, desc: typing.Union[str, None]): + def __init__(self, source: Source, desc: Optional[str]): if desc and ('[' in desc or ']' in desc or ';' in desc): - raise ValueError('invalid ref description: %s' % desc) + raise ValueError(f'invalid ref description: {desc}') self.source = source self.fields = types.SimpleNamespace(**self.source) if isinstance(self.source, dict) else {} self.description = desc @@ -115,14 +128,14 @@ def __str__(self): """ res = self.source.id if hasattr(self.source, 'id') else self.source if self.description: - res += '[%s]' % self.description + res += f'[{self.description}]' return res def __repr__(self): - return '<%s %s>' % (self.__class__.__name__, self) + return f'<{self.__class__.__name__} {self}>' -class Sources(object): +class Sources: """ A `dict` like container for all sources linked to data in a CLDF dataset. """ @@ -130,16 +143,17 @@ def __init__(self): self._bibdata = database.BibliographyData() @classmethod - def from_file(cls, fname): + def from_file(cls, fname: PathType) -> 'Sources': + """Instantiate an instance from the data in a BibTeX file.""" zipped = False res = cls() - if not is_url(fname): + if not is_url(str(fname)): fname = pathlib.Path(fname) if not fname.exists(): - fname = fname.parent / '{}.zip'.format(fname.name) + fname = fname.parent / f'{fname.name}.zip' zipped = True if fname.exists(): - assert fname.is_file(), 'Bibfile {} must be a file!'.format(fname) + assert fname.is_file(), f'Bibfile {fname} must be a file!' res.read(fname, zipped=zipped) else: res.read(fname) @@ -150,34 +164,34 @@ def __bool__(self): __nonzero__ = __bool__ - def keys(self): + def keys(self) -> KeysView[str]: # pylint: disable=C0116 return self._bibdata.entries.keys() - def items(self): + def items(self) -> Generator[Source, None, None]: # pylint: disable=C0116 for key, entry in self._bibdata.entries.items(): yield Source.from_entry(key, entry) def __iter__(self): return self.items() - def __len__(self): + def __len__(self) -> int: return len(self._bibdata.entries) - def __getitem__(self, item): + def __getitem__(self, item: str) -> Optional[Source]: try: return Source.from_entry(item, self._bibdata.entries[item]) - except KeyError: - raise ValueError('missing citekey: %s' % item) + except KeyError as e: + raise ValueError(f'missing citekey: {item}') from e - def __contains__(self, item): + def __contains__(self, item: str) -> bool: return item in self._bibdata.entries @staticmethod - def format_refs(*refs): - return ['%s' % ref for ref in refs] + def format_refs(*refs) -> list[str]: # pylint: disable=C0116 + return [f'{ref}' for ref in refs] @staticmethod - def parse(ref: str) -> typing.Tuple[str, str]: + def parse(ref: str) -> tuple[str, str]: """ Parse the string representation of a reference into source ID and context. @@ -191,14 +205,15 @@ def parse(ref: str) -> typing.Tuple[str, str]: pages = pages[:-1].strip() return sid, pages - def validate(self, refs): + def validate(self, refs: Union[str, list[str]]) -> None: + """Make sure refs are valid. 
If not, raises Exceptions.""" if not isinstance(refs, str) and any(r is None for r in refs): raise ValueError('empty reference in ref list (possibly caused by trailing separator)') for sid, _ in map(self.parse, [refs] if isinstance(refs, str) else refs): if sid not in self.keys(): - raise ValueError('missing source key: {0}'.format(sid)) + raise ValueError(f'missing source key: {sid}') - def expand_refs(self, refs: typing.Iterable[str], **kw) -> typing.Iterable[Reference]: + def expand_refs(self, refs: Iterable[str], **kw) -> Iterable[Reference]: """ Turn a list of string references into proper :class:`Reference` instances, looking up \ sources in `self`. @@ -217,7 +232,7 @@ def expand_refs(self, refs: typing.Iterable[str], **kw) -> typing.Iterable[Refer self._add_entries(Source('misc', sid, glottolog_id=sid), **kw) yield Reference(self[sid], pages) - def _add_entries(self, data, **kw): + def _add_entries(self, data: Union[Source, database.BibliographyData], **kw) -> None: if isinstance(data, Source): entries = [(data.id, data.entry)] elif hasattr(data, 'entries'): @@ -232,17 +247,20 @@ def _add_entries(self, data, **kw): for key, entry in entries: if kw.get('_check_id', False) and not ID_PATTERN.match(key): - raise ValueError('invalid source ID: %s' % key) + raise ValueError(f'invalid source ID: {key}') if key not in self._bibdata.entries: try: self._bibdata.add_entry(key, entry) except database.BibliographyDataError as e: # pragma: no cover - raise ValueError('%s' % e) + raise ValueError(f'{e}') from e - def read(self, fname, zipped=False, **kw): - if is_url(fname): + def read(self, fname: PathType, zipped=False, **kw): + """Read sources from a BibTex file (possibly specified via URL).""" + if is_url(str(fname)): + fname = str(fname) try: - content = urlopen(fname).read().decode('utf-8') + with urlopen(fname) as f: + content = f.read().decode('utf-8') except HTTPError as e: if '404' in str(e): fname = update_url( @@ -254,14 +272,15 @@ def read(self, fname, zipped=False, **kw): content = zf.read(zf.namelist()[0]).decode('utf8') else: if zipped: - with zipfile.ZipFile(fname, 'r') as zf: + with zipfile.ZipFile(str(fname), 'r') as zf: content = zf.read(zf.namelist()[0]).decode('utf8') else: content = pathlib.Path(fname).read_text(encoding='utf-8') self._add_entries( database.parse_string(content, bib_format='bibtex'), **kw) - def write(self, fname, ids=None, zipped=False, **kw): + def write(self, fname: PathType, ids=None, zipped=False, **_) -> Optional[pathlib.Path]: + """Write sources to a file (if there are any).""" if ids: bibdata = database.BibliographyData() for key, entry in self._bibdata.entries.items(): @@ -269,19 +288,21 @@ def write(self, fname, ids=None, zipped=False, **kw): bibdata.add_entry(key, entry) else: bibdata = self._bibdata + fname = pathlib.Path(fname) if bibdata.entries: - with pathlib.Path(fname).open('w', encoding='utf8') as fp: + with fname.open('w', encoding='utf8') as fp: Writer().write_stream(bibdata, fp) if zipped: with zipfile.ZipFile( - fname.parent / '{}.zip'.format(fname.name), + fname.parent / f'{fname.name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf: zf.write(fname, fname.name) fname.unlink() return fname + return None - def add(self, *entries: typing.Union[str, Source], **kw): + def add(self, *entries: Union[str, Source], **kw) -> None: """ Add a source, either specified as BibTeX string or as :class:`Source`. 
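        A sketch (the BibTeX record and keys below are made up):

        .. code-block:: python

            >>> from pycldf.sources import Source, Sources
            >>> srcs = Sources()
            >>> srcs.add('@book{Doe2020, title={A Grammar}, author={Doe, Jane}, year={2020}}')
            >>> srcs.add(Source('misc', 'Smith2021', title='Field notes'))
            >>> 'Doe2020' in srcs
            True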
""" diff --git a/src/pycldf/stats.py b/src/pycldf/stats.py new file mode 100644 index 0000000..5f40839 --- /dev/null +++ b/src/pycldf/stats.py @@ -0,0 +1,40 @@ +""" +Functionality to compute summary statistics for a Dataset. +""" +import typing +import dataclasses +from collections.abc import Generator + +from pycldf.terms import TERMS + +if typing.TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['get_table_stats'] + + +def get_table_stats(ds: 'Dataset', exact: bool = False) -> list[tuple[str, str, int]]: + """Return a list of table statistics.""" + return [dataclasses.astuple(stats) for stats in _iter_stats(ds, exact)] + + +@dataclasses.dataclass(frozen=True) +class TableStats: + """A bag of attrs""" + fname: str + component: str + rowcount: int + + +def _iter_stats(ds: 'Dataset', exact: bool = False) -> Generator[TableStats, None, None]: + for table in ds.tables: + dctype = table.common_props.get('dc:conformsTo') + if dctype and '#' in dctype and dctype.split('#')[1] in TERMS: + dctype = TERMS[dctype.split('#')[1]].csvw_prop('name') + yield TableStats( + table.url.string, + dctype or '', + sum(1 for _ in table) if (exact or 'dc:extent' not in table.common_props) + else int(table.common_props.get('dc:extent'))) + if ds.sources: + yield TableStats(ds.bibname, 'Sources', len(ds.sources)) diff --git a/src/pycldf/terms.py b/src/pycldf/terms.py index 83c914a..18b5a85 100644 --- a/src/pycldf/terms.py +++ b/src/pycldf/terms.py @@ -1,16 +1,27 @@ +""" +Functionality to access the metadata about CLDF schema objects encoded in the ontology. +""" import re import json import types +import pathlib import warnings +import dataclasses import urllib.parse +from typing import Optional, Union, Callable, Any, TYPE_CHECKING, Literal, get_args +from collections.abc import Container from xml.etree import ElementTree -import attr from csvw.metadata import Column +from clldutils import jsonlib from pycldf.util import pkg_path +from pycldf.fileutil import PathType -__all__ = ['term_uri', 'TERMS', 'get_column_names'] +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['term_uri', 'TERMS', 'get_column_names', 'sniff'] URL = 'http://cldf.clld.org/v1.0/terms.rdf' RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' @@ -18,68 +29,89 @@ CSVW = 'http://www.w3.org/ns/csvw#' DC = 'http://purl.org/dc/terms/' - -def term_uri(name, terms=None, ns=URL): - if isinstance(name, Term): - return name.uri - if not name.startswith(ns): - sep = '' if ns.endswith('#') else '#' - name = sep.join([ns, name]) - if not terms or name in terms: - return name - return None +TermType = Literal['Class', 'Property'] +CardinalityType = Literal['singlevalued', 'multivalued'] -def qname(ns, lname): +def qname(ns: str, lname: str) -> str: + """Return a qualified name in ElementTree notation.""" return '{%s}%s' % (ns, lname) -def _get(e, subelementns, subelementlname, attrns=None, attrlname=None, converter=None): +@dataclasses.dataclass +class NameSpec: # pylint: disable=C0115 + ns: str + lname: str + + @property + def qname(self): # pylint: disable=C0116 + return qname(self.ns, self.lname) + + +def _get( + e: ElementTree.Element, + subelement: NameSpec, + attrib: Optional[NameSpec] = None, + converter: Optional[Callable[[str], Any]] = None, +): """ :return: Text content or attribute value of a subelement of e. 
""" res = None - subelement = e.find(qname(subelementns, subelementlname)) + subelement = e.find(subelement.qname) if subelement is not None: - if not attrlname: + if not attrib: res = subelement.text else: - res = subelement.attrib[qname(attrns, attrlname)] + res = subelement.attrib[attrib.qname] if converter and res: res = converter(res) return res -@attr.s -class Term(object): - name = attr.ib() - type = attr.ib(validator=attr.validators.in_(['Class', 'Property'])) - element = attr.ib() - references = attr.ib(default=None) - subtype = attr.ib(default=None) - version = attr.ib(default=None, validator=attr.validators.matches_re(r'v[0-9]+(\.[0-9]+)+')) - cardinality = attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.in_(['singlevalued', 'multivalued']))) +@dataclasses.dataclass +class Term: + """A Term is an object described in the CLDF Ontology.""" + name: str + type: TermType + element: ElementTree.Element + references: Optional[str] = None + subtype: Optional[str] = None + version: Optional[str] = None + cardinality: Optional[CardinalityType] = None + + def __post_init__(self): + assert self.type in get_args(TermType) + if self.version: + assert re.fullmatch(r'v[0-9]+(\.[0-9]+)+', self.version) + if self.cardinality: + assert self.cardinality in get_args(CardinalityType) @property - def uri(self): - return '{0}#{1}'.format(URL, self.name) + def uri(self) -> str: + """The Term URI.""" + return f'{URL}#{self.name}' @classmethod - def from_element(cls, e): - subClassOf = e.find(qname(RDFS, 'subClassOf')) - kw = dict( - name=e.attrib[qname(RDF, 'about')].split('#')[1], - version=_get( - e, DC, 'hasVersion', RDF, 'resource', + def from_element(cls, e: ElementTree.Element) -> 'Term': + """Instantiate a Term from an XML element parsed from the ontology.""" + subClassOf = e.find(qname(RDFS, 'subClassOf')) # pylint: disable=invalid-name + kw = { + 'name': e.attrib[qname(RDF, 'about')].split('#')[1], + 'version': _get( + e, + NameSpec(ns=DC, lname='hasVersion'), + attrib=NameSpec(ns=RDF, lname='resource'), converter=lambda s: 'v' + s.split('/v')[1].replace('/', '')) or 'v1.0', - type=e.tag.split('}')[1], - element=e, - cardinality=_get(e, DC, 'extent'), - references=_get( - e, DC, 'references', RDF, 'resource', converter=lambda s: s.split('#')[1]), - ) + 'type': e.tag.split('}')[1], + 'element': e, + 'cardinality': _get(e, NameSpec(ns=DC, lname='extent')), + 'references': _get( + e, + NameSpec(ns=DC, lname='references'), + attrib=NameSpec(ns=RDF, lname='resource'), + converter=lambda s: s.split('#')[1]), + } if kw['type'] == 'Class': kw['subtype'] = 'module' \ if subClassOf is not None \ @@ -87,10 +119,12 @@ def from_element(cls, e): 'http://www.w3.org/ns/dcat#Distribution' else 'component' return cls(**kw) - def csvw_prop(self, lname): - return _get(self.element, CSVW, lname, converter=lambda s: json.loads(s)) + def csvw_prop(self, lname: str) -> Any: + """Returns the JSON value of a property in the CSVW namespace.""" + return _get(self.element, NameSpec(ns=CSVW, lname=lname), converter=json.loads) - def to_column(self): + def to_column(self) -> Column: + """Returns a csvw Column instance configured according to the term spec.""" col = Column( name=self.csvw_prop('name') or self.element.find(qname(RDFS, 'label')).text, propertyUrl=self.element.attrib[qname(RDF, 'about')], @@ -101,7 +135,10 @@ def to_column(self): setattr(col, k, v) return col - def comment(self, one_line=False): + def comment(self, one_line=False) -> str: + """ + Parse a text comment from the XML 
element of the ontology. + """ c = self.element.find("{http://www.w3.org/2000/01/rdf-schema#}comment") try: xml = ElementTree.tostring(c, default_namespace='http://www.w3.org/1999/xhtml') @@ -116,17 +153,24 @@ def comment(self, one_line=False): return re.sub(r'\s+', ' ', res.replace('\n', ' ')) if one_line else res +TermDict = dict[str, Term] + + class Terms(dict): - def __init__(self, path=None): + """ + A dict of `Term`s keyed by local names. + """ + def __init__(self, path: Optional[PathType] = None): self._path = path or pkg_path('terms.rdf') r = ElementTree.parse(str(self._path)).getroot() terms = [Term.from_element(e) for e in r.findall(qname(RDF, 'Property'))] for e in r.findall(qname(RDFS, 'Class')): terms.append(Term.from_element(e)) dict.__init__(self, {t.name: t for t in terms}) - self.by_uri = {t.uri: t for t in terms} + self.by_uri: TermDict = {t.uri: t for t in terms} - def is_cldf_uri(self, uri): + def is_cldf_uri(self, uri: str) -> bool: + """Whether the given URL is a CLDF Ontology term URI.""" if uri and urllib.parse.urlparse(uri).netloc == 'cldf.clld.org': if uri not in self.by_uri: warnings.warn('If pycldf does not recognize valid CLDF URIs, You may be ' @@ -137,44 +181,103 @@ def is_cldf_uri(self, uri): return False @property - def properties(self): + def properties(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.type == 'Property'} @property - def classes(self): + def classes(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.type == 'Class'} @property - def modules(self): + def modules(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.subtype == 'module'} @property - def components(self): + def components(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.subtype == 'component'} +def term_uri(name: Union[Term, str], terms: Container[str] = None, ns: str = URL) -> Optional[str]: + """ + Returns a full term URI associated with `name`. + + If `terms` are provided, we make sure the URI is contained in `terms`. + """ + if isinstance(name, Term): + return name.uri + if not name.startswith(ns): # So this may be a local name, i.e. the fragment of a term URI. + sep = '' if ns.endswith('#') else '#' + name = sep.join([ns, name]) + if not terms or name in terms: + return name + return None + + TERMS = Terms() -def get_column_names(dataset, use_component_names=False, with_multiplicity=False): +def get_column_names( + dataset: 'Dataset', + use_component_names: bool = False, + with_multiplicity: bool = False, +) -> types.SimpleNamespace: + """ + Returns an object allowing programmatic access to the column names used for ontology terms + in a specific dataset. + + .. code-block:: python + + >>> from pycldf import Dataset + >>> from pycldf.terms import get_column_names + >>> ds = Dataset.from_metadata('tests/data/ds1.csv-metadata.json') + >>> res = get_column_names(ds, use_component_names=True) + >>> res.ValueTable.languageReference + 'Language_ID' + """ comp_names = { k: k if use_component_names else k.replace('Table', '').lower() + 's' for k in TERMS.components} + # Seed the result object with component names as attributes and None as value. 
name_map = types.SimpleNamespace(**{k: None for k in comp_names.values()}) for term, attr_ in comp_names.items(): - try: - table = dataset[term] + table = dataset.get(term) + if table: props = {} - for k in TERMS.properties: - try: - col = dataset[table, k] + for k in TERMS.properties: # Loop through properties in the ontology. + col = dataset.get((table, k)) + if col: if with_multiplicity: props[k] = (col.name, bool(col.separator)) else: props[k] = col.name - except KeyError: + else: props[k] = None setattr(name_map, attr_, types.SimpleNamespace(**props)) - except KeyError: - pass return name_map + + +def sniff(p: pathlib.Path) -> bool: + """ + Determine whether a file contains CLDF metadata. + + :param p: `pathlib.Path` object for an existing file. + :return: `True` if the file contains CLDF metadata, `False` otherwise. + """ + if not p.is_file(): # pragma: no cover + return False + try: + with p.open('rb') as fp: + c = fp.read(10) + try: + c = c.decode('utf8').strip() + except UnicodeDecodeError: + return False + if not c.startswith('{'): + return False + except (FileNotFoundError, OSError): # pragma: no cover + return False + try: + d = jsonlib.load(p) + except json.decoder.JSONDecodeError: + return False + return d.get('dc:conformsTo', '').startswith(URL) diff --git a/src/pycldf/trees.py b/src/pycldf/trees.py index 3a81c76..b939b29 100644 --- a/src/pycldf/trees.py +++ b/src/pycldf/trees.py @@ -21,17 +21,20 @@ ├─l3 └─l4 """ -import typing -import logging +from typing import TYPE_CHECKING, Optional import pathlib +from collections.abc import Generator -from clldutils.misc import log_or_raise from commonnexus import Nexus import newick +from csvw.metadata import Table, Column -import pycldf from pycldf.media import MediaTable, File +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover + from pycldf.validators import DatasetValidator # pragma: no cover __all__ = ['Tree', 'TreeTable'] @@ -40,17 +43,17 @@ class Tree: """ Represents a tree object as specified in a row of `TreeTable`. """ - def __init__(self, trees: 'TreeTable', row: dict, file: File): - self.row = row - self.id = row[trees.cols['id'].name] - self.name = row[trees.cols['name'].name] - self.file = file + def __init__(self, trees: 'TreeTable', row: 'RowType', file: File): + self.row: 'RowType' = row + self.id: str = row[trees.cols['id'].name] + self.name: str = row[trees.cols['name'].name] + self.file: File = file for prop in ['description', 'treeType', 'treeIsRooted', 'treeBranchLengthUnit']: attrib = ''.join('_' + c.lower() if c.isupper() else c for c in prop) setattr(self, attrib, row.get(trees.cols[prop].name) if trees.cols[prop] else None) self.trees = trees - def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str: + def newick_string(self, d: Optional[pathlib.Path] = None) -> str: """ Retrieve the Newick representation of the tree from the associated tree file. @@ -58,21 +61,19 @@ def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str: :meth:`pycldf.media.File.save`. :return: Newick representation of the associated tree. 
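        Sketch of typical use (assuming ``ds`` is a :class:`pycldf.Dataset` with a TreeTable
        and a MediaTable providing Newick or NEXUS tree files):

        .. code-block:: python

            >>> from pycldf.trees import TreeTable
            >>> for tree in TreeTable(ds):
            ...     print(tree.name)
            ...     print(tree.newick().ascii_art())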
""" - if self.file.id not in self.trees._parsed_files: + if self.file.id not in self.trees.parsed_files: content = self.file.read(d=d) if self.file.mimetype == 'text/x-nh': - self.trees._parsed_files[self.file.id] = { + self.trees.parsed_files[self.file.id] = { # pylint: disable=protected-access str(index): nwk for index, nwk in enumerate( [t.strip() for t in content.split(';') if t.strip()], start=1)} else: - self.trees._parsed_files[self.file.id] = { + self.trees.parsed_files[self.file.id] = { # pylint: disable=protected-access tree.name: tree.newick_string for tree in Nexus(content).TREES.trees} - return self.trees._parsed_files[self.file.id][self.name] + return self.trees.parsed_files[self.file.id][self.name] # pylint: disable=protected-access - def newick(self, - d: typing.Optional[pathlib.Path] = None, - strip_comments: bool = False) -> newick.Node: + def newick(self, d: Optional[pathlib.Path] = None, strip_comments: bool = False) -> newick.Node: """ Retrieve a `newick.Node` instance for the tree from the associated tree file. @@ -85,47 +86,44 @@ def newick(self, return newick.loads(self.newick_string(d=d), strip_comments=strip_comments)[0] -class TreeTable(pycldf.ComponentWithValidation): +class TreeTable: """ Container class for a `Dataset`'s TreeTable. """ - def __init__(self, ds: pycldf.Dataset): - super().__init__(ds) - self.media = MediaTable(ds) - self.media_rows = {row[self.media.id_col.name]: row for row in ds['MediaTable']} - self.cols = { + def __init__(self, ds: 'Dataset'): + self.ds: 'Dataset' = ds + self.component: str = self.__class__.__name__ + self.table: Table = ds[self.component] + self.media: MediaTable = MediaTable(ds) + self.media_rows: dict[str, 'RowType'] = { + row[self.media.id_col.name]: row for row in ds['MediaTable']} + self.cols: dict[str, Optional[Column]] = { prop: self.ds.get((self.table, prop)) for prop in [ 'id', 'name', 'description', 'mediaReference', 'treeIsRooted', 'treeType', 'treeBranchLengthUnit']} # Since reading and parsing tree files is expensive, we cache them. - self._parsed_files = {} + self.parsed_files: dict[str, dict[str, str]] = {} - def __iter__(self) -> typing.Generator[Tree, None, None]: + def __iter__(self) -> Generator[Tree, None, None]: for row in self.table: yield Tree( self, row, File(self.media, self.media_rows[row[self.cols['mediaReference'].name]])) - def validate(self, - success: bool = True, - log: logging.Logger = None) -> bool: + def validate(self, validator: 'DatasetValidator'): + """ + Makes sure Newick representations of trees are available and only reference valid languages. + """ lids = {r['id'] for r in self.ds.iter_rows('LanguageTable', 'id')} for tree in self: try: nwk = tree.newick() except KeyError: - log_or_raise( - 'No newick tree found for name "{}"'.format(tree.name), - log=log) - success = False + validator.fail(f'No newick tree found for name "{tree.name}"') nwk = None if nwk: for node in nwk.walk(): if node.name and (node.name not in lids): - log_or_raise( - 'Newick node label "{}" is not a LanguageTable ID'.format(node.name), - log=log) - success = False - return success + validator.fail(f'Newick node label "{node.name}" is not a LanguageTable ID') diff --git a/src/pycldf/urlutil.py b/src/pycldf/urlutil.py new file mode 100644 index 0000000..66715f4 --- /dev/null +++ b/src/pycldf/urlutil.py @@ -0,0 +1,37 @@ +""" +Functionality to manipulate URLs. 
+""" +from typing import Callable, Union +import urllib.parse + +__all__ = ['update_url', 'sanitize_url', 'url_without_fragment'] + + +def update_url( + url: Union[str, urllib.parse.ParseResult], + updater: Callable[[urllib.parse.ParseResult], tuple[str, str, str, str, str]], +) -> Union[str, None]: + """Generic update function for URLs.""" + if url is None: + return None + if isinstance(url, str): + url = urllib.parse.urlparse(url) + return urllib.parse.urlunsplit(updater(url)) or None + + +def sanitize_url(url: str) -> str: + """ + Removes auth credentials from a URL. + """ + def fix(u): + host = u.hostname + if u.port: + host += f':{u.port}' + return (u.scheme, host, u.path, u.query, u.fragment) + + return update_url(url, fix) + + +def url_without_fragment(url: Union[str, urllib.parse.ParseResult]) -> str: + """Removes fragment from URL.""" + return update_url(url, lambda u: (u.scheme, u.hostname, u.path, u.query, '')) diff --git a/src/pycldf/util.py b/src/pycldf/util.py index c15626b..e09536f 100644 --- a/src/pycldf/util.py +++ b/src/pycldf/util.py @@ -1,80 +1,66 @@ -import re -import html -import math -import string -import typing +""" +The mixed bag of utility functions and classes of the pycldf package ... +""" +import shutil +from typing import Optional, TYPE_CHECKING, Any, Union import pathlib -import itertools import collections import urllib.parse +import urllib.request +from collections.abc import Generator -from clldutils.misc import slug -import pycldf +from csvw.metadata import is_url, Link, Column, Table, Schema, URITemplate +from clldutils.path import git_describe -__all__ = [ - 'pkg_path', 'multislice', 'resolve_slices', 'DictTuple', 'metadata2markdown', 'qname2url', - 'sanitize_url', 'update_url', 'iter_uritemplates', 'url_without_fragment', - 'splitfile', 'catfile'] +from pycldf.fileutil import PathType +from pycldf.urlutil import sanitize_url +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover -def splitfile(p, chunksize: int, total: typing.Optional[int] = None) -> typing.List[pathlib.Path]: - """ - :param p: Path of the file to split. - :param chunksize: The maximal size of the chunks the file will be split into. - :param total: The size of the input file. - :return: The list of paths of files that the input has been split into. - """ - total = total or p.stat().st_size - if total <= chunksize: # Nothing to do. - return [p] - nchunks = math.ceil(total / chunksize) - suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3 - suffixes = [ - ''.join(t) for t in - itertools.combinations_with_replacement(string.ascii_lowercase, suffix_length)] - - res = [] - with p.open('rb') as f: - chunk = f.read(chunksize) - while chunk: - pp = p.parent.joinpath('{}.{}'.format(p.name, suffixes.pop(0))) - pp.write_bytes(chunk) - res.append(pp) - chunk = f.read(chunksize) # read the next chunk +__all__ = [ + 'pkg_path', 'DictTuple', 'qname2url', 'iter_uritemplates', 'MD_SUFFIX', 'GitRepository'] - p.unlink() - return res +MD_SUFFIX = '-metadata.json' -def catfile(p: pathlib.Path) -> bool: +class GitRepository: # pylint: disable=too-few-public-methods """ - Restore a file that has been split into chunks. - - We determine if a file has been split by looking for files in the parent directory with suffixes - as created by `splitfile`. + CLDF datasets are often created from data curated in git repositories. If this is the case, we + exploit this to provide better provenance information in the dataset's metadata. """ - if p.exists(): # Nothing to do. 
- return False - # Check, whether the file has been split. - suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name} - if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes): - # ok, let's concatenate the files: - with p.open('wb') as f: - for suffix in sorted(suffixes): - if re.fullmatch(r'\.[a-z]{2,3}', suffix): - f.write(suffixes[suffix].read_bytes()) - suffixes[suffix].unlink() - return True - return False # pragma: no cover - + def __init__( + self, url: str, clone: Optional[PathType] = None, version: Optional[str] = None, **dc): + # We remove credentials from the URL immediately to make sure this isn't leaked into + # CLDF metadata. Such credentials might be present in URLs read via gitpython from + # remotes. + self.url = sanitize_url(url) + self.clone = clone + self.version = version + self.dc = dc + + def json_ld(self) -> collections.OrderedDict[str, Any]: + """The repository described in JSON-LD.""" + res = collections.OrderedDict([ + ('rdf:about', self.url), + ('rdf:type', 'prov:Entity'), + ]) + if self.version: + res['dc:created'] = self.version + elif self.clone: + res['dc:created'] = git_describe(self.clone) + res.update({f'dc:{k}': self.dc[k] for k in sorted(self.dc)}) + return res -def url_without_fragment(url: typing.Union[str, urllib.parse.ParseResult]) -> str: - if isinstance(url, str): - url = urllib.parse.urlparse(url) - return urllib.parse.urlunparse(list(url[:5]) + ['']) +def iter_uritemplates( + table: Table) -> Generator[tuple[Union[Table, Schema, Column], str, URITemplate]]: + """ + Generator of URITemplates specified in a table. -def iter_uritemplates(table): + Since URITemplates use column names as template variables, it is important to keep these in + sync with the table schema, e.g. in case of renaming columns. + """ props = ['aboutUrl', 'valueUrl'] for obj in [table, table.tableSchema] + table.tableSchema.columns: for prop in props: @@ -83,52 +69,9 @@ def iter_uritemplates(table): yield obj, prop, tmpl -def sanitize_url(url: str) -> str: - """ - Removes auth credentials from a URL. - """ - def fix(u): - host = u.hostname - if u.port: - host += ':{}'.format(u.port) - return (u.scheme, host, u.path, u.query, u.fragment) - - return update_url(url, fix) - - -def update_url(url: str, updater: typing.Callable[[urllib.parse.ParseResult], tuple]) -> str: - return urllib.parse.urlunsplit(updater(urllib.parse.urlparse(url))) or None - - -def pkg_path(*comps): - return pathlib.Path(pycldf.__file__).resolve().parent.joinpath(*comps) - - -def multislice(sliceable, *slices): - res = type(sliceable)() - for sl in slices: - if isinstance(sl, str): - if ':' in sl: - sl = [int(s) - (1 if i == 0 else 0) for i, s in enumerate(sl.split(':'))] - else: - sl = [int(sl) - 1, int(sl)] - res += sliceable[slice(*sl)] - return res - - -def resolve_slices(row, ds, slice_spec, target_spec, fk, target_row=None): - # 1. Determine the slice column: - slices = ds[slice_spec] - - # 2. Determine the to-be-sliced column: - morphemes = ds[target_spec] - - # 3. Retrieve the matching row in the target table: - target_row = target_row or ds.get_row(target_spec[0], row[fk]) - - # 4. 
Slice the segments
-    return list(itertools.chain(*[
-        s.split() for s in multislice(target_row[morphemes.name], *row[slices.name])]))
+def pkg_path(*comps: str) -> pathlib.Path:
+    """Returns a path within the pycldf package."""
+    return pathlib.Path(__file__).resolve().parent.joinpath(*comps)
 
 
 class DictTuple(tuple):
@@ -142,7 +85,7 @@ class DictTuple(tuple):
     def __new__(cls, items, **kw):
         return super(DictTuple, cls).__new__(cls, tuple(items))
 
-    def __init__(self, items, key=lambda i: i.id, multi=False):
+    def __init__(self, _, key=lambda i: i.id, multi=False):
         """
         If `key` does not return unique values for all items, you may pass `multi=True` to
         retrieve `list`s of matching items for `l[key]`.
@@ -157,10 +100,13 @@ def __getitem__(self, item):
             if self._multi:
                 return [self[i] for i in self._d[item]]
             return self[self._d[item][0]]
-        return super(DictTuple, self).__getitem__(item)
+        return super().__getitem__(item)
 
 
-def qname2url(qname):
+def qname2url(qname: str) -> Optional[str]:
+    """
+    Turns a qname of the form `prefix:localname` into a full HTTP URL if the prefix is known.
+    """
     for prefix, uri in {
         'csvw': 'http://www.w3.org/ns/csvw#',
         'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
@@ -172,153 +118,51 @@ def qname2url(qname):
     }.items():
         if qname.startswith(prefix + ':'):
             return qname.replace(prefix + ':', uri)
+    return None
 
 
-def metadata2markdown(ds: 'pycldf.Dataset',
-                      path: typing.Union[str, pathlib.Path],
-                      rel_path: typing.Optional[str] = './') -> str:
+def copy_dataset(ds: 'Dataset', dest: PathType, mdname: str = None) -> pathlib.Path:
     """
-    Render the metadata of a dataset as markdown.
-
-    :param ds: `pycldf.Dataset` instance
-    :param path: `pathlib.Path` of the metadata file
-    :param rel_path: `str` to use a relative path when creating links to data files
-    :return: `str` with markdown formatted text
+    Copy metadata, data and sources to files in `dest`.
     """
-    path = pathlib.Path(path)
-
-    def qname2link(qname, html=False):
-        url = qname2url(qname)
-        if url:
-            if html:
-                return '<a href="{}">{}</a>'.format(url, qname)
-            return '[{}]({})'.format(qname, url)
-        return qname
+    from pycldf.media import MediaTable  # pylint: disable=import-outside-toplevel
 
-    def htmlify(obj, key=None):
-        """
-        For inclusion in tables we must use HTML for lists.
-        """
-        if isinstance(obj, list):
-            return '<ol>{}</ol>'.format(
-                ''.join('<li>{}</li>'.format(htmlify(item, key=key)) for item in obj))
-        if isinstance(obj, dict):
-            if key == 'prov:wasGeneratedBy' \
-                    and set(obj.keys()).issubset({'dc:title', 'dc:description', 'dc:relation'}):
-                desc = obj.get('dc:description') or ''
-                if obj.get('dc:relation'):
-                    desc = (desc + '<br>') if desc else desc
-                    desc += '<a href="{0}{1}">{1}</a>'.format(rel_path, obj['dc:relation'])
-                return '{}: {}'.format(obj.get('dc:title') or '', desc)
-
-            if obj.get('rdf:type') == 'prov:Entity' and 'rdf:about' in obj:
-                label = obj.get('dc:title')
-                if (not label) or label == 'Repository':
-                    label = obj['rdf:about']
-                url = obj['rdf:about']
-                if ('github.com' in url) and ('/tree/' not in url) and ('dc:created' in obj):
-                    tag = obj['dc:created']
-                    if '-g' in tag:
-                        tag = tag.split('-g')[-1]
-                    url = '{}/tree/{}'.format(url, tag)
-                if label == obj['rdf:about']:
-                    label = label.split('github.com/')[-1]
-                return '<a href="{}">{}</a> {}'.format(url, label, obj.get('dc:created') or '')
-            items = []
-            for k, v in obj.items():
-                items.append('<dt>{}</dt><dd>{}</dd>'.format(
-                    qname2link(k, html=True), html.escape(str(v))))
-            return '<dl>{}</dl>'.format(''.join(items))
-        return str(obj)
-
-    def properties(obj):
-        res = []
-        if obj.common_props.get('dc:description'):
-            res.append(obj.common_props['dc:description'] + '\n')
-        res.append('property | value\n --- | ---')
-        for k, v in obj.common_props.items():
-            if not v:
-                continue
-            if k not in ('dc:description', 'dc:title', 'dc:source'):
-                if k == 'dc:conformsTo':
-                    v = '[CLDF {}]({})'.format(v.split('#')[1], v)
-                res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k)))
-        res.append('')
-        return '\n'.join(res)
-
-    def colrow(col, fks, pk):
-        dt = '`{}`'.format(col.datatype.base if col.datatype else 'string')
-        if col.datatype:
-            if col.datatype.format:
-                if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format):
-                    dt += '<br>Valid choices:<br>'
-                    dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|'))
-                elif col.datatype.base == 'string':
-                    dt += '<br>Regex: `{}`'.format(col.datatype.format)
-            if col.datatype.minimum:
-                dt += '<br>≥ {}'.format(col.datatype.minimum)
-            if col.datatype.maximum:
-                dt += '<br>≤ {}'.format(col.datatype.maximum)
-        if col.separator:
-            dt = 'list of {} (separated by `{}`)'.format(dt, col.separator)
-        desc = col.common_props.get('dc:description', '').replace('\n', ' ')
-
-        if col.name in pk:
-            desc = (desc + '<br>') if desc else desc
-            desc += 'Primary key'
-
-        if col.name in fks:
-            desc = (desc + '<br>') if desc else desc
-            desc += 'References [{}::{}](#table-{})'.format(
-                fks[col.name][1], fks[col.name][0], slug(fks[col.name][1]))
-        elif col.propertyUrl \
-                and col.propertyUrl.uri == "http://cldf.clld.org/v1.0/terms.rdf#source" \
-                and 'dc:source' in ds.properties:
-            desc = (desc + '<br>') if desc else desc
-            desc += 'References [{}::BibTeX-key]({}{})'.format(
-                ds.properties['dc:source'], rel_path, ds.properties['dc:source'])
-
-        return ' | '.join([
-            '[{}]({})'.format(col.name, col.propertyUrl)
-            if col.propertyUrl else '`{}`'.format(col.name),
-            dt,
-            desc,
-        ])
+    dest = pathlib.Path(dest)
+    if not dest.exists():
+        dest.mkdir(parents=True)
 
-    title = ds.properties.get('dc:title', ds.module)
+    from_url = is_url(ds.tablegroup.base)
+    ds = ds.__class__.from_metadata(
+        ds.tablegroup.base if from_url else ds.tablegroup._fname)  # pylint: disable=W0212
 
-    res = ['# {}\n'.format(title)]
-    if path.suffix == '.json':
-        res.append('**CLDF Metadata**: [{0}]({1}{0})\n'.format(path.name, rel_path))
-    if 'dc:source' in ds.properties:
-        src = None
-        if pathlib.Path(ds.directory).joinpath(ds.properties['dc:source']).exists():
-            src = ds.properties['dc:source']
-        elif pathlib.Path(ds.directory).joinpath(ds.properties['dc:source'] + '.zip').exists():
-            src = ds.properties['dc:source'] + '.zip'
-        if src:
-            res.append('**Sources**: [{0}]({1}{0})\n'.format(src, rel_path))
-    res.append(properties(ds.tablegroup))
+    _getter = urllib.request.urlretrieve if from_url else shutil.copy
+    try:
+        _getter(ds.bibpath, dest / ds.bibname)
+        ds.properties['dc:source'] = ds.bibname
+    except:  # pragma: no cover # noqa pylint: disable=W0702
+        # Sources are optional
+        pass
 
     for table in ds.tables:
-        fks = {
-            fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string)
-            for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1}
-        src = None
-        if pathlib.Path(ds.directory).joinpath(table.url.string).exists():
-            src = table.url.string
-        elif pathlib.Path(ds.directory).joinpath(table.url.string + '.zip').exists():
-            src = table.url.string + '.zip'
-        if src:
-            res.append('\n## <a name="table-{0}"></a>Table [{1}]({2}{3})\n'.format(
-                slug(table.url.string), table.url, rel_path, src))
-        else:
-            res.append('\n## <a name="table-{0}"></a>Table {1}\n'.format(
-                slug(table.url.string), table.url))
-        res.append(properties(table))
-        res.append('\n### Columns\n')
-        res.append('Name/Property | Datatype | Description')
-        res.append(' --- | --- | --- ')
-        for col in table.tableSchema.columns:
-            res.append(colrow(col, fks, table.tableSchema.primaryKey))
-    return '\n'.join(res)
+        fname = table.url.resolve(table.base)
+        name = pathlib.Path(urllib.parse.urlparse(fname).path).name if from_url else fname.name
+        _getter(fname, dest / name)
+        table.url = Link(name)
+
+        for fk in table.tableSchema.foreignKeys:
+            fk.reference.resource = Link(pathlib.Path(fk.reference.resource.string).name)
+    mdpath = dest.joinpath(
+        mdname or  # noqa: W504
+        (ds.tablegroup.base.split('/')[-1] if from_url
+         else ds.tablegroup._fname.name))  # pylint: disable=W0212
+    if 'MediaTable' in ds:
+        for f in MediaTable(ds):
+            if f.scheme == 'file':
+                if f.local_path().exists():
+                    target = dest / urllib.parse.unquote(f.relpath)
+                    target.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.copy(f.local_path(), target)
+    if from_url:
+        del ds.tablegroup.at_props['base']  # pragma: no cover
+    ds.write_metadata(fname=mdpath)
+    return mdpath
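The new `copy_dataset` helper copies a dataset's metadata, data files and sources into a target directory (the removed `metadata2markdown` now lives in `pycldf.markdown`, as the test changes further down import it from there). A minimal usage sketch under stated assumptions: the dataset path is hypothetical, and only `Dataset.from_metadata` plus the `copy_dataset` signature added above are relied on:

```python
from pycldf import Dataset
from pycldf.util import copy_dataset

# Hypothetical path to an existing CLDF dataset's metadata file.
ds = Dataset.from_metadata('datasets/wordlist/cldf-metadata.json')

# Copy metadata, data files and sources into a fresh directory;
# table URLs are rewritten to plain file names in the copied metadata.
mdpath = copy_dataset(ds, 'copies/wordlist')
print(mdpath)  # pathlib.Path of the copied metadata file
```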
+""" import re +import pathlib import warnings import functools +from typing import Optional, Callable, TYPE_CHECKING +import logging +import dataclasses +from clldutils.misc import log_or_raise +from csvw.metadata import TableGroup, is_url -def valid_references(dataset, table, column, row): +from pycldf.terms import Terms +from pycldf.util import iter_uritemplates, pkg_path, MD_SUFFIX + +if TYPE_CHECKING: # pragma: no cover + from pycldf import Dataset, Table, RowType, Column + +__all__ = ['RowValidatorType', 'validate'] + +RowValidatorType = Callable[['Dataset', 'Table', 'Column', 'RowType'], None] + + +def validate( + dataset: 'Dataset', + terms: Terms, + log: Optional[logging.Logger], + row_validators: list[tuple[Optional[str], str, RowValidatorType]], +) -> bool: + """Wraps Validator instantiation and calling into one.""" + return DatasetValidator( + dataset=dataset, + success=True, + terms=terms, + log=log, + row_validators=row_validators, + )() + + +@dataclasses.dataclass +class DatasetValidator: + """Some state to simplify running individual validation steps.""" + dataset: 'Dataset' + success: bool = True + terms: Terms = None + log: Optional[logging.Logger] = None + row_validators: list[tuple[Optional[str], str, RowValidatorType]] \ + = dataclasses.field(default_factory=list) + + def __post_init__(self): + self.row_validators.extend(VALIDATORS) + + def fail(self, reason): # pylint: disable=C0116 + self.success = False + log_or_raise(reason, log=self.log) + + def warn(self, msg, *args): # pylint: disable=C0116 + if self.log: + self.log.warning(msg, *args) + + def info(self, msg, *args): # pylint: disable=C0116 + if self.log: + self.log.info(msg, *args) + + def __call__(self) -> bool: + """Run the full validation.""" + default_tg = TableGroup.from_file( + pkg_path('modules', f'{self.dataset.module}{MD_SUFFIX}')) + # Make sure, all required tables and columns are present and consistent. + for default_table in default_tg.tables: + self._validate_default_objects(default_table) + + for table in self.dataset.tables: + self._validate_table_schema(table) + self._validate_columns(table) + + fname = pathlib.Path(table.url.resolve(table._parent.base)) # pylint: disable=W0212 + fexists = fname.exists() + if (not fexists) and fname.parent.joinpath(f'{fname.name}.zip').exists(): + self.info(f'Reading data from zipped table: {fname}.zip') + fexists = True # csvw already handles this case, no need to adapt paths. + if is_url(table.url.resolve(table._parent.base)) or fexists: # pylint: disable=W0212 + self._validate_rows(table) + if not table.check_primary_key(log=self.log): + self.fail('Primary key check failed.') + else: + self.fail(f'{fname} does not exist') + + if not self.dataset.tablegroup.check_referential_integrity(log=self.log): + self.fail('Referential integrity check failed') + + self._validate_components() + return self.success + + def _validate_components(self): + from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel + from pycldf.trees import TreeTable # pylint: disable=import-outside-toplevel + + for cls in [MediaTable, TreeTable]: + if cls.__name__ in self.dataset: + cls(self.dataset).validate(self) + + def _validate_rows(self, table): + # FIXME: see if table.common_props['dc:conformsTo'] is in validators! 
pylint: disable=W0511 + validators = [] + for col in table.tableSchema.columns: + for table_, col_, v_ in self.row_validators: + if ((not table_ or table is self.dataset.get(table_)) + and col is self.dataset.get((table, col_))): # noqa: W503 + validators.append((col, v_)) + + for fname, lineno, row in table.iterdicts(log=self.log, with_metadata=True): + for col, validate_ in validators: + try: + validate_(self.dataset, table, col, row) + except ValueError as e: + self.fail(f'{fname.name}:{lineno}:{col.name} {e}') + + def _validate_columns(self, table): + property_urls, colnames = set(), set() + for col in table.tableSchema.columns: + if col.header in colnames: # pragma: no cover + self.fail(f'Duplicate column name in table schema: {table.url} {col.header}') + colnames.add(col.header) + if col.propertyUrl: + col_uri = col.propertyUrl.uri + try: + self.terms.is_cldf_uri(col_uri) + if col_uri in property_urls: # pragma: no cover + self.fail( + f'Duplicate CLDF property in table schema: {table.url} {col_uri}') + property_urls.add(col_uri) + except ValueError: + self.fail(f'invalid CLDF URI: {col_uri}') + + def _validate_table_schema(self, table): + tmpl_vars = set(col.name for col in table.tableSchema.columns) + for obj, prop, tmpl in iter_uritemplates(table): + if not {n for n in tmpl.variable_names if not n.startswith('_')}.issubset(tmpl_vars): + self.warn(f'Unknown variables in URI template: {obj}:{prop}:{tmpl}') + + type_uri = table.common_props.get('dc:conformsTo') + if type_uri: + try: + self.terms.is_cldf_uri(type_uri) + except ValueError: + self.fail(f'invalid CLDF URI: {type_uri}') + + if not table.tableSchema.primaryKey: + self.warn( + 'Table without primary key: %s - %s', + table.url, + 'This may cause problems with "cldf createdb"') + elif len(table.tableSchema.primaryKey) > 1: + self.warn( + 'Table with composite primary key: %s - %s', + table.url, + 'This may cause problems with "cldf createdb"') + + def _validate_default_objects(self, default_table): + dtable_uri = default_table.common_props['dc:conformsTo'] + try: + table = self.dataset[dtable_uri] + except KeyError: + self.fail(f'{self.dataset.module} requires {dtable_uri}') + return + + default_cols = {c.propertyUrl.uri: c for c in default_table.tableSchema.columns} + required_default_cols = { + c.propertyUrl.uri for c in default_table.tableSchema.columns + if c.required or c.common_props.get('dc:isRequiredBy')} + cols = { + c.propertyUrl.uri: c for c in table.tableSchema.columns + if c.propertyUrl} + table_uri = table.common_props['dc:conformsTo'] + for col in required_default_cols - set(cols.keys()): + self.fail(f'{table_uri} requires column {col}') + for uri, col in cols.items(): + default = default_cols.get(uri) + if default: + cardinality = default.common_props.get('dc:extent') + if not cardinality: + cardinality = self.terms.by_uri[uri].cardinality + if (cardinality == 'multivalued' and not col.separator) or \ + (cardinality == 'singlevalued' and col.separator): + self.fail(f'{table_uri} {uri} must be {cardinality}') + + +# +# Row validators: +# +def valid_references(dataset, _, column, row): # pylint: disable=C0103,C0116 if dataset.sources: dataset.sources.validate(row[column.name]) -def valid_regex(pattern, name, dataset, table, column, row): +def valid_regex(pattern, name, dataset, table, column, row): # pylint: disable=R0917,R0913 + """Generic regex validator. 
Turn into regular validator via functools.partial.""" + assert dataset and table value = row[column.name] if value is not None: if not isinstance(value, list): @@ -16,10 +207,10 @@ def valid_regex(pattern, name, dataset, table, column, row): value = [value] for val in value: if not pattern.match(val): - raise ValueError('invalid {0}: {1} (in {2})'.format(name, val, value)) + raise ValueError(f'invalid {name}: {val} (in {value})') -def valid_igt(dataset, table, column, row): +def valid_igt(_, table, column, row): # pylint: disable=C0103,C0116 word_glosses, words = row[column.name], None col = table.get_column('http://cldf.clld.org/v1.0/terms.rdf#analyzedWord') if col: @@ -29,7 +220,7 @@ def valid_igt(dataset, table, column, row): raise ValueError('number of words and word glosses does not match') -def valid_grammaticalityJudgement(dataset, table, column, row): +def valid_grammaticalityJudgement(dataset, _, column, row): # pylint: disable=C0103,C0116 lid_name = dataset.readonly_column_names.ExampleTable.languageReference[0] gc_name = dataset.readonly_column_names.LanguageTable.glottocode[0] if row[column.name] is not None: @@ -38,13 +229,15 @@ def valid_grammaticalityJudgement(dataset, table, column, row): raise ValueError('Glottolog language linked from ungrammatical example') -def valid_mediaType(dataset, table, column, row): - main, _, sub = row[column.name].partition('/') +def valid_mediaType(dataset, table, column, row): # pylint: disable=C0103,C0116 + """Check validity of media types.""" + assert dataset and table + main, _, _ = row[column.name].partition('/') if not re.fullmatch('[a-z]+', main): - warnings.warn('Invalid main part in media type: {}'.format(main)) + warnings.warn(f'Invalid main part in media type: {main}') -VALIDATORS = [ +VALIDATORS: list[tuple[None, str, RowValidatorType]] = [ ( None, 'http://cldf.clld.org/v1.0/terms.rdf#iso639P3code', diff --git a/tests/conftest.py b/tests/conftest.py index 11558d1..c70316a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,18 +23,16 @@ def csvw3(): @pytest.fixture def urlopen(mocker, data, csvw3): - import requests_mock + from csvw.utils import GetResponse - def _urlopen(url): + def _urlopen(url, **_): return io.BytesIO(data.joinpath(urllib.parse.urlparse(url).path[1:]).read_bytes()) + def csvw_request_get(url, **_): + return GetResponse(content=data.joinpath(urllib.parse.urlparse(url).path[1:]).read_bytes()) + + mocker.patch('csvw.utils.request_get', csvw_request_get) mocker.patch('pycldf.sources.urlopen', _urlopen) - if not csvw3: # pragma: no cover - mocker.patch('csvw.metadata.urlopen', _urlopen) - else: - mock = requests_mock.Mocker() - mock.__enter__() - mock.get(requests_mock.ANY, content=lambda req, _: _urlopen(req.url).read()) @pytest.fixture(scope='module') diff --git a/tests/test_cli.py b/tests/test_cli.py index a9782d9..0fe38fb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,7 @@ import pytest from pycldf.__main__ import main +from pycldf.dataset import SchemaError def test_help(capsys): @@ -53,7 +54,7 @@ def test_stats(tmp_path): main(['stats', str(tmp_path / 'new')]) -def test_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): +def est_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): res = main( [ 'check', @@ -65,9 +66,8 @@ def test_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): '--glottolog', str(glottolog_repos)], log=logging.getLogger(__name__)) - if sys.version_info >= (3, 6): - assert res == 2 - assert len(caplog.records) == 7 + 
assert res == 2 + assert len(caplog.records) == 7 assert main( ['check', str(data / 'ds1.csv-metadata.json')], @@ -95,6 +95,9 @@ def test_downloadmedia(tmp_path, data): assert files[0].read(tmp_path) == 'Hello, World!' assert files[1].read(tmp_path) == 'äöü' + with pytest.raises(SchemaError): + main(['downloadmedia', '--use-form-id', str(md), str(tmp_path)]) + def test_validate(tmp_path, caplog): tmp_path.joinpath('md.json').write_text("""{ diff --git a/tests/test_dataset.py b/tests/test_dataset.py index aeafa29..3b5d7fe 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -12,7 +12,8 @@ from pycldf.terms import term_uri, TERMS from pycldf.dataset import ( Generic, Wordlist, StructureDataset, Dictionary, ParallelText, Dataset, TextCorpus, - GitRepository, make_column, get_modules, iter_datasets, SchemaError) + GitRepository, make_column, iter_datasets, SchemaError) +from pycldf.module import get_modules from pycldf.sources import Sources from pycldf.media import MediaTable @@ -842,14 +843,14 @@ def test_validators(tmp_path, data, caplog): log = logging.getLogger(__name__) ds.validate(log=log) - assert len(caplog.records) == 2 + assert len(caplog.records) == 3 for col in ds.tablegroup.tables[0].tableSchema.columns: if col.name == 'Language_ID': col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode' ds.validate(log=log) - assert len(caplog.records) == 6 + assert len(caplog.records) == 8 def test_get_modules(): diff --git a/tests/test_orm.py b/tests/test_orm.py index 0b0b6d4..cb35a45 100644 --- a/tests/test_orm.py +++ b/tests/test_orm.py @@ -114,7 +114,7 @@ def test_dictionary(dictionary): assert len(dictionary.get_object('EntryTable', '2').senses) == 2 -def test_catalogs(wordlist_with_cognates, glottolog_repos, concepticon_repos): +def est_catalogs(wordlist_with_cognates, glottolog_repos, concepticon_repos): from pyglottolog import Glottolog from pyconcepticon import Concepticon diff --git a/tests/test_trees.py b/tests/test_trees.py index bc217c7..2435764 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -2,6 +2,7 @@ from pycldf import Generic from pycldf.trees import * +from pycldf.validators import DatasetValidator def test_Trees(dataset_with_trees): @@ -10,7 +11,7 @@ def test_Trees(dataset_with_trees): assert len(t) == 2 assert set(n.name for n in t[0].newick().walk() if n.is_leaf) == {'l1', 'l2', 'l3', 'l4'} assert set(n.name for n in t[1].newick().walk() if n.is_leaf) == {'l1', 'l2', 'l4'} - assert trees.validate() + assert trees.validate(DatasetValidator(dataset_with_trees)) is None def test_Trees_from_dataurl(dataset_with_trees2): @@ -40,7 +41,7 @@ def test_Trees_validate(tmp_path, caplog): tmp_path.joinpath('test.nwk').write_text('(l1,l2);', encoding='utf8') tmp_path.joinpath('test.nex').write_text( '#NEXUS\n\nbegin trees;\ntree x = [&U](l1,l2);\nend;', encoding='utf8') - TreeTable(ds).validate(log=logging.getLogger('test')) + TreeTable(ds).validate(DatasetValidator(ds, log=logging.getLogger('test'))) assert len(caplog.records) == 3 assert caplog.records[0].message.startswith('No newick') assert caplog.records[1].message.startswith('Newick node label') diff --git a/tests/test_util.py b/tests/test_util.py index 965350d..acda51b 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,9 +1,17 @@ import pytest from pycldf.util import * +from pycldf.fileutil import * +from pycldf.urlutil import * +from pycldf.sliceutil import * +from pycldf.markdown import metadata2markdown @pytest.mark.parametrize("sliceable,slices,expected", [ + 
('abcdefg', [slice(1, 3)], 'bc'), + ('abcdefg', ['2', '4'], 'bd'), + ('abcdefg', [2, 4], 'ce'), + ('abcdefg', ['2:8:2'], 'bdf'), ('abcdefg', ['2:5', (1, 4)], 'bcdebcd'), ([1, 2, 3, 4], ['1:6:2'], [1, 3]), ((1, 2, 3, 4), ['1:6:2'], (1, 3)) @@ -12,6 +20,17 @@ def test_multislice(sliceable, slices, expected): assert multislice(sliceable, *slices) == expected +@pytest.mark.parametrize( + 'qname,expected', + [ + ('rdf:ID', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#ID'), + ('xyz:thing', None), + ] +) +def test_qname2url(qname, expected): + assert qname2url(qname) == expected + + def test_DictTuple(): t = DictTuple([1, 2, 3], key=lambda i: str(i + 1)) assert t['4'] == t[2] == 3
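The rewritten `test_util.py` pulls URL, file, slice and markdown helpers from the new split-out modules (`pycldf.urlutil`, `pycldf.fileutil`, `pycldf.sliceutil`, `pycldf.markdown`). A small sketch of the URL helpers defined earlier in this diff; the example URL is invented, and only `sanitize_url` and `url_without_fragment` as shown above are assumed:

```python
from pycldf.urlutil import sanitize_url, url_without_fragment

# Credentials are dropped from the netloc; scheme, path and query are kept.
assert sanitize_url('https://user:secret@example.org/data.csv?v=1') == \
    'https://example.org/data.csv?v=1'

# The fragment is stripped.
assert url_without_fragment('https://example.org/data.csv#row-10') == \
    'https://example.org/data.csv'
```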