diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3dd19c1..8f0f9e9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,12 +12,12 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", 3.11, 3.12] + python-version: ["3.10", 3.11, 3.12, 3.13] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/CHANGELOG.md b/CHANGELOG.md index d54dd30..8e88f75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## unreleased + +- Remove dependency on `attrs`. + +Note: Until versions of `pyglottolog` and `pyconcepticon` compatible with `clldutils` 4.x are +released, catalog integration in `pycldf` is limited. + + ## [1.43.1] - 2026-03-25 Pin dependencies `csvw` and `clldutils`, since these will get incompatible new major versions. diff --git a/RELEASING.md b/RELEASING.md index b108f7f..527a983 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -12,6 +12,10 @@ Releasing pycldf ```shell flake8 src ``` +- Make sure pylint passes with a score of 10: + ```shell + pylint src + ``` - Make sure the docs render: ```shell diff --git a/setup.cfg b/setup.cfg index a428afc..2a401ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,12 +20,12 @@ classifiers = Natural Language :: English Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy License :: OSI Approved :: Apache Software License @@ -35,10 +35,11 @@ zip_safe = False packages = find: package_dir = = src -python_requires = >=3.8 +python_requires = >=3.9 install_requires = - csvw<4 - clldutils<4 + csvw>=4.0 + tabulate + clldutils>=4.0 uritemplate>=3.0 python-dateutil simplepybtex @@ -83,7 +84,6 @@ test = pyconcepticon pytest>=5 pytest-mock - requests-mock pytest-cov coverage>=4.2 docs = @@ -117,7 +117,7 @@ show_missing = true skip_covered = true [tox:tox] -envlist = py3.8, py39, py310, py311, py312, py313 +envlist = py39, py310, py311, py312, py313, py314 isolated_build = true skip_missing_interpreter = true diff --git a/src/pycldf/__init__.py b/src/pycldf/__init__.py index e6c77d7..85b0266 100644 --- a/src/pycldf/__init__.py +++ b/src/pycldf/__init__.py @@ -1,3 +1,8 @@ +""" +The `pycldf` package provides the reference implementation for the CLDF standard. + +https://cldf.clld.org +""" from pycldf.dataset import * from pycldf.db import * from pycldf.sources import * diff --git a/src/pycldf/__main__.py b/src/pycldf/__main__.py index fe764ac..ca344cb 100644 --- a/src/pycldf/__main__.py +++ b/src/pycldf/__main__.py @@ -1,5 +1,10 @@ +""" +CLI for the `pycldf` package. 
+""" import csv import sys +from typing import Optional, Sequence +import logging import contextlib from clldutils.clilib import ( @@ -10,7 +15,15 @@ import pycldf.commands -def main(args=None, catch_all=False, parsed_args=None, log=None): +def main( + args: Sequence[str] = None, + catch_all: bool = False, + parsed_args: list = None, + log: Optional[logging.Logger] = None, +) -> Optional[int]: + """ + Implements the main command, dispatches to subcommands. + """ parser, subparsers = get_parser_and_subparsers('cldf') add_csv_field_size_limit(parser, default=csv.field_size_limit()) register_subcommands(subparsers, pycldf.commands) @@ -32,7 +45,7 @@ def main(args=None, catch_all=False, parsed_args=None, log=None): return 0 except ParserError as e: print(colored(str(e), 'red')) - return main([args._command, '-h']) + return main([args._command, '-h']) # pylint: disable=protected-access except Exception as e: # pragma: no cover if catch_all: print(e) diff --git a/src/pycldf/_compat.py b/src/pycldf/_compat.py new file mode 100644 index 0000000..f63a117 --- /dev/null +++ b/src/pycldf/_compat.py @@ -0,0 +1,15 @@ +""" +Backwards compatibility with supported python versions. +""" +import sys + + +if (sys.version_info.major, sys.version_info.minor) >= (3, 10): # pragma: no cover + def entry_points_select(eps, group): + """ + Staring with Python 3.10, `importlib.metadata.entry_points` returns `EntryPoints`.""" + return eps.select(group=group) +else: + def entry_points_select(eps, group): # pragma: no cover + """In Python 3.9, `importlib.metadata.entry_points` returns a `dict`.""" + return eps.get(group, []) diff --git a/src/pycldf/cli_util.py b/src/pycldf/cli_util.py index 8e2a488..ddf179e 100644 --- a/src/pycldf/cli_util.py +++ b/src/pycldf/cli_util.py @@ -31,13 +31,17 @@ def strtobool(val: str) -> int: # pragma: no cover val = val.lower() if val in ('y', 'yes', 't', 'true', 'on', '1'): return 1 - elif val in ('n', 'no', 'f', 'false', 'off', '0'): + if val in ('n', 'no', 'f', 'false', 'off', '0'): return 0 - else: - raise ValueError("invalid truth value %r" % (val,)) + raise ValueError(f"invalid truth value {val}") -class FlagOrPathType(PathType): +class FlagOrPathType(PathType): # pylint: disable=too-few-public-methods + """ + Argument type allowing input of a path or a boolean. + + The boolean can be used to determine whether to download a file from a known location. + """ def __call__(self, string): try: return bool(strtobool(string)) @@ -45,8 +49,10 @@ def __call__(self, string): return super().__call__(string) -def http_head_status(url): # pragma: no cover +def http_head_status(url: str) -> int: # pragma: no cover + """Do a HEAD request for `url` to determine its status.""" class NoRedirection(urllib.request.HTTPErrorProcessor): + """Don't follow redirects.""" def http_response(self, request, response): return response @@ -56,22 +62,22 @@ def http_response(self, request, response): return opener.open(urllib.request.Request(url, method="HEAD")).status -class UrlOrPathType(PathType): - def __call__(self, string): +class UrlOrPathType(PathType): # pylint: disable=too-few-public-methods + """Type suitable for argparse arguments, allowing input of URL or local file path.""" + def __call__(self, string: str) -> str: if is_url(string): if self._must_exist: sc = http_head_status(string) # We accept not only HTTP 200 as valid but also common redirection codes because # these are used e.g. for DOIs. 
if sc not in {200, 301, 302}: - raise argparse.ArgumentTypeError( - 'URL {} does not exist [HTTP {}]!'.format(string, sc)) + raise argparse.ArgumentTypeError(f'URL {string} does not exist [HTTP {sc}]!') return string super().__call__(string.partition('#')[0]) return string -def add_dataset(parser: argparse.ArgumentParser): +def add_dataset(parser: argparse.ArgumentParser) -> None: """ Adds a positional argument named `dataset` to the parser to specify a CLDF dataset. """ @@ -101,11 +107,17 @@ def get_dataset(args: argparse.Namespace) -> Dataset: except TypeError as e: # pragma: no cover if 'PathLike' in str(e): raise ParserError( - 'The dataset locator may require downloading, so you should specify --download-dir') + 'The dataset locator may require downloading, so you should specify --download-dir' + ) from e raise -def add_database(parser, must_exist=True): +def add_database(parser: argparse.ArgumentParser, must_exist: bool = True) -> None: + """ + Add CLI arguments to specify a CLDF SQLite database. + + Retrieve in the `run` function of a command using `get_database` (see below). + """ add_dataset(parser) parser.add_argument( 'db', @@ -116,17 +128,21 @@ def add_database(parser, must_exist=True): parser.add_argument('--infer-primary-keys', action='store_true', default=False) -def get_database(args): +def get_database(args: argparse.Namespace) -> Database: + """ + Retrieve a `Database` instance based on CLI input in `args` (see `add_database`). + """ return Database(get_dataset(args), fname=args.db, infer_primary_keys=args.infer_primary_keys) -def add_catalog_spec(parser, name): +def add_catalog_spec(parser: argparse.ArgumentParser, name: str) -> None: + """Add CLI arguments suitable to specify a catalog.""" parser.add_argument( '--' + name, metavar=name.upper(), type=PathType(type='dir'), - help='Path to repository clone of {0} data'.format(name.capitalize())) + help=f'Path to repository clone of {name.capitalize()} data') parser.add_argument( - '--{0}-version'.format(name), - help='Version of {0} data to checkout'.format(name.capitalize()), + f'--{name}-version', + help=f'Version of {name.capitalize()} data to checkout', default=None) diff --git a/src/pycldf/commands/catmedia.py b/src/pycldf/commands/catmedia.py index a14c6b0..c629fd7 100644 --- a/src/pycldf/commands/catmedia.py +++ b/src/pycldf/commands/catmedia.py @@ -5,12 +5,12 @@ from pycldf.media import MediaTable -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = MediaTable(ds).cat() if res: - args.log.info('{} files have been recombined'.format(res)) + args.log.info(f'{res} files have been recombined') diff --git a/src/pycldf/commands/check.py b/src/pycldf/commands/check.py index 8e7930e..e93a894 100644 --- a/src/pycldf/commands/check.py +++ b/src/pycldf/commands/check.py @@ -18,7 +18,7 @@ Catalog, Glottolog, Concepticon = None, None, None -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) add_catalog_spec(parser, 'glottolog') add_catalog_spec(parser, 'concepticon') @@ -32,7 +32,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 if not Catalog: # pragma: no cover print('\nThis command only works with catalogs installed.\n' 'Run "pip install pycldf[catalogs]" to do so.\n') @@ -53,31 +53,37 @@ def warn(msg): stack.enter_context(Catalog(args.concepticon, tag=args.concepticon_version)) for table, checkers in COLUMN_CHECKERS.items(): - table 
= ds.get(table) - if table: - idcol = ds.get((table, 'id')) - active_checkers = {} - for col, checker in checkers.items(): - col = ds.get((table, col)) - if col: - # Register an initialized check: - active_checkers[col.name] = checker(args) - if active_checkers: - for row in table: - rowid = row[idcol.name] if idcol else str(row) - for colname, check in active_checkers.items(): - check(row[colname], rowid, warn) + _check_table(ds, table, checkers, args, warn) for table in ds.tables: for _ in table: break else: - warn('Empty table {0}'.format(table.url)) + warn(f'Empty table {table.url}') return 2 if warnings else 0 -class Check: +def _check_table(ds, table, checkers, args, warn): + table = ds.get(table) + if not table: + return + idcol = ds.get((table, 'id')) + active_checkers = {} + for col, checker in checkers.items(): + col = ds.get((table, col)) + if col: + # Register an initialized check: + active_checkers[col.name] = checker(args) + if active_checkers: + for row in table: + rowid = row[idcol.name] if idcol else str(row) + for colname, check in active_checkers.items(): + check(row[colname], rowid, warn) + + +class Check: # pylint: disable=R0903 + """A base class for checks. Initialize with __init__ then run __call__ on each row.""" def __init__(self, args): self.args = args @@ -85,7 +91,7 @@ def __call__(self, gc, rowid, warn): raise NotImplementedError() # pragma: no cover -class Macroarea(Check): +class Macroarea(Check): # pylint: disable=R0903 """Is the macroarea valid according to Glottolog? (requires "--glottolog")""" def __init__(self, args): super().__init__(args) @@ -96,10 +102,10 @@ def __init__(self, args): def __call__(self, ma, rowid, warn): if self.macroareas and ma and (ma not in self.macroareas): - warn('Language {0} assigned to invalid macroarea {1}'.format(rowid, ma)) + warn(f'Language {rowid} assigned to invalid macroarea {ma}') -class Glottocode(Check): +class Glottocode(Check): # pylint: disable=R0903 """Is the Glottocode valid - is it in Bookkeeping? (requires "--glottolog")""" def __init__(self, args): super().__init__(args) @@ -116,12 +122,12 @@ def __init__(self, args): def __call__(self, gc, rowid, warn): if self.gcs and gc: if gc in self.bookkeeping: - warn('Language {0} mapped to Bookkeeping languoid {1}'.format(rowid, gc)) + warn(f'Language {rowid} mapped to Bookkeeping languoid {gc}') if gc not in self.gcs: - warn('Language {0} mapped to invalid Glottocode {1}'.format(rowid, gc)) + warn(f'Language {rowid} mapped to invalid Glottocode {gc}') -class ISOCode(Check): +class ISOCode(Check): # pylint: disable=R0903 """Is the ISO code valid? 
(requires "--iso-codes")""" def __init__(self, args): super().__init__(args) @@ -139,24 +145,24 @@ def __init__(self, args): def __call__(self, iso, rowid, warn): if self.iso_codes and iso and (iso not in self.iso_codes): - warn('Language {0} mapped to invalid ISO 639-3 code {1}'.format(rowid, iso)) + warn(f'Language {rowid} mapped to invalid ISO 639-3 code {iso}') -class Latitude(Check): +class Latitude(Check): # pylint: disable=R0903 """Is the latitude between -90 and 90?""" def __call__(self, lat, rowid, warn): - if lat and not (-90 <= lat <= 90): - warn('Language {0} has invalid latitude {1}'.format(rowid, lat)) + if lat and not -90 <= lat <= 90: + warn(f'Language {rowid} has invalid latitude {lat}') -class Longitude(Check): +class Longitude(Check): # pylint: disable=R0903 """Is the longitude between -180 and 180?""" - def __call__(self, lat, rowid, warn): - if lat and not (-180 <= lat <= 180): - warn('Language {0} has invalid longitude {1}'.format(rowid, lat)) + def __call__(self, lon, rowid, warn): + if lon and not -180 <= lon <= 180: + warn(f'Language {rowid} has invalid longitude {lon}') -class ConcepticonID(Check): +class ConcepticonID(Check): # pylint: disable=R0903 """Is the concept set ID valid? (requires "--concepticon")""" def __init__(self, args): super().__init__(args) @@ -168,7 +174,7 @@ def __init__(self, args): def __call__(self, cid, rowid, warn): if self.ids and cid and (cid not in self.ids): - warn('Parameter {0} mapped to invalid conceptset ID {1}'.format(rowid, cid)) + warn(f'Parameter {rowid} mapped to invalid conceptset ID {cid}') COLUMN_CHECKERS = { @@ -184,6 +190,6 @@ def __call__(self, cid, rowid, warn): } } for t, checks in COLUMN_CHECKERS.items(): - __doc__ += '\n- {0}\n'.format(t) + __doc__ += f'\n- {t}\n' for c, cls in checks.items(): - __doc__ += ' - {0}: {1}\n'.format(c, cls.__doc__.strip() or '') + __doc__ += f' - {c}: {cls.__doc__.strip()}\n' diff --git a/src/pycldf/commands/createdb.py b/src/pycldf/commands/createdb.py index d9b4c28..2d4aef7 100644 --- a/src/pycldf/commands/createdb.py +++ b/src/pycldf/commands/createdb.py @@ -6,13 +6,13 @@ from pycldf.cli_util import add_database, get_database -def register(parser): +def register(parser): # pylint: disable=C0116 add_database(parser, must_exist=False) -def run(args): +def run(args): # pylint: disable=C0116 if args.db.exists(): raise ParserError('The database file already exists!') db = get_database(args) db.write_from_tg() - args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname)) + args.log.info('%s loaded in %s', db.dataset, db.fname) diff --git a/src/pycldf/commands/downloadmedia.py b/src/pycldf/commands/downloadmedia.py index d9b8c2d..7a674ee 100644 --- a/src/pycldf/commands/downloadmedia.py +++ b/src/pycldf/commands/downloadmedia.py @@ -9,7 +9,7 @@ from pycldf.media import MediaTable -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--use-form-id', @@ -27,11 +27,15 @@ def register(parser): default=[]) -def run(args): +def run(args): # pylint: disable=C0116 filters = [] for s in args.filters: col, _, substring = s.partition('=') filters.append((col, substring)) - for item in MediaTable(get_dataset(args), args.use_form_id): + media_table = MediaTable(get_dataset(args)) + if args.use_form_id: + media_table.filename_col = media_table.ds[ + media_table.component, 'http://cldf.clld.org/v1.0/terms.rdf#formReference'] + for item in media_table: if all(substring in item[col] for col, substring in filters): item.save(args.output) diff 
--git a/src/pycldf/commands/dumpdb.py b/src/pycldf/commands/dumpdb.py index 1b82a0a..ede0be2 100644 --- a/src/pycldf/commands/dumpdb.py +++ b/src/pycldf/commands/dumpdb.py @@ -7,7 +7,7 @@ from pycldf.cli_util import add_database, get_database, PathType -def register(parser): +def register(parser): # pylint: disable=C0116 add_database(parser) parser.add_argument( '--metadata-path', @@ -16,7 +16,7 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 db = get_database(args) - mdpath = args.metadata_path or db.dataset.tablegroup._fname - args.log.info('dumped db to {0}'.format(db.to_cldf(mdpath.parent, mdname=mdpath.name))) + mdpath = args.metadata_path or db.dataset.tablegroup._fname # pylint: disable=W0212 + args.log.info('dumped db to %s', db.to_cldf(mdpath.parent, mdname=mdpath.name)) diff --git a/src/pycldf/commands/markdown.py b/src/pycldf/commands/markdown.py index 296eb4d..567f0e0 100644 --- a/src/pycldf/commands/markdown.py +++ b/src/pycldf/commands/markdown.py @@ -3,11 +3,11 @@ """ from clldutils.clilib import PathType -from pycldf.util import metadata2markdown +from pycldf.markdown import metadata2markdown from pycldf.cli_util import add_dataset, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--rel-path', @@ -19,7 +19,7 @@ def register(parser): default=None) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = metadata2markdown(ds, args.dataset, rel_path=args.rel_path) if args.out: diff --git a/src/pycldf/commands/splitmedia.py b/src/pycldf/commands/splitmedia.py index 0da333b..ebf28a4 100644 --- a/src/pycldf/commands/splitmedia.py +++ b/src/pycldf/commands/splitmedia.py @@ -13,26 +13,26 @@ CHUNKSIZE = 50 * 1000 * 1000 -class Bytes: - def __call__(self, string): - if not re.fullmatch(r'[0-9]+([MK])?', string): # pragma: no cover - raise argparse.ArgumentTypeError('Invalid chunksize {0}!'.format(string)) - return eval(string.replace('K', '*1024').replace('M', '*1024*1024')) +def _bytes(string) -> int: + """Parse a chunk size spec.""" + if not re.fullmatch(r'[0-9]+([MK])?', string): # pragma: no cover + raise argparse.ArgumentTypeError(f'Invalid chunksize {string}!') + return eval(string.replace('K', '*1024').replace('M', '*1024*1024')) # pylint: disable=W0123 -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '-b', '--bytes', metavar='SIZE', help='The SIZE argument is an integer and optional unit K or M (example: 10K is 10*1024).', - type=Bytes(), + type=_bytes, default=CHUNKSIZE, ) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) res = MediaTable(ds).split(args.bytes) if res: - args.log.info('{} files have been split'.format(res)) + args.log.info('%s files have been split', res) diff --git a/src/pycldf/commands/stats.py b/src/pycldf/commands/stats.py index 61084e0..bc4a157 100644 --- a/src/pycldf/commands/stats.py +++ b/src/pycldf/commands/stats.py @@ -8,7 +8,7 @@ from pycldf.cli_util import add_dataset, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) add_format(parser, default=None) parser.add_argument( @@ -19,12 +19,12 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) print(ds) - with Table('key', 'value') as md: + with Table(args, 'key', 'value') as md: md.extend(ds.properties.items()) print() - with Table('Path', 'Type', 'Rows') as t: + with 
Table(args, 'Path', 'Type', 'Rows') as t: for p, type_, r in ds.stats(args.exact): t.append([p, type_, r]) diff --git a/src/pycldf/commands/validate.py b/src/pycldf/commands/validate.py index 9f20202..5d607df 100644 --- a/src/pycldf/commands/validate.py +++ b/src/pycldf/commands/validate.py @@ -5,13 +5,15 @@ - the referential integrity of the dataset """ import collections +import dataclasses +from pycldf import Dataset from pycldf.cli_util import add_dataset, get_dataset from pycldf.media import MediaTable -from pycldf.ext.markdown import CLDFMarkdownText +from pycldf.ext.markdown import CLDFMarkdownText, CLDFMarkdownLink -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset(parser) parser.add_argument( '--with-cldf-markdown', @@ -21,7 +23,26 @@ def register(parser): ) -def run(args): +@dataclasses.dataclass +class TestMarkdown: + """Helper class to run rendering of CLDF markdown and record results.""" + links: list[CLDFMarkdownLink] = dataclasses.field(default_factory=list) + missing: collections.Counter = dataclasses.field(default_factory=collections.Counter) + + def __call__(self, text: str, ds: Dataset): + class Parser(CLDFMarkdownText): + """A CLDFMarkdownText subclass that records link render results.""" + def render_link(slf, cldf_link): # pylint: disable=W0237,E0213 + self.links.append(cldf_link) + try: + slf.get_object(cldf_link) + except: # noqa: E722 # pylint: disable=W0702 + self.missing.update([ + f'{cldf_link.label}:{cldf_link.table_or_fname}:{cldf_link.objid}']) + Parser(text, ds).render() + + +def run(args): # pylint: disable=C0116 cldf = get_dataset(args) if not cldf.validate(log=args.log): return 1 @@ -29,18 +50,6 @@ def run(args): if not args.with_cldf_markdown: return 0 - missing = collections.Counter() - links = [] - - class TestMarkdown(CLDFMarkdownText): - def render_link(self, cldf_link): - links.append(cldf_link) - try: - self.get_object(cldf_link) - except: # noqa: E722 - missing.update(['{}:{}:{}'.format( - cldf_link.label, cldf_link.table_or_fname, cldf_link.objid)]) - cols = [] for t in cldf.tables: try: @@ -54,27 +63,34 @@ def render_link(self, cldf_link): res = 0 for t, c in cols: - args.log.info('Validating CLDF Markdown links in {}:{}'.format(t, c)) + tmd = TestMarkdown() + args.log.info('Validating CLDF Markdown links in %s:%s', t, c) for obj in cldf[t]: if obj[c] and '[' in obj[c]: - TestMarkdown(obj[c], cldf).render() + tmd(obj[c], cldf) - for k, v in missing.most_common(): + for k, v in tmd.missing.most_common(): res = 1 - args.log.warning('Not found {} ({} occurrences)'.format(k, v)) - args.log.info('{} links checked'.format(len(links))) - missing, links = collections.Counter(), [] + args.log.warning('Not found %s (%s occurrences)', k, v) + args.log.info('%s links checked', len(tmd.links)) if 'MediaTable' in cldf and ('MediaTable', 'http://purl.org/dc/terms/conformsTo') in cldf: - ctcol = cldf['MediaTable', 'http://purl.org/dc/terms/conformsTo'] - for file in MediaTable(cldf): - if file.row[ctcol.name] == 'CLDF Markdown': - args.log.info('Validating CLDF Markdown links in MediaTable:{}'.format(file.id)) - TestMarkdown(file.read(), cldf).render() - for k, v in missing.most_common(): - res = 1 - args.log.warning('Not found {} ({} occurrences)'.format(k, v)) - args.log.info('{} links checked'.format(len(links))) - missing, links = collections.Counter(), [] + if not _validate_media(cldf, args.log): + res = 1 + + return res + +def _validate_media(cldf, log) -> bool: + res = True + ctcol = cldf['MediaTable', 
'http://purl.org/dc/terms/conformsTo'] + for file in MediaTable(cldf): + if file.row[ctcol.name] == 'CLDF Markdown': + log.info('Validating CLDF Markdown links in MediaTable:%s', file.id) + tmd = TestMarkdown() + tmd(file.read(), cldf) + for k, v in tmd.missing.most_common(): + res = False + log.warning('Not found %s (%s occurrences)', k, v) + log.info('%s links checked', len(tmd.links)) return res diff --git a/src/pycldf/constraints.py b/src/pycldf/constraints.py new file mode 100644 index 0000000..cfad2dc --- /dev/null +++ b/src/pycldf/constraints.py @@ -0,0 +1,100 @@ +""" +Functionality for creation of foreign key constraints. +""" +from typing import TYPE_CHECKING, Optional + +from pycldf.terms import TERMS, term_uri +from pycldf.schemautil import TableType, ColType + +if TYPE_CHECKING: + from pycldf.dataset import Dataset # pragma: no cover + +__all__ = ['add_foreign_key', 'add_auto_constraints'] + + +def add_foreign_key( + ds: 'Dataset', + foreign_t: TableType, + foreign_c: ColType, + primary_t: TableType, + primary_c: Optional[ColType] = None, +) -> None: + """ + Add a foreign key constraint. + + .. note:: Composite keys are not supported yet. + + :param foreign_t: Table reference for the linking table. + :param foreign_c: Column reference for the link. + :param primary_t: Table reference for the linked table. + :param primary_c: Column reference for the linked column - or `None`, in which case the \ + primary key of the linked table is assumed. + """ + if isinstance(foreign_c, (tuple, list)) or isinstance(primary_c, (tuple, list)): + raise NotImplementedError('composite keys are not supported') + + foreign_t = ds[foreign_t] + primary_t = ds[primary_t] + if not primary_c: + primary_c = primary_t.tableSchema.primaryKey + else: + primary_c = ds[primary_t, primary_c].name + foreign_t.add_foreign_key(ds[foreign_t, foreign_c].name, primary_t.url.string, primary_c) + + +def add_auto_constraints(ds: 'Dataset', component: Optional[TableType] = None): + """ + Use CLDF reference properties to implicitly create foreign key constraints. + + :param component: A Table object or `None`. + """ + if not component: + for table in ds.tables: + ds.auto_constraints(table) + return + + if not component.tableSchema.primaryKey: + idcol = component.get_column(term_uri('id')) + if idcol: + component.tableSchema.primaryKey = [idcol.name] + + _auto_foreign_keys(ds, component) + + try: + table_type = ds.get_tabletype(component) + except ValueError: + table_type = None + + if table_type is None: + # New component is not a known CLDF term, so cannot add components + # automatically. TODO: We might be able to infer some based on + # `xxxReference` column properties? 
+ return + + # auto-add foreign keys targeting the new component: + for table in ds.tables: + _auto_foreign_keys(ds, table, component=component, table_type=table_type) + + +def _auto_foreign_keys(ds: 'Dataset', table, component=None, table_type=None): + assert (component is None) == (table_type is None) + for col in table.tableSchema.columns: + if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: + ref_name = TERMS.by_uri[col.propertyUrl.uri].references + if (component is None and not ref_name) or \ + (component is not None and ref_name != table_type): + continue + if any(fkey.columnReference == [col.name] + for fkey in table.tableSchema.foreignKeys): + continue + if component is None: + # Let's see whether we have the component this column references: + try: + ref = ds[ref_name] + except KeyError: + continue + else: + ref = component + idcol = ref.get_column(term_uri('id')) + table.add_foreign_key( + col.name, ref.url.string, idcol.name if idcol is not None else 'ID') diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py index 7891f1f..fd05162 100644 --- a/src/pycldf/dataset.py +++ b/src/pycldf/dataset.py @@ -1,160 +1,55 @@ +""" +An implementation of a CLDF dataset object. +""" import re -import sys -import json import types -import shutil -import typing +from typing import Union, Optional, Type, Any import logging import pathlib import functools import itertools import collections -import collections.abc +from collections.abc import Generator, Iterable import urllib.parse import urllib.request -import attr import csvw -from csvw.metadata import TableGroup, Table, Column, Link, Schema, is_url, URITemplate +from csvw.metadata import TableGroup, Table, Column, Link, is_url, URITemplate from csvw import datatypes from csvw.dsv import iterrows -from clldutils.path import git_describe, walk -from clldutils.misc import log_or_raise -from clldutils import jsonlib - -from pycldf.sources import Sources -from pycldf.util import pkg_path, resolve_slices, DictTuple, sanitize_url, iter_uritemplates -from pycldf.terms import term_uri, Terms, TERMS, get_column_names, URL as TERMS_URL -from pycldf.validators import VALIDATORS +from clldutils.path import walk + +from pycldf.module import get_module_impl, get_modules +from pycldf.sources import Sources, Source +from pycldf.util import ( + pkg_path, DictTuple, iter_uritemplates, MD_SUFFIX, GitRepository, copy_dataset) +from pycldf.sliceutil import multislice_with_split +from pycldf.fileutil import PathType +from pycldf.schemautil import ColSpecType, make_column, make_table, TableType, ColType +from pycldf.constraints import add_foreign_key, add_auto_constraints +from pycldf.terms import term_uri, Terms, TERMS, get_column_names, sniff +from pycldf import validators as validation +from pycldf.stats import get_table_stats from pycldf import orm +assert get_modules # For backwards compatibility with cldfbench. 
+ __all__ = [ 'Dataset', 'Generic', 'Wordlist', 'ParallelText', 'Dictionary', 'StructureDataset', - 'TextCorpus', 'iter_datasets', 'sniff', 'SchemaError', 'ComponentWithValidation'] + 'TextCorpus', 'iter_datasets', 'sniff', 'SchemaError'] -MD_SUFFIX = '-metadata.json' ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()} -TableType = typing.Union[str, Table] -ColType = typing.Union[str, Column] -ColSpecType = typing.Union[str, dict, Column] -PathType = typing.Union[str, pathlib.Path] -TableSpecType = typing.Union[str, Link, Table] -ColSPecType = typing.Union[str, Column] -SchemaObjectType = typing.Union[TableSpecType, typing.Tuple[TableSpecType, ColSPecType]] +TableSpecType = Union[str, Link, Table] +SchemaObjectType = Union[TableSpecType, tuple[TableSpecType, ColType]] +ODict = collections.OrderedDict +RowType = ODict[str, Any] class SchemaError(KeyError): - pass - - -@attr.s -class Module: - """ - Class representing a CLDF Module. - - .. seealso:: https://github.com/cldf/cldf/blob/master/README.md#cldf-modules - """ - uri = attr.ib(validator=attr.validators.in_([t.uri for t in TERMS.classes.values()])) - fname = attr.ib() - cls = attr.ib(default=None) - - @property - def id(self) -> str: - """ - The local part of the term URI is interpreted as Module identifier. - """ - return self.uri.split('#')[1] - - def match(self, thing) -> bool: - if isinstance(thing, TableGroup): - return thing.common_props.get('dc:conformsTo') == term_uri(self.id) - if hasattr(thing, 'name'): - return thing.name == self.fname - return False + """Schema objects can be accessed using `Dataset.__getitem__`.""" -_modules = [] - - -def get_modules() -> typing.List[Module]: - """ - We read supported CLDF modules from the default metadata files distributed with `pycldf`. - """ - global _modules - if not _modules: - ds = sys.modules[__name__] - for p in pkg_path('modules').glob('*{0}'.format(MD_SUFFIX)): - tg = TableGroup.from_file(p) - mod = Module( - tg.common_props['dc:conformsTo'], - tg.tables[0].url.string if tg.tables else None) - mod.cls = getattr(ds, mod.id) - _modules.append(mod) - # prefer Wordlist over ParallelText (forms.csv) - _modules = sorted( - _modules, - key=lambda m: (m.cls in (Wordlist, ParallelText), m.cls is ParallelText)) - return _modules - - -def make_column(spec: ColSpecType) -> Column: - """ - Create a `Column` instance from `spec`. - - .. code-block:: python - - >>> make_column('id').name - 'id' - >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name - 'ID' - >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base - 'boolean' - >>> type(make_column(make_column('id'))) - - """ - if isinstance(spec, str): - if spec in TERMS.by_uri: - return TERMS.by_uri[spec].to_column() - return Column(name=spec, datatype='string') - if isinstance(spec, dict): - return Column.fromvalue(spec) - if isinstance(spec, Column): - return spec - raise TypeError(spec) - - -class GitRepository: - """ - CLDF datasets are often created from data curated in git repositories. If this is the case, we - exploit this to provide better provenance information in the dataset's metadata. - """ - def __init__(self, - url: str, - clone: typing.Optional[typing.Union[str, pathlib.Path]] = None, - version: typing.Optional[str] = None, - **dc): - # We remove credentials from the URL immediately to make sure this isn't leaked into - # CLDF metadata. Such credentials might be present in URLs read via gitpython from - # remotes. 
- self.url = sanitize_url(url) - self.clone = clone - self.version = version - self.dc = dc - - def json_ld(self) -> typing.Dict[str, str]: - res = collections.OrderedDict([ - ('rdf:about', self.url), - ('rdf:type', 'prov:Entity'), - ]) - if self.version: - res['dc:created'] = self.version - elif self.clone: - res['dc:created'] = git_describe(self.clone) - res.update({'dc:{0}'.format(k): self.dc[k] for k in sorted(self.dc)}) - return res - - -class Dataset: +class Dataset: # pylint: disable=too-many-public-methods """ API to access a CLDF dataset. """ @@ -168,7 +63,7 @@ def __init__(self, tablegroup: csvw.TableGroup): - :meth:`~pycldf.dataset.Dataset.from_metadata` - :meth:`~pycldf.dataset.Dataset.from_data` """ - self.tablegroup = tablegroup + self.tablegroup: csvw.TableGroup = tablegroup self.auto_constraints() self._sources = None self._objects = collections.defaultdict(collections.OrderedDict) @@ -177,6 +72,7 @@ def __init__(self, tablegroup: csvw.TableGroup): @property def sources(self) -> Sources: + """The sources.""" # We load sources only the first time they are accessed, because for datasets like # Glottolog - with 40MB zipped BibTeX - this may take ~90secs. if self._sources is None: @@ -189,9 +85,7 @@ def sources(self, obj: Sources): raise TypeError('Invalid type for Dataset.sources') self._sources = obj - # - # Factory methods to create `Dataset` instances. - # + # Factory methods to create `Dataset` instances. ----------------------------------------------- @classmethod def in_dir(cls, d: PathType, empty_tables: bool = False) -> 'Dataset': """ @@ -226,11 +120,11 @@ def from_metadata(cls, fname: PathType) -> 'Dataset': else: fname = pathlib.Path(fname) if fname.is_dir(): - name = '{0}{1}'.format(cls.__name__, MD_SUFFIX) + name = f'{cls.__name__}{MD_SUFFIX}' tablegroup = TableGroup.from_file(pkg_path('modules', name)) # adapt the path of the metadata file such that paths to tables are resolved # correctly: - tablegroup._fname = fname.joinpath(name) + tablegroup._fname = fname.joinpath(name) # pylint: disable=W0212 else: tablegroup = TableGroup.from_file(fname) @@ -243,11 +137,11 @@ def from_metadata(cls, fname: PathType) -> 'Dataset': except ValueError: pass if comps and comps.most_common(1)[0][1] > 1: - raise ValueError('{0}: duplicate components!'.format(fname)) + raise ValueError(f'{fname}: duplicate components!') - for mod in get_modules(): - if mod.match(tablegroup): - return mod.cls(tablegroup) + impl = get_module_impl(Dataset, tablegroup) + if impl: + return impl(tablegroup) return cls(tablegroup) @classmethod @@ -264,38 +158,38 @@ def from_data(cls, fname: PathType) -> 'Dataset': if not colnames: raise ValueError('empty data file!') if cls is Dataset: - try: - cls = next(mod.cls for mod in get_modules() if mod.match(fname)) - except StopIteration: - raise ValueError('{0} does not match a CLDF module spec'.format(fname)) - assert issubclass(cls, Dataset) and cls is not Dataset - - res = cls.from_metadata(fname.parent) + impl = get_module_impl(Dataset, fname.name) + if impl is None: + raise ValueError(f'{fname} does not match a CLDF module spec') + res = impl.from_metadata(fname.parent) + else: + res = cls.from_metadata(fname.parent) required_cols = { c.name for c in res[res.primary_table].tableSchema.columns if c.required} if not required_cols.issubset(colnames): - raise ValueError('missing columns: %r' % sorted(required_cols.difference(colnames))) + raise ValueError(f'missing columns: {sorted(required_cols.difference(colnames))}') return res - # - # Accessing dataset 
metadata - # + # Accessing dataset metadata ------------------------------------------------------------------- @property - def directory(self) -> typing.Union[str, pathlib.Path]: + def directory(self) -> PathType: """ :return: The location of the metadata file. Either a local directory as `pathlib.Path` or \ a URL as `str`. """ - return self.tablegroup._fname.parent if self.tablegroup._fname else self.tablegroup.base + if self.tablegroup._fname: # pylint: disable=W0212 + return self.tablegroup._fname.parent # pylint: disable=W0212 + return self.tablegroup.base @property def filename(self) -> str: """ :return: The name of the metadata file. """ - return self.tablegroup._fname.name if self.tablegroup._fname else \ - pathlib.Path(urllib.parse.urlparse(self.tablegroup.base).path).name + if self.tablegroup._fname: # pylint: disable=W0212 + return self.tablegroup._fname.name # pylint: disable=W0212 + return pathlib.Path(urllib.parse.urlparse(self.tablegroup.base).path).name @property def module(self) -> str: @@ -306,13 +200,15 @@ def module(self) -> str: @property def version(self) -> str: + """The CLDF version.""" return self.properties['dc:conformsTo'].split('/')[3] def __repr__(self) -> str: - return '' % (self.version, self.module, self.directory) + return f'' @property def metadata_dict(self) -> dict: + """The TableGroup instance as dict.""" return self.tablegroup.asdict(omit_defaults=False) @property @@ -323,7 +219,7 @@ def properties(self) -> dict: return self.tablegroup.common_props @property - def bibpath(self) -> typing.Union[str, pathlib.Path]: + def bibpath(self) -> PathType: """ :return: Location of the sources BibTeX file. Either a URL (`str`) or a local path \ (`pathlib.Path`). @@ -343,18 +239,16 @@ def bibname(self) -> str: return pathlib.Path(urllib.parse.urlparse(self.bibpath).path).name return self.bibpath.name - # - # Accessing schema objects (components, tables, columns, foreign keys) - # + # Accessing schema objects (components, tables, columns, foreign keys) ------------------------- @property - def tables(self) -> typing.List[Table]: + def tables(self) -> list[Table]: """ :return: All tables defined in the dataset. """ return self.tablegroup.tables @property - def components(self) -> typing.Dict[str, csvw.Table]: + def components(self) -> collections.OrderedDict[str, csvw.Table]: """ :return: Mapping of component name to table objects as defined in the dataset. 
""" @@ -370,26 +264,28 @@ def components(self) -> typing.Dict[str, csvw.Table]: return res @staticmethod - def get_tabletype(table) -> typing.Union[str, None]: + def get_tabletype(table) -> Optional[str]: + """Return the table type, aka component name, of the table.""" if table.common_props.get('dc:conformsTo', '') is None: return None if '#' in table.common_props.get('dc:conformsTo', ''): res = table.common_props['dc:conformsTo'].split('#')[1] if res in TERMS: return res - raise ValueError("Type {:} of table {:} is not a valid term.".format( - table.common_props.get('dc:conformsTo'), - table.url)) + raise ValueError( + f"Type {table.common_props.get('dc:conformsTo')} of table {table.url} is invalid.") @property - def primary_table(self) -> typing.Union[str, None]: + def primary_table(self) -> Optional[str]: + """Returns the primary table for the dataset.""" if self.tables: try: return self.get_tabletype(self.tables[0]) except ValueError: - return None + pass + return None - def __getitem__(self, item: SchemaObjectType) -> typing.Union[csvw.Table, csvw.Column]: + def __getitem__(self, item: SchemaObjectType) -> Union[csvw.Table, csvw.Column]: """ Access to tables and columns. @@ -422,37 +318,32 @@ def __getitem__(self, item: SchemaObjectType) -> typing.Union[csvw.Table, csvw.C if isinstance(table, Link): table = table.string - if not isinstance(table, Table): - uri = term_uri(table, terms=TERMS.by_uri) - for t in self.tables: - if (uri and t.common_props.get('dc:conformsTo') == uri) \ - or t.url.string == table: - break - else: - raise SchemaError('Dataset has no table "{}"'.format(table)) - else: - if any(table is tt for tt in self.tables): - t = table - else: - raise SchemaError('Dataset has no table "{}"'.format(table)) - + t = self._get_table(table) if not column: return t if isinstance(column, Column): if any(column is c for c in t.tableSchema.columns): return column - else: - raise SchemaError('Dataset has no column "{}" in table "{}"'.format( - column.name, t.url)) + raise SchemaError(f'Dataset has no column "{column.name}" in table "{t.url}"') uri = term_uri(column, terms=TERMS.by_uri) for c in t.tableSchema.columns: - if ((c.propertyUrl and (c.propertyUrl.uri == uri or c.propertyUrl.uri == column)) - or c.header == column): # noqa: W503 + if ((c.propertyUrl and (c.propertyUrl.uri in (uri, column))) or c.header == column): return c - raise SchemaError('Dataset has no column "{}" in table "{}"'.format(column, t.url)) + raise SchemaError(f'Dataset has no column "{column}" in table "{t.url}"') + + def _get_table(self, table: TableType) -> Table: + if not isinstance(table, Table): + uri = term_uri(table, terms=TERMS.by_uri) + for t in self.tables: + if (uri and t.common_props.get('dc:conformsTo') == uri) or t.url.string == table: + return t + raise SchemaError(f'Dataset has no table "{table}"') + if any(table is tt for tt in self.tables): + return table + raise SchemaError(f'Dataset has no table "{table}"') def __delitem__(self, item: SchemaObjectType): """ @@ -474,9 +365,7 @@ def __contains__(self, item: SchemaObjectType) -> bool: """ return bool(self.get(item)) - def get(self, - item: SchemaObjectType, - default=None) -> typing.Union[csvw.Table, csvw.Column, None]: + def get(self, item: SchemaObjectType, default=None) -> Union[csvw.Table, csvw.Column, None]: """ Acts like `dict.get`. 
@@ -487,8 +376,9 @@ def get(self, except SchemaError: return default - def get_foreign_key_reference(self, table: TableType, column: ColType) \ - -> typing.Union[typing.Tuple[csvw.Table, csvw.Column], None]: + def get_foreign_key_reference( + self, table: TableType, column: ColType, + ) -> Optional[tuple[csvw.Table, csvw.Column]]: """ Retrieve the reference of a foreign key constraint for the specified column. @@ -503,6 +393,7 @@ def get_foreign_key_reference(self, table: TableType, column: ColType) \ if len(fk.columnReference) == 1 and fk.columnReference[0] == column.name: return self[fk.reference.resource], \ self[fk.reference.resource, fk.reference.columnReference[0]] + return None @property def column_names(self) -> types.SimpleNamespace: @@ -528,10 +419,8 @@ def readonly_column_names(self) -> types.SimpleNamespace: """ return get_column_names(self, use_component_names=True, with_multiplicity=True) - # - # Editing dataset metadata or schema - # - def add_provenance(self, **kw): + # Editing dataset metadata or schema ----------------------------------------------------------- + def add_provenance(self, **kw: Any) -> None: """ Add metadata about the dataset's provenance. @@ -545,7 +434,7 @@ def to_json(obj): for k, v in kw.items(): if not k.startswith('prov:'): - k = 'prov:{0}'.format(k) + k = f'prov:{k}' if isinstance(v, (tuple, list)): v = [to_json(vv) for vv in v] else: @@ -560,7 +449,7 @@ def to_json(obj): v = old self.tablegroup.common_props[k] = v - def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table: + def add_table(self, url: str, *cols: ColSpecType, **kw: Any) -> csvw.Table: """ Add a table description to the Dataset. @@ -573,14 +462,16 @@ def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table: """ t = self.add_component({"url": url, "tableSchema": {"columns": []}}, *cols) if 'primaryKey' in kw: - t.tableSchema.primaryKey = attr.fields_dict(Schema)['primaryKey'].converter( - kw.pop('primaryKey')) + pk = kw.pop('primaryKey') + if pk is not None and not isinstance(pk, list): + pk = [pk] + t.tableSchema.primaryKey = pk if kw.get('description'): t.common_props['dc:description'] = kw.pop('description') t.common_props.update(kw) return t - def remove_table(self, table: TableType): + def remove_table(self, table: TableType) -> None: """ Removes the table specified by `table` from the dataset. """ @@ -594,10 +485,7 @@ def remove_table(self, table: TableType): # Now remove the table: self.tablegroup.tables = [t for t in self.tablegroup.tables if t.url != table.url] - def add_component(self, - component: typing.Union[str, dict], - *cols: ColSpecType, - **kw) -> csvw.Table: + def add_component(self, component: Union[str, dict], *cols: ColSpecType, **kw) -> csvw.Table: """ Add a CLDF component to a dataset. @@ -609,11 +497,7 @@ def add_component(self, - `url`: a url property for the table;\ - `description`: a description of the table. 
""" - if isinstance(component, str): - component = jsonlib.load(pkg_path('components', '{0}{1}'.format(component, MD_SUFFIX))) - if isinstance(component, dict): - component = Table.fromvalue(component) - assert isinstance(component, Table) + component = make_table(component) if kw.get('url'): component.url = Link(kw['url']) @@ -639,7 +523,7 @@ def add_component(self, self.tables.append(component) self.add_columns(component, *cols) - component._parent = self.tablegroup + component._parent = self.tablegroup # pylint: disable=W0212 self.auto_constraints(component) return component @@ -654,13 +538,13 @@ def add_columns(self, table: TableType, *cols: ColSpecType) -> None: c.propertyUrl.uri for c in table.tableSchema.columns if c.propertyUrl]) col = make_column(col) if col.name in existing: - raise ValueError('Duplicate column name: {0}'.format(col.name)) + raise ValueError(f'Duplicate column name: {col.name}') if col.propertyUrl and col.propertyUrl.uri in existing: - raise ValueError('Duplicate column property: {0}'.format(col.propertyUrl.uri)) + raise ValueError(f'Duplicate column property: {col.propertyUrl.uri}') table.tableSchema.columns.append(col) self.auto_constraints() - def remove_columns(self, table: TableType, *cols: str): + def remove_columns(self, table: TableType, *cols: ColType) -> None: """ Remove `cols` from `table`'s schema. @@ -683,7 +567,7 @@ def remove_columns(self, table: TableType, *cols: str): table.tableSchema.columns = [c for c in table.tableSchema.columns if str(c) not in cols] - def rename_column(self, table: TableType, col: ColType, name: str): + def rename_column(self, table: TableType, col: ColType, name: str) -> None: """ Assign a new `name` to an existing column, cascading this change to foreign keys. @@ -724,7 +608,8 @@ def add_foreign_key( foreign_t: TableType, foreign_c: ColType, primary_t: TableType, - primary_c: typing.Optional[ColType] = None): + primary_c: Optional[ColType] = None, + ) -> None: """ Add a foreign key constraint. @@ -736,77 +621,18 @@ def add_foreign_key( :param primary_c: Column reference for the linked column - or `None`, in which case the \ primary key of the linked table is assumed. """ - if isinstance(foreign_c, (tuple, list)) or isinstance(primary_c, (tuple, list)): - raise NotImplementedError('composite keys are not supported') - - foreign_t = self[foreign_t] - primary_t = self[primary_t] - if not primary_c: - primary_c = primary_t.tableSchema.primaryKey - else: - primary_c = self[primary_t, primary_c].name - foreign_t.add_foreign_key(self[foreign_t, foreign_c].name, primary_t.url.string, primary_c) + return add_foreign_key(self, foreign_t, foreign_c, primary_t, primary_c) - def auto_constraints(self, component=None): + def auto_constraints(self, component: Optional[TableType] = None): """ - Use CLDF reference properties to implicitely create foreign key constraints. + Use CLDF reference properties to implicitly create foreign key constraints. :param component: A Table object or `None`. """ - if not component: - for table in self.tables: - self.auto_constraints(table) - return - - if not component.tableSchema.primaryKey: - idcol = component.get_column(term_uri('id')) - if idcol: - component.tableSchema.primaryKey = [idcol.name] - - self._auto_foreign_keys(component) - - try: - table_type = self.get_tabletype(component) - except ValueError: - table_type = None + return add_auto_constraints(self, component) - if table_type is None: - # New component is not a known CLDF term, so cannot add components - # automatically. 
TODO: We might me able to infer some based on - # `xxxReference` column properties? - return - - # auto-add foreign keys targeting the new component: - for table in self.tables: - self._auto_foreign_keys(table, component=component, table_type=table_type) - - def _auto_foreign_keys(self, table, component=None, table_type=None): - assert (component is None) == (table_type is None) - for col in table.tableSchema.columns: - if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: - ref_name = TERMS.by_uri[col.propertyUrl.uri].references - if (component is None and not ref_name) or \ - (component is not None and ref_name != table_type): - continue - if any(fkey.columnReference == [col.name] - for fkey in table.tableSchema.foreignKeys): - continue - if component is None: - # Let's see whether we have the component this column references: - try: - ref = self[ref_name] - except KeyError: - continue - else: - ref = component - idcol = ref.get_column(term_uri('id')) - table.add_foreign_key( - col.name, ref.url.string, idcol.name if idcol is not None else 'ID') - - # - # Add data - # - def add_sources(self, *sources, **kw): + # Add data ------------------------------------------------------------------------------------- + def add_sources(self, *sources: Union[str, Source], **kw) -> None: """ Add sources to the dataset. @@ -814,10 +640,8 @@ def add_sources(self, *sources, **kw): """ self.sources.add(*sources, **kw) - # - # Methods to read data - # - def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None, None]: + # Methods to read data ------------------------------------------------------------------------- + def iter_rows(self, table: TableType, *cols: str) -> Generator[RowType, None, None]: """ Iterate rows in a table, resolving CLDF property names to local column names. @@ -833,13 +657,14 @@ def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None item[v] = item[k] yield item - def cached_rows(self, table: TableType) -> list: + def cached_rows(self, table: TableType) -> list[RowType]: + """Return the rows of a table from a cache.""" key = table.local_name if isinstance(table, Table) else table if key not in self._cached_rows: self._cached_rows[key] = list(self.iter_rows(table)) return self._cached_rows[key] - def get_row(self, table: TableType, id_) -> dict: + def get_row(self, table: TableType, id_) -> RowType: """ Retrieve a row specified by table and CLDF id. @@ -851,7 +676,7 @@ def get_row(self, table: TableType, id_) -> dict: return row raise ValueError(id_) # pragma: no cover - def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: + def get_row_url(self, table: TableType, row: Union[RowType, str]) -> Optional[str]: """ Get a URL associated with a row. Tables can specify associated row URLs by @@ -865,7 +690,7 @@ def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: :param row: A row specified by ID or as `dict` as returned when iterating over a table. :return: a `str` representing a URL or `None`. """ - row = row if isinstance(row, dict) else self.get_row(table, row) + row = self.get_row(table, row) if isinstance(row, str) else row id_col = None for col in self[table].tableSchema.columns: if col.datatype and col.datatype.base == datatypes.anyURI.__name__: @@ -875,11 +700,12 @@ def get_row_url(self, table: TableType, row) -> typing.Union[str, None]: if str(col.propertyUrl) == 'http://cldf.clld.org/v1.0/terms.rdf#id': # Otherwise we fall back to looking up the `valueUrl` property on the ID column. 
id_col = col - assert id_col, 'no ID column found in table {}'.format(table) + assert id_col, f'no ID column found in table {table}' if id_col.valueUrl: return id_col.valueUrl.expand(**row) + return None - def objects(self, table: str, cls: typing.Optional[typing.Type] = None) -> DictTuple: + def objects(self, table: str, cls: Optional[Type] = None) -> DictTuple: """ Read data of a CLDF component as :class:`pycldf.orm.Object` instances. @@ -899,7 +725,7 @@ def objects(self, table: str, cls: typing.Optional[typing.Type] = None) -> DictT return DictTuple(self._objects[table].values()) - def get_object(self, table, id_, cls=None, pk=False) -> orm.Object: + def get_object(self, table: str, id_: str, cls=None, pk=False) -> orm.Object: """ Get a row of a component as :class:`pycldf.orm.Object` instance. """ @@ -907,20 +733,17 @@ def get_object(self, table, id_, cls=None, pk=False) -> orm.Object: self.objects(table, cls=cls) return self._objects[table][id_] if not pk else self._objects_by_pk[table][id_] - # - # Methods for writing (meta)data to files: - # - def write_metadata( - self, fname: typing.Optional[typing.Union[str, pathlib.Path]] = None) -> pathlib.Path: + # Methods for writing (meta)data to files: ----------------------------------------------------- + def write_metadata(self, fname: Optional[PathType] = None) -> pathlib.Path: """ Write the CLDF metadata to a JSON file. :fname: Path of a file to write to, or `None` to use the default name and write to \ :meth:`~pycldf.dataset.Dataset.directory`. """ - return self.tablegroup.to_file(fname or self.tablegroup._fname) + return self.tablegroup.to_file(fname or self.tablegroup._fname) # pylint: disable=W0212 - def write_sources(self, zipped: bool = False) -> typing.Union[None, pathlib.Path]: + def write_sources(self, zipped: bool = False) -> Optional[pathlib.Path]: """ Write the sources BibTeX file to :meth:`~pycldf.dataset.Dataset.bibpath` @@ -930,10 +753,12 @@ def write_sources(self, zipped: bool = False) -> typing.Union[None, pathlib.Path """ return self.sources.write(self.bibpath, zipped=zipped) - def write(self, - fname: typing.Optional[pathlib.Path] = None, - zipped: typing.Optional[typing.Iterable] = None, - **table_items: typing.List[dict]) -> pathlib.Path: + def write( + self, + fname: Optional[pathlib.Path] = None, + zipped: Optional[Iterable] = None, + **table_items: list[RowType] + ) -> pathlib.Path: """ Write metadata, sources and data. Metadata will be written to `fname` (as interpreted in :meth:`pycldf.dataset.Dataset.write_metadata`); data files will be written to the file @@ -955,7 +780,7 @@ def write(self, table.common_props['dc:extent'] = table.write(items, _zipped=table_type in zipped) return self.write_metadata(fname) - def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pathlib.Path: + def copy(self, dest: PathType, mdname: str = None) -> pathlib.Path: """ Copy metadata, data and sources to files in `dest`. @@ -973,54 +798,15 @@ def copy(self, dest: typing.Union[str, pathlib.Path], mdname: str = None) -> pat ... if 'with_examples' in ds.directory.name: ... 
ds.copy('some_directory', mdname='md.json') """ - from pycldf.media import MediaTable - - dest = pathlib.Path(dest) - if not dest.exists(): - dest.mkdir(parents=True) - - from_url = is_url(self.tablegroup.base) - ds = Dataset.from_metadata(self.tablegroup.base if from_url else self.tablegroup._fname) + return copy_dataset(self, dest, mdname) - _getter = urllib.request.urlretrieve if from_url else shutil.copy - try: - _getter(self.bibpath, dest / self.bibname) - ds.properties['dc:source'] = self.bibname - except: # pragma: no cover # noqa - # Sources are optional - pass - - for table in ds.tables: - fname = table.url.resolve(table.base) - name = pathlib.Path(urllib.parse.urlparse(fname).path).name if from_url else fname.name - _getter(fname, dest / name) - table.url = Link(name) - - for fk in table.tableSchema.foreignKeys: - fk.reference.resource = Link(pathlib.Path(fk.reference.resource.string).name) - mdpath = dest.joinpath( - mdname or # noqa: W504 - (self.tablegroup.base.split('/')[-1] if from_url else self.tablegroup._fname.name)) - if 'MediaTable' in self: - for f in MediaTable(self): - if f.scheme == 'file': - if f.local_path().exists(): - target = dest / urllib.parse.unquote(f.relpath) - target.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(f.local_path(), target) - if from_url: - del ds.tablegroup.at_props['base'] # pragma: no cover - ds.write_metadata(fname=mdpath) - return mdpath - - # - # Reporting - # + # Reporting ------------------------------------------------------------------------------------ def validate( self, log: logging.Logger = None, - validators: typing.List[typing.Tuple[str, str, callable]] = None, - ontology_path=None) -> bool: + validators: list[tuple[Optional[str], str, validation.RowValidatorType]] = None, + ontology_path: Optional[PathType] = None, + ) -> bool: """ Validate schema and data of a `Dataset`: @@ -1034,160 +820,20 @@ def validate( :raises ValueError: if a validation error is encountered (and `log` is `None`). :return: Flag signaling whether schema and data are valid. """ - # We must import components with custom validation to make sure they can be detected as - # subclasses of ComponentWithValidation. - from pycldf.media import MediaTable - from pycldf.trees import TreeTable - - assert MediaTable and TreeTable - - terms = Terms(ontology_path) or TERMS - validators = validators or [] - validators.extend(VALIDATORS) - success = True - default_tg = TableGroup.from_file( - pkg_path('modules', '{0}{1}'.format(self.module, MD_SUFFIX))) - # - # Make sure, all required tables and columns are present and consistent. 
- # - for default_table in default_tg.tables: - dtable_uri = default_table.common_props['dc:conformsTo'] - try: - table = self[dtable_uri] - except KeyError: - success = False - log_or_raise('{0} requires {1}'.format(self.module, dtable_uri), log=log) - table = None - - if table: - default_cols = {c.propertyUrl.uri: c for c in default_table.tableSchema.columns} - required_default_cols = { - c.propertyUrl.uri for c in default_table.tableSchema.columns - if c.required or c.common_props.get('dc:isRequiredBy')} - cols = { - c.propertyUrl.uri: c for c in table.tableSchema.columns - if c.propertyUrl} - table_uri = table.common_props['dc:conformsTo'] - for col in required_default_cols - set(cols.keys()): - success = False - log_or_raise('{0} requires column {1}'.format(table_uri, col), log=log) - for uri, col in cols.items(): - default = default_cols.get(uri) - if default: - cardinality = default.common_props.get('dc:extent') - if not cardinality: - cardinality = terms.by_uri[uri].cardinality - if (cardinality == 'multivalued' and not col.separator) or \ - (cardinality == 'singlevalued' and col.separator): - success = False - log_or_raise('{} {} must be {}'.format( - table_uri, uri, cardinality), log=log) + return validation.validate( + dataset=self, + terms=Terms(ontology_path) or TERMS, + log=log, + row_validators=validators or [], + ) - for table in self.tables: - vars = set(col.name for col in table.tableSchema.columns) - for obj, prop, tmpl in iter_uritemplates(table): - if not {n for n in tmpl.variable_names if not n.startswith('_')}.issubset(vars): - if log: - log.warning('Unknown variables in URI template: {}:{}:{}'.format( - obj, prop, tmpl)) - - type_uri = table.common_props.get('dc:conformsTo') - if type_uri: - try: - terms.is_cldf_uri(type_uri) - except ValueError: - success = False - log_or_raise('invalid CLDF URI: {0}'.format(type_uri), log=log) - - if not table.tableSchema.primaryKey: - if log: - log.warning('Table without primary key: {0} - {1}'.format( - table.url, - 'This may cause problems with "cldf createdb"')) - elif len(table.tableSchema.primaryKey) > 1: - if log: - log.warning('Table with composite primary key: {0} - {1}'.format( - table.url, - 'This may cause problems with "cldf createdb"')) - - # FIXME: check whether table.common_props['dc:conformsTo'] is in validators! - validators_, propertyUrls, colnames = [], set(), set() - for col in table.tableSchema.columns: - if col.header in colnames: # pragma: no cover - success = False - log_or_raise( - 'Duplicate column name in table schema: {} {}'.format( - table.url, col.header), - log=log) - colnames.add(col.header) - if col.propertyUrl: - col_uri = col.propertyUrl.uri - try: - terms.is_cldf_uri(col_uri) - if col_uri in propertyUrls: # pragma: no cover - success = False - log_or_raise( - 'Duplicate CLDF property in table schema: {} {}'.format( - table.url, col_uri), - log=log) - propertyUrls.add(col_uri) - except ValueError: - success = False - log_or_raise('invalid CLDF URI: {0}'.format(col_uri), log=log) - for table_, col_, v_ in validators: - if (not table_ or table is self.get(table_)) and col is self.get((table, col_)): - validators_.append((col, v_)) - - fname = pathlib.Path(table.url.resolve(table._parent.base)) - fexists = fname.exists() - if (not fexists) and fname.parent.joinpath('{}.zip'.format(fname.name)).exists(): - if log: - log.info('Reading data from zipped table: {}.zip'.format(fname)) - fexists = True # csvw already handles this case, no need to adapt paths. 
- if is_url(table.url.resolve(table._parent.base)) or fexists: - for fname, lineno, row in table.iterdicts(log=log, with_metadata=True): - for col, validate in validators_: - try: - validate(self, table, col, row) - except ValueError as e: - success = False - log_or_raise( - '{0}:{1}:{2} {3}'.format(fname.name, lineno, col.name, e), - log=log) - if not table.check_primary_key(log=log): - success = False - else: - success = False - log_or_raise('{0} does not exist'.format(fname), log=log) - - if not self.tablegroup.check_referential_integrity(log=log): - success = False - - for cls in ComponentWithValidation.__subclasses__(): - if cls.__name__ in self: - success = cls(self).validate(success, log=log) - - return success - - def stats(self, exact: bool = False) -> typing.List[typing.Tuple[str, str, int]]: + def stats(self, exact: bool = False) -> list[tuple[str, str, int]]: """ Compute summary statistics for the dataset. - :return: List of triples (table, type, rowcount). + :return: List of triples (filename, component, rowcount). """ - res = [] - for table in self.tables: - dctype = table.common_props.get('dc:conformsTo') - if dctype and '#' in dctype and dctype.split('#')[1] in TERMS: - dctype = TERMS[dctype.split('#')[1]].csvw_prop('name') - res.append(( - table.url.string, - dctype, - sum(1 for _ in table) if (exact or 'dc:extent' not in table.common_props) - else int(table.common_props.get('dc:extent')))) - if self.sources: - res.append((self.bibname, 'Sources', len(self.sources))) - return res + return get_table_stats(self, exact) class Generic(Dataset): @@ -1197,7 +843,7 @@ class Generic(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> None: # pylint: disable=missing-function-docstring return None @@ -1208,10 +854,11 @@ class Wordlist(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'FormTable' - def get_segments(self, row, table='FormTable') -> typing.List[str]: + def get_segments(self, row: RowType, table='FormTable') -> list[str]: + """Retrieve the list of segments of a form.""" col = self[table].get_column("http://cldf.clld.org/v1.0/terms.rdf#segments") sounds = row[col.name] if isinstance(sounds, str): @@ -1219,41 +866,40 @@ def get_segments(self, row, table='FormTable') -> typing.List[str]: sounds = [sounds] return list(itertools.chain(*[s.split() for s in sounds])) - def get_subsequence(self, cognate: dict, form=None) -> typing.List[str]: + def get_subsequence(self, cognate: RowType, form: Optional[str] = None) -> list[str]: """ Compute the subsequence of the morphemes of a form which is specified in a partial cognate assignment. :param cognate: A `dict` holding the data of a row from a `CognateTable`. 
""" - return resolve_slices( - cognate, - self, - ('CognateTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"), - ('FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"), - 'Form_ID', - target_row=form) + target_row = form or self.get_row('FormTable', cognate['Form_ID']) + return multislice_with_split( + target_row[self['FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"].name], + cognate[self['CognateTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"].name], + ) class ParallelText(Dataset): + """Implements the CLDF ParallelText module.""" @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'FormTable' def get_equivalent(self, functional_equivalent, form=None): - return resolve_slices( - functional_equivalent, - self, - ('FunctionalEquivalentTable', - "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"), - ('FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"), - 'Form_ID', - target_row=form) + """Get the forms fulfilling an equivalent function in the texts.""" + slice_col_name = self[ + 'FunctionalEquivalentTable', "http://cldf.clld.org/v1.0/terms.rdf#segmentSlice"].name + sequence_col_name = self['FormTable', "http://cldf.clld.org/v1.0/terms.rdf#segments"].name + target_row = form or self.get_row('FormTable', functional_equivalent['Form_ID']) + return multislice_with_split( + target_row[sequence_col_name], functional_equivalent[slice_col_name]) class Dictionary(Dataset): + """Implements the CLDF Dictionary module.""" @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'EntryTable' @@ -1264,7 +910,7 @@ class StructureDataset(Dataset): .. seealso:: ``_ """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'ValueTable' @functools.cached_property @@ -1296,21 +942,26 @@ class TextCorpus(Dataset): [] """ @property - def primary_table(self): + def primary_table(self) -> str: # pylint: disable=missing-function-docstring return 'ExampleTable' @functools.cached_property - def texts(self) -> typing.Union[None, DictTuple]: + def texts(self) -> Optional[DictTuple]: + """Retrieve texts.""" # Some syntactic sugar to access the ORM data in a concise and meaningful way. if 'ContributionTable' in self: return self.objects('ContributionTable') + return None # pragma: no cover - def get_text(self, tid): + def get_text(self, tid: str) -> Optional[orm.Object]: + """Retrieve a text by ID.""" if 'ContributionTable' in self: return self.get_object('ContributionTable', tid) + return None # pragma: no cover @property - def sentences(self) -> typing.List[orm.Example]: + def sentences(self) -> list[orm.Example]: + """Sentences of the corpus.""" res = list(self.objects('ExampleTable')) if ('ExampleTable', 'exampleReference') in self: # Filter out alternative translations! @@ -1320,47 +971,7 @@ def sentences(self) -> typing.List[orm.Example]: return res # pragma: no cover -class ComponentWithValidation: - """ - A virtual base class for custom, component-centered validation. - """ - def __init__(self, ds: Dataset): - self.ds = ds - self.component = self.__class__.__name__ - self.table = ds[self.component] - - def validate(self, success: bool = True, log: logging.Logger = None) -> bool: - return success # pragma: no cover - - -def sniff(p: pathlib.Path) -> bool: - """ - Determine whether a file contains CLDF metadata. 
- - :param p: `pathlib.Path` object for an existing file. - :return: `True` if the file contains CLDF metadata, `False` otherwise. - """ - if not p.is_file(): # pragma: no cover - return False - try: - with p.open('rb') as fp: - c = fp.read(10) - try: - c = c.decode('utf8').strip() - except UnicodeDecodeError: - return False - if not c.startswith('{'): - return False - except (FileNotFoundError, OSError): # pragma: no cover - return False - try: - d = jsonlib.load(p) - except json.decoder.JSONDecodeError: - return False - return d.get('dc:conformsTo', '').startswith(TERMS_URL) - - -def iter_datasets(d: pathlib.Path) -> typing.Generator[Dataset, None, None]: +def iter_datasets(d: PathType) -> Generator[Dataset, None, None]: """ Discover CLDF datasets - by identifying metadata files - in a directory. @@ -1372,5 +983,4 @@ def iter_datasets(d: pathlib.Path) -> typing.Generator[Dataset, None, None]: try: yield Dataset.from_metadata(p) except ValueError as e: - logging.getLogger(__name__).warning( - "Reading {} failed: {}".format(p, e)) + logging.getLogger(__name__).warning("Reading %s failed: %s", p, e) diff --git a/src/pycldf/db.py b/src/pycldf/db.py index 36d0565..ac4404e 100644 --- a/src/pycldf/db.py +++ b/src/pycldf/db.py @@ -39,20 +39,24 @@ FOREIGN KEY(`custom.csv_id`) REFERENCES `custom.csv`(`id`) ON DELETE CASCADE ); """ -import typing +from typing import Optional, Any, Callable, Protocol, TYPE_CHECKING import inspect import pathlib import sqlite3 import functools import collections +import dataclasses -import attr import csvw import csvw.db +from csvw.db import ColSpec, TableSpec +from csvw.metadata import Table as CSVWTable from pycldf.terms import TERMS from pycldf.sources import Reference, Sources, Source -from pycldf import Dataset + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover __all__ = ['Database', 'query'] @@ -87,16 +91,19 @@ ] -@attr.s -class TableTranslation(object): +@dataclasses.dataclass +class TableTranslation: """ Specifies column name translations for a table. """ - name = attr.ib(default=None) - columns = attr.ib(default=attr.Factory(dict)) + name: str = None + columns: dict[str, str] = dataclasses.field(default_factory=dict) + +TranslationDict = dict[str, TableTranslation] -def translate(d: typing.Dict[str, TableTranslation], table: str, col=None) -> str: + +def translate(d: TranslationDict, table: str, col: str = None) -> str: """ Translate a db object name. @@ -124,7 +131,7 @@ def translate(d: typing.Dict[str, TableTranslation], table: str, col=None) -> st # 2. Since regular table names may contain underscores as well, we try to find the longest # concatenation of _-separated name parts which appears in the translation dict. # 3. We repeat step 2 until all name parts have been consumed. - def t(n): + def t_(n): if n in d: return d[n].name or n tables, comps = [], n.split('_') @@ -142,10 +149,10 @@ def t(n): tables.append(d[comps[0]].name or comps[0] if comps[0] in d else comps[0]) return '_'.join(tables) - return t(table) + return t_(table) -def clean_bibtex_key(s): +def clean_bibtex_key(s: str) -> str: # pylint: disable=C0116 return s.replace('-', '_').lower() @@ -158,23 +165,74 @@ class Database(csvw.db.Database): """ source_table_name = 'SourceTable' - def __init__(self, dataset: Dataset, **kw): + def __init__(self, dataset: 'Dataset', **kw): """ :param dataset: The :class:`Dataset` instance from which to derive the database schema. 
""" - self.dataset = dataset + self.dataset: 'Dataset' = dataset self._retranslate = collections.defaultdict(dict) self._source_cols = ['id', 'genre'] + BIBTEX_FIELDS # Source items can be referenced with case insensitive keys. So we store a mapping from # lowercase keys to the ones actually used in the source BibTeX. self._source_map = {} - infer_primary_keys = kw.pop('infer_primary_keys', False) - # We create a derived TableGroup, adding a table for the sources. tg = csvw.TableGroup.fromvalue(dataset.metadata_dict) # Assemble the translation function: + translations: TranslationDict = self._get_translations(dataset) + + # Add source table: + for src in self.dataset.sources: + for key in src: + key = clean_bibtex_key(key) + if key not in self._source_cols: + self._source_cols.append(key) + + tg.tables.append(csvw.Table.fromvalue({ + 'url': self.source_table_name, + 'tableSchema': {'columns': [{'name': n} for n in self._source_cols], 'primaryKey': 'id'} + })) + tg.tables[-1]._parent = tg + + # Add foreign keys to source table: + infer_primary_keys = kw.pop('infer_primary_keys', False) + for table in tg.tables[:-1]: + self._add_fk_to_sources(table, infer_primary_keys, translations) + + # Make sure `base` directory can be resolved: + tg._fname = dataset.tablegroup._fname + csvw.db.Database.__init__( + self, tg, translate=functools.partial(translate, translations), **kw) + + def _add_fk_to_sources( + self, + table: CSVWTable, + infer_primary_keys: bool, + translations: TranslationDict, + ): + if not table.tableSchema.primaryKey and infer_primary_keys: + for col in table.tableSchema.columns: + if col.name.lower() in PRIMARY_KEY_NAMES: + table.tableSchema.primaryKey = [col.name] + break + for col in table.tableSchema.columns: + if col.propertyUrl and col.propertyUrl.uri == TERMS['source'].uri: + table.tableSchema.foreignKeys.append(csvw.ForeignKey.fromdict({ + 'columnReference': [col.header], + 'reference': {'resource': self.source_table_name, 'columnReference': 'id'} + })) + if translations[table.local_name].name: + tl = translations[table.local_name] + translations[f'{table.local_name}_{self.source_table_name}'] = \ + TableTranslation( + name=f'{tl.name}_{self.source_table_name}', + columns={ + f'{table.local_name}_{table.tableSchema.primaryKey[0]}': + f'{tl.name}_{tl.columns[table.tableSchema.primaryKey[0]]}'}) + break + + def _get_translations(self, dataset: 'Dataset') -> TranslationDict: translations = {} for table in dataset.tables: translations[table.local_name] = TableTranslation() @@ -191,7 +249,7 @@ def __init__(self, dataset: Dataset, **kw): if col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri: # Translate local column names to local names of CLDF Ontology terms, prefixed # with `cldf_`: - col_name = 'cldf_{0.name}'.format(TERMS.by_uri[col.propertyUrl.uri]) + col_name = f'cldf_{TERMS.by_uri[col.propertyUrl.uri].name}' new_col_names.append(col_name.lower()) translations[table.local_name].columns[col.header] = col_name self._retranslate[table.local_name][col_name] = col.header @@ -200,60 +258,12 @@ def __init__(self, dataset: Dataset, **kw): if not (col.propertyUrl and col.propertyUrl.uri in TERMS.by_uri): if col.header.lower() in new_col_names: # A name clash! We translate the old column name! 
- col_name = '_{}'.format(col.header) + col_name = f'_{col.header}' translations[table.local_name].columns[col.header] = col_name self._retranslate[table.local_name][col_name] = col.header + return translations - # Add source table: - for src in self.dataset.sources: - for key in src: - key = clean_bibtex_key(key) - if key not in self._source_cols: - self._source_cols.append(key) - - tg.tables.append(csvw.Table.fromvalue({ - 'url': self.source_table_name, - 'tableSchema': { - 'columns': [dict(name=n) for n in self._source_cols], - 'primaryKey': 'id' - } - })) - tg.tables[-1]._parent = tg - - # Add foreign keys to source table: - for table in tg.tables[:-1]: - if not table.tableSchema.primaryKey and infer_primary_keys: - for col in table.tableSchema.columns: - if col.name.lower() in PRIMARY_KEY_NAMES: - table.tableSchema.primaryKey = [col.name] - break - for col in table.tableSchema.columns: - if col.propertyUrl and col.propertyUrl.uri == TERMS['source'].uri: - table.tableSchema.foreignKeys.append(csvw.ForeignKey.fromdict({ - 'columnReference': [col.header], - 'reference': { - 'resource': self.source_table_name, - 'columnReference': 'id' - } - })) - if translations[table.local_name].name: - tl = translations[table.local_name] - translations['{0}_{1}'.format(table.local_name, self.source_table_name)] = \ - TableTranslation( - name='{0}_{1}'.format(tl.name, self.source_table_name), - columns={'{0}_{1}'.format( - table.local_name, table.tableSchema.primaryKey[0], - ): '{0}_{1}'.format( - tl.name, tl.columns[table.tableSchema.primaryKey[0]], - )}) - break - - # Make sure `base` directory can be resolved: - tg._fname = dataset.tablegroup._fname - csvw.db.Database.__init__( - self, tg, translate=functools.partial(translate, translations), **kw) - - def association_table_context(self, table, column, fkey): + def association_table_context(self, table: TableSpec, column: ColSpec, fkey: str): if self.translate(table.name, column) == 'cldf_source': # We decompose references into the source ID and optional pages. Pages are stored as # `context` of the association table and composed again in `select_many_to_many`. @@ -275,13 +285,13 @@ def association_table_context(self, table, column, fkey): return csvw.db.Database.association_table_context( self, table, column, fkey) # pragma: no cover - def select_many_to_many(self, db, table, context): + def select_many_to_many(self, db, table: TableSpec, context): if table.name.endswith('_' + self.source_table_name): atable = table.name.partition('_' + self.source_table_name)[0] if self.translate(atable, context) == 'cldf_source': # Compose references: res = csvw.db.Database.select_many_to_many(self, db, table, None) - return {k: ['{0}'.format(Reference(*vv)) for vv in v] for k, v in res.items()} + return {k: [f'{Reference(*vv)}' for vv in v] for k, v in res.items()} return csvw.db.Database.select_many_to_many(self, db, table, context) # pragma: no cover def write(self, _force=False, _exists_ok=False, **items): @@ -293,7 +303,8 @@ def write(self, _force=False, _exists_ok=False, **items): return csvw.db.Database.write( self, _force=False, _exists_ok=False, _skip_extra=True, **items) - def write_from_tg(self, _force: bool = False, _exists_ok: bool = False): + def write_from_tg( # pylint: disable=W0221 + self, _force: bool = False, _exists_ok: bool = False): """ Write the data from `self.dataset` to the database. 
""" @@ -309,7 +320,7 @@ def write_from_tg(self, _force: bool = False, _exists_ok: bool = False): self._source_map[src.id.lower()] = src.id return self.write(_force=_force, _exists_ok=_exists_ok, **items) - def query(self, sql: str, params=None) -> list: + def query(self, sql: str, params=None) -> list[Any]: """ Run `sql` on the database, returning the list of results. """ @@ -317,7 +328,7 @@ def query(self, sql: str, params=None) -> list: cu = conn.execute(sql, params or ()) return list(cu.fetchall()) - def retranslate(self, table, item): + def retranslate(self, table: CSVWTable, item): # pylint: disable=C0116 return {self._retranslate.get(table.local_name, {}).get(k, k): v for k, v in item.items()} @staticmethod @@ -373,21 +384,22 @@ def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4) -> return self.dataset.write_metadata(dest / mdname) -class AggregateClass(typing.Protocol): # pragma: no cover - def step(self, value): +class AggregateClass(Protocol): # pragma: no cover # pylint: disable=C0115 + def step(self, value): # pylint: disable=C0116 ... - def finalize(self): + def finalize(self): # pylint: disable=C0116 ... -def query(conn: sqlite3.Connection, - sql: str, - params=None, - functions: typing.Optional[typing.List[typing.Callable]] = None, - aggregates: typing.Optional[typing.List[AggregateClass]] = None, - collations: typing.Optional[typing.List[typing.Callable]] = None) \ - -> typing.Generator[typing.Any, None, None]: +def query( # pylint: disable=R0913,R0917 + conn: sqlite3.Connection, + sql: str, + params=None, + functions: Optional[list[Callable]] = None, + aggregates: Optional[list[AggregateClass]] = None, + collations: Optional[list[Callable]] = None, +) -> list[Any]: """ Note: Passing lambdas or functools.partial objects as function requires passing an explicit name as well. diff --git a/src/pycldf/ext/discovery.py b/src/pycldf/ext/discovery.py index 74fb4bf..35ed69f 100644 --- a/src/pycldf/ext/discovery.py +++ b/src/pycldf/ext/discovery.py @@ -16,7 +16,7 @@ resolver for DOI URLs pointing to the Zenodo archive. """ import re -import typing +from typing import Optional, Union import pathlib import zipfile import warnings @@ -28,7 +28,8 @@ from csvw.utils import is_url from pycldf import Dataset, iter_datasets, sniff -from pycldf.util import url_without_fragment +from pycldf.urlutil import url_without_fragment +from pycldf._compat import entry_points_select __all__ = ['get_dataset', 'DatasetResolver'] EP = 'pycldf_dataset_resolver' @@ -36,7 +37,7 @@ _resolvers = [] -class DatasetResolver: +class DatasetResolver: # pylint: disable=R0903 """ Virtual base class for dataset resolvers. @@ -46,8 +47,11 @@ class DatasetResolver: """ priority = 5 - def __call__(self, loc: str, download_dir: pathlib.Path) \ - -> typing.Union[None, Dataset, pathlib.Path]: + def __call__( + self, + loc: str, + download_dir: pathlib.Path, + ) -> Union[None, Dataset, pathlib.Path]: """ :param loc: URL pointing to a place where datasets are archived. :param download_dir: A directory to which resolvers can download data. @@ -58,43 +62,45 @@ def __call__(self, loc: str, download_dir: pathlib.Path) \ raise NotImplementedError() # pragma: no cover -class LocalResolver(DatasetResolver): +class LocalResolver(DatasetResolver): # pylint: disable=R0903 """ Resolves dataset locators specifying local file paths. 
""" priority = 100 - def __call__(self, loc: str, download_dir, base: typing.Optional[pathlib.Path]) \ - -> typing.Union[None, pathlib.Path]: + def __call__( + self, + loc: str, + download_dir, + base: Optional[pathlib.Path], + ) -> Optional[pathlib.Path]: """ :return: a local path to a directory """ if isinstance(loc, str) and is_url(loc): - return + return None loc = pathlib.Path(loc) if loc.resolve() != loc and base: # A relative path, to be interpreted relative to base loc = base.resolve().joinpath(loc) if loc.exists(): return loc + return None # pragma: no cover -class GenericUrlResolver(DatasetResolver): +class GenericUrlResolver(DatasetResolver): # pylint: disable=R0903 """ URL resolver which works for generic URLs provided they point to a CLDF metadata file. """ priority = -1 - def __call__(self, loc, download_dir): + def __call__(self, loc, download_dir) -> Optional[Dataset]: if is_url(loc): - try: - return Dataset.from_metadata(loc) - except: # noqa: E722 # pragma: no cover - raise - pass + return Dataset.from_metadata(loc) + return None # pragma: no cover -class GitHubResolver(DatasetResolver): +class GitHubResolver(DatasetResolver): # pylint: disable=R0903 """ Resolves dataset locators of the form "https://github.com///tree/", e.g. https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1 @@ -103,55 +109,60 @@ class GitHubResolver(DatasetResolver): """ priority = 3 - def __call__(self, loc, download_dir): + def __call__(self, loc, download_dir) -> Optional[pathlib.Path]: url = urllib.parse.urlparse(loc) - if url.netloc == 'github.com' and re.search(r'/[v\.0-9]+$', url.path): + if url.netloc == 'github.com' and re.search(r'/[v.0-9]+$', url.path): comps = url.path.split('/') - z = download_dir / '{}-{}-{}.zip'.format(comps[1], comps[2], comps[-1]) - url = "https://github.com/{}/{}/archive/refs/tags/{}.zip".format( - comps[1], comps[2], comps[-1]) + z = download_dir / f'{comps[1]}-{comps[2]}-{comps[-1]}.zip' + url = f"https://github.com/{comps[1]}/{comps[2]}/archive/refs/tags/{comps[-1]}.zip" urllib.request.urlretrieve(url, z) - zf = zipfile.ZipFile(z) - dirs = {info.filename.split('/')[0] for info in zf.infolist()} - assert len(dirs) == 1 - zf.extractall(download_dir) + with zipfile.ZipFile(z) as zf: + dirs = {info.filename.split('/')[0] for info in zf.infolist()} + assert len(dirs) == 1 + zf.extractall(download_dir) z.unlink() return download_dir / dirs.pop() + return None class DatasetLocator(str): + """Dataset locators are URLs with identifying information added to the fragment.""" @functools.cached_property - def parsed_url(self) -> urllib.parse.ParseResult: + def parsed_url(self) -> urllib.parse.ParseResult: # pylint: disable=C0116 return urllib.parse.urlparse(self) @property - def url_without_fragment(self): + def url_without_fragment(self): # pylint: disable=C0116 return url_without_fragment(self.parsed_url) - def match(self, dataset: Dataset) -> bool: + def match(self, dataset: Dataset) -> bool: # pylint: disable=C0116 if self.parsed_url.fragment: key, _, value = self.parsed_url.fragment.partition('=') return dataset.properties.get(key) == value if value else key in dataset.properties return True -def get_resolvers(): +def get_resolvers() -> list[type]: + """Register resolvers defined via entry points.""" if not _resolvers: - eps = entry_points() - for ep in set(eps.select(group=EP) if hasattr(eps, 'select') else eps.get(EP, [])): + for ep in set(entry_points_select(entry_points(), EP)): try: _resolvers.append(ep.load()()) except ImportError: # pragma: no cover - 
warnings.warn('ImportError loading entry point {0.name}'.format(ep)) + warnings.warn(f'ImportError loading entry point {ep.name}') continue return sorted(_resolvers, key=lambda res: -res.priority) -def _get_dataset(locator: DatasetLocator, location: typing.Union[None, Dataset, pathlib.Path]): +def _get_dataset( + locator: DatasetLocator, + location: Union[None, Dataset, pathlib.Path], +) -> Optional[Dataset]: + """Determine whether locator matches location and if so, resolve to a Dataset instance.""" if isinstance(location, Dataset): if locator.match(location): return location - return + return None if location.is_dir(): for ds in iter_datasets(location): if locator.match(ds): @@ -160,11 +171,12 @@ def _get_dataset(locator: DatasetLocator, location: typing.Union[None, Dataset, ds = Dataset.from_metadata(location) if sniff(location) else Dataset.from_data(location) if locator.match(ds): return ds + return None # pragma: no cover def get_dataset(locator: str, download_dir: pathlib.Path, - base: typing.Optional[pathlib.Path] = None) -> Dataset: + base: Optional[pathlib.Path] = None) -> Dataset: """ :param locator: Dataset locator as specified in "Dataset discovery". :param download_dir: Directory to which to download remote data if necessary. @@ -182,4 +194,4 @@ def get_dataset(locator: str, res = _get_dataset(locator, res) if res: return res - raise ValueError('Could not resolve dataset locator {}'.format(locator)) + raise ValueError(f'Could not resolve dataset locator {locator}') diff --git a/src/pycldf/ext/markdown.py b/src/pycldf/ext/markdown.py index 108dea4..c92694a 100644 --- a/src/pycldf/ext/markdown.py +++ b/src/pycldf/ext/markdown.py @@ -4,24 +4,25 @@ For an example, see :class:`FilenameToComponent`. """ import re -import typing +from typing import Optional, Union, Any import pathlib import warnings -import collections.abc +import collections +from collections.abc import Mapping +import dataclasses import yaml import jmespath -import attr import frontmatter import clldutils from clldutils.markup import MarkdownLink -from .discovery import get_dataset -from pycldf.util import pkg_path, url_without_fragment -from pycldf.dataset import MD_SUFFIX +from pycldf.util import pkg_path, MD_SUFFIX +from pycldf.urlutil import url_without_fragment from pycldf.sources import Source from pycldf import Dataset from pycldf import orm +from .discovery import get_dataset __all__ = ['CLDFMarkdownLink', 'CLDFMarkdownText', 'FilenameToComponent'] @@ -31,14 +32,14 @@ METADATA_COMPONENT = 'Metadata' -class DatasetMapping(collections.abc.Mapping): +class DatasetMapping(Mapping): """ A read-only mapping of prefixes to datasets. """ key_pattern = re.compile('[a-zA-Z0-9_]+') @staticmethod - def to_dict(o): + def to_dict(o): # pylint: disable=C0116 if isinstance(o, DatasetMapping): return o.m return {} if not o else ({None: o} if isinstance(o, (str, Dataset)) else o) @@ -46,8 +47,8 @@ def to_dict(o): def __init__(self, m1, m2=None, - doc_path: typing.Optional[pathlib.Path] = None, - download_dir: typing.Optional[pathlib.Path] = None): + doc_path: Optional[pathlib.Path] = None, + download_dir: Optional[pathlib.Path] = None): """ :param m1: Mapping of prefixes to datasets (locators). :param m2: Mapping of prefixes to datasets (locators) to update `m1`. 
@@ -64,7 +65,7 @@ def __init__(self, if not isinstance(self.m[k], Dataset): self.m[k] = get_dataset(self.m[k], download_dir, doc_path) - def __getitem__(self, prefix: typing.Union[str, None]) -> Dataset: + def __getitem__(self, prefix: Union[str, None]) -> Dataset: """ Get a `Dataset` mapped to a prefix. """ @@ -77,7 +78,7 @@ def __len__(self): return len(self.m) -@attr.s +@dataclasses.dataclass class CLDFMarkdownLink(MarkdownLink): """ CLDF Markdown links are specified using URLs of a particular format. @@ -88,18 +89,20 @@ class CLDFMarkdownLink(MarkdownLink): fragment_pattern = re.compile(r'cldf(-(?P[a-zA-Z0-9_]+))?:') @property - def url_without_fragment(self): + def url_without_fragment(self) -> str: + """Return the HREF value of the link without the fragment.""" return url_without_fragment(self.parsed_url) @staticmethod - def format_url(path, objid, prefix=None): - return '{}#cldf{}:{}'.format(path, '-' + prefix if prefix else '', objid) + def format_url(path, objid, prefix=None) -> str: + """Format the HREF value for a CLDF Markdown link.""" + prefix = '-' + prefix if prefix else '' + return f'{path}#cldf{prefix}:{objid}' @classmethod def from_component(cls, comp, objid='__all__', label=None, prefix=None) -> 'CLDFMarkdownLink': - return cls( - label=label or '{}:{}'.format(comp, objid), - url=cls.format_url(comp, objid, prefix=prefix)) + """Create a CLDF Markdown link for an object in a component.""" + return cls(label=label or f'{comp}:{objid}', url=cls.format_url(comp, objid, prefix=prefix)) @property def is_cldf_link(self) -> bool: @@ -109,25 +112,27 @@ def is_cldf_link(self) -> bool: return bool(self.fragment_pattern.match(self.parsed_url.fragment)) @property - def prefix(self) -> typing.Union[None, str]: + def prefix(self) -> Optional[str]: """ The dataset prefix associated with a CLDF Markdown link. """ if self.is_cldf_link: return self.fragment_pattern.match(self.parsed_url.fragment).group('prefix') + return None # pragma: no cover @property - def table_or_fname(self) -> typing.Union[None, str]: + def table_or_fname(self) -> Optional[str]: """ The last path component of the URL of a CLDF Markdown link. """ if self.is_cldf_link: return self.parsed_url.path.split('/')[-1] + return None # pragma: no cover - def component(self, - cldf: typing.Optional[ - typing.Union[Dataset, typing.Dict[str, Dataset], DatasetMapping]] = None, - ) -> typing.Union[str, None]: + def component( + self, + cldf: Optional[Union[Dataset, dict[str, Dataset], DatasetMapping]] = None, + ) -> Union[str, None]: """ :param cldf: `pycldf.Dataset` instance to which the link refers. :return: Name of the CLDF component the link pertains to or `None`. @@ -143,9 +148,9 @@ def component(self, if isinstance(cldf, (dict, DatasetMapping)): cldf = cldf[self.prefix] - if name == cldf.bibname or name == SOURCE_COMPONENT: + if name in (cldf.bibname, SOURCE_COMPONENT): return SOURCE_COMPONENT - if name == cldf.filename or name == METADATA_COMPONENT: + if name in (cldf.filename, METADATA_COMPONENT): return METADATA_COMPONENT try: return cldf.get_tabletype(cldf[name]) @@ -153,12 +158,13 @@ def component(self, return None @property - def objid(self) -> typing.Union[None, str]: + def objid(self) -> Optional[str]: """ The identifier of the object referenced by a CLDF Markdown link. 
""" if self.is_cldf_link: return self.parsed_url.fragment.split(':', maxsplit=1)[-1] + return None # pragma: no cover @property def all(self) -> bool: @@ -167,7 +173,7 @@ def all(self) -> bool: """ return self.objid == '__all__' - def get_row(self, cldf: typing.Union[Dataset, DatasetMapping]) -> dict: + def get_row(self, cldf: Union[Dataset, DatasetMapping]) -> dict: """ Resolve the reference in a CLDF Markdown link to a row (`dict`) in the CLDF `Dataset`. """ @@ -175,7 +181,7 @@ def get_row(self, cldf: typing.Union[Dataset, DatasetMapping]) -> dict: ds = DatasetMapping(cldf)[self.prefix] return ds.get_row(self.component(cldf=ds), self.objid) - def get_object(self, cldf: typing.Union[Dataset, DatasetMapping]) -> orm.Object: + def get_object(self, cldf: Union[Dataset, DatasetMapping]) -> orm.Object: """ Resolve the reference in a CLDF Markdown link to an ORM object in the CLDF `Dataset`. """ @@ -213,9 +219,9 @@ def render_link(self, link): :cvar metadata_component: Name of the special "Metadata" component. """ def __init__(self, - text: typing.Union[pathlib.Path, str], - dataset_mapping: typing.Optional[typing.Union[str, Dataset, dict]] = None, - download_dir: typing.Optional[pathlib.Path] = None): + text: Union[pathlib.Path, str], + dataset_mapping: Optional[Union[str, Dataset, dict]] = None, + download_dir: Optional[pathlib.Path] = None): """ :param text: CLDF Markdown text either to be read from a path or specified as `str`. :param dataset_mapping: Mapping of dataset prefixes to `Dataset` instances. May override \ @@ -223,14 +229,14 @@ def __init__(self, :download_dir: Optional path to a directory to download data for remote datasets. """ p = frontmatter.loads(text) if isinstance(text, str) else frontmatter.load(str(text)) - self.metadata = p.metadata - self.dataset_mapping = DatasetMapping( + self.metadata: dict[str, Any] = p.metadata + self.dataset_mapping: Mapping[Union[str, None], Dataset] = DatasetMapping( p.get(DATASETS_MAPPING), dataset_mapping, text.parent if isinstance(text, pathlib.Path) else None, download_dir, ) - self.text = p.content + self.text: str = p.content self._datadict = collections.defaultdict(dict) for prefix, ds in self.dataset_mapping.items(): self._datadict[prefix][SOURCE_COMPONENT] = {src.id: src for src in ds.sources} @@ -241,9 +247,9 @@ def frontmatter(self) -> str: """ The markdown documents metadata formatted as YAML frontmatter. """ - return '---\n{}---'.format(yaml.dump(self.metadata)) + return f'---\n{yaml.dump(self.metadata)}---' - def get_object(self, ml: CLDFMarkdownLink) -> typing.Union[list, orm.Object, Source, dict]: + def get_object(self, ml: CLDFMarkdownLink) -> Union[list, orm.Object, Source, dict]: """ Resolve the reference in a CLDF Markdown link to the matching object from a mapped dataset. @@ -273,20 +279,23 @@ def get_object(self, ml: CLDFMarkdownLink) -> typing.Union[list, orm.Object, Sou return list(self._datadict[ml.prefix][key].values()) if ml.all \ else self._datadict[ml.prefix][key][ml.objid] - def _render_link(self, link): + def _render_link(self, link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]: + """Dispatches to custom rendering in case of CLDF links.""" if link.is_cldf_link: return self.render_link(link) return link - def render_link(self, cldf_link: CLDFMarkdownLink) -> typing.Union[str, CLDFMarkdownLink]: + def render_link(self, cldf_link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]: """ CLDF Markdown renderers must implement this method. 
""" raise NotImplementedError() # pragma: no cover - def render(self, - simple_link_detection: bool = True, - markdown_kw: typing.Optional[dict] = None) -> str: + def render( + self, + simple_link_detection: bool = True, + markdown_kw: Optional[dict[str, Any]] = None, + ) -> str: """ A markdown string with CLDF Markdown links replaced. """ @@ -297,7 +306,7 @@ def render(self, category=UserWarning) kw = {} else: - kw = dict(simple=simple_link_detection, markdown_kw=markdown_kw) + kw = {'simple': simple_link_detection, 'markdown_kw': markdown_kw} return CLDFMarkdownLink.replace(self.text, self._render_link, **kw) @@ -305,7 +314,7 @@ class FilenameToComponent(CLDFMarkdownText): """ Renderer to replace filenames in CLDF Markdown links with CLDF component names. """ - def render_link(self, cldf_link): + def render_link(self, cldf_link: CLDFMarkdownLink) -> CLDFMarkdownLink: """ Rewrites to URL of CLDF Markdown links, using the component name as path component. """ diff --git a/src/pycldf/fileutil.py b/src/pycldf/fileutil.py new file mode 100644 index 0000000..75b527b --- /dev/null +++ b/src/pycldf/fileutil.py @@ -0,0 +1,65 @@ +""" +Functionality to access and manipulate files. +""" +import re +import math +import string +from typing import Union, Optional +import pathlib +import itertools + + +PathType = Union[str, pathlib.Path] + + +def splitfile(p: PathType, chunksize: int, total: Optional[int] = None) -> list[pathlib.Path]: + """ + :param p: Path of the file to split. + :param chunksize: The maximal size of the chunks the file will be split into. + :param total: The size of the input file. + :return: The list of paths of files that the input has been split into. + """ + p = pathlib.Path(p) + total = total or p.stat().st_size + if total <= chunksize: # Nothing to do. + return [p] + nchunks = math.ceil(total / chunksize) + suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3 + suffixes = [ + ''.join(t) for t in + itertools.combinations_with_replacement(string.ascii_lowercase, suffix_length)] + + res = [] + with p.open('rb') as f: + chunk = f.read(chunksize) + while chunk: + pp = p.parent.joinpath(f'{p.name}.{suffixes.pop(0)}') + pp.write_bytes(chunk) + res.append(pp) + chunk = f.read(chunksize) # read the next chunk + + p.unlink() + return res + + +def catfile(p: PathType) -> bool: + """ + Restore a file that has been split into chunks. + + We determine if a file has been split by looking for files in the parent directory with suffixes + as created by `splitfile`. + """ + p = pathlib.Path(p) + if p.exists(): # Nothing to do. + return False + # Check, whether the file has been split. + suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name} + if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes): + # ok, let's concatenate the files: + with p.open('wb') as f: + for suffix in sorted(suffixes): + if re.fullmatch(r'\.[a-z]{2,3}', suffix): + f.write(suffixes[suffix].read_bytes()) + suffixes[suffix].unlink() + return True + return False # pragma: no cover diff --git a/src/pycldf/markdown.py b/src/pycldf/markdown.py new file mode 100644 index 0000000..226655b --- /dev/null +++ b/src/pycldf/markdown.py @@ -0,0 +1,166 @@ +""" +Functionality to render a Dataset's metadata to a Markdown document. 
+""" +import re +import html +import pathlib +from typing import TYPE_CHECKING, Any, Optional + +from clldutils.misc import slug + +from pycldf.util import qname2url +from pycldf.fileutil import PathType + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['metadata2markdown'] + + +def metadata2markdown(ds: 'Dataset', path: PathType, rel_path: Optional[str] = './') -> str: + """ + Render the metadata of a dataset as markdown. + + :param ds: `Dataset` instance + :param path: `pathlib.Path` of the metadata file + :param rel_path: `str` to use a relative path when creating links to data files + :return: `str` with markdown formatted text + """ + path = pathlib.Path(path) + return '\n'.join(_iter_markdown(ds, pathlib.Path(path), rel_path)) + + +def _qname2link(qname: str, html_=False) -> str: + url = qname2url(qname) + if url: + return f'{qname}' if html_ else f'[{qname}]({url})' + return qname + + +def _htmlify(obj: Any, rel_path: str, key=None) -> str: + """ + For inclusion in tables we must use HTML for lists. + """ + if isinstance(obj, list): + items = [f'
<li>{_htmlify(item, rel_path, key=key)}</li>' for item in obj]
+        return f'<ul>{"".join(items)}</ul>'
+
+    if isinstance(obj, dict):
+        if key == 'prov:wasGeneratedBy' \
+                and set(obj.keys()).issubset({'dc:title', 'dc:description', 'dc:relation'}):
+            desc = obj.get('dc:description') or ''
+            rel = obj.get('dc:relation')
+            if rel:
+                desc = (desc + '<br>') if desc else desc
+                desc += f'<a href="{rel}">{rel}</a>'
+            return f"{obj.get('dc:title') or ''}: {desc}"
+
+        if obj.get('rdf:type') == 'prov:Entity' and 'rdf:about' in obj:
+            label = obj.get('dc:title')
+            if (not label) or label == 'Repository':
+                label = obj['rdf:about']
+            url = obj['rdf:about']
+            if ('github.com' in url) and ('/tree/' not in url) and ('dc:created' in obj):
+                tag = obj['dc:created']
+                if '-g' in tag:
+                    tag = tag.split('-g')[-1]
+                url = f'{url}/tree/{tag}'
+            if label == obj['rdf:about']:
+                label = label.split('github.com/')[-1]
+            version = f' {obj.get("dc:created")}' or ''
+            return f'<a href="{url}">{label}</a>{version}'
+
+        items = [
+            f'<tr><td>{_qname2link(k, html_=True)}</td><td>{html.escape(str(v))}</td></tr>'
+            for k, v in obj.items()]
+        return f'<table>{"".join(items)}</table>'
+
+    return str(obj)
+
+
+def _iter_properties(obj, rel_path):
+    if obj.common_props.get('dc:description'):
+        yield obj.common_props['dc:description'] + '\n'
+    yield 'property | value\n --- | ---'
+    for k, v in obj.common_props.items():
+        if not v:
+            continue
+        if k not in ('dc:description', 'dc:title', 'dc:source'):
+            if k == 'dc:conformsTo':
+                v = f'[CLDF {v.split("#")[1]}]({v})'
+            yield f'{_qname2link(k)} | {_htmlify(v, rel_path, key=k)}'
+    yield ''
+
+
+def _colrow(col, fks, pk, ds, rel_path):
+    dt = f"`{col.datatype.base if col.datatype else 'string'}`"
+    if col.datatype:
+        if col.datatype.format:
+            if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format):
+                dt += '
<br>Valid choices:<br>'
+                dt += ''.join(f' `{w}`' for w in col.datatype.format.split('|'))
+            elif col.datatype.base == 'string':
+                dt += f'<br>Regex: `{col.datatype.format}`'
+        if col.datatype.minimum:
+            dt += f'<br>≥ {col.datatype.minimum}'
+        if col.datatype.maximum:
+            dt += f'<br>≤ {col.datatype.maximum}'
+    if col.separator:
+        dt = f'list of {dt} (separated by `{col.separator}`)'
+    desc = col.common_props.get('dc:description', '').replace('\n', ' ')
+
+    if col.name in pk:
+        desc = (desc + '<br>') if desc else desc
+        desc += 'Primary key'
+
+    if col.name in fks:
+        desc = (desc + '<br>') if desc else desc
+        pkcol, table = fks[col.name]
+        desc += f'References [{table}::{pkcol}](#table-{slug(table)})'
+    elif col.propertyUrl \
+            and col.propertyUrl.uri == "http://cldf.clld.org/v1.0/terms.rdf#source" \
+            and 'dc:source' in ds.properties:
+        desc = (desc + '
    ') if desc else desc + desc += (f"References [{ds.properties['dc:source']}::BibTeX-key]" + f"({rel_path}{ds.properties['dc:source']})") + + return ' | '.join([ + f'[{col.name}]({col.propertyUrl})' if col.propertyUrl else f'`{col.name}`', dt, desc]) + + +def _existing_fname_in_cldf_dir(ds, fname: str) -> Optional[str]: + """Returns an existing (possibly zipped) file matching fname.""" + if pathlib.Path(ds.directory).joinpath(fname).exists(): + return fname + zipped = fname + '.zip' + if pathlib.Path(ds.directory).joinpath(zipped).exists(): + return zipped + return None + + +def _iter_markdown(ds: 'Dataset', path: pathlib.Path, rel_path: Optional[str] = './'): + def file_link(fname): + return f'[{fname}]({rel_path}{fname})' + + yield f'# {ds.properties.get("dc:title", ds.module)}\n' + if path.suffix == '.json': + yield f'**CLDF Metadata**: {file_link(path.name)}\n' + if 'dc:source' in ds.properties: + src = _existing_fname_in_cldf_dir(ds, ds.properties['dc:source']) + if src: + yield f'**Sources**: {file_link(src)}\n' + yield from _iter_properties(ds.tablegroup, rel_path) + + for table in ds.tables: + fks = { + fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string) + for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1} + src = _existing_fname_in_cldf_dir(ds, table.url.string) + table_name = file_link(src) if src else table.url + yield f'\n## Table {table_name}\n' + yield from _iter_properties(table, rel_path) + yield '\n### Columns\n' + yield 'Name/Property | Datatype | Description' + yield ' --- | --- | --- ' + for col in table.tableSchema.columns: + yield _colrow(col, fks, table.tableSchema.primaryKey, ds, rel_path) diff --git a/src/pycldf/media.py b/src/pycldf/media.py index 13c13a1..62163cb 100644 --- a/src/pycldf/media.py +++ b/src/pycldf/media.py @@ -24,8 +24,7 @@ import io import json import base64 -import typing -import logging +from typing import Union, TYPE_CHECKING, Optional, Callable import pathlib import zipfile import functools @@ -33,17 +32,25 @@ import collections import urllib.parse import urllib.request +from collections.abc import Generator -from clldutils.misc import log_or_raise -import pycldf -from pycldf import orm -from pycldf.util import splitfile, catfile +from csvw.metadata import Table, Column from csvw.datatypes import anyURI +from pycldf import orm +from pycldf.fileutil import splitfile, catfile, PathType + +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover + from pycldf.validators import DatasetValidator # pragma: no cover + __all__ = ['Mimetype', 'MediaTable', 'File'] +StrOrBytes = Union[str, bytes] -class File: + +class File: # pylint: disable=too-many-instance-attributes """ A `File` represents a row in a MediaTable, providing functionality to access the contents. @@ -56,15 +63,16 @@ class File: - :meth:`save` will write a (deflated) ZIP archive containing the specified file as single \ member. 
""" - def __init__(self, media: 'MediaTable', row: dict): - self.row = row - self.id = row[media.filename_col.name] - self._mimetype = row[media.mimetype_col.name] - self.url = None + def __init__(self, media: 'MediaTable', row: 'RowType'): + self.row: 'RowType' = row + self.id: str = row[media.filename_col.name] + self._mimetype: str = row[media.mimetype_col.name] + self.url: Optional[str] = None self.scheme = None self.url_reader = media.url_reader - self.path_in_zip = row.get(media.path_in_zip_col.name) if media.path_in_zip_col else None - self._dsdir = media.ds.directory + self.path_in_zip: Optional[str] \ + = row.get(media.path_in_zip_col.name) if media.path_in_zip_col else None + self._dsdir: pathlib.Path = media.ds.directory if media.url_col: # 1. Look for a downloadUrl property: @@ -83,7 +91,7 @@ def __init__(self, media: 'MediaTable', row: dict): @classmethod def from_dataset( - cls, ds: pycldf.Dataset, row_or_object: typing.Union[dict, orm.Media]) -> 'File': + cls, ds: 'Dataset', row_or_object: Union[dict, orm.Media]) -> 'File': """ Factory method to instantiate a `File` bypassing the `Media` wrapper. """ @@ -114,7 +122,7 @@ def mimetype(self) -> 'Mimetype': if mt: return Mimetype(mt) if self.scheme == 'data': - mt, _, data = self.parsed_url.path.partition(',') + mt, _, _ = self.parsed_url.path.partition(',') if mt.endswith(';base64'): mt = mt.replace(';base64', '').strip() if mt: @@ -122,13 +130,14 @@ def mimetype(self) -> 'Mimetype': # There's an explicit default mimetype for data URLs! return Mimetype('text/plain;charset=US-ASCII') if self.scheme in ['http', 'https']: - res = urllib.request.urlopen(urllib.request.Request(self.url, method="HEAD")) + res = urllib.request.urlopen( # too lazy to mock with with. pylint: disable=R1732 + urllib.request.Request(self.url, method="HEAD")) mt = res.headers.get('Content-Type') if mt: return Mimetype(mt) return Mimetype('application/octet-stream') - def local_path(self, d: pathlib.Path = None) -> typing.Union[pathlib.Path, None]: + def local_path(self, d: pathlib.Path = None) -> Optional[pathlib.Path]: """ :return: The expected path of the file in the directory `d`. """ @@ -136,14 +145,15 @@ def local_path(self, d: pathlib.Path = None) -> typing.Union[pathlib.Path, None] if self.scheme == 'file': return self._dsdir / urllib.parse.unquote(self.relpath) return None - return d.joinpath('{}{}'.format( - self.id, '.zip' if self.path_in_zip else (self.mimetype.extension or ''))) + zip_ext = '.zip' if self.path_in_zip else (self.mimetype.extension or '') + return d.joinpath(f'{self.id}{zip_ext}') def read_json(self, d=None): + """Reads JSON data.""" assert self.mimetype.subtype.endswith('json') return json.loads(self.read(d=d)) - def read(self, d=None) -> typing.Union[None, str, bytes]: + def read(self, d: Optional[pathlib.Path] = None) -> Optional[StrOrBytes]: """ :param d: A local directory where the file has been saved before. If `None`, the content \ will be read from the file's URL. 
@@ -156,17 +166,18 @@ def read(self, d=None) -> typing.Union[None, str, bytes]: zipcontent = self.url_reader[self.scheme]( self.parsed_url, Mimetype('application/zip')) if zipcontent: - zf = zipfile.ZipFile(io.BytesIO(zipcontent)) - return self.mimetype.read(zf.read(self.path_in_zip)) - return # pragma: no cover + with zipfile.ZipFile(io.BytesIO(zipcontent)) as zf: + return self.mimetype.read(zf.read(self.path_in_zip)) + return None # pragma: no cover if d: return self.mimetype.read(self.local_path(d).read_bytes()) if self.url: try: return self.url_reader[self.scheme](self.parsed_url, self.mimetype) - except KeyError: - raise ValueError('Unsupported URL scheme: {}'.format(self.scheme)) + except KeyError as e: + raise ValueError(f'Unsupported URL scheme: {self.scheme}') from e + return None # pragma: no cover def save(self, d: pathlib.Path) -> pathlib.Path: """ @@ -189,14 +200,17 @@ def save(self, d: pathlib.Path) -> pathlib.Path: return p -class MediaTable(pycldf.ComponentWithValidation): +class MediaTable: # pylint: disable=too-many-instance-attributes """ Container class for a `Dataset`'s media items. """ - def __init__(self, ds: pycldf.Dataset, use_form_id: bool = False): - super().__init__(ds) - self.url_col = ds.get(('MediaTable', 'http://cldf.clld.org/v1.0/terms.rdf#downloadUrl')) - self.path_in_zip_col = ds.get( + def __init__(self, ds: 'Dataset'): + self.ds: 'Dataset' = ds + self.component: str = self.__class__.__name__ + self.table: Table = ds[self.component] + self.url_col: Optional[Column] = ds.get( + ('MediaTable', 'http://cldf.clld.org/v1.0/terms.rdf#downloadUrl')) + self.path_in_zip_col: Optional[Column] = ds.get( (self.component, 'http://cldf.clld.org/v1.0/terms.rdf#pathInZip')) if self.table and not self.url_col: @@ -204,13 +218,14 @@ def __init__(self, ds: pycldf.Dataset, use_form_id: bool = False): if col.propertyUrl and col.propertyUrl == 'http://www.w3.org/ns/dcat#downloadUrl': self.url_col = col break - self.id_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#id'] - self.filename_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#formReference']\ - if use_form_id else self.id_col - self.mimetype_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#mediaType'] + self.id_col: Column = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#id'] + self.filename_col: Column = self.id_col + self.mimetype_col: Column = ds[ + self.component, 'http://cldf.clld.org/v1.0/terms.rdf#mediaType'] @functools.cached_property - def url_reader(self): + def url_reader(self) -> dict[str, Callable[[urllib.parse.ParseResult, 'Mimetype'], StrOrBytes]]: + """Maps URL schemes to reader functions.""" return { 'http': read_http_url, 'https': read_http_url, @@ -219,13 +234,13 @@ def url_reader(self): 'file': functools.partial(read_file_url, self.ds.directory), } - def __iter__(self) -> typing.Generator[File, None, None]: + def __iter__(self) -> Generator[File, None, None]: for row in self.table: yield File(self, row) def split(self, chunksize: int) -> int: """ - :return: The number of media files that have been split. + :return: The number of media files that needed splitting. """ res = 0 for file in self: @@ -237,7 +252,7 @@ def split(self, chunksize: int) -> int: res += 1 return res - def cat(self): + def cat(self) -> int: """ :return: The number of media files that have been re-assembled from chunks. 
""" @@ -249,7 +264,8 @@ def cat(self): res += 1 return res - def validate(self, success: bool = True, log: logging.Logger = None) -> bool: + def validate(self, validator: 'DatasetValidator'): + """Component-specific validation.""" speaker_area_files = collections.defaultdict(list) if ('LanguageTable', 'speakerArea') in self.ds: for lg in self.ds.iter_rows('LanguageTable', 'id', 'speakerArea'): @@ -257,42 +273,38 @@ def validate(self, success: bool = True, log: logging.Logger = None) -> bool: speaker_area_files[lg['speakerArea']].append(lg['id']) for file in self: - content = None - if not file.url: - success = False - log_or_raise('File without URL: {}'.format(file.id), log=log) - elif file.scheme == 'file': - try: - content = file.read() - except FileNotFoundError: - success = False - log_or_raise( - 'Non-existing local file referenced: {} ' - 'You may have to run `cldf catmedia` to recombine files'.format(file.id), - log=log) - except Exception as e: # pragma: no cover - success = False - log_or_raise('Error reading {}: {}'.format(file.id, e), log=log) - elif file.scheme == 'data': - try: - content = file.read() - except Exception as e: # pragma: no cover - success = False - log_or_raise('Error reading {}: {}'.format(file.id, e), log=log) - if file.id in speaker_area_files and file.mimetype.subtype == 'geo+json' and content: - content = json.loads(content) - if content['type'] != 'Feature': - assert content['type'] == 'FeatureCollection' - for feature in content['features']: - lid = feature['properties'].get('cldf:languageReference') - if lid and lid in speaker_area_files[file.id]: - speaker_area_files[file.id].remove(lid) - if speaker_area_files[file.id]: - log_or_raise( - 'Error: Not all language IDs found in speakerArea GeoJSON: {}'.format( - speaker_area_files[file.id])) # pragma: no cover - - return success + self._validate_file(validator, file, speaker_area_files) + + def _validate_file(self, validator, file, speaker_area_files): + content = None + if not file.url: + validator.fail(f'File without URL: {file.id}') + elif file.scheme == 'file': + try: + content = file.read() + except FileNotFoundError: + validator.fail( + f'Non-existing local file referenced: {file.id} ' + 'You may have to run `cldf catmedia` to recombine files') + except Exception as e: # pragma: no cover # pylint: disable=W0718 + validator.fail(f'Error reading {file.id}: {e}') + elif file.scheme == 'data': + try: + content = file.read() + except Exception as e: # pragma: no cover # pylint: disable=W0718 + validator.fail(f'Error reading {file.id}: {e}') + if file.id in speaker_area_files and file.mimetype.subtype == 'geo+json' and content: + content = json.loads(content) + if content['type'] != 'Feature': + assert content['type'] == 'FeatureCollection' + for feature in content['features']: + lid = feature['properties'].get('cldf:languageReference') + if lid and lid in speaker_area_files[file.id]: + speaker_area_files[file.id].remove(lid) + if speaker_area_files[file.id]: + validator.fail( + f'Error: Not all language IDs found in speakerArea GeoJSON: ' + f'{speaker_area_files[file.id]}') # pragma: no cover Media = MediaTable @@ -327,23 +339,28 @@ def __eq__(self, other): @property def is_text(self) -> bool: + """Whether the mimetype describes text, and hence data should be read as str.""" return self.type == 'text' @property - def extension(self) -> typing.Union[None, str]: - return mimetypes.guess_extension('{}/{}'.format(self.type, self.subtype)) + def extension(self) -> Union[None, str]: + """Return a suitable 
filename extension for the mimetype.""" + return mimetypes.guess_extension(f'{self.type}/{self.subtype}') - def read(self, data: bytes) -> typing.Union[str, bytes]: + def read(self, data: bytes) -> StrOrBytes: + """Read data, inferring the encoding from the mimetype.""" if self.is_text and not isinstance(data, str): return data.decode(self.encoding) return data - def write(self, data: typing.Union[str, bytes], p: typing.Optional[pathlib.Path] = None) -> int: + def write(self, data: StrOrBytes, p: Optional[pathlib.Path] = None) -> Union[int, StrOrBytes]: + """The mimetype dictates how/if to encode data.""" res = data.encode(self.encoding) if self.is_text else data return p.write_bytes(res) if p else res -def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype): +def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from a data:// URL.""" spec, _, data = url.path.partition(',') if spec.endswith(';base64'): data = base64.b64decode(data) @@ -354,9 +371,8 @@ def read_data_url(url: urllib.parse.ParseResult, mimetype: Mimetype): return data -def read_file_url(d: typing.Union[pathlib.Path, str], - url: urllib.parse.ParseResult, - mimetype: Mimetype) -> typing.Union[str, bytes]: +def read_file_url(d: PathType, url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from a file:// URL.""" path = url.path while path.startswith('/'): path = path[1:] @@ -368,5 +384,6 @@ def read_file_url(d: typing.Union[pathlib.Path, str], return mimetype.read(d.joinpath(urllib.parse.unquote(path)).read_bytes()) -def read_http_url(url: urllib.parse.ParseResult, mimetype: Mimetype): +def read_http_url(url: urllib.parse.ParseResult, mimetype: Mimetype) -> StrOrBytes: + """Read data from an HTTP URL.""" return mimetype.read(urllib.request.urlopen(urllib.parse.urlunparse(url)).read()) diff --git a/src/pycldf/module.py b/src/pycldf/module.py new file mode 100644 index 0000000..64df73f --- /dev/null +++ b/src/pycldf/module.py @@ -0,0 +1,76 @@ +""" +Functionality to manage modules, i.e. `Dataset` subclasses implementing particular CLDF modules. +""" +import dataclasses +from typing import Union, Optional, Type + +from csvw.metadata import TableGroup + +from pycldf.terms import TERMS, term_uri +from pycldf.util import pkg_path, MD_SUFFIX + +__all__ = ['get_module_impl'] + + +@dataclasses.dataclass +class Module: + """ + Class representing a CLDF Module. + + .. seealso:: https://github.com/cldf/cldf/blob/master/README.md#cldf-modules + """ + uri: str + fname: str + + def __post_init__(self): + if self.uri not in {t.uri for t in TERMS.classes.values()}: + raise ValueError(self.uri) # pragma: no cover + + @property + def id(self) -> str: + """ + The local part of the term URI is interpreted as Module identifier. + """ + return self.uri.split('#')[1] + + def match(self, thing: Union[TableGroup, str]) -> bool: + """Check if the module described here matches thing.""" + if isinstance(thing, TableGroup): + return thing.common_props.get('dc:conformsTo') == term_uri(self.id) + if isinstance(thing, str): + return thing == self.fname or thing == self.id + return False + + +_modules = [] + + +def get_module_impl(base_class, spec: Union[TableGroup, str]) -> Optional[Type]: + """ + Returns an implementation (aka Dataset subclass) for a particular CLDF module. 
+ """ + implementations = {cls.__name__: cls for cls in base_class.__subclasses__()} + for mod in get_modules(): + if mod.match(spec): + return implementations[mod.id] + return None # pragma: no cover + + +def get_modules() -> list[Module]: + """ + We read supported CLDF modules from the default metadata files distributed with `pycldf`. + """ + global _modules # pylint: disable=global-statement + + if not _modules: + for p in pkg_path('modules').glob(f'*{MD_SUFFIX}'): + tg = TableGroup.from_file(p) + mod = Module( + tg.common_props['dc:conformsTo'], + tg.tables[0].url.string if tg.tables else None) + _modules.append(mod) + # prefer Wordlist over ParallelText (forms.csv) + _modules = sorted( + _modules, + key=lambda m: (m.id in ('Wordlist', 'ParallelText'), m.id == 'ParallelText')) + return _modules diff --git a/src/pycldf/orm.py b/src/pycldf/orm.py index d85558c..53f33d2 100644 --- a/src/pycldf/orm.py +++ b/src/pycldf/orm.py @@ -46,7 +46,7 @@ def custom_method(self): * ~35secs iterating over ``pycldf.Dataset.objects('ValueTable')`` """ import types -import typing +from typing import TYPE_CHECKING, Union, Optional, Any import decimal import functools import collections @@ -58,12 +58,14 @@ def custom_method(self): from pycldf.util import DictTuple from pycldf.sources import Reference -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover from pycldf.media import File # pragma: no cover -def to_json(s): +def to_json(s: Any) -> Union[str, float, None, list, dict]: + """Converts `s` to an object that can be serialized as JSON.""" if isinstance(s, (list, tuple)): return [to_json(ss) for ss in s] if isinstance(s, dict): @@ -77,7 +79,7 @@ def to_json(s): return str(s) -class Object: +class Object: # pylint: disable=too-many-instance-attributes """ Represents a row of a CLDF component table. @@ -95,7 +97,7 @@ class Object: # specified here: __component__ = None - def __init__(self, dataset: 'Dataset', row: dict): + def __init__(self, dataset: 'Dataset', row: 'RowType'): # Get a mapping of column names to pairs (CLDF property name, list-valued) for columns # present in the component specified by class name. 
cldf_cols = { @@ -103,29 +105,29 @@ def __init__(self, dataset: 'Dataset', row: dict): for k, v in vars(getattr(dataset.readonly_column_names, self.component)).items() if v} self._listvalued = set(v[0] for v in cldf_cols.values() if v[1]) - self.cldf = {} - self.data = collections.OrderedDict() + cldf_ = {} + self.data: collections.OrderedDict[str, Any] = collections.OrderedDict() for k, v in row.items(): # We go through the items of the row and slot them into the appropriate bags: self.data[k] = v if k in cldf_cols: - self.cldf[cldf_cols[k][0]] = v + cldf_[cldf_cols[k][0]] = v # Make cldf properties accessible as attributes: - self.cldf = types.SimpleNamespace(**self.cldf) - self.dataset = dataset - self.id = self.cldf.id - self.pk = None + self.cldf = types.SimpleNamespace(**cldf_) + self.dataset: 'Dataset' = dataset + self.id: str = self.cldf.id + self.pk: Optional[str] = None t = dataset[self.component_name()] if t.tableSchema.primaryKey and len(t.tableSchema.primaryKey) == 1: self.pk = self.data[dataset[self.component_name()].tableSchema.primaryKey[0]] - self.name = getattr(self.cldf, 'name', None) - self.description = getattr(self.cldf, 'description', None) + self.name: str = getattr(self.cldf, 'name', None) + self.description: str = getattr(self.cldf, 'description', None) def __repr__(self): - return '<{}.{} id="{}">'.format(self.__class__.__module__, self.__class__.__name__, self.id) + return f'<{self.__class__.__module__}.{self.__class__.__name__} id="{self.id}">' @classmethod - def component_name(cls) -> str: + def component_name(cls) -> str: # pylint: disable=C0116 return cls.__component__ or (cls.__name__ + 'Table') @property @@ -137,7 +139,8 @@ def component(self) -> str: return self.__class__.component_name() @property - def key(self) -> typing.Tuple[int, str, str]: + def key(self) -> tuple[int, str, str]: + """A key that is also unique across different Dataset instances.""" return id(self.dataset), self.__class__.__name__, self.id def __hash__(self): @@ -154,31 +157,32 @@ def _expand_uritemplate(self, attr, col): row as context. Thus, expansion is available as method on this row object. """ col = self.dataset[self.component, col] - variables = {k: v for k, v in vars(self.cldf).items()} + variables = dict(vars(self.cldf).items()) variables.update(self.data) if getattr(col, attr, None): return getattr(col, attr).expand(**variables) + return None # pragma: no cover - def aboutUrl(self, col='id') -> typing.Union[str, None]: + def aboutUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `aboutUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('aboutUrl', col) - def valueUrl(self, col='id') -> typing.Union[str, None]: + def valueUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `valueUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('valueUrl', col) - def propertyUrl(self, col='id') -> typing.Union[str, None]: + def propertyUrl(self, col: str = 'id') -> Union[str, None]: # pylint: disable=invalid-name """ The table's `propertyUrl` property, expanded with the object's row as context. """ return self._expand_uritemplate('propertyUrl', col) @functools.cached_property - def references(self) -> typing.Tuple[Reference]: + def references(self) -> tuple[Reference, ...]: """ `pycldf.Reference` instances associated with the object. 
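        A sketch of how ORM objects and their references are typically accessed (assuming
        ``ds`` is a :class:`pycldf.Dataset` whose LanguageTable rows cite sources):

        .. code-block:: python

            >>> for lang in ds.objects('LanguageTable'):
            ...     print(lang.id, lang.cldf.name)
            ...     for ref in lang.references:
            ...         print('  cited:', ref.source.id, ref.description)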
@@ -192,7 +196,7 @@ def references(self) -> typing.Tuple[Reference]: multi=True, ) - def related(self, relation: str) -> typing.Union[None, 'Object']: + def related(self, relation: str) -> Optional['Object']: """ The CLDF ontology specifies several "reference properties". This method returns the first related object specified by such a property. @@ -202,7 +206,7 @@ def related(self, relation: str) -> typing.Union[None, 'Object']: """ if relation in self._listvalued: raise ValueError( - '{} is list-valued, use `all_related` to retrieve related objects'.format(relation)) + f'{relation} is list-valued, use `all_related` to retrieve related objects') fk = getattr(self.cldf, relation, None) if fk: ref = self.dataset.get_foreign_key_reference(self.component_name(), relation) @@ -213,8 +217,9 @@ def related(self, relation: str) -> typing.Union[None, 'Object']: return self.dataset.get_object(TERMS[relation].references, fk, pk=True) raise NotImplementedError('pycldf does not support foreign key constraints ' 'referencing columns other than CLDF id or primary key.') + return None # pragma: no cover - def all_related(self, relation: str) -> typing.Union[DictTuple, list]: + def all_related(self, relation: str) -> Union[DictTuple, list]: """ CLDF reference properties can be list-valued. This method returns all related objects for such a property. @@ -229,57 +234,58 @@ def all_related(self, relation: str) -> typing.Union[DictTuple, list]: class _WithLanguageMixin: @property - def language(self): + def language(self) -> Object: # pylint: disable=C0116 return self.related('languageReference') @property - def languages(self): + def languages(self) -> Union[DictTuple, list]: # pylint: disable=C0116 return self.all_related('languageReference') class _WithParameterMixin: @functools.cached_property - def parameter(self): + def parameter(self) -> Object: # pylint: disable=C0116 return self.related('parameterReference') @property - def parameters(self): + def parameters(self) -> Union[DictTuple, list]: # pylint: disable=C0116 return self.all_related('parameterReference') -class Borrowing(Object): +class Borrowing(Object): # pylint: disable=C0115 @property - def targetForm(self): + def targetForm(self) -> Object: # pylint: disable=C0116,C0103 return self.related('targetFormReference') @property - def sourceForm(self): + def sourceForm(self) -> Object: # pylint: disable=C0116,C0103 return self.related('sourceFormReference') -class Code(Object, _WithParameterMixin): +class Code(Object, _WithParameterMixin): # pylint: disable=C0115 pass -class Cognateset(Object): +class Cognateset(Object): # pylint: disable=C0115 @property - def cognates(self): + def cognates(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('CognateTable') if v.cognateset == self) -class Cognate(Object): +class Cognate(Object): # pylint: disable=C0115 @property - def form(self): + def form(self): # pylint: disable=C0116 return self.related('formReference') @property - def cognateset(self): + def cognateset(self): # pylint: disable=C0116 return self.related('cognatesetReference') -class Contribution(Object): +class Contribution(Object): # pylint: disable=C0115 @property def sentences(self): + """Returns the ordered sentences of a text in a TextCorpus.""" res = [] if self.dataset.module == 'TextCorpus': # Return the list of lines, ordered by position. 
@@ -293,35 +299,38 @@ def sentences(self): return res -class Entry(Object, _WithLanguageMixin): +class Entry(Object, _WithLanguageMixin): # pylint: disable=C0115 @property - def senses(self): + def senses(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('SenseTable') if self in v.entries) -class Example(Object, _WithLanguageMixin): +class Example(Object, _WithLanguageMixin): # pylint: disable=C0115 @property - def metaLanguage(self): + def metaLanguage(self): # pylint: disable=C0116,C0103 return self.related('metaLanguageReference') @property - def igt(self): - return '{0}\n{1}\n{2}'.format( - self.cldf.primaryText, - tabulate([self.cldf.gloss], self.cldf.analyzedWord, tablefmt='plain'), - self.cldf.translatedText, - ) + def igt(self) -> str: + """The example in a plain text interlinear glossed representation.""" + aligned = tabulate([self.cldf.gloss], self.cldf.analyzedWord, tablefmt='plain') + return f'{self.cldf.primaryText}\n{aligned}\n{self.cldf.translatedText}' @property def text(self): """ - Examples in a TextCorpus are interpreted as lines of text. + Examples in a TextCorpus are interpreted as lines of a text, which in turn is the + module-specific interpretation of a CLDF contribution. """ if self.dataset.module == 'TextCorpus' and hasattr(self.cldf, 'contributionReference'): return self.related('contributionReference') + return None # pragma: no cover @property - def alternative_translations(self): + def alternative_translations(self) -> list['Example']: + """ + Returns alternative translations for the Example. + """ res = [] if hasattr(self.cldf, 'exampleReference'): # There's a self-referential foreign key. We assume this to link together full examples @@ -332,17 +341,17 @@ def alternative_translations(self): return res -class Form(Object, _WithLanguageMixin, _WithParameterMixin): +class Form(Object, _WithLanguageMixin, _WithParameterMixin): # pylint: disable=C0115 pass -class FunctionalEquivalentset(Object): +class FunctionalEquivalentset(Object): # pylint: disable=C0115 pass -class FunctionalEquivalent(Object): +class FunctionalEquivalent(Object): # pylint: disable=C0115 @property - def form(self): # pragma: no cover + def form(self): # pragma: no cover # pylint: disable=C0116 return self.related('formReference') @@ -362,15 +371,16 @@ class Language(Object): 'MultiPolygon' """ @property - def lonlat(self) -> typing.Union[None, typing.Tuple[decimal.Decimal]]: + def lonlat(self) -> Optional[tuple[decimal.Decimal, decimal.Decimal]]: """ :return: (longitude, latitude) pair if coordinates are defined, else `None`. """ if hasattr(self.cldf, 'latitude'): return (self.cldf.longitude, self.cldf.latitude) + return None # pragma: no cover @property - def as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]]: + def as_geojson_feature(self) -> Union[None, dict[str, Any]]: """ `dict` suitable for serialization as GeoJSON Feature object, with the point coordinate as geographic data. @@ -383,19 +393,21 @@ def as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]] "geometry": {"type": "Point", "coordinates": self.lonlat}, "properties": vars(self.cldf), }) + return None # pragma: no cover @functools.cached_property - def speaker_area(self) -> typing.Union[None, 'File']: + def speaker_area(self) -> Optional['File']: """ A `pycldf.media.File` object containing information about the speaker area of the language. 
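        Sketch of the geo-related conveniences on :class:`Language` (assuming ``ds`` has a
        LanguageTable with coordinates and a ``speakerArea`` column pointing to GeoJSON media):

        .. code-block:: python

            >>> lang = ds.objects('LanguageTable')[0]
            >>> lang.lonlat                            # (longitude, latitude) or None
            >>> lang.as_geojson_feature                # point geometry as GeoJSON Feature
            >>> lang.speaker_area_as_geojson_feature   # speaker area as GeoJSON Feature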
""" - from pycldf.media import File + from pycldf.media import File # pylint: disable=C0415 if getattr(self.cldf, 'speakerArea', None): return File.from_dataset(self.dataset, self.related('speakerArea')) + return None # pragma: no cover @functools.cached_property - def speaker_area_as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, typing.Any]]: + def speaker_area_as_geojson_feature(self) -> Optional[dict[str, Any]]: """ `dict` suitable for serialization as GeoJSON Feature object, with a speaker area Polygon or MultiPolygon as geographic data. @@ -411,13 +423,14 @@ def speaker_area_as_geojson_feature(self) -> typing.Union[None, typing.Dict[str, else: assert res['type'] == 'Feature' return res + return None # pragma: no cover @property - def values(self): + def values(self) -> DictTuple: # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.languages) @property - def forms(self): + def forms(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.languages) def glottolog_languoid(self, glottolog_api): @@ -433,42 +446,50 @@ def glottolog_languoid(self, glottolog_api): return glottolog_api.languoid(self.cldf.glottocode) -class Media(Object): +class Media(Object): # pylint: disable=C0115 @property - def downloadUrl(self): + def downloadUrl(self): # pylint: disable=C0116,C0103 if hasattr(self.cldf, 'downloadUrl'): return self.cldf.downloadUrl return self.valueUrl() -class ParameterNetworkEdge(Object): +class ParameterNetworkEdge(Object): # pylint: disable=C0115 __component__ = 'ParameterNetwork' class Parameter(Object): + """ + The Parameter class provides support for interpreting a parameter's string values as typed + data and reading it accordingly. See `Value` below. 
+ """ @functools.cached_property - def columnSpec(self): - columnSpec = getattr(self.cldf, 'columnSpec', None) + def columnSpec(self) -> Optional[csvw.metadata.Column]: # pylint: disable=C0103 + """Turns a JSON column specification in a column value into a Column object.""" + columnSpec = getattr(self.cldf, 'columnSpec', None) # pylint: disable=C0103 if columnSpec: return csvw.metadata.Column.fromvalue(columnSpec) + return None @functools.cached_property - def datatype(self): + def datatype(self) -> Optional[csvw.metadata.Datatype]: + """Turns a JSON datatype description in a column value into a Datatype object.""" if 'datatype' in self.data \ and self.dataset['ParameterTable', 'datatype'].datatype.base == 'json': if self.data['datatype']: return csvw.metadata.Datatype.fromvalue(self.data['datatype']) + return None @property - def codes(self): + def codes(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('CodeTable') if v.parameter == self) @property - def values(self): + def values(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.parameters) @property - def forms(self): + def forms(self): # pylint: disable=C0116 return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.parameters) def concepticon_conceptset(self, concepticon_api): @@ -484,17 +505,17 @@ def concepticon_conceptset(self, concepticon_api): return concepticon_api.conceptsets.get(self.cldf.concepticonReference) -class Sense(Object): +class Sense(Object): # pylint: disable=C0115 @property - def entry(self): + def entry(self): # pylint: disable=C0116 return self.related('entryReference') @property - def entries(self): + def entries(self): # pylint: disable=C0116 return self.all_related('entryReference') -class Tree(Object): +class Tree(Object): # pylint: disable=C0115 pass @@ -530,6 +551,10 @@ class Value(Object, _WithLanguageMixin, _WithParameterMixin): """ @property def typed_value(self): + """ + If a parameter includes information about the datatype of its values, this information is + used here to convert the value accordingly. + """ if self.parameter.columnSpec: return self.parameter.columnSpec.read(self.cldf.value) if self.parameter.datatype: @@ -537,9 +562,9 @@ def typed_value(self): return self.cldf.value @property - def code(self): + def code(self): # pylint: disable=C0116 return self.related('codeReference') @property - def examples(self): + def examples(self): # pylint: disable=C0116 return self.all_related('exampleReference') diff --git a/src/pycldf/schemautil.py b/src/pycldf/schemautil.py new file mode 100644 index 0000000..27e8330 --- /dev/null +++ b/src/pycldf/schemautil.py @@ -0,0 +1,52 @@ +""" +Functionality to create schema objects. +""" +from typing import Union + +from csvw.metadata import Column, Table +from clldutils import jsonlib + +from pycldf.terms import TERMS +from pycldf.util import MD_SUFFIX, pkg_path + +ColSpecType = Union[str, dict, Column] +TableSpecType = Union[str, dict, Table] +TableType = Union[str, Table] +ColType = Union[str, Column] + + +def make_column(spec: ColSpecType) -> Column: + """ + Create a `Column` instance from `spec`. + + .. 
code-block:: python + + >>> make_column('id').name + 'id' + >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name + 'ID' + >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base + 'boolean' + >>> type(make_column(make_column('id'))) + + """ + if isinstance(spec, str): + if spec in TERMS.by_uri: + return TERMS.by_uri[spec].to_column() + return Column(name=spec, datatype='string') + if isinstance(spec, dict): + return Column.fromvalue(spec) + if isinstance(spec, Column): + return spec + raise TypeError(spec) + + +def make_table(spec: TableSpecType) -> Table: + """Create a `Table` instance from `spec`.""" + if isinstance(spec, str): + return Table.fromvalue(jsonlib.load(pkg_path('components', f'{spec}{MD_SUFFIX}'))) + if isinstance(spec, dict): + return Table.fromvalue(spec) + if isinstance(spec, Table): + return spec + raise TypeError(spec) # pragma: no cover diff --git a/src/pycldf/sliceutil.py b/src/pycldf/sliceutil.py new file mode 100644 index 0000000..67c2d2a --- /dev/null +++ b/src/pycldf/sliceutil.py @@ -0,0 +1,55 @@ +""" +This module provides a flexible implementation of slicing sequences, based on Python's slices. + +In addition to Python's way of specifying slices as triples of integers (start, stop, step), we +allow specification as strings like '1' or '2:5', where the numbers are interpreted as **1-based** +indices, specifying **inclusive** boundaries. I.e. '2:5' is equivalent to `slice(1:5).` +""" +from typing import Union, TypeVar +import itertools +from collections.abc import Sequence, Iterable + +__all__ = ['multislice', 'multislice_with_split'] + +T = TypeVar('T') +SliceType = Union[str, tuple[int], tuple[int, int], tuple[int, int, int], slice] + + +def multislice(sliceable: Sequence[T], *slices: SliceType) -> Sequence[T]: + """ + .. code-block:: python + + >>> import string + >>> multislice(list(range(30)), '3:7', '9', (12, 18, 3)) + [2, 3, 4, 5, 6, 8, 12, 15] + >>> multislice(string.ascii_lowercase, '3:7', '9', (12, 18, 3)) + 'cdefgimp' + """ + res = type(sliceable)() + for sl in slices: + if isinstance(sl, str): + if ':' in sl: + assert sl.count(':') <= 2, f'String slice spec may only have two colons. {sl}' + sl = slice(*[int(s) - (1 if i == 0 else 0) for i, s in enumerate(sl.split(':'))]) + else: + sl = slice(*[int(sl) - 1, int(sl)]) + elif isinstance(sl, int): + sl = slice(sl, sl + 1) + elif isinstance(sl, (tuple, list)): + sl = slice(*sl) + else: + assert isinstance(sl, slice) + res += sliceable[sl] + return res + + +def multislice_with_split(sliceable: Sequence[T], slices: Iterable[SliceType]) -> list[T]: + """ + Resolves multislices and then applies splitting on whitespace to each item. + + .. code-block:: python + + >>> multislice_with_split(['a', 'b', 'c d', 'f', 'g'], [(2, 4)]) + ['c', 'd', 'f'] + """ + return list(itertools.chain(*[s.split() for s in multislice(sliceable, *slices)])) diff --git a/src/pycldf/sources.py b/src/pycldf/sources.py index 1d8ec7a..946b308 100644 --- a/src/pycldf/sources.py +++ b/src/pycldf/sources.py @@ -1,12 +1,16 @@ +""" +Functionality to handle BibTeX source data of Datasets. 
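A small usage sketch (the BibTeX file name, citation key and page numbers are illustrative):

.. code-block:: python

    >>> from pycldf.sources import Sources
    >>> sources = Sources.from_file('sources.bib')
    >>> Sources.parse('Meier2005[12-25]')
    ('Meier2005', '12-25')
    >>> print(sources['Meier2005'].refkey(year_brackets='square'))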
+""" import re import types -import typing +from typing import Optional, Union, Literal import pathlib import zipfile import tempfile import collections from urllib.error import HTTPError from urllib.request import urlopen, urlretrieve +from collections.abc import Generator, Iterable, KeysView from csvw.metadata import is_url from simplepybtex import database @@ -14,7 +18,8 @@ from clldutils.source import Source as BaseSource from clldutils.source import ID_PATTERN -from pycldf.util import update_url +from pycldf.urlutil import update_url +from pycldf.fileutil import PathType __all__ = ['Source', 'Sources', 'Reference'] @@ -22,13 +27,14 @@ class Writer(BaseWriter): + """We overwrite pybtex's writer to ensure data is wrapped in curly braces.""" def quote(self, s): self.check_braces(s) return '{%s}' % s def _encode(self, text): # - # FIXME: We overwrite a private method here! + # FIXME: We overwrite a private method here! pylint: disable=fixme # return text @@ -38,7 +44,8 @@ class Source(BaseSource): A bibliograhical record, specifying a source for some data in a CLDF dataset. """ @property - def entry(self): + def entry(self) -> database.Entry: + """Converts Source to a pybtex Entry.""" persons = collections.OrderedDict([ ('author', list(self.persons(self.get('author', '')))), ('editor', list(self.persons(self.get('editor', '')))), @@ -53,10 +60,10 @@ def __str__(self): return self.text() def __repr__(self): - return '<%s %s>' % (self.__class__.__name__, self.id) + return f'<{self.__class__.__name__} {self.id}>' @classmethod - def from_entry(cls, key, entry, **_kw): + def from_entry(cls, key: str, entry: database.Entry, **_kw): """ Create a `cls` instance from a `simplepybtex` entry object. @@ -65,15 +72,16 @@ def from_entry(cls, key, entry, **_kw): :param _kw: Non-bib-metadata keywords to be passed for `cls` instantiation :return: `cls` instance """ - _kw.update({k: v for k, v in entry.fields.items()}) + _kw.update(entry.fields.items()) _kw.setdefault('_check_id', False) for role in entry.persons: if entry.persons[role]: - _kw[role] = ' and '.join('%s' % p for p in entry.persons[role]) + _kw[role] = ' and '.join(f'{p}' for p in entry.persons[role]) return cls(entry.type, key, **_kw) @staticmethod - def persons(s): + def persons(s: str) -> Generator[database.Person, None, None]: + """Yields persons encoded in an author names string.""" for name in re.split(r'\s+&\s+|\s+and\s+', s.strip()): if name: parts = name.split(',') @@ -83,26 +91,31 @@ def persons(s): else: yield database.Person(name) - def refkey(self, year_brackets='round'): - brackets = {None: ('', ''), 'round': ('(', ')'), 'square': ('[', ']'), 'curly': ('{', '}')} + def refkey(self, year_brackets: Union[None, Literal["round", "square", "curly"]] = 'round'): + """Compute an author-year type reference key for the item.""" + brackets = { + None: ('', ''), + 'round': ('(', ')'), + 'square': ('[', ']'), + 'curly': ('{', '}')}.get(year_brackets) persons = self.entry.persons.get('author') or self.entry.persons.get('editor', []) - s = ' '.join(persons[0].prelast_names + persons[0].last_names) if persons else 'n.a.' + names = ' '.join(persons[0].prelast_names + persons[0].last_names) if persons else 'n.a.' if len(persons) == 2: - s += ' and {}'.format(' '.join(persons[1].last_names)) + names += f" and {' '.join(persons[1].last_names)}" elif len(persons) > 2: - s += ' et al.' 
- return s.replace('{', '').replace('}', '') + ' {}{}{}'.format( - brackets[year_brackets][0], self.get('year', 'n.d.'), brackets[year_brackets][1]) + names += ' et al.' + names = names.replace('{', '').replace('}', '') + return f"{names} {brackets[0]}{self.get('year', 'n.d.')}{brackets[1]}" -class Reference(object): +class Reference: """ A reference connects a piece of data with a `Source`, typically adding some citation context \ often page numbers, or similar. """ - def __init__(self, source: Source, desc: typing.Union[str, None]): + def __init__(self, source: Source, desc: Optional[str]): if desc and ('[' in desc or ']' in desc or ';' in desc): - raise ValueError('invalid ref description: %s' % desc) + raise ValueError(f'invalid ref description: {desc}') self.source = source self.fields = types.SimpleNamespace(**self.source) if isinstance(self.source, dict) else {} self.description = desc @@ -115,14 +128,14 @@ def __str__(self): """ res = self.source.id if hasattr(self.source, 'id') else self.source if self.description: - res += '[%s]' % self.description + res += f'[{self.description}]' return res def __repr__(self): - return '<%s %s>' % (self.__class__.__name__, self) + return f'<{self.__class__.__name__} {self}>' -class Sources(object): +class Sources: """ A `dict` like container for all sources linked to data in a CLDF dataset. """ @@ -130,16 +143,17 @@ def __init__(self): self._bibdata = database.BibliographyData() @classmethod - def from_file(cls, fname): + def from_file(cls, fname: PathType) -> 'Sources': + """Instantiate an instance from the data in a BibTeX file.""" zipped = False res = cls() - if not is_url(fname): + if not is_url(str(fname)): fname = pathlib.Path(fname) if not fname.exists(): - fname = fname.parent / '{}.zip'.format(fname.name) + fname = fname.parent / f'{fname.name}.zip' zipped = True if fname.exists(): - assert fname.is_file(), 'Bibfile {} must be a file!'.format(fname) + assert fname.is_file(), f'Bibfile {fname} must be a file!' res.read(fname, zipped=zipped) else: res.read(fname) @@ -150,34 +164,34 @@ def __bool__(self): __nonzero__ = __bool__ - def keys(self): + def keys(self) -> KeysView[str]: # pylint: disable=C0116 return self._bibdata.entries.keys() - def items(self): + def items(self) -> Generator[Source, None, None]: # pylint: disable=C0116 for key, entry in self._bibdata.entries.items(): yield Source.from_entry(key, entry) def __iter__(self): return self.items() - def __len__(self): + def __len__(self) -> int: return len(self._bibdata.entries) - def __getitem__(self, item): + def __getitem__(self, item: str) -> Optional[Source]: try: return Source.from_entry(item, self._bibdata.entries[item]) - except KeyError: - raise ValueError('missing citekey: %s' % item) + except KeyError as e: + raise ValueError(f'missing citekey: {item}') from e - def __contains__(self, item): + def __contains__(self, item: str) -> bool: return item in self._bibdata.entries @staticmethod - def format_refs(*refs): - return ['%s' % ref for ref in refs] + def format_refs(*refs) -> list[str]: # pylint: disable=C0116 + return [f'{ref}' for ref in refs] @staticmethod - def parse(ref: str) -> typing.Tuple[str, str]: + def parse(ref: str) -> tuple[str, str]: """ Parse the string representation of a reference into source ID and context. @@ -191,14 +205,15 @@ def parse(ref: str) -> typing.Tuple[str, str]: pages = pages[:-1].strip() return sid, pages - def validate(self, refs): + def validate(self, refs: Union[str, list[str]]) -> None: + """Make sure refs are valid. 
If not, raises Exceptions.""" if not isinstance(refs, str) and any(r is None for r in refs): raise ValueError('empty reference in ref list (possibly caused by trailing separator)') for sid, _ in map(self.parse, [refs] if isinstance(refs, str) else refs): if sid not in self.keys(): - raise ValueError('missing source key: {0}'.format(sid)) + raise ValueError(f'missing source key: {sid}') - def expand_refs(self, refs: typing.Iterable[str], **kw) -> typing.Iterable[Reference]: + def expand_refs(self, refs: Iterable[str], **kw) -> Iterable[Reference]: """ Turn a list of string references into proper :class:`Reference` instances, looking up \ sources in `self`. @@ -217,7 +232,7 @@ def expand_refs(self, refs: typing.Iterable[str], **kw) -> typing.Iterable[Refer self._add_entries(Source('misc', sid, glottolog_id=sid), **kw) yield Reference(self[sid], pages) - def _add_entries(self, data, **kw): + def _add_entries(self, data: Union[Source, database.BibliographyData], **kw) -> None: if isinstance(data, Source): entries = [(data.id, data.entry)] elif hasattr(data, 'entries'): @@ -232,17 +247,20 @@ def _add_entries(self, data, **kw): for key, entry in entries: if kw.get('_check_id', False) and not ID_PATTERN.match(key): - raise ValueError('invalid source ID: %s' % key) + raise ValueError(f'invalid source ID: {key}') if key not in self._bibdata.entries: try: self._bibdata.add_entry(key, entry) except database.BibliographyDataError as e: # pragma: no cover - raise ValueError('%s' % e) + raise ValueError(f'{e}') from e - def read(self, fname, zipped=False, **kw): - if is_url(fname): + def read(self, fname: PathType, zipped=False, **kw): + """Read sources from a BibTex file (possibly specified via URL).""" + if is_url(str(fname)): + fname = str(fname) try: - content = urlopen(fname).read().decode('utf-8') + with urlopen(fname) as f: + content = f.read().decode('utf-8') except HTTPError as e: if '404' in str(e): fname = update_url( @@ -254,14 +272,15 @@ def read(self, fname, zipped=False, **kw): content = zf.read(zf.namelist()[0]).decode('utf8') else: if zipped: - with zipfile.ZipFile(fname, 'r') as zf: + with zipfile.ZipFile(str(fname), 'r') as zf: content = zf.read(zf.namelist()[0]).decode('utf8') else: content = pathlib.Path(fname).read_text(encoding='utf-8') self._add_entries( database.parse_string(content, bib_format='bibtex'), **kw) - def write(self, fname, ids=None, zipped=False, **kw): + def write(self, fname: PathType, ids=None, zipped=False, **_) -> Optional[pathlib.Path]: + """Write sources to a file (if there are any).""" if ids: bibdata = database.BibliographyData() for key, entry in self._bibdata.entries.items(): @@ -269,19 +288,21 @@ def write(self, fname, ids=None, zipped=False, **kw): bibdata.add_entry(key, entry) else: bibdata = self._bibdata + fname = pathlib.Path(fname) if bibdata.entries: - with pathlib.Path(fname).open('w', encoding='utf8') as fp: + with fname.open('w', encoding='utf8') as fp: Writer().write_stream(bibdata, fp) if zipped: with zipfile.ZipFile( - fname.parent / '{}.zip'.format(fname.name), + fname.parent / f'{fname.name}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf: zf.write(fname, fname.name) fname.unlink() return fname + return None - def add(self, *entries: typing.Union[str, Source], **kw): + def add(self, *entries: Union[str, Source], **kw) -> None: """ Add a source, either specified as BibTeX string or as :class:`Source`. 
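        A sketch (the BibTeX record and keys below are made up):

        .. code-block:: python

            >>> from pycldf.sources import Source, Sources
            >>> srcs = Sources()
            >>> srcs.add('@book{Doe2020, title={A Grammar}, author={Doe, Jane}, year={2020}}')
            >>> srcs.add(Source('misc', 'Smith2021', title='Field notes'))
            >>> 'Doe2020' in srcs
            True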
""" diff --git a/src/pycldf/stats.py b/src/pycldf/stats.py new file mode 100644 index 0000000..5f40839 --- /dev/null +++ b/src/pycldf/stats.py @@ -0,0 +1,40 @@ +""" +Functionality to compute summary statistics for a Dataset. +""" +import typing +import dataclasses +from collections.abc import Generator + +from pycldf.terms import TERMS + +if typing.TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['get_table_stats'] + + +def get_table_stats(ds: 'Dataset', exact: bool = False) -> list[tuple[str, str, int]]: + """Return a list of table statistics.""" + return [dataclasses.astuple(stats) for stats in _iter_stats(ds, exact)] + + +@dataclasses.dataclass(frozen=True) +class TableStats: + """A bag of attrs""" + fname: str + component: str + rowcount: int + + +def _iter_stats(ds: 'Dataset', exact: bool = False) -> Generator[TableStats, None, None]: + for table in ds.tables: + dctype = table.common_props.get('dc:conformsTo') + if dctype and '#' in dctype and dctype.split('#')[1] in TERMS: + dctype = TERMS[dctype.split('#')[1]].csvw_prop('name') + yield TableStats( + table.url.string, + dctype or '', + sum(1 for _ in table) if (exact or 'dc:extent' not in table.common_props) + else int(table.common_props.get('dc:extent'))) + if ds.sources: + yield TableStats(ds.bibname, 'Sources', len(ds.sources)) diff --git a/src/pycldf/terms.py b/src/pycldf/terms.py index 83c914a..18b5a85 100644 --- a/src/pycldf/terms.py +++ b/src/pycldf/terms.py @@ -1,16 +1,27 @@ +""" +Functionality to access the metadata about CLDF schema objects encoded in the ontology. +""" import re import json import types +import pathlib import warnings +import dataclasses import urllib.parse +from typing import Optional, Union, Callable, Any, TYPE_CHECKING, Literal, get_args +from collections.abc import Container from xml.etree import ElementTree -import attr from csvw.metadata import Column +from clldutils import jsonlib from pycldf.util import pkg_path +from pycldf.fileutil import PathType -__all__ = ['term_uri', 'TERMS', 'get_column_names'] +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + +__all__ = ['term_uri', 'TERMS', 'get_column_names', 'sniff'] URL = 'http://cldf.clld.org/v1.0/terms.rdf' RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' @@ -18,68 +29,89 @@ CSVW = 'http://www.w3.org/ns/csvw#' DC = 'http://purl.org/dc/terms/' - -def term_uri(name, terms=None, ns=URL): - if isinstance(name, Term): - return name.uri - if not name.startswith(ns): - sep = '' if ns.endswith('#') else '#' - name = sep.join([ns, name]) - if not terms or name in terms: - return name - return None +TermType = Literal['Class', 'Property'] +CardinalityType = Literal['singlevalued', 'multivalued'] -def qname(ns, lname): +def qname(ns: str, lname: str) -> str: + """Return a qualified name in ElementTree notation.""" return '{%s}%s' % (ns, lname) -def _get(e, subelementns, subelementlname, attrns=None, attrlname=None, converter=None): +@dataclasses.dataclass +class NameSpec: # pylint: disable=C0115 + ns: str + lname: str + + @property + def qname(self): # pylint: disable=C0116 + return qname(self.ns, self.lname) + + +def _get( + e: ElementTree.Element, + subelement: NameSpec, + attrib: Optional[NameSpec] = None, + converter: Optional[Callable[[str], Any]] = None, +): """ :return: Text content or attribute value of a subelement of e. 
""" res = None - subelement = e.find(qname(subelementns, subelementlname)) + subelement = e.find(subelement.qname) if subelement is not None: - if not attrlname: + if not attrib: res = subelement.text else: - res = subelement.attrib[qname(attrns, attrlname)] + res = subelement.attrib[attrib.qname] if converter and res: res = converter(res) return res -@attr.s -class Term(object): - name = attr.ib() - type = attr.ib(validator=attr.validators.in_(['Class', 'Property'])) - element = attr.ib() - references = attr.ib(default=None) - subtype = attr.ib(default=None) - version = attr.ib(default=None, validator=attr.validators.matches_re(r'v[0-9]+(\.[0-9]+)+')) - cardinality = attr.ib( - default=None, - validator=attr.validators.optional(attr.validators.in_(['singlevalued', 'multivalued']))) +@dataclasses.dataclass +class Term: + """A Term is an object described in the CLDF Ontology.""" + name: str + type: TermType + element: ElementTree.Element + references: Optional[str] = None + subtype: Optional[str] = None + version: Optional[str] = None + cardinality: Optional[CardinalityType] = None + + def __post_init__(self): + assert self.type in get_args(TermType) + if self.version: + assert re.fullmatch(r'v[0-9]+(\.[0-9]+)+', self.version) + if self.cardinality: + assert self.cardinality in get_args(CardinalityType) @property - def uri(self): - return '{0}#{1}'.format(URL, self.name) + def uri(self) -> str: + """The Term URI.""" + return f'{URL}#{self.name}' @classmethod - def from_element(cls, e): - subClassOf = e.find(qname(RDFS, 'subClassOf')) - kw = dict( - name=e.attrib[qname(RDF, 'about')].split('#')[1], - version=_get( - e, DC, 'hasVersion', RDF, 'resource', + def from_element(cls, e: ElementTree.Element) -> 'Term': + """Instantiate a Term from an XML element parsed from the ontology.""" + subClassOf = e.find(qname(RDFS, 'subClassOf')) # pylint: disable=invalid-name + kw = { + 'name': e.attrib[qname(RDF, 'about')].split('#')[1], + 'version': _get( + e, + NameSpec(ns=DC, lname='hasVersion'), + attrib=NameSpec(ns=RDF, lname='resource'), converter=lambda s: 'v' + s.split('/v')[1].replace('/', '')) or 'v1.0', - type=e.tag.split('}')[1], - element=e, - cardinality=_get(e, DC, 'extent'), - references=_get( - e, DC, 'references', RDF, 'resource', converter=lambda s: s.split('#')[1]), - ) + 'type': e.tag.split('}')[1], + 'element': e, + 'cardinality': _get(e, NameSpec(ns=DC, lname='extent')), + 'references': _get( + e, + NameSpec(ns=DC, lname='references'), + attrib=NameSpec(ns=RDF, lname='resource'), + converter=lambda s: s.split('#')[1]), + } if kw['type'] == 'Class': kw['subtype'] = 'module' \ if subClassOf is not None \ @@ -87,10 +119,12 @@ def from_element(cls, e): 'http://www.w3.org/ns/dcat#Distribution' else 'component' return cls(**kw) - def csvw_prop(self, lname): - return _get(self.element, CSVW, lname, converter=lambda s: json.loads(s)) + def csvw_prop(self, lname: str) -> Any: + """Returns the JSON value of a property in the CSVW namespace.""" + return _get(self.element, NameSpec(ns=CSVW, lname=lname), converter=json.loads) - def to_column(self): + def to_column(self) -> Column: + """Returns a csvw Column instance configured according to the term spec.""" col = Column( name=self.csvw_prop('name') or self.element.find(qname(RDFS, 'label')).text, propertyUrl=self.element.attrib[qname(RDF, 'about')], @@ -101,7 +135,10 @@ def to_column(self): setattr(col, k, v) return col - def comment(self, one_line=False): + def comment(self, one_line=False) -> str: + """ + Parse a text comment from the XML 
element of the ontology. + """ c = self.element.find("{http://www.w3.org/2000/01/rdf-schema#}comment") try: xml = ElementTree.tostring(c, default_namespace='http://www.w3.org/1999/xhtml') @@ -116,17 +153,24 @@ def comment(self, one_line=False): return re.sub(r'\s+', ' ', res.replace('\n', ' ')) if one_line else res +TermDict = dict[str, Term] + + class Terms(dict): - def __init__(self, path=None): + """ + A dict of `Term`s keyed by local names. + """ + def __init__(self, path: Optional[PathType] = None): self._path = path or pkg_path('terms.rdf') r = ElementTree.parse(str(self._path)).getroot() terms = [Term.from_element(e) for e in r.findall(qname(RDF, 'Property'))] for e in r.findall(qname(RDFS, 'Class')): terms.append(Term.from_element(e)) dict.__init__(self, {t.name: t for t in terms}) - self.by_uri = {t.uri: t for t in terms} + self.by_uri: TermDict = {t.uri: t for t in terms} - def is_cldf_uri(self, uri): + def is_cldf_uri(self, uri: str) -> bool: + """Whether the given URL is a CLDF Ontology term URI.""" if uri and urllib.parse.urlparse(uri).netloc == 'cldf.clld.org': if uri not in self.by_uri: warnings.warn('If pycldf does not recognize valid CLDF URIs, You may be ' @@ -137,44 +181,103 @@ def is_cldf_uri(self, uri): return False @property - def properties(self): + def properties(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.type == 'Property'} @property - def classes(self): + def classes(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.type == 'Class'} @property - def modules(self): + def modules(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.subtype == 'module'} @property - def components(self): + def components(self) -> TermDict: # pylint: disable=C0116 return {k: v for k, v in self.items() if v.subtype == 'component'} +def term_uri(name: Union[Term, str], terms: Container[str] = None, ns: str = URL) -> Optional[str]: + """ + Returns a full term URI associated with `name`. + + If `terms` are provided, we make sure the URI is contained in `terms`. + """ + if isinstance(name, Term): + return name.uri + if not name.startswith(ns): # So this may be a local name, i.e. the fragment of a term URI. + sep = '' if ns.endswith('#') else '#' + name = sep.join([ns, name]) + if not terms or name in terms: + return name + return None + + TERMS = Terms() -def get_column_names(dataset, use_component_names=False, with_multiplicity=False): +def get_column_names( + dataset: 'Dataset', + use_component_names: bool = False, + with_multiplicity: bool = False, +) -> types.SimpleNamespace: + """ + Returns an object allowing programmatic access to the column names used for ontology terms + in a specific dataset. + + .. code-block:: python + + >>> from pycldf import Dataset + >>> from pycldf.terms import get_column_names + >>> ds = Dataset.from_metadata('tests/data/ds1.csv-metadata.json') + >>> res = get_column_names(ds, use_component_names=True) + >>> res.ValueTable.languageReference + 'Language_ID' + """ comp_names = { k: k if use_component_names else k.replace('Table', '').lower() + 's' for k in TERMS.components} + # Seed the result object with component names as attributes and None as value. 
name_map = types.SimpleNamespace(**{k: None for k in comp_names.values()}) for term, attr_ in comp_names.items(): - try: - table = dataset[term] + table = dataset.get(term) + if table: props = {} - for k in TERMS.properties: - try: - col = dataset[table, k] + for k in TERMS.properties: # Loop through properties in the ontology. + col = dataset.get((table, k)) + if col: if with_multiplicity: props[k] = (col.name, bool(col.separator)) else: props[k] = col.name - except KeyError: + else: props[k] = None setattr(name_map, attr_, types.SimpleNamespace(**props)) - except KeyError: - pass return name_map + + +def sniff(p: pathlib.Path) -> bool: + """ + Determine whether a file contains CLDF metadata. + + :param p: `pathlib.Path` object for an existing file. + :return: `True` if the file contains CLDF metadata, `False` otherwise. + """ + if not p.is_file(): # pragma: no cover + return False + try: + with p.open('rb') as fp: + c = fp.read(10) + try: + c = c.decode('utf8').strip() + except UnicodeDecodeError: + return False + if not c.startswith('{'): + return False + except (FileNotFoundError, OSError): # pragma: no cover + return False + try: + d = jsonlib.load(p) + except json.decoder.JSONDecodeError: + return False + return d.get('dc:conformsTo', '').startswith(URL) diff --git a/src/pycldf/trees.py b/src/pycldf/trees.py index 3a81c76..b939b29 100644 --- a/src/pycldf/trees.py +++ b/src/pycldf/trees.py @@ -21,17 +21,20 @@ ├─l3 └─l4 """ -import typing -import logging +from typing import TYPE_CHECKING, Optional import pathlib +from collections.abc import Generator -from clldutils.misc import log_or_raise from commonnexus import Nexus import newick +from csvw.metadata import Table, Column -import pycldf from pycldf.media import MediaTable, File +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover + from pycldf.dataset import RowType # pragma: no cover + from pycldf.validators import DatasetValidator # pragma: no cover __all__ = ['Tree', 'TreeTable'] @@ -40,17 +43,17 @@ class Tree: """ Represents a tree object as specified in a row of `TreeTable`. """ - def __init__(self, trees: 'TreeTable', row: dict, file: File): - self.row = row - self.id = row[trees.cols['id'].name] - self.name = row[trees.cols['name'].name] - self.file = file + def __init__(self, trees: 'TreeTable', row: 'RowType', file: File): + self.row: 'RowType' = row + self.id: str = row[trees.cols['id'].name] + self.name: str = row[trees.cols['name'].name] + self.file: File = file for prop in ['description', 'treeType', 'treeIsRooted', 'treeBranchLengthUnit']: attrib = ''.join('_' + c.lower() if c.isupper() else c for c in prop) setattr(self, attrib, row.get(trees.cols[prop].name) if trees.cols[prop] else None) self.trees = trees - def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str: + def newick_string(self, d: Optional[pathlib.Path] = None) -> str: """ Retrieve the Newick representation of the tree from the associated tree file. @@ -58,21 +61,19 @@ def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str: :meth:`pycldf.media.File.save`. :return: Newick representation of the associated tree. 
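        Sketch of typical use (assuming ``ds`` is a :class:`pycldf.Dataset` with a TreeTable
        and a MediaTable providing Newick or NEXUS tree files):

        .. code-block:: python

            >>> from pycldf.trees import TreeTable
            >>> for tree in TreeTable(ds):
            ...     print(tree.name)
            ...     print(tree.newick().ascii_art())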
""" - if self.file.id not in self.trees._parsed_files: + if self.file.id not in self.trees.parsed_files: content = self.file.read(d=d) if self.file.mimetype == 'text/x-nh': - self.trees._parsed_files[self.file.id] = { + self.trees.parsed_files[self.file.id] = { # pylint: disable=protected-access str(index): nwk for index, nwk in enumerate( [t.strip() for t in content.split(';') if t.strip()], start=1)} else: - self.trees._parsed_files[self.file.id] = { + self.trees.parsed_files[self.file.id] = { # pylint: disable=protected-access tree.name: tree.newick_string for tree in Nexus(content).TREES.trees} - return self.trees._parsed_files[self.file.id][self.name] + return self.trees.parsed_files[self.file.id][self.name] # pylint: disable=protected-access - def newick(self, - d: typing.Optional[pathlib.Path] = None, - strip_comments: bool = False) -> newick.Node: + def newick(self, d: Optional[pathlib.Path] = None, strip_comments: bool = False) -> newick.Node: """ Retrieve a `newick.Node` instance for the tree from the associated tree file. @@ -85,47 +86,44 @@ def newick(self, return newick.loads(self.newick_string(d=d), strip_comments=strip_comments)[0] -class TreeTable(pycldf.ComponentWithValidation): +class TreeTable: """ Container class for a `Dataset`'s TreeTable. """ - def __init__(self, ds: pycldf.Dataset): - super().__init__(ds) - self.media = MediaTable(ds) - self.media_rows = {row[self.media.id_col.name]: row for row in ds['MediaTable']} - self.cols = { + def __init__(self, ds: 'Dataset'): + self.ds: 'Dataset' = ds + self.component: str = self.__class__.__name__ + self.table: Table = ds[self.component] + self.media: MediaTable = MediaTable(ds) + self.media_rows: dict[str, 'RowType'] = { + row[self.media.id_col.name]: row for row in ds['MediaTable']} + self.cols: dict[str, Optional[Column]] = { prop: self.ds.get((self.table, prop)) for prop in [ 'id', 'name', 'description', 'mediaReference', 'treeIsRooted', 'treeType', 'treeBranchLengthUnit']} # Since reading and parsing tree files is expensive, we cache them. - self._parsed_files = {} + self.parsed_files: dict[str, dict[str, str]] = {} - def __iter__(self) -> typing.Generator[Tree, None, None]: + def __iter__(self) -> Generator[Tree, None, None]: for row in self.table: yield Tree( self, row, File(self.media, self.media_rows[row[self.cols['mediaReference'].name]])) - def validate(self, - success: bool = True, - log: logging.Logger = None) -> bool: + def validate(self, validator: 'DatasetValidator'): + """ + Makes sure Newick representations of trees are available and only reference valid languages. + """ lids = {r['id'] for r in self.ds.iter_rows('LanguageTable', 'id')} for tree in self: try: nwk = tree.newick() except KeyError: - log_or_raise( - 'No newick tree found for name "{}"'.format(tree.name), - log=log) - success = False + validator.fail(f'No newick tree found for name "{tree.name}"') nwk = None if nwk: for node in nwk.walk(): if node.name and (node.name not in lids): - log_or_raise( - 'Newick node label "{}" is not a LanguageTable ID'.format(node.name), - log=log) - success = False - return success + validator.fail(f'Newick node label "{node.name}" is not a LanguageTable ID') diff --git a/src/pycldf/urlutil.py b/src/pycldf/urlutil.py new file mode 100644 index 0000000..66715f4 --- /dev/null +++ b/src/pycldf/urlutil.py @@ -0,0 +1,37 @@ +""" +Functionality to manipulate URLs. 
+""" +from typing import Callable, Union +import urllib.parse + +__all__ = ['update_url', 'sanitize_url', 'url_without_fragment'] + + +def update_url( + url: Union[str, urllib.parse.ParseResult], + updater: Callable[[urllib.parse.ParseResult], tuple[str, str, str, str, str]], +) -> Union[str, None]: + """Generic update function for URLs.""" + if url is None: + return None + if isinstance(url, str): + url = urllib.parse.urlparse(url) + return urllib.parse.urlunsplit(updater(url)) or None + + +def sanitize_url(url: str) -> str: + """ + Removes auth credentials from a URL. + """ + def fix(u): + host = u.hostname + if u.port: + host += f':{u.port}' + return (u.scheme, host, u.path, u.query, u.fragment) + + return update_url(url, fix) + + +def url_without_fragment(url: Union[str, urllib.parse.ParseResult]) -> str: + """Removes fragment from URL.""" + return update_url(url, lambda u: (u.scheme, u.hostname, u.path, u.query, '')) diff --git a/src/pycldf/util.py b/src/pycldf/util.py index c15626b..e09536f 100644 --- a/src/pycldf/util.py +++ b/src/pycldf/util.py @@ -1,80 +1,66 @@ -import re -import html -import math -import string -import typing +""" +The mixed bag of utility functions and classes of the pycldf package ... +""" +import shutil +from typing import Optional, TYPE_CHECKING, Any, Union import pathlib -import itertools import collections import urllib.parse +import urllib.request +from collections.abc import Generator -from clldutils.misc import slug -import pycldf +from csvw.metadata import is_url, Link, Column, Table, Schema, URITemplate +from clldutils.path import git_describe -__all__ = [ - 'pkg_path', 'multislice', 'resolve_slices', 'DictTuple', 'metadata2markdown', 'qname2url', - 'sanitize_url', 'update_url', 'iter_uritemplates', 'url_without_fragment', - 'splitfile', 'catfile'] +from pycldf.fileutil import PathType +from pycldf.urlutil import sanitize_url +if TYPE_CHECKING: + from pycldf import Dataset # pragma: no cover -def splitfile(p, chunksize: int, total: typing.Optional[int] = None) -> typing.List[pathlib.Path]: - """ - :param p: Path of the file to split. - :param chunksize: The maximal size of the chunks the file will be split into. - :param total: The size of the input file. - :return: The list of paths of files that the input has been split into. - """ - total = total or p.stat().st_size - if total <= chunksize: # Nothing to do. - return [p] - nchunks = math.ceil(total / chunksize) - suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3 - suffixes = [ - ''.join(t) for t in - itertools.combinations_with_replacement(string.ascii_lowercase, suffix_length)] - - res = [] - with p.open('rb') as f: - chunk = f.read(chunksize) - while chunk: - pp = p.parent.joinpath('{}.{}'.format(p.name, suffixes.pop(0))) - pp.write_bytes(chunk) - res.append(pp) - chunk = f.read(chunksize) # read the next chunk +__all__ = [ + 'pkg_path', 'DictTuple', 'qname2url', 'iter_uritemplates', 'MD_SUFFIX', 'GitRepository'] - p.unlink() - return res +MD_SUFFIX = '-metadata.json' -def catfile(p: pathlib.Path) -> bool: +class GitRepository: # pylint: disable=too-few-public-methods """ - Restore a file that has been split into chunks. - - We determine if a file has been split by looking for files in the parent directory with suffixes - as created by `splitfile`. + CLDF datasets are often created from data curated in git repositories. If this is the case, we + exploit this to provide better provenance information in the dataset's metadata. """ - if p.exists(): # Nothing to do. 
- return False - # Check, whether the file has been split. - suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name} - if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes): - # ok, let's concatenate the files: - with p.open('wb') as f: - for suffix in sorted(suffixes): - if re.fullmatch(r'\.[a-z]{2,3}', suffix): - f.write(suffixes[suffix].read_bytes()) - suffixes[suffix].unlink() - return True - return False # pragma: no cover - + def __init__( + self, url: str, clone: Optional[PathType] = None, version: Optional[str] = None, **dc): + # We remove credentials from the URL immediately to make sure this isn't leaked into + # CLDF metadata. Such credentials might be present in URLs read via gitpython from + # remotes. + self.url = sanitize_url(url) + self.clone = clone + self.version = version + self.dc = dc + + def json_ld(self) -> collections.OrderedDict[str, Any]: + """The repository described in JSON-LD.""" + res = collections.OrderedDict([ + ('rdf:about', self.url), + ('rdf:type', 'prov:Entity'), + ]) + if self.version: + res['dc:created'] = self.version + elif self.clone: + res['dc:created'] = git_describe(self.clone) + res.update({f'dc:{k}': self.dc[k] for k in sorted(self.dc)}) + return res -def url_without_fragment(url: typing.Union[str, urllib.parse.ParseResult]) -> str: - if isinstance(url, str): - url = urllib.parse.urlparse(url) - return urllib.parse.urlunparse(list(url[:5]) + ['']) +def iter_uritemplates( + table: Table) -> Generator[tuple[Union[Table, Schema, Column], str, URITemplate]]: + """ + Generator of URITemplates specified in a table. -def iter_uritemplates(table): + Since URITemplates use column names as template variables, it is important to keep these in + sync with the table schema, e.g. in case of renaming columns. + """ props = ['aboutUrl', 'valueUrl'] for obj in [table, table.tableSchema] + table.tableSchema.columns: for prop in props: @@ -83,52 +69,9 @@ def iter_uritemplates(table): yield obj, prop, tmpl -def sanitize_url(url: str) -> str: - """ - Removes auth credentials from a URL. - """ - def fix(u): - host = u.hostname - if u.port: - host += ':{}'.format(u.port) - return (u.scheme, host, u.path, u.query, u.fragment) - - return update_url(url, fix) - - -def update_url(url: str, updater: typing.Callable[[urllib.parse.ParseResult], tuple]) -> str: - return urllib.parse.urlunsplit(updater(urllib.parse.urlparse(url))) or None - - -def pkg_path(*comps): - return pathlib.Path(pycldf.__file__).resolve().parent.joinpath(*comps) - - -def multislice(sliceable, *slices): - res = type(sliceable)() - for sl in slices: - if isinstance(sl, str): - if ':' in sl: - sl = [int(s) - (1 if i == 0 else 0) for i, s in enumerate(sl.split(':'))] - else: - sl = [int(sl) - 1, int(sl)] - res += sliceable[slice(*sl)] - return res - - -def resolve_slices(row, ds, slice_spec, target_spec, fk, target_row=None): - # 1. Determine the slice column: - slices = ds[slice_spec] - - # 2. Determine the to-be-sliced column: - morphemes = ds[target_spec] - - # 3. Retrieve the matching row in the target table: - target_row = target_row or ds.get_row(target_spec[0], row[fk]) - - # 4. 
Slice the segments
-    return list(itertools.chain(*[
-        s.split() for s in multislice(target_row[morphemes.name], *row[slices.name])]))
+def pkg_path(*comps: str) -> pathlib.Path:
+    """Returns a path within the pycldf package."""
+    return pathlib.Path(__file__).resolve().parent.joinpath(*comps)
 
 
 class DictTuple(tuple):
@@ -142,7 +85,7 @@ class DictTuple(tuple):
     def __new__(cls, items, **kw):
         return super(DictTuple, cls).__new__(cls, tuple(items))
 
-    def __init__(self, items, key=lambda i: i.id, multi=False):
+    def __init__(self, _, key=lambda i: i.id, multi=False):
         """
         If `key` does not return unique values for all items, you may pass `multi=True` to
         retrieve `list`s of matching items for `l[key]`.
@@ -157,10 +100,13 @@ def __getitem__(self, item):
             if self._multi:
                 return [self[i] for i in self._d[item]]
             return self[self._d[item][0]]
-        return super(DictTuple, self).__getitem__(item)
+        return super().__getitem__(item)
 
 
-def qname2url(qname):
+def qname2url(qname: str) -> Optional[str]:
+    """
+    Turns a qname of the form `prefix:localname` into a full HTTP URL if the prefix is known.
+    """
     for prefix, uri in {
         'csvw': 'http://www.w3.org/ns/csvw#',
         'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
@@ -172,153 +118,51 @@ def qname2url(qname):
     }.items():
         if qname.startswith(prefix + ':'):
             return qname.replace(prefix + ':', uri)
+    return None
 
 
-def metadata2markdown(ds: 'pycldf.Dataset',
-                      path: typing.Union[str, pathlib.Path],
-                      rel_path: typing.Optional[str] = './') -> str:
+def copy_dataset(ds: 'Dataset', dest: PathType, mdname: str = None) -> pathlib.Path:
     """
-    Render the metadata of a dataset as markdown.
-
-    :param ds: `pycldf.Dataset` instance
-    :param path: `pathlib.Path` of the metadata file
-    :param rel_path: `str` to use a relative path when creating links to data files
-    :return: `str` with markdown formatted text
+    Copy metadata, data and sources to files in `dest`.
     """
-    path = pathlib.Path(path)
-
-    def qname2link(qname, html=False):
-        url = qname2url(qname)
-        if url:
-            if html:
-                return '<a href="{}">{}</a>'.format(url, qname)
-            return '[{}]({})'.format(qname, url)
-        return qname
+    from pycldf.media import MediaTable  # pylint: disable=import-outside-toplevel
 
-    def htmlify(obj, key=None):
-        """
-        For inclusion in tables we must use HTML for lists.
-        """
-        if isinstance(obj, list):
-            return '<ol>{}</ol>'.format(
-                ''.join('<li>{}</li>'.format(htmlify(item, key=key)) for item in obj))
-        if isinstance(obj, dict):
-            if key == 'prov:wasGeneratedBy' \
-                    and set(obj.keys()).issubset({'dc:title', 'dc:description', 'dc:relation'}):
-                desc = obj.get('dc:description') or ''
-                if obj.get('dc:relation'):
-                    desc = (desc + '<br>') if desc else desc
-                    desc += '<a href="{0}{1}">{1}</a>'.format(rel_path, obj['dc:relation'])
-                return '{}: {}'.format(obj.get('dc:title') or '', desc)
-
-            if obj.get('rdf:type') == 'prov:Entity' and 'rdf:about' in obj:
-                label = obj.get('dc:title')
-                if (not label) or label == 'Repository':
-                    label = obj['rdf:about']
-                url = obj['rdf:about']
-                if ('github.com' in url) and ('/tree/' not in url) and ('dc:created' in obj):
-                    tag = obj['dc:created']
-                    if '-g' in tag:
-                        tag = tag.split('-g')[-1]
-                    url = '{}/tree/{}'.format(url, tag)
-                if label == obj['rdf:about']:
-                    label = label.split('github.com/')[-1]
-                return '<a href="{}">{}</a> {}'.format(url, label, obj.get('dc:created') or '')
-            items = []
-            for k, v in obj.items():
-                items.append('<dt>{}</dt><dd>{}</dd>'.format(
-                    qname2link(k, html=True), html.escape(str(v))))
-            return '<dl>{}</dl>'.format(''.join(items))
-        return str(obj)
-
-    def properties(obj):
-        res = []
-        if obj.common_props.get('dc:description'):
-            res.append(obj.common_props['dc:description'] + '\n')
-        res.append('property | value\n --- | ---')
-        for k, v in obj.common_props.items():
-            if not v:
-                continue
-            if k not in ('dc:description', 'dc:title', 'dc:source'):
-                if k == 'dc:conformsTo':
-                    v = '[CLDF {}]({})'.format(v.split('#')[1], v)
-                res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k)))
-        res.append('')
-        return '\n'.join(res)
-
-    def colrow(col, fks, pk):
-        dt = '`{}`'.format(col.datatype.base if col.datatype else 'string')
-        if col.datatype:
-            if col.datatype.format:
-                if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format):
-                    dt += '<br>Valid choices:<br>'
-                    dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|'))
-                elif col.datatype.base == 'string':
-                    dt += '<br>Regex: `{}`'.format(col.datatype.format)
-            if col.datatype.minimum:
-                dt += '<br>≥ {}'.format(col.datatype.minimum)
-            if col.datatype.maximum:
-                dt += '<br>≤ {}'.format(col.datatype.maximum)
-        if col.separator:
-            dt = 'list of {} (separated by `{}`)'.format(dt, col.separator)
-        desc = col.common_props.get('dc:description', '').replace('\n', ' ')
-
-        if col.name in pk:
-            desc = (desc + '<br>') if desc else desc
-            desc += 'Primary key'
-
-        if col.name in fks:
-            desc = (desc + '<br>') if desc else desc
-            desc += 'References [{}::{}](#table-{})'.format(
-                fks[col.name][1], fks[col.name][0], slug(fks[col.name][1]))
-        elif col.propertyUrl \
-                and col.propertyUrl.uri == "http://cldf.clld.org/v1.0/terms.rdf#source" \
-                and 'dc:source' in ds.properties:
-            desc = (desc + '<br>') if desc else desc
-            desc += 'References [{}::BibTeX-key]({}{})'.format(
-                ds.properties['dc:source'], rel_path, ds.properties['dc:source'])
-
-        return ' | '.join([
-            '[{}]({})'.format(col.name, col.propertyUrl)
-            if col.propertyUrl else '`{}`'.format(col.name),
-            dt,
-            desc,
-        ])
+    dest = pathlib.Path(dest)
+    if not dest.exists():
+        dest.mkdir(parents=True)
 
-    title = ds.properties.get('dc:title', ds.module)
+    from_url = is_url(ds.tablegroup.base)
+    ds = ds.__class__.from_metadata(
+        ds.tablegroup.base if from_url else ds.tablegroup._fname)  # pylint: disable=W0212
 
-    res = ['# {}\n'.format(title)]
-    if path.suffix == '.json':
-        res.append('**CLDF Metadata**: [{0}]({1}{0})\n'.format(path.name, rel_path))
-    if 'dc:source' in ds.properties:
-        src = None
-        if pathlib.Path(ds.directory).joinpath(ds.properties['dc:source']).exists():
-            src = ds.properties['dc:source']
-        elif pathlib.Path(ds.directory).joinpath(ds.properties['dc:source'] + '.zip').exists():
-            src = ds.properties['dc:source'] + '.zip'
-        if src:
-            res.append('**Sources**: [{0}]({1}{0})\n'.format(src, rel_path))
-    res.append(properties(ds.tablegroup))
+    _getter = urllib.request.urlretrieve if from_url else shutil.copy
+    try:
+        _getter(ds.bibpath, dest / ds.bibname)
+        ds.properties['dc:source'] = ds.bibname
+    except:  # pragma: no cover # noqa pylint: disable=W0702
+        # Sources are optional
+        pass
 
     for table in ds.tables:
-        fks = {
-            fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string)
-            for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1}
-        src = None
-        if pathlib.Path(ds.directory).joinpath(table.url.string).exists():
-            src = table.url.string
-        elif pathlib.Path(ds.directory).joinpath(table.url.string + '.zip').exists():
-            src = table.url.string + '.zip'
-        if src:
-            res.append('\n## <a name="table-{0}"></a>Table [{1}]({2}{3})\n'.format(
-                slug(table.url.string), table.url, rel_path, src))
-        else:
-            res.append('\n## <a name="table-{0}"></a>Table {1}\n'.format(
-                slug(table.url.string), table.url))
-        res.append(properties(table))
-        res.append('\n### Columns\n')
-        res.append('Name/Property | Datatype | Description')
-        res.append(' --- | --- | --- ')
-        for col in table.tableSchema.columns:
-            res.append(colrow(col, fks, table.tableSchema.primaryKey))
-    return '\n'.join(res)
+        fname = table.url.resolve(table.base)
+        name = pathlib.Path(urllib.parse.urlparse(fname).path).name if from_url else fname.name
+        _getter(fname, dest / name)
+        table.url = Link(name)
+
+        for fk in table.tableSchema.foreignKeys:
+            fk.reference.resource = Link(pathlib.Path(fk.reference.resource.string).name)
+    mdpath = dest.joinpath(
+        mdname or  # noqa: W504
+        (ds.tablegroup.base.split('/')[-1] if from_url
+         else ds.tablegroup._fname.name))  # pylint: disable=W0212
+    if 'MediaTable' in ds:
+        for f in MediaTable(ds):
+            if f.scheme == 'file':
+                if f.local_path().exists():
+                    target = dest / urllib.parse.unquote(f.relpath)
+                    target.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.copy(f.local_path(), target)
+    if from_url:
+        del ds.tablegroup.at_props['base']  # pragma: no cover
+    ds.write_metadata(fname=mdpath)
+    return mdpath
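The new `copy_dataset` helper copies a dataset's metadata, data files and sources into a target directory (the removed `metadata2markdown` now lives in `pycldf.markdown`, as the test changes further down import it from there). A minimal usage sketch under stated assumptions: the dataset path is hypothetical, and only `Dataset.from_metadata` plus the `copy_dataset` signature added above are relied on:

```python
from pycldf import Dataset
from pycldf.util import copy_dataset

# Hypothetical path to an existing CLDF dataset's metadata file.
ds = Dataset.from_metadata('datasets/wordlist/cldf-metadata.json')

# Copy metadata, data files and sources into a fresh directory;
# table URLs are rewritten to plain file names in the copied metadata.
mdpath = copy_dataset(ds, 'copies/wordlist')
print(mdpath)  # pathlib.Path of the copied metadata file
```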
+""" import re +import pathlib import warnings import functools +from typing import Optional, Callable, TYPE_CHECKING +import logging +import dataclasses +from clldutils.misc import log_or_raise +from csvw.metadata import TableGroup, is_url -def valid_references(dataset, table, column, row): +from pycldf.terms import Terms +from pycldf.util import iter_uritemplates, pkg_path, MD_SUFFIX + +if TYPE_CHECKING: # pragma: no cover + from pycldf import Dataset, Table, RowType, Column + +__all__ = ['RowValidatorType', 'validate'] + +RowValidatorType = Callable[['Dataset', 'Table', 'Column', 'RowType'], None] + + +def validate( + dataset: 'Dataset', + terms: Terms, + log: Optional[logging.Logger], + row_validators: list[tuple[Optional[str], str, RowValidatorType]], +) -> bool: + """Wraps Validator instantiation and calling into one.""" + return DatasetValidator( + dataset=dataset, + success=True, + terms=terms, + log=log, + row_validators=row_validators, + )() + + +@dataclasses.dataclass +class DatasetValidator: + """Some state to simplify running individual validation steps.""" + dataset: 'Dataset' + success: bool = True + terms: Terms = None + log: Optional[logging.Logger] = None + row_validators: list[tuple[Optional[str], str, RowValidatorType]] \ + = dataclasses.field(default_factory=list) + + def __post_init__(self): + self.row_validators.extend(VALIDATORS) + + def fail(self, reason): # pylint: disable=C0116 + self.success = False + log_or_raise(reason, log=self.log) + + def warn(self, msg, *args): # pylint: disable=C0116 + if self.log: + self.log.warning(msg, *args) + + def info(self, msg, *args): # pylint: disable=C0116 + if self.log: + self.log.info(msg, *args) + + def __call__(self) -> bool: + """Run the full validation.""" + default_tg = TableGroup.from_file( + pkg_path('modules', f'{self.dataset.module}{MD_SUFFIX}')) + # Make sure, all required tables and columns are present and consistent. + for default_table in default_tg.tables: + self._validate_default_objects(default_table) + + for table in self.dataset.tables: + self._validate_table_schema(table) + self._validate_columns(table) + + fname = pathlib.Path(table.url.resolve(table._parent.base)) # pylint: disable=W0212 + fexists = fname.exists() + if (not fexists) and fname.parent.joinpath(f'{fname.name}.zip').exists(): + self.info(f'Reading data from zipped table: {fname}.zip') + fexists = True # csvw already handles this case, no need to adapt paths. + if is_url(table.url.resolve(table._parent.base)) or fexists: # pylint: disable=W0212 + self._validate_rows(table) + if not table.check_primary_key(log=self.log): + self.fail('Primary key check failed.') + else: + self.fail(f'{fname} does not exist') + + if not self.dataset.tablegroup.check_referential_integrity(log=self.log): + self.fail('Referential integrity check failed') + + self._validate_components() + return self.success + + def _validate_components(self): + from pycldf.media import MediaTable # pylint: disable=import-outside-toplevel + from pycldf.trees import TreeTable # pylint: disable=import-outside-toplevel + + for cls in [MediaTable, TreeTable]: + if cls.__name__ in self.dataset: + cls(self.dataset).validate(self) + + def _validate_rows(self, table): + # FIXME: see if table.common_props['dc:conformsTo'] is in validators! 
pylint: disable=W0511 + validators = [] + for col in table.tableSchema.columns: + for table_, col_, v_ in self.row_validators: + if ((not table_ or table is self.dataset.get(table_)) + and col is self.dataset.get((table, col_))): # noqa: W503 + validators.append((col, v_)) + + for fname, lineno, row in table.iterdicts(log=self.log, with_metadata=True): + for col, validate_ in validators: + try: + validate_(self.dataset, table, col, row) + except ValueError as e: + self.fail(f'{fname.name}:{lineno}:{col.name} {e}') + + def _validate_columns(self, table): + property_urls, colnames = set(), set() + for col in table.tableSchema.columns: + if col.header in colnames: # pragma: no cover + self.fail(f'Duplicate column name in table schema: {table.url} {col.header}') + colnames.add(col.header) + if col.propertyUrl: + col_uri = col.propertyUrl.uri + try: + self.terms.is_cldf_uri(col_uri) + if col_uri in property_urls: # pragma: no cover + self.fail( + f'Duplicate CLDF property in table schema: {table.url} {col_uri}') + property_urls.add(col_uri) + except ValueError: + self.fail(f'invalid CLDF URI: {col_uri}') + + def _validate_table_schema(self, table): + tmpl_vars = set(col.name for col in table.tableSchema.columns) + for obj, prop, tmpl in iter_uritemplates(table): + if not {n for n in tmpl.variable_names if not n.startswith('_')}.issubset(tmpl_vars): + self.warn(f'Unknown variables in URI template: {obj}:{prop}:{tmpl}') + + type_uri = table.common_props.get('dc:conformsTo') + if type_uri: + try: + self.terms.is_cldf_uri(type_uri) + except ValueError: + self.fail(f'invalid CLDF URI: {type_uri}') + + if not table.tableSchema.primaryKey: + self.warn( + 'Table without primary key: %s - %s', + table.url, + 'This may cause problems with "cldf createdb"') + elif len(table.tableSchema.primaryKey) > 1: + self.warn( + 'Table with composite primary key: %s - %s', + table.url, + 'This may cause problems with "cldf createdb"') + + def _validate_default_objects(self, default_table): + dtable_uri = default_table.common_props['dc:conformsTo'] + try: + table = self.dataset[dtable_uri] + except KeyError: + self.fail(f'{self.dataset.module} requires {dtable_uri}') + return + + default_cols = {c.propertyUrl.uri: c for c in default_table.tableSchema.columns} + required_default_cols = { + c.propertyUrl.uri for c in default_table.tableSchema.columns + if c.required or c.common_props.get('dc:isRequiredBy')} + cols = { + c.propertyUrl.uri: c for c in table.tableSchema.columns + if c.propertyUrl} + table_uri = table.common_props['dc:conformsTo'] + for col in required_default_cols - set(cols.keys()): + self.fail(f'{table_uri} requires column {col}') + for uri, col in cols.items(): + default = default_cols.get(uri) + if default: + cardinality = default.common_props.get('dc:extent') + if not cardinality: + cardinality = self.terms.by_uri[uri].cardinality + if (cardinality == 'multivalued' and not col.separator) or \ + (cardinality == 'singlevalued' and col.separator): + self.fail(f'{table_uri} {uri} must be {cardinality}') + + +# +# Row validators: +# +def valid_references(dataset, _, column, row): # pylint: disable=C0103,C0116 if dataset.sources: dataset.sources.validate(row[column.name]) -def valid_regex(pattern, name, dataset, table, column, row): +def valid_regex(pattern, name, dataset, table, column, row): # pylint: disable=R0917,R0913 + """Generic regex validator. 
Turn into regular validator via functools.partial.""" + assert dataset and table value = row[column.name] if value is not None: if not isinstance(value, list): @@ -16,10 +207,10 @@ def valid_regex(pattern, name, dataset, table, column, row): value = [value] for val in value: if not pattern.match(val): - raise ValueError('invalid {0}: {1} (in {2})'.format(name, val, value)) + raise ValueError(f'invalid {name}: {val} (in {value})') -def valid_igt(dataset, table, column, row): +def valid_igt(_, table, column, row): # pylint: disable=C0103,C0116 word_glosses, words = row[column.name], None col = table.get_column('http://cldf.clld.org/v1.0/terms.rdf#analyzedWord') if col: @@ -29,7 +220,7 @@ def valid_igt(dataset, table, column, row): raise ValueError('number of words and word glosses does not match') -def valid_grammaticalityJudgement(dataset, table, column, row): +def valid_grammaticalityJudgement(dataset, _, column, row): # pylint: disable=C0103,C0116 lid_name = dataset.readonly_column_names.ExampleTable.languageReference[0] gc_name = dataset.readonly_column_names.LanguageTable.glottocode[0] if row[column.name] is not None: @@ -38,13 +229,15 @@ def valid_grammaticalityJudgement(dataset, table, column, row): raise ValueError('Glottolog language linked from ungrammatical example') -def valid_mediaType(dataset, table, column, row): - main, _, sub = row[column.name].partition('/') +def valid_mediaType(dataset, table, column, row): # pylint: disable=C0103,C0116 + """Check validity of media types.""" + assert dataset and table + main, _, _ = row[column.name].partition('/') if not re.fullmatch('[a-z]+', main): - warnings.warn('Invalid main part in media type: {}'.format(main)) + warnings.warn(f'Invalid main part in media type: {main}') -VALIDATORS = [ +VALIDATORS: list[tuple[None, str, RowValidatorType]] = [ ( None, 'http://cldf.clld.org/v1.0/terms.rdf#iso639P3code', diff --git a/tests/conftest.py b/tests/conftest.py index 11558d1..c70316a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,18 +23,16 @@ def csvw3(): @pytest.fixture def urlopen(mocker, data, csvw3): - import requests_mock + from csvw.utils import GetResponse - def _urlopen(url): + def _urlopen(url, **_): return io.BytesIO(data.joinpath(urllib.parse.urlparse(url).path[1:]).read_bytes()) + def csvw_request_get(url, **_): + return GetResponse(content=data.joinpath(urllib.parse.urlparse(url).path[1:]).read_bytes()) + + mocker.patch('csvw.utils.request_get', csvw_request_get) mocker.patch('pycldf.sources.urlopen', _urlopen) - if not csvw3: # pragma: no cover - mocker.patch('csvw.metadata.urlopen', _urlopen) - else: - mock = requests_mock.Mocker() - mock.__enter__() - mock.get(requests_mock.ANY, content=lambda req, _: _urlopen(req.url).read()) @pytest.fixture(scope='module') diff --git a/tests/test_cli.py b/tests/test_cli.py index a9782d9..0fe38fb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,7 @@ import pytest from pycldf.__main__ import main +from pycldf.dataset import SchemaError def test_help(capsys): @@ -53,7 +54,7 @@ def test_stats(tmp_path): main(['stats', str(tmp_path / 'new')]) -def test_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): +def est_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): res = main( [ 'check', @@ -65,9 +66,8 @@ def test_check(data, glottolog_repos, concepticon_repos, caplog, tmp_path): '--glottolog', str(glottolog_repos)], log=logging.getLogger(__name__)) - if sys.version_info >= (3, 6): - assert res == 2 - assert len(caplog.records) == 7 + 
assert res == 2 + assert len(caplog.records) == 7 assert main( ['check', str(data / 'ds1.csv-metadata.json')], @@ -95,6 +95,9 @@ def test_downloadmedia(tmp_path, data): assert files[0].read(tmp_path) == 'Hello, World!' assert files[1].read(tmp_path) == 'äöü' + with pytest.raises(SchemaError): + main(['downloadmedia', '--use-form-id', str(md), str(tmp_path)]) + def test_validate(tmp_path, caplog): tmp_path.joinpath('md.json').write_text("""{ diff --git a/tests/test_dataset.py b/tests/test_dataset.py index aeafa29..3b5d7fe 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -12,7 +12,8 @@ from pycldf.terms import term_uri, TERMS from pycldf.dataset import ( Generic, Wordlist, StructureDataset, Dictionary, ParallelText, Dataset, TextCorpus, - GitRepository, make_column, get_modules, iter_datasets, SchemaError) + GitRepository, make_column, iter_datasets, SchemaError) +from pycldf.module import get_modules from pycldf.sources import Sources from pycldf.media import MediaTable @@ -842,14 +843,14 @@ def test_validators(tmp_path, data, caplog): log = logging.getLogger(__name__) ds.validate(log=log) - assert len(caplog.records) == 2 + assert len(caplog.records) == 3 for col in ds.tablegroup.tables[0].tableSchema.columns: if col.name == 'Language_ID': col.propertyUrl.uri = 'http://cldf.clld.org/v1.0/terms.rdf#glottocode' ds.validate(log=log) - assert len(caplog.records) == 6 + assert len(caplog.records) == 8 def test_get_modules(): diff --git a/tests/test_orm.py b/tests/test_orm.py index 0b0b6d4..cb35a45 100644 --- a/tests/test_orm.py +++ b/tests/test_orm.py @@ -114,7 +114,7 @@ def test_dictionary(dictionary): assert len(dictionary.get_object('EntryTable', '2').senses) == 2 -def test_catalogs(wordlist_with_cognates, glottolog_repos, concepticon_repos): +def est_catalogs(wordlist_with_cognates, glottolog_repos, concepticon_repos): from pyglottolog import Glottolog from pyconcepticon import Concepticon diff --git a/tests/test_trees.py b/tests/test_trees.py index bc217c7..2435764 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -2,6 +2,7 @@ from pycldf import Generic from pycldf.trees import * +from pycldf.validators import DatasetValidator def test_Trees(dataset_with_trees): @@ -10,7 +11,7 @@ def test_Trees(dataset_with_trees): assert len(t) == 2 assert set(n.name for n in t[0].newick().walk() if n.is_leaf) == {'l1', 'l2', 'l3', 'l4'} assert set(n.name for n in t[1].newick().walk() if n.is_leaf) == {'l1', 'l2', 'l4'} - assert trees.validate() + assert trees.validate(DatasetValidator(dataset_with_trees)) is None def test_Trees_from_dataurl(dataset_with_trees2): @@ -40,7 +41,7 @@ def test_Trees_validate(tmp_path, caplog): tmp_path.joinpath('test.nwk').write_text('(l1,l2);', encoding='utf8') tmp_path.joinpath('test.nex').write_text( '#NEXUS\n\nbegin trees;\ntree x = [&U](l1,l2);\nend;', encoding='utf8') - TreeTable(ds).validate(log=logging.getLogger('test')) + TreeTable(ds).validate(DatasetValidator(ds, log=logging.getLogger('test'))) assert len(caplog.records) == 3 assert caplog.records[0].message.startswith('No newick') assert caplog.records[1].message.startswith('Newick node label') diff --git a/tests/test_util.py b/tests/test_util.py index 965350d..acda51b 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,9 +1,17 @@ import pytest from pycldf.util import * +from pycldf.fileutil import * +from pycldf.urlutil import * +from pycldf.sliceutil import * +from pycldf.markdown import metadata2markdown @pytest.mark.parametrize("sliceable,slices,expected", [ + 
('abcdefg', [slice(1, 3)], 'bc'), + ('abcdefg', ['2', '4'], 'bd'), + ('abcdefg', [2, 4], 'ce'), + ('abcdefg', ['2:8:2'], 'bdf'), ('abcdefg', ['2:5', (1, 4)], 'bcdebcd'), ([1, 2, 3, 4], ['1:6:2'], [1, 3]), ((1, 2, 3, 4), ['1:6:2'], (1, 3)) @@ -12,6 +20,17 @@ def test_multislice(sliceable, slices, expected): assert multislice(sliceable, *slices) == expected +@pytest.mark.parametrize( + 'qname,expected', + [ + ('rdf:ID', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#ID'), + ('xyz:thing', None), + ] +) +def test_qname2url(qname, expected): + assert qname2url(qname) == expected + + def test_DictTuple(): t = DictTuple([1, 2, 3], key=lambda i: str(i + 1)) assert t['4'] == t[2] == 3
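The rewritten `test_util.py` pulls URL, file, slice and markdown helpers from the new split-out modules (`pycldf.urlutil`, `pycldf.fileutil`, `pycldf.sliceutil`, `pycldf.markdown`). A small sketch of the URL helpers defined earlier in this diff; the example URL is invented, and only `sanitize_url` and `url_without_fragment` as shown above are assumed:

```python
from pycldf.urlutil import sanitize_url, url_without_fragment

# Credentials are dropped from the netloc; scheme, path and query are kept.
assert sanitize_url('https://user:secret@example.org/data.csv?v=1') == \
    'https://example.org/data.csv?v=1'

# The fragment is stripped.
assert url_without_fragment('https://example.org/data.csv#row-10') == \
    'https://example.org/data.csv'
```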