-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpyproject.toml
More file actions
119 lines (107 loc) · 3.11 KB
/
pyproject.toml
File metadata and controls
119 lines (107 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# PEP 517/518 build configuration: hatchling backend, no build-time plugins.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# Core package metadata (PEP 621).
[project]
name = "evalkit"
version = "0.1.0"
description = "Production-grade LLM evaluation framework with judge ensembles, synthetic data generation, and regression tracking."
readme = "README.md"
# Source-available, not OSI-approved — matches the "Other/Proprietary" classifier below.
license = { text = "Source Available - All Rights Reserved" }
# Floor for all tooling below (ruff target-version, mypy python_version).
requires-python = ">=3.11"
authors = [{ name = "cortexark" }]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"License :: Other/Proprietary License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Typing :: Typed",
]
# Runtime dependencies only; provider SDKs and the dashboard are optional extras.
dependencies = [
"pydantic>=2.0,<3.0",
"duckdb>=1.0,<2.0",
"pyyaml>=6.0",
"httpx>=0.27",
"tenacity>=8.0",
"structlog>=24.0",
"numpy>=1.26",
]
# Optional extras. Provider SDKs are split so users install only what they use.
[project.optional-dependencies]
openai = ["openai>=1.0"]
anthropic = ["anthropic>=0.30"]
dashboard = ["streamlit>=1.35", "plotly>=5.0", "pandas>=2.0"]
# Self-referential extra: pulls in every user-facing extra above (pip resolves
# "evalkit[...]" recursively). Intentionally excludes dev tooling.
all = ["evalkit[openai,anthropic,dashboard]"]
dev = [
"pytest>=8.0",
"pytest-asyncio>=0.24",
"pytest-cov>=5.0",
"ruff>=0.5",
"mypy>=1.10",
"types-PyYAML>=6.0",
"pre-commit>=3.0",
]
[project.urls]
Homepage = "https://github.com/cortexark/evalkit"
Documentation = "https://github.com/cortexark/evalkit#readme"
Repository = "https://github.com/cortexark/evalkit"
Issues = "https://github.com/cortexark/evalkit/issues"

# Console entry point.
# NOTE(review): this targets the dashboard module, but streamlit/plotly/pandas
# live in the optional "dashboard" extra — a bare `pip install evalkit` will
# install a broken `evalkit` command unless app:main guards its imports.
# Confirm the entry point degrades gracefully or move it behind the extra.
[project.scripts]
evalkit = "evalkit.dashboard.app:main"
# src layout: hatchling packages src/evalkit as the top-level `evalkit` package.
[tool.hatch.build.targets.wheel]
packages = ["src/evalkit"]
[tool.pytest.ini_options]
testpaths = ["tests"]
# pytest-asyncio: collect `async def` tests without per-test @pytest.mark.asyncio.
asyncio_mode = "auto"
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"integration: marks tests requiring external services",
]
[tool.ruff]
# Matches requires-python floor (>=3.11).
target-version = "py311"
line-length = 99
src = ["src", "tests"]

# NOTE(review): newer ruff releases renamed the TCH* codes to TC*; with
# ruff>=0.5 the old codes still resolve, but expect deprecation warnings
# once the pin moves forward — confirm against the installed ruff version.
[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"N", # pep8-naming
"UP", # pyupgrade
"B", # flake8-bugbear
"SIM", # flake8-simplify
"TCH", # flake8-type-checking
"RUF", # ruff-specific
]
ignore = [
"UP042", # str+Enum to StrEnum — NOTE(review): stated "Python 3.10 compat" rationale is stale; requires-python is >=3.11, so StrEnum is available — confirm whether this ignore is still wanted
"TCH001", # move imports to TYPE_CHECKING — impacts runtime imports
"SIM108", # ternary operator — sometimes explicit if/else is clearer
"B905", # zip strict — not needed for trusted same-length sequences
"B017", # blind Exception — used for duckdb which raises generic errors
]
[tool.ruff.lint.per-file-ignores]
"src/evalkit/generators/templates.py" = ["E501"] # template strings have long lines
[tool.mypy]
python_version = "3.11"
strict = true
# The two flags below are already implied by `strict = true` (per mypy's
# --strict flag group); kept explicit so they survive if strict is relaxed.
warn_return_any = true
warn_unused_configs = true
[tool.coverage.run]
source = ["src/evalkit"]
branch = true
# Dashboard is UI glue exercised manually, not via the unit suite.
omit = [
"src/evalkit/dashboard/*",
]
[tool.coverage.report]
show_missing = true
# CI gate: overall coverage below 80% fails the run.
fail_under = 80
exclude_lines = [
"pragma: no cover",
"if TYPE_CHECKING:",
"if __name__ == .__main__.",  # entries are regexes; the dots match the quote chars
"@overload",
]