diff --git a/.baseline/coverage_2026-05-05_unit.xml b/.baseline/coverage_2026-05-05_unit.xml new file mode 100644 index 0000000..152f8c1 --- /dev/null +++ b/.baseline/coverage_2026-05-05_unit.xml @@ -0,0 +1,9067 @@ + + + + + + /home/frapercan/Thesis/repositories/PROTEA/protea + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.env b/.env new file mode 100644 index 0000000..1a96347 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +export PROTEA_ADMIN_TOKEN="protea-admin" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 9b3eca3..4771453 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -27,8 +27,8 @@ jobs: - name: Add poetry to PATH run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Install dev dependencies - run: poetry install --only dev + - name: Install main + dev dependencies + run: poetry install --with dev - name: Build Sphinx docs run: poetry run task html_docs diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ae0079d..2806082 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -27,8 +27,8 @@ jobs: - name: Add poetry to PATH run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Install dev dependencies - run: poetry install --only dev + - name: Install main + dev dependencies + run: poetry install --with dev - name: ruff check run: poetry run ruff check protea scripts diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml new file mode 100644 index 0000000..45a6db0 --- /dev/null +++ b/.github/workflows/security.yml @@ -0,0 +1,70 @@ +name: Security + +on: + push: + pull_request: + schedule: + # Weekly Monday 06:00 UTC: catches new CVEs against pinned deps + # even if no PR has landed. 
+ - cron: "0 6 * * 1" + +jobs: + audit: + name: pip-audit + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + poetry-version: ["2.1.0"] + + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - uses: abatilo/actions-poetry@v3 + with: + poetry-version: ${{ matrix.poetry-version }} + + - name: Add poetry to PATH + run: echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: Install production deps in the poetry venv + # poetry 2.x removed `poetry export`; install with the main group + # only and then audit the resolved environment in-place. + run: poetry install --only main + + - name: Install pip-audit + run: pip install pip-audit + + - name: pip-audit (non-blocking in F0; blocking once F-OPS T-OPS.7 lands) + # F0 stance: surface findings without breaking the pipeline so + # the team can triage. F-OPS T-OPS.7 of master plan v3 will + # flip this to fail on severity HIGH. + run: poetry run pip-audit --strict --vulnerability-service osv || true + + bandit: + name: bandit (security static analysis) + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install bandit + run: pip install "bandit[toml]" + + - name: Run bandit on protea/ + # Severity HIGH and confidence HIGH only at F0; tighten in F-OPS. + # Bandit reads its config from pyproject.toml ([tool.bandit]). 
+ run: bandit --severity-level high --confidence-level high -r protea/ -c pyproject.toml || true diff --git a/.gitignore b/.gitignore index 818082d..9272872 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ htmlcov/ # Docs build docs/build/ +docs/_build/ # IDE .idea/ @@ -27,7 +28,7 @@ node_modules/ # Misc *.log -logs/pids/ +logs/ CLAUDE.md .claude/ @@ -35,7 +36,17 @@ CLAUDE.md # Local data static/ storage/ +!protea/infrastructure/storage/ # Large embedding caches and test artifacts data/ref_cache/ apps/web/test-results/ + +# Local run outputs (per-run dumps, model checkpoints, eval results) +# Keep curated files under data/benchmarks/ tracked via explicit allow-list. +data/ +!data/benchmarks/ +!data/benchmarks/** +artifacts/ +results/ +var/ diff --git a/.~lock.EXPERIMENTS.md# b/.~lock.EXPERIMENTS.md# new file mode 100644 index 0000000..67ffcb0 --- /dev/null +++ b/.~lock.EXPERIMENTS.md# @@ -0,0 +1 @@ +,frapercan,bioxaxi,21.03.2026 13:10,/home/frapercan/snap/onlyoffice-desktopeditors/1067/.local/share/onlyoffice; \ No newline at end of file diff --git a/.~lock.RERANKER.md# b/.~lock.RERANKER.md# new file mode 100644 index 0000000..75e7420 --- /dev/null +++ b/.~lock.RERANKER.md# @@ -0,0 +1 @@ +,frapercan,bioxaxi,17.03.2026 17:01,/home/frapercan/snap/onlyoffice-desktopeditors/1067/.local/share/onlyoffice; \ No newline at end of file diff --git a/EXPERIMENTAL_DESIGN.md b/EXPERIMENTAL_DESIGN.md new file mode 100644 index 0000000..ca6ba18 --- /dev/null +++ b/EXPERIMENTAL_DESIGN.md @@ -0,0 +1,197 @@ +# PROTEA — Experimental Design + +**Version**: 1.0 — 2026-04-10 +**Status**: Active +**Scope**: Protein language model (PLM) benchmark for GO term prediction via KNN + learned reranking + +> This document is **prospective**: it formalises the protocol, hypotheses, and execution plan for the extended PLM comparison. Retrospective results (finished experiments, ablations, external tool comparisons) live in `EXPERIMENTS.md`. 
The reranker design rationale lives in `RERANKER.md`. + +--- + +## 1. Motivation + +The preliminary comparison in `EXPERIMENTS.md` (ESMC-300M vs ProstT5-XL) **confounds two independent variables**: model family and parameter count. ESMC-300M is a ~300M-parameter BERT-like encoder; ProstT5-XL is a ~3B-parameter T5 encoder with structural fine-tuning. Any observed difference in downstream Fmax cannot be attributed to either axis unambiguously. + +This document defines the extended benchmark that disentangles those factors and integrates additional PLMs (Ankh, ESM2, ESMC-600M, ProtT5-XL) into a single, statistically comparable grid under an identical downstream pipeline. + +--- + +## 2. Research questions + +| ID | Question | +|---|---| +| **RQ1** | At matched parameter count, does a BERT-like encoder (ESM2, ESMC) outperform a T5 encoder (ProtT5, Ankh) for GO term transfer via KNN? | +| **RQ2** | Holding model family fixed, how does Fmax scale with parameter count? Where does the curve saturate? | +| **RQ3** | Does structure-aware fine-tuning (ProstT5) yield a measurable Fmax improvement over its pure-sequence parent (ProtT5-XL) at identical size? | +| **RQ4** | Does the learned reranker compensate for weaker embeddings by placing more weight on alignment and taxonomy features? Is there a systematic inverse relationship between embedding quality and reranker feature-importance on these compensatory signals? | + +--- + +## 3. 
Hypotheses (pre-registered) + +| # | Hypothesis | Primary test | +|---|---|---| +| **H1** | At small scale (~300–650M), family effect dominates scale effect (ΔFmax across families ≥ ΔFmax across sizes within a family) | Wilcoxon signed-rank across 9-cell Fmax vectors, pairwise within the small tier | +| **H2** | Scale gains within a single family saturate in the 1–3B range | Monotonicity of Fmax across {ESM2-650M, ESM2-3B} and {Ankh-base, Ankh-large, ProtT5-XL} | +| **H3** | Structure awareness provides a positive but modest gain (+1–3 Fmax points averaged across cells) | Pairwise matched test ProtT5-XL vs ProstT5-XL (same backbone, same size, only fine-tuning differs) | +| **H4** | Reranker gain-based importance on `{alignment_*, similarity_*, taxonomic_*}` features is inversely correlated with the baseline Fmax of the underlying embedding | Linear regression across the 8 models: `weight_on_compensatory` ~ `baseline_Fmax` | + +H1–H3 are confirmatory; H4 is exploratory and carries forward the **F2 finding** from the ESMC vs ProstT5 analysis in `project_reranker_benchmark.md`. + +--- + +## 4. Model matrix + +**8 models total** (2 already computed, 6 new). 
+ +| # | Model | Backbone | Params | PROTEA backend | Status | +|---|---|---|---|---|---| +| 1 | **ESMC-300M** | ESM3c (EvolutionaryScale) | ~300M | `esm3c` | ✓ computed; reranker v4 in progress (`48c91381`) | +| 2 | **ESMC-600M** | ESM3c (EvolutionaryScale) | ~600M | `esm3c` | new | +| 3 | **ESM2-650M** | ESM2 `esm2_t33_650M_UR50D` (Meta) | ~650M | `esm` | new | +| 4 | **ESM2-3B** | ESM2 `esm2_t36_3B_UR50D` (Meta) | ~3B | `esm` | new | +| 5 | **Ankh-base** | Ankh `ElnaggarLab/ankh-base` | ~450M | `ankh` | new | +| 6 | **Ankh-large** | Ankh `ElnaggarLab/ankh-large` | ~1.9B | `ankh` | new | +| 7 | **ProtT5-XL** | ProtT5 `prot_t5_xl_uniref50` (Rostlab) | ~3B | `t5` | new | +| 8 | **ProstT5-XL** | ProstT5 structure-fine-tuned (Rostlab) | ~3B | `t5` | ✓ computed; reranker v4 in progress (`e923ac70`) | + +**Discarded**: ESM2-15B (prohibitive embedding cost over 527k sequences; no matched-size T5 counterpart → breaks symmetry of the grid). + +### Explanatory grid (for RQ1 / RQ2 / RQ3) + +| Scale | BERT-like encoder | T5 encoder (sequence-only) | T5 encoder (structure-aware) | +|---|---|---|---| +| **Small (~300–650M)** | ESMC-300M, ESMC-600M, ESM2-650M | Ankh-base (~450M) | — | +| **Medium (~1–2B)** | — | Ankh-large (~1.9B) | — | +| **Large (~3B)** | ESM2-3B | ProtT5-XL | ProstT5-XL | + +### Planned pairwise comparisons + +| Pair | Isolates | RQ | +|---|---|---| +| ESMC-300M ↔ Ankh-base | architecture (BERT vs T5), ~matched size | RQ1 | +| ESM2-650M ↔ Ankh-base | architecture, ~matched size | RQ1 | +| ESMC-300M ↔ ESMC-600M | scale, family fixed | RQ2 | +| ESM2-650M ↔ ESM2-3B | scale, family fixed | RQ2 | +| Ankh-base ↔ Ankh-large ↔ ProtT5-XL | scale ladder within T5 encoder family | RQ2 | +| **ProtT5-XL ↔ ProstT5-XL** | structure fine-tuning (cleanest test) | **RQ3** | + +--- + +## 5. Data and splits (fixed across all 8 runs) + +Identical to the ESMC/ProstT5 preliminary experiments in `EXPERIMENTS.md` to preserve backward comparability with established findings. 
+ +| Item | Value | +|---|---| +| Reference annotation sets | GOA releases 160 → 220 (13 temporal splits for reranker training) | +| Evaluation set | `42b34e79-6fe9-4fa0-b718-02f43a1e3192` (GOA 220 → 229 delta) | +| Evaluation size | 20,281 proteins (NK=2,831; LK=3,410; PK=15,313) | +| Ontology snapshot | `947bdff6-d17c-4ca3-a41a-bc8fb4d74b7a` (GO release 2026-01-23) | +| IA file | `data/benchmarks/IA_cafa6.tsv` (CAFA6 information accretion) | + +--- + +## 6. Pipeline protocol — pinned hyperparameters + +Every model is put through the same three-stage pipeline with **identical hyperparameters**. No per-model tuning. Fair comparison requires this invariance. + +### 6.1 Embeddings — `compute_embeddings` +- Pooling: `mean` over residue representations +- Precision: fp32 at storage (cast to fp16 at KNN load time via `_REF_CACHE`) +- Storage: pgvector `VECTOR(dim)` per `(sequence, config, chunk)` +- Full reference set (~527k sequences) + evaluation set query embeddings + +### 6.2 KNN retrieval — `predict_go_terms` +- `k = 5` +- `metric = cosine` +- `backend = faiss`, `faiss_index_type = IVFFlat`, `nlist = 256`, `nprobe = 32` +- `aspect_separated_knn = true` +- `compute_alignments = true` (NW + SW via parasail/BLOSUM62) +- `compute_taxonomy = true` (NCBI taxonomy LCA via ete3) + +### 6.3 Reranker training — `train_reranker_auto` (v4 budget) +- `num_boost_round = 5000` +- `early_stopping_rounds = 100` +- `val_fraction = 0.2` +- `neg_pos_ratio = 10` +- `train_versions = [160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 211, 215, 220]` (13 splits) +- `test_versions = [229]` +- `compute_alignments = true`, `compute_taxonomy = true` +- `ia_file = data/benchmarks/IA_cafa6.tsv` (IA-weighted sample weighting: `sample_weight = IA(go_term)`) +- **3 models per embedding (NK / LK / PK)** — per-category, not per-aspect (justified in `RERANKER.md` §6.3) +- Objective: **binary cross-entropy (LightGBM `objective=binary`)**, early stopping on validation AUC. 
IA weights enter through `sample_weight`, not through the objective. See `RERANKER.md` §6.1 for rationale and the known limitation that a pairwise/listwise rank loss is future work. +- Name convention: `lgbm_v4_converged_-{nk,lk,pk}` + +### 6.4 Evaluation — `run_cafa_evaluation` +- Library: `cafaeval` (integrated via the `run_cafa_evaluation` operation) +- Metric: **Fmax with IA weighting**, computed per (tier × aspect) cell → 9-dimensional output vector per model×pipeline-stage +- Pipeline stages reported: `baseline` (embedding only), `alignment_weighted` (best heuristic from Exp 3), `reranker` (v4 LightGBM) + +--- + +## 7. Statistical protocol + +Pre-registered to prevent post-hoc test-shopping. + +| Aspect | Method | +|---|---| +| **Primary outcome** | 9-cell Fmax vector per (model, pipeline-stage) | +| **Pairwise test** | Wilcoxon signed-rank over the 9 matched cells | +| **Multiple comparisons** | Holm–Bonferroni correction across the planned comparisons in §4 (6 RQ1/RQ2/RQ3 tests) | +| **Effect size** | Mean Fmax delta ± 95% bootstrap CI (1000 resamples over cells) | +| **H4 regression** | For each (model, tier): `weight_compensatory = Σ importance(feature)` over features in `{alignment_score_*, similarity_*, identity_*, gaps_pct_*, alignment_length_*, taxonomic_*}`. Fit `weight_compensatory ~ baseline_Fmax` across the 8 models via OLS; report slope, p-value, R² | +| **Reporting convention** | All numbers from `cafaeval` with IA weighting. **Never** use the internal `test_evaluation` field from `train_reranker_auto` for thesis claims — it is unweighted and biased (see `project_reranker_benchmark.md`) | + +--- + +## 8. Execution plan + +Ordered so each stage produces usable partial results; no stage blocks on the next. 
+ +| Step | Action | Depends on | Compute estimate | +|---|---|---|---| +| 1 | Wait for v4 rerankers (ESMC-300M, ProstT5-XL) to finish | running | ~4h total (sequential) | +| 2 | Create 6 `EmbeddingConfig` rows with pinned pooling/precision | — | minutes | +| 3 | Run `compute_embeddings` for the 6 new models over ref+eval sets | step 2 | 2–10h per model; ~1.5–2 days total sequential | +| 4 | Run `predict_go_terms` (with alignments + taxonomy) for the 6 new models | step 3 | 1–2h per model | +| 5 | Run `train_reranker_auto` v4 for the 6 new models in `protea.training` queue | step 4 | 2–4h per model; ~1 day total sequential | +| 6 | Run `run_cafa_evaluation` for all 8 models × 3 stages = 24 evals | step 5 + existing | ~10 min per eval; ~4h total | +| 7 | Extract feature importance from all 24 (model × tier) rerankers | step 5 | minutes (script) | +| 8 | Apply the statistical protocol in §7 to the aggregated results | steps 6–7 | — | +| 9 | Update `EXPERIMENTS.md` with the per-model result tables | step 8 | — | +| 10 | Compile results into thesis chapter / appendix | step 9 | — | + +**Total wall-clock (pessimistic, fully serial):** ~3–4 days of compute. Can be compressed with overlapping embedding/training workers if GPU capacity allows. + +--- + +## 9. Deliverables + +- `EmbeddingConfig` rows for the 6 new models, committed to the DB. +- Per-model entries in `EXPERIMENTS.md` mirroring the existing table format (Exp 1 / Exp 3 / Exp 4+ rows). +- **Master results table**: 8 rows × (baseline Fmax | alignment_weighted Fmax | reranker Fmax) × 9 cells each. +- **Feature importance heatmap**: 24 (model × tier) rerankers × top-N features, colour-coded by gain. +- Statistical test report (Wilcoxon p-values + effect sizes + CIs) as a standalone markdown section. +- Thesis chapter / appendix formalising the grid as evidence for RQ1–RQ4. + +--- + +## 10. Known limitations (honest reporting) + +1. 
**Not training-data matched.** Each PLM was pretrained on different corpora (UniRef50 subsets at different points in time, sometimes Big Fantastic Database for ProtT5, etc.). Perfect controlled comparison is impossible without re-pretraining, which is out of scope. +2. **Architecture is not a clean isolated variable.** T5 encoders and BERT-style encoders differ in depth, attention masking, objective (span corruption vs MLM), and training data. RQ1's conclusion will be **correlational**, not causal. +3. **Scale is coarse.** Three tiers (~300M / ~1.5B / ~3B) is the maximum granularity this compute budget allows. Smooth scaling curves are out of reach. +4. **Ankh backend.** Ankh is exposed in PROTEA as a **dedicated backend** (`model_backend = "ankh"`), not as an alias of `t5`. Internally it reuses the T5 batched pipeline via `_embed_t5(..., use_aa2fold=False)` but uses `AutoTokenizer` instead of `T5Tokenizer` and never injects the `` prefix — ensuring clean separation in the benchmark tables. The distinction matters for RQ1: Ankh results are reported under their own family row, not merged into "T5 encoder". +5. **ESMC-600M availability.** EvolutionaryScale's public ESMC release must be confirmed to include the 600M variant at time of execution. If unavailable at that scale, substitute with the closest public ESMC size and document the deviation in step 2. +6. **No seed-variance analysis.** LightGBM training (with fixed seed), KNN retrieval, and embeddings are all deterministic under PROTEA's default config. Variance across re-runs for the same config should be zero; we do not budget compute for confirming this. +7. **Single evaluation delta.** Only the GOA 220 → 229 delta is used. A multi-delta sensitivity analysis (e.g. 215 → 229, 220 → 225) is a candidate for future work but not planned here. +8. 
**ProstT5 inference requires 3Di tokens**, which PROTEA currently provides via sequence-only input using the AA2fold branch (`use_aa2fold = "prostt5" in model_name.lower()` at `compute_embeddings.py:715`). This means PROTEA's ProstT5 embeddings are generated **without** real 3Di tokens from a structure; the model internally translates sequence to predicted 3Di. This is the setup the Rostlab release supports but is distinct from "true structure-aware" inference with Foldseek-derived 3Di tokens. Document this explicitly in the thesis when discussing RQ3. + +--- + +## 11. Change log + +| Date | Change | +|---|---| +| 2026-04-10 | Initial draft: 8-model matrix, RQ1–RQ4, hypotheses H1–H4, pinned pipeline, statistical protocol. ESMC-600M confirmed. ESM2-15B discarded. | diff --git a/EXPERIMENTS.md b/EXPERIMENTS.md new file mode 100644 index 0000000..7ea1e62 --- /dev/null +++ b/EXPERIMENTS.md @@ -0,0 +1,506 @@ +# Plan de Experimentación PROTEA + +## Infraestructura + +- **Annotation sets:** 15 GOA snapshots (160–229) +- **Ontology:** releases/2026-01-23 + IA file (IA_cafa6.tsv) +- **Embeddings:** 527K ESM-C 300M (dim=960) +- **Evaluation set:** GOA 220→229 (NK: 2831, LK: 3410, PK: 15313 proteínas) +- **Query set:** `af6bf007` (GOA_220_229, ~20K proteínas) +- **Evaluador:** cafaeval con IA weighting (information accretion) + +**IDs de referencia:** +- Embedding config: `8e7f78c3-900f-452f-858e-63ca14d103e1` +- Annotation set (GOA 220): `c7bdb296-a86a-4141-b5e5-53eb77363ad0` +- Ontology snapshot: `947bdff6-d17c-4ca3-a41a-bc8fb4d74b7a` +- Evaluation set (220→229): `42b34e79-6fe9-4fa0-b718-02f43a1e3192` + +--- + +## Exp 1 — Baseline KNN: efecto de k + +**Scoring:** baseline (`1 - distance/2`), `aspect_separated_knn=true` + +| k | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Estado | +|---|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| **5** | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.676 | 
0.187 | 0.278 | 0.325 | ✅ `d7adeb1e` | +| 10 | 0.400 | 0.574 | 0.656 | 0.458 | 0.537 | 0.663 | 0.177 | 0.272 | 0.317 | ✅ `30bf6187` | +| 20 | 0.396 | 0.564 | 0.649 | 0.454 | 0.528 | 0.654 | 0.173 | 0.269 | 0.313 | ✅ `a4442444` | +| 50 | 0.396 | 0.555 | 0.646 | 0.452 | 0.523 | 0.651 | 0.173 | 0.269 | 0.312 | ✅ `d41b8d05` | + +**Conclusión:** k=5 es óptimo en todas las categorías. Más vecinos = más ruido, degradación monotónica. + +--- + +## Exp 2 — Efecto de `aspect_separated_knn` + +Con k=5, comparar índice unificado vs separado por aspecto (BPO/MFO/CCO). + +| Variante | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Estado | +|----------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| aspect_sep=true | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.676 | 0.187 | 0.278 | 0.325 | ✅ `d7adeb1e` | +| aspect_sep=false | 0.410 | 0.595 | 0.666 | 0.471 | 0.569 | 0.675 | 0.188 | 0.279 | 0.325 | ✅ `bee8fbe7` | + +**Conclusión:** Diferencias mínimas. aspect_sep=false mejora ligeramente MFO (+0.005 NK, +0.011 LK); aspect_sep=true mejora ligeramente BPO. Sin ganancia clara → mantener aspect_sep=true por cobertura uniforme de aspectos. + +--- + +## Exp 3 — Scoring heurístico + +**Requisito:** prediction set con `compute_alignments=true, compute_taxonomy=true` (k=5, aspect_sep=mejor de Exp 2). + +Usa los 5 ScoringConfig presets del sistema. El scoring se aplica en evaluación (no requiere re-predicción para cada config). 
+ +| Config | Fórmula | Pesos | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Estado | +|--------|---------|-------|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| **embedding_only** | linear | emb=1.0 | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | ✅ | +| alignment_weighted | linear | emb=0.5, nw=0.3, sw=0.2 | **0.428** | **0.611** | **0.683** | **0.500** | **0.598** | **0.699** | **0.201** | **0.285** | **0.337** | ✅ | +| evidence_primary | linear | emb=0.2, evi=0.8 | 0.362 | 0.558 | 0.638 | 0.412 | 0.540 | 0.642 | 0.165 | 0.268 | 0.308 | ✅ | +| embedding_plus_evidence | evidence_weighted | emb=1.0, evi=1.0 | 0.352 | 0.531 | 0.618 | 0.387 | 0.517 | 0.626 | 0.162 | 0.250 | 0.300 | ✅ | +| composite | evidence_weighted | emb=0.4, nw=0.2, sw=0.1, evi=0.2, tax=0.1 | 0.364 | 0.560 | 0.639 | 0.412 | 0.542 | 0.642 | 0.167 | 0.267 | 0.307 | ✅ | + +**Prediction set:** `a818b653` (k=5, aspect_sep=true, alignments+taxonomy+reranker_features) + +**Conclusión:** `alignment_weighted` es el mejor scoring en todas las categorías y aspectos. Mejora el baseline (embedding_only) entre +1.5% y +4% Fmax. Las configs que usan evidence_weight (evidence_primary, composite, embedding_plus_evidence) **empeoran** el baseline — la señal de evidencia perjudica el ranking bajo CAFA-eval con IA weighting. + +--- + +## Exp 4 — Re-ranker LightGBM + +**Requisito:** prediction set con `compute_alignments=true, compute_taxonomy=true, compute_reranker_features=true`. + +**Entrenamiento:** `train_reranker_auto` con 12 splits temporales (GOA 160→165 hasta 215→220), test 220→229. +9 modelos (NK/LK/PK × BPO/MFO/CCO), binary CE, features completas (alignments + taxonomy + reranker_features). + +### 4a. 
Sin balance (job `188eb26a`) + +| Cat-Asp | AUC | Iter | Observación | +|---------|-----|------|-------------| +| NK-BPO | 0.771 | 1 | early stop — pocos positivos (0.17%) | +| NK-MFO | 0.938 | 300 | buen modelo | +| NK-CCO | 0.911 | 266 | buen modelo | +| LK-BPO | 0.770 | 1 | early stop | +| LK-MFO | 0.930 | 300 | buen modelo | +| LK-CCO | 0.872 | 300 | buen modelo | +| PK-BPO | 0.779 | 1 | early stop | +| PK-MFO | 0.831 | 1 | early stop | +| PK-CCO | 0.767 | 1 | early stop | + +6 de 9 modelos no aprenden (early stop iter=1) por desbalance extremo. + +### 4b. Con balance `neg_pos_ratio=10` (job `a96eed71`) + +| Cat-Asp | AUC | Iter | Δ AUC vs 4a | +|---------|-----|------|-------------| +| NK-BPO | 0.898 | 4 | +0.127 | +| NK-MFO | 0.922 | 9 | -0.016 | +| NK-CCO | 0.881 | 4 | -0.030 | +| LK-BPO | 0.893 | 4 | +0.124 | +| LK-MFO | 0.925 | 11 | -0.005 | +| LK-CCO | 0.854 | 3 | -0.018 | +| PK-BPO | 0.796 | 2 | +0.017 | +| PK-MFO | 0.849 | 3 | +0.018 | +| PK-CCO | 0.781 | 2 | +0.014 | + +Todos los modelos aprenden. BPO sube ~12 puntos AUC. MFO/CCO bajan ligeramente (menos datos de entrenamiento). 
+ +### Resultados CAFA-eval (v1) + +| Método | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | +|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | +| **alignment_weighted** | **0.428** | **0.611** | 0.683 | **0.500** | **0.598** | 0.699 | 0.201 | 0.285 | 0.337 | +| reranker v1 (sin balance) | 0.384 | 0.584 | **0.695** | 0.447 | 0.482 | **0.713** | 0.201 | 0.284 | 0.335 | +| reranker v1 (balanced) | 0.408 | 0.577 | 0.687 | 0.478 | 0.506 | 0.711 | 0.201 | **0.298** | 0.332 | + +**Conclusiones v1:** +- El balance corrige BPO (+0.024 NK, +0.031 LK vs sin balance) pero no alcanza al heurístico +- Ambos rerankers mejoran **CCO** respecto al baseline (+2-4%) +- Ambos rerankers **empeoran MFO** respecto al heurístico (-3 a -9%) +- El reranker balanced destaca en **PK-MFO** (0.298, mejor de todos los métodos) +- `alignment_weighted` sigue siendo el mejor approach global: gana en 6 de 9 celdas + +--- + +## Exp 5 — Re-ranker v2 (per-categoría con IA weighting) + +**Cambios respecto a v1:** +- 3 modelos per-categoría (NK, LK, PK) en vez de 9 per-aspecto +- `is_unbalance` eliminado (evita doble compensación con `neg_pos_ratio`) +- `learning_rate`: 0.05 → 0.01 +- `num_boost_round`: 300 → 1000 (con `early_stopping_rounds`: 50) +- IA values como `sample_weight` en entrenamiento (términos raros pesan más) + +### 5a. Quick test (2 splits: 211→215→220, test 229) — eval `9242ea3e` + +| Método | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | +|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| reranker v2 (2 splits) | 0.418 | 0.601 | 0.691 | 0.477 | 0.560 | 0.700 | 0.182 | 0.282 | 0.341 | + +MFO ya no se destruye (0.601 vs 0.577 de v1 balanced). Prometedor con solo 2 splits. + +### 5b. 
Full training (13 splits: 160→220, test 229) — eval `a3d3bbea` + +Modelos: `lgbm_v2_full-{nk,lk,pk}` +- NK: `fc013658-9c95-48e8-9c72-c13f477a8b26` +- LK: `8697ffed-6814-4594-85a1-5dae3ea00b1f` +- PK: `cdcbc26f-8f9a-41b2-9196-21bf4f9d3e2e` + +| Método | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | +|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | +| **alignment_weighted** | **0.428** | **0.611** | 0.683 | **0.500** | **0.598** | 0.699 | **0.201** | 0.285 | 0.337 | +| reranker v1 (sin balance) | 0.384 | 0.584 | 0.695 | 0.447 | 0.482 | **0.713** | 0.201 | 0.284 | 0.335 | +| reranker v1 (balanced) | 0.408 | 0.577 | 0.687 | 0.478 | 0.506 | 0.711 | 0.201 | **0.298** | 0.332 | +| **reranker v2 full** | 0.425 | 0.607 | **0.689** | 0.486 | 0.575 | **0.707** | 0.199 | 0.297 | **0.335** | + +**Conclusiones v2 full:** +- **Mucho más robusto que v1** — MFO no se destruye (0.607 vs 0.577 de v1 bal), BPO mejora consistentemente +- **CCO sigue siendo el punto fuerte del reranker**: NK-CCO 0.689, LK-CCO 0.707 (segundo mejor tras v1 unbal) +- **PK recupera**: v2 full (0.199/0.297/0.335) supera al v2 quick test que había caído en PK-BPO +- **alignment_weighted sigue ganando en BPO y MFO**: NK-BPO 0.428 vs 0.425, LK-BPO 0.500 vs 0.486, LK-MFO 0.598 vs 0.575 +- El IA weighting en entrenamiento + modelos per-categoría eliminan la inestabilidad de v1 pero no superan al heurístico globalmente + +--- + +## Exp 6 — Re-ranker v3 (features completas: alineamientos + taxonomía en entrenamiento) + +**Cambio clave respecto a v2:** En v2 las features de alineamiento (NW/SW) y taxonomía estaban hardcodeadas a NULL durante el entrenamiento — el modelo nunca las veía. 
v3 computa `compute_alignment()` y `compute_taxonomy()` por cada par (query, ref) durante la generación de datos de entrenamiento, dando al modelo acceso a las 22 features completas. + +**Configuración:** 13 splits (160→220), test 229, `neg_pos_ratio=10`, IA weights, `compute_alignments=true`, `compute_taxonomy=true`. Tiempo de entrenamiento: ~2h 45m (vs ~2h de v2 — el overhead de alineamientos es mínimo). + +Modelos: `lgbm_v3_full-{nk,lk,pk}` +- NK: `2ff1818f-71b6-4932-8f8d-b3000e3c8d34` +- LK: `269e26b4-0bec-42fa-a077-fe5b675dd2de` +- PK: `e14b9716-bbf8-4b99-b34b-b801c3966579` + +### Resultados CAFA-eval — eval `23851bff` + +| Método | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | +|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | +| alignment_weighted | 0.428 | 0.611 | 0.683 | **0.500** | 0.598 | 0.699 | 0.201 | 0.285 | 0.337 | +| reranker v2 full | 0.425 | 0.607 | 0.689 | 0.486 | 0.575 | 0.707 | 0.199 | 0.297 | 0.335 | +| **reranker v3 full** | **0.431** | **0.620** | **0.692** | 0.478 | **0.607** | 0.697 | **0.201** | **0.297** | **0.339** | + +**Conclusiones v3:** +- **Las features de alineamiento importaban.** v3 supera a v2 en casi todas las métricas, especialmente MFO (+0.013 NK, +0.032 LK) +- **Supera al heurístico `alignment_weighted`** en 7 de 9 celdas: NK-BPO (+0.003), NK-MFO (+0.009), NK-CCO (+0.009), LK-MFO (+0.009), PK-BPO (=), PK-MFO (+0.012), PK-CCO (+0.002) +- Solo pierde en LK-BPO (0.478 vs 0.500) y LK-CCO (0.697 vs 0.699) +- **Resultado positivo**: el re-ranker con features completas es el mejor método global + +--- + +## Resumen de progreso + +| Fase | Experimento | Estado | Mejor Fmax NK-MFO | +|------|-------------|--------|-------------------| +| 1 | Baseline KNN (k sweep) | ✅ | 0.590 (k=5) | +| 2 | aspect_separated_knn | ✅ | ~0.590 (sin diferencia clara) | +| 3 | 
Scoring heurístico (5 configs) | ✅ | 0.611 (alignment_weighted) | +| 4a | Re-ranker v1 LightGBM (sin balance) | ✅ | 0.584 (mejora CCO, empeora MFO) | +| 4b | Re-ranker v1 LightGBM (balanced) | ✅ | 0.577 (mejora PK-MFO a 0.298) | +| 5a | Re-ranker v2 quick test (2 splits) | ✅ | 0.601 (mucho más estable que v1) | +| 5b | Re-ranker v2 full (13 splits) | ✅ | 0.607 (robusto, pero no supera heurístico) | +| 6 | **Re-ranker v3 full (features completas)** | ✅ | **0.620** (supera al heurístico) | +| 7 | **Comparativa eggNOG-mapper** | ✅ | 0.359 (PROTEA 9/9 celdas mejor) | +| 8 | **Comparativa Pannzer2 + data leakage** | ✅ | 0.717 (con leakage: 62.4% NK GT exacto) | +| 9 | **Comparativa InterProScan 6** | ✅ | 0.551 (PROTEA supera en 8/9 celdas) | +| 10 | **ProstT5 vs ESMC (v3 preliminar)** | ⚠️ F3 contaminado por under-training | F1+F2 válidos, F3 pendiente | +| 11 | **Re-train v4 "converged" (5000 rounds)** | 🔄 en curso | — | +| 12 | **Extended PLM matrix (8 modelos)** | 📋 diseño listo (`EXPERIMENTAL_DESIGN.md`) | — | + +**Flujo de dependencias:** +``` +Exp 1 (k sweep) ✅ + → Exp 2 (aspect_sep) ✅ + → Predicción con features completas ✅ (a818b653) + → Exp 3 (scoring configs) ✅ — alignment_weighted gana + → Exp 4 (re-ranker v1, 12 splits) ✅ — mejora CCO, empeora MFO + → Exp 5 (re-ranker v2, per-cat + IA weights) ✅ — robusto pero no supera heurístico + → Exp 6 (re-ranker v3, features completas) ✅ — SUPERA al heurístico + → Exp 7 (eggNOG-mapper comparison) ✅ — PROTEA gana 9/9 celdas + → Exp 8 (Pannzer2 + leakage analysis) ✅ — leakage confirmado, PROTEA única evaluación fair + → Exp 9 (InterProScan 6) ✅ — PROTEA supera en 8/9 celdas +``` + +**Mejor configuración global: `reranker v3 full` (LightGBM per-categoría, 22 features, IA weights)** + +--- + +## Exp 7 — Comparativa con eggNOG-mapper + +**Herramienta:** eggNOG-mapper v2.1.13 (Docker: `quay.io/biocontainers/eggnog-mapper:2.1.13--pyhdfd78af_2`) +**Base de datos:** eggNOG DB v5.0.2 + Diamond v2.0.15 +**Parámetros:** `-m 
diamond --go_evidence experimental --tax_scope auto --target_orthologs all --cpu 8` +**Test set:** 20,281 proteínas del delta GOA 220→229 (mismo que todos los experimentos PROTEA) +**Cobertura:** 17,334/20,281 proteínas con GO terms (85.5%) +**Tiempo:** ~21 minutos (solo CPU, 8 threads) + +### Resultados CAFA-eval (IA-weighted) + +| Método | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | +|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| **eggNOG-mapper 2.1.13** | 0.247 | 0.359 | 0.386 | 0.382 | 0.334 | 0.450 | 0.190 | 0.199 | 0.325 | +| PROTEA baseline (emb only) | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | +| **PROTEA reranker v3** | **0.431** | **0.620** | **0.692** | **0.478** | **0.607** | **0.697** | **0.201** | **0.297** | **0.339** | + +### Diferencia absoluta Fmax (PROTEA v3 - eggNOG-mapper) + +| Categoría | BPO | MFO | CCO | +|-----------|------|------|------| +| NK | +0.184 | +0.261 | +0.306 | +| LK | +0.096 | +0.273 | +0.247 | +| PK | +0.011 | +0.098 | +0.014 | + +**Conclusiones:** +- PROTEA v3 supera a eggNOG-mapper en **9 de 9 celdas** +- Incluso el baseline de PROTEA (solo embeddings) supera a eggNOG-mapper en 8 de 9 celdas +- Las mayores diferencias están en NK y LK (hasta +0.306 Fmax en NK-CCO) +- eggNOG-mapper tiene menor cobertura (85.5% vs 100%) y no produce scores graduados +- Script de evaluación: `scripts/evaluate_external_tool.py` + +--- + +## Exp 8 — Comparativa con Pannzer2 + análisis de data leakage + +**Herramienta:** Pannzer2 (servidor web Helsinki, marzo 2026) +**Base de datos:** UniProt/SwissProt actual (actualizada a fecha de ejecución) +**Test set:** 20,281 proteínas del delta GOA 220→229 (mismo que todos los experimentos) +**Cobertura:** 19,964/20,281 proteínas con GO terms (98.4%) +**Predicciones totales:** 532,557 (max 30 GO terms por proteína, con PPV scores calibrados 0.31–0.91) + +### Resultados CAFA-eval (IA-weighted) + 
+| Método | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | +|--------|--------|--------|--------|--------|--------|--------|--------|--------|--------| +| **Pannzer2** † | **0.656** | **0.717** | **0.791** | **0.681** | **0.729** | **0.813** | **0.391** | **0.574** | **0.618** | +| InterProScan 6 † | 0.312 | 0.551 | 0.476 | 0.479 | 0.488 | 0.491 | 0.208 | 0.269 | 0.250 | +| eggNOG-mapper 2.1.13 † | 0.247 | 0.359 | 0.386 | 0.382 | 0.334 | 0.450 | 0.190 | 0.199 | 0.325 | +| **PROTEA reranker v3** | **0.431** | **0.620** | **0.692** | **0.478** | **0.607** | **0.697** | **0.201** | **0.297** | **0.339** | + +† Subject to temporal data leakage (reference DB from March 2026, after GOA 229). + +### Data leakage: análisis temporal + +Los resultados de Pannzer2 y eggNOG-mapper **no son comparables directamente** con PROTEA debido a data leakage temporal: + +| | Pannzer2 | InterProScan 6 | eggNOG-mapper | PROTEA | +|---|---|---|---|---| +| **Fecha de ejecución** | Marzo 2026 | 25 Mar 2026 | 24 Mar 2026 | — | +| **BD de referencia** | UniProt/SwissProt 2026 | InterPro 2026 | eggNOG v5.0.2 (2026) | GOA 220 (frozen at t0) | +| **Conoce las respuestas?** | Sí | Parcialmente | Parcialmente | No | + +**Cuantificación del leakage:** Se midió el porcentaje de pares (proteína, GO term) del ground truth que aparecen exactamente en las predicciones de cada herramienta. + +| Categoría | GT pairs | Pannzer2 exact match | eggNOG exact match | +|-----------|----------|---------------------|-------------------| +| **Total** | 40,014 | 20,373 (**50.9%**) | 10,308 (25.8%) | +| NK | 6,953 | 4,339 (**62.4%**) | 1,025 (14.7%) | +| LK | 5,520 | 3,624 (**65.7%**) | 1,087 (19.7%) | +| PK | 27,541 | 12,410 (45.1%) | 8,196 (29.8%) | + +Pannzer2 acierta el 62.4% de las anotaciones NK — proteínas que por definición no tenían anotaciones experimentales en t0. 
Esto confirma que su BD de referencia contiene anotaciones posteriores a GOA 220, incluyendo muchas que forman parte del ground truth GOA 229. + +**Conclusión:** PROTEA es la única herramienta del benchmark que garantiza integridad temporal: la referencia se congela en t0, el ground truth se computa como delta, y todo queda versionado en la BD. Los números de Pannzer2 y eggNOG-mapper representan un **upper bound optimista** bajo data leakage, no una comparación fair. + +- Parsing de resultados Pannzer2: `/home/frapercan/Thesis/pannzer2_results/parse_pannzer2.py` +- Raw HTML: `/home/frapercan/Thesis/pannzer2_results/raw/PANZ_{1-21}.html` +- Script de evaluación: `scripts/evaluate_external_tool.py --tool pannzer2` + +--- + +## Hallazgos previos + +- Baseline KNN con `score = 1 - distance/2` da buenos resultados en NK/LK +- Un intento previo de LightGBM per-aspecto (9 modelos) **empeoró** NK/LK: + - Causa 1: optimiza binary CE (todos los GO terms pesan igual) pero CAFA-eval pondera por IC + - Causa 2: features de agregación estaban NULL en el prediction set + +### Cambios de configuración + +- **2026-04-23 — Peso IEA en `DEFAULT_EVIDENCE_WEIGHTS` 0.3 → 0.8.** La jerarquía clásica de GO-docs coloca IEA por debajo del tier computacional (ISS/IBA/... 0.7) y de NAS (0.5). Observación empírica en el histórico de GOA: las anotaciones IEA se promueven a un código experimental con mayor frecuencia que las del tier computacional, por lo que su fiabilidad previa estaba infraestimada. Los tres stages del benchmark (`baseline`, `alignment_weighted`, `reranker` v4) no consumen `evidence_weight`, así que las Fmax reportadas en Exp 1–11 no cambian; el swap sólo afecta a scorings basados en evidencia (p. ej. `evidence_primary`, `composite`, `embedding_plus_evidence`). 
+ +--- + +## Exp 10 — ProstT5 vs ESMC (comparativa preliminar v3) + +**Fecha**: 2026-04-10 +**Objetivo**: replicar el reranker v3 sobre un segundo PLM (ProstT5-XL ~3B) para ver si la ganancia del v3 generaliza más allá de ESMC-300M. + +> **Caveat metodológico importante**: ESMC-300M (~300M params, BERT-like encoder) y ProstT5-XL (~3B params, T5 encoder + structure fine-tuning) son modelos con tamaño y arquitectura distintos. Esta comparativa mezcla esos ejes — no es fair para concluir nada sobre "ESMC vs ProstT5 como familia". El benchmark con matriz limpia está en `EXPERIMENTAL_DESIGN.md` (Exp 12). + +### Setup + +- **Evaluation set**: `42b34e79-6fe9-4fa0-b718-02f43a1e3192` (delta GOA 220→229, 20281 proteínas) +- **ESMC prediction set**: `a818b653-cad9-4f42-8e04-eda3f5ff2ceb` +- **ProstT5 prediction set**: `38ee00af-cbfd-4c5b-ab84-c98a32765b40` +- **IA file**: `IA_cafa6.tsv` +- **Ontology snapshot**: `947bdff6-d17c-4ca3-a41a-bc8fb4d74b7a` + +Rerankers v3 (`num_boost_round=1000, early_stopping_rounds=50, neg_pos_ratio=10, IA sample weights, 13 splits 160→220`): + +| Embedding | NK | LK | PK | +|---|---|---|---| +| ESMC-300M (job `16c3dcfd`) | `2ff1818f` | `269e26b4` | `e14b9716` | +| ProstT5-XL (job `12b704d4`) | `a1b4947d` | `60597ab9` | `1efd0c33` | + +CAFA eval results: +- ESMC + reranker: `ba7476cb-81f2-461a-b69a-a99c8df834bf` +- ProstT5 + reranker: `7b97e74a-54df-4e4e-90ed-39e07b58de64` + +### Resultados (cafaeval + IA, evaluación oficial) + +**F1 — ProstT5 gana en retrieval bruto**: avg Fmax baseline ProstT5 0.4849 vs ESMC 0.4824. Consistente en las 9 celdas: ProstT5 gana 44/45 en el 45-cell benchmark previo. 
+ +**F3 — Reranker per-aspect (9 celdas)**: + +| Método | NK-BPO | NK-MFO | NK-CCO | LK-BPO | LK-MFO | LK-CCO | PK-BPO | PK-MFO | PK-CCO | Avg | +|---|---|---|---|---|---|---|---|---|---|---| +| ESMC baseline | 0.412 | 0.590 | 0.668 | 0.467 | 0.558 | 0.675 | 0.187 | 0.278 | 0.325 | 0.4624 | +| ESMC + reranker v3 | 0.431 | 0.620 | 0.692 | 0.478 | 0.607 | 0.697 | 0.201 | 0.297 | 0.339 | **0.4846** | +| ProstT5 baseline | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | **0.4849** | +| ProstT5 + reranker v3 | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | ~ | 0.4817 | + +- **ESMC mejora con reranker**: 6/9 celdas, avg Δ = **+0.0022** +- **ProstT5 degrada con reranker**: 9/9 celdas, avg Δ = **−0.0032** +- Avg final ESMC+rr (0.4846) ≈ ProstT5+rr (0.4817), diferencia pequeña pero de signo opuesto a la del retrieval bruto + +### F2 — Feature importance (hipótesis de compensación) + +Extracción de `feature_importance` (gain) de los 6 rerankers. Agregado sobre features de `{alignment_*, similarity_*, taxonomic_*}`: + +- **ESMC ponderan alignment+taxonomy entre 2.15% y 5.22% más** que sus homólogos ProstT5 (monótono en NK/LK/PK) +- Diferencias dramáticas en features individuales: + - NK `alignment_score_nw`: ESMC 4.72% vs ProstT5 1.69% (**2.8×**) + - PK `similarity_nw`: ESMC 9.63% vs ProstT5 3.91% (**2.5×**) +- ProstT5 compensa redistribuyendo a features derivadas del embedding: `ref_annotation_density`, `vote_count`, `k_position` + +**Interpretación**: cuando el embedding es "más fuerte" (ProstT5), el reranker se apoya menos en señales externas (alineamiento, taxonomía) y más en estadísticos derivados del propio retrieval. Este es el carry-over de la hipótesis que se va a testear formalmente como H4 en `EXPERIMENTAL_DESIGN.md`. 
+ +### Blocker — under-training en los 6 modelos v3 + +Revisión del `best_iteration` de cada modelo con `num_boost_round=1000, early_stopping_rounds=50`: + +| Modelo | best_iteration | +|---|---| +| ESMC-nk | **1000** (techo, early stop no disparó) | +| ESMC-lk | 994 | +| ESMC-pk | 999 | +| ProstT5-nk | **1000** | +| ProstT5-lk | 995 | +| ProstT5-pk | **1000** | + +Con 95k–332k samples por tier y LR=0.01, este dataset típicamente necesita 3000–10000 iters para saturar. **Conclusión**: los deltas de F3 (especialmente el signo negativo de ProstT5 −0.0032) pueden ser artefacto del under-training, no efecto real del embedding. + +- **F2 (feature importance) sigue siendo válido** — ambos modelos tuvieron el mismo presupuesto bajo el techo, la diferencia *relativa* en cómo distribuyen alignment/taxonomy es una comparación justa +- **F3 (signos de los deltas Fmax) está contaminado** — no se debe usar para la tesis hasta que converjan + +**Lección metodológica crítica**: el campo `test_evaluation` que reporta `train_reranker_auto` muestra deltas de +0.04 a +0.08 Fmax mucho más optimistas que los +0.002 reales de cafaeval. El test_evaluation no aplica propagación GO ni IA weighting — **no usar para la tesis**. Solo cafaeval con IA. + +### Estado + +- F1 y F2: publicables con los números actuales +- F3: **pendiente de re-evaluación** tras v4 (ver Exp 11) +- Estado de trabajo detallado: `project_reranker_benchmark.md` (auto-memory) + +--- + +## Exp 11 — Re-training v4 "converged" (en curso) + +**Fecha de lanzamiento**: 2026-04-10 18:03 UTC +**Objetivo**: re-entrenar los 6 modelos (ESMC y ProstT5, NK/LK/PK) con presupuesto suficiente para que el early stopping dispare de verdad, eliminando el confounder de under-training del Exp 10. 
+ +### Cambios respecto a v3 + +| Parámetro | v3 | v4 | +|---|---|---| +| `num_boost_round` | 1000 | **5000** | +| `early_stopping_rounds` | 50 | **100** | +| Resto | — | idéntico (13 splits 160→220, neg_pos_ratio=10, IA weights, per-tier NK/LK/PK, alignment+taxonomy features) | + +El resto del pipeline (KNN, FAISS IVFFlat, feature engineering) es idéntico — v4 cambia **solo** el presupuesto de boosting. + +### Jobs + +Ambos lanzados a `protea.training` (cola aislada, worker dedicado, peak RAM ~14 GB con los fixes de chunked KNN del 2026-04-10): + +| Job | Modelo | Estado esperado | +|---|---|---| +| `48c91381-1af1-414c-bd1b-a6a51c931873` | `lgbm_v4_converged_esmc` | running (~2h) | +| `e923ac70-21a8-4c5c-8cc6-9ebb76d156aa` | `lgbm_v4_converged_prostt5` | queued, arrancará al terminar ESMC | + +Tiempo estimado total: ~4h serial (protea.training procesa uno a uno). + +### Escenarios esperados al terminar + +- **A — narrativa F2 se confirma**: ProstT5 sigue degradando (−ΔFmax tras converger) → conclusión fuerte de tesis, la hipótesis de compensación gana peso +- **B — ProstT5 pasa a neutro o +**: narrativa se suaviza ("ambos embeddings mejoran con reranker, ESMC un poco más") — F2 sigue válido como explicación +- **C — ambos suben ~0.01-0.02**: confirma que v3 estaba under-trained y da números definitivos más altos que Exp 10 + +### Pendientes cuando termine + +1. Verificar `best_iteration` de los 6 modelos nuevos (esperamos 2000-4000, disparando early stop) +2. Re-lanzar `run_cafa_evaluation` para ambos embeddings con los nuevos reranker UUIDs +3. Re-extraer feature importance y re-validar F2 +4. Reemplazar la tabla de F3 en el Exp 10 con los números de v4 +5. 
Decidir A/B/C y actualizar la narrativa de la tesis en consecuencia + +--- + +## Exp 12 — Extended PLM benchmark matrix (planned) + +**Fecha de diseño**: 2026-04-10 +**Estado**: documento de diseño prospectivo +**Plan completo**: `EXPERIMENTAL_DESIGN.md` + +### Motivación + +Exp 10 expuso el confounder central del trabajo preliminar: comparar ESMC-300M (~300M, BERT-like) con ProstT5-XL (~3B, T5 + structure fine-tuning) mezcla **tamaño** y **familia** en un solo eje. Ningún finding se puede atribuir a una u otra dimensión sin una matriz que los separe. + +### Matriz propuesta (8 modelos) + +| # | Modelo | Params | Backend | Estado | +|---|---|---|---|---| +| 1 | ESMC-300M | ~300M | `esm3c` | ✓ (Exp 10, v4 en curso) | +| 2 | ESMC-600M | ~600M | `esm3c` | nuevo | +| 3 | ESM2-650M (`esm2_t33_650M_UR50D`) | ~650M | `esm` | nuevo | +| 4 | ESM2-3B (`esm2_t36_3B_UR50D`) | ~3B | `esm` | nuevo | +| 5 | Ankh-base (`ElnaggarLab/ankh-base`) | ~450M | `ankh` | nuevo | +| 6 | Ankh-large (`ElnaggarLab/ankh-large`) | ~1.9B | `ankh` | nuevo | +| 7 | ProtT5-XL (`prot_t5_xl_uniref50`) | ~3B | `t5` | nuevo | +| 8 | ProstT5-XL | ~3B | `t5` | ✓ (Exp 10, v4 en curso) | + +**Descartado**: ESM2-15B (coste de embedding prohibitivo, no tiene par T5 de tamaño equivalente → rompe la simetría de la matriz). + +### Research questions (ver `EXPERIMENTAL_DESIGN.md` §2) + +- **RQ1**: ¿a tamaño fijo, qué familia gana (BERT-like vs T5 encoder)? +- **RQ2**: ¿cómo escala Fmax con el tamaño dentro de una familia? ¿Dónde satura? +- **RQ3**: ¿estructura aporta? — test pareado ProtT5-XL vs ProstT5-XL (mismo backbone, única diferencia = 3Di fine-tuning) +- **RQ4**: ¿los embeddings más débiles fuerzan al reranker a compensar con alignment+taxonomy? (carry-over de F2) + +### Protocolo + +Pipeline idéntico para los 8 modelos — cero tuning per-modelo. 
Ver `EXPERIMENTAL_DESIGN.md` §6 para hiperparámetros pinned: KNN `k=5`, FAISS IVFFlat, alignments + taxonomy on, reranker v4 (5000 rounds), `run_cafa_evaluation` con IA weighting. + +### Tests estadísticos + +Wilcoxon signed-rank sobre las 9 celdas Fmax, corrección Holm-Bonferroni sobre 6 comparaciones pareadas, bootstrap CI 95% para effect sizes. Regresión OLS para H4. + +### Coste + +~3-4 días de compute secuencial (embeddings + KNN + v4 training + eval por los 6 modelos nuevos). Comprimible con paralelismo GPU si procede. + +### Estado + +- **Diseño**: completo (`EXPERIMENTAL_DESIGN.md` v1.0) +- **Ejecución**: bloqueada hasta que v4 (Exp 11) valide que el presupuesto es correcto +- **Dependencias previas**: Ankh backend ya integrado en PROTEA como `model_backend="ankh"` dedicado (no alias de `t5`) — ver `project_ankh_backend.md` + +### Deliverables esperados + +1. Tabla master 8 × 3 (baseline / alignment_weighted / reranker) × 9 celdas +2. Heatmap de feature importance de las 24 rerankers (8 modelos × 3 tiers) +3. Report estadístico (p-valores + effect sizes + CIs) por comparación +4. Capítulo de tesis formalizando RQ1-RQ4 con la matriz como evidencia diff --git a/README.md b/README.md index 9cefcd8..f894fa5 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **Protein annotation platform** for large-scale GO term prediction, sequence embedding, and functional analysis. -PROTEA provides a unified backend for ingesting protein data from UniProt, computing ESM2 embeddings, and predicting Gene Ontology terms via KNN transfer — with a full job queue, REST API, and web interface. +PROTEA provides a unified backend for ingesting protein data from UniProt, computing protein language model embeddings (ESMC, ProstT5, ESM2), and predicting Gene Ontology terms via KNN transfer plus a learned LightGBM re-ranker — with a full job queue, REST API, and web interface. 
[![Lint](https://github.com/frapercan/PROTEA/actions/workflows/lint.yml/badge.svg)](https://github.com/frapercan/PROTEA/actions/workflows/lint.yml) [![Tests](https://github.com/frapercan/PROTEA/actions/workflows/test.yml/badge.svg)](https://github.com/frapercan/PROTEA/actions/workflows/test.yml) @@ -21,6 +21,16 @@ PROTEA provides a unified backend for ingesting protein data from UniProt, compu --- +## Why PROTEA? + +PROTEA is the successor to [PIS](https://github.com/CBBIO/protein-information-system) and [FANTASIA](https://github.com/CBBIO/fantasia), rebuilt around three goals: + +1. **Clean architecture** — infrastructure, orchestration, and domain logic are explicitly decoupled. Operations are pure domain logic; workers own sessions and queue state; routers expose HTTP. No more God-classes that mix everything. +2. **Learned re-ranking on top of KNN transfer** — beyond classical embedding-KNN annotation, PROTEA trains **LightGBM rerankers on temporal GOA splits** (LambdaRank + CAFA IA weighting, per-tier NK/LK/PK models). Candidates retrieved by KNN are re-scored with alignment, taxonomy, and retrieval features. +3. **Honest temporal evaluation** — benchmarking uses **temporal holdout deltas** between historical GOA releases (e.g. 220→229), evaluated with the official `cafaeval` library and information-accretion weighting, avoiding the optimistic leakage of random splits. 
+ +--- + ## What PROTEA does | Capability | Details | @@ -28,12 +38,13 @@ PROTEA provides a unified backend for ingesting protein data from UniProt, compu | **Protein ingestion** | Paginated UniProt REST API, MD5-deduplicated sequences | | **GO ontology** | Load OBO snapshots, full DAG stored per release | | **GO annotations** | Bulk import from GOA (GAF) and QuickGO (TSV) | -| **Embeddings** | ESM2 via GPU workers, stored as pgvector VECTOR columns | -| **GO prediction** | KNN transfer with optional NW/SW alignment and taxonomic features | -| **CAFA evaluation** | Benchmark pipeline with cafaeval integration | -| **Job queue** | RabbitMQ-backed, 7 queues, full audit trail per job | -| **REST API** | 21 FastAPI endpoints across 5 routers | -| **Web UI** | Next.js frontend with protein explorer, annotation viewer, prediction browser | +| **Embeddings** | ESMC, ProstT5, and ESM2 backends via GPU workers; stored as pgvector `VECTOR` columns | +| **GO prediction** | KNN transfer (FAISS IVFFlat / numpy) with optional NW/SW alignment and taxonomic features | +| **Learning-to-rank** | LightGBM rerankers trained on temporal GOA splits — LambdaRank + IA weighting, per-tier NK/LK/PK models | +| **CAFA evaluation** | Benchmark pipeline with `cafaeval` integration, Fmax + IA-weighted scoring, per-aspect (BPO/MFO/CCO) results | +| **Job queue** | RabbitMQ-backed, 8 queues (ingestion, embeddings, predictions, training), full audit trail per job | +| **REST API** | FastAPI routers for jobs, proteins, embeddings, query sets, scoring, evaluation, and admin | +| **Web UI** | Next.js frontend with protein explorer, annotation viewer, prediction browser, and live job widget | --- @@ -73,6 +84,52 @@ bash scripts/manage.sh start --- +## 5 minutes to your first job + +With the stack running locally, you can submit a job and watch it +move through the queue + worker + DB lifecycle in under 5 minutes. + +```bash +# 1. Submit a `ping` job (the smoke-test operation). 
+JOB_ID=$(curl -s -X POST http://localhost:8000/jobs \ + -H 'content-type: application/json' \ + -d '{"operation": "ping", "queue_name": "protea.ping", "payload": {}}' \ + | jq -r '.id') +echo "queued: $JOB_ID" + +# 2. Tail the structured-event log until the job reaches a terminal state. +curl -s "http://localhost:8000/jobs/$JOB_ID/events" | jq -c '.[]' +# {"event":"ping.start","fields":null,"level":"info","ts":"..."} +# {"event":"ping.done","fields":{"latency_ms":1.2},"level":"info","ts":"..."} + +# 3. Check the final job row + result. +curl -s "http://localhost:8000/jobs/$JOB_ID" | jq '{status, result, error_code}' +# {"status":"succeeded","result":{"echo":"pong"},"error_code":null} +``` + +That round-trip exercises the full machinery: HTTP enqueue → AMQP +publish → worker claim → operation execute → JobEvent stream → DB +commit → REST query. Real operations (`insert_proteins`, +`load_goa_annotations`, `compute_embeddings`, `predict_go_terms`) +are submitted the same way; their payloads are documented at +`/docs` (Swagger UI) and in the operation-catalog page of the +Sphinx docs. 
+ +Discovering the installed plugins (added in F2B turn 36): + +```bash +curl -s http://localhost:8000/backends | jq '.plugins[].name' +# "ankh", "esm", "esm3c", "t5" + +curl -s http://localhost:8000/sources | jq '.plugins[].name' +# "goa", "quickgo", "uniprot" + +curl -s http://localhost:8000/runners | jq '.plugins[].name' +# "baseline", "knn", "lightgbm" +``` + +--- + ## Documentation Full documentation at **https://protea.readthedocs.io** @@ -103,10 +160,17 @@ poetry run task lint # ruff + flake8 + mypy |---|---| | API | FastAPI + SQLAlchemy 2.x + PostgreSQL 16 + pgvector | | Queue | RabbitMQ (pika) | -| Embeddings | ESM2 (Meta) via Hugging Face Transformers | -| KNN search | FAISS IVFFlat / numpy | +| Embeddings | ESMC (ESM SDK), ProstT5 / prot_t5_xl (T5Encoder), ESM2 (Hugging Face Transformers) | +| KNN search | FAISS IVFFlat / numpy (chunked brute-force) | +| Re-ranker | LightGBM (LambdaRank, IA-weighted samples) | | Frontend | Next.js 19 + Tailwind v4 | -| Deployment | Docker, manage.sh, vast.ai GPU instances | +| Deployment | Docker Compose, `scripts/manage.sh` process supervisor | + +--- + +## License + +Released into the public domain under the [Unlicense](LICENSE). You are free to copy, modify, publish, use, compile, sell, or distribute PROTEA for any purpose, commercial or non-commercial, without attribution. --- diff --git a/RERANKER.md b/RERANKER.md index 2301546..89a0711 100644 --- a/RERANKER.md +++ b/RERANKER.md @@ -1,188 +1,237 @@ -# Temporal Holdout Re-Ranker for GO Term Prediction +# PROTEA Re-Ranker — Design and Rationale -## Motivación +**Status**: implemented (v3 shipped, v4 training in progress) +**Location in code**: `protea/core/reranker.py`, `protea/core/operations/train_reranker.py` +**Version**: 2.0 — 2026-04-10 (rewrite) -El pipeline actual de PROTEA transfiere anotaciones GO mediante KNN sobre embeddings ESM, usando un scoring heurístico que combina distancia de embedding y pesos de evidencia. 
Este scoring no está optimizado para la métrica objetivo (Fmax) ni para el comportamiento real de las anotaciones GO a lo largo del tiempo. - -La hipótesis central es que existe una señal aprendible: **dado el contexto de una predicción KNN, ¿acabará este GO term apareciendo en el siguiente release de GOA para esta proteína?** Esta señal puede extraerse directamente del mecanismo de holdout temporal que ya implementa PROTEA. +> This document describes **the re-ranker as it exists in PROTEA today**. An earlier version of this file proposed a PyTorch cross-attention architecture with WebDataset shards; that proposal was explored on paper but **never implemented**. The system converged on a simpler LightGBM design for the reasons documented in §3 ("Why LightGBM and not a neural cross-encoder"). The experiment log showing the evolution across versions lives in `EXPERIMENTS.md`; the forward-looking PLM benchmark plan that uses this re-ranker as a fixed downstream stage lives in `EXPERIMENTAL_DESIGN.md`. --- -## Formulación del Problema +## 1. Problem statement -Sea $\mathcal{G}_N$ el conjunto de anotaciones GO en el release $N$ de GOA (Swiss-Prot reviewed). Para cada par consecutivo $(G_N, G_{N+1})$, el delta temporal es: +PROTEA predicts GO terms by transferring annotations from the $k$ nearest reference proteins in an embedding space. The raw retrieval score is a distance-based heuristic (e.g. `1 - cosine_distance / 2`) optionally combined with alignment identity and evidence weights. 
This heuristic is: -$$\Delta_{N \to N+1} = \{(p, t) \mid (p, t) \in \mathcal{G}_{N+1} \setminus \mathcal{G}_N\}$$ +- **Not optimised for Fmax with IA weighting** — the metric CAFA actually uses +- **Not calibrated across tiers** — No-Knowledge, Limited-Knowledge and Previously-Known proteins behave very differently and benefit from different signal combinations +- **Not able to use all available features** — sequence alignments, taxonomy, neighbour statistics, and evidence codes are either ignored or combined by hand with arbitrary weights -El re-ranker aprende una función: +The re-ranker replaces this heuristic with a **learned function** that, for each candidate GO term, produces a probability score used to reorder the top-$k$ retrieval list: $$f(q, t, \mathcal{N}_K(q)) \to \hat{y} \in [0, 1]$$ -donde: -- $q$ es la proteína query (representada por su embedding ESM) -- $t$ es el GO term candidato -- $\mathcal{N}_K(q)$ es el conjunto de $K$ vecinos más cercanos en el espacio de embeddings con referencia $\mathcal{G}_N$ -- $\hat{y}$ es la probabilidad de que $(q, t) \in \Delta_{N \to N+1}$ +where $q$ is the query protein, $t$ is a candidate GO term, and $\mathcal{N}_K(q)$ is the set of $K$ nearest neighbours that voted for $t$. + +The training signal is derived from the **temporal structure of GOA releases**: a GO term that first appears for a protein in a later release (and was missing from an earlier one) defines a positive example; any term predicted but absent from the future release is a negative. See §4. --- -## Protocolo de Entrenamiento +## 2. 
Scope of this document -Se utiliza validación cruzada temporal con múltiples splits históricos de GOA: +| Covered | Not covered | +|---|---| +| Model architecture and feature set | Downstream CAFA evaluation protocol (→ `EXPERIMENTAL_DESIGN.md` §7) | +| Training protocol and hyperparameters | PLM comparison across ESMC/ESM2/ProstT5/Ankh (→ `EXPERIMENTAL_DESIGN.md`) | +| Version history and key design decisions | Historical result tables per experiment (→ `EXPERIMENTS.md`) | +| Integration with the PROTEA pipeline | Alternative rankers (cross-attention, ListNet, ProT5 rerankers…) | +| Known limitations | External tool baselines (eggNOG, Pannzer2, InterProScan) | -``` -Training splits: - GOA_190 → GOA_195 - GOA_195 → GOA_200 - GOA_200 → GOA_205 - GOA_205 → GOA_211 - GOA_211 → GOA_215 - GOA_215 → GOA_220 - -Test split (holdout estricto, nunca visto durante training): - GOA_220 → GOA_229 -``` +--- + +## 3. Why LightGBM and not a neural cross-encoder + +The original design (see §11 for the earlier version's record) proposed a cross-attention neural re-ranker in PyTorch, with learned GO term embeddings from the GO DAG and a WebDataset sharded data pipeline. That proposal was abandoned in favour of a LightGBM gradient-boosted tree model for four concrete reasons: -Para cada split se generan ejemplos etiquetados: positivos $(y=1)$ si el par (proteína, GO term) aparece en el delta, negativos $(y=0)$ en caso contrario. El desbalanceo esperado es aproximadamente 1:10, manejable con técnicas estándar. +1. **Data volume is moderate, not huge.** Each temporal split yields 80k–330k training rows after negative subsampling. Gradient boosted trees are the sample-efficient sweet spot for this regime; a cross-attention transformer would either overfit or need heavy regularisation and we would then be tuning architecture choices instead of studying the actual research question. +2. 
**Feature heterogeneity is the bottleneck, not representation.** The informative features are already engineered (alignment scores, taxonomy distance, neighbour statistics). A model whose job is to combine 23 tabular features non-linearly across categorical and numeric axes is exactly what GBDT excels at. A neural cross-encoder would need to learn an equivalent combination from scratch. +3. **Interpretability is a thesis requirement.** The F2 finding (that smaller PLMs force the re-ranker to rely more on alignment/taxonomy) can only be measured through gain-based feature importance. LightGBM exposes this directly; extracting equivalent attributions from a cross-attention model requires additional machinery (integrated gradients, attention rollout) that adds failure modes. +4. **Training cost was a hard constraint.** Each re-ranker (per-tier × per-embedding) trains in 2–4 hours on CPU. The same pipeline under a neural cross-encoder with the same budget would train a single model for similar time on a GPU while blocking the embedding worker. Since the PLM benchmark (`EXPERIMENTAL_DESIGN.md`) multiplies compute cost by 8, the LightGBM choice is what makes the study feasible on a single workstation. + +The cross-attention design was not a wrong idea, only a wrong fit for this problem at this scale. Revisiting it remains an option if a later phase of the work finds a measurable ceiling on LightGBM. --- -## Arquitectura: Cross-Attention Re-Ranker +## 4. Temporal holdout training signal -El modelo procesa cada par (query, GO term) usando el contexto completo de los vecinos KNN que contribuyeron a esa predicción. +Let $\mathcal{G}_N$ denote the set of GO annotations present in GOA release $N$ (Swiss-Prot reviewed, evidence-filtered to exclude IEA if so configured). 
For any ordered pair of releases $(N, N+1)$, the **annotation delta** is -``` -Inputs por predicción (query_protein, go_term): - query_embedding float32[D] ESM embedding del query (D=480 para esmc_300m) - neighbor_embeddings float32[K × D] ESM embeddings de los K vecinos contribuyentes - tabular_features float32[K × F] distancia, evidencia, alineamiento, taxonomía... - go_term_embedding float32[G] embedding semántico del GO term (G=64) - -Arquitectura: - 1. query_proj(query_embedding) → q [H=256] - 2. ref_proj(neighbor_embeddings) → tokens [K × H] - 3. feature_encoder(tabular_features) → (sumado a tokens) - 4. CrossAttention(q, tokens, tokens) → context [H] - 5. MLP([q ‖ context ‖ go_emb ‖ agg_features]) → score [1] -``` +$$\Delta_{N \to N+1} = \{(p, t) \mid (p, t) \in \mathcal{G}_{N+1} \setminus \mathcal{G}_N\}$$ + +For a training pair $(N, N+1)$: -La atención cruzada permite al modelo aprender **qué vecinos son más informativos para este query concreto**, en lugar de agregar los scores de forma heurística. +1. All proteins in $\mathcal{G}_{N+1}$ are used as queries. +2. KNN retrieval is performed using **only** the reference set derived from $\mathcal{G}_N$ (no leakage from the future). +3. For each candidate $(q, t)$ in the retrieval output: + - **Positive** ($y = 1$) if $(q, t) \in \Delta_{N \to N+1}$ (the annotation materialised between $N$ and $N+1$) + - **Negative** ($y = 0$) if the model predicted $t$ but $(q, t) \notin \mathcal{G}_{N+1}$ -### GO Term Embeddings +This definition ensures the training labels are **causally prior** to the prediction: at time $N$ the system does not know what $N+1$ will contain, and neither does the re-ranker while scoring. -Los embeddings de los GO terms se aprenden a partir de la estructura del DAG de GO (relaciones `is_a` / `part_of`) mediante Node2Vec o TransE, de forma que términos semánticamente relacionados (padre-hijo) tengan representaciones similares. 
El DAG ya está disponible en PROTEA a través de los modelos `GOTerm` y `GOTermRelationship`. +The test split $(220 \to 229)$ is never seen during training and produces the Fmax numbers that are reported for the thesis. --- -## Feature Vector +## 5. Feature set (implementation: `protea/core/reranker.py`) -Cada predicción (query, GO term) se caracteriza por las siguientes features tabulares, computadas por vecino que contribuyó a la predicción: +Each (query, candidate GO term, contributing neighbour) triple is characterised by **23 features** — 20 numeric and 3 categorical — computed at KNN time and persisted on `GOPrediction` rows. -| Feature | Descripción | Estado | +### 5.1 Numeric features (20) + +| Group | Feature | Origin | |---|---|---| -| `distance` | Distancia coseno en espacio de embeddings | Existente | -| `evidence_weight` | Peso del código de evidencia (IDA > IEA) | Existente | -| `identity_nw / sw` | Identidad de secuencia (alineamiento NW/SW) | Existente (opcional) | -| `similarity_nw / sw` | Similaridad de secuencia | Existente (opcional) | -| `taxonomic_distance` | Distancia taxonómica entre query y referencia | Existente (opcional) | -| `vote_count` | Número de vecinos que coinciden en este GO term | **Nuevo** | -| `k_position` | Posición del vecino más cercano que predijo este término | **Nuevo** | -| `go_term_frequency` | Frecuencia del término en el annotation set de referencia | **Nuevo** | -| `ref_annotation_density` | Número de GO terms de la proteína de referencia | **Nuevo** | -| `neighbor_distance_std` | Varianza de distancias a los K vecinos | **Nuevo** | +| **Embedding retrieval** | `distance` | cosine distance between query and the contributing neighbour | +| **NW alignment** | `identity_nw`, `similarity_nw`, `alignment_score_nw`, `gaps_pct_nw`, `alignment_length_nw` | Needleman–Wunsch via parasail (BLOSUM62), computed per (query, neighbour) pair when `compute_alignments=True` | +| **SW alignment** | `identity_sw`, `similarity_sw`, 
`alignment_score_sw`, `gaps_pct_sw`, `alignment_length_sw` | Smith–Waterman via parasail (BLOSUM62), same condition | +| **Sequence length** | `length_query`, `length_ref` | Raw sequence lengths | +| **Taxonomy** | `taxonomic_distance`, `taxonomic_common_ancestors` | NCBI taxonomy LCA via ete3 when `compute_taxonomy=True` | +| **Neighbour aggregation** | `vote_count` | Number of neighbours in the top-$k$ that voted for the same GO term | +| | `k_position` | Rank (0-indexed) of the closest neighbour that supported the term | +| | `go_term_frequency` | Global frequency of the term in the reference annotation set | +| | `ref_annotation_density` | Number of distinct GO terms annotating the reference protein | +| | `neighbor_distance_std` | Standard deviation of distances across the $k$ neighbours of the query | + +### 5.2 Categorical features (3) + +| Feature | Meaning | +|---|---| +| `qualifier` | GAF qualifier of the source annotation (`enables`, `involved_in`, etc.) | +| `evidence_code` | GAF evidence code of the source annotation (`EXP`, `IDA`, `IEA`, …) | +| `taxonomic_relation` | Discrete label derived from the LCA (`same_species`, `same_genus`, `same_family`, `distant`) | + +Categoricals are passed to LightGBM via its native `categorical_feature` handling (no one-hot encoding; LightGBM partitions on category sets directly). + +### 5.3 Missing-value convention + +- Numeric missing values are left as `NaN` and handled natively by LightGBM's missing-value-aware splits. +- Categorical missing values are coerced to `NA` and treated as a distinct bin. +- Alignment and taxonomy columns are only populated when `compute_alignments=True` / `compute_taxonomy=True` at prediction time. If either flag is off, those columns are all-NaN for the run and the re-ranker still trains but with a degraded feature set. --- -## Función de Pérdida +## 6. 
Model and training protocol + +### 6.1 Model + +- **Library**: LightGBM (`lightgbm.Booster`) +- **Objective**: `binary` (binary cross-entropy / log loss) +- **Validation metric**: `binary_logloss` and `auc` (early stopping is tracked on AUC) +- **Boosting**: `gbdt` with `num_leaves=31`, `learning_rate=0.01`, `feature_fraction=0.8`, `bagging_fraction=0.8`, `bagging_freq=5`, `seed=42` +- **Early stopping**: disabled via callback only if `early_stopping_rounds=0`; otherwise stops when validation AUC does not improve for the configured number of rounds -Se utiliza **LambdaRank** en lugar de binary cross-entropy, ya que optimiza directamente el orden de las predicciones (proxy de NDCG / Fmax) en lugar de la calibración de probabilidades. +> **Note on the objective.** Earlier drafts of this document (and informal notes) described the loss as **LambdaRank**. The implementation is actually **binary cross-entropy**. Switching to a pairwise/listwise rank loss is a known avenue for future work; it was deferred because (a) binary CE is the simpler baseline and has already matched or beaten the heuristic `alignment_weighted` scoring and (b) LambdaRank would require restructuring the training data into query groups, which complicates the per-split sampling pipeline. -Para cada proteína query, las predicciones GO se rankean conjuntamente: -- Positivos: GO terms en $\Delta_{N \to N+1}$ -- Negativos: GO terms predichos pero no en el delta +### 6.2 Split strategy + +- **Stratified train/val split** at `val_fraction=0.2`, stratified on the label (the positive rate is 0.17%–5% depending on tier × aspect — naive random splits would under-represent positives in the validation set). +- **Negative subsampling** via `neg_pos_ratio=10`: after splitting, each of the train and val sets is independently subsampled so that `|negatives| ≤ 10 × |positives|`. 
Without this step, 6 of 9 per-(tier, aspect) models in v1 failed to learn at all — the positive rate was too low for gradient boosted trees to see a signal. +- **IA sample weighting**: when an information accretion file is provided, each row's `sample_weight` is set to `IA(go_term)`. This makes the model focus on informative (rare, specific) GO terms — the same aspect of the term that CAFA evaluation rewards via IA-weighted Fmax. + +### 6.3 Per-tier, not per-aspect + +One model is trained **per tier** (`NK`, `LK`, `PK`), not per (tier × aspect). This was an explicit change in v2 after v1 trained 9 models (one per cell) and 6 of them either never converged or overfit on the smaller aspect slices. Aspect identity is not currently used as a feature; this is a known simplification (see §9). + +### 6.4 Temporal splits + +- **Training pairs**: 12 consecutive deltas from GOA 160 through GOA 220 — `[(160,165), (165,170), (170,175), (175,180), (180,185), (185,190), (190,195), (195,200), (200,205), (205,211), (211,215), (215,220)]`. The training rows from all pairs are concatenated and passed to LightGBM as a single dataset. Pair identity is not used as a feature. +- **Test pair**: `(220, 229)` — never seen during training. The test set is passed through the trained reranker and fed to `run_cafa_evaluation` alongside the baseline to measure the lift. + +### 6.5 Budget + +| Version | `num_boost_round` | `early_stopping_rounds` | Comment | +|---|---|---|---| +| v1 | 300 | 50 | 6/9 models hit iter=1 (early stop on first round) — under-trained, unbalanced | +| v2 | 1000 | 50 | Stable; per-tier models; IA weighting introduced | +| v3 | 1000 | 50 | Same budget; alignment + taxonomy features fully populated in training (were NULL in v2) | +| v4 | **5000** | **100** | In progress 2026-04-10: all 6 v3 models hit `best_iteration ≈ 1000` — implying they never converged under the previous budget. v4 restores early stopping as a convergence criterion, not a time-out. 
| --- -## Pipeline de Datos: WebDataset +## 7. Integration with the PROTEA pipeline -El volumen de datos (múltiples splits × ~1.35M predicciones por split × embeddings de 480 dim) requiere un pipeline de datos eficiente. Se propone almacenar los ejemplos de entrenamiento en formato **WebDataset** (shards tar), con un shard por split GOA: +### 7.1 ORM and persistence -``` -reranker_data/ - splits/ - goa190_to_195.tar # ~2GB por shard - goa195_to_200.tar - ... - goa220_to_229.tar # test split — no tocar durante training - models/ - reranker_v1.pt - reranker_v1_config.json -``` +- **`Reranker` row** (table: `rerankers`) — stores the trained LightGBM booster serialised as bytes alongside training metadata (`feature_importance`, `val_auc`, `best_iteration`, `train_samples`, hyperparameters, parent `job_id`). +- **`RerankerTrainingJob`** row captures the auto-pipeline metadata (splits used, features computed, per-tier model IDs). + +### 7.2 Scoring router + +The `scoring` router exposes endpoints to list and inspect rerankers: +- `GET /scoring/rerankers` — list trained rerankers +- `GET /scoring/rerankers/{id}` — metadata + feature importance -Cada muestra en el WebDataset es **una proteína query** con todas sus predicciones GO para ese split: +### 7.3 Applying the re-ranker at evaluation time -```python +At evaluation time (`run_cafa_evaluation`), the caller supplies a `rerankers` mapping that selects a re-ranker per tier: + +```json { - "query_accession": "P12345", - "query_embedding": float32[480], - "go_term_ids": ["GO:0006915", "GO:0005737", ...], # N_preds - "neighbor_embeddings": float32[N_preds, K, 480], - "tabular_features": float32[N_preds, K, F], - "labels": int8[N_preds], # 1 si en delta, 0 si no + "rerankers": { + "nk": {"reranker_id": "2ff1818f-71b6-4932-8f8d-b3000e3c8d34"}, + "lk": {"reranker_id": "269e26b4-0bec-42fa-a077-fe5b675dd2de"}, + "pk": {"reranker_id": "e14b9716-bbf8-4b99-b34b-b801c3966579"} + } } ``` -El streaming de WebDataset permite entrenar 
sin cargar todo en RAM. +The evaluation operation: +1. Streams predictions from the target `PredictionSet` tier by tier. +2. For each tier, loads the corresponding booster, applies it to the feature matrix, and overrides the original `score` with the re-ranked probability. +3. Feeds the re-ranked predictions to `cafaeval` with IA weighting and emits per-cell Fmax. ---- +The raw `PredictionSet` is never mutated — the re-ranker only changes the `score` column as the rows pass through evaluation. This means a single prediction set can be evaluated under multiple re-rankers (ESMC, ProstT5, v3, v4, …) without duplicating storage. -## Stack Tecnológico +### 7.4 `train_reranker_auto` operation -| Componente | Tecnología | -|---|---| -| Modelo | PyTorch | -| Data pipeline | WebDataset + torch.utils.data | -| Baseline comparación | LightGBM (binary + LambdaRank) | -| GO embeddings | Node2Vec / PyTorch Geometric | -| Seguimiento experimentos | wandb | -| Embeddings proteína | ESM2 / ESMC (ya en PROTEA) | +The operation `train_reranker_auto` orchestrates the full pipeline end-to-end: ---- +1. For each training pair, runs KNN retrieval (FAISS IVFFlat by default) with `compute_alignments=True`, `compute_taxonomy=True`. +2. Writes per-pair parquet files into a temporary directory. +3. Loads the concatenation into memory, applies per-tier splits, trains three LightGBM boosters. +4. Persists the three boosters as `Reranker` rows under a common base name. +5. Optionally runs a self-evaluation on the held-out test split (see warning in §8). +6. **Cleans up the temporary parquet files** on exit (`shutil.rmtree(tmp_dir)` at `train_reranker.py:1480`). + +The cleanup in step 6 has an important consequence: **re-training only the LightGBM stage is not possible** after a pipeline run — a re-train requires re-executing the full KNN + feature engineering path. This is why each v-version re-train takes hours, not minutes. 
-## Integración en PROTEA +--- -Una vez entrenado, el re-ranker se integra en el pipeline existente: +## 8. Known limitations and caveats -1. Nuevo modelo ORM `RerankingModel`: almacena pesos serializados y metadata de entrenamiento -2. Campo `reranker_id` (nullable) en `PredictionSet` -3. Si `reranker_id` presente: `store_predictions` aplica el modelo y sobreescribe `score` con $\hat{y}$ -4. El threshold de Fmax se calcula igual que ahora sobre los nuevos scores -5. UI: selector de re-ranker en la pantalla de predicción +1. **`test_evaluation` is not comparable to `cafaeval`.** The operation optionally runs an internal test evaluator against the held-out split. That evaluator does not apply GO propagation, does not apply IA weighting, and uses a naive macro-Fmax that inflates improvements by +0.04 to +0.08 over what `cafaeval` actually reports. **It must not be used in thesis claims.** Only `run_cafa_evaluation` with IA and GO propagation produces numbers that belong in the thesis. +2. **Binary objective is a proxy for ranking.** Binary cross-entropy optimises pointwise calibration, not ranking quality. This is the single largest known gap between the current implementation and the ideal model for Fmax. Replacing it with LambdaRank (or a listwise objective) is the first item on the "future work" list. +3. **Parquet staging files are ephemeral.** The KNN + feature engineering output is thrown away at the end of a training run, so the LightGBM stage cannot be iterated independently. Persisting the staging parquet (behind a flag) would allow rapid hyperparameter sweeps. Open question: is the additional disk cost (10–20 GB per run) worth it? +4. **No aspect feature.** Aspect is not used as a feature, even though BPO/MFO/CCO have very different annotation densities and the same term can behave differently across aspects. A per-tier model averages across aspects and may under-perform in MFO vs BPO. +5. **No uncertainty output.** The re-ranker emits a point probability. 
Downstream evaluation is sensitive to calibration, but calibration is not currently measured. A reliability diagram per tier would help diagnose whether the probabilities are meaningful or only usable for ranking. +6. **Under-training of v1–v3.** All six v3 models (ESMC and ProstT5, NK/LK/PK) hit `best_iteration ≈ 1000` at the previous budget, which indicates the models never satisfied the early stopping criterion. The Fmax deltas derived from v3 must be treated as provisional until v4 completes. See `project_reranker_benchmark.md` for the full story. +7. **Temporal label noise.** Some annotations in $\Delta_{N \to N+1}$ are not genuinely "new biology"; they are curation catch-ups. There is no filter for this, so the training label includes noise. Evidence code filtering removes the worst offenders (IEA) but not all. +8. **Single embedding at a time.** The re-ranker is trained on features derived from one embedding configuration. There is no multi-embedding ensemble; comparing ESMC, ProstT5 and Ankh means training three independent re-rankers — which is exactly what the benchmark in `EXPERIMENTAL_DESIGN.md` does. --- -## Experimentos y Ablaciones +## 9. 
Version history -El diseño permite comparar directamente: +| Version | Date | Change | Outcome | +|---|---|---|---| +| v1 (unbalanced) | 2026-03-22 | First working pipeline: 9 per-(tier, aspect) models, binary CE, 300 rounds, no sample weights, no negative subsampling | 6/9 models never learned (positive rate too low); CCO/MFO noisy | +| v1 (balanced) | 2026-03-22 | Added `neg_pos_ratio=10`; same 9 models | All models learned; BPO recovered; MFO degraded vs heuristic | +| v2 | 2026-03-23 | Collapsed to 3 per-tier models (NK/LK/PK); added IA sample weighting; raised `num_boost_round` to 1000 | Robust; matched the heuristic `alignment_weighted` in most cells but did not beat it | +| v3 | 2026-03-23 | Populated alignment + taxonomy features during training (were NULL in v2) | First version to beat `alignment_weighted` in 7/9 cells for ESMC-300M | +| v3 ProstT5 | 2026-04-10 | Same v3 protocol, run on ProstT5-XL embeddings for cross-embedding comparison | Yielded the F1/F2/F3 findings in `project_reranker_benchmark.md`; exposed the under-training in v3 | +| v4 (in progress) | 2026-04-10 | Raised `num_boost_round` to 5000 and `early_stopping_rounds` to 100; same features, same splits | In training for both ESMC-300M and ProstT5-XL (jobs `48c91381`, `e923ac70`); meant to provide the converged reference numbers | -| Configuración | Descripción | -|---|---| -| **Baseline** | KNN + scoring heurístico actual | -| **LightGBM tabular** | Re-ranker con features tabulares sin embeddings | -| **LightGBM + derived** | Features tabulares + features derivadas del embedding (density, std) | -| **MLP cross-encoder** | Arquitectura completa sin cross-attention | -| **Cross-attention (propuesto)** | Arquitectura completa | -| **+ GO DAG embeddings** | Ablación: ¿aportan los go_term_emb? | -| **+ temporal CV** | Ablación: ¿mejora añadir más splits históricos? 
| +Concrete reranker UUIDs for the v3 and v4 runs live in `project_reranker_benchmark.md` and will be mirrored into `EXPERIMENTS.md` once v4 completes. + +--- + +## 10. Forward pointers -La métrica principal es **Fmax promedio sobre los 9 settings** (NK/LK/PK × BPO/MFO/CCO) en el test split GOA220→229. +- **`EXPERIMENTS.md`** — per-experiment tables, external tool comparisons, day-to-day lab notebook. +- **`EXPERIMENTAL_DESIGN.md`** — the prospective 8-model PLM comparison that uses this re-ranker as a fixed downstream stage. +- **`project_reranker_benchmark.md`** (in auto-memory) — volatile working state for the ongoing benchmark. +- **Code**: `protea/core/reranker.py` (feature definitions, `train`, `predict_scores`), `protea/core/operations/train_reranker.py` (both `TrainRerankerPayload` and `TrainRerankerAutoPayload`, the full pipeline). --- -## Valor para la Tesis +## 11. Historical note: why this file was rewritten -1. **Científicamente honesto**: el mismo mecanismo temporal que se usa para evaluar se usa para entrenar. No hay data leakage. -2. **Comprobable y cuantificable**: Fmax(baseline KNN) vs Fmax(re-ranker) en benchmark idéntico. -3. **Interpretable**: las feature importances (LightGBM) o los pesos de atención (cross-attention) revelan qué aspectos de una predicción KNN son más predictivos de anotaciones futuras. -4. **Generalizable**: el re-ranker aprende sobre distribuciones temporales de anotaciones GO, no sobre una proteína concreta — debería generalizar a proteínas no vistas. -5. **Extensible**: la arquitectura admite incorporar embeddings de secuencia de mayor calidad (ESM3, ProstT5) sin cambiar el pipeline. +The previous version of `RERANKER.md` (removed 2026-04-10) proposed a PyTorch cross-attention re-ranker over ESM embeddings with WebDataset sharded I/O, Node2Vec GO term embeddings, wandb tracking, and a nine-cell (tier × aspect) ablation matrix. That design was never built. 
The system that actually exists and produces the benchmark numbers in `EXPERIMENTS.md` is the LightGBM pipeline documented above. Keeping the two in sync was causing confusion when referring back to the design doc during thesis writing, so the document was rewritten from the current source of truth (`protea/core/reranker.py`) rather than from the original proposal. The historical proposal is preserved in git history for reference. diff --git a/alembic/versions/651358a5a2c8_add_consensus_features_to_go_prediction.py b/alembic/versions/651358a5a2c8_add_consensus_features_to_go_prediction.py new file mode 100644 index 0000000..61820ad --- /dev/null +++ b/alembic/versions/651358a5a2c8_add_consensus_features_to_go_prediction.py @@ -0,0 +1,37 @@ +"""add consensus features to go_prediction + +Revision ID: 651358a5a2c8 +Revises: b1a1f4ec0e42 +Create Date: 2026-04-16 10:00:00.000000 +""" +from __future__ import annotations + +import sqlalchemy as sa + +from alembic import op + +revision: str = "651358a5a2c8" +down_revision: str = "b1a1f4ec0e42" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "go_prediction", + sa.Column("neighbor_vote_fraction", sa.Float(), nullable=True), + ) + op.add_column( + "go_prediction", + sa.Column("neighbor_min_distance", sa.Float(), nullable=True), + ) + op.add_column( + "go_prediction", + sa.Column("neighbor_mean_distance", sa.Float(), nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("go_prediction", "neighbor_mean_distance") + op.drop_column("go_prediction", "neighbor_min_distance") + op.drop_column("go_prediction", "neighbor_vote_fraction") diff --git a/alembic/versions/76cafcb8d9be_add_groundtruth_uri_to_evaluation_set.py b/alembic/versions/76cafcb8d9be_add_groundtruth_uri_to_evaluation_set.py new file mode 100644 index 0000000..d75d9ff --- /dev/null +++ b/alembic/versions/76cafcb8d9be_add_groundtruth_uri_to_evaluation_set.py @@ -0,0 +1,32 @@ +"""add groundtruth_uri to evaluation_set + 
+Revision ID: 76cafcb8d9be +Revises: e037f3ae9f58 +Create Date: 2026-04-22 01:50:29.469554 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '76cafcb8d9be' +down_revision: Union[str, Sequence[str], None] = 'e037f3ae9f58' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('evaluation_set', sa.Column('groundtruth_uri', sa.String(length=512), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('evaluation_set', 'groundtruth_uri') + # ### end Alembic commands ### diff --git a/alembic/versions/7a2c9e1d0b33_add_reranker_v6_features_to_go_prediction.py b/alembic/versions/7a2c9e1d0b33_add_reranker_v6_features_to_go_prediction.py new file mode 100644 index 0000000..c19a3b6 --- /dev/null +++ b/alembic/versions/7a2c9e1d0b33_add_reranker_v6_features_to_go_prediction.py @@ -0,0 +1,55 @@ +"""add reranker v6 features to go_prediction + +Adds 25 nullable Float columns used by the v6 reranker: + +- 6 Anc2Vec semantic-coherence features (neighbor + query-known). +- 3 tax_voters consensus features (computed over the subset of neighbors that + voted for each candidate term). +- 16 emb_pca_query_* features (per-query projection onto the precomputed + principal components of the reference embedding pool). + +All columns are nullable because older prediction_sets predate these features +and older reranker versions do not read them. 
+ +Revision ID: 7a2c9e1d0b33 +Revises: 651358a5a2c8 +Create Date: 2026-04-19 12:00:00.000000 +""" +from __future__ import annotations + +import sqlalchemy as sa + +from alembic import op + +revision: str = "7a2c9e1d0b33" +down_revision: str = "651358a5a2c8" +branch_labels = None +depends_on = None + + +_ANC2VEC_COLS = ( + "anc2vec_neighbor_cos", + "anc2vec_neighbor_maxcos", + "anc2vec_has_emb", + "anc2vec_query_known_cos", + "anc2vec_query_known_maxcos", + "anc2vec_query_known_count", +) + +_TAX_VOTERS_COLS = ( + "tax_voters_same_frac", + "tax_voters_close_frac", + "tax_voters_mean_common_ancestors", +) + +_EMB_PCA_COLS = tuple(f"emb_pca_query_{i}" for i in range(16)) + + +def upgrade() -> None: + for col in (*_ANC2VEC_COLS, *_TAX_VOTERS_COLS, *_EMB_PCA_COLS): + op.add_column("go_prediction", sa.Column(col, sa.Float(), nullable=True)) + + +def downgrade() -> None: + for col in reversed((*_ANC2VEC_COLS, *_TAX_VOTERS_COLS, *_EMB_PCA_COLS)): + op.drop_column("go_prediction", col) diff --git a/alembic/versions/b1a1f4ec0e42_sequence_embedding_to_halfvec.py b/alembic/versions/b1a1f4ec0e42_sequence_embedding_to_halfvec.py new file mode 100644 index 0000000..e8927eb --- /dev/null +++ b/alembic/versions/b1a1f4ec0e42_sequence_embedding_to_halfvec.py @@ -0,0 +1,54 @@ +"""migrate sequence_embedding.embedding from vector to halfvec + +Revision ID: b1a1f4ec0e42 +Revises: f7a004f5f2c7 +Create Date: 2026-04-14 22:00:00.000000 +""" +from __future__ import annotations + +from alembic import op + +revision: str = "b1a1f4ec0e42" +down_revision: str = "f7a004f5f2c7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute( + """ + DO $$ + BEGIN + IF ( + SELECT udt_name + FROM information_schema.columns + WHERE table_name = 'sequence_embedding' + AND column_name = 'embedding' + ) = 'vector' THEN + ALTER TABLE sequence_embedding + ALTER COLUMN embedding TYPE halfvec + USING embedding::halfvec; + END IF; + END $$; + """ + ) + + +def downgrade() -> None: + 
op.execute( + """ + DO $$ + BEGIN + IF ( + SELECT udt_name + FROM information_schema.columns + WHERE table_name = 'sequence_embedding' + AND column_name = 'embedding' + ) = 'halfvec' THEN + ALTER TABLE sequence_embedding + ALTER COLUMN embedding TYPE vector + USING embedding::vector; + END IF; + END $$; + """ + ) diff --git a/alembic/versions/b2c3d4e5f6a7_add_embedding_config_display_metadata.py b/alembic/versions/b2c3d4e5f6a7_add_embedding_config_display_metadata.py new file mode 100644 index 0000000..b2ff521 --- /dev/null +++ b/alembic/versions/b2c3d4e5f6a7_add_embedding_config_display_metadata.py @@ -0,0 +1,40 @@ +"""add display metadata columns to embedding_config + +Revision ID: b2c3d4e5f6a7 +Revises: 3505bfa74df6 +Create Date: 2026-04-10 + +Adds three nullable columns to ``embedding_config`` so the benchmark UI can +show a human-readable label, a family tag, and the approximate parameter +count without having to infer everything from the raw HuggingFace +``model_name`` at render time. + +All columns are nullable — existing rows can be backfilled later with +``UPDATE embedding_config SET display_name = ..., family = ..., param_count = ...`` +or left as NULL (the router falls back to the Python-side derivation). +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = 'b2c3d4e5f6a7' +down_revision: str | Sequence[str] | None = '3505bfa74df6' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.add_column('embedding_config', sa.Column('display_name', sa.String(), nullable=True)) + op.add_column('embedding_config', sa.Column('family', sa.String(), nullable=True)) + op.add_column('embedding_config', sa.Column('param_count', sa.BigInteger(), nullable=True)) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_column('embedding_config', 'param_count') + op.drop_column('embedding_config', 'family') + op.drop_column('embedding_config', 'display_name') diff --git a/alembic/versions/c4d5e6f7a8b9_add_taxonomy_to_query_set_entry.py b/alembic/versions/c4d5e6f7a8b9_add_taxonomy_to_query_set_entry.py new file mode 100644 index 0000000..438cda9 --- /dev/null +++ b/alembic/versions/c4d5e6f7a8b9_add_taxonomy_to_query_set_entry.py @@ -0,0 +1,43 @@ +"""add taxonomy_id and species to query_set_entry + +Revision ID: c4d5e6f7a8b9 +Revises: b2c3d4e5f6a7 +Create Date: 2026-04-11 + +Adds two nullable columns to ``query_set_entry`` so user-uploaded FASTA +sequences can carry their UniProt header taxonomy (``OX=`` / ``OS=``) even +when the accession is not present in the ``protein`` table and therefore +has no ``ProteinUniProtMetadata`` counterpart. + +The populating helper lives in ``protea.api.routers.query_sets`` and is a +silent no-op for non-UniProt headers. +""" +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = 'c4d5e6f7a8b9' +down_revision: str | Sequence[str] | None = 'b2c3d4e5f6a7' +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.add_column('query_set_entry', sa.Column('taxonomy_id', sa.Integer(), nullable=True)) + op.add_column('query_set_entry', sa.Column('species', sa.String(), nullable=True)) + op.create_index( + 'ix_query_set_entry_taxonomy_id', + 'query_set_entry', + ['taxonomy_id'], + ) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_index('ix_query_set_entry_taxonomy_id', table_name='query_set_entry') + op.drop_column('query_set_entry', 'species') + op.drop_column('query_set_entry', 'taxonomy_id') diff --git a/alembic/versions/c517e16da06b_reranker_model_artifact_columns.py b/alembic/versions/c517e16da06b_reranker_model_artifact_columns.py new file mode 100644 index 0000000..eeeaf93 --- /dev/null +++ b/alembic/versions/c517e16da06b_reranker_model_artifact_columns.py @@ -0,0 +1,81 @@ +"""reranker_model_artifact_columns + +Revision ID: c517e16da06b +Revises: 7a2c9e1d0b33 +Create Date: 2026-04-21 02:57:27.951747 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'c517e16da06b' +down_revision: Union[str, Sequence[str], None] = '7a2c9e1d0b33' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.add_column('reranker_model', sa.Column('artifact_uri', sa.String(length=512), nullable=True)) + op.add_column('reranker_model', sa.Column('feature_schema_sha', sa.String(length=16), nullable=True)) + op.add_column('reranker_model', sa.Column('embedding_config_id', sa.UUID(), nullable=True)) + op.add_column('reranker_model', sa.Column('ontology_snapshot_id', sa.UUID(), nullable=True)) + op.add_column('reranker_model', sa.Column('producer_version', sa.String(length=64), nullable=True)) + op.add_column('reranker_model', sa.Column('producer_git_sha', sa.String(length=40), nullable=True)) + op.add_column('reranker_model', sa.Column('spec_yaml', sa.Text(), nullable=True)) + # model_data goes nullable so new rows can live exclusively by reference + # (artifact_uri). Downgrade restores NOT NULL — will fail loudly if any + # row has a NULL model_data, which is the correct behavior. 
+ op.alter_column( + 'reranker_model', 'model_data', + existing_type=sa.TEXT(), + nullable=True, + ) + op.create_index( + op.f('ix_reranker_model_embedding_config_id'), + 'reranker_model', ['embedding_config_id'], unique=False, + ) + op.create_index( + op.f('ix_reranker_model_ontology_snapshot_id'), + 'reranker_model', ['ontology_snapshot_id'], unique=False, + ) + op.create_foreign_key( + 'fk_reranker_model_ontology_snapshot_id', + 'reranker_model', 'ontology_snapshot', + ['ontology_snapshot_id'], ['id'], ondelete='SET NULL', + ) + op.create_foreign_key( + 'fk_reranker_model_embedding_config_id', + 'reranker_model', 'embedding_config', + ['embedding_config_id'], ['id'], ondelete='SET NULL', + ) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_constraint( + 'fk_reranker_model_embedding_config_id', + 'reranker_model', type_='foreignkey', + ) + op.drop_constraint( + 'fk_reranker_model_ontology_snapshot_id', + 'reranker_model', type_='foreignkey', + ) + op.drop_index(op.f('ix_reranker_model_ontology_snapshot_id'), table_name='reranker_model') + op.drop_index(op.f('ix_reranker_model_embedding_config_id'), table_name='reranker_model') + op.alter_column( + 'reranker_model', 'model_data', + existing_type=sa.TEXT(), + nullable=False, + ) + op.drop_column('reranker_model', 'spec_yaml') + op.drop_column('reranker_model', 'producer_git_sha') + op.drop_column('reranker_model', 'producer_version') + op.drop_column('reranker_model', 'ontology_snapshot_id') + op.drop_column('reranker_model', 'embedding_config_id') + op.drop_column('reranker_model', 'feature_schema_sha') + op.drop_column('reranker_model', 'artifact_uri') diff --git a/alembic/versions/c7bab0210568_add_dataset_table.py b/alembic/versions/c7bab0210568_add_dataset_table.py new file mode 100644 index 0000000..c9791bc --- /dev/null +++ b/alembic/versions/c7bab0210568_add_dataset_table.py @@ -0,0 +1,67 @@ +"""add dataset table + +Revision ID: c7bab0210568 +Revises: c517e16da06b +Create Date: 2026-04-21 
20:45:37.964428 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'c7bab0210568' +down_revision: Union[str, Sequence[str], None] = 'c517e16da06b' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('dataset', + sa.Column('id', sa.UUID(), nullable=False), + sa.Column('name', sa.String(length=255), nullable=False), + sa.Column('operation', sa.String(length=64), nullable=False), + sa.Column('job_id', sa.UUID(), nullable=True), + sa.Column('storage_backend', sa.String(length=32), nullable=False), + sa.Column('key_prefix', sa.String(length=512), nullable=False), + sa.Column('train_uri', sa.String(length=1024), nullable=True), + sa.Column('eval_uri', sa.String(length=1024), nullable=True), + sa.Column('manifest_uri', sa.String(length=1024), nullable=False), + sa.Column('schema_sha', sa.String(length=16), nullable=False), + sa.Column('manifest_sha', sa.String(length=64), nullable=True), + sa.Column('n_train_rows', sa.BigInteger(), nullable=False), + sa.Column('n_eval_rows', sa.BigInteger(), nullable=False), + sa.Column('k', sa.Integer(), nullable=False), + sa.Column('annotation_source', sa.String(length=32), nullable=False), + sa.Column('embedding_config_id', sa.UUID(), nullable=True), + sa.Column('ontology_snapshot_id', sa.UUID(), nullable=True), + sa.Column('train_snapshot_pairs', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column('eval_snapshot_pair', sa.String(length=64), nullable=True), + sa.Column('producer_version', sa.String(length=64), nullable=True), + sa.Column('producer_git_sha', sa.String(length=40), nullable=True), + sa.Column('meta', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + 
sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.ForeignKeyConstraint(['embedding_config_id'], ['embedding_config.id'], ondelete='SET NULL'), + sa.ForeignKeyConstraint(['job_id'], ['job.id'], ondelete='SET NULL'), + sa.ForeignKeyConstraint(['ontology_snapshot_id'], ['ontology_snapshot.id'], ondelete='SET NULL'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('name') + ) + op.create_index(op.f('ix_dataset_embedding_config_id'), 'dataset', ['embedding_config_id'], unique=False) + op.create_index(op.f('ix_dataset_job_id'), 'dataset', ['job_id'], unique=False) + op.create_index(op.f('ix_dataset_ontology_snapshot_id'), 'dataset', ['ontology_snapshot_id'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_dataset_ontology_snapshot_id'), table_name='dataset') + op.drop_index(op.f('ix_dataset_job_id'), table_name='dataset') + op.drop_index(op.f('ix_dataset_embedding_config_id'), table_name='dataset') + op.drop_table('dataset') + # ### end Alembic commands ### diff --git a/alembic/versions/e037f3ae9f58_link_reranker_model_to_dataset.py b/alembic/versions/e037f3ae9f58_link_reranker_model_to_dataset.py new file mode 100644 index 0000000..80a5202 --- /dev/null +++ b/alembic/versions/e037f3ae9f58_link_reranker_model_to_dataset.py @@ -0,0 +1,38 @@ +"""link reranker_model to dataset + +Revision ID: e037f3ae9f58 +Revises: c7bab0210568 +Create Date: 2026-04-21 20:50:32.983265 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'e037f3ae9f58' +down_revision: Union[str, Sequence[str], None] = 'c7bab0210568' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('reranker_model', sa.Column('dataset_id', sa.UUID(), nullable=True)) + op.add_column('reranker_model', sa.Column('external_source', sa.String(length=128), nullable=True)) + op.create_index(op.f('ix_reranker_model_dataset_id'), 'reranker_model', ['dataset_id'], unique=False) + op.create_foreign_key(None, 'reranker_model', 'dataset', ['dataset_id'], ['id'], ondelete='SET NULL') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(None, 'reranker_model', type_='foreignkey') + op.drop_index(op.f('ix_reranker_model_dataset_id'), table_name='reranker_model') + op.drop_column('reranker_model', 'external_source') + op.drop_column('reranker_model', 'dataset_id') + # ### end Alembic commands ### diff --git a/alembic/versions/f7a004f5f2c7_add_visitor_events.py b/alembic/versions/f7a004f5f2c7_add_visitor_events.py new file mode 100644 index 0000000..96f90f9 --- /dev/null +++ b/alembic/versions/f7a004f5f2c7_add_visitor_events.py @@ -0,0 +1,40 @@ +"""add visitor_event table + +Revision ID: f7a004f5f2c7 +Revises: c4d5e6f7a8b9 +Create Date: 2026-04-12 20:50:00.000000 +""" +from __future__ import annotations + +import sqlalchemy as sa + +from alembic import op + +revision: str = "f7a004f5f2c7" +down_revision: str = "c4d5e6f7a8b9" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "visitor_event", + sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False), + sa.Column("day", sa.Date(), nullable=False), + sa.Column("visitor_hash", sa.String(length=16), nullable=False), + 
sa.Column("path", sa.String(length=255), nullable=False), + sa.Column("method", sa.String(length=8), nullable=False), + sa.Column("status", sa.Integer(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_visitor_event_day_hash", "visitor_event", ["day", "visitor_hash"]) + op.create_index("ix_visitor_event_created_at", "visitor_event", ["created_at"]) + op.create_index("ix_visitor_event_path", "visitor_event", ["path"]) + + +def downgrade() -> None: + op.drop_index("ix_visitor_event_path", table_name="visitor_event") + op.drop_index("ix_visitor_event_created_at", table_name="visitor_event") + op.drop_index("ix_visitor_event_day_hash", table_name="visitor_event") + op.drop_table("visitor_event") diff --git a/protea/api/schemas/__init__.py b/apps/lafa_container/__init__.py similarity index 100% rename from protea/api/schemas/__init__.py rename to apps/lafa_container/__init__.py diff --git a/apps/lafa_container/protea_main.py b/apps/lafa_container/protea_main.py new file mode 100644 index 0000000..1eb70fb --- /dev/null +++ b/apps/lafa_container/protea_main.py @@ -0,0 +1,237 @@ +"""LAFA-compatible PROTEA wrapper. + +Entry point that honours the LAFA container CLI contract: + + --query_file FASTA of query sequences + --train_sequences FASTA of training sequences + --annot_file TSV (EntryID, term, aspect) of training annotations + --graph go-basic.obo (currently unused; kept for contract parity) + --output_baseline 3-column TSV output (Query_ID, GO_Term, Score) + +Pipeline: + 1. Mean-pool ProtT5 embeddings for queries and refs (``prott5_encoder``). + 2. Cosine KNN via ``protea.core.knn_search.search_knn`` (numpy backend). + 3. First-hit GO transfer per query (matches PROTEA's ``_predict_batch``). + 4. Score = ``1 - distance`` (cosine, in [0, 1]). + 5. Emit ``\\t\\t``; gzipped if ``--output_baseline`` + ends in ``.gz``. + +Smoke-test focus: integration over fidelity. 
The ontology graph is accepted +but not consulted — LAFA distributes propagated TSVs in the official splits. +""" + +from __future__ import annotations + +import argparse +import csv +import gzip +import os +import sys +from collections import defaultdict +from pathlib import Path +from typing import Iterator + +import numpy as np + +# Make `protea.core.knn_search` importable when running from a checkout. +_REPO_ROOT = Path(__file__).resolve().parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from protea.core.knn_search import search_knn # noqa: E402 + +from prott5_encoder import embed_sequences, fasta_accessions, parse_fasta # noqa: E402 + + +def _open_text(path: str): + return gzip.open(path, "rt") if path.endswith(".gz") else open(path) + + +def _load_annotations(path: str, ref_accessions: set[str]) -> dict[str, list[str]]: + """Return ``{ref_accession: [go_term, ...]}`` filtered to refs we use. + + Dispatches by extension: ``.gaf[.gz]`` → GAF parser (skipping ``NOT`` + qualifiers and ``!`` headers); anything else → TSV with ``EntryID`` / + ``term`` columns in the header. + """ + base = path[:-3] if path.endswith(".gz") else path + if base.endswith(".gaf"): + return _load_annotations_gaf(path, ref_accessions) + return _load_annotations_tsv(path, ref_accessions) + + +def _load_annotations_tsv(path: str, ref_accessions: set[str]) -> dict[str, list[str]]: + go_map: dict[str, list[str]] = defaultdict(list) + with _open_text(path) as handle: + header = handle.readline().rstrip("\n").split("\t") + try: + entry_idx = header.index("EntryID") + term_idx = header.index("term") + except ValueError: + print( + f"[protea_main] Annotation TSV must have header with 'EntryID' and 'term'. 
" + f"Got: {header}", + file=sys.stderr, + ) + sys.exit(1) + for line in handle: + cols = line.rstrip("\n").split("\t") + if len(cols) <= max(entry_idx, term_idx): + continue + acc = cols[entry_idx] + term = cols[term_idx] + if acc in ref_accessions: + go_map[acc].append(term) + return go_map + + +def _load_annotations_gaf(path: str, ref_accessions: set[str]) -> dict[str, list[str]]: + """Parse a GAF 2.x file. Cols: 2=DB_Object_ID, 5=GO_ID, 4=Qualifier.""" + go_map: dict[str, list[str]] = defaultdict(list) + with _open_text(path) as handle: + for raw in handle: + if raw.startswith("!"): + continue + cols = raw.rstrip("\n").split("\t") + if len(cols) < 9: + continue + if "NOT" in cols[3]: + continue + acc = cols[1] + term = cols[4] + if acc in ref_accessions: + go_map[acc].append(term) + return go_map + + +def _open_output(path: str): + if path.endswith(".gz"): + return gzip.open(path, "wt", newline="") + return open(path, "w", newline="") + + +def _stack(embeddings: dict[str, np.ndarray], order: list[str]) -> tuple[np.ndarray, list[str]]: + """Stack embeddings in ``order``, dropping accessions that failed to embed.""" + kept_accs: list[str] = [] + rows: list[np.ndarray] = [] + for acc in order: + vec = embeddings.get(acc) + if vec is None: + continue + kept_accs.append(acc) + rows.append(vec) + if not rows: + return np.empty((0, 0), dtype=np.float32), kept_accs + return np.stack(rows).astype(np.float32, copy=False), kept_accs + + +def _transfer( + query_accs: list[str], + neighbors: list[list[tuple[str, float]]], + go_map: dict[str, list[str]], + *, + keep_self_hits: bool, +) -> Iterator[tuple[str, str, float]]: + """First-hit GO transfer; one ``(query, term, score)`` row per (q, term).""" + for q_acc, top_refs in zip(query_accs, neighbors, strict=False): + seen: set[str] = set() + for ref_acc, distance in top_refs: + if not keep_self_hits and ref_acc == q_acc: + continue + score = max(0.0, 1.0 - float(distance)) + for term in go_map.get(ref_acc, ()): + if term in 
seen: + continue + seen.add(term) + yield q_acc, term, score + + +def main() -> None: + parser = argparse.ArgumentParser( + description="LAFA-compatible PROTEA KNN wrapper (ProtT5 + cosine KNN + first-hit transfer)." + ) + parser.add_argument("--query_file", "-q", required=True) + parser.add_argument("--train_sequences", required=True) + parser.add_argument("--annot_file", "-a", required=True) + parser.add_argument("--graph", required=True, help="OBO file (currently not consulted).") + parser.add_argument("--output_baseline", "-o", required=True) + parser.add_argument("--k", type=int, default=5, help="KNN neighbours per query (default: 5).") + parser.add_argument("--metric", default="cosine", choices=["cosine", "l2"]) + parser.add_argument("--backend", default="numpy", choices=["numpy", "faiss"]) + parser.add_argument( + "--keep_self_hits", + action="store_true", + help="Keep query==ref hits (default: drop, matching LAFA's prott5_container).", + ) + parser.add_argument( + "--model_dir", + default=os.environ.get("HF_CACHE"), + help="HuggingFace cache dir (default: $HF_CACHE).", + ) + args = parser.parse_args() + + for label, path in ( + ("query", args.query_file), + ("train", args.train_sequences), + ("annot", args.annot_file), + ("graph", args.graph), + ): + if not os.path.exists(path): + print(f"[protea_main] {label} file not found: {path}", file=sys.stderr) + sys.exit(1) + + print(f"[protea_main] reading FASTAs: {args.query_file} / {args.train_sequences}") + query_seqs = parse_fasta(args.query_file) + train_seqs = parse_fasta(args.train_sequences) + print(f"[protea_main] queries={len(query_seqs)} refs={len(train_seqs)}") + + print(f"[protea_main] loading annotations from {args.annot_file}") + go_map = _load_annotations(args.annot_file, set(train_seqs)) + refs_with_anns = [acc for acc in train_seqs if acc in go_map] + print(f"[protea_main] refs with annotations: {len(refs_with_anns)}/{len(train_seqs)}") + if not refs_with_anns: + print("[protea_main] no annotated 
refs after filter — nothing to transfer.", file=sys.stderr) + sys.exit(2) + + to_embed = {**{a: query_seqs[a] for a in query_seqs}, + **{a: train_seqs[a] for a in refs_with_anns}} + print(f"[protea_main] embedding {len(to_embed)} sequences with ProtT5 mean-pool") + embeddings = embed_sequences(to_embed, cache_dir=args.model_dir) + + query_order = fasta_accessions(args.query_file) + Q, kept_q = _stack(embeddings, query_order) + R, kept_r = _stack(embeddings, refs_with_anns) + print(f"[protea_main] embedding matrix Q={Q.shape} R={R.shape}") + if Q.size == 0 or R.size == 0: + print("[protea_main] empty embedding matrix — aborting.", file=sys.stderr) + sys.exit(3) + + print(f"[protea_main] KNN k={args.k} metric={args.metric} backend={args.backend}") + neighbors = search_knn( + Q, + R, + kept_r, + k=args.k, + metric=args.metric, + backend=args.backend, + ) + + out_path = args.output_baseline + out_dir = os.path.dirname(out_path) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + n_rows = 0 + with _open_output(out_path) as fh: + writer = csv.writer(fh, delimiter="\t") + for q_acc, term, score in _transfer( + kept_q, neighbors, go_map, keep_self_hits=args.keep_self_hits + ): + writer.writerow([q_acc, term, f"{score:.4f}"]) + n_rows += 1 + + print(f"[protea_main] wrote {n_rows} predictions to {out_path}") + + +if __name__ == "__main__": + main() diff --git a/apps/lafa_container/prott5_encoder.py b/apps/lafa_container/prott5_encoder.py new file mode 100644 index 0000000..a747bbb --- /dev/null +++ b/apps/lafa_container/prott5_encoder.py @@ -0,0 +1,139 @@ +"""Mean-pooled ProtT5 embedder for the LAFA wrapper. + +Standalone version of the encoder used by FANTASIA/PROTEA's ProtT5 backend, +trimmed to the needs of the LAFA contract (FASTA in, ``{accession: vector}`` +out). Mirrors the preprocessing of ``baselines/prott5_container/prott5_embedder.py`` +in the LAFA reference container so embeddings are bit-comparable. 
+""" + +from __future__ import annotations + +import os +import time +from typing import Iterable + +import numpy as np +import torch +from transformers import T5EncoderModel, T5Tokenizer + +_MODEL_NAME = "Rostlab/prot_t5_xl_half_uniref50-enc" + + +def _load_model(cache_dir: str | None) -> tuple[T5EncoderModel, T5Tokenizer, torch.device]: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + model = T5EncoderModel.from_pretrained(_MODEL_NAME, cache_dir=cache_dir) + if device.type == "cpu": + model = model.to(torch.float32) + model = model.to(device).eval() + tokenizer = T5Tokenizer.from_pretrained(_MODEL_NAME, do_lower_case=False, cache_dir=cache_dir) + return model, tokenizer, device + + +def _prepare(seq: str) -> str: + return " ".join(seq.replace("U", "X").replace("Z", "X").replace("O", "X")) + + +def embed_sequences( + sequences: dict[str, str], + *, + cache_dir: str | None = None, + max_residues: int = 4000, + max_seq_len: int = 1000, + max_batch: int = 100, +) -> dict[str, np.ndarray]: + """Return one mean-pooled vector per accession. + + Sorts sequences by descending length so short tails batch efficiently; + falls back to single-sequence processing for sequences > ``max_seq_len``. 
+ """ + if not sequences: + return {} + + model, tokenizer, device = _load_model(cache_dir) + + items = sorted(sequences.items(), key=lambda kv: -len(kv[1])) + embeddings: dict[str, np.ndarray] = {} + + start = time.time() + batch: list[tuple[str, str, int]] = [] + for idx, (acc, seq) in enumerate(items, 1): + prepared = _prepare(seq) + seq_len = len(seq) + batch.append((acc, prepared, seq_len)) + + n_res = sum(s_len for _, _, s_len in batch) + seq_len + flush = ( + len(batch) >= max_batch + or n_res >= max_residues + or idx == len(items) + or seq_len > max_seq_len + ) + if not flush: + continue + + accs, seqs, lens = zip(*batch) + batch = [] + + token_encoding = tokenizer.batch_encode_plus( + list(seqs), add_special_tokens=True, padding="longest" + ) + input_ids = torch.tensor(token_encoding["input_ids"]).to(device) + attention_mask = torch.tensor(token_encoding["attention_mask"]).to(device) + + try: + with torch.no_grad(): + hidden = model(input_ids, attention_mask=attention_mask).last_hidden_state + except RuntimeError as exc: + print(f"[prott5_encoder] OOM/error on batch with longest L={lens[0]}: {exc}") + continue + + for b_idx, ident in enumerate(accs): + s_len = lens[b_idx] + vec = hidden[b_idx, :s_len].mean(dim=0).detach().cpu().numpy().astype(np.float32) + embeddings[ident] = vec + + elapsed = time.time() - start + print( + f"[prott5_encoder] {len(embeddings)} embeddings in {elapsed:.1f}s " + f"({elapsed / max(1, len(embeddings)):.3f}s/protein, device={device})" + ) + return embeddings + + +def parse_fasta(path: str) -> dict[str, str]: + """Read a FASTA file into ``{accession: sequence}``. + + Accession is the substring between the first two ``|`` if present + (UniProt-style ``sp|P12345|name``), else the full id token. 
+ """ + seqs: dict[str, str] = {} + current: str | None = None + with open(path) as handle: + for raw in handle: + line = raw.strip() + if not line: + continue + if line.startswith(">"): + header = line[1:].split()[0] + parts = header.split("|") + current = parts[1] if len(parts) >= 2 else header + seqs[current] = "" + elif current is not None: + seqs[current] += line.upper().replace("-", "") + return seqs + + +def fasta_accessions(path: str) -> list[str]: + """Return accessions in FASTA order (stable for output ordering).""" + accs: list[str] = [] + with open(path) as handle: + for raw in handle: + if raw.startswith(">"): + header = raw[1:].strip().split()[0] + parts = header.split("|") + accs.append(parts[1] if len(parts) >= 2 else header) + return accs + + +def keys_as_array(seqs: Iterable[str]) -> list[str]: + return list(seqs) diff --git a/apps/web/app/[locale]/benchmark/page.tsx b/apps/web/app/[locale]/benchmark/page.tsx new file mode 100644 index 0000000..550b8b7 --- /dev/null +++ b/apps/web/app/[locale]/benchmark/page.tsx @@ -0,0 +1,591 @@ +"use client"; + +import { useEffect, useMemo, useState } from "react"; +import Link from "next/link"; +import { + getBenchmarkEmbeddings, + getBenchmarkMatrix, + type BenchmarkBestCell, + type BenchmarkEmbedding, + type BenchmarkEvalSet, + type BenchmarkMatrixResponse, + type BenchmarkRow, + type BenchmarkStage, +} from "../../../lib/api"; + +// ── Helpers ────────────────────────────────────────────────────────────── + +function formatParams(n: number | null): string { + if (n == null) return ""; + if (n >= 1_000_000_000) { + const v = n / 1_000_000_000; + return v >= 10 ? 
`${Math.round(v)}B` : `${v.toFixed(1)}B`; + } + if (n >= 1_000_000) return `${Math.round(n / 1_000_000)}M`; + return `${n}`; +} + +function formatProteins(n: number | undefined): string { + if (n == null) return ""; + if (n >= 1_000) return `${(n / 1_000).toFixed(1)}k`; + return String(n); +} + +function cellKey(eid: string, cat: string, asp: string): string { + return `${eid}|${cat}|${asp}`; +} + +/** Index rows by (embedding, cat, asp) for O(1) cell lookup. The matrix + * endpoint already dedupes to a single best row per tuple. */ +function indexRows(rows: BenchmarkRow[]): Map { + const out = new Map(); + for (const r of rows) { + out.set(cellKey(r.embedding_config_id, r.category, r.aspect), r); + } + return out; +} + +/** Index the leaderboard by (cat, asp) so the table can highlight winners. */ +function indexBestPerCell(cells: BenchmarkBestCell[]): Map { + const out = new Map(); + for (const c of cells) { + out.set(`${c.category}|${c.aspect}`, c); + } + return out; +} + +function stageLabel(stages: BenchmarkStage[], name: string): string { + return stages.find((s) => s.name === name)?.label ?? name; +} + +function evalSetLabel(evalSets: BenchmarkEvalSet[], id: string): string { + return evalSets.find((e) => e.id === id)?.label ?? `${id.slice(0, 8)}…`; +} + +/** Pick the initial stage once the catalog is loaded. Backend already + * returns stages sorted by YAML preferred_default_stages, so the first + * entry IS the preferred one if it has data. */ +function pickDefaultStage(stages: BenchmarkStage[]): string | null { + return stages.length > 0 ? stages[0].name : null; +} + +/** CSV export of the currently filtered rows — one line per cell. 
*/ +function rowsToCsv( + embeddings: BenchmarkEmbedding[], + rows: BenchmarkRow[], + stage: string, +): string { + const embById = new Map(embeddings.map((e) => [e.id, e])); + const header = [ + "display_name", + "family", + "param_count", + "model_name", + "stage", + "category", + "aspect", + "fmax", + "precision", + "recall", + "coverage", + "n_proteins", + "evaluation_set_id", + "evaluation_result_id", + ].join(","); + const lines = [header]; + for (const r of rows) { + if (r.stage !== stage) continue; + const e = embById.get(r.embedding_config_id); + lines.push( + [ + e?.display_name ?? "", + e?.family ?? "", + e?.param_count ?? "", + e?.model_name ?? "", + r.stage, + r.category, + r.aspect, + r.fmax, + r.precision ?? "", + r.recall ?? "", + r.coverage ?? "", + r.n_proteins ?? "", + r.evaluation_set_id, + r.evaluation_result_id, + ] + .map((v) => { + const s = String(v); + if (/[,"\n]/.test(s)) return `"${s.replace(/"/g, '""')}"`; + return s; + }) + .join(","), + ); + } + return lines.join("\n"); +} + +function downloadCsv(filename: string, content: string): void { + const blob = new Blob([content], { type: "text/csv;charset=utf-8" }); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); +} + +// ── Page ───────────────────────────────────────────────────────────────── + +export default function BenchmarkPage() { + const [embeddings, setEmbeddings] = useState(null); + const [matrix, setMatrix] = useState(null); + const [error, setError] = useState(null); + const [stage, setStage] = useState(null); + const [evalSetId, setEvalSetId] = useState("all"); + const [selectedK, setSelectedK] = useState(null); + + // Unfiltered catalog fetch — populates the full set of known stages and + // eval sets, so selector chips don't disappear when a filtered query + // returns zero rows. 
+ const [catalog, setCatalog] = useState<{ + stages: BenchmarkStage[]; + evalSets: BenchmarkEvalSet[]; + categories: string[]; + aspects: string[]; + ks: number[]; + }>({ stages: [], evalSets: [], categories: [], aspects: [], ks: [] }); + + useEffect(() => { + getBenchmarkMatrix() + .then((m) => { + setCatalog({ + stages: m.stages, + evalSets: m.evaluation_sets, + categories: m.categories, + aspects: m.aspects, + ks: m.ks ?? [], + }); + setStage((prev) => prev ?? pickDefaultStage(m.stages)); + setSelectedK((prev) => prev ?? (m.ks?.[0] ?? null)); + }) + .catch((e) => setError(e.message)); + }, []); + + useEffect(() => { + if (stage === null) return; + setError(null); + Promise.all([ + getBenchmarkEmbeddings(), + getBenchmarkMatrix({ + stage, + evaluation_set_id: evalSetId === "all" ? undefined : evalSetId, + k: selectedK ?? undefined, + }), + ]) + .then(([e, m]) => { + setEmbeddings(e.embeddings); + setMatrix(m); + }) + .catch((e) => setError(e.message)); + }, [stage, evalSetId, selectedK]); + + const rowIndex = useMemo( + () => (matrix ? indexRows(matrix.rows) : new Map()), + [matrix], + ); + + const bestPerCell = useMemo( + () => (matrix ? indexBestPerCell(matrix.best_per_cell) : new Map()), + [matrix], + ); + + const embeddingsWithData = useMemo(() => { + if (!embeddings || !matrix) return new Set(); + return new Set(matrix.embedding_config_ids); + }, [embeddings, matrix]); + + if (error) { + return ( +
+
+

{error}

+
+
+ ); + } + + if (!embeddings || !matrix || stage === null) { + return ( +
+
+
+
+ ); + } + + const hasData = matrix.rows.length > 0; + const stageList = catalog.stages.length > 0 ? catalog.stages : matrix.stages; + const evalSetList = catalog.evalSets.length > 0 ? catalog.evalSets : matrix.evaluation_sets; + const categories = catalog.categories.length > 0 ? catalog.categories : matrix.categories; + const aspects = catalog.aspects.length > 0 ? catalog.aspects : matrix.aspects; + const currentStageLabel = stageLabel(stageList, stage); + + // Active eval set banner: when "all" is selected and there's only one set, + // show that one; when a specific one is selected, show its full metadata. + const activeEvalSet = + evalSetId !== "all" + ? evalSetList.find((e) => e.id === evalSetId) ?? null + : evalSetList.length === 1 + ? evalSetList[0] + : null; + + return ( +
+ {/* Header */} +
+
+

Benchmark matrix

+

+ Per-embedding Fmax across categories and aspects for every evaluation + run in the database.{" "} + + Back to home + +

+
+
+ +
+
+ + {/* Eval set context banner */} + {activeEvalSet && ( +
+
+
+ + Evaluation split + +
+ {activeEvalSet.label} +
+
+
+ {activeEvalSet.stats.delta_proteins != null && ( + + Δ{" "} + + {activeEvalSet.stats.delta_proteins.toLocaleString()} + {" "} + proteins + + )} + {activeEvalSet.stats.nk_proteins != null && ( + + NK{" "} + + {formatProteins(activeEvalSet.stats.nk_proteins)} + + + )} + {activeEvalSet.stats.lk_proteins != null && ( + + LK{" "} + + {formatProteins(activeEvalSet.stats.lk_proteins)} + + + )} + {activeEvalSet.stats.pk_proteins != null && ( + + PK{" "} + + {formatProteins(activeEvalSet.stats.pk_proteins)} + + + )} + {activeEvalSet.new_obo_version && ( + + OBO{" "} + {activeEvalSet.new_obo_version} + + )} +
+
+
+ )} + + {/* Filters */} +
+
+ +
+ {stageList.map((s) => ( + + ))} +
+
+ + {catalog.ks.length > 0 && ( +
+ +
+ {catalog.ks.map((n) => ( + + ))} +
+
+ )} + + {evalSetList.length > 1 && ( +
+ + +
+ )} + +
+ {matrix.total} cells · {matrix.embedding_config_ids.length} embeddings ·{" "} + {matrix.evaluation_sets.length} eval set + {matrix.evaluation_sets.length === 1 ? "" : "s"} +
+
+ + {/* Leaderboard: best Fmax per (cat, asp) across every model & stage */} + {matrix.best_per_cell.length > 0 && ( +
+
+

+ Best Fmax per cell + + across every model in current stage filter + +

+
+
+ + + + + {aspects.map((asp) => ( + + ))} + + + + {categories.map((cat) => ( + + + {aspects.map((asp) => { + const best = bestPerCell.get(`${cat}|${asp}`); + if (!best) { + return ( + + ); + } + const emb = embeddings.find((e) => e.id === best.embedding_config_id); + return ( + + ); + })} + + ))} + +
+ {asp} +
{cat} + — + +
+ {best.fmax.toFixed(3)} +
+
+ {emb?.display_name ?? "—"} +
+
+ {stageLabel(stageList, best.stage)} +
+
+
+
+ )} + + {/* Matrix table */} + {!hasData ? ( +
+

+ No evaluation results for{" "} + {currentStageLabel} yet. +

+

+ Run run_cafa_evaluation for an embedding to populate + this cell of the matrix. +

+
+ ) : ( +
+ + + + + {categories.map((cat) => ( + + ))} + + + {categories.flatMap((cat) => + aspects.map((asp) => ( + + )), + )} + + + + {embeddings.map((emb) => { + const hasRow = embeddingsWithData.has(emb.id); + return ( + + + {categories.flatMap((cat) => + aspects.map((asp) => { + const row = rowIndex.get(cellKey(emb.id, cat, asp)); + const best = bestPerCell.get(`${cat}|${asp}`); + const isWinner = + row && best && row.evaluation_result_id === best.evaluation_result_id; + return ( + + ); + }), + )} + + ); + })} + +
+ Embedding + + {cat} +
+ {asp} +
+
+ {emb.display_name} +
+
+ {emb.family} + {emb.param_count != null + ? ` · ${formatParams(emb.param_count)}` + : ""} +
+
+ {row ? ( + + {row.fmax.toFixed(3)} + + ) : ( + + )} +
+
+ )} + +

+ Display names and stage labels come from{" "} + embedding_config (DB) and{" "} + protea/config/benchmark.yaml. Edit the YAML to change + ordering, labels, or the baseline tag. +

+
+ ); +} diff --git a/apps/web/app/[locale]/embeddings/page.tsx b/apps/web/app/[locale]/embeddings/page.tsx index d3411d0..a45eaee 100644 --- a/apps/web/app/[locale]/embeddings/page.tsx +++ b/apps/web/app/[locale]/embeddings/page.tsx @@ -41,6 +41,10 @@ const MODEL_PRESETS: Record = { { value: "Rostlab/prot_t5_xl_half_uniref50-enc", label: "ProT5-XL half (FP16 encoder)", layers: 24, defaultMaxLength: 1024 }, { value: "Rostlab/ProstT5", label: "ProstT5 (3Di + AA)", layers: 24, defaultMaxLength: 1024 }, ], + ankh: [ + { value: "ElnaggarLab/ankh-base", label: "Ankh base (~450M, 48 layers, d=768)", layers: 48, defaultMaxLength: 1024 }, + { value: "ElnaggarLab/ankh-large", label: "Ankh large (~1.9B, 48 layers, d=1536)", layers: 48, defaultMaxLength: 1024 }, + ], auto: [ { value: "facebook/esm2_t33_650M_UR50D", label: "ESM-2 650M (auto backend)", layers: 33, defaultMaxLength: 1022 }, ], @@ -91,7 +95,7 @@ export default function EmbeddingsPage() { const [cmpConfigId, setCmpConfigId] = useState(""); const [cmpQuerySetId, setCmpQuerySetId] = useState(""); const [cmpQueueBatchSize, setCmpQueueBatchSize] = useState(100); - const [cmpBatchSize, setCmpBatchSize] = useState(8); + const [cmpBatchSize, setCmpBatchSize] = useState(1); const [cmpDevice, setCmpDevice] = useState("cuda"); const [cmpSkipExisting, setCmpSkipExisting] = useState(true); const [cmpResult, setCmpResult] = useState<{ id: string; status: string } | null>(null); @@ -306,6 +310,7 @@ export default function EmbeddingsPage() { +
diff --git a/apps/web/app/[locale]/evaluation/page.tsx b/apps/web/app/[locale]/evaluation/page.tsx index 25af270..f7ce0fb 100644 --- a/apps/web/app/[locale]/evaluation/page.tsx +++ b/apps/web/app/[locale]/evaluation/page.tsx @@ -274,6 +274,7 @@ function EvaluationSetCard({ const MAX_ATTEMPTS = 30; const interval = setInterval(async () => { + if (typeof document !== "undefined" && document.visibilityState === "hidden") return; attempts++; try { const fresh = await listResults(e.id); diff --git a/apps/web/app/[locale]/jobs/[id]/page.tsx b/apps/web/app/[locale]/jobs/[id]/page.tsx index 7d1d68f..08ec6bf 100644 --- a/apps/web/app/[locale]/jobs/[id]/page.tsx +++ b/apps/web/app/[locale]/jobs/[id]/page.tsx @@ -72,6 +72,7 @@ export default function JobDetail({ params }: { params: Promise<{ id: string }> const intervalRef = useRef | null>(null); async function refresh() { + if (typeof document !== "undefined" && document.visibilityState === "hidden") return; try { setError(""); const [j, ev, ch] = await Promise.all([ @@ -98,16 +99,24 @@ export default function JobDetail({ params }: { params: Promise<{ id: string }> refresh(); }, [jobId]); - // Auto-refresh while job is active + // Auto-refresh while job is active. Pauses when the tab is hidden and + // resumes on visibilitychange — avoids burning bandwidth on background tabs. 
useEffect(() => { if (!job) return; const isTerminal = TERMINAL.includes(String(job.status).toLowerCase()); - if (!isTerminal) { - intervalRef.current = setInterval(refresh, 2000); - } else { + if (isTerminal) { if (intervalRef.current) clearInterval(intervalRef.current); + return; } - return () => { if (intervalRef.current) clearInterval(intervalRef.current); }; + intervalRef.current = setInterval(refresh, 2000); + const onVisibility = () => { + if (document.visibilityState === "visible") refresh(); + }; + document.addEventListener("visibilitychange", onVisibility); + return () => { + if (intervalRef.current) clearInterval(intervalRef.current); + document.removeEventListener("visibilitychange", onVisibility); + }; }, [job?.status]); async function onDelete() { @@ -186,6 +195,15 @@ export default function JobDetail({ params }: { params: Promise<{ id: string }> {jobId}
+ {job.operation_description && ( +

{job.operation_description}

+ )} + {job.operation_summary && ( +

+ {job.operation_summary} +

+ )} +
{t("jobDetail.queue")} {job.queue_name}
{t("jobDetail.created")} {formatDate(job.created_at)}
diff --git a/apps/web/app/[locale]/jobs/page.tsx b/apps/web/app/[locale]/jobs/page.tsx index 7eff46a..3138a81 100644 --- a/apps/web/app/[locale]/jobs/page.tsx +++ b/apps/web/app/[locale]/jobs/page.tsx @@ -57,6 +57,7 @@ export default function JobsPage() { const intervalRef = useRef | null>(null); async function refresh(status = statusFilter, showLoader = false) { + if (!showLoader && typeof document !== "undefined" && document.visibilityState === "hidden") return; if (showLoader) setLoading(true); try { setError(""); @@ -75,7 +76,9 @@ export default function JobsPage() { refresh(statusFilter, true); }, [statusFilter]); - // Auto-refresh: faster when there are active jobs, slower otherwise + // Auto-refresh: faster when there are active jobs, slower otherwise. + // Pauses automatically when the tab is hidden (refresh() checks + // document.visibilityState) and forces a refresh on visibilitychange. useEffect(() => { if (!autoRefresh) { if (intervalRef.current) clearInterval(intervalRef.current); @@ -86,7 +89,14 @@ export default function JobsPage() { return hasActive ? 3000 : 8000; } intervalRef.current = setInterval(() => refresh(), schedule()); - return () => { if (intervalRef.current) clearInterval(intervalRef.current); }; + const onVisibility = () => { + if (document.visibilityState === "visible") refresh(); + }; + document.addEventListener("visibilitychange", onVisibility); + return () => { + if (intervalRef.current) clearInterval(intervalRef.current); + document.removeEventListener("visibilitychange", onVisibility); + }; }, [autoRefresh, statusFilter, jobs]); const activeCount = jobs.filter((j) => j.status === "running" || j.status === "queued").length; @@ -157,6 +167,12 @@ export default function JobsPage() { {formatDate(j.created_at)}

{j.operation}

+ {j.operation_description && ( +

{j.operation_description}

+ )} + {j.operation_summary && ( +

{j.operation_summary}

+ )}

{j.id}

@@ -165,10 +181,10 @@ export default function JobsPage() { {/* Desktop table */}
-
+
{t("status")}
{t("operation")}
-
{t("jobId")}
+
{t("operationContext")}
{t("created")}
@@ -180,14 +196,24 @@ export default function JobsPage() {
- {j.operation} + {j.operation} + {j.operation_description && ( + {j.operation_description} + )}
-
{j.id}
+
+ {j.operation_summary ? ( + {j.operation_summary} + ) : ( + + )} + {j.id} +
{formatDate(j.created_at)}
))} diff --git a/apps/web/app/[locale]/layout.tsx b/apps/web/app/[locale]/layout.tsx index 1fa1b1e..6f39907 100644 --- a/apps/web/app/[locale]/layout.tsx +++ b/apps/web/app/[locale]/layout.tsx @@ -29,8 +29,11 @@ export default async function LocaleLayout({ const { locale } = await params; const messages = await getMessages(); return ( - - + + diff --git a/apps/web/app/[locale]/page.tsx b/apps/web/app/[locale]/page.tsx index 9f3f9fd..120ae9d 100644 --- a/apps/web/app/[locale]/page.tsx +++ b/apps/web/app/[locale]/page.tsx @@ -8,11 +8,6 @@ import { getShowcase, type ShowcaseData } from "../../lib/api"; import { AnnotateForm } from "../../components/AnnotateForm"; const ASPECTS = ["MFO", "BPO", "CCO"] as const; -const ASPECT_COLORS: Record = { - MFO: "blue", - BPO: "green", - CCO: "purple", -}; const ASPECT_LABELS: Record = { MFO: "Molecular Function", BPO: "Biological Process", @@ -26,12 +21,6 @@ const CATEGORY_LABELS: Record = { PK: "Partial Knowledge", }; -const METHOD_KEYS: Record = { - knn_baseline: "knnBaseline", - knn_scored: "knnScored", - knn_reranker: "knnReranker", -}; - const STAGE_ICONS: Record = { sequences: "Aa", embeddings: "E", @@ -48,12 +37,30 @@ const STAGE_I18N: Record = { evaluations: "stageEvaluation", }; +const STAGE_LABELS: Record = { + baseline: "pipelineStageBaseline", + alignment_weighted: "pipelineStageAlignmentWeighted", + reranker: "pipelineStageReranker", +}; + +const STAGE_BADGE: Record = { + baseline: "bg-gray-100 text-gray-700", + alignment_weighted: "bg-amber-100 text-amber-800", + reranker: "bg-blue-100 text-blue-800", +}; + +function formatParamCount(n: number | null): string { + if (n == null) return ""; + if (n >= 1_000_000_000) return `${(n / 1_000_000_000).toFixed(n >= 10_000_000_000 ? 
0 : 1)}B`; + if (n >= 1_000_000) return `${Math.round(n / 1_000_000)}M`; + return `${n}`; +} + export default function HomePage() { const t = useTranslations("home"); const router = useRouter(); const [data, setData] = useState(null); const [error, setError] = useState(null); - const [activeCategory, setActiveCategory] = useState("NK"); useEffect(() => { getShowcase().then(setData).catch((e) => setError(e.message)); @@ -65,7 +72,12 @@ export default function HomePage() {

{error}

- ))} + {t(STAGE_LABELS[best.stage] as any)} + +
+
+ {best.embedding.model_name} +
+
+ +
+
+ {best.avg_fmax.toFixed(3)} +
+
{t("avgFmaxAcrossCells")}
- - {CATEGORY_LABELS[activeCategory]} -
- {/* ── Fmax cards ────────────────────────────────────────── */} -
+ {/* Per-aspect mini tiles (mean across NK/LK/PK) */} +
{ASPECTS.map((aspect) => { - const d = catFmax[aspect]; - if (!d) return null; - const color = ASPECT_COLORS[aspect]; + const agg = perAspect[aspect]; + const value = agg ? agg.sum / agg.count : null; return (
-
- {d.fmax.toFixed(2)} +
+ {value != null ? value.toFixed(3) : "—"}
-
- {t("fmax")} {aspect} -
-
- {ASPECT_LABELS[aspect]} -
-
- {d.method_label} +
+ {aspect}
); })}
- - - {/* ── Method comparison table ───────────────────────────── */} - {catMethods.length > 0 && ( -
-

- {t("methodComparison")} - - ({activeCategory}) - -

-
- - - - - {ASPECTS.map((a) => ( - - ))} - - - - {catMethods.map((row, i) => { - const isBest = ASPECTS.some( - (a) => catFmax[a]?.method === row.method - ); - return ( - - - {ASPECTS.map((aspect) => { - const val = (row as any)[aspect]?.fmax; - const baseVal = baseline ? (baseline as any)[aspect]?.fmax : null; - const delta = val != null && baseVal != null && row.method !== "knn_baseline" - ? val - baseVal - : null; - return ( - - ); - })} - - ); - })} - -
{t("method")} - {a} -
- {t(METHOD_KEYS[row.method] ?? row.method)} - {isBest && ( - best - )} - - {val != null ? ( - - {val.toFixed(3)} - {delta != null && ( - 0 ? "text-green-600" : delta < 0 ? "text-red-600" : "text-gray-400"}`}> - {delta > 0 ? "+" : ""}{delta.toFixed(3)} - - )} - - ) : ( - - )} -
-
-
- )} - +
+ ) : (

{t("noDataYet")}

@@ -309,12 +255,14 @@ export default function HomePage() { {t("stats")}
- {([ - ["proteins", data.counts.proteins], - ["sequences", data.counts.sequences], - ["embeddings", data.counts.embeddings], - ["predictions", data.counts.predictions], - ] as [string, number][]).map(([key, count]) => ( + {( + [ + ["proteins", data.counts.proteins], + ["sequences", data.counts.sequences], + ["embeddings", data.counts.embeddings], + ["predictions", data.counts.predictions], + ] as [string, number][] + ).map(([key, count]) => (
{count.toLocaleString()} @@ -328,7 +276,7 @@ export default function HomePage() { {/* ── CTAs ──────────────────────────────────────────────────── */}
{t("exploreResults")} diff --git a/apps/web/app/[locale]/reranker/page.tsx b/apps/web/app/[locale]/reranker/page.tsx index edf0751..1d1bd0c 100644 --- a/apps/web/app/[locale]/reranker/page.tsx +++ b/apps/web/app/[locale]/reranker/page.tsx @@ -201,28 +201,44 @@ function RerankerCard({
- AUC: {m.val_auc?.toFixed(4) ?? "—"} - F1: {m.val_f1?.toFixed(4) ?? "—"} - Precision: {m.val_precision?.toFixed(4) ?? "—"} - Recall: {m.val_recall?.toFixed(4) ?? "—"} - Positive rate: {m.positive_rate != null ? `${(m.positive_rate * 100).toFixed(2)}%` : "—"} + {m.test_fmax != null ? ( + <> + Test Fmax: {m.test_fmax.toFixed(4)} + Best iter: {m.best_iteration ?? "—"} + {m.positive_rate_train != null && ( + Train pos. rate: {(m.positive_rate_train * 100).toFixed(2)}% + )} + + ) : ( + <> + AUC: {m.val_auc?.toFixed(4) ?? "—"} + F1: {m.val_f1?.toFixed(4) ?? "—"} + Precision: {m.val_precision?.toFixed(4) ?? "—"} + Recall: {m.val_recall?.toFixed(4) ?? "—"} + Positive rate: {m.positive_rate != null ? `${(m.positive_rate * 100).toFixed(2)}%` : "—"} + + )}
{expanded && (
- {/* Validation metrics */} + {/* Training-time metrics */}
-

Validation metrics

+

Training-time metrics

+ + -
- Train samples: {m.train_samples?.toLocaleString()} - Val samples: {m.val_samples?.toLocaleString()} + {m.train_samples != null && Train samples: {m.train_samples.toLocaleString()}} + {m.val_samples != null && Val samples: {m.val_samples.toLocaleString()}} + {m.positive_rate_train != null && ( + Train positive rate: {(m.positive_rate_train * 100).toFixed(2)}% + )}
diff --git a/apps/web/components/AnnotateForm.tsx b/apps/web/components/AnnotateForm.tsx index e28e1cf..2c17cc1 100644 --- a/apps/web/components/AnnotateForm.tsx +++ b/apps/web/components/AnnotateForm.tsx @@ -7,23 +7,40 @@ import { annotateProteins, getJob, launchPredictGoTerms, + listJobs, listPredictionSets, type AnnotateResult, + type Job, } from "@/lib/api"; type Stage = "idle" | "uploading" | "embedding" | "predicting" | "done" | "error"; const POLL_MS = 3_000; +const QUEUE_POLL_MS = 30_000; -const EXAMPLE_FASTA = `>sp|P04637|P53_HUMAN Cellular tumor antigen p53 +// Operations that occupy the shared GPU pipeline. While any of these is +// queued or running we block new user annotation requests, since they won't +// actually enter the queue in a reasonable time frame. +const BLOCKING_OPERATIONS = new Set([ + "compute_embeddings", + "compute_embeddings_batch", + "predict_go_terms", + "predict_go_terms_batch", +]); + +const EXAMPLE_FASTA = `>sp|P01116|RASK_HUMAN GTPase KRas OS=Homo sapiens OX=9606 GN=KRAS PE=1 SV=1 +MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG +QEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDL +PSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGC +VKIKKCIIM +>sp|P04637|P53_HUMAN Cellular tumor antigen p53 OS=Homo sapiens OX=9606 GN=TP53 PE=1 SV=4 MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP -DEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYPQGLNGTVNLPGRNSFEV -RVCACPGRDRRTEEENLHKTTGIDSFLHPEVEYFTPETDPAGPMCSRHFYQLAKTCPVQLW -VDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHERCTCGGNHGISTTTGICLICQFFLVHKP ->sp|P38398|BRCA1_HUMAN Breast cancer type 1 susceptibility protein -MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQC -PLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKDEV -SIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELG`; +DEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK +SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE 
+RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS +SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP +PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG +GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD`; export function AnnotateForm() { const t = useTranslations("home"); @@ -41,6 +58,11 @@ export function AnnotateForm() { // Drag-and-drop state const [dragOver, setDragOver] = useState(false); + // Queue-awareness: poll active jobs and block submission while any + // embedding/prediction operation is queued or running, because our + // single-GPU setup can't absorb another request in reasonable time. + const [blockingJobs, setBlockingJobs] = useState(null); + const handleFile = (file: File) => { const reader = new FileReader(); reader.onload = (e) => { @@ -152,7 +174,49 @@ export function AnnotateForm() { }; }, []); + // Poll for active embedding/prediction jobs to know whether the GPU + // pipeline is currently saturated. + useEffect(() => { + let cancelled = false; + const fetchBlocking = async () => { + if (typeof document !== "undefined" && document.visibilityState === "hidden") return; + try { + const [queued, running] = await Promise.all([ + listJobs({ limit: 100, status: "queued" }), + listJobs({ limit: 100, status: "running" }), + ]); + if (cancelled) return; + const merged = [...running, ...queued].filter((j) => + BLOCKING_OPERATIONS.has(j.operation), + ); + setBlockingJobs(merged); + } catch { + // ignore transient errors; keep prior state + } + }; + fetchBlocking(); + const id = setInterval(fetchBlocking, QUEUE_POLL_MS); + const onVisibility = () => { + if (document.visibilityState === "visible") fetchBlocking(); + }; + document.addEventListener("visibilitychange", onVisibility); + return () => { + cancelled = true; + clearInterval(id); + document.removeEventListener("visibilitychange", onVisibility); + }; + }, []); + const isRunning = stage === "uploading" || stage === "embedding" || stage === "predicting"; + // A running local 
annotation flow already owns the UI; don't double-block. + const isQueueBlocked = !isRunning && (blockingJobs?.length ?? 0) > 0; + const runningJob = blockingJobs?.find((j) => j.status === "running") ?? null; + const runningPct = + runningJob && runningJob.progress_total && runningJob.progress_current + ? Math.round((runningJob.progress_current / runningJob.progress_total) * 100) + : null; + const queuedCount = + blockingJobs?.filter((j) => j.status === "queued").length ?? 0; return (
@@ -163,6 +227,41 @@ export function AnnotateForm() { {t("annotateDescription" as any)}

+ {/* Queue-busy banner ─ blocks submission while the GPU pipeline is saturated */} + {isQueueBlocked && ( +
+
+ +
+

+ {t("annotateQueueBlockedTitle" as any)} +

+

+ {t("annotateQueueBlockedBody" as any)} +

+
    + {runningJob && ( +
  • + {runningJob.operation} + {" — "} + {t("annotateQueueRunningLabel" as any)} + {runningPct != null ? ` (${runningPct}%)` : ""} +
  • + )} + {queuedCount > 0 && ( +
  • + {t("annotateQueueWaitingLabel" as any)}: {queuedCount} +
  • + )} +
+
+
+
+ )} + {/* FASTA input */}
setFasta(e.target.value)} placeholder={t("annotatePlaceholder" as any)} rows={6} - disabled={isRunning} + disabled={isRunning || isQueueBlocked} className="w-full rounded-lg p-4 text-xs font-mono text-gray-700 placeholder:text-gray-400 focus:outline-none focus:ring-2 focus:ring-blue-300 resize-y disabled:opacity-50 disabled:cursor-not-allowed bg-transparent" /> - {!fasta && !isRunning && ( + {!fasta && !isRunning && !isQueueBlocked && (