Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 69 additions & 45 deletions chainforge/react-server/src/EvalGenModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ import {
RatingDict,
ResponseUID,
} from "./backend/typing";
import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
import { EvalCriteria, EvalFunction, EvalFunctionReport, EvalFunctionSetReport } from "./backend/evalgen/typing";
import {
IconChevronDown,
IconChevronLeft,
Expand Down Expand Up @@ -267,7 +267,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
onChangeGrade={onChangeGrade}
getGradeCount={getGradeCount}
/>
<Contributor getStateValue={getStateValue} />
<Contributor getStateValue={getStateValue} style={{ size: 22, thickness: 4 }} />

{/* Title of the criteria */}
<TextInput
Expand Down Expand Up @@ -412,7 +412,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
export interface EvalGenModalRef {
trigger: (
resps: LLMResponse[],
setFinalReports: (reports: EvalGenReport) => void,
setFinalReports: (selectedFuncs: EvalFunction[]) => void,
) => void;
}

Expand All @@ -422,6 +422,7 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
const apiKeys = useStore((state) => state.apiKeys);
const globalState = useStore((store) => store.state);
const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
const [reports, setReports] = useState<EvalFunctionSetReport | undefined>(undefined);
const [criteriaForDisplay, setCriteriaForDisplay] = useState<
EvalCriteria[]
>([]);
Expand Down Expand Up @@ -585,23 +586,23 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(

// const defaultOnFinish = (reports: string) => {};
const [onFinish, setOnFinish] = useState({
setFinalRpts: (reports: EvalGenReport) => {
setFinalRpts: (reports: EvalFunction[]) => {
// console.log("");
},
});

// Open the EvalGen wizard
const trigger = (
resps: LLMResponse[],
setFinalReports: (reports: EvalGenReport) => void,
setFinalReports: (reports: EvalFunction[]) => void,
) => {
// We pass the responses here manually to ensure they remain the same
// for the duration of one EvalGen operation.
setResponses(resps);
gotoNextScreen("response");
// setFinalReports("A plenty response");
setOnFinish({
setFinalRpts: (reports: EvalGenReport) => {
setFinalRpts: (reports: EvalFunction[]) => {
close();
setFinalReports(reports);
},
Expand Down Expand Up @@ -886,7 +887,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul

const estimateGPTCalls = () => {
return executor
? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.`
? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-4o-mini calls.`
: "# estimated GPT calls not available.";
};

Expand All @@ -906,6 +907,21 @@ If you determine the feedback corresponds to a new criteria, your response shoul
setScreen(screenName);
};

/**
 * Finalizes the grading phase: waits for the executor to finish its pending
 * work, asks it for the filtered set of evaluation functions, and stores the
 * resulting report in component state via `setReports`.
 *
 * @throws Error if no executor is available or no report is produced. The
 *   report screen reads fields directly off the stored report, so storing
 *   `undefined` and letting the caller navigate onward would fail at render
 *   time instead of here.
 */
const handleGradingDone = async (): Promise<void> => {
  if (!executor) {
    throw new Error("EvalGen executor is not available; cannot build report.");
  }

  await executor.waitForCompletion();

  // 0.25 is passed through unchanged from the original call; presumably a
  // filtering threshold -- confirm against the executor implementation.
  const filteredFunctions = await executor.filterEvaluationFunctions(0.25);
  if (filteredFunctions == null) {
    throw new Error("EvalGen executor returned no evaluation function report.");
  }

  // schema is {
  //   failureCoverage: coverage,
  //   falseFailureRate,
  //   selectedEvalFunctions: bestEvalFunctions,
  //   allEvalFunctionReports: evalFunctionReport,
  // };
  setReports(filteredFunctions);
};

// const [onFinish, setOnFinish] = useState(null);

return (
Expand Down Expand Up @@ -934,6 +950,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul
gotoPrevResponse={prevResponse2}
estimateGPTCalls={estimateGPTCalls}
gotoNextScreen={gotoNextScreen}
handleGradingDone={handleGradingDone}
/>

{/* Progress bar */}
Expand Down Expand Up @@ -1110,12 +1127,8 @@ If you determine the feedback corresponds to a new criteria, your response shoul
{screen === "report" && (
<Grid>
<ReportCardView
report={{
criteria: criteria,
failureCoverage: 99.2,
falseFailureRate: 66.7,
}}
onFinish={(reports: EvalGenReport) => {
report={reports}
onFinish={(reports: EvalFunction[]) => {
onFinish.setFinalRpts(reports);
}}
getGradeCount={(crit: EvalCriteria, grade: boolean) => {
Expand Down Expand Up @@ -1155,6 +1168,7 @@ interface GradingViewProps {
gotoNextResponse: () => void;
estimateGPTCalls: () => string;
gotoNextScreen: (screenName: string) => void;
handleGradingDone: () => void;
}

const GradingView: React.FC<GradingViewProps> = ({
Expand All @@ -1168,6 +1182,7 @@ const GradingView: React.FC<GradingViewProps> = ({
gotoNextResponse,
estimateGPTCalls,
gotoNextScreen,
handleGradingDone,
}) => {
// Calculate inner values only when shownResponse changes
const responseText = useMemo(
Expand Down Expand Up @@ -1312,7 +1327,7 @@ const GradingView: React.FC<GradingViewProps> = ({
{/* GPT Call Tally */}
<Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
GPT-3.5-Turbo-16k calls.
GPT-4o-mini calls.
</Text>
</Flex>
<div
Expand Down Expand Up @@ -1351,8 +1366,10 @@ const GradingView: React.FC<GradingViewProps> = ({
leftIcon={<IconSparkles size={14} />}
variant="gradient"
gradient={{ from: "blue", to: "green", deg: 45 }}
onClick={() => {
onClick={async () => {
// console.log("(3) gotoNextScreen", gotoNextScreen);
// Get the evaluation functions
await handleGradingDone();
gotoNextScreen("report");
}}
>
Expand All @@ -1365,9 +1382,9 @@ const GradingView: React.FC<GradingViewProps> = ({
};

interface ReportCardViewProps {
report: EvalGenReport;
report: EvalFunctionSetReport;
// recomputeAlignment,
onFinish: (reports: EvalGenReport) => void;
onFinish: (reports: EvalFunction[]) => void;
getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
getStateValue: (stateId: number) => number;
}
Expand All @@ -1380,41 +1397,44 @@ const ReportCardView: React.FC<ReportCardViewProps> = ({
getGradeCount,
getStateValue,
}) => {
// The criteria cards, now with report information

const [finalReport, setFinalReport] = useState(report);
const [selectedEvalFunctions, setSelectedEvalFunctions] = useState<EvalFunction[]>(report.selectedEvalFunctions);

const onSelect = (criterion: EvalCriteria, isSelected: boolean) => {
if (isSelected) {
finalReport.criteria.push(criterion);
const matchingFunction = report.selectedEvalFunctions.find(func => func.evalCriteria === criterion);
if (matchingFunction && !selectedEvalFunctions.includes(matchingFunction)) {
setSelectedEvalFunctions([...selectedEvalFunctions, matchingFunction]);
}
} else {
finalReport.criteria = finalReport.criteria.filter(
(c) => c !== criterion,
);
setSelectedEvalFunctions(selectedEvalFunctions.filter(func => func.evalCriteria !== criterion));
}
setFinalReport(finalReport);
};
}

// The criteria cards, now with report information
const cards = useMemo(() => {
const res = [];

// Iterate through selected eval functions and create cards
// for (const selectedFunc of report.selectedEvalFunctions) {
// const crit = selectedFunc.evalCriteria;
// // Find corresponding report in allEvalFunctionReports map from criteria to list
// const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
// const evalFuncReport = critEvalFuncReports.find(
// (rep) => rep.evalFunction === selectedFunc,
// );

// // Get the functions that were not selected for this criteria
// const otherFuncs = critEvalFuncReports.filter(
// (rep) => rep.evalFunction !== selectedFunc,
// );
for (const crit of report.criteria) {
for (const selectedFunc of report.selectedEvalFunctions) {
const crit = selectedFunc.evalCriteria;
// Find corresponding report in allEvalFunctionReports map from criteria to list
const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
const evalFuncReport = critEvalFuncReports.find(
(rep) => rep.evalFunction === selectedFunc,
);

// Get the functions that were not selected for this criteria
const otherFuncs = critEvalFuncReports.filter(
(rep) => rep.evalFunction !== selectedFunc,
);

res.push(
<ReportCriteriaCard
criterion={crit}
key={crit.uid}
evalFunctionReport={evalFuncReport}
otherFunctions={otherFuncs}
// onCheck={(checked) => {
// crit.selected = checked;
// recomputeAlignment();
Expand Down Expand Up @@ -1478,7 +1498,7 @@ const ReportCardView: React.FC<ReportCardViewProps> = ({
<Button
onClick={() => {
// console.log("finalReport", finalReport);
onFinish(finalReport);
onFinish(selectedEvalFunctions);
}}
>
Finish with selected evaluators
Expand All @@ -1491,6 +1511,8 @@ const ReportCardView: React.FC<ReportCardViewProps> = ({

interface ReportCriteriaCardProps {
criterion: EvalCriteria;
evalFunctionReport: EvalFunctionReport;
otherFunctions: EvalFunctionReport[];
// onChange: (changedCriteria: EvalCriteria) => void;
// onDelete: () => void;
// initiallyOpen?: boolean;
Expand All @@ -1503,6 +1525,8 @@ interface ReportCriteriaCardProps {

const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
criterion,
evalFunctionReport,
otherFunctions,
// onChange,
// onDelete,
// initiallyOpen,
Expand All @@ -1517,12 +1541,12 @@ const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
const [checked, setChecked] = useState(true);

// Simulates eval functions that are expected to be passed in later on (TODO)
const evalFuncs = [
{ evalFunction: { code: "To be provided (1) ..." } },
{ evalFunction: { code: "To be provided (2) ..." } },
{ evalFunction: { code: "To be provided (3) ..." } },
];
const unselectedImplementations = evalFuncs.map((item) => (
// const evalFuncs = [
// { evalFunction: { code: "To be provided (1) ..." } },
// { evalFunction: { code: "To be provided (2) ..." } },
// { evalFunction: { code: "To be provided (3) ..." } },
// ];
const unselectedImplementations = otherFunctions.map((item) => (
<div key={uuid()}>
<Code style={{ whiteSpace: "pre-wrap" }} key={uuid()}>
{item.evalFunction.code}
Expand Down
1 change: 1 addition & 0 deletions chainforge/react-server/src/ModelSettingSchemas.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ const ChatGPTSettings: ModelSettingsDict = {
"gpt-3.5-turbo",
"gpt-4-turbo",
"gpt-4o",
"gpt-4o-mini",
"gpt-4",
"gpt-4-turbo-2024-04-09",
"gpt-4-turbo-preview",
Expand Down
28 changes: 14 additions & 14 deletions chainforge/react-server/src/MultiEvalNode.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,9 @@ import { Dict, LLMResponse, QueryProgress } from "./backend/typing";
import { AlertModalContext } from "./AlertModal";
import { Status } from "./StatusIndicatorComponent";
import EvalGenModal, {
EvalGenModalRef,
ReportCardScreen,
EvalGenModalRef
} from "./EvalGenModal";
import { EvalGenReport } from "./backend/evalgen/typing";
import { EvalFunction } from "./backend/evalgen/typing";

const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();

Expand Down Expand Up @@ -663,43 +662,44 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
evalGenModalRef.current?.trigger(resps, onFinalReportsReady);
};

const onFinalReportsReady = (reports: EvalGenReport) => {

const onFinalReportsReady = (selectedFunctions: EvalFunction[]) => {
// Placeholder for processing the final reports returned from EvalGenModal
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!! final reports", reports);
console.log("!!!!!!!!!!!!!!!!!!!!!!!!!! final functions", selectedFunctions);
// let kkk = 1;
for (const crit of reports.criteria) {
for (const func of selectedFunctions) {
// setTimeout(() => {
// console.log("crit", crit);
if (crit.eval_method === "code") {
if (func.evalCriteria.eval_method === "code") {
// Python
addEvaluator(
crit.shortname,
func.evalCriteria.shortname,
"python",
{
code: "def evaluate(r):\n\treturn len(r.text)", // to be populated once python code is implemented for the criteria
code: func.code, // to be populated once python code is implemented for the criteria
sandbox: true,
},
false,
);
} else if (crit.eval_method === "expert") {
} else if (func.evalCriteria.eval_method === "expert") {
// LLM
addEvaluator(
crit.shortname,
func.evalCriteria.shortname,
"llm",
{
// to be populated once LLM code is implemented for the criteria
prompt: "",
prompt: func.code,
format: "bin",
},
false,
);
} else {
// JavaScript
addEvaluator(
crit.shortname,
func.evalCriteria.shortname,
"javascript",
{
code: "function evaluate(r) {\n\treturn r.text.length;\n}", // to be populated once javascript code is implemented for the criteria
code: func.code,
},
false,
);
Expand Down
2 changes: 1 addition & 1 deletion chainforge/react-server/src/backend/evalgen/executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1085,4 +1085,4 @@ export default class EvaluationFunctionExecutor {

return outcomes;
}
}
}
6 changes: 0 additions & 6 deletions chainforge/react-server/src/backend/evalgen/typing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@ export interface EvalCriteria {
source?: string;
}

/**
 * Summary report shape for an EvalGen run: a list of criteria plus two
 * aggregate numeric metrics.
 */
export interface EvalGenReport {
  /** Criteria carried by this report. */
  criteria: Array<EvalCriteria>;
  /** Failure coverage metric. NOTE(review): looks like a percentage (0-100) -- confirm. */
  failureCoverage: number;
  /** False failure rate metric. NOTE(review): looks like a percentage (0-100) -- confirm. */
  falseFailureRate: number;
}

export function validEvalCriteriaFormat(json_obj: Dict) {
return (
"criteria" in json_obj &&
Expand Down
Loading