textevolve/agent_system.py at main · nickcdryan/textevolve · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
"""
agent_system.py - Main class for the Agentic Learning System
"""

import os
import json
import time
import datetime
import traceback
import random
import sys
import ast  # Added for script validation
from typing import Dict, List, Any, Optional, Tuple
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from google import genai
from google.genai import types  # Added import for GenerateContentConfig
import numpy as np

from sandbox import DockerSandbox, check_docker_available

from prompts.data_analyzer import get_dataset_analysis_prompt
from prompts.batch_size_optimizer import get_batch_size_optimization_prompt
from prompts.batch_learnings import get_batch_learnings_prompt
from prompts.learning_synthesizer import get_learning_synthesis_prompt
from prompts.strategy_optimizer import get_strategy_optimization_prompt
from prompts.progressive_testing import get_progressive_testing_prompt

from prompts.script_generation.strategies import get_explore_instructions, get_exploit_instructions, get_refine_instructions

from prompts.script_generation.prompting_guides import (
    multi_example_prompting_guide,
    llm_reasoning_prompting_guide,
    validation_prompting_guide,
    meta_programming_prompting_guide,
    code_execution_prompting_guide,
)

from prompts.script_generation.llm_patterns import(
    as_example_code,
    extract_information_with_examples,
    verify_solution_with_examples,
    solve_with_validation_loop,
    best_of_n,
    solve_with_react_pattern,

    chain_of_thought_reasoning,
    verification_with_feedback,
    multi_perspective_analysis,
    self_consistency_approach,
    pattern_identification,
    wait_injection,
    solve_with_meta_programming,
    self_modifying_solver,
    debate_approach,
    adaptive_chain_solver,
    dynamic_memory_pattern,
    test_time_training,
    combination_example,
)

class AgentSystem:
    """
    Agentic Learning System that uses LLM reasoning to continuously improve its approach
    to solving dataset problems through iterative exploration and exploitation.
    Now supports custom dataset loaders.
    """

    def __init__(self, dataset_loader=None, use_sandbox=True):
        """
        Initialize the agent system with a dataset loader

        Args:
            dataset_loader: A DatasetLoader instance for loading and processing examples
            use_sandbox: Whether to use Docker sandbox for code execution (default: True)
        """
        # Initialize configuration
        self.explore_rate = 60  # Start with exploration focus
        self.exploit_rate = 20  # Some exploitation
        self.refine_rate = 20   # Some refinement

        # Initialize sandbox
        self.use_sandbox = use_sandbox
        self.sandbox = None
        if self.use_sandbox:
            if not check_docker_available():
                print("WARNING: Docker not available. Falling back to direct execution.")
                print("For secure execution, please install and start Docker.")
                self.use_sandbox = False
            else:
                try:
                    self.sandbox = DockerSandbox()
                    if not self.sandbox.ensure_image_available():
                        print("WARNING: Failed to ensure Docker image is available.")
                        print("Falling back to direct execution.")
                        self.use_sandbox = False
                        self.sandbox = None
                    else:
                        print("Docker sandbox initialized successfully.")
                except Exception as e:
                    print(f"WARNING: Failed to initialize Docker sandbox: {e}")
                    print("Falling back to direct execution.")
                    self.use_sandbox = False
                    self.sandbox = None

        # Store the dataset loader
        self.dataset_loader = dataset_loader
        if not self.dataset_loader:
            raise ValueError("A dataset loader must be provided")

        # Initialize batch size and tracking for seen examples
        self.current_batch_size = 3  # Start with a small batch
        self.baseline_batch_size = 10
        self.seen_examples = set()
        self.examples_processed = 0

        # Ensure directories exist
        self.archive_dir = Path("archive")
        self.archive_dir.mkdir(exist_ok=True)
        self.scripts_dir = Path("scripts")
        self.scripts_dir.mkdir(exist_ok=True)

        self.capability_tracker = CapabilityTracker()

        # Load system prompt
        self.system_prompt = self._load_system_prompt()
        print(f"System prompt loaded: {len(self.system_prompt)} characters")

        # Initialize Gemini API client
        try:
            self.client = genai.Client(
                api_key=os.environ.get("GEMINI_API_KEY"))
            print("Gemini API client initialized successfully")
        except Exception as e:
            print(f"Error initializing Gemini API client: {e}")
            print("Make sure to set the GEMINI_API_KEY environment variable")
            raise

        # Initialize learnings mechanism
        print("Initializing learnings mechanism...")
        learnings = self._load_learnings()
        if learnings:
            print(f"Loaded existing learnings: {len(learnings)} characters")
        else:
            print("No existing learnings found. Will start accumulating learnings.")

        # ADDED: Reserve training examples to prevent data leakage
        self.mark_training_examples()


        # Initialize current iteration
        self.current_iteration = 0

        # Load previous iterations if available
        self._load_previous_state()

        # analyze training examples and add to learnings if cold start
        if self.current_iteration == 0:
            self.analyze_dataset_with_llm()


    def get_training_examples(self, count: int = 5) -> List[Dict]:
        """
        Get a fixed set of examples for initial training (cold start).
        These examples are reserved and never used for testing.

        Args:
            count: Number of training examples to return

        Returns:
            List of example dictionaries with input and output fields
        """
        # Get first 'count' examples from dataset and mark them as seen
        if not self.dataset_loader:
            return []

        # Save original index (though we won't restore it in most cases)
        original_index = self.dataset_loader.current_index

        # Start from beginning for training examples
        self.dataset_loader.current_index = 0

        # Get training examples
        training_examples = self.dataset_loader.get_examples(count)

        # Mark these examples as seen
        for i in range(count):
            self.seen_examples.add(i)

        # Update dataset position to continue after the training examples
        # Instead of going back to the original index
        self.dataset_loader.current_index = count

        # Update the next example index to continue after training examples
        if hasattr(self, 'next_example_index'):
            self.next_example_index = max(self.next_example_index, count)
        else:
            self.next_example_index = count

        print(f"Reserved {len(training_examples)} examples for training, next example will be {self.dataset_loader.current_index}")

        return training_examples

    def mark_training_examples(self):
        """
        Mark the initial training examples as seen to ensure they're not used for testing.
        This should be called during initialization to establish the training set.
        """
        # Only run this if we don't have any seen examples yet
        if not self.seen_examples:
            # The training examples will be added to seen_examples
            training_examples = self.get_training_examples(5)  # Get 5 training examples

            # Update examples processed to include training examples
            self.examples_processed = len(training_examples)

            print(f"Reserved {len(training_examples)} examples for initial training")
            print(f"Total examples seen: {len(self.seen_examples)}")


    # Add this to the AgentSystem class
    def analyze_dataset_with_llm(self):
        """
        Perform an initial analysis of the dataset to understand patterns,
        structures, and potential approaches before any problem-solving.
        Adds insights to the learnings.txt file.
        """
        print("Performing initial dataset analysis with LLM...")

        # Get training examples - these are already set aside for learning
        # We're not using examples that would be used for testing later

        # Don't mess with the dataset loader's current index

        try:
            original_index = self.dataset_loader.current_index
            self.dataset_loader.current_index = 0

            training_examples = self.get_training_examples(5)

            if not training_examples or len(training_examples) == 0:
                print("Warning: No training examples available for analysis.")
                return

            # Format examples for LLM analysis
            formatted_examples = []
            for i, example in enumerate(training_examples):
                formatted_examples.append({
                    "id": f"example_{i}",
                    "question": self.dataset_loader.get_example_input(example),
                    "answer": self.dataset_loader.get_example_output(example)
                })

        finally:
            self.dataset_loader.current_index = original_index

        # Call LLM to analyze the dataset
        try:
            prompt, system_instruction = get_dataset_analysis_prompt(formatted_examples)
            dataset_analysis = self.call_llm(prompt, system_instruction=system_instruction)
            print("Dataset analysis complete, adding to learnings.txt")

            # Format the analysis for learnings.txt
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            formatted_analysis = f"""
    === INITIAL DATASET ANALYSIS [{timestamp}] ===

    {dataset_analysis}

    === END INITIAL DATASET ANALYSIS ===

    """
            print (formatted_analysis)
            # Load existing learnings (if any)
            current_learnings = self._load_learnings()

            # Add analysis at the beginning of learnings.txt
            updated_learnings = formatted_analysis + current_learnings

            # Save updated learnings
            self._save_learnings(updated_learnings)

            print(f"Added {len(dataset_analysis)} characters of dataset analysis to learnings.txt")
            return dataset_analysis

        except Exception as e:
            print(f"Error analyzing dataset: {e}")
            traceback.print_exc()
            return None


    def _load_system_prompt(self) -> str:
        """Load the system prompt from the system_prompt.md file"""
        system_prompt_path = Path("system_prompt.md")
        if not system_prompt_path.exists():
            print(
                "Warning: system_prompt.md file not found. Using empty system prompt."
            )
            return ""

        try:
            with open(system_prompt_path, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except Exception as e:
            print(f"Error loading system prompt: {e}")
            return ""

    def _load_previous_state(self):
        """Load previous state from archive if available"""
        summaries = self.get_summaries()
        iterations = self.get_all_iterations()

        # Add debugging to verify what's being loaded
        print(f"Loading previous state... Found {len(summaries)} summaries and {len(iterations)} iteration files")

        # First check if we have summaries
        if summaries:
            # Verify summaries have the expected content for debugging
            iteration_nums = [s.get("iteration") for s in summaries]
            print(f"Summary iteration numbers: {iteration_nums}")

            # Sort by iteration number to find the highest
            sorted_summaries = sorted(summaries,
                                      key=lambda x: x.get("iteration", 0),
                                      reverse=True)
            last_iteration = sorted_summaries[0].get("iteration", 0)
            self.current_iteration = last_iteration + 1

            # Use the explore/exploit balance from the last iteration
            self.explore_rate = sorted_summaries[0].get(
                "new_explore_rate", self.explore_rate)
            self.exploit_rate = sorted_summaries[0].get(
                "new_exploit_rate", self.exploit_rate)
            self.refine_rate = sorted_summaries[0].get(
                "new_refine_rate", self.refine_rate)

            # Use the batch size from the last iteration
            self.current_batch_size = sorted_summaries[0].get(
                "new_batch_size", self.current_batch_size)

            print(
                f"Loaded previous state: iteration {self.current_iteration}, "
                +
                f"explore/exploit/refine: {self.explore_rate}/{self.exploit_rate}/{self.refine_rate}, " +
                f"batch size: {self.current_batch_size}")

            # CRITICAL FIX: Always include the 5 training examples
            for i in range(5):
                self.seen_examples.add(i)

            # Track total examples seen so far (starting after training examples)
            total_examples_seen = 5  # Start with the training examples

            # Reconstruct set of seen examples
            for iteration in iterations:
                if iteration and "sample_count" in iteration:
                    # Each iteration represents sample_count examples
                    sample_count = iteration.get("sample_count", 0)

                    # Add the examples this iteration would have seen
                    # Starting from the current total (which includes training examples)
                    for i in range(total_examples_seen, total_examples_seen + sample_count):
                        self.seen_examples.add(i)

                    # Update the total examples seen
                    total_examples_seen += sample_count

                    # Also update our internal examples_processed counter
                    self.examples_processed = total_examples_seen

            # Set next example index to after the last seen example
            self.next_example_index = total_examples_seen

            print(
                f"Loaded {len(self.seen_examples)} seen examples, next example index: {self.next_example_index}"
            )

            # After calculating next_example_index
            if hasattr(self, 'next_example_index') and self.dataset_loader:
                self.dataset_loader.current_index = self.next_example_index
                print(f"Updated dataset loader to start at example index {self.next_example_index}")
        else:
            # Double-check iterations as a backup in case summaries.json is missing
            if iterations:
                # Get the highest iteration number from iteration files
                highest_iter = max([it.get("iteration", 0) for it in iterations if it])
                self.current_iteration = highest_iter + 1
                print(f"No summaries found, but found iteration files. Setting next iteration to {self.current_iteration}")
            else:
                self.current_iteration = 0
                print("No previous state found. Starting from iteration 0.")

    def call_llm(self, prompt: str, system_instruction: str = None) -> str:
        """Call the Gemini LLM with a prompt and return the response"""
        try:
            # Use provided system instruction or default to the loaded system prompt
            sys_instruction = system_instruction if system_instruction is not None else ""

            response = self.client.models.generate_content(
                model="gemini-2.0-flash",
                config=types.GenerateContentConfig(
                    system_instruction=sys_instruction),
                contents=prompt)
            return response.text
        except Exception as e:
            print(f"Error calling Gemini API: {e}")
            return f"Error: {str(e)}"

    def load_dataset(self) -> Dict:
        """Load the entire dataset from the specified file"""
        try:
            with open(self.dataset_path, 'r', encoding='utf-8') as file:
                return json.load(file)
        except Exception as e:
            print(f"Error loading dataset: {str(e)}")
            return {}

    def get_samples(self) -> Dict:
        """
        Get samples from the dataset loader for the current batch.
        Uses universal field names (question/answer) for consistency.

        Returns:
            Dict containing:
            - samples: List of sample dictionaries with question/answer/id fields
            - new_examples_added: Count of new examples added
            - total_seen_examples: Total count of seen examples
        """
        if not self.dataset_loader:
            return {
                "samples": [],
                "new_examples_added": 0,
                "total_seen_examples": 0
            }

        # Get current_batch_size examples
        examples = self.dataset_loader.get_examples(self.current_batch_size)

        # Convert to standardized format for system
        samples = []
        new_examples_added = 0

        for i, example in enumerate(examples):
            # Get current example index
            example_index = self.examples_processed + i

            # Track that we've seen this example
            if example_index not in self.seen_examples:
                self.seen_examples.add(example_index)
                new_examples_added += 1

            # Extract input and output
            try:
                example_input = self.dataset_loader.get_example_input(example)
                example_output = self.dataset_loader.get_example_output(example)

                # Create standardized sample using universal field names
                standardized_sample = {
                    "question": example_input,  # Universal field name
                    "answer": example_output,   # Universal field name
                    "id": f"example_{example_index}",
                    "meta": example.get("meta", {})

                }

                samples.append(standardized_sample)
            except Exception as e:
                print(f"Error processing example: {e}")

        # Update the number of examples processed
        self.examples_processed += len(samples)

        return {
            "samples": samples,
            "new_examples_added": new_examples_added,
            "total_seen_examples": len(self.seen_examples)
        }

    def save_to_archive(self, data: Dict, filename: str) -> None:
        """Save data to the archive directory"""
        filepath = self.archive_dir / filename
        with open(filepath, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=2)

    def read_from_archive(self, filename: str) -> Dict:
        """Read data from the archive directory"""
        filepath = self.archive_dir / filename
        if not filepath.exists():
            return {}

        with open(filepath, 'r', encoding='utf-8') as file:
            return json.load(file)

    def get_all_iterations(self) -> List[Dict]:
        """Get data from all past iterations"""
        iterations = []
        for file in self.archive_dir.glob("iteration_*.json"):
            with open(file, 'r', encoding='utf-8') as f:
                iterations.append(json.load(f))
        return sorted(iterations, key=lambda x: x.get('iteration', 0))

    def get_summaries(self) -> List[Dict]:
        """Get all iteration summaries"""
        summary_file = self.archive_dir / "summaries.json"
        if not summary_file.exists():
            print(f"Warning: Summaries file {summary_file} does not exist")
            return []

        try:
            with open(summary_file, 'r', encoding='utf-8') as file:
                summaries = json.load(file)
                print(f"Successfully loaded {len(summaries)} summaries from {summary_file}")
                return summaries
        except Exception as e:
            print(f"Error loading summaries file: {e}")
            return []

    def update_summaries(self, new_summary: Dict) -> None:
        """Add a new summary to the summaries file"""
        # Add capability data to summary
        if hasattr(self, 'capability_tracker'):
            new_summary[
                "capability_report"] = self.capability_tracker.generate_report()

        summaries = self.get_summaries()
        summaries.append(new_summary)

        summary_file = self.archive_dir / "summaries.json"
        try:
            with open(summary_file, 'w', encoding='utf-8') as file:
                json.dump(summaries, file, indent=2)
            print(f"Successfully updated summaries file with iteration {new_summary.get('iteration')}")
        except Exception as e:
            print(f"Error updating summaries file: {e}")

    def adjust_batch_size_with_llm(self, performance: Dict) -> Tuple[int, str]:
        """Use LLM to determine appropriate batch size based on performance"""
        # Get performance history
        iterations = self.get_all_iterations()

        # Extract relevant information for LLM
        performance_history = []
        for iteration in iterations[-5:]:  # Last 5 iterations
            if iteration is None:  # Skip None entries
                continue

            perf = iteration.get("performance", {})
            accuracy = perf.get("accuracy", 0) if perf else 0

            performance_history.append({
                "iteration":
                iteration.get("iteration"),
                "batch_size":
                iteration.get("batch_size", 5),
                "accuracy":
                accuracy,
                "error_patterns":
                iteration.get("performance",
                              {}).get("error_analysis",
                                      {}).get("error_patterns", [])
            })

        # Default response if no LLM available
        default_response = (
            self.current_batch_size,
            "Maintaining current batch size due to insufficient performance data"
        )

        # If no performance history, just keep current batch size
        if not performance_history:
            return default_response


        try:
            prompt, system_instruction = get_batch_size_optimization_prompt(
                current_batch_size=self.current_batch_size,
                current_accuracy=performance.get("accuracy", 0),
                total_examples_seen=len(self.seen_examples),
                performance_history=performance_history
            )
            response = self.call_llm(
                prompt, system_instruction=system_instruction)

            # Extract JSON from response
            response = response.strip()
            if response.startswith("```json"):
                response = response.split("```json")[1]
            if response.endswith("```"):
                response = response.split("```")[0]

            result = json.loads(response)

            # Validate and extract new batch size
            new_batch_size = int(
                result.get("new_batch_size", self.current_batch_size))

            # Ensure batch size is within reasonable limits
            new_batch_size = max(3, min(10, new_batch_size))

            return new_batch_size, result.get("rationale",
                                              "No rationale provided")
        except Exception as e:
            print(f"Error adjusting batch size: {e}")
            return default_response

    def _load_learnings(self) -> str:
        """Load accumulated learnings from the learnings.txt file"""
        learnings_path = Path("learnings.txt")
        if not learnings_path.exists():
            print("No existing learnings file found. Starting with fresh learnings.")
            return ""

        try:
            with open(learnings_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                print(f"Successfully loaded learnings.txt ({len(content)} characters)")
                return content
        except Exception as e:
            print(f"Error loading learnings: {e}")
            traceback.print_exc()  # Print full traceback for better debugging
            return ""

    def _save_learnings(self, learnings: str) -> None:
        """Save updated learnings to the learnings.txt file"""
        learnings_path = Path("learnings.txt")
        try:
            with open(learnings_path, 'w', encoding='utf-8') as f:
                f.write(learnings)
            print(f"Learnings successfully saved to {learnings_path} ({len(learnings)} characters)")
        except Exception as e:
            print(f"Error saving learnings: {e}")
            traceback.print_exc()  # Print full traceback for better debugging


    def generate_batch_learnings(self, iteration_data: Dict) -> str:
        """Generate learnings from the current batch results with focus on dataset-specific insights"""

        # Get full original samples from iteration_data
        samples = []
        if "samples" in iteration_data:
            samples = iteration_data.get("samples", [])

        # Get example questions - prefer direct samples if available
        sample_questions = []
        for i in range(min(3, len(samples))):
            if i < len(samples):
                # Use the universal "question" field instead of "prompt_0shot"
                sample_questions.append(samples[i].get("question", "N/A"))
            else:
                sample_questions.append("N/A")

        # If we couldn't get samples directly, try to infer from results
        if not sample_questions and "results" in iteration_data:
            results = iteration_data.get("results", [])
            for i in range(min(3, len(results))):
                if "question" in results[i]:
                    sample_questions.append(results[i].get("question", "N/A"))

        # Get script source code (truncated for prompts)
        script_code = iteration_data.get("script", "")[:500] if iteration_data.get("script") else "No script available"

        # Get performance metrics
        accuracy = iteration_data.get("performance", {}).get("accuracy", 0)

        # Get error examples
        error_examples = []
        for i, result in enumerate(iteration_data.get("results", [])):
            if not result.get("match", True) and result.get("success", False):
                # Find the corresponding sample
                sample_question = "N/A"
                golden_answer = "N/A"

                if i < len(samples):
                    # Use the universal "question" and "answer" fields
                    sample_question = samples[i].get("question", "N/A")
                    golden_answer = samples[i].get("answer", "N/A")

                error_examples.append({
                    "question": sample_question,
                    "expected": golden_answer,
                    "actual": result.get("answer", "N/A"),
                    "explanation": result.get("evaluation", {}).get("explanation", "No explanation")
                })

        # Get capability assessment if available
        capability_insights = ""
        if "capability_report" in iteration_data and iteration_data["capability_report"]:
            report = iteration_data["capability_report"]
            capability_insights = f"""
            Key Capabilities:
            - Strengths: {', '.join(report.get('strengths', [])[:2])}
            - Weaknesses: {', '.join(report.get('weaknesses', [])[:2])}
            - Focus Area: {report.get('improvement_focus', 'None identified')}
            """

        try:
            prompt, system_instruction = get_batch_learnings_prompt(iteration_data,
                                                                    accuracy,
                                                                    sample_questions,
                                                                    script_code,
                                                                    error_examples,
                                                                    capability_insights)

            response = self.call_llm(prompt, system_instruction=system_instruction)
            return f"--- LEARNINGS FROM ITERATION {iteration_data.get('iteration')} ---\n{response.strip()}\n\n"
        except Exception as e:
            error_message = f"Error generating batch learnings: {str(e)}"
            print(error_message)
            return f"--- LEARNINGS FROM ITERATION {iteration_data.get('iteration')} ---\n{error_message}\n\n"

    def synthesize_learnings(self, current_learnings: str, new_batch_learnings: str) -> str:
        """
        Synthesize existing learnings with new batch learnings, emphasizing dataset-specific insights.
        Automatically condenses content when approaching token limits.
        """
        # Define character limit threshold (staying well under the 41,000 character limit for gemini 2.0 flash (5 chars per token, 8192 tokens))
        CHARACTER_LIMIT_THRESHOLD = 40000

        # Calculate current lengths
        current_length = len(current_learnings)
        new_length = len(new_batch_learnings)
        combined_length = current_length + new_length

        # Print length info for debugging
        print(f"Current learnings length: {current_length}")
        print(f"New batch learnings length: {new_length}")
        print(f"Combined length: {combined_length}")

        # Determine if we need to condense content
        approaching_limit = combined_length > CHARACTER_LIMIT_THRESHOLD

        # Get the prompt and system instruction
        base_prompt, system_instruction = get_learning_synthesis_prompt(
            current_learnings=current_learnings,
            new_batch_learnings=new_batch_learnings,
            approaching_limit=approaching_limit
        )

        # Condensing-specific instructions when approaching limit
        if approaching_limit:
            condensing_instructions = """
            CRITICAL: The system has just given a warning that number of tokens in our learnings document is at the document limit. The document can't hold more tokens.

            With this in mind, make sure you synthesize the new learnings without adding to the overall length of the document.

            You should condense redundant parts of the document as needed.
            """

            prompt = base_prompt + condensing_instructions
        else:
            prompt = base_prompt

        try:
            print(f"Calling LLM to {'condense and synthesize' if approaching_limit else 'synthesize'} learnings...")
            response = self.call_llm(prompt, system_instruction=system_instruction)

            response_length = len(response.strip())
            print(f"Received synthesized learnings: {response_length} characters")

            return response.strip()

        except Exception as e:
            error_message = f"Error synthesizing learnings: {str(e)}"
            print(error_message)
            traceback.print_exc()  # Print full traceback for better debugging

            fallback = f"{current_learnings}\n\n=== NEWEST LEARNINGS (NOT SYNTHESIZED DUE TO ERROR) ===\n\n{new_batch_learnings}"
            print(f"Using fallback concatenation: {len(fallback)} characters")
            return fallback


    def update_learnings(self, iteration_data: Dict) -> None:
        """Update the learnings file with insights from the current iteration"""
        try:
            # Load existing learnings
            current_learnings = self._load_learnings()
            print(f"Loaded existing learnings: {len(current_learnings)} characters")

            # Generate learnings from current batch
            print("Generating new batch learnings...")
            batch_learnings = self.generate_batch_learnings(iteration_data)
            print(f"Generated batch learnings: {len(batch_learnings)} characters")

            # If this is the first iteration, just use the batch learnings
            if not current_learnings:
                print("No existing learnings found. Using batch learnings as initial content.")
                updated_learnings = batch_learnings
            else:
                # Synthesize existing learnings with new batch learnings
                print("Synthesizing existing learnings with new batch learnings...")
                updated_learnings = self.synthesize_learnings(current_learnings, batch_learnings)
                print(f"Synthesized learnings: {len(updated_learnings)} characters")

            # Save updated learnings
            self._save_learnings(updated_learnings)
            print(f"Learnings saved successfully ({len(updated_learnings)} characters)")

        except Exception as e:
            print(f"Error updating learnings: {e}")
            traceback.print_exc()  # Print full traceback for better debugging


    def generate_baseline_script(self) -> str:
        """
        Generate a simple baseline script that just calls the LLM directly.
        This establishes performance expectations for the dataset.

        Returns:
            Simple baseline script as a string
        """
        baseline_script = '''import os
from google import genai
from google.genai import types

def call_llm(prompt, system_instruction=None):
    """Call the Gemini LLM with a prompt and return the response"""
    try:
        # Initialize the Gemini client
        client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

        # Call the API with system instruction if provided
        if system_instruction:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                config=types.GenerateContentConfig(
                    system_instruction=system_instruction
                ),
                contents=prompt
            )
        else:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=prompt
            )

        return response.text
    except Exception as e:
        print(f"Error calling Gemini API: {str(e)}")
        return f"Error: {str(e)}"

def main(question):
    """
    Baseline script: Simple direct LLM call without sophisticated techniques.
    This establishes the baseline performance capability for this dataset.
    """
    system_instruction = "You are a helpful assistant. Answer the question directly and concisely based on the information provided."

    # Simple, direct call to LLM
    answer = call_llm(question, system_instruction)

    return answer
    '''
        return baseline_script


    def get_baseline_performance(self) -> float:
        """
        Get the baseline performance from iteration 0, or None if not available.

        Returns:
            Baseline accuracy as a float, or None if baseline not established
        """
        iterations = self.get_all_iterations()
        for iteration in iterations:
            if iteration and iteration.get("iteration") == 0:
                return iteration.get("performance", {}).get("accuracy", None)
        return None


    def calculate_performance_context(self, current_accuracy: float) -> Dict:
        """
        Calculate performance context relative to baseline for calibrated decision making.

        Args:
            current_accuracy: Current iteration's accuracy

        Returns:
            Dictionary with performance context information
        """
        baseline_accuracy = self.get_baseline_performance()

        if baseline_accuracy is None:
            return {
                "baseline_available": False,
                "relative_performance": "unknown",
                "performance_category": "unknown",
                "improvement_potential": "unknown"
            }

        # Calculate relative improvement
        relative_improvement = current_accuracy - baseline_accuracy
        relative_percentage = (relative_improvement / baseline_accuracy) * 100 if baseline_accuracy > 0 else 0

        # Categorize performance based on relative improvement
        if relative_improvement >= 0.15:  # 15+ percentage points above baseline
            performance_category = "excellent"
            improvement_potential = "high"  # Even excellent performance can be pushed further
        elif relative_improvement >= 0.05:  # 5-15 percentage points above baseline
            performance_category = "good"
            improvement_potential = "moderate"
        elif relative_improvement >= -0.05:  # Within 5 percentage points of baseline
            performance_category = "baseline"
            improvement_potential = "high"  # Lots of room for improvement
        else:  # More than 5 percentage points below baseline
            performance_category = "poor"
            improvement_potential = "high"  # Definitely room for improvement

        # Determine dataset difficulty context
        if baseline_accuracy >= 0.8:
            dataset_difficulty = "easy"
            exploitation_threshold = 0.9  # Need very high performance to justify exploitation
        elif baseline_accuracy >= 0.5:
            dataset_difficulty = "moderate"
            exploitation_threshold = baseline_accuracy + 0.2  # Need 20+ points above baseline
        elif baseline_accuracy >= 0.2:
            dataset_difficulty = "hard"
            exploitation_threshold = baseline_accuracy + 0.1  # Need 10+ points above baseline
        else:
            dataset_difficulty = "very_hard"
            exploitation_threshold = baseline_accuracy + 0.05  # Need 5+ points above baseline

        return {
            "baseline_available": True,
            "baseline_accuracy": baseline_accuracy,
            "current_accuracy": current_accuracy,
            "relative_improvement": relative_improvement,
            "relative_percentage": relative_percentage,
            "performance_category": performance_category,
            "dataset_difficulty": dataset_difficulty,
            "exploitation_threshold": exploitation_threshold,
            "improvement_potential": improvement_potential,
            "should_exploit": current_accuracy >= exploitation_threshold
        }


    def adjust_strategy_with_llm(self) -> Tuple[int, int, int]:
        """
        Use LLM reasoning to adjust the strategy balance with baseline-calibrated performance context.

        Returns:
            Tuple[int, int, int]: (explore_rate, exploit_rate, refine_rate) - percentages summing to 100
        """
        iterations = self.get_all_iterations()
        summaries = self.get_summaries()

        # If there aren't enough iterations yet, use default balance
        if len(iterations) < 2:
            return 60, 20, 20  # Default: heavily favor exploration initially

        # Get performance context for the most recent iteration
        latest_accuracy = 0
        if summaries:
            latest_summary = max(summaries, key=lambda x: x.get("iteration", 0))
            latest_accuracy = latest_summary.get("performance", {}).get("accuracy", 0)

        performance_context = self.calculate_performance_context(latest_accuracy)

        # Get the full performance history with baseline context
        performance_history = []
        for summary in summaries:
            accuracy = summary.get("performance", {}).get("accuracy", 0)
            context = self.calculate_performance_context(accuracy)

            performance_history.append({
                "iteration": summary.get("iteration"),
                "accuracy": accuracy,
                "batch_size": summary.get("batch_size", 5),
                "strategy": summary.get("strategy"),
                "primary_issue": summary.get("primary_issue", "None identified"),
                "performance_category": context.get("performance_category", "unknown"),
                "relative_improvement": context.get("relative_improvement", 0),
                "relative_percentage": context.get("relative_percentage", 0)
            })

        # Try to get information about the best script so far
        best_script_info = None
        try:
            best_script_info = self.get_best_script_info()
        except Exception as e:
            print(f"Error getting best script info: {e}")

        # Prepare additional context for the LLM
        context = {
            "iterations_completed": len(summaries),
            "best_accuracy": best_script_info.get("accuracy", 0) if best_script_info else 0,
            "best_iteration": best_script_info.get("iteration", -1) if best_script_info else -1,
            "current_balance": f"{getattr(self, 'explore_rate', 60)}/{getattr(self, 'exploit_rate', 20)}/{getattr(self, 'refine_rate', 20)}",
            "total_examples_seen": len(self.seen_examples)
        }

        # Check for capability insights if available
        capability_context = {}
        if hasattr(self, 'capability_tracker'):
            capability_report = self.capability_tracker.generate_report()
            capability_context = {
                "weakest_capability": capability_report.get("weakest_capabilities", [{}])[0].get("name", None),