timteh · timteh · Feb 1, 2026
diff --git a/.jules/bolt.md b/.jules/bolt.md
@@ -5,3 +5,7 @@
 ## 2024-05-24 - openpyxl read_only optimization
 **Learning:** `openpyxl.load_workbook(..., read_only=True)` is significantly faster (1.75x) for parsing large files but requires explicit `wb.close()` (preferably in `try...finally`) as it keeps file handles open and `Workbook` objects may not support context managers in all versions.
 **Action:** Use `read_only=True` for read-heavy Excel tasks and always ensure `close()` is called.
+
+## 2024-05-25 - Openpyxl vs Direct XML for Metrics
+**Learning:** Even with `read_only=True`, `openpyxl` overhead is significant (5x slower) for simple tasks like counting formulas or finding error strings compared to direct `zipfile` + `xml.etree.ElementTree` parsing.
+**Action:** For read-only scans of specific tags/values in large XLSX files, prefer direct XML parsing to avoid object creation overhead.
diff --git a/skills/xlsx/recalc.py b/skills/xlsx/recalc.py
@@ -9,8 +9,9 @@
 import subprocess
 import os
 import platform
+import zipfile
+import xml.etree.ElementTree as ET
 from pathlib import Path
-from openpyxl import load_workbook
 
 
 def setup_libreoffice_macro():
@@ -50,6 +51,126 @@ def setup_libreoffice_macro():
         return False
 
 
+def parse_excel_xml(filename):
+    """
+    Parse Excel file using direct XML parsing (zipfile + ElementTree)
+    to count formulas and identify error cells.
+    Significantly faster than openpyxl for large files.
+    """
+    excel_errors = {'#VALUE!', '#DIV/0!', '#REF!', '#NAME?', '#NULL!', '#NUM!', '#N/A'}
+    error_details = {err: [] for err in excel_errors}
+    total_errors = 0
+    formula_count = 0
+
+    with zipfile.ZipFile(filename, 'r') as z:
+        # 1. Parse Shared Strings
+        shared_strings = []
+        if 'xl/sharedStrings.xml' in z.namelist():
+            with z.open('xl/sharedStrings.xml') as f:
+                context = ET.iterparse(f, events=('end',))
+                for event, elem in context:
+                    if elem.tag.endswith('si'):
+                        text_parts = []
+                        # iter() includes the element itself, so we check children
+                        for child in elem.iter():
+                            if child.tag.endswith('t') and child.text:
+                                text_parts.append(child.text)
+                        shared_strings.append("".join(text_parts))
+                        elem.clear()
+
+        # 2. Map Sheet Names to Filenames
+        sheet_names = {} # rId -> sheet_name
+        if 'xl/workbook.xml' in z.namelist():
+            with z.open('xl/workbook.xml') as f:
+                context = ET.iterparse(f, events=('end',))
+                for event, elem in context:
+                    if elem.tag.endswith('sheet'):
+                        name = elem.get('name')
+                        rid = None
+                        for k, v in elem.attrib.items():
+                            if k.endswith('id'):
+                                rid = v
+                                break
+                        if rid and name:
+                            sheet_names[rid] = name
+                    elem.clear()
+
+        sheet_filenames = {} # sheet_name -> filename
+        if 'xl/_rels/workbook.xml.rels' in z.namelist():
+            with z.open('xl/_rels/workbook.xml.rels') as f:
+                context = ET.iterparse(f, events=('end',))
+                for event, elem in context:
+                    if elem.tag.endswith('Relationship'):
+                        rid = elem.get('Id')
+                        target = elem.get('Target')
+                        if rid in sheet_names:
+                            if target.startswith('/'):
+                                fname = target[1:]
+                            else:
+                                fname = 'xl/' + target
+                            sheet_filenames[sheet_names[rid]] = fname
+                    elem.clear()
+
+        # 3. Parse Sheets
+        for sheet_name, sheet_file in sheet_filenames.items():
+            if sheet_file in z.namelist():
+                with z.open(sheet_file) as f:
+                    context = ET.iterparse(f, events=('end',))
+                    for event, elem in context:
+                        if elem.tag.endswith('c'): # cell
+                            coordinate = elem.get('r')
+                            cell_type = elem.get('t')
+
+                            # Check formula
+                            for child in elem:
+                                if child.tag.endswith('f'):
+                                    formula_count += 1
+                                    break
+
+                            val = None
+                            if cell_type == 'e': # Error type
+                                for child in elem:
+                                    if child.tag.endswith('v'):
+                                        val = child.text
+                                        break
+                                if val in excel_errors:
+                                    error_details[val].append(f"{sheet_name}!{coordinate}")
+                                    total_errors += 1
+
+                            elif cell_type == 's': # Shared string
+                                for child in elem:
+                                    if child.tag.endswith('v') and child.text:
+                                        try:
+                                            idx = int(child.text)
+                                            if idx < len(shared_strings):
+                                                val = shared_strings[idx]
+                                        except ValueError:
+                                            pass
+                                        break
+                                if val:
+                                    for err in excel_errors:
+                                        if err in val:
+                                            error_details[err].append(f"{sheet_name}!{coordinate}")
+                                            total_errors += 1
+                                            break
+
+                            elif cell_type == 'str': # Inline string (formula result)
+                                 for child in elem:
+                                    if child.tag.endswith('v'):
+                                        val = child.text
+                                        break
+                                 if val:
+                                    for err in excel_errors:
+                                        if err in val:
+                                            error_details[err].append(f"{sheet_name}!{coordinate}")
+                                            total_errors += 1
+                                            break
+
+                            elem.clear()
+
+    return total_errors, formula_count, error_details
+
+
 def recalc(filename, timeout=30):
     """
     Recalculate formulas in Excel file and report any errors
@@ -98,29 +219,9 @@ def recalc(filename, timeout=30):
         else:
             return {'error': error_msg}
 
-    # Check for Excel errors in the recalculated file - scan ALL cells
+    # Check for Excel errors using optimized XML parsing
     try:
-        excel_errors = ['#VALUE!', '#DIV/0!', '#REF!', '#NAME?', '#NULL!', '#NUM!', '#N/A']
-        error_details = {err: [] for err in excel_errors}
-        total_errors = 0
-
-        # Optimization: use read_only=True for faster parsing and lower memory usage
-        wb = load_workbook(filename, data_only=True, read_only=True)
-        try:
-            for sheet_name in wb.sheetnames:
-                ws = wb[sheet_name]
-                # Check ALL rows and columns - no limits
-                for row in ws.iter_rows():
-                    for cell in row:
-                        if cell.value is not None and isinstance(cell.value, str):
-                            for err in excel_errors:
-                                if err in cell.value:
-                                    location = f"{sheet_name}!{cell.coordinate}"
-                                    error_details[err].append(location)
-                                    total_errors += 1
-                                    break
-        finally:
-            wb.close()
+        total_errors, formula_count, error_details = parse_excel_xml(filename)
 
         # Build result summary
         result = {
@@ -137,19 +238,6 @@ def recalc(filename, timeout=30):
                     'locations': locations[:20]  # Show up to 20 locations
                 }
 
-        # Add formula count for context - also check ALL cells
-        formula_count = 0
-        wb_formulas = load_workbook(filename, data_only=False, read_only=True)
-        try:
-            for sheet_name in wb_formulas.sheetnames:
-                ws = wb_formulas[sheet_name]
-                for row in ws.iter_rows():
-                    for cell in row:
-                        if cell.value and isinstance(cell.value, str) and cell.value.startswith('='):
-                            formula_count += 1
-        finally:
-            wb_formulas.close()
-
         result['total_formulas'] = formula_count
 
         return result
@@ -178,4 +266,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/tests/test_xlsx_recalc.py b/tests/test_xlsx_recalc.py
@@ -0,0 +1,67 @@
+
+import sys
+import os
+import unittest
+from unittest.mock import patch, MagicMock
+from openpyxl import Workbook
+
+# Add skills/xlsx to path to import recalc
+sys.path.append(os.path.join(os.getcwd(), 'skills', 'xlsx'))
+import recalc
+
+class TestRecalc(unittest.TestCase):
+    def setUp(self):
+        self.filename = "test_verify.xlsx"
+        wb = Workbook()
+        ws = wb.active
+        ws.title = "Sheet1"
+
+        # Add formulas
+        ws['A1'] = 10
+        ws['B1'] = 20
+        ws['C1'] = "=SUM(A1:B1)"
+
+        # Add errors
+        # Note: Openpyxl saves these as shared strings.
+        # This tests the shared string error detection path.
+        ws['A2'] = "#DIV/0!"
+        ws['B2'] = "#REF!"
+        ws['C2'] = "Normal"
+
+        wb.save(self.filename)
+
+    def tearDown(self):
+        if os.path.exists(self.filename):
+            os.remove(self.filename)
+
+    @patch('recalc.subprocess.run')
+    @patch('recalc.setup_libreoffice_macro')
+    def test_recalc_parsing(self, mock_setup, mock_run):
+        # Mock setup to return True
+        mock_setup.return_value = True
+
+        # Mock subprocess to return success
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_run.return_value = mock_result
+
+        # Run recalc
+        result = recalc.recalc(self.filename)
+
+        # Check result
+        self.assertEqual(result['status'], 'errors_found')
+        self.assertEqual(result['total_errors'], 2)
+        # Note: openpyxl might not write formula string if we just set it?
+        # ws['C1'] = "=SUM(...)" writes it as formula in openpyxl.
+        self.assertEqual(result['total_formulas'], 1)
+
+        # Check specific errors
+        summary = result['error_summary']
+        self.assertIn('#DIV/0!', summary)
+        self.assertIn('#REF!', summary)
+
+        self.assertIn('Sheet1!A2', summary['#DIV/0!']['locations'])
+        self.assertIn('Sheet1!B2', summary['#REF!']['locations'])
+
+if __name__ == '__main__':
+    unittest.main()