timteh · timteh · Jan 30, 2026
diff --git a/skills/xlsx/recalc.py b/skills/xlsx/recalc.py
@@ -9,6 +9,8 @@
 import subprocess
 import os
 import platform
+import zipfile
+import re
 from pathlib import Path
 from openpyxl import load_workbook
 
@@ -50,6 +52,52 @@ def setup_libreoffice_macro():
         return False
 
 
+def count_formulas(filename):
+    """
+    Count formulas in an Excel file by scanning XML directly.
+    Much faster than loading the workbook with openpyxl.
+    """
+    formula_count = 0
+    # Pattern matches <f> tag which indicates a formula.
+    # Matches <f> or <f ...> (with attributes) or <f/> (empty shared formula)
+    # Regex looks for <f followed by > or space or /
+    pattern = re.compile(rb'<f[>\s/]')
+
+    try:
+        with zipfile.ZipFile(filename, 'r') as z:
+            for name in z.namelist():
+                 if name.startswith('xl/worksheets/sheet') and name.endswith('.xml'):
+                    with z.open(name) as f:
+                        # Chunked reading for memory efficiency
+                        chunk_size = 1024 * 1024 # 1MB
+                        buffer = b''
+                        while True:
+                            chunk = f.read(chunk_size)
+                            if not chunk:
+                                break
+                            buffer += chunk
+
+                            # Find last '>' to split safely
+                            last_tag_end = buffer.rfind(b'>')
+                            if last_tag_end != -1:
+                                to_process = buffer[:last_tag_end+1]
+                                formula_count += len(pattern.findall(to_process))
+                                buffer = buffer[last_tag_end+1:]
+                            else:
+                                # No '>' found, keep accumulating
+                                pass
+
+                        # Process remaining buffer
+                        formula_count += len(pattern.findall(buffer))
+    except Exception as e:
+        sys.stderr.write(f"Error counting formulas via zip: {e}\n")
+        # Return 0 on error, or maybe -1 to indicate failure?
+        # Returning 0 is safer for now as it's just a stat.
+        return 0
+
+    return formula_count
+
+
 def recalc(filename, timeout=30):
     """
     Recalculate formulas in Excel file and report any errors
@@ -137,20 +185,8 @@ def recalc(filename, timeout=30):
                     'locations': locations[:20]  # Show up to 20 locations
                 }
 
-        # Add formula count for context - also check ALL cells
-        formula_count = 0
-        wb_formulas = load_workbook(filename, data_only=False, read_only=True)
-        try:
-            for sheet_name in wb_formulas.sheetnames:
-                ws = wb_formulas[sheet_name]
-                for row in ws.iter_rows():
-                    for cell in row:
-                        if cell.value and isinstance(cell.value, str) and cell.value.startswith('='):
-                            formula_count += 1
-        finally:
-            wb_formulas.close()
-
-        result['total_formulas'] = formula_count
+        # Add formula count for context - optimized
+        result['total_formulas'] = count_formulas(filename)
 
         return result
 

diff --git a/tests/test_recalc_perf.py b/tests/test_recalc_perf.py
@@ -0,0 +1,96 @@
+
+import unittest
+import os
+import sys
+from openpyxl import Workbook
+from unittest.mock import MagicMock, patch
+
+# Add repo root to path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+try:
+    from skills.xlsx.recalc import count_formulas
+except ImportError:
+    count_formulas = None
+
+class TestRecalcPerf(unittest.TestCase):
+    def setUp(self):
+        self.filename = "test_perf_formulas.xlsx"
+        self.create_test_file(self.filename)
+
+    def tearDown(self):
+        if os.path.exists(self.filename):
+            os.remove(self.filename)
+
+    def create_test_file(self, filename, rows=100, cols=10):
+        wb = Workbook()
+        ws = wb.active
+
+        # Add formulas
+        # A1 to A{rows} will have formulas
+        for r in range(1, rows + 1):
+            ws[f'A{r}'] = f'=B{r}+C{r}'
+
+        # Add values (no formula)
+        for r in range(1, rows + 1):
+            ws[f'B{r}'] = 1
+            ws[f'C{r}'] = 2
+
+        wb.save(filename)
+        # Expected formulas: rows
+        self.expected_count = rows
+
+    def test_count_formulas(self):
+        if count_formulas is None:
+            # If function doesn't exist, we can't test it.
+            # This allows creating the test file before the implementation.
+            print("count_formulas not found, skipping test")
+            return
+
+        count = count_formulas(self.filename)
+        self.assertEqual(count, self.expected_count, f"Expected {self.expected_count} formulas, got {count}")
+
+    def test_small_chunks(self):
+        """Test that formula counting works even when file is read in small chunks (splitting tags)"""
+        if count_formulas is None:
+             self.skipTest("count_formulas not implemented yet")
+
+        # XML content with formulas split in various ways
+        # <f> in simple form
+        # <f t="shared"> with attributes
+        # <f> split across chunk boundary
+        xml_content = b'<worksheet><sheetData><row><c><f>SUM(A1)</f></c><c><f t="shared" ref="A1">SUM(A1)</f></c><c><f>A1+1</f></c></row></sheetData></worksheet>'
+        # Total 3 formulas.
+
+        # Mocking zipfile
+        with patch('zipfile.ZipFile') as MockZip:
+            mock_zip = MockZip.return_value
+            mock_zip.__enter__.return_value = mock_zip
+            mock_zip.namelist.return_value = ['xl/worksheets/sheet1.xml']
+
+            mock_file = MagicMock()
+            mock_file.__enter__.return_value = mock_file
+            mock_zip.open.return_value = mock_file
+
+            # Use small chunk size to force splitting (e.g., 5 bytes)
+            chunks = []
+            chunk_size = 5
+            for i in range(0, len(xml_content), chunk_size):
+                chunks.append(xml_content[i:i+chunk_size])
+
+            iter_chunks = iter(chunks)
+
+            def side_effect(size=-1):
+                try:
+                    return next(iter_chunks)
+                except StopIteration:
+                    return b''
+
+            mock_file.read.side_effect = side_effect
+
+            # Filename doesn't matter as we mock ZipFile
+            count = count_formulas("dummy.xlsx")
+            self.assertEqual(count, 3)
+
+if __name__ == '__main__':
+    unittest.main()