Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 50 additions & 14 deletions skills/xlsx/recalc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import subprocess
import os
import platform
import zipfile
import re
from pathlib import Path
from openpyxl import load_workbook

Expand Down Expand Up @@ -50,6 +52,52 @@ def setup_libreoffice_macro():
return False


def count_formulas(filename):
"""
Count formulas in an Excel file by scanning XML directly.
Much faster than loading the workbook with openpyxl.
"""
formula_count = 0
# Pattern matches <f> tag which indicates a formula.
# Matches <f> or <f ...> (with attributes) or <f/> (empty shared formula)
# Regex looks for <f followed by > or space or /
pattern = re.compile(rb'<f[>\s/]')

try:
with zipfile.ZipFile(filename, 'r') as z:
for name in z.namelist():
if name.startswith('xl/worksheets/sheet') and name.endswith('.xml'):
with z.open(name) as f:
# Chunked reading for memory efficiency
chunk_size = 1024 * 1024 # 1MB
buffer = b''
while True:
chunk = f.read(chunk_size)
if not chunk:
break
buffer += chunk

# Find last '>' to split safely
last_tag_end = buffer.rfind(b'>')
if last_tag_end != -1:
to_process = buffer[:last_tag_end+1]
formula_count += len(pattern.findall(to_process))
buffer = buffer[last_tag_end+1:]
else:
# No '>' found, keep accumulating
pass

# Process remaining buffer
formula_count += len(pattern.findall(buffer))
except Exception as e:
sys.stderr.write(f"Error counting formulas via zip: {e}\n")
# Return 0 on error, or maybe -1 to indicate failure?
# Returning 0 is safer for now as it's just a stat.
return 0

return formula_count


def recalc(filename, timeout=30):
"""
Recalculate formulas in Excel file and report any errors
Expand Down Expand Up @@ -137,20 +185,8 @@ def recalc(filename, timeout=30):
'locations': locations[:20] # Show up to 20 locations
}

# Add formula count for context - also check ALL cells
formula_count = 0
wb_formulas = load_workbook(filename, data_only=False, read_only=True)
try:
for sheet_name in wb_formulas.sheetnames:
ws = wb_formulas[sheet_name]
for row in ws.iter_rows():
for cell in row:
if cell.value and isinstance(cell.value, str) and cell.value.startswith('='):
formula_count += 1
finally:
wb_formulas.close()

result['total_formulas'] = formula_count
# Add formula count for context - optimized
result['total_formulas'] = count_formulas(filename)

return result

Expand Down
96 changes: 96 additions & 0 deletions tests/test_recalc_perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@

import unittest
import os
import sys
from openpyxl import Workbook
from unittest.mock import MagicMock, patch

# Add repo root to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

try:
from skills.xlsx.recalc import count_formulas
except ImportError:
count_formulas = None

class TestRecalcPerf(unittest.TestCase):
def setUp(self):
self.filename = "test_perf_formulas.xlsx"
self.create_test_file(self.filename)

def tearDown(self):
if os.path.exists(self.filename):
os.remove(self.filename)

def create_test_file(self, filename, rows=100, cols=10):
wb = Workbook()
ws = wb.active

# Add formulas
# A1 to A{rows} will have formulas
for r in range(1, rows + 1):
ws[f'A{r}'] = f'=B{r}+C{r}'

# Add values (no formula)
for r in range(1, rows + 1):
ws[f'B{r}'] = 1
ws[f'C{r}'] = 2

wb.save(filename)
# Expected formulas: rows
self.expected_count = rows

def test_count_formulas(self):
if count_formulas is None:
# If function doesn't exist, we can't test it.
# This allows creating the test file before the implementation.
print("count_formulas not found, skipping test")
return

count = count_formulas(self.filename)
self.assertEqual(count, self.expected_count, f"Expected {self.expected_count} formulas, got {count}")

def test_small_chunks(self):
"""Test that formula counting works even when file is read in small chunks (splitting tags)"""
if count_formulas is None:
self.skipTest("count_formulas not implemented yet")

# XML content with formulas split in various ways
# <f> in simple form
# <f t="shared"> with attributes
# <f> split across chunk boundary
xml_content = b'<worksheet><sheetData><row><c><f>SUM(A1)</f></c><c><f t="shared" ref="A1">SUM(A1)</f></c><c><f>A1+1</f></c></row></sheetData></worksheet>'
# Total 3 formulas.

# Mocking zipfile
with patch('zipfile.ZipFile') as MockZip:
mock_zip = MockZip.return_value
mock_zip.__enter__.return_value = mock_zip
mock_zip.namelist.return_value = ['xl/worksheets/sheet1.xml']

mock_file = MagicMock()
mock_file.__enter__.return_value = mock_file
mock_zip.open.return_value = mock_file

# Use small chunk size to force splitting (e.g., 5 bytes)
chunks = []
chunk_size = 5
for i in range(0, len(xml_content), chunk_size):
chunks.append(xml_content[i:i+chunk_size])

iter_chunks = iter(chunks)

def side_effect(size=-1):
try:
return next(iter_chunks)
except StopIteration:
return b''

mock_file.read.side_effect = side_effect

# Filename doesn't matter as we mock ZipFile
count = count_formulas("dummy.xlsx")
self.assertEqual(count, 3)

if __name__ == '__main__':
unittest.main()