From 79be5f685c6bcded73bb6c7ce601b2fc98b8cce3 Mon Sep 17 00:00:00 2001 From: Josh Day Date: Mon, 6 Apr 2026 08:32:59 -0400 Subject: [PATCH 1/5] first pass --- .gitignore | 1 + Project.toml | 4 +- src/XLSX.jl | 4 +- src/cell.jl | 30 ++--- src/cellformat-helpers.jl | 35 +++--- src/cellformats.jl | 27 ++--- src/conditional-format-helpers.jl | 6 +- src/conditional-formats.jl | 59 +++++----- src/formula.jl | 2 +- src/read.jl | 107 +++++++++--------- src/relationship.jl | 14 ++- src/sst.jl | 72 +++++++----- src/stream.jl | 179 +++++++++++++++--------------- src/styles.jl | 20 ++-- src/types.jl | 9 +- src/worksheet.jl | 16 +-- src/write.jl | 40 ++++--- test/runtests.jl | 7 +- 18 files changed, 319 insertions(+), 313 deletions(-) diff --git a/.gitignore b/.gitignore index 62aa63f0..e9048b11 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ Manifest.toml docs/Manifest.toml .vscode *.gz +.claude diff --git a/Project.toml b/Project.toml index 3712e9e0..66dea26f 100644 --- a/Project.toml +++ b/Project.toml @@ -9,7 +9,6 @@ repo = "https://github.com/juliadata/XLSX.jl.git" Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" -OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -30,11 +29,10 @@ Artifacts = "1" CSV = "0.10.15" Colors = "0.13.0" Distributions = "0.25.0" -OrderedCollections = "1" PrecompileTools = "1" Tables = "1" UUIDs = "1.8" -XML = "0.3.8" +XML = "0.4" ZipArchives = "2.5" julia = "1.8" diff --git a/src/XLSX.jl b/src/XLSX.jl index 3443bcce..d93d80eb 100644 --- a/src/XLSX.jl +++ b/src/XLSX.jl @@ -12,7 +12,6 @@ import Tables import Unicode import UUIDs import XML -using OrderedCollections: OrderedDict import ZipArchives import PrecompileTools as PCT # this is a small dependency. 
@@ -39,6 +38,9 @@ export getMergedCells, isMergedCell, getMergedBaseCell, mergeCells const SPREADSHEET_NAMESPACE_XPATH_ARG = "http://schemas.openxmlformats.org/spreadsheetml/2006/main" + +xml_elements(node) = filter(n -> XML.nodetype(n) == XML.Element, XML.children(node)) +xml_root_element(doc) = last(xml_elements(doc)) const EXCEL_MAX_COLS = 16_384 # total columns supported by Excel per sheet const EXCEL_MAX_ROWS = 1_048_576 # total rows supported by Excel per sheet (including headers) diff --git a/src/cell.jl b/src/cell.jl index 64ad605d..478a2b19 100644 --- a/src/cell.jl +++ b/src/cell.jl @@ -68,7 +68,7 @@ end # Extracts the unformatted text from an inlineStr "is" XML element as a XML string. function _build_si_xml(child)::String - inner = join(XML.write.(XML.children(child)), "\n") + inner = join(XML.write.(materialize.(XML.children(child))), "\n") return "\n $inner\n" end @@ -120,7 +120,7 @@ function Cell(c::XML.LazyNode, ws::Worksheet; mylock::Union{ReentrantLock,Nothin if tag == "v" ch = XML.children(child) isempty(ch) && continue - v = XLSX.unescape(XML.value(ch[1])) + v = XML.value(ch[1]) datatype, value = process_tv(wb, t, v, num_style; mylock) elseif tag == "f" if get_xlsxfile(wb).is_writable @@ -141,10 +141,10 @@ function parse_formula_from_element(c_child_element)::AbstractFormula # Extract formula string formula_string = if XML.is_simple(c_child_element) - XLSX.unescape(XML.simple_value(c_child_element)) + XML.simple_value(c_child_element) else text_nodes = filter(x -> XML.nodetype(x) == XML.Text, XML.children(c_child_element)) - isempty(text_nodes) ? "" : XLSX.unescape(XML.value(text_nodes[1])) + isempty(text_nodes) ? "" : XML.value(text_nodes[1]) end a = XML.attributes(c_child_element) @@ -424,29 +424,15 @@ function get_rowcells!(rowcells::Dict{Int,Cell}, row::XML.LazyNode, ws::Workshee return nothing, sst_count end =# - # unthreaded cell extraction is (exceedingly marginally) slower but no lock conflicts introduced. 
- - # debug - # @assert row.tag == "row" "Not a row node" - sst_count = 0 - d = row.depth - - cellnode = XML.next(row) - - while !isnothing(cellnode) && cellnode.depth > d - if cellnode.tag == "c" # This is a cell - cell = Cell(cellnode, ws; mylock) # construct an XLSX.Cell from an XML.LazyNode + for child in XML.children(row) + if XML.tag(child) == "c" + cell = Cell(child, ws; mylock) sst_count += cell.datatype == CT_STRING ? 1 : 0 rowcells[column_number(cell)] = cell end - cellnode = XML.next(cellnode) - end - if !isnothing(cellnode) && cellnode.tag == "row" # have reached the beginning of next row - return cellnode, sst_count - else # no more rows - return nothing, sst_count end + return sst_count end diff --git a/src/cellformat-helpers.jl b/src/cellformat-helpers.jl index 1f1e0a69..84c0cf12 100644 --- a/src/cellformat-helpers.jl +++ b/src/cellformat-helpers.jl @@ -113,8 +113,9 @@ const EXCEL_COLUMN_WIDTH_PADDING = 0.7109375 # function copynode(o::XML.Node) - n = XML.Node(o.nodetype, o.tag, o.attributes, o.value, isnothing(o.children) ? nothing : [copynode(x) for x in o.children]) - return n + attrs = isnothing(o.attributes) ? nothing : copy(o.attributes) + children = isnothing(o.children) ? nothing : XML.Node{String}[copynode(x) for x in o.children] + return XML.Node{String}(o.nodetype, o.tag, attrs, o.value, children) end function do_sheet_names_match(ws::Worksheet, rng::T) where {T<:Union{SheetCellRef,AbstractSheetCellRange}} if ws.name == rng.sheet @@ -140,7 +141,7 @@ function buildNode(tag::String, attributes::Dict{String,Union{Nothing,Dict{Strin if isnothing(attributes[a]) cnode = XML.Element(a) else - cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing) + cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? 
Vector{XML.Node{String}}() : nothing) for (k, v) in attributes[a] cnode[k] = v end @@ -152,7 +153,7 @@ function buildNode(tag::String, attributes::Dict{String,Union{Nothing,Dict{Strin if isnothing(attributes[a]) cnode = XML.Element(a) else - cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing) + cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing) color = XML.Element("color") for (k, v) in attributes[a] if k == "style" && v != "none" @@ -179,7 +180,7 @@ function buildNode(tag::String, attributes::Dict{String,Union{Nothing,Dict{Strin if isnothing(attributes[a]) cnode = XML.Element(a) else - cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing) + cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing) patternfill = XML.Element("patternFill") fgcolor = XML.Element("fgColor") bgcolor = XML.Element("bgColor") @@ -391,14 +392,14 @@ function get_new_formatId(wb::Workbook, format::String)::Int if isnothing(j) # There are no existing custom formats return styles_add_numFmt(wb, format) else - existing_elements_count = length(XML.children(xroot[i][j])) + existing_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_elements_count throw(XLSXError("Wrong number of font elements found: $existing_elements_count. 
Expected $(parse(Int, xroot[i][j]["count"])).")) end format_node = XML.Element("numFmt"; numFmtId=string(existing_elements_count + PREDEFINED_NUMFMT_COUNT), - formatCode=XLSX.escape(format) + formatCode=format ) return styles_add_cell_attribute(wb, format_node, "numFmts") + PREDEFINED_NUMFMT_COUNT @@ -445,7 +446,7 @@ function update_template_xf(ws::Worksheet, allXfNodes::Vector{XML.Node}, existin old_cell_xf = styles_cell_xf(allXfNodes, Int(existing_style.id)) new_cell_xf = copynode(old_cell_xf) if isnothing(new_cell_xf.children) - new_cell_xf=XML.Node(new_cell_xf, alignment) + new_cell_xf = XML.Node{String}(XML.Element, XML.tag(new_cell_xf), new_cell_xf.attributes, nothing, XML.Node{String}[alignment]) elseif length(XML.children(new_cell_xf)) == 0 push!(new_cell_xf, alignment) else @@ -464,13 +465,13 @@ end function styles_add_cell_attribute(wb::Workbook, new_att::XML.Node, att::String)::Int xroot = styles_xmlroot(wb) i, j = get_idces(xroot, "styleSheet", att) - existing_elements_count = length(XML.children(xroot[i][j])) + existing_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_elements_count throw(XLSXError("Wrong number of elements elements found: $existing_elements_count. Expected $(parse(Int, xroot[i][j]["count"])).")) end # Check new_att doesn't duplicate any existing att. If yes, use that rather than create new. - for (k, node) in enumerate(XML.children(xroot[i][j])) + for (k, node) in enumerate(xml_elements(xroot[i][j])) if XML.tag(new_att) == "numFmt" # mustn't compare numFmtId attribute for formats if node["formatCode"] == new_att["formatCode"] return k - 1 # CellDataFormat is zero-indexed @@ -1070,7 +1071,7 @@ function process_uniform_core(f::Function, ws::Worksheet, allXfNodes::Vector{XML if first # Get the attribute of the first cell in the range. newid = f(ws, cellref; kw...) 
new_alignment = getAlignment(ws, cellref).alignment["alignment"] - alignment_node = XML.Node(XML.Element, "alignment", new_alignment, nothing, nothing) + alignment_node = XML.Node{String}(XML.Element, "alignment", isnothing(new_alignment) ? Pair{String,String}[] : Pair{String,String}[k => v for (k,v) in new_alignment], nothing, nothing) first = false else # Apply the same attribute to the rest of the cells in the range. if cell.style == UInt64(0) @@ -1280,12 +1281,12 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; is = parse(str_formatted, XML.Node)[1] # Convert to XML.Node for ease of handling - all_r = filter(z -> z.tag == "r", XML.children(is)) + all_r = filter(z -> XML.tag(z) == "r", XML.children(is)) run_elements = reduce(vcat, [XML.children(z) for z in all_r]) - rPr_elements=filter(z -> z.tag == "rPr", run_elements) # rPr elements + rPr_elements = filter(z -> XML.tag(z) == "rPr", run_elements) # rPr elements t=String[] # text elements - for i in filter(z -> z.tag == "t", run_elements) + for i in filter(z -> XML.tag(z) == "t", run_elements) push!(t, XML.is_simple(i[1]) ? 
XML.simple_value(i[1]) : XML.value(i[1])) end @@ -1297,7 +1298,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; for att in XML.children(rPr) # first copy existing attributes for i in 1:length(atts) - if att.tag == atts[i] + if XML.tag(att) == atts[i] new_rPr[i] = att end end @@ -1332,7 +1333,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; if !isnothing(rPr.children) empty!(rPr.children) foreach(new_rPr) do element - element.tag != "DeleteMe" && push!(rPr.children, element) + XML.tag(element) != "DeleteMe" && push!(rPr.children, element) end end end @@ -1380,7 +1381,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; for r in 1:length(all_r) if t[r] != ")___DeleteMe___(" # signals a merged element to be skipped write(new_r, " \n") - r > inc_first && write(new_r, XML.write(rPr_elements[r-inc_first];depth=3) * "\n") + r > inc_first && write(new_r, XML.write(rPr_elements[r-inc_first]) * "\n") write(new_r, " " *t[r] * "\n") write(new_r, " \n") end diff --git a/src/cellformats.jl b/src/cellformats.jl index 36113d52..44b5568a 100644 --- a/src/cellformats.jl +++ b/src/cellformats.jl @@ -334,7 +334,7 @@ function getFont(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellFont} current_font = font_nodes[fontid_int + 1] # index into filtered list, not raw children # current_font = XML.children(font_elements)[fontid_int+1] # Zero based! font_atts = Dict{String,Union{Dict{String,String},Nothing}}() - for c in XML.children(current_font) + for c in xml_elements(current_font) if isnothing(XML.attributes(c)) || length(XML.attributes(c)) == 0 font_atts[XML.tag(c)] = nothing else @@ -434,13 +434,14 @@ function getBorder(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellBorder applyborder = haskey(cell_style, "applyBorder") ? 
cell_style["applyBorder"] : "0" xroot = styles_xmlroot(wb) border_elements = find_all_nodes("/" * SPREADSHEET_NAMESPACE_XPATH_ARG * ":styleSheet/" * SPREADSHEET_NAMESPACE_XPATH_ARG * ":borders", xroot)[begin] - if parse(Int, border_elements["count"]) != length(XML.children(border_elements)) - throw(XLSXError("Unexpected number of border definitions found : $(length(XML.children(border_elements))). Expected $(parse(Int, border_elements["count"]))")) + border_nodes = xml_elements(border_elements) + if parse(Int, border_elements["count"]) != length(border_nodes) + throw(XLSXError("Unexpected number of border definitions found : $(length(border_nodes)). Expected $(parse(Int, border_elements["count"]))")) end - current_border = XML.children(border_elements)[parse(Int, borderid)+1] # Zero based! + current_border = border_nodes[parse(Int, borderid)+1] # Zero based! diag_atts = XML.attributes(current_border) border_atts = Dict{String,Union{Dict{String,String},Nothing}}() - for side in XML.children(current_border) + for side in xml_elements(current_border) if isnothing(XML.attributes(side)) || length(XML.attributes(side)) == 0 border_atts[XML.tag(side)] = nothing else @@ -460,7 +461,7 @@ function getBorder(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellBorder throw(XLSXError("No direction set for `diagonal` border")) end end - for subc in XML.children(side) # color is a child of a border element + for subc in xml_elements(side) # color is a child of a border element for (k, v) in XML.attributes(subc) border_atts[XML.tag(side)][k] = v end @@ -1357,7 +1358,7 @@ function setAlignment(sh::Worksheet, cellref::CellRef; cell_alignment = getAlignment(wb, cell_style) old_atts = isnothing(cell_alignment) ? 
Dict{String,String}() : cell_alignment.alignment["alignment"] - atts = OrderedDict{String,String}() + atts = Dict{String,String}() _merge_alignment_att(atts, "horizontal", horizontal, old_atts) _merge_alignment_att(atts, "vertical", vertical, old_atts) _merge_alignment_att(atts, "wrapText", wrapText, old_atts, b -> b ? "1" : "0") @@ -1365,7 +1366,7 @@ function setAlignment(sh::Worksheet, cellref::CellRef; _merge_alignment_att(atts, "indent", indent, old_atts) _merge_alignment_att(atts, "textRotation", rotation, old_atts) - alignment_node = XML.Node(XML.Element, "alignment", atts, nothing, nothing) + alignment_node = XML.Node{String}(XML.Element, "alignment", Pair{String,String}[k => v for (k,v) in atts], nothing, nothing) newstyle = update_template_xf(sh, allXfNodes, CellDataFormat(cell.style), alignment_node).id cell.style = newstyle @@ -1507,7 +1508,7 @@ function getFormat(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellFormat isnothing(idx) && throw(XLSXError("No format definition found for numFmtId $numfmtid_int.")) current_format = format_nodes[idx] - atts = Dict{String,String}(k => XLSX.unescape(v) for (k, v) in XML.attributes(current_format)) + atts = Dict{String,String}(k => v for (k, v) in XML.attributes(current_format)) format_atts[XML.tag(current_format)] = atts else @@ -1912,7 +1913,7 @@ function setColumnWidth(ws::Worksheet, rng::CellRange; width::Union{Nothing,Real # Build a map of existing column definitions keyed by min (column index string) col_defs = Dict{String,Dict{String,String}}() - for c in XML.children(sheetdoc[i][j]) + for c in xml_elements(sheetdoc[i][j]) col_defs[c["min"]] = Dict{String,String}(XML.attributes(c)) end @@ -1997,7 +1998,7 @@ function getColumnWidth(ws::Worksheet, cellref::CellRef)::Union{Nothing,Real} if isnothing(j) # There are no existing column formats defined. 
return nothing end - for c in XML.children(sheetdoc[i][j]) + for c in xml_elements(sheetdoc[i][j]) if c["min"] == string(cellref.column_number) if haskey(c, "width") return parse(Float64, c["width"]) @@ -2221,7 +2222,7 @@ function getMergedCells(ws::Worksheet)::Union{Vector{CellRange},Nothing} return nothing end - c = XML.children(sheetdoc[i][j]) + c = xml_elements(sheetdoc[i][j]) if length(c) != parse(Int, sheetdoc[i][j]["count"]) throw(XLSXError("Unexpected number of mergeCells found: $(length(c)). Expected $(sheetdoc[i][j]["count"]).")) end @@ -2455,7 +2456,7 @@ function mergeCells(ws::Worksheet, cr::CellRange) j = l + 1 count = 0 else # There are already some existing merged cells - c = XML.children(sheetdoc[i][j]) + c = xml_elements(sheetdoc[i][j]) count = length(c) if count != parse(Int, sheetdoc[i][j]["count"]) throw(XLSXError("Unexpected number of mergeCells found: $(length(c)). Expected $(sheetdoc[i][j]["count"]).")) diff --git a/src/conditional-format-helpers.jl b/src/conditional-format-helpers.jl index 4e666e3a..8c2aca1f 100644 --- a/src/conditional-format-helpers.jl +++ b/src/conditional-format-helpers.jl @@ -56,7 +56,7 @@ function update_worksheet_cfx!(allcfs, cfx, ws, rng) matchcfs = filter(x -> x["sqref"] == string(rng), allcfs) # Match range with existing conditional formatting blocks. l = length(matchcfs) if l == 0 # No existing conditional formatting blocks for this range so create a new one. - new_cf = XML.Element("conditionalFormatting"; sqref=rng) + new_cf = XML.Element("conditionalFormatting"; sqref=string(rng)) push!(new_cf, cfx) add_cf_to_XML(ws, new_cf) # Add the new conditional formatting block to the worksheet XML. elseif l == 1 # Existing conditional formatting block found for this range so add new rule to that block. 
@@ -181,7 +181,7 @@ function Add_Cf_Dx(wb::Workbook, new_dx::XML.Node)::DxFormat println(XML.write(xroot[i][j])) =# else - existing_dxf_elements_count = length(XML.children(xroot[i][j])) + existing_dxf_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_dxf_elements_count throw(XLSXError("Wrong number of xf elements found: $existing_cellxf_elements_count. Expected $(parse(Int, xroot[i][j]["count"])).")) @@ -189,7 +189,7 @@ function Add_Cf_Dx(wb::Workbook, new_dx::XML.Node)::DxFormat end # Don't reuse duplicates here. Always create new! - existingdx = XML.children(xroot[i][j]) + existingdx = xml_elements(xroot[i][j]) dxfs = unlink(xroot[i][j], ("dxfs", "dxf")) # Create the new Node if length(existingdx) > 0 for c in existingdx diff --git a/src/conditional-formats.jl b/src/conditional-formats.jl index af01cb88..471893b1 100644 --- a/src/conditional-formats.jl +++ b/src/conditional-formats.jl @@ -1450,16 +1450,16 @@ function setCfCellIs(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=()) if isnothing(value) value = all(ismissing.(ws[rng])) ? nothing : string(sum(skipmissing(ws[rng])) / count(!ismissing, ws[rng])) end - cfx = XML.Element("cfRule"; type="cellIs", dxfId=Int(dxid.id)) + cfx = XML.Element("cfRule"; type="cellIs", dxfId=string(Int(dxid.id))) cfx["priority"] = length(old_cf) > 0 ? 
string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end cfx["operator"] = operator - push!(cfx, XML.Element("formula", XML.Text(XLSX.escape(value)))) + push!(cfx, XML.Element("formula", XML.Text(value))) if !isnothing(value2) && operator ∈ ["between", "notBetween"] - push!(cfx, XML.Element("formula", XML.Text(XLSX.escape(value2)))) + push!(cfx, XML.Element("formula", XML.Text(value2))) end update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1542,14 +1542,14 @@ function setCfContainsText(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,An end formula = replace(formula, "__txt__" => value, "__CR__" => string(first(rng))) - cfx = XML.Element("cfRule"; type=type, dxfId=Int(dxid.id)) + cfx = XML.Element("cfRule"; type=type, dxfId=string(Int(dxid.id))) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end cfx["operator"] = operator cfx["text"] = value - push!(cfx, XML.Element("formula", XML.Text(XLSX.escape(formula)))) + push!(cfx, XML.Element("formula", XML.Text(formula))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1616,7 +1616,7 @@ function setCfTop10(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=()): percent = "" bottom = "" - cfx = XML.Element("cfRule"; type="top10", dxfId=Int(dxid.id)) + cfx = XML.Element("cfRule"; type="top10", dxfId=string(Int(dxid.id))) if operator == "topN" elseif operator == "topN%" percent = "1" @@ -1700,25 +1700,25 @@ function setCfAboveAverage(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,An dxid = Add_Cf_Dx(wb, new_dx) if operator == "aboveAverage" - cfx = XML.Element("cfRule"; type=operator, dxfId=Int(dxid.id), priority="1") + cfx = XML.Element("cfRule"; type=operator, dxfId=string(Int(dxid.id)), priority="1") elseif operator == "aboveEqAverage" - cfx = XML.Element("cfRule"; type="aboveAverage", 
dxfId=Int(dxid.id), priority="1", equalAverage="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", equalAverage="1") elseif operator == "plus1StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", stdDev="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", stdDev="1") elseif operator == "plus2StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", stdDev="2") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", stdDev="2") elseif operator == "plus3StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", stdDev="3") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", stdDev="3") elseif operator == "belowAverage" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0") elseif operator == "belowEqAverage" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", equalAverage="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", equalAverage="1") elseif operator == "minus1StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", stdDev="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", stdDev="1") elseif operator == "minus2StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", stdDev="2") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", stdDev="2") elseif operator == "minus3StdDev" - cfx = 
XML.Element("cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", stdDev="3") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", stdDev="3") else throw(XLSXError("Invalid operator: $operator. Valid options are: `aboveAverage`, `aboveEqAverage`, `plus1sStdDev`, `plus2StdDev`, `plus3StdDev`, `belowAverage`, `belowEqAverage`, `minus1StdDev`, `minus2StdDev`, `minus3StdDev`.")) end @@ -1794,14 +1794,14 @@ function setCfTimePeriod(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any} new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("cfRule"; type="timePeriod", dxfId=Int(dxid.id)) + cfx = XML.Element("cfRule"; type="timePeriod", dxfId=string(Int(dxid.id))) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end cfx["timePeriod"] = operator - push!(cfx, XML.Element("formula", XML.Text(XLSX.escape(formula)))) + push!(cfx, XML.Element("formula", XML.Text(formula))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1875,12 +1875,12 @@ function setCfContainsBlankErrorUniqDup(ws::Worksheet, rng::CellRange; allkws::D new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("cfRule"; type=operator, dxfId=Int(dxid.id)) + cfx = XML.Element("cfRule"; type=operator, dxfId=string(Int(dxid.id))) cfx["priority"] = length(old_cf) > 0 ? 
string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end - formula != "" && push!(cfx, XML.Element("formula", XML.Text(XLSX.escape(formula)))) + formula != "" && push!(cfx, XML.Element("formula", XML.Text(formula))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1941,13 +1941,13 @@ function setCfFormula(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("cfRule"; type="expression", dxfId=Int(dxid.id)) + cfx = XML.Element("cfRule"; type="expression", dxfId=string(Int(dxid.id))) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end - push!(cfx, XML.Element("formula", XML.Text("(" * XLSX.escape(uppercase_unquoted(formula)) * ")"))) + push!(cfx, XML.Element("formula", XML.Text("(" * uppercase_unquoted(formula) * ")"))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -2013,7 +2013,7 @@ function setCfColorScale(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any} let new_pr, new_cf - new_pr = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 + new_pr = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : "1" if isnothing(colorscale) @@ -2036,7 +2036,7 @@ function setCfColorScale(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any} do_sheet_names_match(ws, SheetCellRef(val)) val = string(SheetCellRef(val).cellref) end - val = XLSX.escape(uppercase_unquoted(val)) + val = uppercase_unquoted(val) end end @@ -2145,7 +2145,7 @@ function setCfIconSet(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() let new_pr, new_cf - new_pr = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 + new_pr = length(old_cf) > 0 ? 
string(maximum([last(x).priority for x in values(old_cf)]) + 1) : "1" isnothing(min_type) || min_type in ["percentile", "percent", "num", "formula"] || throw(XLSXError("Invalid min_type: $min_type. Valid options are: percentile, percent, num, formula.")) (!isnothing(min_type) && min_type == "formula") || isnothing(min_val) || is_valid_fixed_cellname(min_val) || is_valid_fixed_sheet_cellname(min_val) || !isnothing(tryparse(Float64, min_val)) || throw(XLSXError("Invalid min_val: `$min_val`. Valid options (unless min_type is `formula`) are a CellRef (e.g. `\$A\$1`) or a number.")) @@ -2162,7 +2162,7 @@ function setCfIconSet(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() do_sheet_names_match(ws, SheetCellRef(val)) val = string(SheetCellRef(val).cellref) end - val = XLSX.escape(uppercase_unquoted(val)) + val = uppercase_unquoted(val) end end if !haskey(iconsets, iconset) @@ -2222,8 +2222,9 @@ function setCfIconSet(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() else c = XML.Element("xm:f", XML.Text(val)) end - if isnothing(XML.children(cfx[1][i+1])) - cfx[1][i+1] = XML.Node(cfx[1][i+1], c) + if isempty(XML.children(cfx[1][i+1])) + old = cfx[1][i+1] + cfx[1][i+1] = XML.Node{String}(XML.Element, XML.tag(old), old.attributes, nothing, XML.Node{String}[c]) else cfx[1][i+1][1] = c end @@ -2381,7 +2382,7 @@ function setCfDataBar(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() do_sheet_names_match(ws, SheetCellRef(val)) val = string(SheetCellRef(val).cellref) end - val = XLSX.escape(uppercase_unquoted(val)) + val = uppercase_unquoted(val) end end if !haskey(databars, databar) diff --git a/src/formula.jl b/src/formula.jl index f6a1fefe..23143593 100644 --- a/src/formula.jl +++ b/src/formula.jl @@ -662,7 +662,7 @@ function get_external_workbook_path(xf::XLSXFile, id::Int) haskey(atts, "r:id") || throw(XLSXError("Something wrong here!")) rId = atts["r:id"] # now need a second lookup of this further r:id - altUrls = XML.children(xmlroot(xf, 
"xl/externalLinks/_rels/$(basename(rel)).rels")[end]) + altUrls = xml_elements(xml_root_element(xmlroot(xf, "xl/externalLinks/_rels/$(basename(rel)).rels"))) for c in altUrls atts = XML.attributes(c) if haskey(atts, "Id") && atts["Id"] == rId diff --git a/src/read.jl b/src/read.jl index 2d815492..c8fa7e33 100644 --- a/src/read.jl +++ b/src/read.jl @@ -448,7 +448,7 @@ function parse_relationships!(xf::XLSXFile) # package level relationships xroot = get_package_relationship_root(xf) - for el in XML.children(xroot) + for el in xml_elements(xroot) push!(xf.relationships, Relationship(el)) end isempty(xf.relationships) && throw(XLSXError("Relationships not found in _rels/.rels!")) @@ -456,7 +456,7 @@ function parse_relationships!(xf::XLSXFile) # workbook level relationships wb = get_workbook(xf) xroot = get_workbook_relationship_root(xf) - for el in XML.children(xroot) + for el in xml_elements(xroot) push!(wb.relationships, Relationship(el)) end isempty(wb.relationships) && throw(XLSXError("Relationships not found in xl/_rels/workbook.xml.rels")) @@ -466,7 +466,7 @@ end # Updates xf.workbook from xf.data[\"xl/workbook.xml\"] function parse_workbook!(xf::XLSXFile) - xroot = xmlroot(xf,"xl/workbook.xml")[end] + xroot = xml_root_element(xmlroot(xf,"xl/workbook.xml")) chn = XML.children(xroot) XML.tag(xroot) != "workbook" && throw(XLSXError("Malformed xl/workbook.xml. Root node name should be 'workbook'. 
Got '$(XML.tag(xroot))'.")) @@ -505,7 +505,7 @@ function parse_workbook!(xf::XLSXFile) for node in chn if XML.tag(node) == "sheets" - for sheet_node in XML.children(node) + for sheet_node in xml_elements(node) XML.tag(sheet_node) != "sheet" && throw(XLSXError("Unsupported node $(XML.tag(sheet_node)) in node $(XML.tag(node)) in 'xl/workbook.xml'.")) worksheet = Worksheet(xf, sheet_node) push!(sheets, worksheet) @@ -612,7 +612,7 @@ function get_wb_ext_refs(xf::XLSXFile) xroot = xmlroot(xf, "xl/workbook.xml") i, j = get_idces(xroot, "workbook", "externalReferences") if !isnothing(j) - for (i, ref) in enumerate(XML.children(xroot[i][j])) + for (i, ref) in enumerate(xml_elements(xroot[i][j])) ext_refs[i] = ref["r:id"] end end @@ -622,9 +622,9 @@ end # delete Override PartName=calcChain since this was never loaded (#31) function remove_calcChain!(xf::XLSXFile) xf.data["[Content_Types].xml"] - ctype_root = xmlroot(xf, "[Content_Types].xml")[end] + ctype_root = xml_root_element(xmlroot(xf, "[Content_Types].xml")) for (i, c) in enumerate(XML.children(ctype_root)) - if c.tag == "Override" && haskey(c.attributes, "PartName") && c.attributes["PartName"]=="/xl/calcChain.xml" + if XML.tag(c) == "Override" && haskey(c, "PartName") && c["PartName"]=="/xl/calcChain.xml" deleteat!(ctype_root.children, i) break end @@ -661,44 +661,40 @@ function strip_bom_and_lf!(bytes::Vector{UInt8}) end end -function skipNode(r::XML.Raw, skipnode::String) # separate rows or ssts to speed up reading of large files -# new = Vector{UInt8}() # original data with or node removed -# skipped = Vector{UInt8}() # just the or node and its children - new = IOBuffer() # original data with or node removed - skipped = IOBuffer() # just the or node and its children - n = XML.next(r) - write(new, n.data[n.pos:n.pos+n.len]) - - while first(XML.get_name(n.data, n.pos)) != skipnode # Retain everything before the or node - n = XML.next(n) - write(new, n.data[n.pos:n.pos+n.len]) - end +function 
splitNode(xml_str::String, skipnode::String) + doc = parse(xml_str, XML.Node) + root = xml_root_element(doc) - if skipnode == "sheetData" # Add parents for or elements to the excerpted data - write(skipped, "") - write(skipped, "") - elseif skipnode == "sst" - write(skipped, "") - else - throw(XLSXError("Unknown skipnode $skipnode.")) + # Find the target node and extract its content + target_idx = nothing + for (i, child) in enumerate(XML.children(root)) + if XML.tag(child) == skipnode + target_idx = i + break + end end - sdepth = n.depth - n = XML.next(n) - while n !== nothing && n.depth > sdepth # Put all children of or into the excerpted data - write(skipped, n.data[n.pos:n.pos+n.len]) - n = XML.next(n) + + if isnothing(target_idx) + return doc, "" end - while n !== nothing # Retain everything after the or node - write(new, n.data[n.pos:n.pos+n.len]) - n = XML.next(n) + + target = root[target_idx] + + # Build wrapper XML from the target element and its children + skipped = XML.write(target) + if skipnode == "sheetData" + skipped = "" * skipped * "" end - if skipnode == "sheetData" # close parents for or elements in the excerpted data - write(skipped, "") - write(skipped, "") - elseif skipnode == "sst" - write(skipped, "") + + # Replace with empty self-closing element (preserving attributes) + attrs = XML.attributes(target) + if isnothing(attrs) || isempty(attrs) + root[target_idx] = XML.Element(skipnode) + else + empty!(target.children) end - return take!(new), take!(skipped) + + return doc, skipped end function stream_files(xf::XLSXFile, zip_io::ZipArchives.ZipReader; pass::Int, channel_size::Int=1 << 8) @@ -763,7 +759,8 @@ function load_files!(xf::XLSXFile, zip_io::ZipArchives.ZipReader; pass::Int) rid = get_relationship_id_by_target(wb, file.name) for sheet in wb.sheets if sheet.relationship_id == rid - first_cache_fill!(sheet, XML.LazyNode(file.raw), Threads.nthreads()) + lznode = parse(file.raw, XML.LazyNode) + first_cache_fill!(sheet, lznode, 
Threads.nthreads()) end end end @@ -798,15 +795,16 @@ function process_file(zip_io::ZipArchives.ZipReader, filename::String) try bytes = ZipArchives.zip_readentry(zip_io, filename) if !startswith(filename, "customXml") && (endswith(filename, ".xml") || endswith(filename, ".rels")) - if occursin(r"xl/worksheets/sheet\d+\.xml|xl/sharedStrings\.xml", filename) - strip_bom_and_lf!(bytes) - skipnode = filename == "xl/sharedStrings.xml" ? "sst" : "sheetData" - f, s = skipNode(XML.Raw(bytes), skipnode) # and elements can be very numerous in large files, so split out and keep as Raw XML data for speed - node = XML.Node(XML.Raw(f)) - raw = XML.Raw(s) + strip_bom_and_lf!(bytes) + xml_str = String(bytes) + if filename == "xl/sharedStrings.xml" + node = parse(xml_str, XML.Node) + raw = xml_str + elseif occursin(r"xl/worksheets/sheet\d+\.xml", filename) + node, raw = splitNode(xml_str, "sheetData") + raw = isempty(raw) ? nothing : raw else - strip_bom_and_lf!(bytes) - node = XML.Node(XML.Raw(bytes)) + node = parse(xml_str, XML.Node) end else bin = bytes @@ -833,12 +831,13 @@ function internal_xml_file_read(xf::XLSXFile, zip_io::Union{Nothing,ZipArchives. try bytes = ZipArchives.zip_readentry(zip_io, filename) strip_bom_and_lf!(bytes) - if occursin(r"xl/worksheets/sheet\d+\.xml|xl/sharedStrings\.xml", filename) - skipnode = filename == "xl/sharedStrings.xml" ? 
"sst" : "sheetData" - f, _ = skipNode(XML.Raw(bytes), skipnode) # and elements can be very numerous in large files, so split out and keep as Raw XML data for speed - xf.data[filename] = XML.Node(XML.Raw(f)) + xml_str = String(bytes) + if filename == "xl/sharedStrings.xml" + xf.data[filename] = parse(xml_str, XML.Node) + elseif occursin(r"xl/worksheets/sheet\d+\.xml", filename) + xf.data[filename], _ = splitNode(xml_str, "sheetData") else - xf.data[filename] = XML.Node(XML.Raw(bytes)) + xf.data[filename] = parse(xml_str, XML.Node) end xf.files[filename] = true # set file as read catch err diff --git a/src/relationship.jl b/src/relationship.jl index 3720f65b..0e73839f 100644 --- a/src/relationship.jl +++ b/src/relationship.jl @@ -58,7 +58,7 @@ function has_relationship_by_type(wb::Workbook, _type_::String)::Bool end function get_package_relationship_root(xf::XLSXFile)::XML.Node - xroot = xmlroot(xf, "_rels/.rels")[end] + xroot = xml_root_element(xmlroot(xf, "_rels/.rels")) XML.tag(xroot) != "Relationships" && throw(XLSXError("Malformed XLSX file $(xf.source). _rels/.rels root node name should be `Relationships`. Found $(XML.tag(xroot)).")) if ("" => "http://schemas.openxmlformats.org/package/2006/relationships") ∉ get_namespaces(xroot) throw(XLSXError("Unexpected namespace at workbook relationship file: `$(get_namespaces(xroot))`.")) @@ -67,7 +67,7 @@ function get_package_relationship_root(xf::XLSXFile)::XML.Node end function get_workbook_relationship_root(xf::XLSXFile)::XML.Node - xroot = xmlroot(xf, "xl/_rels/workbook.xml.rels")[end] + xroot = xml_root_element(xmlroot(xf, "xl/_rels/workbook.xml.rels")) XML.tag(xroot) != "Relationships" && throw(XLSXError("Malformed XLSX file $(xf.source). xl/_rels/workbook.xml.rels root node name should be `Relationships`. 
Found $(XML.tag(xroot)).")) if ("" => "http://schemas.openxmlformats.org/package/2006/relationships") ∉ get_namespaces(xroot) throw(XLSXError("Unexpected namespace at workbook relationship file: `$(get_namespaces(xroot))`.")) @@ -114,15 +114,17 @@ function delete_relationships!(xf::XLSXFile, rel::Relationship) #TODO renumber worksheet files in relationships - if necessary. xroot = xmlroot(xf, "xl/_rels/workbook.xml.rels") + root_el = xml_root_element(xroot) - c=XML.children(xroot[end]) - d = findfirst(r -> r["Target"] == rel.Target, c) + c=XML.children(root_el) + d = findfirst(r -> XML.nodetype(r) == XML.Element && r["Target"] == rel.Target, c) deleteat!(c, d) new_rels=XML.Element("Relationships", xmlns="http://schemas.openxmlformats.org/package/2006/relationships") - for child in c + for child in xml_elements(root_el) push!(new_rels, child) end - xroot[end]=new_rels + root_idx = findfirst(n -> XML.nodetype(n) == XML.Element, XML.children(xroot)) + xroot[root_idx]=new_rels xf.data["xl/_rels/workbook.xml.rels"]=xroot end diff --git a/src/sst.jl b/src/sst.jl index dd256f25..63051724 100644 --- a/src/sst.jl +++ b/src/sst.jl @@ -24,7 +24,7 @@ function create_new_sst(wb::Workbook, sst::SharedStringTable) add_relationship!(wb, "sharedStrings.xml", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings") # add Content Type - ctype_root = xmlroot(get_xlsxfile(wb), "[Content_Types].xml")[end] + ctype_root = xml_root_element(xmlroot(get_xlsxfile(wb), "[Content_Types].xml")) XML.tag(ctype_root) != "Types" && throw(XLSXError("Something wrong here!")) override_node = XML.Element("Override"; ContentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml", @@ -106,50 +106,74 @@ function add_shared_string!(wb::Workbook, str_unformatted::AbstractString; myloc end function sst_load!(workbook::Workbook) - chunksize=1000 + chunksize = 1000 sst = get_sst(workbook) if !sst.is_loaded relationship_type = 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" if has_relationship_by_type(workbook, relationship_type) - sst_chan = stream_ssts(open_internal_file_stream(get_xlsxfile(workbook), "xl/sharedStrings.xml")[end], chunksize) + doc = open_internal_file_stream(get_xlsxfile(workbook), "xl/sharedStrings.xml") + sst_root = xml_root_element(doc) # element + sst_chan = stream_ssts(sst_root, chunksize) load_sst_table!(workbook, sst_chan, Threads.nthreads()) init_sst_index(sst) - + return end throw(XLSXError("Shared Strings Table not found for this workbook.")) end end -@inline _is_tag(n::String, tag::String) = n == tag -@inline _is_tag(n::Nothing, tag::String) = false - function produce_sstchunks(out, n, ssts, chunksize) - i = 0 # Position within current chunk - global_idx = 0 # Global position in SST table - - while !isnothing(n) - if _is_tag(n.tag, "si") +function produce_sstchunks(out, sst_root::XML.LazyNode, ssts, chunksize) + i = 0 + global_idx = 0 + + for child in XML.children(sst_root) + if XML.tag(child) == "si" i += 1 global_idx += 1 - ssts[i] = SstToken(n, global_idx) # ← Use global index + ssts[i] = SstToken(child, global_idx) end if i >= chunksize put!(out, copy(ssts)) - i = 0 # Reset chunk position, but global_idx keeps going + i = 0 end - n = XML.next(n) end if i > 0 put!(out, copy(ssts[1:i])) end end -function stream_ssts(n::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) - n = XML.next(n) +function stream_ssts(sst_root::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) ssts = Vector{SstToken}(undef, chunksize) Channel{Vector{SstToken}}(channel_size) do out - produce_sstchunks(out, n, ssts, chunksize) + produce_sstchunks(out, sst_root, ssts, chunksize) + end +end + +# Convert a LazyNode to a Node for serialization +function materialize(ln::XML.LazyNode)::XML.Node{String} + nt = XML.nodetype(ln) + if nt in (XML.Text, XML.CData, XML.Comment, XML.DTD) + return XML.Node{String}(nt, nothing, nothing, XML.value(ln), 
nothing) + elseif nt == XML.Declaration + a = XML.attributes(ln) + attrs = isnothing(a) ? nothing : Pair{String,String}[p for p in a] + return XML.Node{String}(nt, nothing, attrs, nothing, nothing) + elseif nt == XML.ProcessingInstruction + return XML.Node{String}(nt, XML.tag(ln), nothing, XML.value(ln), nothing) + elseif nt == XML.Element + a = XML.attributes(ln) + attrs = isnothing(a) ? nothing : Pair{String,String}[p for p in a] + ch = XML.children(ln) + children = isempty(ch) ? nothing : XML.Node{String}[materialize(c) for c in ch] + return XML.Node{String}(nt, XML.tag(ln), attrs, nothing, children) + elseif nt == XML.Document + ch = XML.children(ln) + children = isempty(ch) ? nothing : XML.Node{String}[materialize(c) for c in ch] + return XML.Node{String}(nt, nothing, nothing, nothing, children) + else + error("Unknown node type: $nt") end end @@ -159,11 +183,9 @@ function process_sst(sst::SstToken) if XML.nodetype(el) != XML.Text XML.tag(el) != "si" && throw(XLSXError("Unsupported node $(XML.tag(el)) in sst table.")) - sst = Sst(XML.write(el), i) + sst = Sst(XML.write(materialize(el)), i) return sst - end - end function load_sst_table!(wb::Workbook, chan::Channel, nthreads::Int) @@ -217,7 +239,7 @@ end function unformatted_text(el::XML.LazyNode) :: String io = IOBuffer() gather_strings!(io, el) - s = XLSX.unescape(String(take!(io))) + s = String(take!(io)) return s end @@ -256,7 +278,7 @@ end @inline function sst_unformatted_string(wb::Workbook, index::Int64)::String sst_load!(wb) uss = get_sst(wb).shared_strings[index+1] - return unformatted_text(parse(XML.LazyNode, uss)) + return unformatted_text(parse(uss, XML.LazyNode)) end @inline sst_unformatted_string(xl::XLSXFile, index::Int64) :: String = sst_unformatted_string(get_workbook(xl), index) @@ -633,8 +655,8 @@ end # Create a RichTextString from a shared string with multiple runs (or nothing if a simple text) function getRichTextString(xml_string::String)::Union{RichTextString, Nothing} - doc = 
parse(XML.Node, xml_string) - si = doc[end] + doc = parse(xml_string, XML.Node) + si = xml_root_element(doc) # Check for rich text runs elements runs = [child for child in XML.children(si) if XML.tag(child) == "r"] diff --git a/src/stream.jl b/src/stream.jl index 06e7004d..f395a6aa 100644 --- a/src/stream.jl +++ b/src/stream.jl @@ -52,87 +52,86 @@ end @inline function open_internal_file_stream(xf::XLSXFile, filename::String) :: XML.LazyNode !internal_xml_file_exists(xf, filename) && throw(XLSXError("Couldn't find $filename in $(xf.source).")) -# if xf.use_cache_for_sheet_data || (xf.source isa IO) if xf.source isa IO seekstart(xf.source) zip_io = ZipArchives.ZipReader(read(xf.source)) else - zip_io = ZipArchives.ZipReader(FileArray(abspath(xf.source))) # FileArray is marginally slower than mmap -# zip_io = ZipArchives.ZipReader(Mmap.mmap(abspath(xf.source))) # but Mmap is unreliable : https://discourse.julialang.org/t/struggling-to-use-mmap-with-ziparchives/129839 + zip_io = ZipArchives.ZipReader(FileArray(abspath(xf.source))) end - return XML.LazyNode(XML.Raw(ZipArchives.zip_readentry(zip_io, filename))) + return parse(String(ZipArchives.zip_readentry(zip_io, filename)), XML.LazyNode) end +# Collect all row LazyNodes from a worksheet's sheetData element. +function _collect_row_nodes(doc::XML.LazyNode) + root = xml_root_element(doc) + XML.tag(root) != "worksheet" && throw(XLSXError("Expecting to find a worksheet node. Found a $(XML.tag(root)).")) + + # Find sheetData + sheetdata = nothing + for child in XML.children(root) + if XML.tag(child) == "sheetData" + sheetdata = child + break + end + end + sheetdata === nothing && throw(XLSXError("No `sheetData` node found in worksheet")) + + # Collect row nodes + return XML.LazyNode[child for child in XML.children(sheetdata) if XML.tag(child) == "row"] +end + # Creates an iterator for row elements in the Worksheet's XML. 
function Base.iterate(itr::SheetRowStreamIterator) ws = get_worksheet(itr) target_file = get_relationship_target_by_id("xl", get_workbook(ws), ws.relationship_id) - sheetnode = open_internal_file_stream(get_xlsxfile(ws), target_file) # worksheet target files are LazyNodes - - length(sheetnode) <= 0 && throw(XLSXError("Couldn't open reader for Worksheet $(ws.name).")) - XML.tag(sheetnode[end]) != "worksheet" && throw(XLSXError("Expecting to find a worksheet node.: Found a $(XML.tag(sheetnode[end])).")) - - sheetnode=XML.next(sheetnode) - - while XML.tag(sheetnode) != "sheetData" # Check for `sheetData` - sheetnode = XML.next(sheetnode) - sheetnode === nothing && throw(XLSXError("No `sheetData` node found in worksheet")) - end + doc = open_internal_file_stream(get_xlsxfile(ws), target_file) - XML.depth(sheetnode) != 2 && throw(XLSXError("Malformed Worksheet \"$(ws.name)\": unexpected node depth for sheetData node: $(XML.depth(sheetnode)).")) + length(doc) <= 0 && throw(XLSXError("Couldn't open reader for Worksheet $(ws.name).")) - rownode=XML.next(sheetnode) + row_nodes = _collect_row_nodes(doc) + isempty(row_nodes) && return nothing - while XML.tag(rownode) != "row" # Check for at least one `row` - rownode = XML.next(rownode) - rownode === nothing && return nothing # no rows found - end - - # rownode is now the first row - a = XML.attributes(rownode) # get row number and row height (if specified) + # Process first row + rownode = row_nodes[1] + a = XML.attributes(rownode) current_row = parse(Int, a["r"]) current_row_ht = haskey(a, "ht") ? 
parse(Float64, a["ht"]) : nothing - # collect all cells in this row rowcells = Dict{Int, Cell}() - mylock=ReentrantLock() - next_rownode, sst_count = get_rowcells!(rowcells, rownode, ws; mylock) # update rowcells in place - + mylock = ReentrantLock() + sst_count = get_rowcells!(rowcells, rownode, ws; mylock) itr.sheet.sst_count += sst_count - sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) # create the sheet_row - - # debug -# @assert sheetnode.raw.data == next_rownode.raw.data "LazyNode data don't match" + sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) - return sheet_row, SheetRowStreamIteratorState(next_rownode, rowcells, mylock) + return sheet_row, SheetRowStreamIteratorState(row_nodes, 2, rowcells, mylock) end function Base.iterate(itr::SheetRowStreamIterator, state::SheetRowStreamIteratorState) ws = get_worksheet(itr) - rownode = state.next_rownode rowcells = state.rowcells mylock = state.lock empty!(rowcells) - if rownode === nothing # there is no next_rownode - all rows processed + if state.row_index > length(state.row_nodes) return nothing end - # get row number and row height (if specified) + rownode = state.row_nodes[state.row_index] + state.row_index += 1 + a = XML.attributes(rownode) current_row = parse(Int, a["r"]) current_row_ht = haskey(a, "ht") ? 
parse(Float64, a["ht"]) : nothing - # collect all cells in this row - next_rownode, sst_count = get_rowcells!(rowcells, rownode, ws; mylock) # update rowcells in place + sst_count = get_rowcells!(rowcells, rownode, ws; mylock) itr.sheet.sst_count += sst_count - sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) # create the sheet_row + sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) - return sheet_row, SheetRowStreamIteratorState(next_rownode, rowcells, mylock) + return sheet_row, SheetRowStreamIteratorState(state.row_nodes, state.row_index, rowcells, mylock) end # @@ -306,28 +305,27 @@ end Base.length(r::WorksheetCache)=length(r.cells) #--------------------------------------------------------------------- Fill cache on first read (multi-threaded) -function produce_rowchunks!(out, n, rows, chunksize) - pos=0 - while !isnothing(n) - if _is_tag(n.tag, "row") +function produce_rowchunks!(out, sheetdata::XML.LazyNode, rows, chunksize) + pos = 0 + for child in XML.children(sheetdata) + if XML.tag(child) == "row" pos += 1 - rows[pos] = n + rows[pos] = child end if pos >= chunksize put!(out, copy(rows)) - pos=0 + pos = 0 end - n = XML.next(n) end - if pos>0 # handle last incomplete chunk + if pos > 0 put!(out, copy(@view rows[1:pos])) end end -function stream_rows(n::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) +function stream_rows(sheetdata::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) rows = Vector{XML.LazyNode}(undef, chunksize) Channel{Vector{XML.LazyNode}}(channel_size) do out - produce_rowchunks!(out, n, rows, chunksize) + produce_rowchunks!(out, sheetdata, rows, chunksize) end end @@ -339,12 +337,12 @@ function process_row(row::XML.LazyNode, handled_attributes::Set{String}, ws::Wor current_row_ht = haskey(atts, "ht") ? parse(Float64, atts["ht"]) : nothing row_num = haskey(atts, "r") ? 
parse(Int, atts["r"]) : nothing row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) - unhandled_attributes = Dict(filter(attr -> !in(first(attr), handled_attributes), atts)) + unhandled_attributes = Dict{String,String}(k => v for (k, v) in atts if !in(k, handled_attributes)) end # Process cells rowcells = Dict{Int,Cell}() - _, sst_count = get_rowcells!(rowcells, row, ws; mylock) + sst_count = get_rowcells!(rowcells, row, ws; mylock) return sst_count, SheetRow(ws, row_num, current_row_ht, rowcells), unhandled_attributes @@ -354,15 +352,15 @@ function first_cache_fill!(ws::Worksheet, lznode::XML.LazyNode, nthreads::Int) chunksize = 1000 handled_attributes = Set{String}(["r", "spans", "ht", "customHeight"]) unhandled_attributes = Dict{Int,Dict{String,String}}() - + if ws.cache === nothing ws.cache = WorksheetCache(ws) else throw(XLSXError("Expecting empty cache but cache not empty!")) end - + sheet_rows = Channel{Vector{Tuple{Int, SheetRow, Dict{String,String}}}}(1 << 8) - + consumer = @async begin sst_total = 0 for rows in sheet_rows @@ -377,20 +375,30 @@ function first_cache_fill!(ws::Worksheet, lznode::XML.LazyNode, nthreads::Int) ws.sst_count = sst_total ws.unhandled_attributes = isempty(unhandled_attributes) ? 
nothing : unhandled_attributes end - - streamed_rows = stream_rows(lznode, chunksize) + + # Navigate to sheetData element + root = xml_root_element(lznode) + sheetdata = nothing + for child in XML.children(root) + if XML.tag(child) == "sheetData" + sheetdata = child + break + end + end + sheetdata === nothing && throw(XLSXError("No `sheetData` node found in worksheet")) + + streamed_rows = stream_rows(sheetdata, chunksize) mylock = ReentrantLock() - + @sync for _ in 1:nthreads Threads.@spawn begin for rows in streamed_rows - # rows is already a chunk - just process it processed = [process_row(row, handled_attributes, ws, mylock) for row in rows] put!(sheet_rows, processed) end end end - + close(sheet_rows) wait(consumer) @@ -401,39 +409,32 @@ end # Materialise specific rows from a worksheet.xml file into SheetRows # (faster than using eachrow which materialises every row). function match_rows(ws::Worksheet, rows_to_match::Vector{Int})::Vector{SheetRow} - matched_rows=Vector{SheetRow}() + matched_rows = Vector{SheetRow}() sort!(rows_to_match) - i=1 - l=length(rows_to_match) - + i = 1 + l = length(rows_to_match) + target_file = get_relationship_target_by_id("xl", get_workbook(ws), ws.relationship_id) - lznode = open_internal_file_stream(get_xlsxfile(ws), target_file) - - n = XML.next(lznode) - mylock=ReentrantLock() - while !isnothing(n) - if n.tag == "row" # find each row - atts = XML.attributes(n) - if !isnothing(atts) - row_num = haskey(atts, "r") ? parse(Int, atts["r"]) : nothing - end - row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) - if !isnothing(row_num) && row_num == rows_to_match[i] # process matching rows into SheetRows - current_row_ht = haskey(atts, "ht") ? 
parse(Float64, atts["ht"]) : nothing - - # Process cells - rowcells = Dict{Int,Cell}() - n, _ = get_rowcells!(rowcells, n, ws; mylock) - - sheetrow = SheetRow(ws, row_num, current_row_ht, rowcells) - push!(matched_rows, sheetrow) - i+=1 - i>l && break # stop once all rows matched - continue - end + doc = open_internal_file_stream(get_xlsxfile(ws), target_file) + row_nodes = _collect_row_nodes(doc) + + mylock = ReentrantLock() + for n in row_nodes + atts = XML.attributes(n) + row_num = !isnothing(atts) && haskey(atts, "r") ? parse(Int, atts["r"]) : nothing + row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) + if row_num == rows_to_match[i] + current_row_ht = haskey(atts, "ht") ? parse(Float64, atts["ht"]) : nothing + + rowcells = Dict{Int,Cell}() + get_rowcells!(rowcells, n, ws; mylock) + + sheetrow = SheetRow(ws, row_num, current_row_ht, rowcells) + push!(matched_rows, sheetrow) + i += 1 + i > l && break end - n=XML.next(n) end return matched_rows diff --git a/src/styles.jl b/src/styles.jl index ba8e7180..a9d8ee5d 100644 --- a/src/styles.jl +++ b/src/styles.jl @@ -80,10 +80,10 @@ function styles_xmlroot(workbook::Workbook) styles_root = xmlroot(get_xlsxfile(workbook), styles_target) # check root node name for styles.xml - if get_default_namespace(styles_root[end]) != SPREADSHEET_NAMESPACE_XPATH_ARG - throw(XLSXError("Unsupported styles XML namespace $(get_default_namespace(styles_root[end])).")) + if get_default_namespace(xml_root_element(styles_root)) != SPREADSHEET_NAMESPACE_XPATH_ARG + throw(XLSXError("Unsupported styles XML namespace $(get_default_namespace(xml_root_element(styles_root))).")) end - XML.tag(styles_root[end]) != "styleSheet" && throw(XLSXError("Malformed package. Expected root node named `styleSheet` in `styles.xml`.")) + XML.tag(xml_root_element(styles_root)) != "styleSheet" && throw(XLSXError("Malformed package. 
Expected root node named `styleSheet` in `styles.xml`.")) workbook.styles_xroot = styles_root else throw(XLSXError("Styles not found for this workbook.")) @@ -138,11 +138,11 @@ function styles_add_numFmt(wb::Workbook, format_code::AbstractString)::Integer numfmts = numfmts[1] end - existing_numFmt_elements_count = length(XML.children(numfmts)) + existing_numFmt_elements_count = length(xml_elements(numfmts)) fmt_code = existing_numFmt_elements_count + PREDEFINED_NUMFMT_COUNT new_fmt = XML.Element("numFmt"; - numFmtId=fmt_code, - formatCode=XLSX.escape(format_code) + numFmtId=string(fmt_code), + formatCode=format_code ) push!(numfmts, new_fmt) return fmt_code @@ -169,7 +169,7 @@ const DATETIME_CODES = ["d", "m", "yy", "h", "s", "a/p", "am/pm"] function remove_formatting(code) # this regex should cover all the formatting cases found here(colors/conditionals/quotes/spacing): # https://support.office.com/en-us/article/create-or-delete-a-custom-number-format-78f2a361-936b-4c03-8772-09fab54be7f4 - ignoredformatting = r"""\[.{2,}?\]|".+?"|_.|\\.|\*."""x # Had to add ? to "".+"" to make it work. Don't understand what made this necessary! 
+ ignoredformatting = r"""\[.{2,}?\]|".+?"|_.|\\.|\*."""x replace(code, ignoredformatting => "") end @@ -278,7 +278,7 @@ function styles_get_cellXf_with_numFmtId(allXfNodes::Vector{XML.Node}, numFmtId: end function styles_add_cell_xf(wb::Workbook, attributes::Dict{String,String})::CellDataFormat - new_xf = XML.Node(XML.Element, "xf", OrderedDict{String,String}(), nothing, nothing) + new_xf = XML.Node{String}(XML.Element, "xf", Pair{String,String}[], nothing, nothing) for k in keys(attributes) new_xf[k] = attributes[k] end @@ -288,12 +288,12 @@ end function styles_add_cell_xf(wb::Workbook, new_xf::XML.Node)::CellDataFormat xroot = styles_xmlroot(wb) i, j = get_idces(xroot, "styleSheet", "cellXfs") - existing_cellxf_elements_count = length(XML.children(xroot[i][j])) + existing_cellxf_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_cellxf_elements_count throw(XLSXError("Wrong number of xf elements found: $existing_cellxf_elements_count. Expected $(parse(Int, xroot[i][j]["count"])).")) end # Check new_xf doesn't duplicate any existing xf. If yes, use that rather than create new. - for (k, node) in enumerate(XML.children(xroot[i][j])) + for (k, node) in enumerate(xml_elements(xroot[i][j])) if node == new_xf return CellDataFormat(k - 1) # CellDataFormat is zero-indexed end diff --git a/src/types.jl b/src/types.jl index 1f389a9a..ec66f273 100644 --- a/src/types.jl +++ b/src/types.jl @@ -350,7 +350,8 @@ Implementations: SheetRowStreamIterator, WorksheetCache. 
abstract type SheetRowIterator end mutable struct SheetRowStreamIteratorState - next_rownode::Union{Nothing, XML.LazyNode} # Worksheet row being processed + row_nodes::Vector{XML.LazyNode} # All row nodes from sheetData + row_index::Int # Index of the next row to process rowcells::Dict{Int,Cell} lock::ReentrantLock end @@ -594,7 +595,7 @@ end struct ReadFile node::Union{Nothing,XML.Node} - raw::Union{Nothing,XML.Raw} + raw::Union{Nothing,String} bin::Union{Nothing,Vector{UInt8}} name::String end @@ -684,11 +685,11 @@ struct DataTable end end -struct xpath +struct XPathInfo node::XML.Node path::String - function xpath(node::XML.Node, path::String) + function XPathInfo(node::XML.Node, path::String) new(node, path) end end diff --git a/src/worksheet.jl b/src/worksheet.jl index a128f8ca..0031ba50 100644 --- a/src/worksheet.jl +++ b/src/worksheet.jl @@ -4,7 +4,7 @@ function Worksheet(xf::XLSXFile, sheet_element::XML.Node) a = XML.attributes(sheet_element) sheetId = parse(Int, a["sheetId"]) relationship_id = a["r:id"] - name = XLSX.unescape(a["name"]) + name = a["name"] is_hidden = haskey(a, "state") && a["state"] in ["hidden", "veryHidden"] # dim = read_worksheet_dimension(xf, relationship_id, name) @@ -42,24 +42,18 @@ function read_worksheet_dimension(xf::XLSXFile, relationship_id, name)::Union{No local result::Union{Nothing,CellRange} = nothing target_file = get_relationship_target_by_id("xl", wb, relationship_id) doc = open_internal_file_stream(xf, target_file) - reader = iterate(doc) - # Now let's look for a row element, if it exists - while reader !== nothing # go next node - (sheet_row, state) = reader - if XML.nodetype(sheet_row) == XML.Element && XML.tag(sheet_row) == "dimension" + root = xml_root_element(doc) - XML.depth(sheet_row) != 2 && throw(XLSXError("Malformed Worksheet \"$name\": unexpected node depth for dimension node: $(XML.depth(sheet_row)).")) - - ref_str = XML.attributes(sheet_row)["ref"] + for child in XML.children(root) + if XML.nodetype(child) 
== XML.Element && XML.tag(child) == "dimension" + ref_str = child["ref"] if is_valid_cellname(ref_str) result = CellRange("$(ref_str):$(ref_str)") else result = CellRange(ref_str) end - break end - reader = iterate(doc, state) end return result diff --git a/src/write.jl b/src/write.jl index 1a70cecf..7d824a1a 100644 --- a/src/write.jl +++ b/src/write.jl @@ -183,13 +183,13 @@ end function get_node_paths(node::XML.Node) XML.nodetype(node) != XML.Document && throw(XLSXError("Something wrong here!")) - default_ns = get_default_namespace(node[end]) - xpaths = Vector{xpath}() + default_ns = get_default_namespace(xml_root_element(node)) + xpaths = Vector{XPathInfo}() get_node_paths!(xpaths, node, default_ns, "") return xpaths end -function get_node_paths!(xpaths::Vector{xpath}, node::XML.Node, default_ns, path) +function get_node_paths!(xpaths::Vector{XPathInfo}, node::XML.Node, default_ns, path) for c in XML.children(node) if XML.nodetype(c) ∉ [XML.Declaration, XML.Comment, XML.Text] node_tag = XML.tag(c) @@ -197,7 +197,7 @@ function get_node_paths!(xpaths::Vector{xpath}, node::XML.Node, default_ns, path node_tag = default_ns * ":" * node_tag end npath = path * "/" * node_tag - push!(xpaths, xpath(c, npath)) + push!(xpaths, XPathInfo(c, npath)) if length(XML.children(c)) > 0 get_node_paths!(xpaths, c, default_ns, npath) end @@ -267,7 +267,7 @@ end function update_single_sheet!(wb::Workbook, sheet_no::Int, full::Bool)::Union{Nothing,Vector{UInt8}} sheet = getsheet(wb, sheet_no) doc = copynode(get_worksheet_xml_document(sheet)) - xroot = doc[end] + xroot = xml_root_element(doc) # check namespace and root node name get_default_namespace(xroot) != SPREADSHEET_NAMESPACE_XPATH_ARG && throw(XLSXError("Unsupported Spreadsheet XML namespace $(get_default_namespace(xroot)).")) @@ -547,7 +547,7 @@ function update_workbook_xml!(xl::XLSXFile) # Need to update and and "iconSet", "priority" => "1", "id" => "XXXX-xxxx-XXXX") + @test Dict(XML.attributes(XLSX.get_x14_icon("3Stars"))) == 
Dict("type" => "iconSet", "priority" => "1", "id" => "XXXX-xxxx-XXXX") @test length(XML.children(XLSX.get_x14_icon("5Boxes"))) == 1 - @test typeof(XLSX.get_x14_icon("Custom")) == XML.Node + @test XLSX.get_x14_icon("Custom") isa XML.Node end @testset "cellIs" begin From 4c52dd784e5956cc1697f6d910c9b3dc71e407c3 Mon Sep 17 00:00:00 2001 From: Josh Day Date: Mon, 6 Apr 2026 08:34:32 -0400 Subject: [PATCH 2/5] ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 94cd205e..7038389a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,6 @@ name: CI on: -# workflow_dispatch: # <-- Add this to allow manual execution + workflow_dispatch: # <-- Add this to allow manual execution pull_request: branches: - master From 0412e1f86fd4488d759cf8f7b1fd936bf3abf77b Mon Sep 17 00:00:00 2001 From: Josh Day Date: Fri, 10 Apr 2026 10:32:42 -0400 Subject: [PATCH 3/5] Optimize SST loading and cell read perf --- src/cell.jl | 4 +- src/conditional-formats.jl | 32 ++++----- src/read.jl | 4 +- src/sst.jl | 136 ++++++------------------------------- src/types.jl | 9 +-- 5 files changed, 40 insertions(+), 145 deletions(-) diff --git a/src/cell.jl b/src/cell.jl index 478a2b19..f1555253 100644 --- a/src/cell.jl +++ b/src/cell.jl @@ -68,8 +68,8 @@ end # Extracts the unformatted text from an inlineStr "is" XML element as a XML string. function _build_si_xml(child)::String - inner = join(XML.write.(materialize.(XML.children(child))), "\n") - return "\n $inner\n" + inner = join((XML.write(c) for c in XML.children(child))) + return "$inner" end # Parses a style string to (UInt32, Int) for use as style and num_style. 
diff --git a/src/conditional-formats.jl b/src/conditional-formats.jl index 471893b1..5609fdf2 100644 --- a/src/conditional-formats.jl +++ b/src/conditional-formats.jl @@ -1450,7 +1450,7 @@ function setCfCellIs(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=()) if isnothing(value) value = all(ismissing.(ws[rng])) ? nothing : string(sum(skipmissing(ws[rng])) / count(!ismissing, ws[rng])) end - cfx = XML.Element("cfRule"; type="cellIs", dxfId=string(Int(dxid.id))) + cfx = XML.Element("cfRule"; type="cellIs", dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" @@ -1542,7 +1542,7 @@ function setCfContainsText(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,An end formula = replace(formula, "__txt__" => value, "__CR__" => string(first(rng))) - cfx = XML.Element("cfRule"; type=type, dxfId=string(Int(dxid.id))) + cfx = XML.Element("cfRule"; type=type, dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? 
string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" @@ -1616,7 +1616,7 @@ function setCfTop10(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=()): percent = "" bottom = "" - cfx = XML.Element("cfRule"; type="top10", dxfId=string(Int(dxid.id))) + cfx = XML.Element("cfRule"; type="top10", dxfId=string(dxid.id)) if operator == "topN" elseif operator == "topN%" percent = "1" @@ -1700,25 +1700,25 @@ function setCfAboveAverage(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,An dxid = Add_Cf_Dx(wb, new_dx) if operator == "aboveAverage" - cfx = XML.Element("cfRule"; type=operator, dxfId=string(Int(dxid.id)), priority="1") + cfx = XML.Element("cfRule"; type=operator, dxfId=string(dxid.id), priority="1") elseif operator == "aboveEqAverage" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", equalAverage="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", equalAverage="1") elseif operator == "plus1StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", stdDev="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", stdDev="1") elseif operator == "plus2StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", stdDev="2") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", stdDev="2") elseif operator == "plus3StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", stdDev="3") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", stdDev="3") elseif operator == "belowAverage" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), 
priority="1", aboveAverage="0") elseif operator == "belowEqAverage" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", equalAverage="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", equalAverage="1") elseif operator == "minus1StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", stdDev="1") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", stdDev="1") elseif operator == "minus2StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", stdDev="2") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", stdDev="2") elseif operator == "minus3StdDev" - cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(Int(dxid.id)), priority="1", aboveAverage="0", stdDev="3") + cfx = XML.Element("cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", stdDev="3") else throw(XLSXError("Invalid operator: $operator. Valid options are: `aboveAverage`, `aboveEqAverage`, `plus1sStdDev`, `plus2StdDev`, `plus3StdDev`, `belowAverage`, `belowEqAverage`, `minus1StdDev`, `minus2StdDev`, `minus3StdDev`.")) end @@ -1794,7 +1794,7 @@ function setCfTimePeriod(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any} new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("cfRule"; type="timePeriod", dxfId=string(Int(dxid.id))) + cfx = XML.Element("cfRule"; type="timePeriod", dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? 
string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" @@ -1875,7 +1875,7 @@ function setCfContainsBlankErrorUniqDup(ws::Worksheet, rng::CellRange; allkws::D new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("cfRule"; type=operator, dxfId=string(Int(dxid.id))) + cfx = XML.Element("cfRule"; type=operator, dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" @@ -1941,7 +1941,7 @@ function setCfFormula(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("cfRule"; type="expression", dxfId=string(Int(dxid.id))) + cfx = XML.Element("cfRule"; type="expression", dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" diff --git a/src/read.jl b/src/read.jl index c8fa7e33..ca896146 100644 --- a/src/read.jl +++ b/src/read.jl @@ -798,7 +798,7 @@ function process_file(zip_io::ZipArchives.ZipReader, filename::String) strip_bom_and_lf!(bytes) xml_str = String(bytes) if filename == "xl/sharedStrings.xml" - node = parse(xml_str, XML.Node) + node = XML.Element("sst") # placeholder; SST is loaded via sst_load! raw = xml_str elseif occursin(r"xl/worksheets/sheet\d+\.xml", filename) node, raw = splitNode(xml_str, "sheetData") @@ -833,7 +833,7 @@ function internal_xml_file_read(xf::XLSXFile, zip_io::Union{Nothing,ZipArchives. strip_bom_and_lf!(bytes) xml_str = String(bytes) if filename == "xl/sharedStrings.xml" - xf.data[filename] = parse(xml_str, XML.Node) + xf.data[filename] = XML.Element("sst") # placeholder; SST is loaded via sst_load! 
elseif occursin(r"xl/worksheets/sheet\d+\.xml", filename) xf.data[filename], _ = splitNode(xml_str, "sheetData") else diff --git a/src/sst.jl b/src/sst.jl index 63051724..57ab3e32 100644 --- a/src/sst.jl +++ b/src/sst.jl @@ -1,5 +1,5 @@ -SharedStringTable() = SharedStringTable(Vector{String}(), Dict{String, Int64}(), false) +SharedStringTable() = SharedStringTable(Vector{String}(), Vector{String}(), Dict{String, Int64}(), false) @inline get_sst(wb::Workbook) = wb.sst @inline get_sst(xl::XLSXFile) = get_sst(get_workbook(xl)) @@ -45,7 +45,7 @@ function add_to_sst!(ss::SharedStringTable, si_xml::String)::Int64 # No match found, add new entry new_idx = length(ss.shared_strings) # 0-based index push!(ss.shared_strings, si_xml) - + push!(ss.unformatted, unformatted_text(parse(si_xml, XML.LazyNode))) ss.index[si_xml] = new_idx # if new_idx ∉ get_shared_string_index(ss, si_xml) @@ -106,125 +106,28 @@ function add_shared_string!(wb::Workbook, str_unformatted::AbstractString; myloc end function sst_load!(workbook::Workbook) - chunksize = 1000 sst = get_sst(workbook) if !sst.is_loaded - - relationship_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" - if has_relationship_by_type(workbook, relationship_type) - doc = open_internal_file_stream(get_xlsxfile(workbook), "xl/sharedStrings.xml") - sst_root = xml_root_element(doc) # element - sst_chan = stream_ssts(sst_root, chunksize) - load_sst_table!(workbook, sst_chan, Threads.nthreads()) - init_sst_index(sst) - - return - end - - throw(XLSXError("Shared Strings Table not found for this workbook.")) - end -end -function produce_sstchunks(out, sst_root::XML.LazyNode, ssts, chunksize) - i = 0 - global_idx = 0 - - for child in XML.children(sst_root) - if XML.tag(child) == "si" - i += 1 - global_idx += 1 - ssts[i] = SstToken(child, global_idx) - end - if i >= chunksize - put!(out, copy(ssts)) - i = 0 + has_sst(workbook) || throw(XLSXError("Shared Strings Table not found for this workbook.")) + 
doc = open_internal_file_stream(get_xlsxfile(workbook), "xl/sharedStrings.xml") + sst_root = xml_root_element(doc) + empty!(sst.shared_strings) + empty!(sst.unformatted) + uc = get(sst_root, "uniqueCount", nothing) + if !isnothing(uc) + n = parse(Int, uc) + sizehint!(sst.shared_strings, n) + sizehint!(sst.unformatted, n) end - end - if i > 0 - put!(out, copy(ssts[1:i])) - end -end - -function stream_ssts(sst_root::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) - ssts = Vector{SstToken}(undef, chunksize) - Channel{Vector{SstToken}}(channel_size) do out - produce_sstchunks(out, sst_root, ssts, chunksize) - end -end - -# Convert a LazyNode to a Node for serialization -function materialize(ln::XML.LazyNode)::XML.Node{String} - nt = XML.nodetype(ln) - if nt in (XML.Text, XML.CData, XML.Comment, XML.DTD) - return XML.Node{String}(nt, nothing, nothing, XML.value(ln), nothing) - elseif nt == XML.Declaration - a = XML.attributes(ln) - attrs = isnothing(a) ? nothing : Pair{String,String}[p for p in a] - return XML.Node{String}(nt, nothing, attrs, nothing, nothing) - elseif nt == XML.ProcessingInstruction - return XML.Node{String}(nt, XML.tag(ln), nothing, XML.value(ln), nothing) - elseif nt == XML.Element - a = XML.attributes(ln) - attrs = isnothing(a) ? nothing : Pair{String,String}[p for p in a] - ch = XML.children(ln) - children = isempty(ch) ? nothing : XML.Node{String}[materialize(c) for c in ch] - return XML.Node{String}(nt, XML.tag(ln), attrs, nothing, children) - elseif nt == XML.Document - ch = XML.children(ln) - children = isempty(ch) ? 
nothing : XML.Node{String}[materialize(c) for c in ch] - return XML.Node{String}(nt, nothing, nothing, nothing, children) - else - error("Unknown node type: $nt") - end -end - -function process_sst(sst::SstToken) - el = sst.n - i = sst.idx - - if XML.nodetype(el) != XML.Text - XML.tag(el) != "si" && throw(XLSXError("Unsupported node $(XML.tag(el)) in sst table.")) - sst = Sst(XML.write(materialize(el)), i) - return sst - end -end - - function load_sst_table!(wb::Workbook, chan::Channel, nthreads::Int) - sst_table = get_sst(wb) - sst_table.is_loaded = true - sst_results = Channel{Vector{Sst}}(1 << 8) - all_ssts = Vector{Tuple{Int,Sst}}() - - consumer = @async begin - for ssts in sst_results - for sst in ssts - push!(all_ssts, (sst.idx, sst)) - end - end - sort!(all_ssts, by = x -> x[1]) - - empty!(sst_table.shared_strings) - empty!(sst_table.index) - - for (i, sst) in all_ssts - push!(sst_table.shared_strings, sst.formatted) - sst_table.index[sst.formatted] = i - 1 # 0-based - end - - end - - # Producer tasks - @sync for _ in 1:nthreads - Threads.@spawn begin - for ssts in chan - # ssts is already a chunk - just process it - processed = [process_sst(tok) for tok in ssts] - put!(sst_results, processed) + for child in XML.eachchildnode(sst_root) + if XML.nodetype(child) == XML.Element && XML.tag(child) == "si" + push!(sst.shared_strings, XML.write(child)) + push!(sst.unformatted, unformatted_text(child)) end end + sst.is_loaded = true + init_sst_index(sst) end - - close(sst_results) - wait(consumer) end # Checks whether this workbook has a Shared String Table. @@ -277,8 +180,7 @@ end # `index` starts at 0. 
@inline function sst_unformatted_string(wb::Workbook, index::Int64)::String sst_load!(wb) - uss = get_sst(wb).shared_strings[index+1] - return unformatted_text(parse(uss, XML.LazyNode)) + return get_sst(wb).unformatted[index+1] end @inline sst_unformatted_string(xl::XLSXFile, index::Int64) :: String = sst_unformatted_string(get_workbook(xl), index) diff --git a/src/types.jl b/src/types.jl index ec66f273..4377f1d4 100644 --- a/src/types.jl +++ b/src/types.jl @@ -410,17 +410,10 @@ end #------------------------------------------------------------------------------ sharedStrings mutable struct SharedStringTable shared_strings::Vector{String} + unformatted::Vector{String} index::Dict{String, Int64} # for search optimisation. Tuple of indices to handle hash collisions. is_loaded::Bool end -struct SstToken - n::XML.LazyNode - idx::Int -end -struct Sst - formatted::String - idx::Int -end const ValidRichTextAttributes = [:bold, :italic, :under, :strike, :vertAlign, :color, :size, :name] From 0f274d3003cb90b9fa0c5a9bca3a032412b3b025 Mon Sep 17 00:00:00 2001 From: Josh Day Date: Fri, 15 May 2026 16:52:01 -0400 Subject: [PATCH 4/5] Adapt cell and SST reading for XML v0.4 APIs --- src/cell.jl | 24 +++++++++++++----------- src/sst.jl | 30 +++++++++++++----------------- src/stream.jl | 26 ++++++++++++++++++++------ 3 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/cell.jl b/src/cell.jl index f1555253..266ff702 100644 --- a/src/cell.jl +++ b/src/cell.jl @@ -73,7 +73,7 @@ function _build_si_xml(child)::String end # Parses a style string to (UInt32, Int) for use as style and num_style. 
-function _parse_style(s::String) +function _parse_style(s::AbstractString) isempty(s) && return UInt32(0), 0 n = parse(Int, s) return UInt32(n), n @@ -118,9 +118,8 @@ function Cell(c::XML.LazyNode, ws::Worksheet; mylock::Union{ReentrantLock,Nothin for child in chn tag = XML.tag(child) if tag == "v" - ch = XML.children(child) - isempty(ch) && continue - v = XML.value(ch[1]) + v = XML.is_simple_value(child) + isnothing(v) && continue datatype, value = process_tv(wb, t, v, num_style; mylock) elseif tag == "f" if get_xlsxfile(wb).is_writable @@ -139,12 +138,15 @@ function parse_formula_from_element(c_child_element)::AbstractFormula XML.tag(c_child_element) == "f" || throw(XLSXError("Expected nodename `f`. Found: `$(XML.tag(c_child_element))`")) - # Extract formula string - formula_string = if XML.is_simple(c_child_element) - XML.simple_value(c_child_element) - else - text_nodes = filter(x -> XML.nodetype(x) == XML.Text, XML.children(c_child_element)) - isempty(text_nodes) ? "" : XML.value(text_nodes[1]) + # Extract formula string — `<f>` may have attributes (t="shared", si=…, ref=…) + # which makes it non-"simple", so collect the first Text child.
+ formula_string = "" + for ch in XML.children(c_child_element) + if XML.nodetype(ch) === XML.Text + v = XML.value(ch) + isnothing(v) || (formula_string = v) + break + end end a = XML.attributes(c_child_element) @@ -198,7 +200,7 @@ function _parse_excel_datetime_raw(v::AbstractString) end end -function process_tv(wb::Workbook, t::String, v::String, num_style::Int; mylock::Union{ReentrantLock,Nothing}=nothing) +function process_tv(wb::Workbook, t::AbstractString, v::AbstractString, num_style::Int; mylock::Union{ReentrantLock,Nothing}=nothing) datatype::CellValueType = CT_EMPTY value::UInt64 = UInt64(0) isempty(v) && return datatype, value diff --git a/src/sst.jl b/src/sst.jl index 57ab3e32..70a47bcb 100644 --- a/src/sst.jl +++ b/src/sst.jl @@ -89,7 +89,7 @@ function add_formatted_string!(wb::Workbook, str_formatted::String; mylock::Unio end # check if unformatted shared string needs xml:space="preserve" -needs_preserve(s::String) = startswith(s, ' ') || endswith(s, ' ') || contains(s, '\n') || contains(s, " ") +needs_preserve(s::AbstractString) = startswith(s, ' ') || endswith(s, ' ') || contains(s, '\n') || contains(s, " ") # allow to write cells containing only whitespace characters or with leading or trailing whitespace. function add_shared_string!(wb::Workbook, str_unformatted::AbstractString; mylock::Union{Nothing,ReentrantLock}=nothing) :: Int @@ -148,31 +148,27 @@ end function gather_strings!(io::IOBuffer, e::XML.LazyNode) tag = XML.tag(e) - + # Skip phonetic hints entirely tag == "rPh" && return nothing - + if tag == "t" - children = XML.children(e) - n = length(children) - - if n == 1 - c = children[1] - write(io, XML.is_simple(c) ? XML.simple_value(c) : XML.value(c)) - elseif n == 0 - val = XML.value(e) - !isnothing(val) && write(io, XML.is_simple(e) ? XML.simple_value(e) : val) - else - throw(XLSXError("Unexpected number of children in : $n. 
Expected 0 or 1.")) + # `<t>` may carry `xml:space="preserve"` so it is not always "simple"; + # collect any Text/CData children directly. + for ch in XML.children(e) + nt = XML.nodetype(ch) + if nt === XML.Text || nt === XML.CData + v = XML.value(ch) + isnothing(v) || write(io, v) + end end else # Recurse into children for all other tags - children = XML.children(e) - for ch in children + for ch in XML.children(e) gather_strings!(io, ch) end end - + return nothing end diff --git a/src/stream.jl b/src/stream.jl index f395a6aa..3fcffcac 100644 --- a/src/stream.jl +++ b/src/stream.jl @@ -131,7 +131,7 @@ function Base.iterate(itr::SheetRowStreamIterator, state::SheetRowStreamIterator sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) - return sheet_row, SheetRowStreamIteratorState(state.row_nodes, state.row_index, rowcells, mylock) + return sheet_row, state end # @@ -329,16 +329,30 @@ function stream_rows(sheetdata::XML.LazyNode, chunksize::Int; channel_size::Int= end end +const _EMPTY_ROW_ATTRS = Dict{String,String}() + function process_row(row::XML.LazyNode, handled_attributes::Set{String}, ws::Worksheet, mylock::ReentrantLock) - unhandled_attributes = Dict{String,String}() + current_row_ht::Union{Float64,Nothing} = nothing + row_num::Union{Int,Nothing} = nothing + unhandled_attributes = _EMPTY_ROW_ATTRS atts = XML.attributes(row) if !isnothing(atts) - current_row_ht = haskey(atts, "ht") ? parse(Float64, atts["ht"]) : nothing - row_num = haskey(atts, "r") ?
parse(Int, atts["r"]) : nothing - row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) - unhandled_attributes = Dict{String,String}(k => v for (k, v) in atts if !in(k, handled_attributes)) + for (k, v) in atts + if k == "r" + row_num = parse(Int, v) + elseif k == "ht" + current_row_ht = parse(Float64, v) + end + if !(k in handled_attributes) + if unhandled_attributes === _EMPTY_ROW_ATTRS + unhandled_attributes = Dict{String,String}() + end + unhandled_attributes[String(k)] = String(v) + end + end end + row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) # Process cells rowcells = Dict{Int,Cell}() From ad3bd5488e8790cebf7fef4b0f2e8c53d9c8fd1c Mon Sep 17 00:00:00 2001 From: Josh Day Date: Tue, 19 May 2026 05:49:16 -0400 Subject: [PATCH 5/5] Fix no-default-namespace read path (XML v0.4) Files written without a default namespace use prefixed elements (<x:sst>, <x:si>, <x:t>, ...). Two read-path bugs caused the merged upstream "no default namespace" testset to fail: 1. gather_strings! matched element names with raw XML.tag, so <x:t> never matched "t" and every shared string resolved to "". Use localname (prefix-stripping), consistent with the rest of the SST loader. 2. allExtCfs returned XML.children (incl. the v0.4 writer's indentation whitespace text nodes) and getConditionalExtFormats assumed xm:sqref was the last child and matched x14:/xm: prefixes with raw XML.tag. Return only element children, locate xm:sqref by name, and match via localname. The upstream "no default namespace" testset (issues #380/#362/#267/ #170) now passes 1897/1897; full suite green.
--- src/conditional-format-helpers.jl | 4 ++-- src/conditional-formats.jl | 16 ++++++++++------ src/sst.jl | 4 +++- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/conditional-format-helpers.jl b/src/conditional-format-helpers.jl index d705ebec..6698fe7f 100644 --- a/src/conditional-format-helpers.jl +++ b/src/conditional-format-helpers.jl @@ -88,13 +88,13 @@ function allExtCfs(ws::Worksheet)::Vector{XML.Node} let cfs = nothing for ext in exts for c in XML.children(ext) - if XML.tag(c) == "x14:conditionalFormattings" + if localname(c) == "conditionalFormattings" cfs = c break end end end - return isnothing(cfs) ? Vector{XML.Node}() : XML.children(cfs) + return isnothing(cfs) ? Vector{XML.Node}() : xml_elements(cfs) end end function make_extLst!(s) diff --git a/src/conditional-formats.jl b/src/conditional-formats.jl index 4d0d89cf..efc7c9e6 100644 --- a/src/conditional-formats.jl +++ b/src/conditional-formats.jl @@ -484,14 +484,18 @@ function getConditionalExtFormats(ws::Worksheet, allcfnodes::Vector{XML.Node}):: allcfs = Vector{Pair{CellRange,NamedTuple{(:type, :priority),Tuple{String,Int64}}}}() for cf in allcfnodes let t, p, r, rule = false, ref = false - @assert XML.tag(cf) == "x14:conditionalFormatting" "Something wrong here" - sqref = cf[end] - if XML.tag(sqref) == "xm:sqref" - r = XML.simple_value(sqref) + @assert localname(cf) == "conditionalFormatting" "Something wrong here" + # Children may be interleaved with whitespace text nodes from the + # writer's indentation, so locate `xm:sqref` by name rather than + # assuming it is the last child. 
+ cf_elements = xml_elements(cf) + sqref = findlast(c -> localname(c) == "sqref", cf_elements) + if !isnothing(sqref) + r = XML.simple_value(cf_elements[sqref]) + ref = true end - for child in XML.children(cf) - if XML.tag(child) == "x14:cfRule" + for child in cf_elements + if localname(child) == "cfRule" t = child["type"] if t != "dataBar" # This is the other half of a dataBar definition - don't count twice! p = parse(Int, child["priority"]) diff --git a/src/sst.jl b/src/sst.jl index 9fe63349..60f4933e 100644 --- a/src/sst.jl +++ b/src/sst.jl @@ -149,7 +149,9 @@ end unformatted_text(::Workbook, el::XML.LazyNode) :: String = unformatted_text(el) function gather_strings!(io::IOBuffer, e::XML.LazyNode) - tag = XML.tag(e) + # Use `localname` (not `XML.tag`) so prefixed elements such as `<x:t>` in + # files without a default namespace are matched the same as `<t>`. + tag = localname(e) # Skip phonetic hints entirely tag == "rPh" && return nothing