diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4717ff81..f441d27d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,6 @@ name: CI on: -# workflow_dispatch: # <-- Add this to allow manual execution + workflow_dispatch: # <-- Add this to allow manual execution pull_request: branches: - master diff --git a/.gitignore b/.gitignore index 62aa63f0..e9048b11 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ Manifest.toml docs/Manifest.toml .vscode *.gz +.claude diff --git a/Project.toml b/Project.toml index 3fa6279a..eb545a57 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ repo = "https://github.com/juliadata/XLSX.jl.git" [deps] Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" -OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -28,12 +27,11 @@ StyledStringsSstsExt = "StyledStrings" CSV = "0.10.15" Colors = "0.12, 0.13" Distributions = "0.25.0" -OrderedCollections = "1" PrecompileTools = "1" StyledStrings = "1.0.3" Tables = "1" UUIDs = "1.8" -XML = "0.3.8" +XML = "0.4" ZipArchives = "2.5" julia = "1.8" [extras] diff --git a/src/XLSX.jl b/src/XLSX.jl index c35b9444..9489abe8 100644 --- a/src/XLSX.jl +++ b/src/XLSX.jl @@ -11,7 +11,6 @@ import Tables import Unicode import UUIDs import XML -using OrderedCollections: OrderedDict import ZipArchives import PrecompileTools as PCT # this is a small dependency. @@ -40,6 +39,9 @@ export getMergedCells, isMergedCell, getMergedBaseCell, mergeCells const SPREADSHEET_NAMESPACE_XPATH_ARG = "http://schemas.openxmlformats.org/spreadsheetml/2006/main" + +xml_elements(node) = filter(n -> XML.nodetype(n) == XML.Element, XML.children(node)) +xml_root_element(doc) = last(xml_elements(doc)) const EXCEL_MAX_COLS = 16_384 # total columns supported by Excel per sheet const EXCEL_MAX_ROWS = 1_048_576 # total rows supported by Excel per sheet (including headers) const ROW_CHUNKSIZE = 1000 # number of rows to be processed in each thread diff --git a/src/cell.jl b/src/cell.jl index bf3535a2..4ad1b178 100644 --- a/src/cell.jl +++ b/src/cell.jl @@ -188,7 +188,7 @@ end =# # Parses a style string to (UInt32, Int) for use as style and num_style. -function _parse_style(s::String) +function _parse_style(s::AbstractString) isempty(s) && return UInt32(0), 0 n = parse(Int, s) return UInt32(n), n @@ -235,8 +235,8 @@ function Cell(c::XML.LazyNode, ws::Worksheet, sst_pfx::String; mylock::Union{Ree if tag == "v" ch = XML.children(child) isempty(ch) && continue - raw = XML.value(ch[1]) - v = occursin('&', raw) ? XLSX.unescape(raw) : raw + # `child` is a `LazyNode`; XML.jl 0.4 already unescapes its value. + v = XML.value(ch[1]) datatype, value = process_tv(wb, t, v, num_style; mylock) elseif tag == "f" f = parse_formula_from_element(wb,child) @@ -253,12 +253,15 @@ function parse_formula_from_element(wb, c_child_element)::AbstractFormula localname(c_child_element) == "f" || throw(XLSXError("Expected nodename `f`. Found: `$(localname(c_child_element))`")) - # Extract formula string - formula_string = if XML.is_simple(c_child_element) - XLSX.unescape(XML.simple_value(c_child_element)) - else - text_nodes = filter(x -> XML.nodetype(x) == XML.Text, XML.children(c_child_element)) - isempty(text_nodes) ? "" : XLSX.unescape(XML.value(text_nodes[1])) + # Extract formula string — `` may have attributes (t="shared", si=…, ref=…) + # which makes it non-"simple", so collect the first Text child. + formula_string = "" + for ch in XML.children(c_child_element) + if XML.nodetype(ch) === XML.Text + v = XML.value(ch) + isnothing(v) || (formula_string = v) + break + end end a = XML.attributes(c_child_element) @@ -312,7 +315,7 @@ function _parse_excel_datetime_raw(v::AbstractString) end end -function process_tv(wb::Workbook, t::String, v::String, num_style::Int; mylock::Union{ReentrantLock,Nothing}=nothing) +function process_tv(wb::Workbook, t::AbstractString, v::AbstractString, num_style::Int; mylock::Union{ReentrantLock,Nothing}=nothing) datatype::CellValueType = CT_EMPTY value::UInt64 = UInt64(0) isempty(v) && return datatype, value @@ -479,29 +482,21 @@ end # Extract cells from a LazyNode and push them (in place) into a Dict(column -> Cell) function get_rowcells!(rowcells::Dict{Int,Cell}, row::XML.LazyNode, ws::Worksheet, sst_pfx::String; mylock::Union{ReentrantLock,Nothing}=nothing) - # unthreaded cell extraction is (exceedingly marginally) slower but no lock conflicts introduced. - # debug - # @assert row.tag == "row" "Not a row node" + # @assert localname(row) == "row" "Not a row node" sst_count = 0 - d = row.depth - - cellnode = XML.next(row) - - while !isnothing(cellnode) && cellnode.depth > d - if localname(cellnode) == "c" # This is a cell - cell = Cell(cellnode, ws, sst_pfx; mylock) # construct an XLSX.Cell from an XML.LazyNode + for child in XML.children(row) + if localname(child) == "c" # This is a cell + cell = Cell(child, ws, sst_pfx; mylock) # construct an XLSX.Cell from an XML.LazyNode sst_count += cell.datatype == CT_STRING ? 1 : 0 rowcells[column_number(cell)] = cell end - cellnode = XML.next(cellnode) - end - if !isnothing(cellnode) && localname(cellnode) == "row" # have reached the beginning of next row - return cellnode, sst_count - else # no more rows - return nothing, sst_count end + # Row nodes are materialised up-front (see `_collect_row_nodes`), so callers + # advance by index; the first tuple element is unused but kept for the + # `next, sst_count = get_rowcells!(...)` call contract. + return nothing, sst_count end diff --git a/src/cellformat-helpers.jl b/src/cellformat-helpers.jl index 173e6359..6324a319 100644 --- a/src/cellformat-helpers.jl +++ b/src/cellformat-helpers.jl @@ -113,8 +113,9 @@ const EXCEL_COLUMN_WIDTH_PADDING = 0.7109375 # function copynode(o::XML.Node) - n = XML.Node(o.nodetype, o.tag, o.attributes, o.value, isnothing(o.children) ? nothing : [copynode(x) for x in o.children]) - return n + attrs = isnothing(o.attributes) ? nothing : copy(o.attributes) + children = isnothing(o.children) ? nothing : XML.Node{String}[copynode(x) for x in o.children] + return XML.Node{String}(o.nodetype, o.tag, attrs, o.value, children) end function do_sheet_names_match(ws::Worksheet, rng::T) where {T<:Union{SheetCellRef,AbstractSheetCellRange}} if ws.name == rng.sheet @@ -125,8 +126,8 @@ function do_sheet_names_match(ws::Worksheet, rng::T) where {T<:Union{SheetCellRe end function make_child_node(tag::String, name::String, pfx::String)::XML.Node - children = tag ∈ ("border", "fill") ? Vector{XML.Node}() : nothing - return XML.Node(XML.Element, pfx*name, OrderedDict{String,String}(), nothing, children) + children = tag ∈ ("border", "fill") ? Vector{XML.Node{String}}() : nothing + return XML.Node{String}(XML.Element, pfx*name, Pair{String,String}[], nothing, children) end function build_font_child!(new_node::XML.Node, tag::String, name::String, attrs::Union{Nothing,Dict{String,String}}, pfx::String) @@ -216,7 +217,7 @@ end if isnothing(attributes[a]) cnode = XML.Element(a) else - cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing) + cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing) for (k, v) in attributes[a] cnode[k] = v end @@ -228,7 +229,7 @@ end if isnothing(attributes[a]) cnode = XML.Element(a) else - cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing) + cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing) color = XML.Element("color") for (k, v) in attributes[a] if k == "style" && v != "none" @@ -255,7 +256,7 @@ end if isnothing(attributes[a]) cnode = XML.Element(a) else - cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing) + cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing) patternfill = XML.Element("patternFill") fgcolor = XML.Element("fgColor") bgcolor = XML.Element("bgColor") @@ -475,14 +476,14 @@ function get_new_formatId(wb::Workbook, format::String)::Int if isnothing(j) # There are no existing custom formats return styles_add_numFmt(wb, format) else - existing_elements_count = length(XML.children(xroot[i][j])) + existing_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_elements_count throw(XLSXError("Wrong number of font elements found: $existing_elements_count. Expected $(parse(Int, xroot[i][j]["count"])).")) end format_node = XML.Element("numFmt"; numFmtId=string(existing_elements_count + PREDEFINED_NUMFMT_COUNT), - formatCode=XLSX.escape(format) + formatCode=format ) return styles_add_cell_attribute(wb, format_node, "numFmts") + PREDEFINED_NUMFMT_COUNT @@ -529,7 +530,7 @@ function update_template_xf(ws::Worksheet, allXfNodes::Vector{XML.Node}, existin old_cell_xf = styles_cell_xf(allXfNodes, Int(existing_style.id)) new_cell_xf = copynode(old_cell_xf) if isnothing(new_cell_xf.children) - new_cell_xf=XML.Node(new_cell_xf, alignment) + new_cell_xf = XML.Node{String}(XML.Element, XML.tag(new_cell_xf), new_cell_xf.attributes, nothing, XML.Node{String}[alignment]) elseif length(XML.children(new_cell_xf)) == 0 push!(new_cell_xf, alignment) else @@ -550,14 +551,14 @@ end function styles_add_cell_attribute(wb::Workbook, new_att::XML.Node, att::String)::Int xroot = styles_xmlroot(wb) i, j = get_idces(xroot, "styleSheet", att) - existing_elements_count = length(XML.children(xroot[i][j])) + existing_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_elements_count throw(XLSXError("Wrong number of elements elements found: $existing_elements_count. Expected $(parse(Int, xroot[i][j]["count"])).")) end # Check new_att doesn't duplicate any existing att. If yes, use that rather than create new. - for (k, node) in enumerate(XML.children(xroot[i][j])) - if localname(new_att) == "numFmt" # mustn't compare numFmtId attribute for formats + for (k, node) in enumerate(xml_elements(xroot[i][j])) + if XML.tag(new_att) == "numFmt" # mustn't compare numFmtId attribute for formats if node["formatCode"] == new_att["formatCode"] return k - 1 # CellDataFormat is zero-indexed end @@ -1110,7 +1111,7 @@ function process_uniform_core(f::Function, ws::Worksheet, allXfNodes::Vector{XML if first # Get the attribute of the first cell in the range. newid = f(ws, cellref; kw...) new_alignment = getAlignment(ws, cellref).alignment["alignment"] - alignment_node = XML.Node(XML.Element, "alignment", new_alignment, nothing, nothing) + alignment_node = XML.Node{String}(XML.Element, "alignment", isnothing(new_alignment) ? Pair{String,String}[] : Pair{String,String}[k => v for (k,v) in new_alignment], nothing, nothing) first = false else # Apply the same attribute to the rest of the cells in the range. if cell.style == UInt64(0) @@ -1320,12 +1321,12 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; is = parse(str_formatted, XML.Node)[1] # Convert to XML.Node for ease of handling - all_r = filter(z -> z.tag == "r", XML.children(is)) + all_r = filter(z -> localname(z) == "r", XML.children(is)) run_elements = reduce(vcat, [XML.children(z) for z in all_r]) - rPr_elements=filter(z -> z.tag == "rPr", run_elements) # rPr elements + rPr_elements = filter(z -> localname(z) == "rPr", run_elements) # rPr elements t=String[] # text elements - for i in filter(z -> z.tag == "t", run_elements) + for i in filter(z -> localname(z) == "t", run_elements) push!(t, XML.is_simple(i[1]) ? XML.simple_value(i[1]) : XML.value(i[1])) end @@ -1337,7 +1338,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; for att in XML.children(rPr) # first copy existing attributes for i in 1:length(atts) - if att.tag == atts[i] + if XML.tag(att) == atts[i] new_rPr[i] = att end end @@ -1372,7 +1373,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; if !isnothing(rPr.children) empty!(rPr.children) foreach(new_rPr) do element - element.tag != "DeleteMe" && push!(rPr.children, element) + XML.tag(element) != "DeleteMe" && push!(rPr.children, element) end end end @@ -1420,7 +1421,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell; for r in 1:length(all_r) if t[r] != ")___DeleteMe___(" # signals a merged element to be skipped write(new_r, " \n") - r > inc_first && write(new_r, XML.write(rPr_elements[r-inc_first];depth=3) * "\n") + r > inc_first && write(new_r, XML.write(rPr_elements[r-inc_first]) * "\n") write(new_r, " " *t[r] * "\n") write(new_r, " \n") end diff --git a/src/cellformats.jl b/src/cellformats.jl index 9ee46d46..8f789f2c 100644 --- a/src/cellformats.jl +++ b/src/cellformats.jl @@ -337,7 +337,7 @@ function getFont(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellFont} current_font = font_nodes[fontid_int + 1] # index into filtered list, not raw children # current_font = XML.children(font_elements)[fontid_int+1] # Zero based! font_atts = Dict{String,Union{Dict{String,String},Nothing}}() - for c in XML.children(current_font) + for c in xml_elements(current_font) if isnothing(XML.attributes(c)) || length(XML.attributes(c)) == 0 font_atts[localname(c)] = nothing else @@ -437,13 +437,14 @@ function getBorder(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellBorder applyborder = haskey(cell_style, "applyBorder") ? cell_style["applyBorder"] : "0" xroot = styles_xmlroot(wb) border_elements = find_all_nodes("/" * SPREADSHEET_NAMESPACE_XPATH_ARG * ":styleSheet/" * SPREADSHEET_NAMESPACE_XPATH_ARG * ":borders", xroot)[begin] - if parse(Int, border_elements["count"]) != length(XML.children(border_elements)) - throw(XLSXError("Unexpected number of border definitions found : $(length(XML.children(border_elements))). Expected $(parse(Int, border_elements["count"]))")) + border_nodes = xml_elements(border_elements) + if parse(Int, border_elements["count"]) != length(border_nodes) + throw(XLSXError("Unexpected number of border definitions found : $(length(border_nodes)). Expected $(parse(Int, border_elements["count"]))")) end - current_border = XML.children(border_elements)[parse(Int, borderid)+1] # Zero based! + current_border = border_nodes[parse(Int, borderid)+1] # Zero based! diag_atts = XML.attributes(current_border) border_atts = Dict{String,Union{Dict{String,String},Nothing}}() - for side in XML.children(current_border) + for side in xml_elements(current_border) if isnothing(XML.attributes(side)) || length(XML.attributes(side)) == 0 border_atts[localname(side)] = nothing else @@ -463,7 +464,7 @@ function getBorder(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellBorder throw(XLSXError("No direction set for `diagonal` border")) end end - for subc in XML.children(side) # color is a child of a border element + for subc in xml_elements(side) # color is a child of a border element for (k, v) in XML.attributes(subc) border_atts[localname(side)][k] = v end @@ -1369,7 +1370,7 @@ function setAlignment(sh::Worksheet, cellref::CellRef; cell_alignment = getAlignment(wb, cell_style) old_atts = isnothing(cell_alignment) ? Dict{String,String}() : cell_alignment.alignment["alignment"] - atts = OrderedDict{String,String}() + atts = Dict{String,String}() _merge_alignment_att(atts, "horizontal", horizontal, old_atts) _merge_alignment_att(atts, "vertical", vertical, old_atts) _merge_alignment_att(atts, "wrapText", wrapText, old_atts, b -> b ? "1" : "0") @@ -1377,7 +1378,7 @@ function setAlignment(sh::Worksheet, cellref::CellRef; _merge_alignment_att(atts, "indent", indent, old_atts) _merge_alignment_att(atts, "textRotation", rotation, old_atts) - alignment_node = XML.Node(XML.Element, "$(pfx)alignment", atts, nothing, nothing) + alignment_node = XML.Node{String}(XML.Element, "alignment", Pair{String,String}[k => v for (k,v) in atts], nothing, nothing) newstyle = update_template_xf(sh, allXfNodes, CellDataFormat(cell.style), alignment_node).id cell.style = newstyle @@ -1520,7 +1521,7 @@ function getFormat(wb::Workbook, cell_style::XML.Node)::Union{Nothing,CellFormat current_format = format_nodes[idx] atts = Dict{String,String}(k => XLSX.unescape(v) for (k, v) in XML.attributes(current_format)) - format_atts[localname(current_format)] = atts + format_atts[XML.tag(current_format)] = atts else # Built-in format — validate it falls in a known range @@ -1929,7 +1930,7 @@ function setColumnWidth(ws::Worksheet, rng::CellRange; width::Union{Nothing,Real # Build a map of existing column definitions keyed by min (column index string) col_defs = Dict{String,Dict{String,String}}() - for c in XML.children(sheetdoc[i][j]) + for c in xml_elements(sheetdoc[i][j]) col_defs[c["min"]] = Dict{String,String}(XML.attributes(c)) end @@ -2015,7 +2016,7 @@ function getColumnWidth(ws::Worksheet, cellref::CellRef)::Union{Nothing,Real} if isnothing(j) # There are no existing column formats defined. return nothing end - for c in XML.children(sheetdoc[i][j]) + for c in xml_elements(sheetdoc[i][j]) if c["min"] == string(cellref.column_number) if haskey(c, "width") return parse(Float64, c["width"]) @@ -2239,7 +2240,7 @@ function getMergedCells(ws::Worksheet)::Union{Vector{CellRange},Nothing} return nothing end - c = XML.children(sheetdoc[i][j]) + c = xml_elements(sheetdoc[i][j]) if length(c) != parse(Int, sheetdoc[i][j]["count"]) throw(XLSXError("Unexpected number of mergeCells found: $(length(c)). Expected $(sheetdoc[i][j]["count"]).")) end @@ -2458,26 +2459,24 @@ function mergeCells(ws::Worksheet, cr::CellRange) throw(XLSXError("Cannot get merged cells because cache is not enabled.")) end - sheetdoc = xmlroot(get_workbook(ws), ws.relationship_id) - pfx = get_prefix(ws) - pfx = pfx == "" ? pfx : pfx * ":" - - l = insert_index(sheetdoc[end], "mergeCells", WORKSHEET_ORDER) + sheetdoc = xmlroot(get_workbook(ws), ws.relationship_id) # find the block in the worksheet's xml file + i, j = get_idces(sheetdoc, "worksheet", "mergeCells") - if localname(sheetdoc[end][l]) != "mergeCells" # There are no existing merged cells. Insert immediately after the block and push everything else down one. - len = length(sheetdoc[end]) - merge_node=XML.Node(XML.Element, "$(pfx)mergeCells", OrderedDict{String,String}(), nothing, Vector{XML.Node}()) + if isnothing(j) # There are no existing merged cells. Insert immediately after the block and push everything else down one. + k, l = get_idces(sheetdoc, "worksheet", "sheetData") + len = length(sheetdoc[k]) + i != k && throw(XLSXError("Some problem here!")) if l != len - insert!(sheetdoc[end].children, l+1, merge_node) + insert!(sheetdoc[k].children, l+1, XML.Element("mergeCells")) else - push!(sheetdoc[end], merge_node) + push!(sheetdoc[k], XML.Element("mergeCells")) end - l += 1 + j = l + 1 count = 0 else # There are already some existing merged cells - c = XML.children(sheetdoc[end][l]) + c = xml_elements(sheetdoc[i][j]) count = length(c) - if count != parse(Int, sheetdoc[end][l]["count"]) + if count != parse(Int, sheetdoc[i][j]["count"]) throw(XLSXError("Unexpected number of mergeCells found: $(length(c)). Expected $(sheetdoc[i][j]["count"]).")) end for child in c @@ -2487,9 +2486,9 @@ function mergeCells(ws::Worksheet, cr::CellRange) end end - push!(sheetdoc[end][l], XML.Element("$(pfx)mergeCell", ref=string(cr))) # Add the new merged cell range. + push!(sheetdoc[i][j], XML.Element("mergeCell", ref=string(cr))) # Add the new merged cell range. count += 1 - sheetdoc[end][l]["count"] = count + sheetdoc[i][j]["count"] = count # All cells except the base cell are set to missing. let first = true diff --git a/src/conditional-format-helpers.jl b/src/conditional-format-helpers.jl index 03e0e0db..6698fe7f 100644 --- a/src/conditional-format-helpers.jl +++ b/src/conditional-format-helpers.jl @@ -61,7 +61,7 @@ function update_worksheet_cfx!(allcfs, cfx, ws, rng) matchcfs = filter(x -> x["sqref"] == string(rng), allcfs) # Match range with existing conditional formatting blocks. l = length(matchcfs) if l == 0 # No existing conditional formatting blocks for this range so create a new one. - new_cf = XML.Element("$(pfx)conditionalFormatting"; sqref=rng) + new_cf = XML.Element("conditionalFormatting"; sqref=string(rng)) push!(new_cf, cfx) add_cf_to_XML(ws, new_cf) # Add the new conditional formatting block to the worksheet XML. elseif l == 1 # Existing conditional formatting block found for this range so add new rule to that block. @@ -88,13 +88,13 @@ function allExtCfs(ws::Worksheet)::Vector{XML.Node} let cfs = nothing for ext in exts for c in XML.children(ext) - if XML.tag(c) == "x14:conditionalFormattings" + if localname(c) == "conditionalFormattings" cfs = c break end end end - return isnothing(cfs) ? Vector{XML.Node}() : XML.children(cfs) + return isnothing(cfs) ? Vector{XML.Node}() : xml_elements(cfs) end end function make_extLst!(s) @@ -193,7 +193,7 @@ function Add_Cf_Dx(wb::Workbook, new_dx::XML.Node)::DxFormat println(XML.write(xroot[i][j])) =# else - existing_dxf_elements_count = length(XML.children(xroot[i][j])) + existing_dxf_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_dxf_elements_count throw(XLSXError("Wrong number of xf elements found: $existing_cellxf_elements_count. Expected $(parse(Int, xroot[i][j]["count"])).")) @@ -201,8 +201,8 @@ function Add_Cf_Dx(wb::Workbook, new_dx::XML.Node)::DxFormat end # Don't reuse duplicates here. Always create new! - existingdx = XML.children(xroot[i][j]) - dxfs = unlink(xroot[i][j], ("dxfs", "dxf"), pfx) # Create the new Node + existingdx = xml_elements(xroot[i][j]) + dxfs = unlink(xroot[i][j], ("dxfs", "dxf")) # Create the new Node if length(existingdx) > 0 for c in existingdx push!(dxfs, c) # Copy each existing into the new Node diff --git a/src/conditional-formats.jl b/src/conditional-formats.jl index 03226241..efc7c9e6 100644 --- a/src/conditional-formats.jl +++ b/src/conditional-formats.jl @@ -484,14 +484,18 @@ function getConditionalExtFormats(ws::Worksheet, allcfnodes::Vector{XML.Node}):: allcfs = Vector{Pair{CellRange,NamedTuple{(:type, :priority),Tuple{String,Int64}}}}() for cf in allcfnodes let t, p, r, rule = false, ref = false - @assert XML.tag(cf) == "x14:conditionalFormatting" "Something wrong here" - sqref = cf[end] - if XML.tag(sqref) == "xm:sqref" - r = XML.simple_value(sqref) + @assert localname(cf) == "conditionalFormatting" "Something wrong here" + # Children may be interleaved with whitespace text nodes from the + # writer's indentation, so locate `xm:sqref` by name rather than + # assuming it is the last child. + cf_elements = xml_elements(cf) + sqref = findlast(c -> localname(c) == "sqref", cf_elements) + if !isnothing(sqref) + r = XML.simple_value(cf_elements[sqref]) ref = true end - for child in XML.children(cf) - if XML.tag(child) == "x14:cfRule" + for child in cf_elements + if localname(child) == "cfRule" t = child["type"] if t != "dataBar" # This is the other half of a dataBar definition - don't count twice! p = parse(Int, child["priority"]) @@ -1455,16 +1459,16 @@ function setCfCellIs(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=()) if isnothing(value) value = all(ismissing.(ws[rng])) ? nothing : string(sum(skipmissing(ws[rng])) / count(!ismissing, ws[rng])) end - cfx = XML.Element("$(pfx)cfRule"; type="cellIs", dxfId=Int(dxid.id)) + cfx = XML.Element("$(pfx)cfRule"; type="cellIs", dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end cfx["operator"] = operator - push!(cfx, XML.Element("$(pfx)formula", XML.Text(XLSX.escape(value)))) + push!(cfx, XML.Element("$(pfx)formula", XML.Text(value))) if !isnothing(value2) && operator ∈ ["between", "notBetween"] - push!(cfx, XML.Element("$(pfx)formula", XML.Text(XLSX.escape(value2)))) + push!(cfx, XML.Element("$(pfx)formula", XML.Text(value2))) end update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1550,14 +1554,14 @@ function setCfContainsText(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,An end formula = replace(formula, "__txt__" => value, "__CR__" => string(first(rng))) - cfx = XML.Element("$(pfx)cfRule"; type=type, dxfId=Int(dxid.id)) + cfx = XML.Element("$(pfx)cfRule"; type=type, dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end cfx["operator"] = operator cfx["text"] = value - push!(cfx, XML.Element("$(pfx)formula", XML.Text(XLSX.escape(formula)))) + push!(cfx, XML.Element("$(pfx)formula", XML.Text(formula))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1627,7 +1631,7 @@ function setCfTop10(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=()): percent = "" bottom = "" - cfx = XML.Element("$(pfx)cfRule"; type="top10", dxfId=Int(dxid.id)) + cfx = XML.Element("$(pfx)cfRule"; type="top10", dxfId=string(dxid.id)) if operator == "topN" elseif operator == "topN%" percent = "1" @@ -1714,25 +1718,25 @@ function setCfAboveAverage(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,An dxid = Add_Cf_Dx(wb, new_dx) if operator == "aboveAverage" - cfx = XML.Element("$(pfx)cfRule"; type=operator, dxfId=Int(dxid.id), priority="1") + cfx = XML.Element("$(pfx)cfRule"; type=operator, dxfId=string(dxid.id), priority="1") elseif operator == "aboveEqAverage" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", equalAverage="1") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", equalAverage="1") elseif operator == "plus1StdDev" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", stdDev="1") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", stdDev="1") elseif operator == "plus2StdDev" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", stdDev="2") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", stdDev="2") elseif operator == "plus3StdDev" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", stdDev="3") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", stdDev="3") elseif operator == "belowAverage" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0") elseif operator == "belowEqAverage" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", equalAverage="1") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", equalAverage="1") elseif operator == "minus1StdDev" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", stdDev="1") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", stdDev="1") elseif operator == "minus2StdDev" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", stdDev="2") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", stdDev="2") elseif operator == "minus3StdDev" - cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=Int(dxid.id), priority="1", aboveAverage="0", stdDev="3") + cfx = XML.Element("$(pfx)cfRule"; type="aboveAverage", dxfId=string(dxid.id), priority="1", aboveAverage="0", stdDev="3") else throw(XLSXError("Invalid operator: $operator. Valid options are: `aboveAverage`, `aboveEqAverage`, `plus1sStdDev`, `plus2StdDev`, `plus3StdDev`, `belowAverage`, `belowEqAverage`, `minus1StdDev`, `minus2StdDev`, `minus3StdDev`.")) end @@ -1811,14 +1815,14 @@ function setCfTimePeriod(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any} new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("$(pfx)cfRule"; type="timePeriod", dxfId=Int(dxid.id)) + cfx = XML.Element("$(pfx)cfRule"; type="timePeriod", dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end cfx["timePeriod"] = operator - push!(cfx, XML.Element("$(pfx)formula", XML.Text(XLSX.escape(formula)))) + push!(cfx, XML.Element("$(pfx)formula", XML.Text(formula))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1895,12 +1899,12 @@ function setCfContainsBlankErrorUniqDup(ws::Worksheet, rng::CellRange; allkws::D new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("$(pfx)cfRule"; type=operator, dxfId=Int(dxid.id)) + cfx = XML.Element("$(pfx)cfRule"; type=operator, dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end - formula != "" && push!(cfx, XML.Element("$(pfx)formula", XML.Text(XLSX.escape(formula)))) + formula != "" && push!(cfx, XML.Element("$(pfx)formula", XML.Text(formula))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -1964,13 +1968,13 @@ function setCfFormula(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() new_dx = get_new_dx(wb, dx) dxid = Add_Cf_Dx(wb, new_dx) - cfx = XML.Element("$(pfx)cfRule"; type="expression", dxfId=Int(dxid.id)) + cfx = XML.Element("$(pfx)cfRule"; type="expression", dxfId=string(dxid.id)) cfx["priority"] = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 if !isnothing(stopIfTrue) && stopIfTrue == "true" cfx["stopIfTrue"] = "1" end - push!(cfx, XML.Element("$(pfx)formula", XML.Text("(" * XLSX.escape(uppercase_unquoted(formula)) * ")"))) + push!(cfx, XML.Element("$(pfx)formula", XML.Text("(" * uppercase_unquoted(formula) * ")"))) update_worksheet_cfx!(allcfs, cfx, ws, rng) @@ -2039,7 +2043,7 @@ function setCfColorScale(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any} let new_pr, new_cf - new_pr = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 + new_pr = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : "1" if isnothing(colorscale) @@ -2064,7 +2068,7 @@ function setCfColorScale(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any} do_sheet_names_match(ws, SheetCellRef(val)) val = string(SheetCellRef(val).cellref) end - val = XLSX.escape(uppercase_unquoted(val)) + val = uppercase_unquoted(val) end end @@ -2196,7 +2200,7 @@ function setCfIconSet(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() let new_pr, new_cf - new_pr = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : 1 + new_pr = length(old_cf) > 0 ? string(maximum([last(x).priority for x in values(old_cf)]) + 1) : "1" isnothing(min_type) || min_type in ["percentile", "percent", "num", "formula"] || throw(XLSXError("Invalid min_type: $min_type. Valid options are: percentile, percent, num, formula.")) (!isnothing(min_type) && min_type == "formula") || isnothing(min_val) || is_valid_fixed_cellname(min_val) || is_valid_fixed_sheet_cellname(min_val) || !isnothing(tryparse(Float64, min_val)) || throw(XLSXError("Invalid min_val: `$min_val`. Valid options (unless min_type is `formula`) are a CellRef (e.g. `\$A\$1`) or a number.")) @@ -2213,7 +2217,7 @@ function setCfIconSet(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() do_sheet_names_match(ws, SheetCellRef(val)) val = string(SheetCellRef(val).cellref) end - val = XLSX.escape(uppercase_unquoted(val)) + val = uppercase_unquoted(val) end end if !haskey(iconsets, iconset) @@ -2273,8 +2277,9 @@ function setCfIconSet(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() else c = XML.Element("xm:f", XML.Text(val)) end - if isnothing(XML.children(cfx[1][i+1])) - cfx[1][i+1] = XML.Node(cfx[1][i+1], c) + if isempty(XML.children(cfx[1][i+1])) + old = cfx[1][i+1] + cfx[1][i+1] = XML.Node{String}(XML.Element, XML.tag(old), old.attributes, nothing, XML.Node{String}[c]) else cfx[1][i+1][1] = c end @@ -2435,7 +2440,7 @@ function setCfDataBar(ws::Worksheet, rng::CellRange; allkws::Dict{Symbol,Any}=() do_sheet_names_match(ws, SheetCellRef(val)) val = string(SheetCellRef(val).cellref) end - val = XLSX.escape(uppercase_unquoted(val)) + val = uppercase_unquoted(val) end end if !haskey(databars, databar) diff --git a/src/formula.jl b/src/formula.jl index b029dd96..46328240 100644 --- a/src/formula.jl +++ b/src/formula.jl @@ -618,7 +618,7 @@ function process_dynamic_array_functions(xf::XLSXFile, cellref::CellRef, val::St ref = cellname(cellref) * ":" * cellname(cellref) cm = "1" if !haskey(xf.files, "xl/metadata.xml") # add metadata.xml on first use of a dynamicArray formula - xf.data["xl/metadata.xml"] = XML.Node(XML.Raw(copy(METADATA_XML_DATA))) + xf.data["xl/metadata.xml"] = parse(String(copy(METADATA_XML_DATA)), XML.Node) xf.files["xl/metadata.xml"] = true # set file as read add_override!(xf, "/xl/metadata.xml", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheetMetadata+xml") rId = add_relationship!(get_workbook(xf), "metadata.xml", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sheetMetadata") @@ -664,7 +664,7 @@ function get_external_workbook_path(xf::XLSXFile, id::Int) haskey(atts, "r:id") || throw(XLSXError("Something wrong here!")) rId = atts["r:id"] # now need a second lookup of this further r:id - altUrls = XML.children(xmlroot(xf, "xl/externalLinks/_rels/$(basename(rel)).rels")[end]) + altUrls = xml_elements(xml_root_element(xmlroot(xf, "xl/externalLinks/_rels/$(basename(rel)).rels"))) for c in altUrls atts = XML.attributes(c) if haskey(atts, "Id") && atts["Id"] == rId diff --git a/src/read.jl b/src/read.jl index af7b9ad9..35815c3a 100644 --- a/src/read.jl +++ b/src/read.jl @@ -185,7 +185,12 @@ function build_ns_dict!(xf::XLSXFile) ns = xf.namespace for (file_name, is_read) in xf.files if is_read - xroot = xmlroot(xf, file_name)[end] + doc = xmlroot(xf, file_name) + # Some files are stored as lightweight placeholders (e.g. the SST) or + # as Raw nodes with no element children; skip those for ns detection. + els = xml_elements(doc) + isempty(els) && continue + xroot = last(els) prefix = get_default_namespace_prefix(xroot) ns[file_name] = prefix end @@ -202,26 +207,50 @@ function get_prefix(file_name::String, xf::XLSXFile)::Union{Nothing,AbstractStri ns = get(xf.namespace, file_name, nothing) return something(ns, "") end +# Returns the prefix (possibly "") that maps to the default/spreadsheet namespace, +# or `nothing` if there is no prefixed default. Used by `build_ns_dict!` for the +# Strict-OOXML namespace-prefix feature. function get_default_namespace_prefix(r::XML.Node) - ns = get_default_namespace(r) - isnothing(ns) && return nothing - (prefix, _) = ns - return prefix=="" ? nothing : prefix # may be "" (default) or "x" or anything -end -function get_default_namespace(r::XML.Node) nss = get_namespaces(r) - length(nss) == 1 && return first(keys(nss)), first(values(nss)) - haskey(nss, "") && return "", nss[""] + isempty(nss) && return nothing + if length(nss) == 1 + prefix = first(keys(nss)) + return prefix == "" ? nothing : prefix + end + haskey(nss, "") && return nothing for (k, v) in nss if v == SPREADSHEET_NAMESPACE_XPATH_ARG - return k, v + return k == "" ? nothing : k end end return nothing end +# v0.4 contract: returns the default-namespace URI as a single `String`. +function get_default_namespace(r::XML.Node)::String + nss = get_namespaces(r) + + # if only one namespace is defined, assume it is the default one + # even if it has a prefix + length(nss) == 1 && return first(values(nss)) + + # otherwise, prefer the unprefixed default namespace + haskey(nss, "") && return nss[""] + + # no unprefixed default (e.g. issues #380/#362/#267/#170): fall back to the + # spreadsheet namespace even if it carries a prefix + for (_, ns) in nss + if ns == SPREADSHEET_NAMESPACE_XPATH_ARG + return ns + end + end + + throw(XLSXError("No default namespace found.")) +end function get_namespaces(r::XML.Node)::Dict{String,String} nss = Dict{String,String}() - for (key, value) in XML.attributes(r) + atts = XML.attributes(r) + isnothing(atts) && return nss # XML.jl 0.4 returns `nothing` for attribute-less nodes + for (key, value) in atts if startswith(key, "xmlns") colon_idx = findfirst(':', key) nss[isnothing(colon_idx) ? "" : SubString(key, colon_idx+1)] = value @@ -247,23 +276,26 @@ function is_strict_ooxml(xf::XLSXFile)::Bool # Primary check: conformance attribute on workbook root if haskey(files, "xl/workbook.xml") - wbNode = files["xl/workbook.xml"][end] + wbNode = xml_root_element(files["xl/workbook.xml"]) attrs = XML.attributes(wbNode) - if get(attrs, "conformance", "") == "strict" - return true - end - # Also catch strict namespace declarations on root element - if any(occursin("purl.oclc.org/ooxml", v) for v in values(attrs)) - return true + if !isnothing(attrs) + if get(attrs, "conformance", "") == "strict" + return true + end + # Also catch strict namespace declarations on root element + if any(occursin("purl.oclc.org/ooxml", v) for (_, v) in attrs) + return true + end end end # Fallback: check relationship types in _rels/.rels if haskey(files, "_rels/.rels") - rels = files["_rels/.rels"][end] - for el in XML.children(rels) + rels = xml_root_element(files["_rels/.rels"]) + for el in xml_elements(rels) if localname(el) == "Relationship" - if occursin("purl.oclc.org/ooxml", get(XML.attributes(el), "Type", "")) + relattrs = XML.attributes(el) + if !isnothing(relattrs) && occursin("purl.oclc.org/ooxml", get(relattrs, "Type", "")) return true end end @@ -540,6 +572,33 @@ end # Convert a strict OOXML file to transitional format in-place by remapping # `purl.oclc.org/ooxml` namespaces and relationship types to their # `schemas.openxmlformats.org` equivalents, and dropping the `conformance="strict"` attribute. +# Remap a single element's attributes from strict OOXML to transitional, in place. +# XML.jl 0.4 exposes attributes as a read-only `Attributes` view, so mutate the +# node directly (`node[k] = v`) and drop attributes via the backing vector. +function _strict_to_transitional_node!(node::XML.Node, filename::AbstractString) + isnothing(node.attributes) && return nothing + # Snapshot keys/values first since we mutate while inspecting. + pairs = collect(node.attributes) + for (k, v) in pairs + if k == "conformance" && v == "strict" + filter!(p -> first(p) != "conformance", node.attributes) + elseif startswith(v, "http://purl.oclc.org/ooxml") + if haskey(STRICT_TO_TRANSITIONAL, v) + node[k] = STRICT_TO_TRANSITIONAL[v] + else + throw(XLSXError("Unsupported strict OOXML namespace or relationship type: \"$v\" in $filename. Please open an issue at https://github.com/JuliaData/XLSX.jl/issues")) + end + elseif k == "Type" && startswith(v, "http://purl.oclc.org/ooxml") + if haskey(STRICT_TO_TRANSITIONAL, v) + node[k] = STRICT_TO_TRANSITIONAL[v] + else + throw(XLSXError("Unsupported strict OOXML relationship type: \"$v\" in $filename. Please open an issue at https://github.com/JuliaData/XLSX.jl/issues")) + end + end + end + return nothing +end + function convert_strict_to_transitional!(xf::XLSXFile, pass::Int) for filename in keys(xf.files) @@ -553,35 +612,16 @@ function convert_strict_to_transitional!(xf::XLSXFile, pass::Int) if should_process data = xf.data[filename] - xroot = data[end] - attrs = XML.attributes(xroot) - - for (k, v) in attrs - if k == "conformance" && v == "strict" - delete!(attrs, "conformance") - elseif startswith(v, "http://purl.oclc.org/ooxml") - if haskey(STRICT_TO_TRANSITIONAL, v) - attrs[k] = STRICT_TO_TRANSITIONAL[v] - else - throw(XLSXError("Unsupported strict OOXML namespace or relationship type: \"$v\" in $filename. Please open an issue at https://github.com/JuliaData/XLSX.jl/issues")) - end - end - end + els = xml_elements(data) + # SST/worksheet files are stored as lightweight placeholders with no + # element children; nothing to remap there. + isempty(els) && continue + xroot = last(els) + _strict_to_transitional_node!(xroot, filename) # For .rels files, also patch Type= on child Relationship elements - for el in XML.children(xroot) - el_attrs = XML.attributes(el) - if !isnothing(el_attrs) - haskey(el_attrs, "conformance") && delete!(el_attrs, "conformance") - type_val = get(el_attrs, "Type", "") - if startswith(type_val, "http://purl.oclc.org/ooxml") - if haskey(STRICT_TO_TRANSITIONAL, type_val) - el_attrs["Type"] = STRICT_TO_TRANSITIONAL[type_val] - else - throw(XLSXError("Unsupported strict OOXML relationship type: \"$type_val\" in $filename. Please open an issue at https://github.com/JuliaData/XLSX.jl/issues")) - end - end - end + for el in xml_elements(xroot) + _strict_to_transitional_node!(el, filename) end end end @@ -652,7 +692,7 @@ updating `xf.source` to use a `.xlsx` file extension. Throws an `XLSXError` if the workbook override is missing or has an unknown content type. """ function ensure_workbook_is_xlsx!(xf::XLSXFile) - root = xf.data["[Content_Types].xml"][end] + root = xml_root_element(xf.data["[Content_Types].xml"]) workbook_override = nothing default_xml_type = nothing @@ -737,14 +777,14 @@ function parse_relationships!(xf::XLSXFile) # package level relationships xroot = get_package_relationship_root(xf) - for el in XML.children(xroot) + for el in xml_elements(xroot) push!(xf.relationships, Relationship(wb, el)) end isempty(xf.relationships) && throw(XLSXError("Relationships not found in _rels/.rels!")) # workbook level relationships xroot = get_workbook_relationship_root(xf) - for el in XML.children(xroot) + for el in xml_elements(xroot) push!(wb.relationships, Relationship(wb, el)) end isempty(wb.relationships) && throw(XLSXError("Relationships not found in xl/_rels/workbook.xml.rels")) @@ -754,7 +794,7 @@ end # Updates xf.workbook from xf.data[\"xl/workbook.xml\"] function parse_workbook!(xf::XLSXFile) - xroot = xmlroot(xf, "xl/workbook.xml")[end] + xroot = xml_root_element(xmlroot(xf, "xl/workbook.xml")) wb = get_workbook(xf) localname(xroot) != "workbook" && throw(XLSXError("Malformed xl/workbook.xml. Root node name should be 'workbook'. Got '$(localname(xroot))'.")) @@ -777,9 +817,9 @@ function parse_workbook!(xf::XLSXFile) # sheets wb.sheets = Worksheet[] - for node in XML.children(xroot) + for node in xml_elements(xroot) localname(node) != "sheets" && continue - for sheet_node in XML.children(node) + for sheet_node in xml_elements(node) localname(sheet_node) != "sheet" && throw(XLSXError("Unsupported node $(localname(sheet_node)) in node $(localname(node)) in 'xl/workbook.xml'.")) push!(wb.sheets, Worksheet(xf, sheet_node)) end @@ -787,9 +827,9 @@ function parse_workbook!(xf::XLSXFile) end # named ranges - for node in XML.children(xroot) + for node in xml_elements(xroot) localname(node) != "definedNames" && continue - for dn_node in XML.children(node) + for dn_node in xml_elements(node) localname(dn_node) != "definedName" && continue raw = XML.value(dn_node[1]) @@ -852,7 +892,7 @@ function get_wb_ext_refs(xf::XLSXFile) xroot = xmlroot(xf, "xl/workbook.xml") i, j = get_idces(xroot, "workbook", "externalReferences") if !isnothing(j) - for (i, ref) in enumerate(XML.children(xroot[i][j])) + for (i, ref) in enumerate(xml_elements(xroot[i][j])) ext_refs[i] = ref["r:id"] end end @@ -862,9 +902,9 @@ end # delete Override PartName=calcChain since this was never loaded (#31) function remove_calcChain!(xf::XLSXFile) xf.data["[Content_Types].xml"] - ctype_root = xmlroot(xf, "[Content_Types].xml")[end] + ctype_root = xml_root_element(xmlroot(xf, "[Content_Types].xml")) for (i, c) in enumerate(XML.children(ctype_root)) - if c.tag == "Override" && haskey(c.attributes, "PartName") && c.attributes["PartName"]=="/xl/calcChain.xml" + if XML.tag(c) == "Override" && haskey(c, "PartName") && c["PartName"]=="/xl/calcChain.xml" deleteat!(ctype_root.children, i) break end @@ -901,6 +941,44 @@ function strip_bom_and_lf!(bytes::Vector{UInt8}) end end +function splitNode(xml_str::String, skipnode::String) + doc = parse(xml_str, XML.Node) + root = xml_root_element(doc) + + # Find the target node and extract its content (match on local name so + # namespace-prefixed files such as `` are handled, issue #380). + target_idx = nothing + for (i, child) in enumerate(XML.children(root)) + if localname(child) == skipnode + target_idx = i + break + end + end + + if isnothing(target_idx) + return doc, "" + end + + target = root[target_idx] + target_tag = XML.tag(target) # keep any namespace prefix on the placeholder + + # Build wrapper XML from the target element and its children + skipped = XML.write(target) + if skipnode == "sheetData" + skipped = "" * skipped * "" + end + + # Replace with empty self-closing element (preserving attributes and prefix) + attrs = XML.attributes(target) + if isnothing(attrs) || isempty(attrs) + root[target_idx] = XML.Element(target_tag) + else + empty!(target.children) + end + + return doc, skipped +end + function skipNode(r::XML.Raw, skipnode::String) # separate rows or ssts to speed up reading of large files # Resolve wrapper strings once upfront prefix, suffix = if skipnode == "sheetData" @@ -908,7 +986,7 @@ function skipNode(r::XML.Raw, skipnode::String) # separate rows or ssts to speed elseif skipnode == "sst" "", "" else - throw(XLSXError("Unknown skipnode $skipnode.")) + throw(XLSXError("Unsupported skipNode target: $skipnode.")) end data_len = length(r.data) @@ -948,8 +1026,9 @@ function stream_files(xf::XLSXFile, zip_io::ZipArchives.ZipReader; pass::Int, ch # ignore xl/calcChain.xml in any case (#31) if f != "xl/calcChain.xml" - if pass==1 && (endswith(f, ".xml") || endswith(f, ".rels")) - # Identify usable xml files in XLSXFile + if pass==1 && !startswith(f, "customXml") && (endswith(f, ".xml") || endswith(f, ".rels")) + # Identify usable xml files in XLSXFile (customXml parts are + # kept as raw binary, see `process_file`). internal_xml_file_add!(xf, f) end put!(out, f) @@ -1003,7 +1082,8 @@ function load_files!(xf::XLSXFile, zip_io::ZipArchives.ZipReader; pass::Int) rid = get_relationship_id_by_target(wb, file.name) for sheet in wb.sheets if sheet.relationship_id == rid - first_cache_fill!(sheet, XML.LazyNode(file.raw), Threads.nthreads()) + lznode = parse(file.raw, XML.LazyNode) + first_cache_fill!(sheet, lznode, Threads.nthreads()) end end end @@ -1037,19 +1117,20 @@ function process_file(zip_io::ZipArchives.ZipReader, filename::String) try bytes = ZipArchives.zip_readentry(zip_io, filename) - if (endswith(filename, ".xml") || endswith(filename, ".rels")) - if occursin(r"^xl/worksheets/[^/]+\.xml$|^xl/sharedStrings\.xml$", filename) - strip_bom_and_lf!(bytes) - skipnode = filename == "xl/sharedStrings.xml" ? "sst" : "sheetData" - f, s = skipNode(XML.Raw(bytes), skipnode) # and elements can be very numerous in large files, so split out and keep as Raw XML data for speed - node = XML.Node(XML.Raw(f)) - raw = XML.Raw(s) + if !startswith(filename, "customXml") && (endswith(filename, ".xml") || endswith(filename, ".rels")) + strip_bom_and_lf!(bytes) + xml_str = String(bytes) + if filename == "xl/sharedStrings.xml" + node = XML.Element("sst") # placeholder; SST is loaded via sst_load! + raw = xml_str + elseif occursin(r"xl/worksheets/sheet\d+\.xml", filename) + node, raw = splitNode(xml_str, "sheetData") + raw = isempty(raw) ? nothing : raw else - strip_bom_and_lf!(bytes) - node = XML.Node(XML.Raw(bytes)) + node = parse(xml_str, XML.Node) end else - bin = bytes + bin = bytes end catch err throw(XLSXError("Failed to parse internal XML file `$filename`")) @@ -1073,12 +1154,13 @@ function internal_xml_file_read(xf::XLSXFile, zip_io::Union{Nothing,ZipArchives. try bytes = ZipArchives.zip_readentry(zip_io, filename) strip_bom_and_lf!(bytes) - if occursin(r"^xl/worksheets/[^/]+\.xml$|^xl/sharedStrings\.xml$", filename) - skipnode = filename == "xl/sharedStrings.xml" ? "sst" : "sheetData" - f, _ = skipNode(XML.Raw(bytes), skipnode) # and elements can be very numerous in large files, so split out and keep as Raw XML data for speed - xf.data[filename] = XML.Node(XML.Raw(f)) + xml_str = String(bytes) + if filename == "xl/sharedStrings.xml" + xf.data[filename] = XML.Element("sst") # placeholder; SST is loaded via sst_load! + elseif occursin(r"xl/worksheets/sheet\d+\.xml", filename) + xf.data[filename], _ = splitNode(xml_str, "sheetData") else - xf.data[filename] = XML.Node(XML.Raw(bytes)) + xf.data[filename] = parse(xml_str, XML.Node) end xf.files[filename] = true # set file as read catch err diff --git a/src/relationship.jl b/src/relationship.jl index d4cb3ea8..4c971e82 100644 --- a/src/relationship.jl +++ b/src/relationship.jl @@ -58,8 +58,8 @@ function has_relationship_by_type(wb::Workbook, _type_::String)::Bool end function get_package_relationship_root(xf::XLSXFile)::XML.Node - xroot = xmlroot(xf, "_rels/.rels")[end] - localname(xroot) != "Relationships" && throw(XLSXError("Malformed XLSX file $(xf.source). _rels/.rels root node name should be `Relationships`. Found $(XML.tag(xroot)).")) + xroot = xml_root_element(xmlroot(xf, "_rels/.rels")) + XML.tag(xroot) != "Relationships" && throw(XLSXError("Malformed XLSX file $(xf.source). _rels/.rels root node name should be `Relationships`. Found $(XML.tag(xroot)).")) if ("" => "http://schemas.openxmlformats.org/package/2006/relationships") ∉ get_namespaces(xroot) throw(XLSXError("Unexpected namespace at workbook relationship file: `$(get_namespaces(xroot))`.")) end @@ -67,8 +67,8 @@ function get_package_relationship_root(xf::XLSXFile)::XML.Node end function get_workbook_relationship_root(xf::XLSXFile)::XML.Node - xroot = xmlroot(xf, "xl/_rels/workbook.xml.rels")[end] - localname(xroot) != "Relationships" && throw(XLSXError("Malformed XLSX file $(xf.source). xl/_rels/workbook.xml.rels root node name should be `Relationships`. Found $(localname(xroot)).")) + xroot = xml_root_element(xmlroot(xf, "xl/_rels/workbook.xml.rels")) + XML.tag(xroot) != "Relationships" && throw(XLSXError("Malformed XLSX file $(xf.source). xl/_rels/workbook.xml.rels root node name should be `Relationships`. Found $(XML.tag(xroot)).")) if ("" => "http://schemas.openxmlformats.org/package/2006/relationships") ∉ get_namespaces(xroot) throw(XLSXError("Unexpected namespace at workbook relationship file: `$(get_namespaces(xroot))`.")) end @@ -98,15 +98,17 @@ function delete_relationships!(xf::XLSXFile, rel::Relationship) #TODO renumber worksheet files in relationships - if necessary. xroot = xmlroot(xf, "xl/_rels/workbook.xml.rels") + root_el = xml_root_element(xroot) - c=XML.children(xroot[end]) - d = findfirst(r -> r["Target"] == rel.Target, c) + c=XML.children(root_el) + d = findfirst(r -> XML.nodetype(r) == XML.Element && r["Target"] == rel.Target, c) deleteat!(c, d) new_rels=XML.Element("Relationships", xmlns="http://schemas.openxmlformats.org/package/2006/relationships") - for child in c + for child in xml_elements(root_el) push!(new_rels, child) end - xroot[end]=new_rels + root_idx = findfirst(n -> XML.nodetype(n) == XML.Element, XML.children(xroot)) + xroot[root_idx]=new_rels xf.data["xl/_rels/workbook.xml.rels"]=xroot end @@ -114,10 +116,10 @@ end #is_chartsheet(wb::Workbook, rid::String) = any(r.Id == rid && occursin("chartsheet", r.Type) for r in wb.relationships) function is_chartsheet(wb::Workbook, sheetname::AbstractString)::Bool name = unquoteit(sheetname) - xroot = get_xlsxfile(wb).data["xl/workbook.xml"][end] - for node in XML.children(xroot) + xroot = xml_root_element(get_xlsxfile(wb).data["xl/workbook.xml"]) + for node in xml_elements(xroot) localname(node) != "sheets" && continue - for sheet_node in XML.children(node) + for sheet_node in xml_elements(node) attrs = XML.attributes(sheet_node) isnothing(attrs) && continue get(attrs, "name", "") == name || continue diff --git a/src/sst.jl b/src/sst.jl index 554420c2..60f4933e 100644 --- a/src/sst.jl +++ b/src/sst.jl @@ -1,5 +1,5 @@ -SharedStringTable() = SharedStringTable(Vector{String}(), Dict{String, Int64}(), false) +SharedStringTable() = SharedStringTable(Vector{String}(), Vector{String}(), Dict{String, Int64}(), false) @inline get_sst(wb::Workbook) = wb.sst @inline get_sst(xl::XLSXFile) = get_sst(get_workbook(xl)) @@ -24,8 +24,8 @@ function create_new_sst(wb::Workbook, sst::SharedStringTable) add_relationship!(wb, "sharedStrings.xml", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings") # add Content Type - ctype_root = xmlroot(get_xlsxfile(wb), "[Content_Types].xml")[end] - localname(ctype_root) != "Types" && throw(XLSXError("Something wrong here!")) + ctype_root = xml_root_element(xmlroot(get_xlsxfile(wb), "[Content_Types].xml")) + XML.tag(ctype_root) != "Types" && throw(XLSXError("Something wrong here!")) override_node = XML.Element("Override"; ContentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml", PartName = "/xl/sharedStrings.xml" @@ -45,7 +45,7 @@ function add_to_sst!(ss::SharedStringTable, si_xml::String)::Int64 # No match found, add new entry new_idx = length(ss.shared_strings) # 0-based index push!(ss.shared_strings, si_xml) - + push!(ss.unformatted, unformatted_text(parse(si_xml, XML.LazyNode))) ss.index[si_xml] = new_idx return new_idx @@ -84,7 +84,7 @@ function add_formatted_string!(wb::Workbook, str_formatted::String; mylock::Unio end # check if unformatted shared string needs xml:space="preserve" -needs_preserve(s::String) = startswith(s, ' ') || endswith(s, ' ') || contains(s, '\n') || contains(s, " ") +needs_preserve(s::AbstractString) = startswith(s, ' ') || endswith(s, ' ') || contains(s, '\n') || contains(s, " ") # allow to write cells containing only whitespace characters or with leading or trailing whitespace. function add_shared_string!(wb::Workbook, str_unformatted::AbstractString; mylock::Union{Nothing,ReentrantLock}=nothing) :: Int @@ -104,103 +104,28 @@ function add_shared_string!(wb::Workbook, str_unformatted::AbstractString; myloc end function sst_load!(workbook::Workbook) - chunksize = ROW_CHUNKSIZE sst = get_sst(workbook) if !sst.is_loaded - - relationship_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" - if has_relationship_by_type(workbook, relationship_type) - sst_chan = stream_ssts(open_internal_file_stream(get_xlsxfile(workbook), "xl/sharedStrings.xml")[end], chunksize) - load_sst_table!(workbook, sst_chan, Threads.nthreads()) - init_sst_index(sst) - - return - end - - throw(XLSXError("Shared Strings Table not found for this workbook.")) - end -end -@inline _is_tag(n::String, tag::String) = n == tag -@inline _is_tag(n::Nothing, tag::String) = false - function produce_sstchunks(out, n, ssts, chunksize) - i = 0 # Position within current chunk - global_idx = 0 # Global position in SST table - - while !isnothing(n) - if _is_tag(localname(n), "si") - i += 1 - global_idx += 1 - ssts[i] = SstToken(n, global_idx) # ← Use global index - end - if i >= chunksize - put!(out, copy(ssts)) - i = 0 # Reset chunk position, but global_idx keeps going + has_sst(workbook) || throw(XLSXError("Shared Strings Table not found for this workbook.")) + doc = open_internal_file_stream(get_xlsxfile(workbook), "xl/sharedStrings.xml") + sst_root = xml_root_element(doc) + empty!(sst.shared_strings) + empty!(sst.unformatted) + uc = get(sst_root, "uniqueCount", nothing) + if !isnothing(uc) + n = parse(Int, uc) + sizehint!(sst.shared_strings, n) + sizehint!(sst.unformatted, n) end - n = XML.next(n) - end - if i > 0 - put!(out, copy(ssts[1:i])) - end -end - -function stream_ssts(n::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) - n = XML.next(n) - ssts = Vector{SstToken}(undef, chunksize) - Channel{Vector{SstToken}}(channel_size) do out - produce_sstchunks(out, n, ssts, chunksize) - end -end - -function process_sst(wb, sst::SstToken) - el = sst.n - i = sst.idx - - if XML.nodetype(el) != XML.Text - localname(el) != "si" && throw(XLSXError("Unsupported node $(localname(el)) in sst table.")) - sst = Sst(XML.write(el), i) - return sst - - end - -end - - function load_sst_table!(wb::Workbook, chan::Channel, nthreads::Int) - sst_table = get_sst(wb) - sst_table.is_loaded = true - sst_results = Channel{Vector{Sst}}(1 << 8) - all_ssts = Vector{Tuple{Int,Sst}}() - - consumer = @async begin - for ssts in sst_results - for sst in ssts - push!(all_ssts, (sst.idx, sst)) - end - end - sort!(all_ssts, by = x -> x[1]) - - empty!(sst_table.shared_strings) - empty!(sst_table.index) - - for (i, sst) in all_ssts - push!(sst_table.shared_strings, sst.formatted) - sst_table.index[sst.formatted] = i - 1 # 0-based - end - - end - - # Producer tasks - @sync for _ in 1:nthreads - Threads.@spawn begin - for ssts in chan - # ssts is already a chunk - just process it - processed = [process_sst(wb, tok) for tok in ssts] - put!(sst_results, processed) + for child in XML.eachchildnode(sst_root) + if XML.nodetype(child) == XML.Element && localname(child) == "si" + push!(sst.shared_strings, XML.write(child)) + push!(sst.unformatted, unformatted_text(child)) end end + sst.is_loaded = true + init_sst_index(sst) end - - close(sst_results) - wait(consumer) end # Checks whether this workbook has a Shared String Table. @@ -212,35 +137,39 @@ end # Helper function to gather unformatted text from Excel data files. # It looks at all children of `el` for tag name `t` and returns # a join of all the strings found. -function unformatted_text(wb::Workbook, el::XML.LazyNode) :: String +function unformatted_text(el::XML.LazyNode) :: String io = IOBuffer() - gather_strings!(wb, io, el) - s = XLSX.unescape(String(take!(io))) - return s + gather_strings!(io, el) + # XML.jl 0.4 `LazyNode` already unescapes entity references in `XML.value`, + # so no extra unescape is needed here. + return String(take!(io)) end -function gather_strings!(wb::Workbook, io::IOBuffer, e::XML.LazyNode) +# 2-arg form retained for call sites that thread the workbook (e.g. inlineStr cells). +unformatted_text(::Workbook, el::XML.LazyNode) :: String = unformatted_text(el) + +function gather_strings!(io::IOBuffer, e::XML.LazyNode) + # Use `localname` (not `XML.tag`) so prefixed elements such as `` in + # files without a default namespace are matched the same as ``. tag = localname(e) # Skip phonetic hints entirely tag == "rPh" && return nothing if tag == "t" - for c in XML.children(e) - if XML.nodetype(c) == XML.Text - val = XML.is_simple(c) ? XML.simple_value(c) : XML.value(c) - !isnothing(val) && write(io, val) + # `` may carry `xml:space="preserve"` so it is not always "simple"; + # collect any Text/CData children directly. + for ch in XML.children(e) + nt = XML.nodetype(ch) + if nt === XML.Text || nt === XML.CData + v = XML.value(ch) + isnothing(v) || write(io, v) end end - - # Fallback for truly empty - if isempty(XML.children(e)) - val = XML.is_simple(e) ? XML.simple_value(e) : XML.value(e) - !isnothing(val) && write(io, val) - end else + # Recurse into children for all other tags for ch in XML.children(e) - gather_strings!(wb, io, ch) + gather_strings!(io, ch) end end @@ -251,8 +180,7 @@ end # `index` starts at 0. @inline function sst_unformatted_string(wb::Workbook, index::Int64)::String sst_load!(wb) - uss = get_sst(wb).shared_strings[index+1] - return unformatted_text(wb, parse(XML.LazyNode, uss)) + return get_sst(wb).unformatted[index+1] end @inline sst_unformatted_string(xl::XLSXFile, index::Int64) :: String = sst_unformatted_string(get_workbook(xl), index) @@ -683,9 +611,9 @@ function resolve_theme_color(theme_index::Int, tint::Float64) end # Create a RichTextString from a shared string with multiple runs (or nothing if a simple text) -function getRichTextString(wb::Workbook, xml_string::String)::Union{RichTextString, Nothing} - doc = parse(XML.Node, xml_string) - si = doc[end] +function getRichTextString(::Workbook, xml_string::String)::Union{RichTextString, Nothing} + doc = parse(xml_string, XML.Node) + si = xml_root_element(doc) # Check for rich text runs elements runs = [child for child in XML.children(si) if localname(child) == "r"] diff --git a/src/stream.jl b/src/stream.jl index 8cf1a03b..d55967cf 100644 --- a/src/stream.jl +++ b/src/stream.jl @@ -45,91 +45,89 @@ The iterator element is a SheetRow. @inline function open_internal_file_stream(xf::XLSXFile, filename::String) :: XML.LazyNode !internal_xml_file_exists(xf, filename) && throw(XLSXError("Couldn't find $filename in $(xf.source).")) -# if xf.use_cache_for_sheet_data || (xf.source isa IO) if xf.source isa IO seekstart(xf.source) zip_io = ZipArchives.ZipReader(read(xf.source)) else - zip_io = ZipArchives.ZipReader(FileArray(abspath(xf.source))) # FileArray is marginally slower than mmap -# zip_io = ZipArchives.ZipReader(Mmap.mmap(abspath(xf.source))) # but Mmap is unreliable : https://discourse.julialang.org/t/struggling-to-use-mmap-with-ziparchives/129839 + zip_io = ZipArchives.ZipReader(FileArray(abspath(xf.source))) end - return XML.LazyNode(XML.Raw(ZipArchives.zip_readentry(zip_io, filename))) + return parse(String(ZipArchives.zip_readentry(zip_io, filename)), XML.LazyNode) end +# Collect all row LazyNodes from a worksheet's sheetData element. +function _collect_row_nodes(doc::XML.LazyNode) + root = xml_root_element(doc) + localname(root) != "worksheet" && throw(XLSXError("Expecting to find a worksheet node. Found a $(localname(root)).")) + + # Find sheetData + sheetdata = nothing + for child in XML.children(root) + if localname(child) == "sheetData" + sheetdata = child + break + end + end + sheetdata === nothing && throw(XLSXError("No `sheetData` node found in worksheet")) + + # Collect row nodes + return XML.LazyNode[child for child in XML.children(sheetdata) if localname(child) == "row"] +end + # Creates an iterator for row elements in the Worksheet's XML. function Base.iterate(itr::SheetRowStreamIterator) ws = get_worksheet(itr) wb = get_workbook(ws) target_file = get_relationship_target_by_id("xl", get_workbook(ws), ws.relationship_id) - sheetnode = open_internal_file_stream(get_xlsxfile(ws), target_file) # worksheet target files are LazyNodes + doc = open_internal_file_stream(get_xlsxfile(ws), target_file) sst_pfx = get_sst_prefix(ws) - length(sheetnode) <= 0 && throw(XLSXError("Couldn't open reader for Worksheet $(ws.name).")) - localname(sheetnode[end]) != "worksheet" && throw(XLSXError("Expecting to find a worksheet node: Found a $(localname(sheetnode[end])).")) - - sheetnode=XML.next(sheetnode) - - while localname(sheetnode) != "sheetData" # Check for `sheetData` - sheetnode = XML.next(sheetnode) - sheetnode === nothing && throw(XLSXError("No `sheetData` node found in worksheet")) - end - - XML.depth(sheetnode) != 2 && throw(XLSXError("Malformed Worksheet \"$(ws.name)\": unexpected node depth for sheetData node: $(XML.depth(sheetnode)).")) - - rownode=XML.next(sheetnode) + length(doc) <= 0 && throw(XLSXError("Couldn't open reader for Worksheet $(ws.name).")) - while localname(rownode) != "row" # Check for at least one `row` - rownode = XML.next(rownode) - rownode === nothing && return nothing # no rows found - end + row_nodes = _collect_row_nodes(doc) + isempty(row_nodes) && return nothing - # rownode is now the first row - a = XML.attributes(rownode) # get row number and row height (if specified) + # Process first row + rownode = row_nodes[1] + a = XML.attributes(rownode) current_row = parse(Int, a["r"]) current_row_ht = haskey(a, "ht") ? parse(Float64, a["ht"]) : nothing - # collect all cells in this row rowcells = Dict{Int, Cell}() - mylock=ReentrantLock() - next_rownode, sst_count = get_rowcells!(rowcells, rownode, ws, sst_pfx; mylock) # update rowcells in place - + mylock = ReentrantLock() + _, sst_count = get_rowcells!(rowcells, rownode, ws, sst_pfx; mylock) itr.sheet.sst_count += sst_count - sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) # create the sheet_row - - # debug -# @assert sheetnode.raw.data == next_rownode.raw.data "LazyNode data don't match" + sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) - return sheet_row, SheetRowStreamIteratorState(next_rownode, rowcells, mylock) + return sheet_row, SheetRowStreamIteratorState(row_nodes, 2, rowcells, mylock) end function Base.iterate(itr::SheetRowStreamIterator, state::SheetRowStreamIteratorState) ws = get_worksheet(itr) - rownode = state.next_rownode rowcells = state.rowcells mylock = state.lock sst_pfx = get_sst_prefix(ws) - empty!(rowcells) - if rownode === nothing # there is no next_rownode - all rows processed + if state.row_index > length(state.row_nodes) return nothing end - # get row number and row height (if specified) + rownode = state.row_nodes[state.row_index] + state.row_index += 1 + a = XML.attributes(rownode) current_row = parse(Int, a["r"]) current_row_ht = haskey(a, "ht") ? parse(Float64, a["ht"]) : nothing - # collect all cells in this row - next_rownode, sst_count = get_rowcells!(rowcells, rownode, ws, sst_pfx; mylock) # update rowcells in place + _, sst_count = get_rowcells!(rowcells, rownode, ws, sst_pfx; mylock) itr.sheet.sst_count += sst_count - sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) # create the sheet_row + sheet_row = SheetRow(ws, current_row, current_row_ht, rowcells) - return sheet_row, SheetRowStreamIteratorState(next_rownode, rowcells, mylock) + return sheet_row, state end # @@ -303,43 +301,54 @@ end Base.length(r::WorksheetCache)=length(r.cells) #--------------------------------------------------------------------- Fill cache on first read (multi-threaded) -function produce_rowchunks!(out, n, rows, chunksize) - pos=0 - while !isnothing(n) - if _is_tag(localname(n), "row") +function produce_rowchunks!(out, sheetdata::XML.LazyNode, rows, chunksize) + pos = 0 + for child in XML.children(sheetdata) + if localname(child) == "row" pos += 1 - rows[pos] = n + rows[pos] = child end if pos >= chunksize put!(out, copy(rows)) - pos=0 + pos = 0 end - n = XML.next(n) end - if pos>0 # handle last incomplete chunk + if pos > 0 put!(out, copy(@view rows[1:pos])) end end -function stream_rows(n::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) +function stream_rows(sheetdata::XML.LazyNode, chunksize::Int; channel_size::Int=1 << 8) rows = Vector{XML.LazyNode}(undef, chunksize) Channel{Vector{XML.LazyNode}}(channel_size) do out - produce_rowchunks!(out, n, rows, chunksize) + produce_rowchunks!(out, sheetdata, rows, chunksize) end end +const _EMPTY_ROW_ATTRS = Dict{String,String}() + function process_row(row::XML.LazyNode, handled_attributes::Set{String}, ws::Worksheet, sst_pfx::String, mylock::ReentrantLock) - unhandled_attributes = Dict{String,String}() - current_row_ht = nothing # initialise here - row_num = nothing # initialise here + current_row_ht::Union{Float64,Nothing} = nothing + row_num::Union{Int,Nothing} = nothing + unhandled_attributes = _EMPTY_ROW_ATTRS atts = XML.attributes(row) - isnothing(atts) && return nothing # skip rows with no attributes - - current_row_ht = haskey(atts, "ht") ? parse(Float64, atts["ht"]) : nothing - row_num = haskey(atts, "r") ? parse(Int, atts["r"]) : nothing + if !isnothing(atts) + for (k, v) in atts + if k == "r" + row_num = parse(Int, v) + elseif k == "ht" + current_row_ht = parse(Float64, v) + end + if !(k in handled_attributes) + if unhandled_attributes === _EMPTY_ROW_ATTRS + unhandled_attributes = Dict{String,String}() + end + unhandled_attributes[String(k)] = String(v) + end + end + end row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) - unhandled_attributes = Dict(filter(attr -> !in(first(attr), handled_attributes), atts)) rowcells = Dict{Int,Cell}() _, sst_count = get_rowcells!(rowcells, row, ws, sst_pfx; mylock) @@ -352,15 +361,15 @@ function first_cache_fill!(ws::Worksheet, lznode::XML.LazyNode, nthreads::Int) handled_attributes = Set{String}(["r", "spans", "ht", "customHeight"]) unhandled_attributes = Dict{Int,Dict{String,String}}() sst_pfx = get_sst_prefix(ws) - + if ws.cache === nothing ws.cache = WorksheetCache(ws) else throw(XLSXError("Expecting empty cache but cache not empty!")) end - + sheet_rows = Channel{Vector{Tuple{Int, SheetRow, Dict{String,String}}}}(1 << 8) - + consumer = @async begin sst_total = 0 for rows in sheet_rows @@ -375,10 +384,21 @@ function first_cache_fill!(ws::Worksheet, lznode::XML.LazyNode, nthreads::Int) ws.sst_count = sst_total ws.unhandled_attributes = isempty(unhandled_attributes) ? nothing : unhandled_attributes end - - streamed_rows = stream_rows(lznode, chunksize) + + # Navigate to sheetData element + root = xml_root_element(lznode) + sheetdata = nothing + for child in XML.children(root) + if localname(child) == "sheetData" + sheetdata = child + break + end + end + sheetdata === nothing && throw(XLSXError("No `sheetData` node found in worksheet")) + + streamed_rows = stream_rows(sheetdata, chunksize) mylock = ReentrantLock() - + @sync for _ in 1:nthreads Threads.@spawn begin for rows in streamed_rows @@ -387,7 +407,7 @@ function first_cache_fill!(ws::Worksheet, lznode::XML.LazyNode, nthreads::Int) end end end - + close(sheet_rows) wait(consumer) @@ -398,40 +418,33 @@ end # Materialise specific rows from a worksheet.xml file into SheetRows # (faster than using eachrow which materialises every row). function match_rows(ws::Worksheet, rows_to_match::Vector{Int})::Vector{SheetRow} - matched_rows=Vector{SheetRow}() + matched_rows = Vector{SheetRow}() sst_pfx = get_sst_prefix(ws) sort!(rows_to_match) - i=1 - l=length(rows_to_match) - + i = 1 + l = length(rows_to_match) + target_file = get_relationship_target_by_id("xl", get_workbook(ws), ws.relationship_id) - lznode = open_internal_file_stream(get_xlsxfile(ws), target_file) - - n = XML.next(lznode) - mylock=ReentrantLock() - while !isnothing(n) - if localname(n) == "row" # find each row - atts = XML.attributes(n) - if !isnothing(atts) - row_num = haskey(atts, "r") ? parse(Int, atts["r"]) : nothing - end - row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) - if !isnothing(row_num) && row_num == rows_to_match[i] # process matching rows into SheetRows - current_row_ht = haskey(atts, "ht") ? parse(Float64, atts["ht"]) : nothing - - # Process cells - rowcells = Dict{Int,Cell}() - n, _ = get_rowcells!(rowcells, n, ws, sst_pfx; mylock) - - sheetrow = SheetRow(ws, row_num, current_row_ht, rowcells) - push!(matched_rows, sheetrow) - i+=1 - i>l && break # stop once all rows matched - continue - end + doc = open_internal_file_stream(get_xlsxfile(ws), target_file) + row_nodes = _collect_row_nodes(doc) + + mylock = ReentrantLock() + for n in row_nodes + atts = XML.attributes(n) + row_num = !isnothing(atts) && haskey(atts, "r") ? parse(Int, atts["r"]) : nothing + row_num === nothing && throw(XLSXError("Row without 'r' attribute encountered in worksheet $(ws.name).")) + if row_num == rows_to_match[i] + current_row_ht = haskey(atts, "ht") ? parse(Float64, atts["ht"]) : nothing + + rowcells = Dict{Int,Cell}() + get_rowcells!(rowcells, n, ws, sst_pfx; mylock) + + sheetrow = SheetRow(ws, row_num, current_row_ht, rowcells) + push!(matched_rows, sheetrow) + i += 1 + i > l && break end - n=XML.next(n) end return matched_rows diff --git a/src/styles.jl b/src/styles.jl index e53f0479..d051cedd 100644 --- a/src/styles.jl +++ b/src/styles.jl @@ -80,11 +80,10 @@ function styles_xmlroot(workbook::Workbook) styles_root = xmlroot(get_xlsxfile(workbook), styles_target) # check root node name for styles.xml - (_, uri) = get_default_namespace(styles_root[end]) - if uri != SPREADSHEET_NAMESPACE_XPATH_ARG - throw(XLSXError("Unsupported styles XML namespace $(get_default_namespace(styles_root[end])).")) + if get_default_namespace(xml_root_element(styles_root)) != SPREADSHEET_NAMESPACE_XPATH_ARG + throw(XLSXError("Unsupported styles XML namespace $(get_default_namespace(xml_root_element(styles_root))).")) end - localname(styles_root[end]) != "styleSheet" && throw(XLSXError("Malformed package. Expected root node named `styleSheet` in `styles.xml`.")) + localname(xml_root_element(styles_root)) != "styleSheet" && throw(XLSXError("Malformed package. Expected root node named `styleSheet` in `styles.xml`.")) workbook.styles_xroot = styles_root else throw(XLSXError("Styles not found for this workbook.")) @@ -138,11 +137,11 @@ function styles_add_numFmt(wb::Workbook, format_code::AbstractString)::Integer numfmts = numfmts[1] end - existing_numFmt_elements_count = length(XML.children(numfmts)) + existing_numFmt_elements_count = length(xml_elements(numfmts)) fmt_code = existing_numFmt_elements_count + PREDEFINED_NUMFMT_COUNT new_fmt = XML.Element("numFmt"; - numFmtId=fmt_code, - formatCode=XLSX.escape(format_code) + numFmtId=string(fmt_code), + formatCode=format_code ) push!(numfmts, new_fmt) return fmt_code @@ -169,7 +168,7 @@ const DATETIME_CODES = ["d", "m", "yy", "h", "s", "a/p", "am/pm"] function remove_formatting(code) # this regex should cover all the formatting cases found here(colors/conditionals/quotes/spacing): # https://support.office.com/en-us/article/create-or-delete-a-custom-number-format-78f2a361-936b-4c03-8772-09fab54be7f4 - ignoredformatting = r"""\[.{2,}?\]|".+?"|_.|\\.|\*."""x # Had to add ? to "".+"" to make it work. Don't understand what made this necessary! + ignoredformatting = r"""\[.{2,}?\]|".+?"|_.|\\.|\*."""x replace(code, ignoredformatting => "") end @@ -282,7 +281,7 @@ function styles_get_cellXf_with_numFmtId(allXfNodes::Vector{XML.Node}, numFmtId: end function styles_add_cell_xf(wb::Workbook, attributes::Dict{String,String})::CellDataFormat - new_xf = XML.Node(XML.Element, "xf", OrderedDict{String,String}(), nothing, nothing) + new_xf = XML.Node{String}(XML.Element, "xf", Pair{String,String}[], nothing, nothing) for k in keys(attributes) new_xf[k] = attributes[k] end @@ -292,12 +291,12 @@ end function styles_add_cell_xf(wb::Workbook, new_xf::XML.Node)::CellDataFormat xroot = styles_xmlroot(wb) i, j = get_idces(xroot, "styleSheet", "cellXfs") - existing_cellxf_elements_count = length(XML.children(xroot[i][j])) + existing_cellxf_elements_count = length(xml_elements(xroot[i][j])) if parse(Int, xroot[i][j]["count"]) != existing_cellxf_elements_count throw(XLSXError("Wrong number of xf elements found: $existing_cellxf_elements_count. Expected $(parse(Int, xroot[i][j]["count"])).")) end # Check new_xf doesn't duplicate any existing xf. If yes, use that rather than create new. - for (k, node) in enumerate(XML.children(xroot[i][j])) + for (k, node) in enumerate(xml_elements(xroot[i][j])) if node == new_xf return CellDataFormat(k - 1) # CellDataFormat is zero-indexed end diff --git a/src/types.jl b/src/types.jl index 9a17f735..c5268da6 100644 --- a/src/types.jl +++ b/src/types.jl @@ -351,7 +351,8 @@ Implementations: SheetRowStreamIterator, WorksheetCache. abstract type SheetRowIterator end mutable struct SheetRowStreamIteratorState - next_rownode::Union{Nothing, XML.LazyNode} # Worksheet row being processed + row_nodes::Vector{XML.LazyNode} # All row nodes from sheetData + row_index::Int # Index of the next row to process rowcells::Dict{Int,Cell} lock::ReentrantLock end @@ -410,17 +411,10 @@ end #------------------------------------------------------------------------------ sharedStrings mutable struct SharedStringTable shared_strings::Vector{String} + unformatted::Vector{String} index::Dict{String, Int64} # for search optimisation. Tuple of indices to handle hash collisions. is_loaded::Bool end -struct SstToken - n::XML.LazyNode - idx::Int -end -struct Sst - formatted::String - idx::Int -end const ValidRichTextAttributes = [:bold, :italic, :under, :strike, :vertAlign, :color, :size, :name] @@ -599,7 +593,7 @@ end struct ReadFile node::Union{Nothing,XML.Node} - raw::Union{Nothing,XML.Raw} + raw::Union{Nothing,String} bin::Union{Nothing,Vector{UInt8}} name::String end @@ -690,11 +684,11 @@ struct DataTable end end -struct xpath +struct XPathInfo node::XML.Node path::String - function xpath(node::XML.Node, path::String) + function XPathInfo(node::XML.Node, path::String) new(node, path) end end diff --git a/src/worksheet.jl b/src/worksheet.jl index 5ed12af0..82b13895 100644 --- a/src/worksheet.jl +++ b/src/worksheet.jl @@ -68,7 +68,7 @@ function Worksheet(xf::XLSXFile, sheet_element::XML.Node) a = XML.attributes(sheet_element) sheetId = parse(Int, a["sheetId"]) relationship_id = a["r:id"] - name = XLSX.unescape(a["name"]) + name = a["name"] is_hidden = haskey(a, "state") && a["state"] in ["hidden", "veryHidden"] return Worksheet(xf, sheetId, relationship_id, name, nothing, is_hidden) @@ -113,24 +113,18 @@ function read_worksheet_dimension(xf::XLSXFile, relationship_id, name)::Union{No local result::Union{Nothing,CellRange} = nothing target_file = get_relationship_target_by_id("xl", wb, relationship_id) doc = open_internal_file_stream(xf, target_file) - reader = iterate(doc) - # Now let's look for a row element, if it exists - while reader !== nothing # go next node - (sheet_row, state) = reader - if XML.nodetype(sheet_row) == XML.Element && localname(sheet_row) == "dimension" + root = xml_root_element(doc) - XML.depth(sheet_row) != 2 && throw(XLSXError("Malformed Worksheet \"$name\": unexpected node depth for dimension node: $(XML.depth(sheet_row)).")) - - ref_str = XML.attributes(sheet_row)["ref"] + for child in XML.children(root) + if XML.nodetype(child) == XML.Element && localname(child) == "dimension" + ref_str = child["ref"] if is_valid_cellname(ref_str) result = CellRange("$(ref_str):$(ref_str)") else result = CellRange(ref_str) end - break end - reader = iterate(doc, state) end return result diff --git a/src/write.jl b/src/write.jl index 978dcb18..0c79fa21 100644 --- a/src/write.jl +++ b/src/write.jl @@ -201,13 +201,13 @@ end function get_node_paths(node::XML.Node) XML.nodetype(node) != XML.Document && throw(XLSXError("Something wrong here!")) - (_, default_ns) = get_default_namespace(node[end]) - xpaths = Vector{xpath}() + default_ns = get_default_namespace(xml_root_element(node)) + xpaths = Vector{XPathInfo}() get_node_paths!(xpaths, node, default_ns, "") return xpaths end -function get_node_paths!(xpaths::Vector{xpath}, node::XML.Node, default_ns, path) +function get_node_paths!(xpaths::Vector{XPathInfo}, node::XML.Node, default_ns, path) for c in XML.children(node) if XML.nodetype(c) ∉ [XML.Declaration, XML.Comment, XML.Text] node_tag = localname(c) @@ -215,7 +215,7 @@ function get_node_paths!(xpaths::Vector{xpath}, node::XML.Node, default_ns, path node_tag = default_ns * ":" * node_tag end npath = path * "/" * node_tag - push!(xpaths, xpath(c, npath)) + push!(xpaths, XPathInfo(c, npath)) if length(XML.children(c)) > 0 get_node_paths!(xpaths, c, default_ns, npath) end @@ -226,16 +226,16 @@ end # Remove all children with tag given by att[2] from a parent XML node with a tag given by att[1]. -function unlink(node::XML.Node, att::Tuple{String,String}, pfx::String) +function unlink(node::XML.Node, att::Tuple{String,String}, pfx::String="") new_node = XML.Element(pfx * first(att)) - + atts = XML.attributes(node) isnothing(atts) || foreach(((k, v),) -> new_node[k] = v, atts) - + for child in XML.children(node) localname(child) != last(att) && push!(new_node, child) end - + return new_node end @@ -280,7 +280,7 @@ end function update_single_sheet!(wb::Workbook, sheet_no::Int, full::Bool)::Union{Nothing,Vector{UInt8}} sheet = getsheet(wb, sheet_no) doc = copynode(get_worksheet_xml_document(sheet)) - xroot = doc[end] + xroot = xml_root_element(doc) # check namespace and root node name ns_map = get_namespaces(xroot) @@ -574,7 +574,7 @@ function update_workbook_xml!(xl::XLSXFile) # Need to update and and . + xml_str = String(copy(sheet_template_data)) + xdoc, _ = splitNode(xml_str, "sheetData") new_cache = XLSX.WorksheetCache( true, @@ -1255,13 +1253,12 @@ function copysheet!(ws::Worksheet, name::AbstractString="")::Worksheet # if copied sheet is the currently selected sheet, do not copy this attribute over. # The original sheet will remain the only selected sheet. - for c in XML.children(xdoc[end]) - if localname(c) =="sheetViews" + for c in XML.children(xml_root_element(xdoc)) + if XML.tag(c) == "sheetViews" for c2 in XML.children(c) - if localname(c2) =="sheetView" - atts=XML.attributes(c2) - if haskey(atts, "tabSelected") - atts["tabSelected"]="0" + if XML.tag(c2) == "sheetView" + if haskey(c2, "tabSelected") + c2["tabSelected"] = "0" end end end @@ -1398,8 +1395,10 @@ function insertsheet!(wb::Workbook, xdoc::XML.Node, new_cache::WorksheetCache, s sheetId = max(current_sheet_ids...) + 1 # generate a unique ID for the new sheet - !haskey(xdoc[2], "xmlns:xr") && (xdoc[2]["xmlns:xr"] = "http://schemas.microsoft.com/office/spreadsheetml/2016/revision") - xdoc[2]["xr:uid"] = "{" * uppercase(string(UUIDs.uuid4(wb.package.uuid_rng))) * "}" + let sheet_root = xml_root_element(xdoc) + !haskey(sheet_root, "xmlns:xr") && (sheet_root["xmlns:xr"] = "http://schemas.microsoft.com/office/spreadsheetml/2016/revision") + sheet_root["xr:uid"] = "{" * uppercase(string(UUIDs.uuid4(wb.package.uuid_rng))) * "}" + end # generate a unique name for the XML local xml_filename::String @@ -1440,9 +1439,8 @@ end add_override!(wb::Workbook, part::String, content::String) = add_override!(get_xlsxfile(wb), part, content) function add_override!(xf::XLSXFile, part::String, content::String) - wb = get_workbook(xf) - ctype_root = xmlroot(xf, "[Content_Types].xml")[end] - localname(ctype_root) != "Types" && throw(XLSXError("Something wrong here!")) + ctype_root = xml_root_element(xmlroot(xf, "[Content_Types].xml")) + XML.tag(ctype_root) != "Types" && throw(XLSXError("Something wrong here!")) override_node = XML.Element("Override"; PartName=part, ContentType=content @@ -1646,7 +1644,7 @@ function deletesheet!(wb::Workbook, name::AbstractString)::XLSXFile end # Remove drawing Override from [Content_Types].xml - ctype_root = xmlroot(xf, "[Content_Types].xml")[end] + ctype_root = xml_root_element(xmlroot(xf, "[Content_Types].xml")) cont = XML.children(ctype_root) idx = findfirst(i -> haskey(cont[i], "PartName") && cont[i]["PartName"] == "/$drawing_path", eachindex(cont)) @@ -1672,8 +1670,8 @@ function deletesheet!(wb::Workbook, name::AbstractString)::XLSXFile end # update [Content_Types].xml - ctype_root = xmlroot(get_xlsxfile(wb), "[Content_Types].xml")[end] - localname(ctype_root) != "Types" && throw(XLSXError("Something wrong here!")) + ctype_root = xml_root_element(xmlroot(get_xlsxfile(wb), "[Content_Types].xml")) + XML.tag(ctype_root) != "Types" && throw(XLSXError("Something wrong here!")) cont = XML.children(ctype_root) let idx = 0 for (i, c) in enumerate(cont) diff --git a/test/runtests.jl b/test/runtests.jl index 938c880d..e1d28d13 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,6 @@ import XLSX import Tables using Test, Dates, XML -using OrderedCollections: OrderedDict import DataFrames, Random import Distributions as Dist import CSV @@ -2820,7 +2819,7 @@ end @test id(datestyle) == UInt64(2) @test XLSX.styles_get_cellXf_with_numFmtId(wb, numfmt) == numstyle - @test XLSX.styles_numFmt_formatCode(wb, string(numfmt)) == "\$* #,##0.00;\$* (#,##0.00);\$* "-"??;[Magenta]@" + @test XLSX.styles_numFmt_formatCode(wb, string(numfmt)) == "\$* #,##0.00;\$* (#,##0.00);\$* \"-\"??;[Magenta]@" @test numstyle isa XLSX.CellDataFormat @test !isempty(numstyle) @test id(numstyle) == UInt64(3) @@ -5184,9 +5183,9 @@ end max_val="95" ) @test XML.tag(XLSX.get_x14_icon("3Triangles")) == "x14:cfRule" - @test XML.attributes(XLSX.get_x14_icon("3Stars")) == OrderedDict("type" => "iconSet", "priority" => "1", "id" => "XXXX-xxxx-XXXX") + @test Dict(XML.attributes(XLSX.get_x14_icon("3Stars"))) == Dict("type" => "iconSet", "priority" => "1", "id" => "XXXX-xxxx-XXXX") @test length(XML.children(XLSX.get_x14_icon("5Boxes"))) == 1 - @test typeof(XLSX.get_x14_icon("Custom")) == XML.Node + @test XLSX.get_x14_icon("Custom") isa XML.Node end @testset "cellIs" begin