Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: CI
on:
# workflow_dispatch: # <-- Add this to allow manual execution
workflow_dispatch: # <-- Add this to allow manual execution
pull_request:
branches:
- master
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ Manifest.toml
docs/Manifest.toml
.vscode
*.gz
.claude
4 changes: 1 addition & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ repo = "https://github.com/juliadata/XLSX.jl.git"
[deps]
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -28,12 +27,11 @@ StyledStringsSstsExt = "StyledStrings"
CSV = "0.10.15"
Colors = "0.12, 0.13"
Distributions = "0.25.0"
OrderedCollections = "1"
PrecompileTools = "1"
StyledStrings = "1.0.3"
Tables = "1"
UUIDs = "1.8"
XML = "0.3.8"
XML = "0.4"
ZipArchives = "2.5"
julia = "1.8"
[extras]
Expand Down
4 changes: 3 additions & 1 deletion src/XLSX.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import Tables
import Unicode
import UUIDs
import XML
using OrderedCollections: OrderedDict
import ZipArchives

import PrecompileTools as PCT # this is a small dependency.
Expand Down Expand Up @@ -40,6 +39,9 @@ export
getMergedCells, isMergedCell, getMergedBaseCell, mergeCells

const SPREADSHEET_NAMESPACE_XPATH_ARG = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"

# Return the children of `node` whose node type is `XML.Element`.
function xml_elements(node)
    return filter(XML.children(node)) do child
        XML.nodetype(child) == XML.Element
    end
end
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this return Node{SubString}?

# Return the last element-type child of `doc`, as selected by `xml_elements`.
function xml_root_element(doc)
    elements = xml_elements(doc)
    return last(elements)
end
const EXCEL_MAX_COLS = 16_384 # total columns supported by Excel per sheet
const EXCEL_MAX_ROWS = 1_048_576 # total rows supported by Excel per sheet (including headers)
const ROW_CHUNKSIZE = 1000 # number of rows to be processed in each thread
Expand Down
47 changes: 21 additions & 26 deletions src/cell.jl
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ end
=#

# Parses a style string to (UInt32, Int) for use as style and num_style.
function _parse_style(s::String)
function _parse_style(s::AbstractString)
isempty(s) && return UInt32(0), 0
n = parse(Int, s)
return UInt32(n), n
Expand Down Expand Up @@ -235,8 +235,8 @@ function Cell(c::XML.LazyNode, ws::Worksheet, sst_pfx::String; mylock::Union{Ree
if tag == "v"
ch = XML.children(child)
isempty(ch) && continue
raw = XML.value(ch[1])
v = occursin('&', raw) ? XLSX.unescape(raw) : raw
# `child` is a `LazyNode`; XML.jl 0.4 already unescapes its value.
v = XML.value(ch[1])
datatype, value = process_tv(wb, t, v, num_style; mylock)
elseif tag == "f"
f = parse_formula_from_element(wb,child)
Expand All @@ -253,12 +253,15 @@ function parse_formula_from_element(wb, c_child_element)::AbstractFormula
localname(c_child_element) == "f" ||
throw(XLSXError("Expected nodename `f`. Found: `$(localname(c_child_element))`"))

# Extract formula string
formula_string = if XML.is_simple(c_child_element)
XLSX.unescape(XML.simple_value(c_child_element))
else
text_nodes = filter(x -> XML.nodetype(x) == XML.Text, XML.children(c_child_element))
isempty(text_nodes) ? "" : XLSX.unescape(XML.value(text_nodes[1]))
# Extract formula string — `<f>` may have attributes (t="shared", si=…, ref=…)
# which makes it non-"simple", so collect the first Text child.
formula_string = ""
for ch in XML.children(c_child_element)
if XML.nodetype(ch) === XML.Text
v = XML.value(ch)
isnothing(v) || (formula_string = v)
break
end
end

a = XML.attributes(c_child_element)
Expand Down Expand Up @@ -312,7 +315,7 @@ function _parse_excel_datetime_raw(v::AbstractString)
end
end

function process_tv(wb::Workbook, t::String, v::String, num_style::Int; mylock::Union{ReentrantLock,Nothing}=nothing)
function process_tv(wb::Workbook, t::AbstractString, v::AbstractString, num_style::Int; mylock::Union{ReentrantLock,Nothing}=nothing)
datatype::CellValueType = CT_EMPTY
value::UInt64 = UInt64(0)
isempty(v) && return datatype, value
Expand Down Expand Up @@ -479,29 +482,21 @@ end
# Extract cells from a <row> LazyNode and push them (in place) into a Dict(column -> Cell)
function get_rowcells!(rowcells::Dict{Int,Cell}, row::XML.LazyNode, ws::Worksheet, sst_pfx::String; mylock::Union{ReentrantLock,Nothing}=nothing)

# unthreaded cell extraction is (exceedingly marginally) slower but no lock conflicts introduced.

# debug
# @assert row.tag == "row" "Not a row node"
# @assert localname(row) == "row" "Not a row node"

sst_count = 0

d = row.depth

cellnode = XML.next(row)

while !isnothing(cellnode) && cellnode.depth > d
if localname(cellnode) == "c" # This is a cell
cell = Cell(cellnode, ws, sst_pfx; mylock) # construct an XLSX.Cell from an XML.LazyNode
for child in XML.children(row)
if localname(child) == "c" # This is a cell
cell = Cell(child, ws, sst_pfx; mylock) # construct an XLSX.Cell from an XML.LazyNode
sst_count += cell.datatype == CT_STRING ? 1 : 0
rowcells[column_number(cell)] = cell
end
cellnode = XML.next(cellnode)
end
if !isnothing(cellnode) && localname(cellnode) == "row" # have reached the beginning of next row
return cellnode, sst_count
else # no more rows
return nothing, sst_count
end

# Row nodes are materialised up-front (see `_collect_row_nodes`), so callers
# advance by index; the first tuple element is unused but kept for the
# `next, sst_count = get_rowcells!(...)` call contract.
return nothing, sst_count
end
41 changes: 21 additions & 20 deletions src/cellformat-helpers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,9 @@ const EXCEL_COLUMN_WIDTH_PADDING = 0.7109375
#

# Build a new node from `o`, copying its children recursively via `copynode`.
# NOTE(review): this listing comes from a diff view rendered without +/- markers,
# so the first two statements and the three that follow look like the before/after
# variants of the same body — confirm which variant should remain.
function copynode(o::XML.Node)
# Variant 1: passes `o.attributes` through as-is; children are copied recursively,
# or left as `nothing` when `o.children` is `nothing`.
n = XML.Node(o.nodetype, o.tag, o.attributes, o.value, isnothing(o.children) ? nothing : [copynode(x) for x in o.children])
return n
# Variant 2: `copy`s the attributes (unless they are `nothing`) and collects the
# recursively copied children into an `XML.Node{String}[...]` vector before
# constructing an `XML.Node{String}`.
attrs = isnothing(o.attributes) ? nothing : copy(o.attributes)
children = isnothing(o.children) ? nothing : XML.Node{String}[copynode(x) for x in o.children]
return XML.Node{String}(o.nodetype, o.tag, attrs, o.value, children)
end
function do_sheet_names_match(ws::Worksheet, rng::T) where {T<:Union{SheetCellRef,AbstractSheetCellRange}}
if ws.name == rng.sheet
Expand All @@ -125,8 +126,8 @@ function do_sheet_names_match(ws::Worksheet, rng::T) where {T<:Union{SheetCellRe
end

# Create an `XML.Element` node tagged `pfx*name` with empty attributes and no value.
# `children` is an empty vector when `tag` is "border" or "fill", otherwise `nothing`.
# NOTE(review): this listing comes from a diff view rendered without +/- markers,
# so the two statement pairs below look like the before/after variants of the same
# body — confirm which variant should remain.
function make_child_node(tag::String, name::String, pfx::String)::XML.Node
# Variant 1: `Vector{XML.Node}` children and `OrderedDict{String,String}` attributes.
children = tag ∈ ("border", "fill") ? Vector{XML.Node}() : nothing
return XML.Node(XML.Element, pfx*name, OrderedDict{String,String}(), nothing, children)
# Variant 2: `Vector{XML.Node{String}}` children and an empty `Pair{String,String}[]`
# attribute list, constructed as `XML.Node{String}`.
children = tag ∈ ("border", "fill") ? Vector{XML.Node{String}}() : nothing
return XML.Node{String}(XML.Element, pfx*name, Pair{String,String}[], nothing, children)
end

function build_font_child!(new_node::XML.Node, tag::String, name::String, attrs::Union{Nothing,Dict{String,String}}, pfx::String)
Expand Down Expand Up @@ -216,7 +217,7 @@ end
if isnothing(attributes[a])
cnode = XML.Element(a)
else
cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing)
cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing)
for (k, v) in attributes[a]
cnode[k] = v
end
Expand All @@ -228,7 +229,7 @@ end
if isnothing(attributes[a])
cnode = XML.Element(a)
else
cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing)
cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing)
color = XML.Element("color")
for (k, v) in attributes[a]
if k == "style" && v != "none"
Expand All @@ -255,7 +256,7 @@ end
if isnothing(attributes[a])
cnode = XML.Element(a)
else
cnode = XML.Node(XML.Element, a, OrderedDict{String,String}(), nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node}() : nothing)
cnode = XML.Node{String}(XML.Element, a, Pair{String,String}[], nothing, tag ∈ ["border", "fill"] ? Vector{XML.Node{String}}() : nothing)
patternfill = XML.Element("patternFill")
fgcolor = XML.Element("fgColor")
bgcolor = XML.Element("bgColor")
Expand Down Expand Up @@ -475,14 +476,14 @@ function get_new_formatId(wb::Workbook, format::String)::Int
if isnothing(j) # There are no existing custom formats
return styles_add_numFmt(wb, format)
else
existing_elements_count = length(XML.children(xroot[i][j]))
existing_elements_count = length(xml_elements(xroot[i][j]))
if parse(Int, xroot[i][j]["count"]) != existing_elements_count
throw(XLSXError("Wrong number of font elements found: $existing_elements_count. Expected $(parse(Int, xroot[i][j]["count"]))."))
end

format_node = XML.Element("numFmt";
numFmtId=string(existing_elements_count + PREDEFINED_NUMFMT_COUNT),
formatCode=XLSX.escape(format)
formatCode=format
)

return styles_add_cell_attribute(wb, format_node, "numFmts") + PREDEFINED_NUMFMT_COUNT
Expand Down Expand Up @@ -529,7 +530,7 @@ function update_template_xf(ws::Worksheet, allXfNodes::Vector{XML.Node}, existin
old_cell_xf = styles_cell_xf(allXfNodes, Int(existing_style.id))
new_cell_xf = copynode(old_cell_xf)
if isnothing(new_cell_xf.children)
new_cell_xf=XML.Node(new_cell_xf, alignment)
new_cell_xf = XML.Node{String}(XML.Element, XML.tag(new_cell_xf), new_cell_xf.attributes, nothing, XML.Node{String}[alignment])
elseif length(XML.children(new_cell_xf)) == 0
push!(new_cell_xf, alignment)
else
Expand All @@ -550,14 +551,14 @@ end
function styles_add_cell_attribute(wb::Workbook, new_att::XML.Node, att::String)::Int
xroot = styles_xmlroot(wb)
i, j = get_idces(xroot, "styleSheet", att)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The get_idces function was introduced as a kind of poor man's XPath when I replaced EzXML with XML. Now that XML supports XPath, could this function be replaced with proper XPath handling?

existing_elements_count = length(XML.children(xroot[i][j]))
existing_elements_count = length(xml_elements(xroot[i][j]))
if parse(Int, xroot[i][j]["count"]) != existing_elements_count
throw(XLSXError("Wrong number of elements elements found: $existing_elements_count. Expected $(parse(Int, xroot[i][j]["count"]))."))
end

# Check new_att doesn't duplicate any existing att. If yes, use that rather than create new.
for (k, node) in enumerate(XML.children(xroot[i][j]))
if localname(new_att) == "numFmt" # mustn't compare numFmtId attribute for formats
for (k, node) in enumerate(xml_elements(xroot[i][j]))
if XML.tag(new_att) == "numFmt" # mustn't compare numFmtId attribute for formats
if node["formatCode"] == new_att["formatCode"]
return k - 1 # CellDataFormat is zero-indexed
end
Expand Down Expand Up @@ -1110,7 +1111,7 @@ function process_uniform_core(f::Function, ws::Worksheet, allXfNodes::Vector{XML
if first # Get the attribute of the first cell in the range.
newid = f(ws, cellref; kw...)
new_alignment = getAlignment(ws, cellref).alignment["alignment"]
alignment_node = XML.Node(XML.Element, "alignment", new_alignment, nothing, nothing)
alignment_node = XML.Node{String}(XML.Element, "alignment", isnothing(new_alignment) ? Pair{String,String}[] : Pair{String,String}[k => v for (k,v) in new_alignment], nothing, nothing)
first = false
else # Apply the same attribute to the rest of the cells in the range.
if cell.style == UInt64(0)
Expand Down Expand Up @@ -1320,12 +1321,12 @@ function update_sharedString_font(ws::Worksheet, cell::Cell;

is = parse(str_formatted, XML.Node)[1] # Convert to XML.Node for ease of handling

all_r = filter(z -> z.tag == "r", XML.children(is))
all_r = filter(z -> localname(z) == "r", XML.children(is))
run_elements = reduce(vcat, [XML.children(z) for z in all_r])
rPr_elements=filter(z -> z.tag == "rPr", run_elements) # rPr elements
rPr_elements = filter(z -> localname(z) == "rPr", run_elements) # rPr elements

t=String[] # text elements
for i in filter(z -> z.tag == "t", run_elements)
for i in filter(z -> localname(z) == "t", run_elements)
push!(t, XML.is_simple(i[1]) ? XML.simple_value(i[1]) : XML.value(i[1]))
end

Expand All @@ -1337,7 +1338,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell;

for att in XML.children(rPr) # first copy existing attributes
for i in 1:length(atts)
if att.tag == atts[i]
if XML.tag(att) == atts[i]
new_rPr[i] = att
end
end
Expand Down Expand Up @@ -1372,7 +1373,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell;
if !isnothing(rPr.children)
empty!(rPr.children)
foreach(new_rPr) do element
element.tag != "DeleteMe" && push!(rPr.children, element)
XML.tag(element) != "DeleteMe" && push!(rPr.children, element)
end
end
end
Expand Down Expand Up @@ -1420,7 +1421,7 @@ function update_sharedString_font(ws::Worksheet, cell::Cell;
for r in 1:length(all_r)
if t[r] != ")___DeleteMe___(" # signals a merged <r> element to be skipped
write(new_r, " <r>\n")
r > inc_first && write(new_r, XML.write(rPr_elements[r-inc_first];depth=3) * "\n")
r > inc_first && write(new_r, XML.write(rPr_elements[r-inc_first]) * "\n")
write(new_r, " <t" * (needs_preserve(t[r]) ? " xml:space=\"preserve\"" : "") * ">" *t[r] * "</t>\n")
write(new_r, " </r>\n")
end
Expand Down
Loading
Loading