From 93b77bc830f3ec36b43653569acae438982a4d1d Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 5 May 2026 14:31:09 +1000 Subject: [PATCH 1/2] fix(serializer): collapse multi-row column headers into one markdown header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MarkdownTableSerializer hardcoded `headers=rows[0]` and `rows[1:]` as the body. When TableFormer (correctly) marks multiple leading rows as column_header=True — the case where a column title wraps onto two visual lines like "Cash per Security" + "($)" — only the first row rendered as the markdown header and the continuation leaked into the body as a spurious "data" row. Mirrors the logic that already exists in `_export_to_dataframe_with_options` (document.py:2219-2245): count leading grid rows containing any column_header cell, concatenate their cell text per column to build the markdown header, and use the remaining grid rows as the body. Spanning siblings render empty in both cases so a colspan=N header is not concatenated N times. Falls back to "first row is header" if no row has any column_header cell, preserving prior behaviour for tables that arrive without header marking. Empirical result on a 10-PDF dividend-statement corpus (finance_nexus tests/fixtures/dividend_statement/): * IOZ_Reinvestment_Plan_Advice_2025_04_17.pdf: the wrapped header collapses from two rows into one. Body unchanged. * All 9 other fixtures: byte-identical markdown output. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../transforms/serializer/markdown.py | 45 ++++++++++++++++--- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 2783d4ed..6fa64666 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -549,18 +549,53 @@ def serialize( rendered_row.append(cell_text.replace("\n", " ").replace("|", "&#124;")) rows.append(rendered_row) if len(rows) > 0: + # Count leading rows that contain any column_header cell so + # multi-line wrapped headers ("Cash per Security" + "($)" on + # successive grid rows) collapse into one markdown header + # row instead of leaking the continuation into the body. + # Mirrors `_export_to_dataframe_with_options` in + # docling_core/types/doc/document.py:2219-2245. Falls back to + # "first row is header" if upstream did not mark any header + # cells, preserving prior behaviour for that case. + num_header_rows = 0 + for grid_row in item.data.grid: + if any( + getattr(cell, "column_header", False) for cell in grid_row + ): + num_header_rows += 1 + else: + break + if num_header_rows == 0: + num_header_rows = 1 + + num_cols = max(len(r) for r in rows) + if num_header_rows == 1: + headers = rows[0] + else: + # Concatenate header lines per column with a single + # space. Empty cells (spanning siblings, gaps) skipped + # so a colspan=N header isn't doubled. 
+ headers = [] + for col_idx in range(num_cols): + parts = [ + rows[r][col_idx] + for r in range(min(num_header_rows, len(rows))) + if col_idx < len(rows[r]) and rows[r][col_idx] + ] + headers.append(" ".join(parts)) + body = rows[num_header_rows:] + # Always disable numparse to prevent silent precision loss in numeric values # Use tabulate's _column_type to detect numeric columns for right-alignment colalign = [] - if len(rows) > 1: # Need at least header + 1 data row - num_cols = len(rows[0]) + if body: # Need at least one data row to detect column types for col_idx in range(num_cols): - col_values = [row[col_idx] if col_idx < len(row) else "" for row in rows[1:]] + col_values = [row[col_idx] if col_idx < len(row) else "" for row in body] col_type = _column_type(col_values) colalign.append("right" if col_type in (int, float) else "left") table_text = tabulate( - rows[1:], - headers=rows[0], + body, + headers=headers, tablefmt="github", disable_numparse=True, colalign=tuple(colalign) if colalign else None, From 92c81c267e04f504611ad52e729cd637fb79530e Mon Sep 17 00:00:00 2001 From: scott Date: Tue, 5 May 2026 15:04:02 +1000 Subject: [PATCH 2/2] DCO Remediation Commit for scott I, scott , hereby add my Signed-off-by to this commit: 93b77bcadbf3cdbeb20bbe10f7c14dde0f9ec88a Signed-off-by: scott