Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
8020297
First test xlsx handler
mattiagiupponi Jan 20, 2026
3dd170c
extending xlsx_hander to cover the error handling by using calamine
Gpetrak Feb 2, 2026
790c469
improving error handling and adding the dep of XLSX handler
Gpetrak Feb 2, 2026
8dc5f73
Merge branch 'master' into xlsx_handler
Gpetrak Feb 3, 2026
d39e05b
simpifying the ogr-based method in order to not handle the WKT geomet…
Gpetrak Feb 3, 2026
ecc8f30
implementing a data validation check
Gpetrak Feb 3, 2026
5557898
make the XLSX handler configurable
Gpetrak Feb 3, 2026
3459516
adding xls extension
Gpetrak Feb 3, 2026
4084871
extend the handler for xls files
Gpetrak Feb 3, 2026
e105ca7
adding tests
Gpetrak Feb 4, 2026
1bbf2ee
adding a security-check test
Gpetrak Feb 5, 2026
a63ffea
fixing flake issues
Gpetrak Feb 5, 2026
e298866
black re-format
Gpetrak Feb 5, 2026
4fa8944
use InvalidInputFileException in more simple exceptions
Gpetrak Feb 5, 2026
af1d9ee
avoid exposing a row exception to the user
Gpetrak Feb 5, 2026
083addf
simplifying is_valid method
Gpetrak Feb 5, 2026
cbede0b
fixing security in the create_ogr2ogr_command method as gemini review…
Gpetrak Feb 5, 2026
bdbd200
fixing flake issues
Gpetrak Feb 5, 2026
1937fd9
wrapping the xlsx enable check in a separate method
Gpetrak Feb 10, 2026
38a4102
moving the XLSX conf variable in the settings and define it as False
Gpetrak Feb 11, 2026
db358f4
fixing flake issues
Gpetrak Feb 12, 2026
3c55cdb
Merge branch 'master' into ISSUE_13936
Gpetrak Feb 17, 2026
7f6c030
remove the shlex-based modification
Gpetrak Feb 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -245,4 +245,7 @@ RESTART_POLICY_WINDOW=120s

DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5

# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration
# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration

# Enable or not the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=False
5 changes: 4 additions & 1 deletion .env_dev
Original file line number Diff line number Diff line change
Expand Up @@ -207,4 +207,7 @@ RESTART_POLICY_WINDOW=120s

DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5
UPSERT_CHUNK_SIZE= 100
UPSERT_LIMIT_ERROR_LOG=100
UPSERT_LIMIT_ERROR_LOG=100

# Enable or not the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=False
3 changes: 3 additions & 0 deletions .env_local
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,6 @@ RESTART_POLICY_MAX_ATTEMPTS="3"
RESTART_POLICY_WINDOW=120s

DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5

# Enable or not the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=False
3 changes: 3 additions & 0 deletions .env_test
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,6 @@ MICROSOFT_TENANT_ID=
AZURE_CLIENT_ID=
AZURE_SECRET_KEY=
AZURE_KEY=

# Enable or not the XLSX / XLS upload
XLSX_UPLOAD_ENABLED=False
3 changes: 3 additions & 0 deletions geonode/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2221,3 +2221,6 @@ def get_geonode_catalogue_service():

FILE_UPLOAD_DIRECTORY_PERMISSIONS = 0o777
FILE_UPLOAD_PERMISSIONS = 0o777

# Enable or not the XLSX / XLS upload
XLSX_UPLOAD_ENABLED = ast.literal_eval(os.getenv("XLSX_UPLOAD_ENABLED", "False"))
Empty file.
336 changes: 336 additions & 0 deletions geonode/upload/handlers/xlsx/handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,336 @@
#########################################################################
#
# Copyright (C) 2024 OSGeo
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#########################################################################
import logging
from pathlib import Path
import csv
from datetime import datetime
import math
from celery import group
from python_calamine import CalamineWorkbook
from osgeo import ogr

from dynamic_models.models import ModelSchema
from django.conf import settings

from geonode.upload.handlers.common.vector import BaseVectorFileHandler
from geonode.upload.handlers.csv.handler import CSVFileHandler
from geonode.upload.celery_tasks import create_dynamic_structure
from geonode.upload.handlers.utils import GEOM_TYPE_MAPPING
from geonode.upload.api.exceptions import InvalidInputFileException

logger = logging.getLogger("importer")


class XLSXFileHandler(CSVFileHandler):

XLSX_UPLOAD_ENABLED = getattr(settings, "XLSX_UPLOAD_ENABLED", False)

lat_names = CSVFileHandler.possible_lat_column
lon_names = CSVFileHandler.possible_long_column

@classmethod
def is_xlsx_enabled(cls):
"""
Unified check for the feature toggle.
Returns True if enabled, None if disabled.
"""
if not cls.XLSX_UPLOAD_ENABLED:
return None
return True

@property
def supported_file_extension_config(self):

# If disabled, return an empty list or None so the UI doesn't show XLSX options
if not self.is_xlsx_enabled():
return None

return {
"id": "excel", # Use a generic ID that doesn't imply a specific extension
"formats": [
{
"label": "Excel (xlsx)",
"required_ext": ["xlsx"],
"optional_ext": ["sld", "xml"],
},
{
"label": "Excel (xls)",
"required_ext": ["xls"],
"optional_ext": ["sld", "xml"],
},
],
"actions": list(self.TASKS.keys()),
"type": "vector",
}

@staticmethod
def can_handle(_data) -> bool:
"""
This endpoint will return True or False if with the info provided
the handler is able to handle the file or not
"""
# Availability Check for the back-end
if not XLSXFileHandler.is_xlsx_enabled():
return False

base = _data.get("base_file")
if not base:
return False

# Support both XLSX and XLS
valid_extensions = (".xlsx", ".xls")

is_excel = (
base.lower().endswith(valid_extensions)
if isinstance(base, str)
else base.name.lower().endswith(valid_extensions)
)

return is_excel and BaseVectorFileHandler.can_handle(_data)

@staticmethod
def is_valid(files, user, **kwargs):
from geonode.upload.utils import UploadLimitValidator

# Basic GeoNode validation
BaseVectorFileHandler.is_valid(files, user)

# Parallelism check (This is fast and doesn't need to open the file)
upload_validator = UploadLimitValidator(user)
upload_validator.validate_parallelism_limit_per_user()

# We handle the deep inspection (lat/lon) later.
return True

@staticmethod
def create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate, **kwargs):
"""
Customized for XLSX: Only looks for X/Y (Point) data.
Sanitized with shlex.quote to prevent Command Injection.
"""

# Pass the safe versions to the base handler
base_command = BaseVectorFileHandler.create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate)

# Define mapping (these are safe as they are class-level constants)
lat_mapping = ",".join(XLSXFileHandler.lat_names)
lon_mapping = ",".join(XLSXFileHandler.lon_names)

additional_option = (
f' -oo "X_POSSIBLE_NAMES={lon_mapping}" ' f'-oo "Y_POSSIBLE_NAMES={lat_mapping}" ' f'-nln "{alternate}"'
)

# Return the combined, safe command string
return (
f"{base_command} -oo KEEP_GEOM_COLUMNS=NO "
f"-lco GEOMETRY_NAME={BaseVectorFileHandler().default_geometry_column_name} "
f"{additional_option}"
)

def create_dynamic_model_fields(
self,
layer: str,
dynamic_model_schema: ModelSchema = None,
overwrite: bool = None,
execution_id: str = None,
layer_name: str = None,
return_celery_group: bool = True,
):
# retrieving the field schema from ogr2ogr and converting the type to Django Types
layer_schema = [{"name": x.name.lower(), "class_name": self._get_type(x), "null": True} for x in layer.schema]

class_name = GEOM_TYPE_MAPPING.get(self.promote_to_multi("Point"))
# Get the geometry type name from OGR (e.g., 'Point' or 'Point 25D')
geom_type_name = ogr.GeometryTypeToName(layer.GetGeomType())

layer_schema += [
{
"name": layer.GetGeometryColumn() or self.default_geometry_column_name,
"class_name": class_name,
"dim": (3 if geom_type_name.lower().startswith("3d") or "z" in geom_type_name.lower() else 2),
}
]

if not return_celery_group:
return layer_schema

list_chunked = [layer_schema[i : i + 30] for i in range(0, len(layer_schema), 30)]
celery_group = group(
create_dynamic_structure.s(execution_id, schema, dynamic_model_schema.id, overwrite, layer_name)
for schema in list_chunked
)

return dynamic_model_schema, celery_group

def pre_processing(self, files, execution_id, **kwargs):
from geonode.upload.orchestrator import orchestrator

# calling the super function (CSVFileHandler logic)
_data, execution_id = super().pre_processing(files, execution_id, **kwargs)

# convert the XLSX file into a CSV
xlsx_file = _data.get("files", {}).get("base_file", "")
if not xlsx_file:
raise InvalidInputFileException(detail="The base file was not found in the upload payload.")

output_file = str(Path(xlsx_file).with_suffix(".csv"))

try:
workbook = CalamineWorkbook.from_path(xlsx_file)

# Sheet Validation (Uses the validated sheet name)
sheet_name = self._validate_sheets(workbook)
sheet = workbook.get_sheet_by_name(sheet_name)

# We iterate until we find the first non-empty row
rows_gen = iter(sheet.to_python())
try:
# We strictly take the first row. No skipping allowed.
headers = next(rows_gen)
except StopIteration:
raise InvalidInputFileException(detail="The file is empty.")

# Restrictive File Structure Validation
self._validate_headers(headers)

# Conversion with row cleanup
# Note: rows_gen continues from the row after the headers
self._convert_to_csv(headers, rows_gen, output_file)

except Exception as e:
logger.exception("XLSX Pre-processing failed")
raise InvalidInputFileException(detail=f"Failed to securely parse Excel: {str(e)}")

# update the file path in the payload
_data["files"]["base_file"] = output_file

if "temporary_files" not in _data or not isinstance(_data["temporary_files"], dict):
_data["temporary_files"] = {}

_data["temporary_files"]["base_file"] = output_file

# updating the execution id params
orchestrator.update_execution_request_obj(
orchestrator.get_execution_object(execution_id), {"input_params": _data}
)
return _data, execution_id

def _validate_sheets(self, workbook):
"""Returns the first sheet name and logs warnings if others exist."""
sheets = workbook.sheet_names
if not sheets:
raise InvalidInputFileException(detail="No sheets found in workbook.")
if len(sheets) > 1:
logger.warning(f"Multiple sheets found. Ignoring: {sheets[1:]}")
return sheets[0]

def _validate_headers(self, headers):
"""
Strictly validates Row 1 for headers:
- Must not be empty.
- Must contain geometry 'fingerprints' (Lat/Lon).
- Must have unique and non-empty column names.
"""
# Existence Check
if not headers or self._detect_empty_rows(headers):
raise InvalidInputFileException(detail="No data or headers found in the selected sheet.")

# Normalization
clean_headers = [str(h).strip().lower() if h is not None else "" for h in headers]

# Geometry Fingerprint Check
has_lat = any(h in self.lat_names for h in clean_headers)
has_lon = any(h in self.lon_names for h in clean_headers)

if not (has_lat and has_lon):
raise InvalidInputFileException(
detail="The headers do not contain valid geometry headers. "
"GeoNode requires Latitude and Longitude labels in the first row."
)

# Integrity Check (No Empty Names)
if any(h == "" for h in clean_headers):
raise InvalidInputFileException(detail="One or more columns in the first row are missing a header name.")

# Uniqueness Check
if len(clean_headers) != len(set(clean_headers)):
duplicates = set([h for h in clean_headers if clean_headers.count(h) > 1])
raise InvalidInputFileException(detail=f"Duplicate headers found in Row 1: {', '.join(duplicates)}")

return True

def _data_sense_check(self, x, y):
"""
High-speed coordinate validation for large datasets
"""
try:
# Catch Excel Date objects immediately (Calamine returns these as datetime)
if isinstance(x, datetime) or isinstance(y, datetime):
return False

f_x = float(x)
f_y = float(y)

# Finiteness check (Catches NaN, Inf, and None)
# This is extremely fast in Python
if not (math.isfinite(f_x) and math.isfinite(f_y)):
return False

# Magnitude check
# Limits to +/- 40 million (covers all CRS including Web Mercator)
# but blocks 'serial date numbers' or corrupted scientific notation
if not (-40000000 < f_x < 40000000 and -40000000 < f_y < 40000000):
return False

return True
except (ValueError, TypeError):
return False

def _detect_empty_rows(self, row):
return not row or all(cell is None or str(cell).strip() == "" for cell in row)

def _convert_to_csv(self, headers, rows_gen, output_path):
"""Streams valid data to CSV, skipping empty rows."""

# Define clean_headers once here to find the indices
clean_headers = [str(h).strip().lower() for h in headers]

# Get the indices for the Lat and Lon columns
lat_idx = next(i for i, h in enumerate(clean_headers) if h in self.lat_names)
lon_idx = next(i for i, h in enumerate(clean_headers) if h in self.lon_names)

# Local binding of the check function for loop speed
check_func = self._data_sense_check

with open(output_path, "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(headers)

for row_num, row in enumerate(rows_gen, start=2):
# Skip row if it contains no data
if self._detect_empty_rows(row):
continue

if not check_func(row[lon_idx], row[lat_idx]):
raise InvalidInputFileException(
detail=f"Coordinate error at row {row_num}. "
"Check for dates or non-numeric values in Lat/Lon."
)

writer.writerow(row)
Loading
Loading