diff --git a/.env.sample b/.env.sample index fee9293c82d..3727b7078f2 100644 --- a/.env.sample +++ b/.env.sample @@ -245,4 +245,7 @@ RESTART_POLICY_WINDOW=120s DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5 -# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration \ No newline at end of file +# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration + +# Enable or not the XLSX / XLS upload +XLSX_UPLOAD_ENABLED=False \ No newline at end of file diff --git a/.env_dev b/.env_dev index 0cfa9dad6c7..f4d32a94fa3 100644 --- a/.env_dev +++ b/.env_dev @@ -207,4 +207,7 @@ RESTART_POLICY_WINDOW=120s DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5 UPSERT_CHUNK_SIZE= 100 -UPSERT_LIMIT_ERROR_LOG=100 \ No newline at end of file +UPSERT_LIMIT_ERROR_LOG=100 + +# Enable or not the XLSX / XLS upload +XLSX_UPLOAD_ENABLED=False \ No newline at end of file diff --git a/.env_local b/.env_local index 583a9fc32d6..bc9a975fe1a 100644 --- a/.env_local +++ b/.env_local @@ -209,3 +209,6 @@ RESTART_POLICY_MAX_ATTEMPTS="3" RESTART_POLICY_WINDOW=120s DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5 + +# Enable or not the XLSX / XLS upload +XLSX_UPLOAD_ENABLED=False diff --git a/.env_test b/.env_test index a770063d228..04e8407217a 100644 --- a/.env_test +++ b/.env_test @@ -224,3 +224,6 @@ MICROSOFT_TENANT_ID= AZURE_CLIENT_ID= AZURE_SECRET_KEY= AZURE_KEY= + +# Enable or not the XLSX / XLS upload +XLSX_UPLOAD_ENABLED=False diff --git a/geonode/settings.py b/geonode/settings.py index d9335455510..761f4e17675 100644 --- a/geonode/settings.py +++ b/geonode/settings.py @@ -2221,3 +2221,6 @@ def get_geonode_catalogue_service(): FILE_UPLOAD_DIRECTORY_PERMISSIONS = 0o777 FILE_UPLOAD_PERMISSIONS = 0o777 + +# Enable or not the XLSX / XLS upload +XLSX_UPLOAD_ENABLED = ast.literal_eval(os.getenv("XLSX_UPLOAD_ENABLED", "False")) diff --git a/geonode/upload/handlers/xlsx/__init__.py b/geonode/upload/handlers/xlsx/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff 
--git a/geonode/upload/handlers/xlsx/handler.py b/geonode/upload/handlers/xlsx/handler.py new file mode 100644 index 00000000000..d44e99cb586 --- /dev/null +++ b/geonode/upload/handlers/xlsx/handler.py @@ -0,0 +1,336 @@ +######################################################################### +# +# Copyright (C) 2024 OSGeo +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +######################################################################### +import logging +from pathlib import Path +import csv +from datetime import datetime +import math +from celery import group +from python_calamine import CalamineWorkbook +from osgeo import ogr + +from dynamic_models.models import ModelSchema +from django.conf import settings + +from geonode.upload.handlers.common.vector import BaseVectorFileHandler +from geonode.upload.handlers.csv.handler import CSVFileHandler +from geonode.upload.celery_tasks import create_dynamic_structure +from geonode.upload.handlers.utils import GEOM_TYPE_MAPPING +from geonode.upload.api.exceptions import InvalidInputFileException + +logger = logging.getLogger("importer") + + +class XLSXFileHandler(CSVFileHandler): + + XLSX_UPLOAD_ENABLED = getattr(settings, "XLSX_UPLOAD_ENABLED", False) + + lat_names = CSVFileHandler.possible_lat_column + lon_names = CSVFileHandler.possible_long_column + + @classmethod + def is_xlsx_enabled(cls): + """ + Unified check for the feature toggle. 
+ Returns True if enabled, None if disabled. + """ + if not cls.XLSX_UPLOAD_ENABLED: + return None + return True + + @property + def supported_file_extension_config(self): + + # If disabled, return an empty list or None so the UI doesn't show XLSX options + if not self.is_xlsx_enabled(): + return None + + return { + "id": "excel", # Use a generic ID that doesn't imply a specific extension + "formats": [ + { + "label": "Excel (xlsx)", + "required_ext": ["xlsx"], + "optional_ext": ["sld", "xml"], + }, + { + "label": "Excel (xls)", + "required_ext": ["xls"], + "optional_ext": ["sld", "xml"], + }, + ], + "actions": list(self.TASKS.keys()), + "type": "vector", + } + + @staticmethod + def can_handle(_data) -> bool: + """ + This endpoint will return True or False if with the info provided + the handler is able to handle the file or not + """ + # Availability Check for the back-end + if not XLSXFileHandler.is_xlsx_enabled(): + return False + + base = _data.get("base_file") + if not base: + return False + + # Support both XLSX and XLS + valid_extensions = (".xlsx", ".xls") + + is_excel = ( + base.lower().endswith(valid_extensions) + if isinstance(base, str) + else base.name.lower().endswith(valid_extensions) + ) + + return is_excel and BaseVectorFileHandler.can_handle(_data) + + @staticmethod + def is_valid(files, user, **kwargs): + from geonode.upload.utils import UploadLimitValidator + + # Basic GeoNode validation + BaseVectorFileHandler.is_valid(files, user) + + # Parallelism check (This is fast and doesn't need to open the file) + upload_validator = UploadLimitValidator(user) + upload_validator.validate_parallelism_limit_per_user() + + # We handle the deep inspection (lat/lon) later. + return True + + @staticmethod + def create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate, **kwargs): + """ + Customized for XLSX: Only looks for X/Y (Point) data. + Sanitized with shlex.quote to prevent Command Injection. 
# Build the base ogr2ogr command via the shared vector handler
orchestrator + + # calling the super function (CSVFileHandler logic) + _data, execution_id = super().pre_processing(files, execution_id, **kwargs) + + # convert the XLSX file into a CSV + xlsx_file = _data.get("files", {}).get("base_file", "") + if not xlsx_file: + raise InvalidInputFileException(detail="The base file was not found in the upload payload.") + + output_file = str(Path(xlsx_file).with_suffix(".csv")) + + try: + workbook = CalamineWorkbook.from_path(xlsx_file) + + # Sheet Validation (Uses the validated sheet name) + sheet_name = self._validate_sheets(workbook) + sheet = workbook.get_sheet_by_name(sheet_name) + + # We iterate until we find the first non-empty row + rows_gen = iter(sheet.to_python()) + try: + # We strictly take the first row. No skipping allowed. + headers = next(rows_gen) + except StopIteration: + raise InvalidInputFileException(detail="The file is empty.") + + # Restrictive File Structure Validation + self._validate_headers(headers) + + # Conversion with row cleanup + # Note: rows_gen continues from the row after the headers + self._convert_to_csv(headers, rows_gen, output_file) + + except Exception as e: + logger.exception("XLSX Pre-processing failed") + raise InvalidInputFileException(detail=f"Failed to securely parse Excel: {str(e)}") + + # update the file path in the payload + _data["files"]["base_file"] = output_file + + if "temporary_files" not in _data or not isinstance(_data["temporary_files"], dict): + _data["temporary_files"] = {} + + _data["temporary_files"]["base_file"] = output_file + + # updating the execution id params + orchestrator.update_execution_request_obj( + orchestrator.get_execution_object(execution_id), {"input_params": _data} + ) + return _data, execution_id + + def _validate_sheets(self, workbook): + """Returns the first sheet name and logs warnings if others exist.""" + sheets = workbook.sheet_names + if not sheets: + raise InvalidInputFileException(detail="No sheets found in workbook.") + if len(sheets) > 
1: + logger.warning(f"Multiple sheets found. Ignoring: {sheets[1:]}") + return sheets[0] + + def _validate_headers(self, headers): + """ + Strictly validates Row 1 for headers: + - Must not be empty. + - Must contain geometry 'fingerprints' (Lat/Lon). + - Must have unique and non-empty column names. + """ + # Existence Check + if not headers or self._detect_empty_rows(headers): + raise InvalidInputFileException(detail="No data or headers found in the selected sheet.") + + # Normalization + clean_headers = [str(h).strip().lower() if h is not None else "" for h in headers] + + # Geometry Fingerprint Check + has_lat = any(h in self.lat_names for h in clean_headers) + has_lon = any(h in self.lon_names for h in clean_headers) + + if not (has_lat and has_lon): + raise InvalidInputFileException( + detail="The headers do not contain valid geometry headers. " + "GeoNode requires Latitude and Longitude labels in the first row." + ) + + # Integrity Check (No Empty Names) + if any(h == "" for h in clean_headers): + raise InvalidInputFileException(detail="One or more columns in the first row are missing a header name.") + + # Uniqueness Check + if len(clean_headers) != len(set(clean_headers)): + duplicates = set([h for h in clean_headers if clean_headers.count(h) > 1]) + raise InvalidInputFileException(detail=f"Duplicate headers found in Row 1: {', '.join(duplicates)}") + + return True + + def _data_sense_check(self, x, y): + """ + High-speed coordinate validation for large datasets + """ + try: + # Catch Excel Date objects immediately (Calamine returns these as datetime) + if isinstance(x, datetime) or isinstance(y, datetime): + return False + + f_x = float(x) + f_y = float(y) + + # Finiteness check (Catches NaN, Inf, and None) + # This is extremely fast in Python + if not (math.isfinite(f_x) and math.isfinite(f_y)): + return False + + # Magnitude check + # Limits to +/- 40 million (covers all CRS including Web Mercator) + # but blocks 'serial date numbers' or corrupted 
scientific notation + if not (-40000000 < f_x < 40000000 and -40000000 < f_y < 40000000): + return False + + return True + except (ValueError, TypeError): + return False + + def _detect_empty_rows(self, row): + return not row or all(cell is None or str(cell).strip() == "" for cell in row) + + def _convert_to_csv(self, headers, rows_gen, output_path): + """Streams valid data to CSV, skipping empty rows.""" + + # Define clean_headers once here to find the indices + clean_headers = [str(h).strip().lower() for h in headers] + + # Get the indices for the Lat and Lon columns + lat_idx = next(i for i, h in enumerate(clean_headers) if h in self.lat_names) + lon_idx = next(i for i, h in enumerate(clean_headers) if h in self.lon_names) + + # Local binding of the check function for loop speed + check_func = self._data_sense_check + + with open(output_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(headers) + + for row_num, row in enumerate(rows_gen, start=2): + # Skip row if it contains no data + if self._detect_empty_rows(row): + continue + + if not check_func(row[lon_idx], row[lat_idx]): + raise InvalidInputFileException( + detail=f"Coordinate error at row {row_num}. " + "Check for dates or non-numeric values in Lat/Lon." 
+ ) + + writer.writerow(row) diff --git a/geonode/upload/handlers/xlsx/tests.py b/geonode/upload/handlers/xlsx/tests.py new file mode 100644 index 00000000000..fb432f799b0 --- /dev/null +++ b/geonode/upload/handlers/xlsx/tests.py @@ -0,0 +1,164 @@ +import os +import tempfile +import zipfile +import uuid +from unittest.mock import patch +from django.test import TestCase +from django.contrib.auth import get_user_model + +from geonode.upload import project_dir +from geonode.upload.api.exceptions import InvalidInputFileException +from geonode.upload.handlers.xlsx.handler import XLSXFileHandler + + +class TestXLSXHandler(TestCase): + databases = ("default", "datastore") + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.handler = XLSXFileHandler() + + # Consistent with CSV handler's fixture path + cls.valid_xlsx = f"{project_dir}/tests/fixture/valid_excel.xlsx" + cls.valid_xls = f"{project_dir}/tests/fixture/valid_excel.xls" + cls.empty_rows_xlsx = f"{project_dir}/tests/fixture/valid_with_empty_rows.xlsx" + cls.leading_empty_xlsx = f"{project_dir}/tests/fixture/valid_leading_empty_rows.xlsx" + cls.missing_lat_xlsx = f"{project_dir}/tests/fixture/missing_lat.xlsx" + cls.wrong_data_xlsx = f"{project_dir}/tests/fixture/wrong_data.xlsx" + + cls.user, _ = get_user_model().objects.get_or_create(username="admin") + + def setUp(self): + # Force the handler to be enabled for testing + XLSXFileHandler.XLSX_UPLOAD_ENABLED = True + + @patch("geonode.upload.handlers.common.vector.BaseVectorFileHandler.can_handle") + def test_can_handle_xlsx_and_xls(self, mock_base_can_handle): + """Check if the handler identifies both extensions.""" + mock_base_can_handle.return_value = True + + self.assertTrue(self.handler.can_handle({"base_file": self.valid_xlsx})) + self.assertTrue(self.handler.can_handle({"base_file": self.valid_xls})) + + # Also verify it returns False when the file is wrong + self.assertFalse(self.handler.can_handle({"base_file": "random.txt"})) + + 
@patch("geonode.upload.orchestrator.orchestrator.get_execution_object") + @patch("geonode.upload.orchestrator.orchestrator.update_execution_request_obj") + def test_pre_processing_success_with_valid_files(self, mock_update, mock_get_exec): + test_files = [self.valid_xlsx, self.valid_xls, self.empty_rows_xlsx, self.leading_empty_xlsx] + + for file_path in test_files: + exec_id = str(uuid.uuid4()) + files = {"base_file": file_path} + + with patch( + "geonode.upload.handlers.csv.handler.CSVFileHandler.pre_processing", + return_value=({"files": files, "temporary_files": {}}, exec_id), + ): + + data, _ = self.handler.pre_processing(files, exec_id) + + output_csv = data["files"]["base_file"] + self.assertTrue(output_csv.endswith(".csv")) + self.assertTrue(os.path.exists(output_csv)) + + # Cleanup + if os.path.exists(output_csv): + os.remove(output_csv) + + @patch("geonode.upload.orchestrator.orchestrator.get_execution_object") + def test_pre_processing_fails_on_missing_lat(self, mock_get_exec): + """Should fail when header fingerprinting doesn't find Latitude.""" + exec_id = str(uuid.uuid4()) + files = {"base_file": self.missing_lat_xlsx} + + with patch( + "geonode.upload.handlers.csv.handler.CSVFileHandler.pre_processing", + return_value=({"files": files}, exec_id), + ): + with self.assertRaises(InvalidInputFileException) as context: + self.handler.pre_processing(files, exec_id) + + self.assertIn("geometry headers", str(context.exception)) + + @patch("geonode.upload.orchestrator.orchestrator.get_execution_object") + def test_pre_processing_fails_on_wrong_data(self, mock_get_exec): + """Should fail on row 1 of the data due to 'nan' and extreme magnitude.""" + exec_id = str(uuid.uuid4()) + files = {"base_file": self.wrong_data_xlsx} + + with patch( + "geonode.upload.handlers.csv.handler.CSVFileHandler.pre_processing", + return_value=({"files": files}, exec_id), + ): + with self.assertRaises(InvalidInputFileException) as context: + self.handler.pre_processing(files, 
exec_id) + + # The error should specifically mention the coordinate error and the row + self.assertIn("Coordinate error at row 2", str(context.exception)) + + def test_data_sense_check_logic(self): + """Directly test the coordinate validation math.""" + # Valid + self.assertTrue(self.handler._data_sense_check(37.8, -122.4)) + # NaN + self.assertFalse(self.handler._data_sense_check("nan", 40.0)) + # Infinite + self.assertFalse(self.handler._data_sense_check(float("inf"), 40.0)) + # Extreme Magnitude + self.assertFalse(self.handler._data_sense_check(40000001, 10.0)) + # Excel Date (as datetime object) + from datetime import datetime + + self.assertFalse(self.handler._data_sense_check(datetime.now(), 40.0)) + + def test_security_billion_laughs_protection(self): + """ + Security Test: Verifies protection against XML Entity Expansion (Billion Laughs). + Ensures the parser handles malicious DTD entities without crashing. + """ + # Create the malicious payload in memory + xml_payload = """ + + + + + ]> + + + """ + + # Use a temporary file so we don't pollute the project's fixture folder + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tf: + with zipfile.ZipFile(tf, "w") as zf: + zf.writestr("xl/workbook.xml", xml_payload) + zf.writestr( + "[Content_Types].xml", + '' + '' + '', + ) + malicious_path = tf.name + + exec_id = str(uuid.uuid4()) + files = {"base_file": malicious_path} + + try: + # Patch the super().pre_processing to return our temp file + with patch( + "geonode.upload.handlers.csv.handler.CSVFileHandler.pre_processing", + return_value=({"files": files, "temporary_files": {}}, exec_id), + ): + + # The test passes if it raises the exception OR if it handles it safely + # without timing out (hanging). 
+ with self.assertRaises(InvalidInputFileException) as context: + self.handler.pre_processing(files, exec_id) + + self.assertIn("Failed to securely parse Excel", str(context.exception)) + finally: + if os.path.exists(malicious_path): + os.remove(malicious_path) diff --git a/geonode/upload/settings.py b/geonode/upload/settings.py index e021a030395..07f5b69b200 100644 --- a/geonode/upload/settings.py +++ b/geonode/upload/settings.py @@ -40,4 +40,5 @@ "geonode.upload.handlers.remote.wms.RemoteWMSResourceHandler", "geonode.upload.handlers.remote.cog.RemoteCOGResourceHandler", "geonode.upload.handlers.empty_dataset.handler.EmptyDatasetHandler", + "geonode.upload.handlers.xlsx.handler.XLSXFileHandler", ] diff --git a/geonode/upload/tests/fixture/missing_lat.xlsx b/geonode/upload/tests/fixture/missing_lat.xlsx new file mode 100644 index 00000000000..70d5aea451e Binary files /dev/null and b/geonode/upload/tests/fixture/missing_lat.xlsx differ diff --git a/geonode/upload/tests/fixture/valid_excel.xls b/geonode/upload/tests/fixture/valid_excel.xls new file mode 100644 index 00000000000..c0d1675204d Binary files /dev/null and b/geonode/upload/tests/fixture/valid_excel.xls differ diff --git a/geonode/upload/tests/fixture/valid_excel.xlsx b/geonode/upload/tests/fixture/valid_excel.xlsx new file mode 100644 index 00000000000..38c4918b526 Binary files /dev/null and b/geonode/upload/tests/fixture/valid_excel.xlsx differ diff --git a/geonode/upload/tests/fixture/valid_leading_empty_rows.xlsx b/geonode/upload/tests/fixture/valid_leading_empty_rows.xlsx new file mode 100644 index 00000000000..e929ff5735a Binary files /dev/null and b/geonode/upload/tests/fixture/valid_leading_empty_rows.xlsx differ diff --git a/geonode/upload/tests/fixture/valid_with_empty_rows.xlsx b/geonode/upload/tests/fixture/valid_with_empty_rows.xlsx new file mode 100644 index 00000000000..a6979d5e7ee Binary files /dev/null and b/geonode/upload/tests/fixture/valid_with_empty_rows.xlsx differ diff --git 
a/geonode/upload/tests/fixture/wrong_data.csv b/geonode/upload/tests/fixture/wrong_data.csv new file mode 100644 index 00000000000..967a1f66be8 --- /dev/null +++ b/geonode/upload/tests/fixture/wrong_data.csv @@ -0,0 +1 @@ +id,name,latitude,longitude,description diff --git a/geonode/upload/tests/fixture/wrong_data.xlsx b/geonode/upload/tests/fixture/wrong_data.xlsx new file mode 100644 index 00000000000..36b35f808b9 Binary files /dev/null and b/geonode/upload/tests/fixture/wrong_data.xlsx differ diff --git a/pyproject.toml b/pyproject.toml index 41cc837fab3..f083204cbfe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -159,6 +159,8 @@ dependencies = [ # Security and audit "cryptography==46.0.3", "jwcrypto>=1.5.6", + # dependency for XLSX handler + "python-calamine==0.6.1", ] [project.optional-dependencies]