From 9d55e81f22542a968b254f2b137c69be1e3e6bc4 Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 24 Feb 2026 10:10:19 -0600 Subject: [PATCH 1/8] fix: sanitize download filenames to prevent path traversal --- src/pyUSPTO/clients/base.py | 18 ++++--- tests/clients/test_base.py | 99 +++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 6 deletions(-) diff --git a/src/pyUSPTO/clients/base.py b/src/pyUSPTO/clients/base.py index e85ff46..c535ba3 100644 --- a/src/pyUSPTO/clients/base.py +++ b/src/pyUSPTO/clients/base.py @@ -562,12 +562,18 @@ def _save_response_to_file( else: filename = "download" - if destination: - dest_path = Path(destination) - dest_path.mkdir(parents=True, exist_ok=True) - final_path = dest_path / filename - else: - final_path = Path.cwd() / filename + filename = Path(filename).name + if not filename or filename in (".", ".."): + filename = "download" + + dest_path = Path(destination) if destination else Path.cwd() + dest_path.mkdir(parents=True, exist_ok=True) + final_path = dest_path / filename + + if not self._is_safe_path(dest_path, final_path): + raise ValueError( + f"Filename {filename!r} resolves outside destination directory" + ) if final_path.exists() and not overwrite: raise FileExistsError(f"File exists: {final_path}. Use overwrite=True") diff --git a/tests/clients/test_base.py b/tests/clients/test_base.py index d257415..3b315b7 100644 --- a/tests/clients/test_base.py +++ b/tests/clients/test_base.py @@ -1416,6 +1416,105 @@ def test_save_to_current_directory_when_no_destination( assert result == str(expected_path) +class TestSaveResponseToFilePathTraversal: + """Tests for filename sanitization in _save_response_to_file.""" + + @patch("builtins.open", new_callable=mock_open) + def test_content_disposition_path_traversal_stripped( + self, mock_file_open: MagicMock, tmp_path: Any + ) -> None: + """Filenames with directory traversal sequences are sanitized.""" + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + mock_response = MagicMock() + mock_response.headers = { + "Content-Disposition": 'attachment; filename="../../etc/passwd"' + } + mock_response.iter_content.return_value = [b"data"] + + result = client._save_response_to_file(mock_response, str(tmp_path)) + + expected_path = tmp_path / "passwd" + mock_file_open.assert_called_once_with(expected_path, "wb") + assert result == str(expected_path) + + @patch("builtins.open", new_callable=mock_open) + def test_filename_with_path_separators_stripped( + self, mock_file_open: MagicMock, tmp_path: Any + ) -> None: + """User-provided filenames with path separators are sanitized.""" + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + mock_response = MagicMock() + mock_response.headers = {} + mock_response.iter_content.return_value = [b"data"] + + result = client._save_response_to_file( + mock_response, str(tmp_path), file_name="../evil.txt" + ) + + expected_path = tmp_path / "evil.txt" + mock_file_open.assert_called_once_with(expected_path, "wb") + assert result == str(expected_path) + + @patch("builtins.open", new_callable=mock_open) + def test_url_path_traversal_stripped( + self, mock_file_open: MagicMock, tmp_path: Any + ) -> None: + """Filenames derived from URLs with traversal are sanitized.""" + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + mock_response = MagicMock() + mock_response.headers = {} + mock_response.url = "https://test.com/../../secret.pdf" + mock_response.iter_content.return_value = [b"data"] + + result = client._save_response_to_file(mock_response, str(tmp_path)) + + expected_path = tmp_path / "secret.pdf" + mock_file_open.assert_called_once_with(expected_path, "wb") + assert result == str(expected_path) + + @patch("builtins.open", new_callable=mock_open) + def test_empty_filename_after_sanitization_falls_back( + self, mock_file_open: MagicMock, tmp_path: Any + ) -> None: + """A filename that becomes empty after sanitization falls back to 'download'.""" + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + mock_response = MagicMock() + mock_response.headers = { + "Content-Disposition": 'attachment; filename="../../"' + } + mock_response.iter_content.return_value = [b"data"] + + result = client._save_response_to_file(mock_response, str(tmp_path)) + + expected_path = tmp_path / "download" + mock_file_open.assert_called_once_with(expected_path, "wb") + assert result == str(expected_path) + + def test_is_safe_path_rejects_unsafe_resolved_path( + self, tmp_path: Any + ) -> None: + """Raises ValueError when resolved path escapes destination.""" + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + mock_response = MagicMock() + mock_response.headers = { + "Content-Disposition": 'attachment; filename="safe.txt"' + } + + with patch.object(client, "_is_safe_path", return_value=False): + with pytest.raises(ValueError, match="resolves outside"): + client._save_response_to_file(mock_response, str(tmp_path)) + + class TestExtractArchive: """Tests for _extract_archive method.""" From 2458bf40c133601e76f291822069516018fb3fe9 Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 24 Feb 2026 10:25:35 -0600 Subject: [PATCH 2/8] fix: add download path validation and zip-bomb protection --- ADVANCED.md | 32 ++++++++++++++++++++++++++++++-- src/pyUSPTO/clients/base.py | 13 +++++++++++-- src/pyUSPTO/http_config.py | 6 ++++++ tests/test_http_config.py | 3 +++ 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/ADVANCED.md b/ADVANCED.md index 774846c..beccba6 100644 --- a/ADVANCED.md +++ b/ADVANCED.md @@ -75,8 +75,6 @@ All clients support configuration via environment variables. This is the recomme | `USPTO_PATENT_DATA_BASE_URL` | Base URL for Patent Data API | `https://api.uspto.gov` | | `USPTO_PETITION_DECISIONS_BASE_URL` | Base URL for Petition Decisions API | `https://api.uspto.gov` | | `USPTO_PTAB_BASE_URL` | Base URL for PTAB APIs | `https://api.uspto.gov` | -| `USPTO_DOWNLOAD_CHUNK_SIZE` | Chunk size in bytes for file downloads | `8192` | - ### HTTP Transport Configuration | Environment Variable | Description | Default | @@ -87,6 +85,8 @@ All clients support configuration via environment variables. This is the recomme | `USPTO_BACKOFF_FACTOR` | Exponential backoff multiplier for retries | `2.0` | | `USPTO_POOL_CONNECTIONS` | Number of connection pools to cache | `10` | | `USPTO_POOL_MAXSIZE` | Maximum connections per pool | `10` | +| `USPTO_DOWNLOAD_CHUNK_SIZE` | Chunk size in bytes for file downloads | `8192` | +| `USPTO_MAX_EXTRACT_SIZE` | Maximum bytes to extract from archives | None (no limit) | ### Example: Configuration @@ -212,3 +212,31 @@ warnings.filterwarnings('always', category=USPTODataWarning) ``` The library's permissive parsing philosophy returns `None` for fields that cannot be parsed, allowing you to retrieve partial data even when some fields have issues. Warnings inform you when this happens without stopping execution. + +## Archive Extraction Safety + +Download methods that accept `extract=True` (e.g., `BulkDataClient.download_file`) automatically extract archive files (tar.gz, zip). The extraction includes the following protections: + +- **Path traversal protection**: Archive members with paths that resolve outside the extraction directory are rejected. +- **Size limits**: Set `max_extract_size` on `HTTPConfig` to cap the total bytes extracted, protecting against zip bombs. + +```python +from pyUSPTO import USPTOConfig, HTTPConfig, BulkDataClient + +http_config = HTTPConfig( + max_extract_size=10 * 1024 * 1024 * 1024 # 10 GB +) +config = USPTOConfig(api_key="your_key", http_config=http_config) +client = BulkDataClient(config=config) + +# Extraction will raise ValueError if total extracted size exceeds 10 GB +client.download_file(product_file, destination="/tmp", extract=True) +``` + +Or via environment variable: + +```bash +export USPTO_MAX_EXTRACT_SIZE=10737418240 # 10 GB +``` + +By default, `extract` is `False` on `BulkDataClient.download_file` and there is no size limit. \ No newline at end of file diff --git a/src/pyUSPTO/clients/base.py b/src/pyUSPTO/clients/base.py index c535ba3..d57de11 100644 --- a/src/pyUSPTO/clients/base.py +++ b/src/pyUSPTO/clients/base.py @@ -720,6 +720,10 @@ def _download_and_extract( ) -> str: """Download file and auto-extract if it's an archive. + Archives are extracted with path traversal protection. Extraction size + can be limited via ``http_config.max_extract_size`` to guard against + zip bombs. + Args: url: URL to download destination: Directory to save/extract to @@ -732,7 +736,8 @@ def _download_and_extract( Raises: TypeError: If response is not a valid Response object FileExistsError: If file exists and overwrite is False - ValueError: If downloaded file is not a valid archive when extraction attempted + ValueError: If downloaded file is not a valid archive when extraction + attempted, or if extraction exceeds max_extract_size """ import tarfile import zipfile @@ -749,7 +754,11 @@ def _download_and_extract( ) if is_archive: - return self._extract_archive(path_obj, remove_archive=True) + return self._extract_archive( + path_obj, + remove_archive=True, + max_size=self.http_config.max_extract_size, + ) else: return downloaded_path diff --git a/src/pyUSPTO/http_config.py b/src/pyUSPTO/http_config.py index 8c1fcf7..4944d68 100644 --- a/src/pyUSPTO/http_config.py +++ b/src/pyUSPTO/http_config.py @@ -27,6 +27,7 @@ class HTTPConfig: pool_connections: Number of connection pools to cache (default: 10) pool_maxsize: Maximum number of connections per pool (default: 10) download_chunk_size: Chunk size in bytes for streaming file downloads (default: 8192) + max_extract_size: Maximum total bytes to extract from archives (default: None, no limit) custom_headers: Additional headers to include in all requests """ @@ -47,6 +48,7 @@ class HTTPConfig: # Download configuration download_chunk_size: int = 8192 # Bytes per chunk when streaming downloads + max_extract_size: int | None = None # Custom headers (User-Agent, tracking, etc.) custom_headers: dict[str, str] | None = None @@ -78,6 +80,7 @@ def from_env(cls) -> "HTTPConfig": USPTO_POOL_CONNECTIONS: Connection pool size USPTO_POOL_MAXSIZE: Max connections per pool USPTO_DOWNLOAD_CHUNK_SIZE: Chunk size for streaming downloads (bytes) + USPTO_MAX_EXTRACT_SIZE: Maximum bytes to extract from archives Returns: HTTPConfig instance with values from environment or defaults @@ -92,6 +95,9 @@ def from_env(cls) -> "HTTPConfig": download_chunk_size=int( os.environ.get("USPTO_DOWNLOAD_CHUNK_SIZE", "8192") ), + max_extract_size=( + int(v) if (v := os.environ.get("USPTO_MAX_EXTRACT_SIZE")) else None + ), ) def get_timeout_tuple(self) -> tuple[float | None, float | None]: diff --git a/tests/test_http_config.py b/tests/test_http_config.py index 2733028..6fd005a 100644 --- a/tests/test_http_config.py +++ b/tests/test_http_config.py @@ -49,6 +49,7 @@ def test_from_env(self, monkeypatch): monkeypatch.setenv("USPTO_BACKOFF_FACTOR", "1.5") monkeypatch.setenv("USPTO_POOL_CONNECTIONS", "15") monkeypatch.setenv("USPTO_POOL_MAXSIZE", "25") + monkeypatch.setenv("USPTO_MAX_EXTRACT_SIZE", "5368709120") # 5 GB config = HTTPConfig.from_env() assert config.timeout == 45.0 @@ -57,6 +58,7 @@ def test_from_env(self, monkeypatch): assert config.backoff_factor == 1.5 assert config.pool_connections == 15 assert config.pool_maxsize == 25 + assert config.max_extract_size == 5368709120 def test_from_env_with_defaults(self): """Test HTTPConfig.from_env() uses defaults when env vars not set""" @@ -78,6 +80,7 @@ def test_from_env_with_defaults(self): assert config.backoff_factor == 2.0 assert config.pool_connections == 10 assert config.pool_maxsize == 10 + assert config.max_extract_size is None def test_get_timeout_tuple(self): """Test timeout tuple generation""" From 7817100764221edde19b73bad7523c0089866d4a Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 24 Feb 2026 10:31:50 -0600 Subject: [PATCH 3/8] docs: add session lifecycle and extraction safety sections --- ADVANCED.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/ADVANCED.md b/ADVANCED.md index beccba6..787cd01 100644 --- a/ADVANCED.md +++ b/ADVANCED.md @@ -31,7 +31,31 @@ config = USPTOConfig( client = PatentDataClient(config=config) ``` -Configure HTTP settings via environment variables: +## Session Lifecycle + +`USPTOConfig` manages an underlying `requests.Session`. For short-lived scripts this is cleaned up automatically, but for long-running applications or tests you may close it explicitly: + +```python +from pyUSPTO import PatentDataClient, USPTOConfig + +# Option 1: Context manager +with USPTOConfig(api_key="your_key") as config: + client = PatentDataClient(config=config) + response = client.search_applications(limit=1) + +# Option 2: Explicit close +config = USPTOConfig(api_key="your_key") +try: + client = PatentDataClient(config=config) + response = client.search_applications(limit=1) +finally: + config.close() +``` + + +## Configure HTTP settings + +via environment variables: ```bash export USPTO_REQUEST_TIMEOUT=60.0 # Read timeout From 005ad2e2b94387ad3c4af6e79b063ed60ffc51d1 Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 24 Feb 2026 11:26:23 -0600 Subject: [PATCH 4/8] docs: improve README structure and add missing examples; fix: enforce keyword names in get_IFW_metadata. --- ADVANCED.md | 40 +++++--- README.md | 151 +++++++++++------------------ src/pyUSPTO/clients/patent_data.py | 1 + 3 files changed, 84 insertions(+), 108 deletions(-) diff --git a/ADVANCED.md b/ADVANCED.md index 787cd01..e1dcf4a 100644 --- a/ADVANCED.md +++ b/ADVANCED.md @@ -52,10 +52,7 @@ finally: config.close() ``` - -## Configure HTTP settings - -via environment variables: +## HTTP Configuration via Environment Variables ```bash export USPTO_REQUEST_TIMEOUT=60.0 # Read timeout @@ -99,24 +96,32 @@ All clients support configuration via environment variables. This is the recomme | `USPTO_PATENT_DATA_BASE_URL` | Base URL for Patent Data API | `https://api.uspto.gov` | | `USPTO_PETITION_DECISIONS_BASE_URL` | Base URL for Petition Decisions API | `https://api.uspto.gov` | | `USPTO_PTAB_BASE_URL` | Base URL for PTAB APIs | `https://api.uspto.gov` | + +> [!NOTE] +> The base URL variables are provided in case the USPTO introduces alternate environments (e.g., a development or testing endpoint) in the future. Currently there are no such endpoints, and these defaults should not be changed. + ### HTTP Transport Configuration -| Environment Variable | Description | Default | -| -------------------------- | ------------------------------------------ | -------- | -| `USPTO_REQUEST_TIMEOUT` | Read timeout in seconds | `30.0` | -| `USPTO_CONNECT_TIMEOUT` | Connection timeout in seconds | `10.0` | -| `USPTO_MAX_RETRIES` | Maximum number of retry attempts | `3` | -| `USPTO_BACKOFF_FACTOR` | Exponential backoff multiplier for retries | `2.0` | -| `USPTO_POOL_CONNECTIONS` | Number of connection pools to cache | `10` | -| `USPTO_POOL_MAXSIZE` | Maximum connections per pool | `10` | -| `USPTO_DOWNLOAD_CHUNK_SIZE` | Chunk size in bytes for file downloads | `8192` | -| `USPTO_MAX_EXTRACT_SIZE` | Maximum bytes to extract from archives | None (no limit) | +| Environment Variable | Description | Default | +| ----------------------------- | ------------------------------------------ | --------------- | +| `USPTO_REQUEST_TIMEOUT` | Read timeout in seconds | `30.0` | +| `USPTO_CONNECT_TIMEOUT` | Connection timeout in seconds | `10.0` | +| `USPTO_MAX_RETRIES` | Maximum number of retry attempts | `3` | +| `USPTO_BACKOFF_FACTOR` | Exponential backoff multiplier for retries | `2.0` | +| `USPTO_POOL_CONNECTIONS` | Number of connection pools to cache | `10` | +| `USPTO_POOL_MAXSIZE` | Maximum connections per pool | `10` | +| `USPTO_DOWNLOAD_CHUNK_SIZE` | Chunk size in bytes for file downloads | `8192` | +| `USPTO_MAX_EXTRACT_SIZE` | Maximum bytes to extract from archives | None (no limit) | ### Example: Configuration ```bash # API Configuration export USPTO_API_KEY="your_api_key" +export USPTO_BULK_DATA_BASE_URL="https://api.uspto.gov" +export USPTO_PATENT_DATA_BASE_URL="https://api.uspto.gov" +export USPTO_PETITION_DECISIONS_BASE_URL="https://api.uspto.gov" +export USPTO_PTAB_BASE_URL="https://api.uspto.gov" # Increase timeouts for large downloads export USPTO_REQUEST_TIMEOUT=120.0 @@ -132,6 +137,9 @@ export USPTO_POOL_MAXSIZE=20 # Larger chunk size for faster downloads export USPTO_DOWNLOAD_CHUNK_SIZE=65536 + +# Limit total bytes extracted from archives +export USPTO_MAX_EXTRACT_SIZE=10737418240 ``` ## Debugging with Raw Data Preservation @@ -242,7 +250,7 @@ The library's permissive parsing philosophy returns `None` for fields that canno Download methods that accept `extract=True` (e.g., `BulkDataClient.download_file`) automatically extract archive files (tar.gz, zip). The extraction includes the following protections: - **Path traversal protection**: Archive members with paths that resolve outside the extraction directory are rejected. -- **Size limits**: Set `max_extract_size` on `HTTPConfig` to cap the total bytes extracted, protecting against zip bombs. +- **Size limits**: Set `max_extract_size` on `HTTPConfig` to cap the total bytes extracted, protecting against zip bombs or file system size limitations. ```python from pyUSPTO import USPTOConfig, HTTPConfig, BulkDataClient @@ -263,4 +271,4 @@ Or via environment variable: export USPTO_MAX_EXTRACT_SIZE=10737418240 # 10 GB ``` -By default, `extract` is `False` on `BulkDataClient.download_file` and there is no size limit. \ No newline at end of file +By default, `extract` is `False` on `BulkDataClient.download_file` and there is no size limit. diff --git a/README.md b/README.md index 2199223..f8b7d42 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A Python client library for interacting with the United Stated Patent and Tradem This package provides clients for interacting with the USPTO Bulk Data API, Patent Data API, Final Petition Decisions API, and PTAB (Patent Trial and Appeal Board) APIs. > [!IMPORTANT] -> The USPTO is in the process of moving their API. This package is only concerned with the new API. The [old API](https://developer.uspto.gov/) will be retired at the end of 2025. +> The USPTO is in the process of moving their Developer API. This package is only concerned with the new API. The [old API](https://developer.uspto.gov/) was officially retired at the end of 2025; however, some products have not yet been fully transitioned to the Open Data Portal API. The USPTO expects the remaining products to be transitioned to the Open Data Portal in early 2026. ## Quick Start @@ -37,33 +37,12 @@ print(f"Found {results.count} applications") ## Configuration -All clients require a `USPTOConfig` object for configuration. There are two methods: +All clients require a `USPTOConfig` object. You can create one from environment variables (recommended) or by passing the API key directly. -### Method 1: Using USPTOConfig - -```python -from pyUSPTO import ( - BulkDataClient, - PatentDataClient, - FinalPetitionDecisionsClient, - PTABTrialsClient, - PTABAppealsClient, - PTABInterferencesClient -) - -from pyUSPTO.config import USPTOConfig - -config = USPTOConfig(api_key="your_api_key_here") - -patent_client = PatentDataClient(config=config) -bulk_client = BulkDataClient(config=config) -petition_client = FinalPetitionDecisionsClient(config=config) -trials_client = PTABTrialsClient(config=config) -appeals_client = PTABAppealsClient(config=config) -interferences_client = PTABInterferencesClient(config=config) -``` +> [!TIP] +> `USPTOConfig` manages an underlying HTTP session. For long-running applications, use it as a context manager (`with USPTOConfig(...) as config:`) or call `config.close()` when done. See [ADVANCED.md](ADVANCED.md#session-lifecycle) for details. -### Method 2: Environment Variables (Recommended) +### Environment Variables (Recommended) Set the environment variable in your shell: @@ -80,9 +59,9 @@ from pyUSPTO import ( FinalPetitionDecisionsClient, PTABTrialsClient, PTABAppealsClient, - PTABInterferencesClient + PTABInterferencesClient, + USPTOConfig, ) -from pyUSPTO.config import USPTOConfig # Load configuration from environment config = USPTOConfig.from_env() @@ -95,7 +74,17 @@ appeals_client = PTABAppealsClient(config=config) interferences_client = PTABInterferencesClient(config=config) ``` -## API Usage Examples +### Direct API Key + +Alternatively, you can pass your API key directly when creating the config: + +```python +from pyUSPTO import USPTOConfig + +config = USPTOConfig(api_key="your_api_key_here") +``` + +## Client Usage Examples > [!TIP] > For comprehensive examples with detailed explanations, see the [`examples/`](examples/) directory. @@ -103,7 +92,7 @@ interferences_client = PTABInterferencesClient(config=config) ### Patent Data API ```python -from pyUSPTO import PatentDataClient +from pyUSPTO import PatentDataClient, USPTOConfig config = USPTOConfig(api_key="your_api_key_here") client = PatentDataClient(config=config) @@ -118,12 +107,47 @@ if app.application_meta_data: print(f"Title: {app.application_meta_data.invention_title}") ``` +`PatentDataClient` also provides convenience methods for common lookups: + +```python +# Look up a patent wrapper by any identifier type (you must use keyword names). +wrapper = client.get_IFW_metadata(application_number="18/045,436") +wrapper = client.get_IFW_metadata(patent_number="11,234,567") +wrapper = client.get_IFW_metadata(publication_number="2023/0012345") +wrapper = client.get_IFW_metadata(PCT_app_number="PCT/US24/12345") + +# Look up USPTO status codes +status_codes = client.get_status_codes() +``` + See [`examples/patent_data_example.py`](examples/patent_data_example.py) for detailed examples including downloading documents and publications. +### Bulk Data API + +```python +from pyUSPTO import BulkDataClient, USPTOConfig + +config = USPTOConfig(api_key="your_api_key_here") +client = BulkDataClient(config=config) + +# Search for bulk data products +response = client.search_products(query="patent", limit=5) +print(f"Found {response.count} products matching 'patent'") + +for product in response.bulk_data_product_bag: + print(f" {product.product_title_text} ({product.product_identifier})") + +# Get a specific product with its files +product = client.get_product_by_id("PTGRXML", include_files=True, latest=True) +print(f"Product: {product.product_title_text}") +``` + +See [`examples/bulk_data_example.py`](examples/bulk_data_example.py) for detailed examples including file downloads and archive extraction. + ### Final Petition Decisions API ```python -from pyUSPTO import FinalPetitionDecisionsClient +from pyUSPTO import FinalPetitionDecisionsClient, USPTOConfig config = USPTOConfig(api_key="your_api_key_here") client = FinalPetitionDecisionsClient(config=config) @@ -147,7 +171,7 @@ See [`examples/petition_decisions_example.py`](examples/petition_decisions_examp ### PTAB Trials API ```python -from pyUSPTO import PTABTrialsClient +from pyUSPTO import PTABTrialsClient, USPTOConfig config = USPTOConfig(api_key="your_api_key_here") client = PTABTrialsClient(config=config) @@ -175,7 +199,7 @@ See [`examples/ptab_trials_example.py`](examples/ptab_trials_example.py) for det ### PTAB Appeals API ```python -from pyUSPTO import PTABAppealsClient +from pyUSPTO import PTABAppealsClient, USPTOConfig config = USPTOConfig(api_key="your_api_key_here") client = PTABAppealsClient(config=config) @@ -195,7 +219,7 @@ See [`examples/ptab_appeals_example.py`](examples/ptab_appeals_example.py) for d ### PTAB Interferences API ```python -from pyUSPTO import PTABInterferencesClient +from pyUSPTO import PTABInterferencesClient, USPTOConfig config = USPTOConfig(api_key="your_api_key_here") client = PTABInterferencesClient(config=config) @@ -216,64 +240,7 @@ Full documentation may be found on [Read the Docs](https://pyuspto.readthedocs.i ## Data Models -The library uses Python dataclasses to represent API responses. All data models include type annotations for attributes and methods, making them fully compatible with static type checkers. - -#### Bulk Data API - -- `BulkDataResponse`: Top-level response from the API -- `BulkDataProduct`: Information about a specific product -- `ProductFileBag`: Container for file data elements -- `FileData`: Information about an individual file - -#### Patent Data API - -- `PatentDataResponse`: Top-level response from the API -- `PatentFileWrapper`: Information about a patent application -- `ApplicationMetaData`: Metadata about a patent application -- `Person`, `Applicant`, `Inventor`, `Attorney`: Person-related data classes -- `Assignment`, `Assignor`, `Assignee`: Assignment-related data classes -- `Continuity`, `ParentContinuity`, `ChildContinuity`: Continuity-related data classes -- `PatentTermAdjustmentData`: Patent term adjustment information -- `DocumentBag`, `EntityStatus`, `RecordAttorney`: Additional data classes for patent data -- And many more specialized classes for different aspects of patent data - -#### Final Petition Decisions API - -- `PetitionDecisionResponse`: Top-level response from the API -- `PetitionDecision`: Complete information about a petition decision -- `PetitionDecisionDocument`: Document associated with a petition decision -- `DecisionTypeCode`: Enum for petition decision types -- `DocumentDirectionCategory`: Enum for document direction categories - -#### PTAB Trials API - -- `PTABTrialProceedingResponse`: Top-level response from the API -- `PTABTrialProceeding`: Information about a PTAB trial proceeding (IPR, PGR, CBM, DER) -- `PTABTrialDocumentResponse`: Response containing trial documents -- `PTABTrialDocument`: Document associated with a trial proceeding -- `TrialDecisionData`: Decision information for a trial proceeding -- `TrialDocumentData`: Document metadata for trial documents -- `TrialMetaData`: Trial metadata and status information -- `RegularPetitionerData`, `RespondentData`, `DerivationPetitionerData`: Party data for different trial types - -#### PTAB Appeals API - -- `PTABAppealResponse`: Top-level response from the API -- `PTABAppealDecision`: Ex parte appeal decision information -- `AppellantData`: Appellant information and application details -- `AppealMetaData`: Appeal metadata and filing information -- `AppealDocumentData`: Document and decision details - -#### PTAB Interferences API - -- `PTABInterferenceResponse`: Top-level response from the API -- `PTABInterferenceDecision`: Interference proceeding decision information -- `SeniorPartyData`, `JuniorPartyData`, `AdditionalPartyData`: Party data classes -- `InterferenceMetaData`: Interference metadata and status information -- `InterferenceDocumentData`: Document and outcome details -- `DecisionData`: Decision information for interference proceedings - -For a complete list of all data models, see the [API Reference docuentation](https://pyuspto.readthedocs.io/en/latest/api/models/index.html). +The library uses Python dataclasses to represent API responses. All data models include type annotations and are fully compatible with static type checkers. For a complete list of all data models, see the [API Reference documentation](https://pyuspto.readthedocs.io/en/latest/api/models/index.html). ## Advanced Topics diff --git a/src/pyUSPTO/clients/patent_data.py b/src/pyUSPTO/clients/patent_data.py index 93125e9..2615cfc 100644 --- a/src/pyUSPTO/clients/patent_data.py +++ b/src/pyUSPTO/clients/patent_data.py @@ -1005,6 +1005,7 @@ def download_document( def get_IFW_metadata( self, + *, application_number: str | None = None, publication_number: str | None = None, patent_number: str | None = None, From e5df2f1931fb6dbe196f5cfae729c84fdc90f8e7 Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 24 Feb 2026 11:32:17 -0600 Subject: [PATCH 5/8] refactor: remove unused utils.http module and ALLOWED_METHODS --- src/pyUSPTO/http_config.py | 3 -- src/pyUSPTO/utils/__init__.py | 7 ---- src/pyUSPTO/utils/http.py | 50 ------------------------ tests/utils/test_http.py | 73 ----------------------------------- 4 files changed, 133 deletions(-) delete mode 100644 src/pyUSPTO/utils/http.py delete mode 100644 tests/utils/test_http.py diff --git a/src/pyUSPTO/http_config.py b/src/pyUSPTO/http_config.py index 4944d68..1b29422 100644 --- a/src/pyUSPTO/http_config.py +++ b/src/pyUSPTO/http_config.py @@ -7,9 +7,6 @@ import os from dataclasses import dataclass, field -# HTTP methods supported by the USPTO API -ALLOWED_METHODS = ["GET", "POST"] - @dataclass class HTTPConfig: diff --git a/src/pyUSPTO/utils/__init__.py b/src/pyUSPTO/utils/__init__.py index 16d72fa..954a068 100644 --- a/src/pyUSPTO/utils/__init__.py +++ b/src/pyUSPTO/utils/__init__.py @@ -2,10 +2,3 @@ This package provides utility functions for USPTO API clients. """ - -from pyUSPTO.utils.http import create_session, parse_response - -__all__ = [ - "create_session", - "parse_response", -] diff --git a/src/pyUSPTO/utils/http.py b/src/pyUSPTO/utils/http.py deleted file mode 100644 index 5b63fdb..0000000 --- a/src/pyUSPTO/utils/http.py +++ /dev/null @@ -1,50 +0,0 @@ -"""utils.http - HTTP utilities for USPTO API clients. - -This module provides HTTP utilities for USPTO API clients. -""" - -from typing import Any - -import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - - -def create_session(headers: dict[str, str] | None = None) -> requests.Session: - """Create a requests session with retry configuration. - - Args: - headers: Optional headers to add to the session - - Returns: - Configured requests.Session object - """ - session = requests.Session() - - if headers: - session.headers.update(headers) - - # Configure retries - retry_strategy = Retry( - total=3, - backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], - ) - adapter = HTTPAdapter(max_retries=retry_strategy) - session.mount("http://", adapter) - session.mount("https://", adapter) - - return session - - -def parse_response(response: requests.Response) -> dict[str, Any]: - """Parse a response from the USPTO API. - - Args: - response: Response from the USPTO API - - Returns: - Parsed response data - """ - json_response: dict[str, Any] = response.json() - return json_response diff --git a/tests/utils/test_http.py b/tests/utils/test_http.py deleted file mode 100644 index c9afb1a..0000000 --- a/tests/utils/test_http.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Tests for the pyUSPTO.utils.http module. -""" - -from unittest.mock import ANY, MagicMock, patch - -from pyUSPTO.utils.http import create_session - - -class TestHttpUtils: - """Tests for HTTP utility functions.""" - - def test_create_session(self) -> None: - """Test that create_session configures session correctly.""" - with patch("pyUSPTO.utils.http.requests.Session") as mock_session: - # Setup the mock - mock_session_instance = MagicMock() - mock_session.return_value = mock_session_instance - - # Call the function with custom headers - headers = {"X-API-KEY": "test_key"} - session = create_session(headers=headers) - - # Verify the session was created - mock_session.assert_called_once() - - # Verify headers were set - mock_session_instance.headers.update.assert_called_once_with(headers) - - # Verify adapters were mounted - mock_session_instance.mount.assert_any_call("http://", ANY) - mock_session_instance.mount.assert_any_call("https://", ANY) - - # Should be called exactly twice - once for http and once for https - assert mock_session_instance.mount.call_count == 2 - - # Return the session - assert session == mock_session_instance - - def test_parse_response(self) -> None: - """Test parse_response function.""" - from pyUSPTO.utils.http import parse_response - - # Create a mock response - mock_response = MagicMock() - mock_response.json.return_value = {"key": "value"} - - # Call the function - result = parse_response(mock_response) - - # Verify the response was parsed - mock_response.json.assert_called_once() - assert result == {"key": "value"} - - def test_create_session_default_params(self) -> None: - """Test create_session with default parameters.""" - with patch("pyUSPTO.utils.http.requests.Session") as mock_session: - # Setup the mock - mock_session_instance = MagicMock() - mock_session.return_value = mock_session_instance - - # Call the function with defaults - session = create_session() - - # Verify the session was created - mock_session.assert_called_once() - - # Verify adapters were mounted - mock_session_instance.mount.assert_any_call("http://", ANY) - mock_session_instance.mount.assert_any_call("https://", ANY) - - # Return the session - assert session == mock_session_instance From ec3a027bde3b76f3897440bbbefeaafae5380834 Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 24 Feb 2026 11:35:25 -0600 Subject: [PATCH 6/8] fix: enable retries for POST requests --- src/pyUSPTO/config.py | 1 + tests/test_config.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/src/pyUSPTO/config.py b/src/pyUSPTO/config.py index 944d69c..356a302 100644 --- a/src/pyUSPTO/config.py +++ b/src/pyUSPTO/config.py @@ -124,6 +124,7 @@ def _create_session(self) -> "requests.Session": total=self.http_config.max_retries, backoff_factor=self.http_config.backoff_factor, status_forcelist=self.http_config.retry_status_codes, + allowed_methods={"GET", "POST"}, ) # Configure connection pooling diff --git a/tests/test_config.py b/tests/test_config.py index b5b38e6..2b9a714 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -95,6 +95,16 @@ def test_http_config_sharing(self): assert config1.http_config.timeout == 90.0 assert config2.http_config.timeout == 90.0 + def test_session_retries_post_requests(self): + """Test that POST requests are included in retry configuration""" + config = USPTOConfig(api_key="test") + session = config.session + + adapter = session.get_adapter("https://api.uspto.gov") + retry = adapter.max_retries + assert "POST" in retry.allowed_methods + assert "GET" in retry.allowed_methods + def test_session_lifecycle(self): """Test session sharing, lazy creation, reuse, and cleanup behavior""" From a812986d173f944a527b5ee01f2c528bb674a951 Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 24 Feb 2026 17:27:59 -0600 Subject: [PATCH 7/8] fix: skip symlinks during archive extraction; chore: optimize tox deps and enable parallel --- src/pyUSPTO/clients/base.py | 8 ++- tests/clients/test_base.py | 110 ++++++++++++++++++++++++++++++++++++ tox.ini | 6 +- 3 files changed, 121 insertions(+), 3 deletions(-) diff --git a/src/pyUSPTO/clients/base.py b/src/pyUSPTO/clients/base.py index d57de11..ece2954 100644 --- a/src/pyUSPTO/clients/base.py +++ b/src/pyUSPTO/clients/base.py @@ -648,9 +648,11 @@ def _extract_archive( with tarfile.open(archive_path, "r:*") as tar: # Extract members one by one with validation for member in tar.getmembers(): - # Skip directories + # Skip directories and symlinks if member.isdir(): continue + if member.issym() or member.islnk(): + continue # Path traversal check member_path = extract_to / member.name @@ -676,9 +678,11 @@ def _extract_archive( with zipfile.ZipFile(archive_path, "r") as zip_ref: # Extract members one by one with validation for zip_info in zip_ref.infolist(): - # Skip directories + # Skip directories and symlinks if zip_info.is_dir(): continue + if (zip_info.external_attr >> 16) & 0o170000 == 0o120000: + continue # Path traversal check member_path = extract_to / zip_info.filename diff --git a/tests/clients/test_base.py b/tests/clients/test_base.py index 3b315b7..257c154 100644 --- a/tests/clients/test_base.py +++ b/tests/clients/test_base.py @@ -1809,6 +1809,116 @@ def test_zip_with_directories(self, tmp_path: Any) -> None: assert (extract_to / "testdir" / "file.txt").exists() +class TestExtractArchiveSymlinks: + """Tests for symlink skipping in _extract_archive.""" + + def test_tar_symlink_skipped(self, tmp_path: Any) -> None: + """Test that symbolic links in tar archives are skipped.""" + import tarfile + + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + + tar_path = tmp_path / "test.tar" + with tarfile.open(tar_path, "w") as tar: + # Add a regular file + import io + + data = b"real content" + info = tarfile.TarInfo(name="real.txt") + info.size = len(data) + tar.addfile(info, io.BytesIO(data)) + + # Add a symlink pointing outside + sym_info = tarfile.TarInfo(name="evil_link") + sym_info.type = tarfile.SYMTYPE + sym_info.linkname = "/etc/passwd" + tar.addfile(sym_info) + + extract_to = tmp_path / "extracted" + client._extract_archive(tar_path, extract_to=extract_to) + + assert (extract_to / "real.txt").exists() + assert not (extract_to / "evil_link").exists() + + def test_tar_hardlink_skipped(self, tmp_path: Any) -> None: + """Test that hard links in tar archives are skipped.""" + import tarfile + + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + + tar_path = tmp_path / "test.tar" + with tarfile.open(tar_path, "w") as tar: + import io + + data = b"real content" + info = tarfile.TarInfo(name="real.txt") + info.size = len(data) + tar.addfile(info, io.BytesIO(data)) + + # Add a hard link + link_info = tarfile.TarInfo(name="hard_link") + link_info.type = tarfile.LNKTYPE + link_info.linkname = "real.txt" + tar.addfile(link_info) + + extract_to = tmp_path / "extracted" + client._extract_archive(tar_path, extract_to=extract_to) + + assert (extract_to / "real.txt").exists() + assert not (extract_to / "hard_link").exists() + + def test_zip_symlink_skipped(self, tmp_path: Any) -> None: + """Test that symbolic links in zip archives are skipped.""" + import stat + import zipfile + + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + + zip_path = tmp_path / "test.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + # Add a regular file + zf.writestr("real.txt", "real content") + + # Add a symlink entry (Unix symlink via external_attr) + sym_info = zipfile.ZipInfo("evil_link") + sym_info.external_attr = (stat.S_IFLNK | 0o777) << 16 + zf.writestr(sym_info, "/etc/passwd") + + extract_to = tmp_path / "extracted" + client._extract_archive(zip_path, extract_to=extract_to) + + assert (extract_to / "real.txt").exists() + assert not (extract_to / "evil_link").exists() + + def test_tar_only_symlinks_returns_directory(self, tmp_path: Any) -> None: + """Test that an archive containing only symlinks returns the directory.""" + import tarfile + + client: BaseUSPTOClient[Any] = BaseUSPTOClient( + config=USPTOConfig(api_key="test"), base_url="https://test.com" + ) + + tar_path = tmp_path / "test.tar" + with tarfile.open(tar_path, "w") as tar: + sym_info = tarfile.TarInfo(name="evil_link") + sym_info.type = tarfile.SYMTYPE + sym_info.linkname = "/etc/passwd" + tar.addfile(sym_info) + + extract_to = tmp_path / "extracted" + result = client._extract_archive(tar_path, extract_to=extract_to) + + # No files extracted, returns directory + assert result == str(extract_to) + assert not (extract_to / "evil_link").exists() + + class TestDownloadAndExtract: """Tests for _download_and_extract method.""" diff --git a/tox.ini b/tox.ini index 6042ae2..cc55d4a 100644 --- a/tox.ini +++ b/tox.ini @@ -2,6 +2,7 @@ # py315 commented out - requires C++ build tools for librt (mypy dependency) envlist = py310,py311,py312,py313,py314 isolated_build = True +parallel = auto [testenv] basepython = @@ -12,6 +13,9 @@ basepython = py314: {env:LOCALAPPDATA}\Python\pythoncore-3.14-64\python.exe # py315: {env:LOCALAPPDATA}\Python\pythoncore-3.15-64\python.exe deps = - -r requirements-dev.txt + pytest>=9.0.2 + pytest-cov>=7.0.0 + pytest-mock>=3.15.1 + typing_extensions>=4.15.0 commands = pytest tests/ --cov=src/pyUSPTO From c95ccb7b19c0bc7d78a19a70e2fb68a4c53f3c49 Mon Sep 17 00:00:00 2001 From: Andrew Date: Wed, 25 Feb 2026 10:10:59 -0600 Subject: [PATCH 8/8] fix: mock Path.mkdir in test to prevent CI permission error --- tests/clients/test_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/clients/test_base.py b/tests/clients/test_base.py index 257c154..67ff593 100644 --- a/tests/clients/test_base.py +++ b/tests/clients/test_base.py @@ -1406,7 +1406,10 @@ def test_save_to_current_directory_when_no_destination( mock_response.iter_content.return_value = [b"data"] # Save with no destination (should use cwd) - with patch("pyUSPTO.clients.base.Path.cwd") as mock_cwd: + with ( + patch("pyUSPTO.clients.base.Path.cwd") as mock_cwd, + patch("pyUSPTO.clients.base.Path.mkdir"), + ): mock_cwd.return_value = Path("/fake/cwd") result = client._save_response_to_file(mock_response, destination=None)