nihalnihalani · nihalnihalani · Apr 7, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 6, 2026
diff --git a/nodes/src/nodes/chunker/IGlobal.py b/nodes/src/nodes/chunker/IGlobal.py
@@ -0,0 +1,101 @@
+# =============================================================================
+# MIT License
+# Copyright (c) 2026 Aparavi Software AG
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# =============================================================================
+
+# ------------------------------------------------------------------------------
+# This class controls the data shared between all threads for the task
+# ------------------------------------------------------------------------------
+import os
+from rocketlib import IGlobalBase, OPEN_MODE, warning
+from ai.common.config import Config
+
+from .chunker_strategies import ChunkingStrategy, RecursiveCharacterChunker, SentenceChunker, TokenChunker
+
+
+class IGlobal(IGlobalBase):
+    strategy: ChunkingStrategy | None = None
+
+    def validateConfig(self):
+        """Validate that tiktoken dependency is available (only needed for token strategy)."""
+        try:
+            config = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
+            strategy_name = config.get('strategy', 'recursive')
+        except Exception:  # noqa: BLE001
+            # If config isn't available yet, install proactively
+            strategy_name = 'token'
+
+        if strategy_name == 'token':
+            try:
+                from depends import depends
+
+                requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt'
+                depends(requirements)
+            except Exception as e:  # noqa: BLE001 - intentional broad catch for dependency probing
+                warning(str(e))
+
+    def beginGlobal(self):
+        """Initialize the configured chunking strategy for runtime execution."""
+        # Are we in config mode or some other mode?
+        if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG:
+            # We are going to get a call to configureService but
+            # we don't actually need to load the strategy for that
+            pass
+        else:
+            # Get this node's config
+            config = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
+
+            # Read strategy parameters from config
+            strategy_name = config.get('strategy', 'recursive')
+            chunk_size = int(config.get('chunk_size', 1000))
+            chunk_overlap = int(config.get('chunk_overlap', 200))
+            encoding_name = config.get('encoding_name', 'cl100k_base')
+
+            if chunk_size <= 0:
+                raise ValueError(f'chunk_size must be positive, got {chunk_size}')
+            if chunk_overlap < 0:
+                raise ValueError(f'chunk_overlap must be non-negative, got {chunk_overlap}')
+            if chunk_overlap >= chunk_size:
+                raise ValueError(f'chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})')
+
+            # Build the appropriate strategy
+            if strategy_name == 'token':
+                self.strategy = TokenChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    encoding_name=encoding_name,
+                )
+            elif strategy_name == 'sentence':
+                self.strategy = SentenceChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                )
+            else:
+                # Default to recursive character chunker
+                self.strategy = RecursiveCharacterChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                )
+
+    def endGlobal(self):
+        """Release the configured chunking strategy."""
+        # Release the strategy
+        self.strategy = None
-    def beginGlobal(self):
-        # Are we in config mode or some other mode?
-        if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG:
-            # We are going to get a call to configureService but
-            # we don't actually need to load the strategy for that
-            pass
-        else:
-            # Get this node's config
-            config = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
-
-            # Read strategy parameters from config
-            strategy_name = config.get('strategy', 'recursive')
-            chunk_size = int(config.get('chunk_size', 1000))
-            chunk_overlap = int(config.get('chunk_overlap', 200))
-            encoding_name = config.get('encoding_name', 'cl100k_base')
-
-            if chunk_size <= 0:
-                raise ValueError(f'chunk_size must be positive, got {chunk_size}')
-            if chunk_overlap < 0:
-                raise ValueError(f'chunk_overlap must be non-negative, got {chunk_overlap}')
-            if chunk_overlap >= chunk_size:
-                raise ValueError(f'chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})')
-
-            # Build the appropriate strategy
-            if strategy_name == 'token':
-                self.strategy = TokenChunker(
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap,
-                    encoding_name=encoding_name,
-                )
-            elif strategy_name == 'sentence':
-                self.strategy = SentenceChunker(
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap,
-                )
-            else:
-                # Default to recursive character chunker
-                self.strategy = RecursiveCharacterChunker(
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap,
-                )
-
-    def endGlobal(self):
-        # Release the strategy
-        self.strategy = None
+    def beginGlobal(self):
+        """Initialize the configured chunking strategy for runtime execution."""
+        # Are we in config mode or some other mode?
+        if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG:
+            # We are going to get a call to configureService but
+            # we don't actually need to load the strategy for that
+            pass
+        else:
+            # Get this node's config
+            config = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
+
+            # Read strategy parameters from config
+            strategy_name = config.get('strategy', 'recursive')
+            chunk_size = int(config.get('chunk_size', 1000))
+            chunk_overlap = int(config.get('chunk_overlap', 200))
+            encoding_name = config.get('encoding_name', 'cl100k_base')
+
+            if chunk_size <= 0:
+                raise ValueError(f'chunk_size must be positive, got {chunk_size}')
+            if chunk_overlap < 0:
+                raise ValueError(f'chunk_overlap must be non-negative, got {chunk_overlap}')
+            if chunk_overlap >= chunk_size:
+                raise ValueError(f'chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})')
+
+            # Build the appropriate strategy
+            if strategy_name == 'token':
+                self.strategy = TokenChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    encoding_name=encoding_name,
+                )
+            elif strategy_name == 'sentence':
+                self.strategy = SentenceChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                )
+            else:
+                # Default to recursive character chunker
+                self.strategy = RecursiveCharacterChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                )
+
+    def endGlobal(self):
+        """Release the configured chunking strategy."""
+        # Release the strategy
+        self.strategy = None
-    def beginGlobal(self):
-        # Are we in config mode or some other mode?
-        if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG:
-            # We are going to get a call to configureService but
-            # we don't actually need to load the strategy for that
-            pass
-        else:
-            # Get this node's config
-            config = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
-
-            # Read strategy parameters from config
-            strategy_name = config.get('strategy', 'recursive')
-            chunk_size = int(config.get('chunk_size', 1000))
-            chunk_overlap = int(config.get('chunk_overlap', 200))
-            encoding_name = config.get('encoding_name', 'cl100k_base')
-
-            if chunk_size <= 0:
-                raise ValueError(f'chunk_size must be positive, got {chunk_size}')
-            if chunk_overlap < 0:
-                raise ValueError(f'chunk_overlap must be non-negative, got {chunk_overlap}')
-            if chunk_overlap >= chunk_size:
-                raise ValueError(f'chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})')
-
-            # Build the appropriate strategy
-            if strategy_name == 'token':
-                self.strategy = TokenChunker(
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap,
-                    encoding_name=encoding_name,
-                )
-            elif strategy_name == 'sentence':
-                self.strategy = SentenceChunker(
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap,
-                )
-            else:
-                # Default to recursive character chunker
-                self.strategy = RecursiveCharacterChunker(
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap,
-                )
-
-    def endGlobal(self):
-        # Release the strategy
-        self.strategy = None
+    def beginGlobal(self):
+        """Initialize the configured chunking strategy for runtime execution."""
+        # Are we in config mode or some other mode?
+        if self.IEndpoint.endpoint.openMode == OPEN_MODE.CONFIG:
+            # We are going to get a call to configureService but
+            # we don't actually need to load the strategy for that
+            pass
+        else:
+            # Get this node's config
+            config = Config.getNodeConfig(self.glb.logicalType, self.glb.connConfig)
+
+            # Read strategy parameters from config
+            strategy_name = config.get('strategy', 'recursive')
+            chunk_size = int(config.get('chunk_size', 1000))
+            chunk_overlap = int(config.get('chunk_overlap', 200))
+            encoding_name = config.get('encoding_name', 'cl100k_base')
+
+            if chunk_size <= 0:
+                raise ValueError(f'chunk_size must be positive, got {chunk_size}')
+            if chunk_overlap < 0:
+                raise ValueError(f'chunk_overlap must be non-negative, got {chunk_overlap}')
+            if chunk_overlap >= chunk_size:
+                raise ValueError(f'chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})')
+
+            # Build the appropriate strategy
+            if strategy_name == 'token':
+                self.strategy = TokenChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    encoding_name=encoding_name,
+                )
+            elif strategy_name == 'sentence':
+                self.strategy = SentenceChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                )
+            else:
+                # Default to recursive character chunker
+                self.strategy = RecursiveCharacterChunker(
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                )
+
+    def endGlobal(self):
+        """Release the configured chunking strategy."""
+        # Release the strategy
+        self.strategy = None
diff --git a/nodes/src/nodes/chunker/IInstance.py b/nodes/src/nodes/chunker/IInstance.py
@@ -0,0 +1,100 @@
+# =============================================================================
+# MIT License
+# Copyright (c) 2026 Aparavi Software AG
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# =============================================================================
+
+# ------------------------------------------------------------------------------
+# This class controls the data for each thread of the task
+# ------------------------------------------------------------------------------
+import copy
+
+from rocketlib import IInstanceBase, Entry, debug
+from ai.common.schema import Doc, DocMetadata
+
+from .IGlobal import IGlobal
+
+
+class IInstance(IInstanceBase):
+    """Instance that chunks incoming documents and emits one document per chunk."""
+
+    IGlobal: IGlobal
+
+    chunkId: int = 0
+
+    def open(self, obj: Entry):
+        """Reset chunk counter for each new object."""
+        self.chunkId = 0
+
+    def writeDocuments(self, documents: list[Doc]):
+        """
+        Chunk each incoming document and emit multiple documents (one per chunk).
+
+        Each emitted document gets metadata with chunkId, parentId, chunk_index,
+        start_char, end_char, and total_chunks so downstream nodes can
+        reconstruct the original document if needed.
+        """
+        if self.IGlobal.strategy is None:
+            raise RuntimeError('Chunker strategy not initialized')
+
+        for document in documents:
+            # Extract text content
+            text = document.page_content or ''
+            if not text.strip():
+                continue
+
+            # Get the original object ID for parent tracking
+            parent_id = ''
+            if document.metadata is not None:
+                parent_id = getattr(document.metadata, 'objectId', '') or ''
+
+            # Chunk the text
+            chunks = self.IGlobal.strategy.chunk(text)
+            total_chunks = len(chunks)
+
+            if total_chunks == 0:
+                continue
+
+            # Build output documents
+            output_docs: list[Doc] = []
+            for chunk_data in chunks:
+                # Shallow copy of document, explicit copy of metadata only
+                chunk_doc = copy.copy(document)
+                chunk_doc.metadata = copy.copy(document.metadata) if document.metadata else DocMetadata()
+                chunk_doc.page_content = chunk_data['text']
+
+                # Update metadata (always non-None after the copy/create above)
+                chunk_doc.metadata.chunkId = self.chunkId
+                chunk_doc.metadata.parentId = parent_id
+
+                # Propagate strategy metadata (chunk_index, start_char, end_char)
+                strategy_meta = chunk_data.get('metadata', {})
+                chunk_doc.metadata.chunk_index = strategy_meta.get('chunk_index', 0)
+                chunk_doc.metadata.start_char = strategy_meta.get('start_char', 0)
+                chunk_doc.metadata.end_char = strategy_meta.get('end_char', 0)
+                chunk_doc.metadata.total_chunks = total_chunks
+
+                self.chunkId += 1
+                output_docs.append(chunk_doc)
+
+            # Emit all chunks for this document
+            if output_docs:
+                debug(f'Chunker emitting {len(output_docs)} chunks for document (parent_id={parent_id})')
+                self.instance.writeDocuments(output_docs)
diff --git a/nodes/src/nodes/chunker/__init__.py b/nodes/src/nodes/chunker/__init__.py