From 54a2c914a4ee06f16ad00656635295577b43987e Mon Sep 17 00:00:00 2001 From: Lalit Gupta Date: Tue, 17 Mar 2026 20:06:22 +0530 Subject: [PATCH 1/6] fix: type annotations, clip return type, shot URLs, and caption warning - Fix VideoAsset/AudioAsset start/end type from int to Optional[float] - Fix clip() return type from str to SearchResult - Add Literal types for clip() content_type and model_name params - Pass stream_url/player_url from search API response to Shot objects - Add warnings.warn() for CaptionAsset(src='auto') indexing requirement --- videodb/asset.py | 8 ++++---- videodb/editor.py | 15 +++++++++++++++ videodb/search.py | 2 ++ videodb/shot.py | 6 ++++-- videodb/video.py | 8 ++++---- 5 files changed, 29 insertions(+), 10 deletions(-) diff --git a/videodb/asset.py b/videodb/asset.py index 6061b4b..805a862 100644 --- a/videodb/asset.py +++ b/videodb/asset.py @@ -37,8 +37,8 @@ def __init__( end: Optional[float] = None, ) -> None: super().__init__(asset_id) - self.start: int = start - self.end: Union[int, None] = end + self.start: Optional[float] = start + self.end: Optional[float] = end def to_json(self) -> dict: return copy.deepcopy(self.__dict__) @@ -63,8 +63,8 @@ def __init__( fade_out_duration: Optional[Union[int, float]] = 0, ): super().__init__(asset_id) - self.start: int = start - self.end: Union[int, None] = end + self.start: Optional[float] = start + self.end: Optional[float] = end self.disable_other_tracks: bool = disable_other_tracks self.fade_in_duration: Union[int, float] = validate_max_supported( fade_in_duration, MaxSupported.fade_duration, "fade_in_duration" diff --git a/videodb/editor.py b/videodb/editor.py index 5c40282..daaf1e9 100644 --- a/videodb/editor.py +++ b/videodb/editor.py @@ -1,5 +1,7 @@ import json +import logging import requests +import warnings from typing import List, Optional, Union from enum import Enum @@ -8,6 +10,8 @@ from videodb.exceptions import InvalidRequestError +logger = logging.getLogger(__name__) + MAX_PAYLOAD_SIZE = 100 * 1024 @@ -840,6 +844,11 @@ def __init__( ): """Initialize a CaptionAsset instance. + .. note:: + When using ``src="auto"``, the video must be indexed first + (e.g. via ``video.index_spoken_words()``) so that a transcript + is available for caption generation. + :param str src: Caption source ("auto" for auto-generated or base64 encoded ass string) :param FontStyling font: (optional) Font styling properties :param str primary_color: Primary text color in ASS format (default: "&H00FFFFFF") @@ -849,6 +858,12 @@ def __init__( :param Positioning position: (optional) Caption positioning properties :param CaptionAnimation animation: (optional) Caption animation effect """ + if src == "auto": + warnings.warn( + "CaptionAsset(src='auto'): the video must be indexed " + "(e.g. video.index_spoken_words()) for captions to be generated.", + stacklevel=2, + ) self.src = src self.font = font if font is not None else FontStyling() self.primary_color = primary_color diff --git a/videodb/search.py b/videodb/search.py index 94730ec..f2b9207 100644 --- a/videodb/search.py +++ b/videodb/search.py @@ -48,6 +48,8 @@ def _format_results(self): scene_index_id=doc.get("scene_index_id"), scene_index_name=doc.get("scene_index_name"), metadata=doc.get("metadata"), + stream_url=doc.get("stream_link"), + player_url=doc.get("player_url"), ) ) diff --git a/videodb/shot.py b/videodb/shot.py index b261077..82d90cd 100644 --- a/videodb/shot.py +++ b/videodb/shot.py @@ -35,6 +35,8 @@ def __init__( scene_index_id: Optional[str] = None, scene_index_name: Optional[str] = None, metadata: Optional[dict] = None, + stream_url: Optional[str] = None, + player_url: Optional[str] = None, ) -> None: self._connection = _connection self.video_id = video_id @@ -47,8 +49,8 @@ def __init__( self.scene_index_id = scene_index_id self.scene_index_name = scene_index_name self.metadata = metadata - self.stream_url = None - self.player_url = None + self.stream_url = stream_url + self.player_url = player_url def __repr__(self) -> str: repr_str = ( diff --git a/videodb/video.py b/videodb/video.py index 3c7126d..76f2f5b 100644 --- a/videodb/video.py +++ b/videodb/video.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, List, Dict, Tuple, Any +from typing import Literal, Optional, Union, List, Dict, Tuple, Any from videodb._utils._video import play_stream from videodb._constants import ( ApiPath, @@ -702,9 +702,9 @@ def add_subtitle(self, style: SubtitleStyle = SubtitleStyle()) -> str: def clip( self, prompt: str, - content_type: str, - model_name: str, - ) -> str: + content_type: Literal["spoken", "visual", "multimodal"], + model_name: Literal["basic", "pro", "ultra"], + ) -> SearchResult: """Generate a clip from the video using a prompt. :param str prompt: Prompt to generate the clip :param str content_type: Content type for the clip. Valid options: "spoken", "visual", "multimodal" From a40e767b708b02a45d6bcfbf57bf922cc3280d56 Mon Sep 17 00:00:00 2001 From: Lalit Gupta Date: Wed, 18 Mar 2026 00:21:40 +0530 Subject: [PATCH 2/6] chore: bump version to 0.4.3, require capture-bin>=0.2.9 --- setup.py | 2 +- videodb/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a7cca41..5fb283a 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ "websockets>=11.0.3", ], extras_require={ - "capture": ["videodb-capture-bin>=0.2.8"], + "capture": ["videodb-capture-bin>=0.2.9"], }, classifiers=[ "Intended Audience :: Developers", diff --git a/videodb/__about__.py b/videodb/__about__.py index c1f5177..0cdd560 100644 --- a/videodb/__about__.py +++ b/videodb/__about__.py @@ -2,7 +2,7 @@ -__version__ = "0.4.2" +__version__ = "0.4.3" __title__ = "videodb" __author__ = "videodb" __email__ = "contact@videodb.io" From 88e326a4fd483b732497dad89350a5c59ec18929 Mon Sep 17 00:00:00 2001 From: Lalit Gupta Date: Wed, 18 Mar 2026 00:30:55 +0530 Subject: [PATCH 3/6] feat: add language_code param to generate_transcript --- videodb/video.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/videodb/video.py b/videodb/video.py index 76f2f5b..367ba87 100644 --- a/videodb/video.py +++ b/videodb/video.py @@ -249,10 +249,12 @@ def get_transcript_text( def generate_transcript( self, force: bool = None, + language_code: Optional[str] = None, ) -> str: """Generate transcript for the video. :param bool force: Force generate new transcript + :param str language_code: (optional) Language code of the video :return: Full transcript text as string :rtype: str """ @@ -260,6 +262,7 @@ def generate_transcript( path=f"{ApiPath.video}/{self.id}/{ApiPath.transcription}", data={ "force": True if force else False, + "language_code": language_code, }, ) transcript = transcript_data.get("word_timestamps", []) From d6770cecd74e0e963beba90b27a234ba6a57b425 Mon Sep 17 00:00:00 2001 From: Lalit Gupta Date: Wed, 18 Mar 2026 11:01:38 +0530 Subject: [PATCH 4/6] fix: remove record param from Channel.to_dict() --- videodb/capture.py | 1 - 1 file changed, 1 deletion(-) diff --git a/videodb/capture.py b/videodb/capture.py index 6626c74..596a156 100644 --- a/videodb/capture.py +++ b/videodb/capture.py @@ -91,7 +91,6 @@ def to_dict(self) -> Dict[str, Any]: "channel_id": self.id, "type": self.type, "name": self.name, - "record": True, "store": self.store, "is_primary": self.is_primary, } From d2a90d9692f393ab4771b62f4210be693dce8ca6 Mon Sep 17 00:00:00 2001 From: Lalit Gupta Date: Wed, 18 Mar 2026 13:42:21 +0530 Subject: [PATCH 5/6] chore: bump capture binary requirement to >=0.2.10 v0.2.10 binary suppresses verbose info/debug logs natively. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5fb283a..efa1423 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ "websockets>=11.0.3", ], extras_require={ - "capture": ["videodb-capture-bin>=0.2.9"], + "capture": ["videodb-capture-bin>=0.2.10"], }, classifiers=[ "Intended Audience :: Developers", From 5111b7a79df93efa61a087522612813ffe96c92f Mon Sep 17 00:00:00 2001 From: Lalit Gupta Date: Wed, 18 Mar 2026 21:32:59 +0530 Subject: [PATCH 6/6] feat: separate camera channels from displays, use ID-prefix grouping - Channels with `camera:` prefix now go to `channels.cameras` instead of `channels.displays`. Cameras excluded from `all()`. - Switched audio grouping from name-based heuristics to channel ID prefix matching (`mic:`, `system_audio:`) for consistency with Node SDK. --- videodb/capture.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/videodb/capture.py b/videodb/capture.py index 596a156..dca7e59 100644 --- a/videodb/capture.py +++ b/videodb/capture.py @@ -133,21 +133,24 @@ def __init__( mics: List[AudioChannel] = None, displays: List[VideoChannel] = None, system_audio: List[AudioChannel] = None, + cameras: List[VideoChannel] = None, ): self.mics: ChannelList = ChannelList(mics or []) self.displays: ChannelList = ChannelList(displays or []) self.system_audio: ChannelList = ChannelList(system_audio or []) + self.cameras: ChannelList = ChannelList(cameras or []) def __repr__(self): return ( f"Channels(" f"mics={len(self.mics)}, " f"displays={len(self.displays)}, " - f"system_audio={len(self.system_audio)})" + f"system_audio={len(self.system_audio)}, " + f"cameras={len(self.cameras)})" ) def all(self) -> List[Channel]: - """Return a flat list of all channels.""" + """Return a flat list of all capturable channels (excludes cameras).""" return list(self.mics) + list(self.displays) + list(self.system_audio) @@ -333,30 +336,34 @@ async def list_channels(self) -> Channels: mics = [] displays = [] system_audio = [] - + cameras = [] + for ch in raw_channels: c_type = ch.get("type") c_id = ch.get("channel_id") or ch.get("id") c_name = ch.get("name", "") - + if not c_id: logger.warning(f"Skipping channel with missing ID: {ch}") continue - # Categorize based on type and name patterns - if c_type == "video": + # Categorize based on channel ID prefix + if c_id.startswith("mic:"): + mics.append(AudioChannel(id=c_id, name=c_name, client=self)) + elif c_id.startswith("display:") or c_id.startswith("screen:"): displays.append(VideoChannel(id=c_id, name=c_name, client=self)) + elif c_id.startswith("system_audio:"): + system_audio.append(AudioChannel(id=c_id, name=c_name, client=self)) + elif c_id.startswith("camera:"): + cameras.append(VideoChannel(id=c_id, name=c_name, client=self)) elif c_type == "audio": - # Distinguish between mic and system audio based on common patterns - name_lower = c_name.lower() - if "system" in name_lower or "output" in name_lower or "speaker" in name_lower: - system_audio.append(AudioChannel(id=c_id, name=c_name, client=self)) - else: - mics.append(AudioChannel(id=c_id, name=c_name, client=self)) + mics.append(AudioChannel(id=c_id, name=c_name, client=self)) + elif c_type == "video": + displays.append(VideoChannel(id=c_id, name=c_name, client=self)) else: logger.debug(f"Unknown channel type '{c_type}' for channel '{c_name}'") - - return Channels(mics=mics, displays=displays, system_audio=system_audio) + + return Channels(mics=mics, displays=displays, system_audio=system_audio, cameras=cameras) async def start_session( self,