From 1ee6e5830a9392fa6175e66919d782792022859c Mon Sep 17 00:00:00 2001 From: monoxgas Date: Thu, 17 Jul 2025 10:54:34 -0600 Subject: [PATCH 1/2] Add explicit data types for better text rendering. Add a Meta type for extending schemas on demand for logged objects. --- docs/sdk/data_types.mdx | 163 +++++++++++++++++++++++++ docs/sdk/serialization.mdx | 15 ++- docs/usage/data-tracking.mdx | 2 + docs/usage/rich-objects.mdx | 55 ++++++++- dreadnode/data_types/__init__.py | 4 +- dreadnode/data_types/audio.py | 4 +- dreadnode/data_types/base.py | 49 ++++++++ dreadnode/data_types/base_data_type.py | 17 --- dreadnode/data_types/image.py | 4 +- dreadnode/data_types/object_3d.py | 4 +- dreadnode/data_types/table.py | 4 +- dreadnode/data_types/text.py | 59 +++++++++ dreadnode/data_types/video.py | 4 +- dreadnode/serialization.py | 18 +-- 14 files changed, 361 insertions(+), 41 deletions(-) create mode 100644 dreadnode/data_types/base.py delete mode 100644 dreadnode/data_types/base_data_type.py create mode 100644 dreadnode/data_types/text.py diff --git a/docs/sdk/data_types.mdx b/docs/sdk/data_types.mdx index c8d1aaae..2d184f32 100644 --- a/docs/sdk/data_types.mdx +++ b/docs/sdk/data_types.mdx @@ -110,6 +110,34 @@ def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: ``` + + +Code +---- + +```python +Code(text: str, language: str = '') +``` + +Hint type for code-formatted text. + +This is a subclass of Text with format set to "code". + +Example + +```python +log_output("code_snippet", Code("print('Hello, World!')", language="python")) +``` + + + +```python +def __init__(self, text: str, language: str = ""): + super().__init__(text, format="code") + self._language = language +``` + + Image @@ -220,6 +248,104 @@ def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: ``` + + +Markdown +-------- + +```python +Markdown(text: str) +``` + +Hint type for markdown-formatted text. + +This is a subclass of Text with format set to "markdown". + +Example + +```python +log_output("report", Markdown("...")) +``` + + + +```python +def __init__(self, text: str): + super().__init__(text, format="markdown") +``` + + + + +Meta +---- + +```python +Meta(obj: Any, metadata: dict[str, Any]) +``` + +Helper data type to add additional metadata to the schema for logged data. + +Example + +```python +log_output("my_data", Meta(data, {"format": "custom-data"})) +``` + +Initialize a data type with associated metadata. + +**Parameters:** + +* **`metadata`** + (`dict[str, Any]`) + –The metadata for this data type + + +```python +def __init__(self, obj: t.Any, metadata: dict[str, t.Any]): + """ + Initialize a data type with associated metadata. + + Args: + metadata: The metadata for this data type + """ + self._obj = obj + self._metadata = metadata +``` + + + + +### to\_serializable + +```python +to_serializable() -> tuple[t.Any, dict[str, t.Any]] +``` + +Convert the media type to a serializable format. + +**Returns:** + +* `tuple[Any, dict[str, Any]]` + –Tuple of (data, metadata) where: + - data: The serialized data + - metadata: Additional metadata for this data type + + +```python +def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: + """ + Convert the media type to a serializable format. + + Returns: + Tuple of (data, metadata) where: + - data: The serialized data + - metadata: Additional metadata for this data type + """ + return self._obj, self._metadata +``` + + Object3D @@ -434,6 +560,43 @@ def to_serializable(self) -> tuple[bytes, dict[str, t.Any]]: ``` + + +Text +---- + +```python +Text(text: str, format: str) +``` + +Text data type for Dreadnode logging. + +Initialize a Text object. + +**Parameters:** + +* **`text`** + (`str`) + –The text content to log +* **`format`** + (`str`) + –The format hint of the text + + +```python +def __init__(self, text: str, format: str): + """ + Initialize a Text object. + + Args: + text: The text content to log + format: The format hint of the text + """ + self._text = text + self._format = format +``` + + Video diff --git a/docs/sdk/serialization.mdx b/docs/sdk/serialization.mdx index 19aba1f2..6d4d04b1 100644 --- a/docs/sdk/serialization.mdx +++ b/docs/sdk/serialization.mdx @@ -57,7 +57,9 @@ serialize --------- ```python -serialize(obj: Any) -> Serialized +serialize( + obj: Any, *, schema_extras: JsonDict | None = None +) -> Serialized ``` Serializes a Python object into a JSON-compatible structure and @@ -69,6 +71,11 @@ the serialization format and the schema. * **`obj`** (`Any`) –The Python object to process. +* **`schema_extras`** + (`JsonDict | None`, default: + `None` + ) + –Additional JSON Schema properties to include. **Returns:** @@ -77,7 +84,7 @@ the serialization format and the schema. ```python -def serialize(obj: t.Any) -> Serialized: +def serialize(obj: t.Any, *, schema_extras: JsonDict | None = None) -> Serialized: """ Serializes a Python object into a JSON-compatible structure and generates a corresponding JSON Schema, ensuring consistency between @@ -85,6 +92,7 @@ def serialize(obj: t.Any) -> Serialized: Args: obj: The Python object to process. + schema_extras: Additional JSON Schema properties to include. Returns: An object containing the serialized data, schema, and their hashes. @@ -96,6 +104,9 @@ def serialize(obj: t.Any) -> Serialized: else: serialized_bytes = json.dumps(serialized, separators=(",", ":")).encode() + if schema_extras: + schema = {**schema, **schema_extras} + schema_str = json.dumps(schema, separators=(",", ":")) data_hash = EMPTY_HASH diff --git a/docs/usage/data-tracking.mdx b/docs/usage/data-tracking.mdx index 489ac3a7..fff51406 100644 --- a/docs/usage/data-tracking.mdx +++ b/docs/usage/data-tracking.mdx @@ -67,6 +67,8 @@ with dn.run("text-generation"): Strikes maintains a rich serialization layer to support many different kinds of Python objects: - Dictionaries, lists, and other JSON-serializable objects - NumPy arrays and Pandas DataFrames +- Rich media types (images, audio, video, 3D objects, tables) +- Formatted text (markdown, code with syntax highlighting) - Custom objects (serialized with pickle) - Large datasets (automatically stored efficiently) diff --git a/docs/usage/rich-objects.mdx b/docs/usage/rich-objects.mdx index 899c1a2e..d4bb342e 100644 --- a/docs/usage/rich-objects.mdx +++ b/docs/usage/rich-objects.mdx @@ -1,10 +1,10 @@ --- title: 'Rich Objects' -description: 'Store data types like images, audio, video, and 3D objects in your runs.' +description: 'Store data types like images, audio, video, text with formatting, and 3D objects in your runs.' public: true --- -Strikes extends its data tracking capabilities to handle complex, non-JSON serializable data types. This allows you to store rich media and other complex objects directly within your runs, making it easy to track and analyze all aspects of your data-driven workflows. +Strikes extends its data tracking capabilities to handle complex, non-JSON serializable data types. This allows you to store rich media, formatted text, and other complex objects directly within your runs, making it easy to track and analyze all aspects of your data-driven workflows. ## Images @@ -173,6 +173,57 @@ with dn.run("3d-formats-example"): ``` +## Text with Formatting Hints + +For text data that needs special rendering in the UI, you can use text hint types. These provide better visualization and formatting for different types of text content. + + +```python Markdown Text +import dreadnode as dn + +markdown_content = """ +# Results Summary + +## Model Performance +- **Accuracy**: 94.2% +- **Loss**: 0.156 + +### Key Findings +The model shows excellent performance on validation data. +""" + +with dn.run("markdown-example"): + dn.log_output("report", dn.Markdown(markdown_content)) +``` + +```python Code Snippets +import dreadnode as dn + +python_code = """ +def fibonacci(n): + if n <= 1: + return n + return fibonacci(n-1) + fibonacci(n-2) + +result = fibonacci(10) +print(f"Fibonacci(10) = {result}") +""" + +with dn.run("code-example"): + dn.log_output("generated_code", dn.Code(python_code, language="python")) +``` + +```python Generic Text with Format +import dreadnode as dn + +# For custom text formatting +formatted_text = "This is custom formatted text" + +with dn.run("text-example"): + dn.log_output("custom", dn.Text(formatted_text, format="custom")) +``` + + ## Tables For structured data, you can use the `dn.Table` data type. It can be created from various data formats and provides flexible data organization. diff --git a/dreadnode/data_types/__init__.py b/dreadnode/data_types/__init__.py index 7506d88f..c405ea28 100644 --- a/dreadnode/data_types/__init__.py +++ b/dreadnode/data_types/__init__.py @@ -1,7 +1,9 @@ from .audio import Audio +from .base import Meta from .image import Image from .object_3d import Object3D from .table import Table +from .text import Code, Markdown, Text from .video import Video -__all__ = ["Audio", "Image", "Object3D", "Table", "Video"] +__all__ = ["Audio", "Code", "Image", "Markdown", "Meta", "Object3D", "Table", "Text", "Video"] diff --git a/dreadnode/data_types/audio.py b/dreadnode/data_types/audio.py index c4bd3964..e3b6e0c7 100644 --- a/dreadnode/data_types/audio.py +++ b/dreadnode/data_types/audio.py @@ -9,12 +9,12 @@ except ImportError: sf = None -from dreadnode.data_types.base_data_type import BaseDataType +from dreadnode.data_types.base import DataType AudioDataType: t.TypeAlias = str | Path | np.ndarray[t.Any, t.Any] | bytes -class Audio(BaseDataType): +class Audio(DataType): """ Audio media type for Dreadnode logging. diff --git a/dreadnode/data_types/base.py b/dreadnode/data_types/base.py new file mode 100644 index 00000000..7baeccfc --- /dev/null +++ b/dreadnode/data_types/base.py @@ -0,0 +1,49 @@ +import typing as t +from abc import ABC, abstractmethod + + +class DataType(ABC): + """Base class for dedicated data types that can be logged with Dreadnode.""" + + @abstractmethod + def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: + """ + Convert the media type to a serializable format. + + Returns: + Tuple of (data, metadata) where: + - data: The serialized data + - metadata: Additional metadata for this data type + """ + + +class Meta(DataType): + """ + Helper data type to add additional metadata to the schema for logged data. + + Example: + ``` + log_output("my_data", Meta(data, {"format": "custom-data"})) + ``` + """ + + def __init__(self, obj: t.Any, metadata: dict[str, t.Any]): + """ + Initialize a data type with associated metadata. + + Args: + metadata: The metadata for this data type + """ + self._obj = obj + self._metadata = metadata + + def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: + """ + Convert the media type to a serializable format. + + Returns: + Tuple of (data, metadata) where: + - data: The serialized data + - metadata: Additional metadata for this data type + """ + return self._obj, self._metadata diff --git a/dreadnode/data_types/base_data_type.py b/dreadnode/data_types/base_data_type.py deleted file mode 100644 index 74c38119..00000000 --- a/dreadnode/data_types/base_data_type.py +++ /dev/null @@ -1,17 +0,0 @@ -import typing as t -from abc import ABC, abstractmethod - - -class BaseDataType(ABC): - """Base class for all data types that can be logged with Dreadnode.""" - - @abstractmethod - def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: - """ - Convert the media type to a serializable format. - - Returns: - Tuple of (data, metadata) where: - - data: The serialized data - - metadata: Additional metadata for this data type - """ diff --git a/dreadnode/data_types/image.py b/dreadnode/data_types/image.py index 301e7448..0a4c7a22 100644 --- a/dreadnode/data_types/image.py +++ b/dreadnode/data_types/image.py @@ -5,7 +5,7 @@ import numpy as np -from dreadnode.data_types.base_data_type import BaseDataType +from dreadnode.data_types.base import DataType try: from PIL import Image as PILImage @@ -16,7 +16,7 @@ ImageDataOrPathType = str | Path | bytes | ImageDataType -class Image(BaseDataType): +class Image(DataType): """ Image media type for Dreadnode logging. diff --git a/dreadnode/data_types/object_3d.py b/dreadnode/data_types/object_3d.py index 6f64450b..d4000983 100644 --- a/dreadnode/data_types/object_3d.py +++ b/dreadnode/data_types/object_3d.py @@ -2,12 +2,12 @@ from pathlib import Path from typing import ClassVar -from dreadnode.data_types.base_data_type import BaseDataType +from dreadnode.data_types.base import DataType Object3DDataType = str | Path | bytes -class Object3D(BaseDataType): +class Object3D(DataType): """ 3D object media type for Dreadnode logging. diff --git a/dreadnode/data_types/table.py b/dreadnode/data_types/table.py index 797946f3..37a09372 100644 --- a/dreadnode/data_types/table.py +++ b/dreadnode/data_types/table.py @@ -6,14 +6,14 @@ import numpy as np import pandas as pd -from dreadnode.data_types.base_data_type import BaseDataType +from dreadnode.data_types.base import DataType TableDataType = ( pd.DataFrame | dict[t.Any, t.Any] | list[t.Any] | str | Path | np.ndarray[t.Any, t.Any] ) -class Table(BaseDataType): +class Table(DataType): """ Table data type for Dreadnode logging. diff --git a/dreadnode/data_types/text.py b/dreadnode/data_types/text.py new file mode 100644 index 00000000..bc4ef24d --- /dev/null +++ b/dreadnode/data_types/text.py @@ -0,0 +1,59 @@ +import typing as t + +from dreadnode.data_types.base import DataType + + +class Text(DataType): + """ + Text data type for Dreadnode logging. + """ + + def __init__(self, text: str, format: str): + """ + Initialize a Text object. + + Args: + text: The text content to log + format: The format hint of the text + """ + self._text = text + self._format = format + + def to_serializable(self) -> tuple[str, dict[str, t.Any]]: + return self._text, {"format": self._format} + + +class Markdown(Text): + """ + Hint type for markdown-formatted text. + + This is a subclass of Text with format set to "markdown". + + Example: + ``` + log_output("report", Markdown("...")) + ``` + """ + + def __init__(self, text: str): + super().__init__(text, format="markdown") + + +class Code(Text): + """ + Hint type for code-formatted text. + + This is a subclass of Text with format set to "code". + + Example: + ``` + log_output("code_snippet", Code("print('Hello, World!')", language="python")) + ``` + """ + + def __init__(self, text: str, language: str = ""): + super().__init__(text, format="code") + self._language = language + + def to_serializable(self) -> tuple[str, dict[str, t.Any]]: + return self._text, {"format": self._format, "code-language": self._language} diff --git a/dreadnode/data_types/video.py b/dreadnode/data_types/video.py index 73fc1a74..a475c76d 100644 --- a/dreadnode/data_types/video.py +++ b/dreadnode/data_types/video.py @@ -6,7 +6,7 @@ import numpy as np from numpy.typing import NDArray -from dreadnode.data_types.base_data_type import BaseDataType +from dreadnode.data_types.base import DataType try: from moviepy.video.io.ImageSequenceClip import ImageSequenceClip # type: ignore # noqa: PGH003 @@ -19,7 +19,7 @@ VideoDataType: t.TypeAlias = str | Path | NDArray[t.Any] | bytes | list[NDArray[t.Any]] | t.Any -class Video(BaseDataType): +class Video(DataType): """ Video media type for Dreadnode logging. diff --git a/dreadnode/serialization.py b/dreadnode/serialization.py index fcafc7fa..1d9fc7bc 100644 --- a/dreadnode/serialization.py +++ b/dreadnode/serialization.py @@ -23,7 +23,7 @@ from re import Pattern from uuid import UUID -from dreadnode.data_types.base_data_type import BaseDataType +from dreadnode.data_types.base import DataType from dreadnode.types import JsonDict, JsonValue from dreadnode.util import safe_repr @@ -408,16 +408,12 @@ def _handle_dataset(obj: t.Any, _seen: set[int]) -> tuple[JsonValue, JsonDict]: ) -def _handle_custom_data_type(obj: BaseDataType, _seen: set[int]) -> tuple[JsonValue, JsonDict]: +def _handle_custom_data_type(obj: DataType, _seen: set[int]) -> tuple[JsonValue, JsonDict]: """Handler for Dreadnode custom data types.""" - if not isinstance(obj, BaseDataType): + if not isinstance(obj, DataType): return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA - # Get the serialized data and metadata from the media type data, metadata = obj.to_serializable() - - if isinstance(data, bytes): - return _handle_bytes(data, _seen, metadata) serialized, schema = _serialize(data, _seen) schema.update(metadata) @@ -511,7 +507,7 @@ def _get_handlers() -> dict[type, HandlerFunc]: handlers[datasets.Dataset] = _handle_dataset with contextlib.suppress(Exception): - handlers[BaseDataType] = _handle_custom_data_type + handlers[DataType] = _handle_custom_data_type return handlers @@ -619,7 +615,7 @@ class Serialized: EMPTY_HASH = "0" * 16 -def serialize(obj: t.Any) -> Serialized: +def serialize(obj: t.Any, *, schema_extras: JsonDict | None = None) -> Serialized: """ Serializes a Python object into a JSON-compatible structure and generates a corresponding JSON Schema, ensuring consistency between @@ -627,6 +623,7 @@ def serialize(obj: t.Any) -> Serialized: Args: obj: The Python object to process. + schema_extras: Additional JSON Schema properties to include. Returns: An object containing the serialized data, schema, and their hashes. @@ -638,6 +635,9 @@ def serialize(obj: t.Any) -> Serialized: else: serialized_bytes = json.dumps(serialized, separators=(",", ":")).encode() + if schema_extras: + schema = {**schema, **schema_extras} + schema_str = json.dumps(schema, separators=(",", ":")) data_hash = EMPTY_HASH From 0b2917c4506d971c6406088aad3efe5c8b35a68a Mon Sep 17 00:00:00 2001 From: monoxgas Date: Thu, 17 Jul 2025 10:55:18 -0600 Subject: [PATCH 2/2] Meta to WithMeta for clarity --- docs/sdk/data_types.mdx | 142 +++++++++++++++---------------- dreadnode/data_types/__init__.py | 4 +- dreadnode/data_types/base.py | 4 +- 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/docs/sdk/data_types.mdx b/docs/sdk/data_types.mdx index 2d184f32..46c03f3a 100644 --- a/docs/sdk/data_types.mdx +++ b/docs/sdk/data_types.mdx @@ -275,77 +275,6 @@ def __init__(self, text: str): ``` - - -Meta ----- - -```python -Meta(obj: Any, metadata: dict[str, Any]) -``` - -Helper data type to add additional metadata to the schema for logged data. - -Example - -```python -log_output("my_data", Meta(data, {"format": "custom-data"})) -``` - -Initialize a data type with associated metadata. - -**Parameters:** - -* **`metadata`** - (`dict[str, Any]`) - –The metadata for this data type - - -```python -def __init__(self, obj: t.Any, metadata: dict[str, t.Any]): - """ - Initialize a data type with associated metadata. - - Args: - metadata: The metadata for this data type - """ - self._obj = obj - self._metadata = metadata -``` - - - - -### to\_serializable - -```python -to_serializable() -> tuple[t.Any, dict[str, t.Any]] -``` - -Convert the media type to a serializable format. - -**Returns:** - -* `tuple[Any, dict[str, Any]]` - –Tuple of (data, metadata) where: - - data: The serialized data - - metadata: Additional metadata for this data type - - -```python -def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: - """ - Convert the media type to a serializable format. - - Returns: - Tuple of (data, metadata) where: - - data: The serialized data - - metadata: Additional metadata for this data type - """ - return self._obj, self._metadata -``` - - Object3D @@ -738,4 +667,75 @@ def to_serializable(self) -> tuple[bytes, dict[str, t.Any]]: ``` + + +WithMeta +-------- + +```python +WithMeta(obj: Any, metadata: dict[str, Any]) +``` + +Helper data type to add additional metadata to the schema for logged data. + +Example + +```python +log_output("my_data", WithMeta(data, {"format": "custom-data"})) +``` + +Initialize a data type with associated metadata. + +**Parameters:** + +* **`metadata`** + (`dict[str, Any]`) + –The metadata for this data type + + +```python +def __init__(self, obj: t.Any, metadata: dict[str, t.Any]): + """ + Initialize a data type with associated metadata. + + Args: + metadata: The metadata for this data type + """ + self._obj = obj + self._metadata = metadata +``` + + + + +### to\_serializable + +```python +to_serializable() -> tuple[t.Any, dict[str, t.Any]] +``` + +Convert the media type to a serializable format. + +**Returns:** + +* `tuple[Any, dict[str, Any]]` + –Tuple of (data, metadata) where: + - data: The serialized data + - metadata: Additional metadata for this data type + + +```python +def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: + """ + Convert the media type to a serializable format. + + Returns: + Tuple of (data, metadata) where: + - data: The serialized data + - metadata: Additional metadata for this data type + """ + return self._obj, self._metadata +``` + + \ No newline at end of file diff --git a/dreadnode/data_types/__init__.py b/dreadnode/data_types/__init__.py index c405ea28..04a95f21 100644 --- a/dreadnode/data_types/__init__.py +++ b/dreadnode/data_types/__init__.py @@ -1,9 +1,9 @@ from .audio import Audio -from .base import Meta +from .base import WithMeta from .image import Image from .object_3d import Object3D from .table import Table from .text import Code, Markdown, Text from .video import Video -__all__ = ["Audio", "Code", "Image", "Markdown", "Meta", "Object3D", "Table", "Text", "Video"] +__all__ = ["Audio", "Code", "Image", "Markdown", "Object3D", "Table", "Text", "Video", "WithMeta"] diff --git a/dreadnode/data_types/base.py b/dreadnode/data_types/base.py index 7baeccfc..9d566b16 100644 --- a/dreadnode/data_types/base.py +++ b/dreadnode/data_types/base.py @@ -17,13 +17,13 @@ def to_serializable(self) -> tuple[t.Any, dict[str, t.Any]]: """ -class Meta(DataType): +class WithMeta(DataType): """ Helper data type to add additional metadata to the schema for logged data. Example: ``` - log_output("my_data", Meta(data, {"format": "custom-data"})) + log_output("my_data", WithMeta(data, {"format": "custom-data"})) ``` """