Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
9 changes: 6 additions & 3 deletions docs/CUSTOM_FIELDS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ Multiple datasets and fields can be defined in the files. However, the files nee

The first contains the definitions of the datasets you want to create. It should have the following structure:

| workspace_id | dataset_id | dataset_name | dataset_datasource_id | dataset_source_table | dataset_source_sql | parent_dataset_reference | parent_dataset_reference_attribute_id | dataset_reference_source_column | wdf_id | wdf_column_name |
| -------------------- | ----------------- | -------------------- | --------------------- | -------------------- | ------------------ | ------------------------ | ------------------------------------- | ------------------------------- | ------ | --------------- |
| child_workspace_id_1 | custom_dataset_id | Custom Dataset Title | datasource_id | dataset_source_table | | parent_dataset_id | parent_dataset.reference_field | custom_dataset.reference_field | wdf_id | wdf_column_name |
| workspace_id | dataset_id | dataset_name | dataset_datasource_id | dataset_source_table | dataset_source_sql | parent_dataset_reference | parent_dataset_reference_attribute_id | dataset_reference_source_column | dataset_reference_source_column_data_type | wdf_id | wdf_column_name |
| -------------------- | ----------------- | -------------------- | --------------------- | -------------------- | ------------------ | ------------------------ | ------------------------------------- | ------------------------------- | ----------------------------------------- | ------ | --------------- |
| child_workspace_id_1 | custom_dataset_id | Custom Dataset Title | datasource_id | dataset_source_table | | parent_dataset_id | parent_dataset.reference_field | custom_dataset.reference_field | column data type | wdf_id | wdf_column_name |

#### Validity constraints

Expand All @@ -48,11 +48,14 @@ For readability, here is the data structure in JSON format with comments. Howeve
"parent_dataset_reference": "products", // ID of the parent dataset to which the custom one will be connected
"parent_dataset_reference_attribute_id": "products.product_id", // parent dataset column name used for the "join"
"dataset_reference_source_column": "product_id", // custom dataset column name used for the "join"
"dataset_reference_source_column_data_type": "STRING", // column data type*
"wdf_id": "x__client_id", // workspace data filter id
"wdf_column_name": "client_id" // name of the column used for filtering
}
```

\* Possible values are listed in the `ColumnDataType` enum in [models](../scripts/custom_fields/models/custom_data_object.py).

### Custom fields definition

The individual files of the custom dataset are defined thusly:
Expand Down
3 changes: 2 additions & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pytest~=7.3.2
moto~=5.1.6
pytest-mock==3.14.0
pytest-mock==3.14.0
tox==4.27.0
30 changes: 25 additions & 5 deletions scripts/custom_fields/input_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class CustomFieldsDataProcessor:

@staticmethod
def _attribute_from_field(
dataset_name: str,
custom_field: CustomFieldDefinition,
) -> CatalogDeclarativeAttribute:
"""Assign a declarative attribute from a custom field definition."""
Expand All @@ -73,10 +74,12 @@ def _attribute_from_field(
source_column=custom_field.cf_source_column,
labels=[],
source_column_data_type=custom_field.cf_source_column_data_type.value,
tags=[dataset_name],
)

@staticmethod
def _fact_from_field(
dataset_name: str,
custom_field: CustomFieldDefinition,
) -> CatalogDeclarativeFact:
"""Assign a declarative fact from a custom field definition."""
Expand All @@ -85,10 +88,12 @@ def _fact_from_field(
title=custom_field.cf_name,
source_column=custom_field.cf_source_column,
source_column_data_type=custom_field.cf_source_column_data_type.value,
tags=[dataset_name],
)

def _date_from_field(
self,
dataset_name: str,
custom_field: CustomFieldDefinition,
) -> CatalogDeclarativeDateDataset:
"""Assign a declarative date dataset from a custom field definition."""
Expand All @@ -101,6 +106,7 @@ def _date_from_field(
title_pattern="%titleBase - %granularityTitle",
),
granularities=self.DATE_GRANULARITIES,
tags=[dataset_name],
)

@staticmethod
Expand Down Expand Up @@ -181,10 +187,18 @@ def datasets_to_ldm(
# Iterate through the custom fields and create the appropriate objects
for custom_field in dataset.custom_fields:
if custom_field.cf_type == CustomFieldType.ATTRIBUTE:
attributes.append(self._attribute_from_field(custom_field))
attributes.append(
self._attribute_from_field(
dataset.definition.dataset_name, custom_field
)
)

elif custom_field.cf_type == CustomFieldType.FACT:
facts.append(self._fact_from_field(custom_field))
facts.append(
self._fact_from_field(
dataset.definition.dataset_name, custom_field
)
)

# Process date dimensions and store them to date_instances. Date
# dimensions are not stored in a dataset, but as a separate dataset.
Expand All @@ -193,7 +207,11 @@ def datasets_to_ldm(
# in the GoodData Logical Data Model.
elif custom_field.cf_type == CustomFieldType.DATE:
# Add the date dimension to the date_instances
date_instances.append(self._date_from_field(custom_field))
date_instances.append(
self._date_from_field(
dataset.definition.dataset_name, custom_field
)
)

# Create a reference so that the date dimension is connected
# to the dataset in the GoodData Logical Data Model.
Expand All @@ -216,12 +234,13 @@ def datasets_to_ldm(
references=[
CatalogDeclarativeReference(
identifier=CatalogReferenceIdentifier(
id=dataset.definition.parent_dataset_reference
id=dataset.definition.parent_dataset_reference,
),
multivalue=True,
sources=[
CatalogDeclarativeReferenceSource(
column=dataset.definition.dataset_reference_source_column,
data_type=dataset.definition.dataset_reference_source_column_data_type.value,
target=CatalogGrainIdentifier(
id=dataset.definition.parent_dataset_reference_attribute_id,
type=CustomFieldType.ATTRIBUTE.value,
Expand All @@ -238,7 +257,7 @@ def datasets_to_ldm(
sql=dataset_sql,
workspace_data_filter_columns=[
CatalogDeclarativeWorkspaceDataFilterColumn(
name=dataset.definition.wdf_id,
name=dataset.definition.wdf_column_name,
data_type=ColumnDataType.STRING.value,
)
],
Expand All @@ -251,6 +270,7 @@ def datasets_to_ldm(
filter_column_data_type=ColumnDataType.STRING.value,
)
],
tags=[dataset.definition.dataset_name],
)
)

Expand Down
1 change: 1 addition & 0 deletions scripts/custom_fields/models/custom_data_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class CustomDatasetDefinition(BaseModel):
parent_dataset_reference: str
parent_dataset_reference_attribute_id: str
dataset_reference_source_column: str
dataset_reference_source_column_data_type: ColumnDataType
wdf_id: str
wdf_column_name: str

Expand Down
16 changes: 12 additions & 4 deletions tests/test_custom_fields/test_input_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def mock_dataset_definition():
parent_dataset_reference="parent_ds",
parent_dataset_reference_attribute_id="parent_attr",
dataset_reference_source_column="ref_col",
dataset_reference_source_column_data_type=ColumnDataType.STRING,
wdf_id="wdf1",
wdf_column_name="col1",
)
Expand All @@ -92,27 +93,34 @@ def mock_custom_dataset(


def test_attribute_from_field(mock_custom_field_attribute):
attr = CustomFieldsDataProcessor._attribute_from_field(mock_custom_field_attribute)
attr = CustomFieldsDataProcessor._attribute_from_field(
"dataset_name", mock_custom_field_attribute
)
assert attr.id == "attr1"
assert attr.title == "Attribute 1"
assert attr.source_column == "col_attr1"
assert attr.source_column_data_type == ColumnDataType.STRING.value
assert attr.tags == ["dataset_name"]


def test_fact_from_field(mock_custom_field_fact):
fact = CustomFieldsDataProcessor._fact_from_field(mock_custom_field_fact)
fact = CustomFieldsDataProcessor._fact_from_field(
"dataset_name", mock_custom_field_fact
)
assert fact.id == "fact1"
assert fact.title == "Fact 1"
assert fact.source_column == "col_fact1"
assert fact.source_column_data_type == ColumnDataType.INT.value
assert fact.tags == ["dataset_name"]


def test_date_from_field(mock_custom_field_date):
processor = CustomFieldsDataProcessor()
date_ds = processor._date_from_field(mock_custom_field_date)
date_ds = processor._date_from_field("dataset_name", mock_custom_field_date)
assert date_ds.id == "date1"
assert date_ds.title == "Date 1"
assert set(date_ds.granularities) == set(processor.DATE_GRANULARITIES)
assert date_ds.tags == ["dataset_name"]


def test_date_ref_from_field(mock_custom_field_date):
Expand Down Expand Up @@ -161,7 +169,7 @@ def test_datasets_to_ldm(mock_custom_dataset):
assert len(ds.references) == 2 # 1 parent + 1 date
assert ds.workspace_data_filter_columns
assert ds.workspace_data_filter_references
assert ds.workspace_data_filter_columns[0].name == "wdf1"
assert ds.workspace_data_filter_columns[0].name == "col1"
assert ds.workspace_data_filter_references[0].filter_id.id == "wdf1"
assert len(ldm.date_instances) == 1
assert ldm.date_instances[0].id == "date1"
3 changes: 3 additions & 0 deletions tests/test_custom_fields/test_input_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def valid_dataset_definitions():
"parent_dataset_reference": "parent1",
"parent_dataset_reference_attribute_id": "parent1.id",
"dataset_reference_source_column": "id",
"dataset_reference_source_column_data_type": "STRING",
"wdf_id": "wdf1",
"wdf_column_name": "id",
},
Expand All @@ -42,6 +43,7 @@ def valid_dataset_definitions():
"parent_dataset_reference": "parent2",
"parent_dataset_reference_attribute_id": "parent2.id",
"dataset_reference_source_column": "id",
"dataset_reference_source_column_data_type": "INT",
"wdf_id": "wdf2",
"wdf_column_name": "id",
},
Expand Down Expand Up @@ -108,6 +110,7 @@ def test_duplicate_dataset_raises(valid_dataset_definitions):
"parent_dataset_reference": "parent1",
"parent_dataset_reference_attribute_id": "parent1.id",
"dataset_reference_source_column": "id",
"dataset_reference_source_column_data_type": "STRING",
"wdf_id": "wdf1",
"wdf_column_name": "id",
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def make_valid_dataset_def(**kwargs):
"parent_dataset_reference": "parent_ds",
"parent_dataset_reference_attribute_id": "parent_attr",
"dataset_reference_source_column": "src_col",
"dataset_reference_source_column_data_type": ColumnDataType.STRING,
"wdf_id": "wdf1",
"wdf_column_name": "col1",
}
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tox]
env_list = py311, type, lint, 3.11
env_list = py311, type, lint

[testenv]
allowlist_externals =
Expand Down