diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..2c07333 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/docs/CUSTOM_FIELDS.md b/docs/CUSTOM_FIELDS.md index c3ebe1e..90baf17 100644 --- a/docs/CUSTOM_FIELDS.md +++ b/docs/CUSTOM_FIELDS.md @@ -23,9 +23,9 @@ Multiple datasets and fields can be defined in the files. However, the files nee The first contains the definitions of the datasets you want to create. It should have the following structure: -| workspace_id | dataset_id | dataset_name | dataset_datasource_id | dataset_source_table | dataset_source_sql | parent_dataset_reference | parent_dataset_reference_attribute_id | dataset_reference_source_column | wdf_id | wdf_column_name | -| -------------------- | ----------------- | -------------------- | --------------------- | -------------------- | ------------------ | ------------------------ | ------------------------------------- | ------------------------------- | ------ | --------------- | -| child_workspace_id_1 | custom_dataset_id | Custom Dataset Title | datasource_id | dataset_source_table | | parent_dataset_id | parent_dataset.reference_field | custom_dataset.reference_field | wdf_id | wdf_column_name | +| workspace_id | dataset_id | dataset_name | dataset_datasource_id | dataset_source_table | dataset_source_sql | parent_dataset_reference | parent_dataset_reference_attribute_id | dataset_reference_source_column | dataset_reference_source_column_data_type | wdf_id | wdf_column_name | +| -------------------- | ----------------- | -------------------- | --------------------- | -------------------- | ------------------ | ------------------------ | ------------------------------------- | ------------------------------- | ----------------------------------------- | ------ | --------------- | +| child_workspace_id_1 | custom_dataset_id | Custom Dataset Title | datasource_id | dataset_source_table | | parent_dataset_id | parent_dataset.reference_field | 
custom_dataset.reference_field | column data type | wdf_id | wdf_column_name | #### Validity constraints @@ -48,11 +48,14 @@ For readability, here is the data structure in JSON format with comments. Howeve "parent_dataset_reference": "products", // ID of the parent dataset to which the custom one will be connected "parent_dataset_reference_attribute_id": "products.product_id", // parent dataset column name used for the "join" "dataset_reference_source_column": "product_id", // custom dataset column name used for the "join" + "dataset_reference_source_column_data_type": "STRING", // column data type* "wdf_id": "x__client_id", // workspace data filter id "wdf_column_name": "client_id" // name of the column used for filtering } ``` +\* possible values are listed in the `ColumnDataType` enum in [models](../scripts/custom_fields/models/custom_data_object.py) + ### Custom fields definition The individual files of the custom dataset are defined thusly: diff --git a/requirements-test.txt b/requirements-test.txt index 0757d4d..f56bf6a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,4 @@ pytest~=7.3.2 moto~=5.1.6 -pytest-mock==3.14.0 \ No newline at end of file +pytest-mock==3.14.0 +tox==4.27.0 \ No newline at end of file diff --git a/scripts/custom_fields/input_processor.py b/scripts/custom_fields/input_processor.py index e75dcf9..5f18df6 100644 --- a/scripts/custom_fields/input_processor.py +++ b/scripts/custom_fields/input_processor.py @@ -64,6 +64,7 @@ class CustomFieldsDataProcessor: @staticmethod def _attribute_from_field( + dataset_name: str, custom_field: CustomFieldDefinition, ) -> CatalogDeclarativeAttribute: """Assign a declarative attribute from a custom field definition.""" @@ -73,10 +74,12 @@ def _attribute_from_field( source_column=custom_field.cf_source_column, labels=[], source_column_data_type=custom_field.cf_source_column_data_type.value, + tags=[dataset_name], ) @staticmethod def _fact_from_field( + dataset_name: str, custom_field: 
CustomFieldDefinition, ) -> CatalogDeclarativeFact: """Assign a declarative fact from a custom field definition.""" @@ -85,10 +88,12 @@ def _fact_from_field( title=custom_field.cf_name, source_column=custom_field.cf_source_column, source_column_data_type=custom_field.cf_source_column_data_type.value, + tags=[dataset_name], ) def _date_from_field( self, + dataset_name: str, custom_field: CustomFieldDefinition, ) -> CatalogDeclarativeDateDataset: """Assign a declarative date dataset from a custom field definition.""" @@ -101,6 +106,7 @@ def _date_from_field( title_pattern="%titleBase - %granularityTitle", ), granularities=self.DATE_GRANULARITIES, + tags=[dataset_name], ) @staticmethod @@ -181,10 +187,18 @@ def datasets_to_ldm( # Iterate through the custom fields and create the appropriate objects for custom_field in dataset.custom_fields: if custom_field.cf_type == CustomFieldType.ATTRIBUTE: - attributes.append(self._attribute_from_field(custom_field)) + attributes.append( + self._attribute_from_field( + dataset.definition.dataset_name, custom_field + ) + ) elif custom_field.cf_type == CustomFieldType.FACT: - facts.append(self._fact_from_field(custom_field)) + facts.append( + self._fact_from_field( + dataset.definition.dataset_name, custom_field + ) + ) # Process date dimensions and store them to date_instances. Date # dimensions are not stored in a dataset, but as a separate dataset. @@ -193,7 +207,11 @@ def datasets_to_ldm( # in the GoodData Logical Data Model. elif custom_field.cf_type == CustomFieldType.DATE: # Add the date dimension to the date_instances - date_instances.append(self._date_from_field(custom_field)) + date_instances.append( + self._date_from_field( + dataset.definition.dataset_name, custom_field + ) + ) # Create a reference so that the date dimension is connected # to the dataset in the GoodData Logical Data Model. 
@@ -216,12 +234,13 @@ def datasets_to_ldm( references=[ CatalogDeclarativeReference( identifier=CatalogReferenceIdentifier( - id=dataset.definition.parent_dataset_reference + id=dataset.definition.parent_dataset_reference, ), multivalue=True, sources=[ CatalogDeclarativeReferenceSource( column=dataset.definition.dataset_reference_source_column, + data_type=dataset.definition.dataset_reference_source_column_data_type.value, target=CatalogGrainIdentifier( id=dataset.definition.parent_dataset_reference_attribute_id, type=CustomFieldType.ATTRIBUTE.value, @@ -238,7 +257,7 @@ def datasets_to_ldm( sql=dataset_sql, workspace_data_filter_columns=[ CatalogDeclarativeWorkspaceDataFilterColumn( - name=dataset.definition.wdf_id, + name=dataset.definition.wdf_column_name, data_type=ColumnDataType.STRING.value, ) ], @@ -251,6 +270,7 @@ def datasets_to_ldm( filter_column_data_type=ColumnDataType.STRING.value, ) ], + tags=[dataset.definition.dataset_name], ) ) diff --git a/scripts/custom_fields/models/custom_data_object.py b/scripts/custom_fields/models/custom_data_object.py index 07ec606..9d2d729 100644 --- a/scripts/custom_fields/models/custom_data_object.py +++ b/scripts/custom_fields/models/custom_data_object.py @@ -63,6 +63,7 @@ class CustomDatasetDefinition(BaseModel): parent_dataset_reference: str parent_dataset_reference_attribute_id: str dataset_reference_source_column: str + dataset_reference_source_column_data_type: ColumnDataType wdf_id: str wdf_column_name: str diff --git a/tests/test_custom_fields/test_input_processor.py b/tests/test_custom_fields/test_input_processor.py index 679412b..5f57bcb 100644 --- a/tests/test_custom_fields/test_input_processor.py +++ b/tests/test_custom_fields/test_input_processor.py @@ -69,6 +69,7 @@ def mock_dataset_definition(): parent_dataset_reference="parent_ds", parent_dataset_reference_attribute_id="parent_attr", dataset_reference_source_column="ref_col", + dataset_reference_source_column_data_type=ColumnDataType.STRING, wdf_id="wdf1", 
wdf_column_name="col1", ) @@ -92,27 +93,34 @@ def mock_custom_dataset( def test_attribute_from_field(mock_custom_field_attribute): - attr = CustomFieldsDataProcessor._attribute_from_field(mock_custom_field_attribute) + attr = CustomFieldsDataProcessor._attribute_from_field( + "dataset_name", mock_custom_field_attribute + ) assert attr.id == "attr1" assert attr.title == "Attribute 1" assert attr.source_column == "col_attr1" assert attr.source_column_data_type == ColumnDataType.STRING.value + assert attr.tags == ["dataset_name"] def test_fact_from_field(mock_custom_field_fact): - fact = CustomFieldsDataProcessor._fact_from_field(mock_custom_field_fact) + fact = CustomFieldsDataProcessor._fact_from_field( + "dataset_name", mock_custom_field_fact + ) assert fact.id == "fact1" assert fact.title == "Fact 1" assert fact.source_column == "col_fact1" assert fact.source_column_data_type == ColumnDataType.INT.value + assert fact.tags == ["dataset_name"] def test_date_from_field(mock_custom_field_date): processor = CustomFieldsDataProcessor() - date_ds = processor._date_from_field(mock_custom_field_date) + date_ds = processor._date_from_field("dataset_name", mock_custom_field_date) assert date_ds.id == "date1" assert date_ds.title == "Date 1" assert set(date_ds.granularities) == set(processor.DATE_GRANULARITIES) + assert date_ds.tags == ["dataset_name"] def test_date_ref_from_field(mock_custom_field_date): @@ -161,7 +169,7 @@ def test_datasets_to_ldm(mock_custom_dataset): assert len(ds.references) == 2 # 1 parent + 1 date assert ds.workspace_data_filter_columns assert ds.workspace_data_filter_references - assert ds.workspace_data_filter_columns[0].name == "wdf1" + assert ds.workspace_data_filter_columns[0].name == "col1" assert ds.workspace_data_filter_references[0].filter_id.id == "wdf1" assert len(ldm.date_instances) == 1 assert ldm.date_instances[0].id == "date1" diff --git a/tests/test_custom_fields/test_input_validator.py b/tests/test_custom_fields/test_input_validator.py 
index b360f49..57ce14d 100644 --- a/tests/test_custom_fields/test_input_validator.py +++ b/tests/test_custom_fields/test_input_validator.py @@ -29,6 +29,7 @@ def valid_dataset_definitions(): "parent_dataset_reference": "parent1", "parent_dataset_reference_attribute_id": "parent1.id", "dataset_reference_source_column": "id", + "dataset_reference_source_column_data_type": "STRING", "wdf_id": "wdf1", "wdf_column_name": "id", }, @@ -42,6 +43,7 @@ def valid_dataset_definitions(): "parent_dataset_reference": "parent2", "parent_dataset_reference_attribute_id": "parent2.id", "dataset_reference_source_column": "id", + "dataset_reference_source_column_data_type": "INT", "wdf_id": "wdf2", "wdf_column_name": "id", }, @@ -108,6 +110,7 @@ def test_duplicate_dataset_raises(valid_dataset_definitions): "parent_dataset_reference": "parent1", "parent_dataset_reference_attribute_id": "parent1.id", "dataset_reference_source_column": "id", + "dataset_reference_source_column_data_type": "STRING", "wdf_id": "wdf1", "wdf_column_name": "id", } diff --git a/tests/test_custom_fields/test_models/test_custom_data_object.py b/tests/test_custom_fields/test_models/test_custom_data_object.py index cd10a4f..a4c6625 100644 --- a/tests/test_custom_fields/test_models/test_custom_data_object.py +++ b/tests/test_custom_fields/test_models/test_custom_data_object.py @@ -43,6 +43,7 @@ def make_valid_dataset_def(**kwargs): "parent_dataset_reference": "parent_ds", "parent_dataset_reference_attribute_id": "parent_attr", "dataset_reference_source_column": "src_col", + "dataset_reference_source_column_data_type": ColumnDataType.STRING, "wdf_id": "wdf1", "wdf_column_name": "col1", } diff --git a/tox.ini b/tox.ini index 664e47f..bb43f0e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -env_list = py311, type, lint, 3.11 +env_list = py311, type, lint [testenv] allowlist_externals =