From 54b2c5ab39540b5e433602e1dada821477655a52 Mon Sep 17 00:00:00 2001 From: Shlomit Koyfman Date: Wed, 3 May 2023 12:05:18 +0000 Subject: [PATCH 1/5] use default policy; write new mysql asset Signed-off-by: Shlomit Koyfman --- fybrik/README.md | 8 --- fybrik/README_Chaining.md | 2 +- fybrik/README_mysql.md | 77 ++++++++++++++++++---- fybrik/sample-policy.rego | 2 - fybrik/write-flow/storage-account.yaml | 12 ++++ fybrik/write-flow/write-new-asset-app.yaml | 35 ++++++++++ hack/test_module.sh | 16 ----- tests/test_fybrik.sh | 5 -- 8 files changed, 114 insertions(+), 43 deletions(-) delete mode 100644 fybrik/sample-policy.rego create mode 100644 fybrik/write-flow/storage-account.yaml create mode 100644 fybrik/write-flow/write-new-asset-app.yaml diff --git a/fybrik/README.md b/fybrik/README.md index 3c5ec58..4d15fd4 100644 --- a/fybrik/README.md +++ b/fybrik/README.md @@ -34,14 +34,6 @@ You will need a copy of the Fybrik repository (`git clone https://github.com/fyb kubectl config set-context --current --namespace=fybrik-airbyte-sample ``` -1. Create a policy to allow access to any asset (we use a policy that does not restrict access nor mandate any transformations): - ```bash - kubectl -n fybrik-system create configmap sample-policy --from-file=$AIRBYTE_MODULE_DIR/fybrik/sample-policy.rego - kubectl -n fybrik-system label configmap sample-policy openpolicyagent.org/policy=rego - while [[ $(kubectl get cm sample-policy -n fybrik-system -o 'jsonpath={.metadata.annotations.openpolicyagent\.org/policy-status}') != '{"status":"ok"}' ]]; do echo "waiting for policy to be applied" && sleep 5; done - ``` - - 1. Create an asset (the `userdata` asset) and an application that requires this asset: ```bash kubectl apply -f $AIRBYTE_MODULE_DIR/fybrik/read-flow/asset.yaml -n fybrik-airbyte-sample diff --git a/fybrik/README_Chaining.md b/fybrik/README_Chaining.md index ddf073f..238f9a5 100644 --- a/fybrik/README_Chaining.md +++ b/fybrik/README_Chaining.md @@ -6,7 +6,7 @@ Therefore, to satisfy the constraints, Fybrik must deploy two modules: the airby dataset, and the [arrow-flight-module](https://github.com/fybrik/arrow-flight-module) for transforming the dataset based on the governance policies. -The current use case differs from the `Unrestricted Read` use case outlined [here](README.md) in that governance policies mandate transformation of sensitive data. See our use case's [policy](sample-policy-restrictive.rego) vs. the `Unrestricted Read` [policy](sample-policy.rego). +The current use case differs from the `Unrestricted Read` use case outlined [here](README.md) in that governance policies mandate transformation of sensitive data. See our use case's [policy](sample-policy-restrictive.rego). We demonstrate how, using an Airbyte FybrikModule, a workload can access data stored in google-sheets, postgres, and other data stores supported by Airbyte connectors. To do so a FybrikApplication (i.e. the request) must be submitted indicating the desired data set(s). In this example, we use the `userdata` dataset, a Parquet file found in https://github.com/Teradata/kylo/blob/master/samples/sample-data/parquet/userdata2.parquet. diff --git a/fybrik/README_mysql.md b/fybrik/README_mysql.md index 998f5fe..abb6dc2 100644 --- a/fybrik/README_mysql.md +++ b/fybrik/README_mysql.md @@ -25,13 +25,6 @@ You will need a copy of the Fybrik repository (`git clone https://github.com/fyb kubectl config set-context --current --namespace=fybrik-airbyte-sample ``` -1. Create a policy to allow access to any asset (we use a policy that does not restrict access nor mandate any transformations): - ```bash - kubectl -n fybrik-system create configmap sample-policy --from-file=$AIRBYTE_MODULE_DIR/fybrik/sample-policy.rego - kubectl -n fybrik-system label configmap sample-policy openpolicyagent.org/policy=rego - while [[ $(kubectl get cm sample-policy -n fybrik-system -o 'jsonpath={.metadata.annotations.openpolicyagent\.org/policy-status}') != '{"status":"ok"}' ]]; do echo "waiting for policy to be applied" && sleep 5; done - ``` - 1. Setup and initialize mysql for reading a dataset 1. Deploy [mysql](https://bitnami.com/stack/mysql/helm) helm chart in `fybrik-airbyte-sample` namespace. @@ -111,9 +104,9 @@ In this example, a small dataset is written to mysql table. To do so a FybrikApp As above, you will need a copy of the Fybrik repository (`git clone https://github.com/fybrik/fybrik.git`). Set the following environment variables: FYBRIK_DIR for the path of the `fybrik` directory, and AIRBYTE_MODULE_DIR for the path of the `airbyte-module` directory. -Repeat steps 1-5 above. +Repeat steps 1-4 above. -6. Setup and initialize mysql for writing a dataset +5. Setup and initialize mysql for writing a dataset 1. Deploy [mysql](https://bitnami.com/stack/mysql/helm) helm chart in `fybrik-airbyte-sample` namespace: ```bash @@ -151,7 +144,7 @@ Repeat steps 1-5 above. EOF ``` -1. Create an asset (the `userdata` asset), the policy to access it (we use a policy that does not restrict access nor mandate any transformations), and an application that requires this asset: +1. Create an asset (the `userdata` asset) and an application that requires this asset: ```bash kubectl apply -f $AIRBYTE_MODULE_DIR/fybrik/write-flow/asset-mysql.yaml -n fybrik-airbyte-sample kubectl apply -f $AIRBYTE_MODULE_DIR/fybrik/write-flow/application.yaml -n fybrik-airbyte-sample @@ -186,6 +179,68 @@ Repeat steps 1-5 above. select * from demo; ``` +# Writing and Registering a New Dataset with Fybrik Application + +In this example, a small dataset is written to mysql table and registered in the catalog. + +As above, you will need a copy of the Fybrik repository (`git clone https://github.com/fybrik/fybrik.git`). Set the following environment variables: FYBRIK_DIR for the path of the `fybrik` directory, and AIRBYTE_MODULE_DIR for the path of the `airbyte-module` directory. + +Repeat steps 1-4 above. + +5. Setup and initialize mysql for writing a dataset + + 1. Deploy [mysql](https://bitnami.com/stack/mysql/helm) helm chart in `fybrik-airbyte-sample` namespace: + ```bash + helm repo add bitnami https://charts.bitnami.com/bitnami + helm install mysql bitnami/mysql -n fybrik-airbyte-sample + kubectl wait pod --for=condition=ready mysql-0 --namespace fybrik-airbyte-sample --timeout 20m + ``` + 2. Use the instructions from the helm chart notes to run a pod that is use as a client and connect to the service: + ```bash + echo Username: root + MYSQL_ROOT_PASSWORD=$(kubectl get secret --namespace fybrik-airbyte-sample mysql -o jsonpath="{.data.mysql-root-password}" | base64 -d) + kubectl run mysql-client --rm --tty -i --restart='Never' --image docker.io/bitnami/mysql:8.0.32-debian-11-r0 --namespace fybrik-airbyte-sample --env MYSQL_ROOT_PASSWORD=$MYSQL_ROOT_PASSWORD --command -- bash + mysql -h mysql.fybrik-airbyte-sample.svc.cluster.local -uroot -p"$MYSQL_ROOT_PASSWORD" + ``` + +1. Register the credentials required for writing the dataset as a kubernetes secret. Replace the value for MYSQL_ROOT_PASSWORD with the mysql service password as described in the section above: + + ```bash + cat << EOF | kubectl apply -f - + apiVersion: v1 + kind: Secret + metadata: + name: creds-mysql + namespace: fybrik-system + type: Opaque + stringData: + username: root + password: "${MYSQL_ROOT_PASSWORD}" + EOF + ``` + +1. Register a storage account specifying the mysql server details and the credentials and apply an application that writes this asset: +```bash + kubectl apply -f $AIRBYTE_MODULE_DIR/fybrik/write-flow/storage-account.yaml -n fybrik-system + kubectl apply -f $AIRBYTE_MODULE_DIR/fybrik/write-flow/write-new-asset-app.yaml -n fybrik-airbyte-sample + ``` + +1. After the application is created, the Fybrik manager attempts to create the data path for the application. Fybrik realizes that the Airbyte module is required, and deploys it in the `fybrik-blueprints` namespace. To verify that the Airbyte module was indeed deployed, run: + ```bash + kubectl get pods -n fybrik-blueprints + ``` + > _NOTE:_ See the note in step 9 above. + +1. Run the following commands to exceute a write command: + ```bash + export ASSET_NAME=newdata + export ENDPOINT_HOSTNAME=$(kubectl get fybrikapplication my-app-write -n fybrik-airbyte-sample -o "jsonpath={.status.assetStates.${ASSET_NAME}.endpoint.fybrik-arrow-flight.hostname}") + export AIRBYTE_POD_NAME=$(kubectl get pods -n fybrik-blueprints | grep air |awk '{print $1}') + cd $AIRBYTE_MODULE_DIR/helm/client + ./deploy_airbyte_module_client_pod.sh + kubectl exec -it my-shell -n default -- python3 /root/client.py --host ${ENDPOINT_HOSTNAME} --port 80 --asset ${ASSET_NAME} --operation put + ``` + # Cleanup When you're finished experimenting with a sample, you may clean up as follows: @@ -197,7 +252,7 @@ kubectl delete namespace fybrik-airbyte-sample ``` To experiment with a sample after the deletion of `fybrik-airbyte-sample` namespace, -re-create the namespace with the following commands and continue from step 6 in the chosen sample. +re-create the namespace with the following commands and continue from step 5 in the chosen sample. ```bash kubectl create namespace fybrik-airbyte-sample diff --git a/fybrik/sample-policy.rego b/fybrik/sample-policy.rego deleted file mode 100644 index d6f9fa1..0000000 --- a/fybrik/sample-policy.rego +++ /dev/null @@ -1,2 +0,0 @@ -package dataapi.authz -rule[{}] { true } diff --git a/fybrik/write-flow/storage-account.yaml b/fybrik/write-flow/storage-account.yaml new file mode 100644 index 0000000..c54e9df --- /dev/null +++ b/fybrik/write-flow/storage-account.yaml @@ -0,0 +1,12 @@ +apiVersion: app.fybrik.io/v1beta2 +kind: FybrikStorageAccount +metadata: + name: account-mysql +spec: + id: theshire-mysql + type: mysql + secretRef: creds-mysql + geography: theshire + mysql: + host: mysql.fybrik-airbyte-sample + port: 3306 diff --git a/fybrik/write-flow/write-new-asset-app.yaml b/fybrik/write-flow/write-new-asset-app.yaml new file mode 100644 index 0000000..016ccb7 --- /dev/null +++ b/fybrik/write-flow/write-new-asset-app.yaml @@ -0,0 +1,35 @@ +apiVersion: app.fybrik.io/v1beta1 +kind: FybrikApplication +metadata: + name: my-notebook-write + labels: + app: my-notebook-write +spec: + selector: + clusterName: thegreendragon + workloadSelector: + matchLabels: + app: my-notebook-write + appInfo: + intent: Fraud Detection + data: + - dataSetID: newdata + flow: write + requirements: + flowParams: + isNewDataSet: true + metadata: + tags: + Purpose.finance: true + columns: + - name: DOB + tags: + PII.Sensitive: true + - name: FirstName + tags: + PII.Sensitive: true + - name: LastName + tags: + PII.Sensitive: true + interface: + protocol: fybrik-arrow-flight diff --git a/hack/test_module.sh b/hack/test_module.sh index 5a03720..bb152f1 100755 --- a/hack/test_module.sh +++ b/hack/test_module.sh @@ -157,21 +157,6 @@ ${TOOLBIN}/kubectl apply -f $WORKING_DIR/asset.yaml -n fybrik-airbyte-sample ${TOOLBIN}/kubectl describe asset userdata -n fybrik-airbyte-sample - -kubectl -n fybrik-system create configmap sample-policy --from-file=$WORKING_DIR/sample-policy.rego -kubectl -n fybrik-system label configmap sample-policy openpolicyagent.org/policy=rego -while [[ $(kubectl get cm sample-policy -n fybrik-system -o 'jsonpath={.metadata.annotations.openpolicyagent\.org/policy-status}') != '{"status":"ok"}' ]]; do echo "waiting for policy to be applied" && sleep 5; done - - -c=0 -while [[ $(${TOOLBIN}/kubectl get cm sample-policy -n fybrik-system -o 'jsonpath={.metadata.annotations.openpolicyagent\.org/policy-status}') != '{"status":"ok"}' ]] -do - echo "waiting" - ((c++)) && ((c==25)) && break - sleep 1 -done - - # apply fybrik application for writing ${TOOLBIN}/kubectl apply -f $WORKING_DIR/write-fybrikapplication.yaml -n fybrik-airbyte-sample @@ -230,7 +215,6 @@ ${TOOLBIN}/kubectl get cm -o yaml -n fybrik-blueprints rm -rf ${tmp_dir} ${TOOLBIN}/kubectl delete namespace fybrik-airbyte-sample -${TOOLBIN}/kubectl -n fybrik-system delete configmap sample-policy if [ ${RES} == 1 ] then diff --git a/tests/test_fybrik.sh b/tests/test_fybrik.sh index 93faf69..c526636 100755 --- a/tests/test_fybrik.sh +++ b/tests/test_fybrik.sh @@ -118,10 +118,6 @@ kubectl config set-context --current --namespace=fybrik-airbyte-sample kubectl apply -f $AIRBYTE_FYBRIK_TEST/read-flow/asset.yaml -kubectl -n fybrik-system create configmap sample-policy --from-file=$AIRBYTE_FYBRIK_TEST/sample-policy.rego -kubectl -n fybrik-system label configmap sample-policy openpolicyagent.org/policy=rego -while [[ $(kubectl get cm sample-policy -n fybrik-system -o 'jsonpath={.metadata.annotations.openpolicyagent\.org/policy-status}') != '{"status":"ok"}' ]]; do echo "waiting for policy to be applied" && sleep 5; done - kubectl apply -f $AIRBYTE_FYBRIK_TEST/read-flow/application.yaml CMD="kubectl wait --for=condition=ready --all pod -n fybrik-blueprints --timeout=300s " @@ -150,7 +146,6 @@ DIFF=$(diff -b $WORKING_DIR/expected.txt res.out) # cleanup /bin/rm res.out kubectl delete namespace fybrik-airbyte-sample -kubectl -n fybrik-system delete configmap sample-policy /bin/rm -Rf $FYBRIK_DIR if [ "${DIFF}" == "" ] From b041b077e831bec3da005f2c1f0ef2bde45620e0 Mon Sep 17 00:00:00 2001 From: Shlomit Koyfman Date: Wed, 3 May 2023 13:38:08 +0000 Subject: [PATCH 2/5] numbered list Signed-off-by: Shlomit Koyfman --- fybrik/README_mysql.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fybrik/README_mysql.md b/fybrik/README_mysql.md index abb6dc2..6dd5cb4 100644 --- a/fybrik/README_mysql.md +++ b/fybrik/README_mysql.md @@ -220,7 +220,7 @@ Repeat steps 1-4 above. ``` 1. Register a storage account specifying the mysql server details and the credentials and apply an application that writes this asset: -```bash + ```bash kubectl apply -f $AIRBYTE_MODULE_DIR/fybrik/write-flow/storage-account.yaml -n fybrik-system kubectl apply -f $AIRBYTE_MODULE_DIR/fybrik/write-flow/write-new-asset-app.yaml -n fybrik-airbyte-sample ``` From a9516ea5482817f0ff40df9f2b01f959ceb88aba Mon Sep 17 00:00:00 2001 From: Shlomit Koyfman Date: Wed, 3 May 2023 14:02:50 +0000 Subject: [PATCH 3/5] removing redundancy Signed-off-by: Shlomit Koyfman --- fybrik/README_mysql.md | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/fybrik/README_mysql.md b/fybrik/README_mysql.md index 6dd5cb4..2ba4407 100644 --- a/fybrik/README_mysql.md +++ b/fybrik/README_mysql.md @@ -185,25 +185,9 @@ In this example, a small dataset is written to mysql table and registered in the As above, you will need a copy of the Fybrik repository (`git clone https://github.com/fybrik/fybrik.git`). Set the following environment variables: FYBRIK_DIR for the path of the `fybrik` directory, and AIRBYTE_MODULE_DIR for the path of the `airbyte-module` directory. -Repeat steps 1-4 above. - -5. Setup and initialize mysql for writing a dataset - - 1. Deploy [mysql](https://bitnami.com/stack/mysql/helm) helm chart in `fybrik-airbyte-sample` namespace: - ```bash - helm repo add bitnami https://charts.bitnami.com/bitnami - helm install mysql bitnami/mysql -n fybrik-airbyte-sample - kubectl wait pod --for=condition=ready mysql-0 --namespace fybrik-airbyte-sample --timeout 20m - ``` - 2. Use the instructions from the helm chart notes to run a pod that is use as a client and connect to the service: - ```bash - echo Username: root - MYSQL_ROOT_PASSWORD=$(kubectl get secret --namespace fybrik-airbyte-sample mysql -o jsonpath="{.data.mysql-root-password}" | base64 -d) - kubectl run mysql-client --rm --tty -i --restart='Never' --image docker.io/bitnami/mysql:8.0.32-debian-11-r0 --namespace fybrik-airbyte-sample --env MYSQL_ROOT_PASSWORD=$MYSQL_ROOT_PASSWORD --command -- bash - mysql -h mysql.fybrik-airbyte-sample.svc.cluster.local -uroot -p"$MYSQL_ROOT_PASSWORD" - ``` +Repeat steps 1-5 above. -1. Register the credentials required for writing the dataset as a kubernetes secret. Replace the value for MYSQL_ROOT_PASSWORD with the mysql service password as described in the section above: +6. Register the credentials required for writing the dataset as a kubernetes secret. Replace the value for MYSQL_ROOT_PASSWORD with the mysql service password as described in the section above: ```bash cat << EOF | kubectl apply -f - @@ -229,7 +213,7 @@ Repeat steps 1-4 above. ```bash kubectl get pods -n fybrik-blueprints ``` - > _NOTE:_ See the note in step 9 above. + > _NOTE:_ See the note in [Reading a Dataset by a Fybrik Application](#reading-a-dataset-by-a-fybrik-application). 1. Run the following commands to exceute a write command: ```bash From be9b67ef8e8b5cb169e3f471c8206bfcebf7b7d5 Mon Sep 17 00:00:00 2001 From: Shlomit Koyfman Date: Wed, 3 May 2023 14:19:03 +0000 Subject: [PATCH 4/5] fix Signed-off-by: Shlomit Koyfman --- fybrik/README_mysql.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fybrik/README_mysql.md b/fybrik/README_mysql.md index 2ba4407..aee681d 100644 --- a/fybrik/README_mysql.md +++ b/fybrik/README_mysql.md @@ -185,9 +185,9 @@ In this example, a small dataset is written to mysql table and registered in the As above, you will need a copy of the Fybrik repository (`git clone https://github.com/fybrik/fybrik.git`). Set the following environment variables: FYBRIK_DIR for the path of the `fybrik` directory, and AIRBYTE_MODULE_DIR for the path of the `airbyte-module` directory. -Repeat steps 1-5 above. +Repeat steps 1-4 above. -6. Register the credentials required for writing the dataset as a kubernetes secret. Replace the value for MYSQL_ROOT_PASSWORD with the mysql service password as described in the section above: +5. Register the credentials required for writing the dataset as a kubernetes secret. Replace the value for MYSQL_ROOT_PASSWORD with the mysql service password as described in the section above: ```bash cat << EOF | kubectl apply -f - From f20094a3161038fb5df4551512cf11e980558da7 Mon Sep 17 00:00:00 2001 From: Shlomit Koyfman Date: Wed, 3 May 2023 15:11:11 +0000 Subject: [PATCH 5/5] typo Signed-off-by: Shlomit Koyfman --- fybrik/README_mysql.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fybrik/README_mysql.md b/fybrik/README_mysql.md index aee681d..256b064 100644 --- a/fybrik/README_mysql.md +++ b/fybrik/README_mysql.md @@ -156,7 +156,7 @@ Repeat steps 1-4 above. ``` > _NOTE:_ See the note in step 9 above. -1. Run the following commands to exceute a write command: +1. Run the following commands to execute a write command: ```bash export CATALOGED_ASSET=fybrik-airbyte-sample/userdata export ENDPOINT_HOSTNAME=$(kubectl get fybrikapplication my-app-write -n fybrik-airbyte-sample -o "jsonpath={.status.assetStates.${CATALOGED_ASSET}.endpoint.fybrik-arrow-flight.hostname}") @@ -215,7 +215,7 @@ Repeat steps 1-4 above. ``` > _NOTE:_ See the note in [Reading a Dataset by a Fybrik Application](#reading-a-dataset-by-a-fybrik-application). -1. Run the following commands to exceute a write command: +1. Run the following commands to execute a write command: ```bash export ASSET_NAME=newdata export ENDPOINT_HOSTNAME=$(kubectl get fybrikapplication my-app-write -n fybrik-airbyte-sample -o "jsonpath={.status.assetStates.${ASSET_NAME}.endpoint.fybrik-arrow-flight.hostname}")