From e373ea22135e0c1c2fc6406cb594a7c61c7c934a Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 27 Sep 2019 14:47:46 -0700 Subject: [PATCH 01/34] forward list compat --- ...ParquetListFormatForwardCompatibility.java | 349 ++++++++++++++++ .../parquet/scrooge/ScroogeReadSupport.java | 7 +- .../scrooge/ScroogeReadSupportTests.scala | 378 ++++++++++++++++++ 3 files changed, 730 insertions(+), 4 deletions(-) create mode 100644 scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java new file mode 100644 index 0000000000..13392d59b0 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java @@ -0,0 +1,349 @@ +package com.twitter.scalding.parquet.scrooge; + +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Stack; + +/** + * Compatibility class to convert parquet schema of legacy type to standard one + * namely 3-level list structure as recommended in + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + * + * More specifically this handles converting from parquet file created by + * {{@code org.apache.parquet.thrift.ThriftSchemaConvertVisitor}} which always suffix + * list element with "_tuple". 
+ */ +public class ParquetListFormatForwardCompatibility { + + private static List RULES = Arrays.asList( + new RulePrimitiveElement(), + new RulePrimitiveArray(), + + new RuleGroupElement(), + new RuleGroupArray(), + + new RuleGroupTuple(), + new RuleStandardThreeLevel()); + + /** + * Rule describes how to match a repeated type, how to decompose them, and reconstruct a + * repeated type. + */ + abstract static public class Rule { + public Type elementType(Type repeatedType) { + if (repeatedType.isPrimitive()) { + return repeatedType; + } else { + return firstField(repeatedType.asGroupType()); + } + } + + public Boolean isElementRequired(Type repeatedType) { + return true; + } + + public String elementName(Type repeatedType) { + return this.elementType(repeatedType).getName(); + } + + public OriginalType elementOriginalType(Type repeatedType) { + return this.elementType(repeatedType).getOriginalType(); + } + + abstract Boolean check(Type type); + + abstract Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType); + + } + + static class RulePrimitiveElement extends Rule { + + public String constantElementName() { + return "element"; + } + + @Override + public Boolean check(Type repeatedType) { + return repeatedType.isPrimitive() && repeatedType.getName().equals(this.constantElementName()); + } + + @Override + public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { + if (!isElementRequired) { + throw new IllegalArgumentException("Rule 1 can only take required element"); + } + if (!type.isPrimitive()) { + throw new IllegalArgumentException( + String.format("Rule 1 cannot take primitive type, but is given %s", type)); + } + return new PrimitiveType( + Type.Repetition.REPEATED, + type.asPrimitiveType().getPrimitiveTypeName(), + this.constantElementName(), + originalType + ); + } + + } + + static class RulePrimitiveArray extends RulePrimitiveElement { + 
@Override + public String constantElementName() { + return "array"; + } + } + + static class RuleGroupElement extends Rule { + public String constantElementName() { + return "element"; + } + + public Type elementType(Type repeatedType) { + return repeatedType; + } + + @Override + public String elementName(Type repeatedType) { + return this.constantElementName(); + } + + @Override + public Boolean check(Type repeatedType) { + if (repeatedType.isPrimitive()) { + return false; + } else { + GroupType repeatedGroup = repeatedType.asGroupType(); + return repeatedGroup.getFields().size() > 0 && repeatedGroup.getName().equals(this.constantElementName()); + } + } + + @Override + public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { + if (type.isPrimitive()) { + return new GroupType( + Type.Repetition.REPEATED, + this.constantElementName(), + type + ); + } else { + return new GroupType( + Type.Repetition.REPEATED, + this.constantElementName(), + type.asGroupType().getFields() + ); + } + } + } + + static class RuleGroupArray extends RuleGroupElement { + @Override + public String constantElementName() { + return "array"; + } + } + + static class RuleGroupTuple extends Rule { + + @Override + public Boolean check(Type repeatedType) { + return repeatedType.getName().endsWith("_tuple"); + } + + public Type elementType(Type repeatedType) { + return repeatedType; + } + + @Override + public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { + if (!type.isPrimitive()) { + throw new IllegalArgumentException(String.format( + "Rule 3 can only take group type, but found %s", type)); + } + if (!name.endsWith("_tuple")) { + name = name + "_tuple"; + } + return new PrimitiveType( + Type.Repetition.REPEATED, + type.asPrimitiveType().getPrimitiveTypeName(), + name, + originalType + ); + } + } + + static class RuleStandardThreeLevel extends Rule { + @Override + public 
Boolean check(Type repeatedField) { + if (repeatedField.isPrimitive() || !repeatedField.getName().equals("list")) { + return false; + } + Type elementType = firstField(repeatedField.asGroupType()); + return elementType.getName().equals("element"); + } + + @Override + public String elementName(Type repeatedType) { + return "element"; + } + + @Override + public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { + Type elementType = null; + if (type.isPrimitive()) { + elementType = new PrimitiveType( + isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, + type.asPrimitiveType().getPrimitiveTypeName(), + "element", + originalType + ); + } else { + elementType = new GroupType( + isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, + "element", + // we cannot flatten `list` + type.asGroupType().getName().equals("list") ? + Arrays.asList(type) : + type.asGroupType().getFields() + ); + } + + return new GroupType( + Type.Repetition.REPEATED, + "list", + Arrays.asList(elementType) + ); + } + } + + private static org.apache.parquet.schema.Type firstField(GroupType groupType) { + return groupType.getFields().get(0); + } + + private static boolean isGroupList(Type projection) { + if (projection.isPrimitive()) { + return false; + } + GroupType groupProjection = projection.asGroupType(); + return groupProjection.getOriginalType() == OriginalType.LIST && + groupProjection.getFieldCount() == 1 && + groupProjection.getFields().get(0).isRepetition(Type.Repetition.REPEATED); + } + + /** + * Resolve list format in forward compatible way. + * @param fileType file type which has new format + * @param projection projection type which has legacy format + * @return projection schema in the new format. 
+ */ + public Type resolveTypeFormat(Type fileType, Type projection) { + if (projection.isPrimitive() || fileType.isPrimitive()) { + return projection; + } + ParquetListFormatForwardCompatibility compatibility = new ParquetListFormatForwardCompatibility(); + + GroupType groupFile = fileType.asGroupType(); + GroupType groupProjection = projection.asGroupType(); + + GroupUnwrapped unwrappedFile = unwrapGroup(groupFile, new Stack()); + GroupUnwrapped unwrappedProjection = unwrapGroup(groupProjection, new Stack()); + + Type repeatedFile = unwrappedFile.repeatedType; + Type repeatedProjection = unwrappedProjection.repeatedType; + + if (repeatedProjection != null && repeatedFile != null) { + // Recurse on the repeated content. This is to handle nested list + Type repeatedResolved = resolveTypeFormat(repeatedFile, repeatedProjection); + // Make projected structure compatible with file type + Type repeatedFormatted = compatibility + .makeForwardCompatible(repeatedFile, repeatedResolved); + + // Wrap back the groups, this contain field name and whether it's optional/required + Type resolvedGroupType = repeatedFormatted; + while (!unwrappedProjection.wrappers.isEmpty()) { + resolvedGroupType = unwrappedProjection.wrappers.pop().withNewFields(resolvedGroupType); + } + return resolvedGroupType; + } else { + List fields = new ArrayList(); + for (Type projected : groupProjection.getFields()) { + if (!projected.isPrimitive()) { + // The file type field must be a group type too + int fieldIndex = groupFile.getFieldIndex(projected.getName()); + Type fileField = groupFile.getFields().get(fieldIndex); + fields.add(resolveTypeFormat(fileField.asGroupType(), projected.asGroupType())); + } else { + fields.add(projected); + } + } + return groupProjection.withNewFields(fields); + } + } + + private Rule findFirstRule(Type repeatedType, String debuggingTypeSource) { + Rule matchedRule = null; + for (Rule rule : RULES) { + if (rule.check(repeatedType)) { + matchedRule = rule; + break; + } + 
} + if (matchedRule == null) { + throw new RuntimeException(String.format( + "Unable to find matching rule for %s schema:\n%s", debuggingTypeSource, repeatedType)); + } + return matchedRule; + } + + private Type makeForwardCompatible(Type repeatedFileType, Type repeatedProjectedType) { + Rule fileTypeRule = findFirstRule(repeatedFileType, "file"); + Rule projectedTypeRule = findFirstRule(repeatedProjectedType, "projected"); + + if (projectedTypeRule == fileTypeRule) { + return repeatedProjectedType; + } + + String elementName = projectedTypeRule.elementName(repeatedProjectedType); + Type elementType = projectedTypeRule.elementType(repeatedProjectedType); + Boolean isElementRequired = projectedTypeRule.isElementRequired(repeatedProjectedType); + OriginalType elementOriginalType = projectedTypeRule.elementOriginalType(repeatedProjectedType); + + return fileTypeRule.createCompliantRepeatedType( + elementType, + elementName, + isElementRequired, + elementOriginalType); + } + + private static class GroupUnwrapped { + Stack wrappers; + Type repeatedType; + + public GroupUnwrapped(Stack wrappers, Type repeatedType) { + this.wrappers = wrappers; + this.repeatedType = repeatedType; + } + } + + private static GroupUnwrapped unwrapGroup(Type type, Stack wrappers) { + Type ptr = type; + // only wrapper for list with size one, so we can wrap repeated type later + while (!ptr.isPrimitive()) { + wrappers.push(ptr.asGroupType()); + if (isGroupList(ptr)) { + // when it is repeated + return new GroupUnwrapped(wrappers, ptr.asGroupType().getFields().get(0)); + } else if (ptr.asGroupType().getFields().size() == 1){ + ptr = ptr.asGroupType().getFields().get(0); + } else { + break; + } + } + return new GroupUnwrapped(wrappers, null); + } +} diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index 12854bbd41..efd72920c1 100644 
--- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -37,9 +37,7 @@ import org.apache.parquet.thrift.projection.ThriftProjectionException; import org.apache.parquet.thrift.struct.ThriftType; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; /** * Read support for Scrooge @@ -132,7 +130,8 @@ public static MessageType getSchemaForRead(MessageType fileMessageType, String p */ public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { assertGroupsAreCompatible(fileMessageType, projectedMessageType); - return projectedMessageType; + Type resolved = new ParquetListFormatForwardCompatibility().resolveTypeFormat(fileMessageType, projectedMessageType); + return new MessageType(projectedMessageType.getName(), resolved.asGroupType().getFields()); } /** diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala index 5e8ed5b541..fd7cbeb4a4 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala @@ -90,6 +90,384 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl } } + "ScroogeReadSupport resolving list format" should { + "resolve list legacy format: project x_tuple to legacy array" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated binary array (UTF8); + | } + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( 
+ """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary country_codes_tuple (UTF8); + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary array (UTF8); + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list legacy format: project x_tuple to 3-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary country_codes_tuple (UTF8); + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list legacy format: project nested x_tuple to nested legacy array" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group foo (LIST) { + | repeated group array (LIST) { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + |} + 
""".stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group array { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list legacy format: project nested x_tuple to nested array" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group foo (LIST) { + | repeated group array (LIST) { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group array { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list in group legacy format: project x_tuple to nested 3-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group foo (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + |} + """.stripMargin) + val projected = 
ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve: binary array to 3-level nesting" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | required int32 x; + |} + """.stripMargin) + + // inner list is `binary array` + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary array (UTF8); + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + + } + + "resolve: identity 3-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | required int32 x; + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, fileType) + projected shouldEqual fileType + + } + + "resolve nested list: project inner legacy array to 3-level nesting" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required 
binary element (UTF8); + | } + | } + | } + | } + | required int32 x; + |} + """.stripMargin) + + // inner list is `binary array` + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated binary array (UTF8); + | } + | } + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve projected struct in list: repeated group element to 3-level nesting" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required group element { + | required binary foo (UTF8); + | required binary bar (UTF8); + | required binary zing (UTF8); + | } + | } + | } + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group element { + | optional binary foo (UTF8); + | required binary zing (UTF8); + | } + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group list { + | required group element { + | optional binary foo (UTF8); + | required binary zing (UTF8); + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve standard 
3-level list to 2-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message scalding_schema { + | required group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated binary array (UTF8); + | } + | } + | } + | required int32 x; + |} + """.stripMargin) + + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated binary array (UTF8); + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + } + "ScroogeReadSupport" should { "write using typedparquet and read using parquet scrooge" in { HadoopPlatformJobTest(new WriteToTypedParquetTupleJob(_), cluster) From 721a490a3e2b93c21e5fb7b9a4755ba1c614970a Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 27 Sep 2019 17:14:17 -0700 Subject: [PATCH 02/34] bump travis to openjdk8 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 42137d4c4f..2a5218f60d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: scala -jdk: oraclejdk8 +jdk: openjdk8 sudo: false before_install: From 0adbfaea34f5a22331490ad9a87f74646497a1b5 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Sun, 29 Sep 2019 16:49:15 -0700 Subject: [PATCH 03/34] fail when projecting on list of struct --- .../scrooge/ScroogeReadSupportTests.scala | 97 ++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git 
a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala index fd7cbeb4a4..86cf1069b9 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala @@ -260,7 +260,7 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl |message SampleProjection { | optional group foo (LIST) { | repeated group list { - | required group element { + | required group element (LIST) { | repeated group list { | required binary element (UTF8); | } @@ -466,6 +466,101 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl """.stripMargin) projected shouldEqual expected } + + "resolve list in group containing list" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | optional group connect_delays (LIST) { + | repeated group list { + | required group element { + | optional binary description (UTF8); + | optional binary created_by (UTF8); + | optional group currencies (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group connect_delays (LIST) { + | repeated group connect_delays_tuple { + | optional binary description (UTF8); + | optional group currencies (LIST) { + | repeated binary currencies_tuple (UTF8); + | } + | } + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group connect_delays (LIST) { + | repeated group 
list { + | required group element { + | optional binary description (UTF8); + | optional group currencies (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve projection of different level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | required binary zing (UTF8); + | required binary bar (UTF8); + | } + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | optional binary zing (UTF8); + | } + | } + | } + |} + """.stripMargin) + val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | optional binary zing (UTF8); + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } } "ScroogeReadSupport" should { From 84c119ab81a7ace3e0d54fd51e73ddd272b50622 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Sun, 29 Sep 2019 16:50:07 -0700 Subject: [PATCH 04/34] recurse only on elements --- ...ParquetListFormatForwardCompatibility.java | 141 +++++++++++++++--- 1 file changed, 118 insertions(+), 23 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java index 13392d59b0..34acd429c3 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java +++ 
b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java @@ -36,17 +36,9 @@ public class ParquetListFormatForwardCompatibility { * repeated type. */ abstract static public class Rule { - public Type elementType(Type repeatedType) { - if (repeatedType.isPrimitive()) { - return repeatedType; - } else { - return firstField(repeatedType.asGroupType()); - } - } + abstract public Type elementType(Type repeatedType); - public Boolean isElementRequired(Type repeatedType) { - return true; - } + abstract Boolean isElementRequired(Type repeatedType); public String elementName(Type repeatedType) { return this.elementType(repeatedType).getName(); @@ -63,11 +55,27 @@ public OriginalType elementOriginalType(Type repeatedType) { } static class RulePrimitiveElement extends Rule { + /** + * repeated int32 element; + */ public String constantElementName() { return "element"; } + public Type elementType(Type repeatedType) { + return repeatedType; + } + + @Override + Boolean isElementRequired(Type repeatedType) { + // According to Rule 1 from, + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + // "the repeated field is not a group, + // then its type is the element type and elements are required." 
+ return true; + } + @Override public Boolean check(Type repeatedType) { return repeatedType.isPrimitive() && repeatedType.getName().equals(this.constantElementName()); @@ -93,6 +101,10 @@ public Type createCompliantRepeatedType(Type type, String name, Boolean isElemen } static class RulePrimitiveArray extends RulePrimitiveElement { + /** + * repeated binary array (UTF8); + */ + @Override public String constantElementName() { return "array"; @@ -100,10 +112,24 @@ public String constantElementName() { } static class RuleGroupElement extends Rule { + /** + * repeated group element { + * required binary str (UTF8); + * required int32 num; + * }; + */ public String constantElementName() { return "element"; } + public Boolean isElementRequired(Type repeatedType) { + // According Rule 2 from + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + // "If the repeated field is a group with multiple fields, + // then its type is the element type and elements are required." 
+ return true; + } + public Type elementType(Type repeatedType) { return repeatedType; } @@ -159,25 +185,46 @@ public Type elementType(Type repeatedType) { return repeatedType; } + @Override + Boolean isElementRequired(Type repeatedType) { + // According to Rule 3 from + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + return true; + } + @Override public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { - if (!type.isPrimitive()) { - throw new IllegalArgumentException(String.format( - "Rule 3 can only take group type, but found %s", type)); - } + if (!name.endsWith("_tuple")) { name = name + "_tuple"; } - return new PrimitiveType( - Type.Repetition.REPEATED, - type.asPrimitiveType().getPrimitiveTypeName(), - name, - originalType - ); + if (type.isPrimitive()) { + return new PrimitiveType( + Type.Repetition.REPEATED, + type.asPrimitiveType().getPrimitiveTypeName(), + name, + originalType + ); + } else { + return new GroupType( + Type.Repetition.REPEATED, + name, + OriginalType.LIST, + type.asGroupType().getFields() + ); + } } } static class RuleStandardThreeLevel extends Rule { + /** + * group (LIST) { + * repeated group list { + * element; + * } + * } + */ + @Override public Boolean check(Type repeatedField) { if (repeatedField.isPrimitive() || !repeatedField.getName().equals("list")) { @@ -187,6 +234,16 @@ public Boolean check(Type repeatedField) { return elementType.getName().equals("element"); } + @Override + public Type elementType(Type repeatedType) { + return firstField(repeatedType.asGroupType()); + } + + @Override + Boolean isElementRequired(Type repeatedType) { + return elementType(repeatedType).getRepetition() == Type.Repetition.REQUIRED; + } + @Override public String elementName(Type repeatedType) { return "element"; @@ -194,7 +251,8 @@ public String elementName(Type repeatedType) { @Override public Type createCompliantRepeatedType(Type 
type, String name, Boolean isElementRequired, OriginalType originalType) { - Type elementType = null; + + Type elementType; if (type.isPrimitive()) { elementType = new PrimitiveType( isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, @@ -203,9 +261,11 @@ public Type createCompliantRepeatedType(Type type, String name, Boolean isElemen originalType ); } else { + elementType = new GroupType( isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, "element", + isGroupList(type) ? OriginalType.LIST: null, // we cannot flatten `list` type.asGroupType().getName().equals("list") ? Arrays.asList(type) : @@ -235,6 +295,23 @@ private static boolean isGroupList(Type projection) { groupProjection.getFields().get(0).isRepetition(Type.Repetition.REPEATED); } + public Type elementType(Type repeatedType, String debuggingTypeSource) { + Rule fileTypeRule = findFirstRule(repeatedType, debuggingTypeSource); + return fileTypeRule.elementType(repeatedType); + } + + public Type wrap(Type repeatedType, Type elementType) { + Rule projectedTypeRule = findFirstRule(repeatedType, "projection"); + + return projectedTypeRule.createCompliantRepeatedType( + elementType, + elementType.getName(), + // if repeated or required, it is required + !elementType.isRepetition(Type.Repetition.OPTIONAL), + elementType.getOriginalType()); + } + + /** * Resolve list format in forward compatible way. * @param fileType file type which has new format @@ -257,8 +334,26 @@ public Type resolveTypeFormat(Type fileType, Type projection) { Type repeatedProjection = unwrappedProjection.repeatedType; if (repeatedProjection != null && repeatedFile != null) { - // Recurse on the repeated content. This is to handle nested list - Type repeatedResolved = resolveTypeFormat(repeatedFile, repeatedProjection); + // Repeated types cannot be recursed yet, because file and projection might have + // format-specific wrappers. Instead, we need to extract its element type first. + // Eg. 
without unwrapping `repeated` layer, we will only find `element` field in the file type + // File type: | Projection type: + // optional group foo (LIST) { | optional group foo (LIST) { + // repeated group list { | repeated group foo_tuple { + // required group element { | optional binary zing (UTF8); + // required binary zing (UTF8); | optional binary bar (UTF8); + // required binary bar (UTF8); | } + // } | } + // } + // } + Type elementFile = compatibility.elementType(repeatedFile, "file"); + Type elementProjection = compatibility.elementType(repeatedProjection, "projection"); + + // Recurse on the element. This is to handle nested list + Type elementResolved = resolveTypeFormat(elementFile, elementProjection); + // Wrap + Type repeatedResolved = compatibility.wrap(repeatedProjection, elementResolved); + // Make projected structure compatible with file type Type repeatedFormatted = compatibility .makeForwardCompatible(repeatedFile, repeatedResolved); From 6bf7652bb13bd6273b28492b27f7e6c3e032dab0 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Sun, 29 Sep 2019 17:21:40 -0700 Subject: [PATCH 05/34] check for non-optional extra fields for projection --- .../ParquetListFormatForwardCompatibility.java | 16 +++++++++++----- .../scrooge/ScroogeReadSupportTests.scala | 6 ++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java index 34acd429c3..9786d0b070 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java @@ -4,6 +4,7 @@ import org.apache.parquet.schema.OriginalType; import 
org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.apache.parquet.thrift.DecodingSchemaMismatchException; import java.util.ArrayList; import java.util.Arrays; @@ -367,13 +368,18 @@ public Type resolveTypeFormat(Type fileType, Type projection) { } else { List fields = new ArrayList(); for (Type projected : groupProjection.getFields()) { - if (!projected.isPrimitive()) { - // The file type field must be a group type too + if (!groupFile.containsField(projected.getName())) { + if (!projected.isRepetition(Type.Repetition.OPTIONAL)) { + throw new DecodingSchemaMismatchException( + String.format("Found non-optional projection field:\n%s\n\n" + + "not present in the given file type:\n%s", + projected, groupFile)); + } + fields.add(projected); + } else { int fieldIndex = groupFile.getFieldIndex(projected.getName()); Type fileField = groupFile.getFields().get(fieldIndex); - fields.add(resolveTypeFormat(fileField.asGroupType(), projected.asGroupType())); - } else { - fields.add(projected); + fields.add(resolveTypeFormat(fileField, projected)); } } return groupProjection.withNewFields(fields); diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala index 86cf1069b9..ab53c9695b 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala @@ -540,7 +540,8 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl | optional group foo (LIST) { | repeated group list { | required group element { - | optional binary zing (UTF8); + | optional binary bar (UTF8); + | required binary zing (UTF8); | } | } | } @@ -553,7 +554,8 @@ class ScroogeReadSupportTests extends WordSpec with 
Matchers with HadoopSharedPl | optional group foo (LIST) { | repeated group list { | required group element { - | optional binary zing (UTF8); + | optional binary bar (UTF8); + | required binary zing (UTF8); | } | } | } From 018f2677b699e350af623c5a4f398b154f6e87a9 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Mon, 30 Sep 2019 05:13:12 -0700 Subject: [PATCH 06/34] move to scala --- ...ParquetListFormatForwardCompatibility.java | 75 ++++---- .../parquet/scrooge/ScroogeReadSupport.java | 5 +- ...CollectionFormatForwardCompatibility.scala | 137 ++++++++++++++ .../scrooge/ParquetListFormatRule.scala | 176 ++++++++++++++++++ 4 files changed, 358 insertions(+), 35 deletions(-) create mode 100644 scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala create mode 100644 scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java index 9786d0b070..34356a79aa 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java @@ -49,9 +49,9 @@ public OriginalType elementOriginalType(Type repeatedType) { return this.elementType(repeatedType).getOriginalType(); } - abstract Boolean check(Type type); + abstract Boolean check(Type typ); - abstract Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType); + abstract Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType); } @@ -83,17 +83,17 @@ public Boolean 
check(Type repeatedType) { } @Override - public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { + public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { if (!isElementRequired) { throw new IllegalArgumentException("Rule 1 can only take required element"); } - if (!type.isPrimitive()) { + if (!typ.isPrimitive()) { throw new IllegalArgumentException( - String.format("Rule 1 cannot take primitive type, but is given %s", type)); + String.format("Rule 1 cannot take primitive type, but is given %s", typ)); } return new PrimitiveType( Type.Repetition.REPEATED, - type.asPrimitiveType().getPrimitiveTypeName(), + typ.asPrimitiveType().getPrimitiveTypeName(), this.constantElementName(), originalType ); @@ -151,18 +151,18 @@ public Boolean check(Type repeatedType) { } @Override - public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { - if (type.isPrimitive()) { + public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { + if (typ.isPrimitive()) { return new GroupType( Type.Repetition.REPEATED, this.constantElementName(), - type + typ ); } else { return new GroupType( Type.Repetition.REPEATED, this.constantElementName(), - type.asGroupType().getFields() + typ.asGroupType().getFields() ); } } @@ -194,15 +194,14 @@ Boolean isElementRequired(Type repeatedType) { } @Override - public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { - + public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { if (!name.endsWith("_tuple")) { name = name + "_tuple"; } - if (type.isPrimitive()) { + if (typ.isPrimitive()) { return new PrimitiveType( Type.Repetition.REPEATED, - type.asPrimitiveType().getPrimitiveTypeName(), + 
typ.asPrimitiveType().getPrimitiveTypeName(), name, originalType ); @@ -211,7 +210,7 @@ public Type createCompliantRepeatedType(Type type, String name, Boolean isElemen Type.Repetition.REPEATED, name, OriginalType.LIST, - type.asGroupType().getFields() + typ.asGroupType().getFields() ); } } @@ -251,29 +250,27 @@ public String elementName(Type repeatedType) { } @Override - public Type createCompliantRepeatedType(Type type, String name, Boolean isElementRequired, OriginalType originalType) { + public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { Type elementType; - if (type.isPrimitive()) { + if (typ.isPrimitive()) { elementType = new PrimitiveType( isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, - type.asPrimitiveType().getPrimitiveTypeName(), + typ.asPrimitiveType().getPrimitiveTypeName(), "element", originalType ); } else { - elementType = new GroupType( isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, "element", - isGroupList(type) ? OriginalType.LIST: null, + isGroupList(typ) ? OriginalType.LIST: null, // we cannot flatten `list` - type.asGroupType().getName().equals("list") ? - Arrays.asList(type) : - type.asGroupType().getFields() + typ.asGroupType().getName().equals("list") ? 
+ Arrays.asList(typ) : + typ.asGroupType().getFields() ); } - return new GroupType( Type.Repetition.REPEATED, "list", @@ -296,6 +293,21 @@ private static boolean isGroupList(Type projection) { groupProjection.getFields().get(0).isRepetition(Type.Repetition.REPEATED); } + private static boolean isGroupMap(Type projection) { + if (projection.isPrimitive()) { + return false; + } + GroupType groupProjection = projection.asGroupType(); + return groupProjection.getOriginalType() == OriginalType.MAP && + groupProjection.getFieldCount() == 1 && + groupProjection.getFields().get(0).isRepetition(Type.Repetition.REPEATED) && + ( + (groupProjection.getFields().get(0).getName().equals("map") && + groupProjection.getFields().get(0).getOriginalType() == OriginalType.MAP_KEY_VALUE) + || groupProjection.getFields().get(0).getName().equals("key_value") + ); + } + public Type elementType(Type repeatedType, String debuggingTypeSource) { Rule fileTypeRule = findFirstRule(repeatedType, debuggingTypeSource); return fileTypeRule.elementType(repeatedType); @@ -312,7 +324,6 @@ public Type wrap(Type repeatedType, Type elementType) { elementType.getOriginalType()); } - /** * Resolve list format in forward compatible way. 
* @param fileType file type which has new format @@ -369,12 +380,10 @@ public Type resolveTypeFormat(Type fileType, Type projection) { List fields = new ArrayList(); for (Type projected : groupProjection.getFields()) { if (!groupFile.containsField(projected.getName())) { - if (!projected.isRepetition(Type.Repetition.OPTIONAL)) { - throw new DecodingSchemaMismatchException( - String.format("Found non-optional projection field:\n%s\n\n" + - "not present in the given file type:\n%s", - projected, groupFile)); - } + // This can happen when + // 1) projecting optional field over non-existent target schema + // 2) field is a part of legacy map format + // Make no assertions (separation of responsibility) and just include it fields.add(projected); } else { int fieldIndex = groupFile.getFieldIndex(projected.getName()); @@ -431,8 +440,8 @@ public GroupUnwrapped(Stack wrappers, Type repeatedType) { } } - private static GroupUnwrapped unwrapGroup(Type type, Stack wrappers) { - Type ptr = type; + private static GroupUnwrapped unwrapGroup(Type typ, Stack wrappers) { + Type ptr = typ; // only wrapper for list with size one, so we can wrap repeated type later while (!ptr.isPrimitive()) { wrappers.push(ptr.asGroupType()); diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index efd72920c1..922cb04d6c 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -130,8 +130,9 @@ public static MessageType getSchemaForRead(MessageType fileMessageType, String p */ public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { assertGroupsAreCompatible(fileMessageType, projectedMessageType); - Type resolved = new 
ParquetListFormatForwardCompatibility().resolveTypeFormat(fileMessageType, projectedMessageType); - return new MessageType(projectedMessageType.getName(), resolved.asGroupType().getFields()); + return ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage( + projectedMessageType, + fileMessageType); } /** diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala new file mode 100644 index 0000000000..a55f49078f --- /dev/null +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -0,0 +1,137 @@ +package com.twitter.scalding.parquet.scrooge + +import java.util + +import org.apache.parquet.schema.{GroupType, MessageType, Type} +import org.slf4j.LoggerFactory + +private[scrooge] case class GroupUnwrapped(wrappers: Seq[GroupType], repeatedType: Type) + +object ParquetCollectionFormatForwardCompatibility { + + private val LOGGER = LoggerFactory.getLogger(getClass) + + private val SOURCE_LIST_RULES = List( + PrimitiveElementRule, PrimitiveArrayRule, + GroupElementRule, GroupArrayRule, + TupleRule, StandardRule + ) + + private val TARGET_LIST_RULES = SOURCE_LIST_RULES.filterNot(_ == TupleRule) + + /** + * Create a forward-compatible schema, using content from source type with format from target type. 
+ * @param sourceType source type with legacy format + * @param targetType target type to which source is converted to + */ + def forwardCompatibleMessage(sourceType: MessageType, targetType: MessageType): MessageType = { + val groupResult = forwardCompatibleType(sourceType, targetType).asGroupType() + LOGGER.info("Making source schema to be compatible with target" + + s"\nSource:\n${sourceType}\nTarget:\n${targetType}\nResult:\n${groupResult}") + new MessageType(groupResult.getName, groupResult.getFields) + } + + private def wrapElementAsRepeatedType(rule: ParquetListFormatRule, repeatedType: Type, elementType: Type): Type = { + rule.createCompliantRepeatedType( + elementType, + rule.elementName(repeatedType), + // if repeated or required, it is required + !elementType.isRepetition(Type.Repetition.OPTIONAL), elementType.getOriginalType) + } + + private def forwardCompatibleType(sourceType: Type, targetType: Type): Type = { + if (sourceType.isPrimitive || targetType.isPrimitive) { + return sourceType + } + + val sourceUnwrapped = unwrapGroup(sourceType.asGroupType) + val targetUnwrapped = unwrapGroup(targetType.asGroupType) + + if (targetUnwrapped.isDefined && sourceUnwrapped.isDefined) { + val targetRepeated = targetUnwrapped.get.repeatedType + val sourceRepeated = sourceUnwrapped.get.repeatedType + + val sourceRule = findFirstRule(SOURCE_LIST_RULES, sourceRepeated, "source") + val targetRule = findFirstRule(TARGET_LIST_RULES, targetRepeated, "target") + + val repeatedFormatted = (sourceRule, targetRule) match { + case (Some(sRule), Some(tRule)) => { + val sourceElement = sRule.elementType(sourceRepeated) + val targetElement = tRule.elementType(targetRepeated) + // Recurse on the element. 
This is to handle nested list + val forwardCompatElement = forwardCompatibleType(sourceElement, targetElement) + // Wrap the solved element with current source structure, and do actual conversion work + val repeatedResolved = wrapElementAsRepeatedType(sRule, sourceRepeated, forwardCompatElement) + convertToTarget(targetRepeated, repeatedResolved) + } + case _ => sourceRepeated // No-op + } + + // Wrapped the solved repeated type in its original groups, + // describing field name and whether it's optional/required + sourceUnwrapped.get.wrappers.foldRight(repeatedFormatted) { + (wrapper, group) => wrapper.withNewFields(group) + } + } else { + val sourceGroup = sourceType.asGroupType + val targetGroup = targetType.asGroupType + + val resultFields = new util.ArrayList[Type] + import scala.collection.JavaConversions._ + for (sourceField <- sourceGroup.getFields) { + if (!targetGroup.containsField(sourceField.getName)) { // This can happen when + // 1) projecting optional field over non-existent target schema + // 2) field is a part of legacy map format + // Make no assertions (separation of responsibility) and just include it + resultFields.add(sourceField) + } + else { + val fieldIndex = targetGroup.getFieldIndex(sourceField.getName) + val targetField = targetGroup.getFields.get(fieldIndex) + resultFields.add(forwardCompatibleType(sourceField, targetField)) + } + } + sourceGroup.withNewFields(resultFields) + } + } + + private def findFirstRule(rules: Seq[ParquetListFormatRule], + repeatedType: Type, + debuggingTypeSource: String): Option[ParquetListFormatRule] = { + val ruleFound = rules.find(rule => rule.check(repeatedType)) + if (ruleFound.isEmpty) { + LOGGER.warn(s"Unable to find matching rule for $debuggingTypeSource schema:\n$repeatedType") + } + ruleFound + } + + private def convertToTarget(repeatedTargetType: Type, repeatedSourceType: Type): Type = { + val sourceRuleMaybe = findFirstRule(SOURCE_LIST_RULES, repeatedSourceType, "source") + val targetRuleMaybe = 
findFirstRule(TARGET_LIST_RULES, repeatedTargetType, "target") + if (sourceRuleMaybe == targetRuleMaybe || sourceRuleMaybe.isEmpty || targetRuleMaybe.isEmpty) { + repeatedSourceType + } else { + val sourceRule = sourceRuleMaybe.get + val targetRule = targetRuleMaybe.get + val elementType = sourceRule.elementType(repeatedSourceType) + targetRule.createCompliantRepeatedType( + typ= elementType, + name= elementType.getName, + isElementRequired= sourceRule.isElementRequired(repeatedSourceType), + originalType= sourceRule.elementOriginalType(repeatedSourceType) + ) + } + } + + private def unwrapGroup(typ: Type, wrappers: Seq[GroupType] = Seq()): Option[GroupUnwrapped] = { + if (typ.isPrimitive || typ.asGroupType.getFieldCount != 1) { + None + } else { + if (ParquetListFormatRule.isGroupList(typ)) { // when it is repeated + Some(GroupUnwrapped(wrappers :+ typ.asGroupType(), typ.asGroupType.getFields.get(0))) + } else { + unwrapGroup(typ.asGroupType.getFields.get(0), wrappers :+ typ.asGroupType()) + } + } + } +} diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala new file mode 100644 index 0000000000..77cc646328 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala @@ -0,0 +1,176 @@ +package com.twitter.scalding.parquet.scrooge + +import java.util + +import org.apache.parquet.schema.{GroupType, OriginalType, PrimitiveType, Type} + +object ParquetListFormatRule { + def isGroupList(projection: Type): Boolean = { + if (projection.isPrimitive) return false + val groupProjection = projection.asGroupType + (groupProjection.getOriginalType eq OriginalType.LIST) && groupProjection.getFieldCount == 1 && groupProjection.getFields.get(0).isRepetition(Type.Repetition.REPEATED) + } +} + +private[scrooge] sealed trait ParquetListFormatRule { + 
def elementType(repeatedType: Type): Type + + def elementName(repeatedType: Type): String = this.elementType(repeatedType).getName + + def elementOriginalType(repeatedType: Type): OriginalType = this.elementType(repeatedType).getOriginalType + + private[scrooge] def isElementRequired(repeatedType: Type): Boolean + + private[scrooge] def check(typ: Type): Boolean + + private[scrooge] def createCompliantRepeatedType(typ: Type, + name: String, + isElementRequired: Boolean, + originalType: OriginalType): Type +} + +/** + * repeated int32 [element|array]; + */ +private[scrooge] sealed trait PrimitiveListRule extends ParquetListFormatRule { + + def constantElementName: String + + override def elementType(repeatedType: Type): Type = repeatedType + + override private[scrooge] def isElementRequired(repeatedType: Type) = { + // According to Rule 1 from, + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + // "the repeated field is not a group, + // then its type is the element type and elements are required." 
+ true + } + + override def check(repeatedType: Type): Boolean = + repeatedType.isPrimitive && repeatedType.getName == this.constantElementName + + override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { + if (!isElementRequired) throw new IllegalArgumentException("Primitive list format can only take required element") + if (!typ.isPrimitive) throw new IllegalArgumentException(String.format("Primitive list format cannot take group, but is given %s", typ)) + new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, this.constantElementName, originalType) + } +} + +object PrimitiveElementRule extends PrimitiveListRule { + override def constantElementName: String = "element" +} + +object PrimitiveArrayRule extends PrimitiveListRule { + override def constantElementName: String = "array" +} + +/** + * repeated group [element|array] { + * required binary str (UTF8); + * required int32 num; + * } + */ +private[scrooge] sealed trait GroupListRule extends ParquetListFormatRule { + + def constantElementName: String + + override def isElementRequired(repeatedType: Type): Boolean = { + // According Rule 2 from + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + // "If the repeated field is a group with multiple fields, + // then its type is the element type and elements are required." 
+ true + } + + override def elementType(repeatedType: Type): Type = repeatedType + + override def elementName(repeatedType: Type): String = this.constantElementName + + override def check(repeatedType: Type): Boolean = { + if (repeatedType.isPrimitive) false + else { + val groupType = repeatedType.asGroupType + groupType.getFields.size > 0 && groupType.getName == this.constantElementName + } + } + + override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { + if (typ.isPrimitive) new GroupType(Type.Repetition.REPEATED, this.constantElementName, typ) + else new GroupType(Type.Repetition.REPEATED, this.constantElementName, typ.asGroupType.getFields) + } +} + +object GroupElementRule extends GroupListRule { + override def constantElementName: String = "element" +} + +object GroupArrayRule extends GroupListRule { + override def constantElementName: String = "array" +} + +object TupleRule extends ParquetListFormatRule { + override def check(repeatedType: Type): Boolean = repeatedType.getName.endsWith("_tuple") + + override def elementName(repeatedType: Type): String = { + repeatedType.getName.substring(0, repeatedType.getName.length - 6) + } + + override def elementType(repeatedType: Type): Type = repeatedType + + override private[scrooge] def isElementRequired(repeatedType: Type) = { + // According to Rule 3 from + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + true + } + + override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { + val suffixed_name = name + "_tuple" + if (typ.isPrimitive) new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, suffixed_name, originalType) + else new GroupType(Type.Repetition.REPEATED, suffixed_name, OriginalType.LIST, typ.asGroupType.getFields) + } +} + +object StandardRule extends ParquetListFormatRule { 
+ /** + * repeated group list { + * element; + * } + */ + override def check(repeatedField: Type): Boolean = { + if (repeatedField.isPrimitive || !(repeatedField.getName == "list")) { + false + } else { + elementType(repeatedField).getName == "element" + } + } + + override def elementType(repeatedType: Type): Type = firstField(repeatedType.asGroupType) + + override private[scrooge] def isElementRequired(repeatedType: Type): Boolean = elementType(repeatedType).getRepetition eq Type.Repetition.REQUIRED + + override def elementName(repeatedType: Type): String = "element" + + override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { + + val repetition = if (isElementRequired) Type.Repetition.REQUIRED else Type.Repetition.OPTIONAL + val elementType = if (typ.isPrimitive) { + + new PrimitiveType(repetition, typ.asPrimitiveType.getPrimitiveTypeName, "element", originalType) + } else { + val listType = if (ParquetListFormatRule.isGroupList(typ)) OriginalType.LIST else null + new GroupType( + repetition, + "element", + listType, + // TODO: generalize this + // we cannot flatten `list` + if (typ.asGroupType.getName == "list") util.Arrays.asList(typ) else typ.asGroupType.getFields) + } + + new GroupType(Type.Repetition.REPEATED, "list", util.Arrays.asList(elementType)) + } + + private def firstField(groupType: GroupType): Type = { + groupType.getFields.get(0) + } +} From ae89c067da33b55f9a41baf62fec5137dcf59dcd Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Mon, 30 Sep 2019 05:21:38 -0700 Subject: [PATCH 07/34] migrate test to different class --- ...ParquetListFormatForwardCompatibility.java | 459 ------------- .../parquet/scrooge/ScroogeReadSupport.java | 4 +- ...ctionFormatForwardCompatibilityTests.scala | 602 ++++++++++++++++++ .../scrooge/ScroogeReadSupportTests.scala | 477 +------------- 4 files changed, 605 insertions(+), 937 deletions(-) delete mode 100644 
scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java create mode 100644 scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java deleted file mode 100644 index 34356a79aa..0000000000 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetListFormatForwardCompatibility.java +++ /dev/null @@ -1,459 +0,0 @@ -package com.twitter.scalding.parquet.scrooge; - -import org.apache.parquet.schema.GroupType; -import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; -import org.apache.parquet.thrift.DecodingSchemaMismatchException; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Stack; - -/** - * Compatibility class to convert parquet schema of legacy type to standard one - * namely 3-level list structure as recommended in - * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists - * - * More specifically this handles converting from parquet file created by - * {{@code org.apache.parquet.thrift.ThriftSchemaConvertVisitor}} which always suffix - * list element with "_tuple". - */ -public class ParquetListFormatForwardCompatibility { - - private static List RULES = Arrays.asList( - new RulePrimitiveElement(), - new RulePrimitiveArray(), - - new RuleGroupElement(), - new RuleGroupArray(), - - new RuleGroupTuple(), - new RuleStandardThreeLevel()); - - /** - * Rule describes how to match a repeated type, how to decompose them, and reconstruct a - * repeated type. 
- */ - abstract static public class Rule { - abstract public Type elementType(Type repeatedType); - - abstract Boolean isElementRequired(Type repeatedType); - - public String elementName(Type repeatedType) { - return this.elementType(repeatedType).getName(); - } - - public OriginalType elementOriginalType(Type repeatedType) { - return this.elementType(repeatedType).getOriginalType(); - } - - abstract Boolean check(Type typ); - - abstract Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType); - - } - - static class RulePrimitiveElement extends Rule { - /** - * repeated int32 element; - */ - - public String constantElementName() { - return "element"; - } - - public Type elementType(Type repeatedType) { - return repeatedType; - } - - @Override - Boolean isElementRequired(Type repeatedType) { - // According to Rule 1 from, - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules - // "the repeated field is not a group, - // then its type is the element type and elements are required." 
- return true; - } - - @Override - public Boolean check(Type repeatedType) { - return repeatedType.isPrimitive() && repeatedType.getName().equals(this.constantElementName()); - } - - @Override - public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { - if (!isElementRequired) { - throw new IllegalArgumentException("Rule 1 can only take required element"); - } - if (!typ.isPrimitive()) { - throw new IllegalArgumentException( - String.format("Rule 1 cannot take primitive type, but is given %s", typ)); - } - return new PrimitiveType( - Type.Repetition.REPEATED, - typ.asPrimitiveType().getPrimitiveTypeName(), - this.constantElementName(), - originalType - ); - } - - } - - static class RulePrimitiveArray extends RulePrimitiveElement { - /** - * repeated binary array (UTF8); - */ - - @Override - public String constantElementName() { - return "array"; - } - } - - static class RuleGroupElement extends Rule { - /** - * repeated group element { - * required binary str (UTF8); - * required int32 num; - * }; - */ - public String constantElementName() { - return "element"; - } - - public Boolean isElementRequired(Type repeatedType) { - // According Rule 2 from - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules - // "If the repeated field is a group with multiple fields, - // then its type is the element type and elements are required." 
- return true; - } - - public Type elementType(Type repeatedType) { - return repeatedType; - } - - @Override - public String elementName(Type repeatedType) { - return this.constantElementName(); - } - - @Override - public Boolean check(Type repeatedType) { - if (repeatedType.isPrimitive()) { - return false; - } else { - GroupType repeatedGroup = repeatedType.asGroupType(); - return repeatedGroup.getFields().size() > 0 && repeatedGroup.getName().equals(this.constantElementName()); - } - } - - @Override - public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { - if (typ.isPrimitive()) { - return new GroupType( - Type.Repetition.REPEATED, - this.constantElementName(), - typ - ); - } else { - return new GroupType( - Type.Repetition.REPEATED, - this.constantElementName(), - typ.asGroupType().getFields() - ); - } - } - } - - static class RuleGroupArray extends RuleGroupElement { - @Override - public String constantElementName() { - return "array"; - } - } - - static class RuleGroupTuple extends Rule { - - @Override - public Boolean check(Type repeatedType) { - return repeatedType.getName().endsWith("_tuple"); - } - - public Type elementType(Type repeatedType) { - return repeatedType; - } - - @Override - Boolean isElementRequired(Type repeatedType) { - // According to Rule 3 from - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules - return true; - } - - @Override - public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { - if (!name.endsWith("_tuple")) { - name = name + "_tuple"; - } - if (typ.isPrimitive()) { - return new PrimitiveType( - Type.Repetition.REPEATED, - typ.asPrimitiveType().getPrimitiveTypeName(), - name, - originalType - ); - } else { - return new GroupType( - Type.Repetition.REPEATED, - name, - OriginalType.LIST, - typ.asGroupType().getFields() - ); - } - } - } - - static class 
RuleStandardThreeLevel extends Rule { - /** - * group (LIST) { - * repeated group list { - * element; - * } - * } - */ - - @Override - public Boolean check(Type repeatedField) { - if (repeatedField.isPrimitive() || !repeatedField.getName().equals("list")) { - return false; - } - Type elementType = firstField(repeatedField.asGroupType()); - return elementType.getName().equals("element"); - } - - @Override - public Type elementType(Type repeatedType) { - return firstField(repeatedType.asGroupType()); - } - - @Override - Boolean isElementRequired(Type repeatedType) { - return elementType(repeatedType).getRepetition() == Type.Repetition.REQUIRED; - } - - @Override - public String elementName(Type repeatedType) { - return "element"; - } - - @Override - public Type createCompliantRepeatedType(Type typ, String name, Boolean isElementRequired, OriginalType originalType) { - - Type elementType; - if (typ.isPrimitive()) { - elementType = new PrimitiveType( - isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, - typ.asPrimitiveType().getPrimitiveTypeName(), - "element", - originalType - ); - } else { - elementType = new GroupType( - isElementRequired ? Type.Repetition.REQUIRED : Type.Repetition.OPTIONAL, - "element", - isGroupList(typ) ? OriginalType.LIST: null, - // we cannot flatten `list` - typ.asGroupType().getName().equals("list") ? 
- Arrays.asList(typ) : - typ.asGroupType().getFields() - ); - } - return new GroupType( - Type.Repetition.REPEATED, - "list", - Arrays.asList(elementType) - ); - } - } - - private static org.apache.parquet.schema.Type firstField(GroupType groupType) { - return groupType.getFields().get(0); - } - - private static boolean isGroupList(Type projection) { - if (projection.isPrimitive()) { - return false; - } - GroupType groupProjection = projection.asGroupType(); - return groupProjection.getOriginalType() == OriginalType.LIST && - groupProjection.getFieldCount() == 1 && - groupProjection.getFields().get(0).isRepetition(Type.Repetition.REPEATED); - } - - private static boolean isGroupMap(Type projection) { - if (projection.isPrimitive()) { - return false; - } - GroupType groupProjection = projection.asGroupType(); - return groupProjection.getOriginalType() == OriginalType.MAP && - groupProjection.getFieldCount() == 1 && - groupProjection.getFields().get(0).isRepetition(Type.Repetition.REPEATED) && - ( - (groupProjection.getFields().get(0).getName().equals("map") && - groupProjection.getFields().get(0).getOriginalType() == OriginalType.MAP_KEY_VALUE) - || groupProjection.getFields().get(0).getName().equals("key_value") - ); - } - - public Type elementType(Type repeatedType, String debuggingTypeSource) { - Rule fileTypeRule = findFirstRule(repeatedType, debuggingTypeSource); - return fileTypeRule.elementType(repeatedType); - } - - public Type wrap(Type repeatedType, Type elementType) { - Rule projectedTypeRule = findFirstRule(repeatedType, "projection"); - - return projectedTypeRule.createCompliantRepeatedType( - elementType, - elementType.getName(), - // if repeated or required, it is required - !elementType.isRepetition(Type.Repetition.OPTIONAL), - elementType.getOriginalType()); - } - - /** - * Resolve list format in forward compatible way. 
- * @param fileType file type which has new format - * @param projection projection type which has legacy format - * @return projection schema in the new format. - */ - public Type resolveTypeFormat(Type fileType, Type projection) { - if (projection.isPrimitive() || fileType.isPrimitive()) { - return projection; - } - ParquetListFormatForwardCompatibility compatibility = new ParquetListFormatForwardCompatibility(); - - GroupType groupFile = fileType.asGroupType(); - GroupType groupProjection = projection.asGroupType(); - - GroupUnwrapped unwrappedFile = unwrapGroup(groupFile, new Stack()); - GroupUnwrapped unwrappedProjection = unwrapGroup(groupProjection, new Stack()); - - Type repeatedFile = unwrappedFile.repeatedType; - Type repeatedProjection = unwrappedProjection.repeatedType; - - if (repeatedProjection != null && repeatedFile != null) { - // Repeated types cannot be recursed yet, because file and projection might have - // format-specific wrappers. Instead, we need to extract its element type first. - // Eg. without unwrapping `repeated` layer, we will only find `element` field in the file type - // File type: | Projection type: - // optional group foo (LIST) { | optional group foo (LIST) { - // repeated group list { | repeated group foo_tuple { - // required group element { | optional binary zing (UTF8); - // required binary zing (UTF8); | optional binary bar (UTF8); - // required binary bar (UTF8); | } - // } | } - // } - // } - Type elementFile = compatibility.elementType(repeatedFile, "file"); - Type elementProjection = compatibility.elementType(repeatedProjection, "projection"); - - // Recurse on the element. 
This is to handle nested list - Type elementResolved = resolveTypeFormat(elementFile, elementProjection); - // Wrap - Type repeatedResolved = compatibility.wrap(repeatedProjection, elementResolved); - - // Make projected structure compatible with file type - Type repeatedFormatted = compatibility - .makeForwardCompatible(repeatedFile, repeatedResolved); - - // Wrap back the groups, this contain field name and whether it's optional/required - Type resolvedGroupType = repeatedFormatted; - while (!unwrappedProjection.wrappers.isEmpty()) { - resolvedGroupType = unwrappedProjection.wrappers.pop().withNewFields(resolvedGroupType); - } - return resolvedGroupType; - } else { - List fields = new ArrayList(); - for (Type projected : groupProjection.getFields()) { - if (!groupFile.containsField(projected.getName())) { - // This can happen when - // 1) projecting optional field over non-existent target schema - // 2) field is a part of legacy map format - // Make no assertions (separation of responsibility) and just include it - fields.add(projected); - } else { - int fieldIndex = groupFile.getFieldIndex(projected.getName()); - Type fileField = groupFile.getFields().get(fieldIndex); - fields.add(resolveTypeFormat(fileField, projected)); - } - } - return groupProjection.withNewFields(fields); - } - } - - private Rule findFirstRule(Type repeatedType, String debuggingTypeSource) { - Rule matchedRule = null; - for (Rule rule : RULES) { - if (rule.check(repeatedType)) { - matchedRule = rule; - break; - } - } - if (matchedRule == null) { - throw new RuntimeException(String.format( - "Unable to find matching rule for %s schema:\n%s", debuggingTypeSource, repeatedType)); - } - return matchedRule; - } - - private Type makeForwardCompatible(Type repeatedFileType, Type repeatedProjectedType) { - Rule fileTypeRule = findFirstRule(repeatedFileType, "file"); - Rule projectedTypeRule = findFirstRule(repeatedProjectedType, "projected"); - - if (projectedTypeRule == fileTypeRule) { - return 
repeatedProjectedType; - } - - String elementName = projectedTypeRule.elementName(repeatedProjectedType); - Type elementType = projectedTypeRule.elementType(repeatedProjectedType); - Boolean isElementRequired = projectedTypeRule.isElementRequired(repeatedProjectedType); - OriginalType elementOriginalType = projectedTypeRule.elementOriginalType(repeatedProjectedType); - - return fileTypeRule.createCompliantRepeatedType( - elementType, - elementName, - isElementRequired, - elementOriginalType); - } - - private static class GroupUnwrapped { - Stack wrappers; - Type repeatedType; - - public GroupUnwrapped(Stack wrappers, Type repeatedType) { - this.wrappers = wrappers; - this.repeatedType = repeatedType; - } - } - - private static GroupUnwrapped unwrapGroup(Type typ, Stack wrappers) { - Type ptr = typ; - // only wrapper for list with size one, so we can wrap repeated type later - while (!ptr.isPrimitive()) { - wrappers.push(ptr.asGroupType()); - if (isGroupList(ptr)) { - // when it is repeated - return new GroupUnwrapped(wrappers, ptr.asGroupType().getFields().get(0)); - } else if (ptr.asGroupType().getFields().size() == 1){ - ptr = ptr.asGroupType().getFields().get(0); - } else { - break; - } - } - return new GroupUnwrapped(wrappers, null); - } -} diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index 922cb04d6c..126e661dc6 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -131,8 +131,8 @@ public static MessageType getSchemaForRead(MessageType fileMessageType, String p public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { assertGroupsAreCompatible(fileMessageType, 
projectedMessageType); return ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage( - projectedMessageType, - fileMessageType); + projectedMessageType, fileMessageType + ); } /** diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala new file mode 100644 index 0000000000..3a9f96217a --- /dev/null +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -0,0 +1,602 @@ +package com.twitter.scalding.parquet.scrooge + +import java.util + +import org.apache.parquet.schema.MessageTypeParser +import org.apache.parquet.thrift.ThriftSchemaConverter +import org.apache.parquet.thrift.struct.ThriftField.Requirement +import org.apache.parquet.thrift.struct.{ThriftField, ThriftType} +import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType +import org.apache.parquet.thrift.struct.ThriftType.{ListType, MapType, StructType} +import org.scalatest.{Matchers, WordSpec} + +class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Matchers { + + "ScroogeReadSupport resolving map format" should { + + "map identity: " in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group map (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required group value { + | required binary _id (UTF8); + | optional double created; + | } + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(fileType, fileType) + projected shouldEqual fileType + } + + "map identity: string key, struct value" in { + val listType = new ListType(new ThriftField("list", 2, Requirement.REQUIRED, new ThriftType.StringType)) + val 
children = new ThriftField("foo", 3, Requirement.REQUIRED, listType) + val mapValueType = new StructType(util.Arrays.asList(children), + StructOrUnionType.STRUCT) + val message = schemaFromThriftMap(mapValueType) + message shouldEqual MessageTypeParser.parseMessageType( + """ + |message ParquetSchema { + | required group map_field (MAP) = 6 { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value { + | required group foo (LIST) = 3 { + | repeated binary foo_tuple (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(message, message) + projected shouldEqual message + } + + "map identity: string kye, list string value" in { + val listType = new ListType(new ThriftField("list", 2, Requirement.REQUIRED, new ThriftType.StringType)) + val message = schemaFromThriftMap(listType) + message shouldEqual MessageTypeParser.parseMessageType( + """ + |message ParquetSchema { + | required group map_field (MAP) = 6 { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value (LIST) { + | repeated binary value_tuple (UTF8); + | } + | } + | } + |} + """.stripMargin + ) + + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(message, message) + projected shouldEqual message + } + } + + private def schemaFromThriftMap(mapValueType: ThriftType) = { + val mapType = new MapType( + new ThriftField("NOT_USED_KEY", 4, Requirement.REQUIRED, new ThriftType.StringType), + new ThriftField("NOT_USED_VALUE", 5, Requirement.REQUIRED, + mapValueType) + ) + new ThriftSchemaConverter().convert( + new StructType(util.Arrays.asList( + new ThriftField("map_field", 6, Requirement.REQUIRED, mapType) + ), StructOrUnionType.STRUCT)) + } + + "ScroogeReadSupport resolving list format" should { + "resolve list legacy format: project x_tuple to legacy array" in { + val fileType = 
MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated binary array (UTF8); + | } + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary country_codes_tuple (UTF8); + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary array (UTF8); + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list legacy format: project x_tuple to 3-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary country_codes_tuple (UTF8); + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list legacy format: project nested x_tuple to nested legacy array" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group foo (LIST) { + | repeated group 
array (LIST) { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group array { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list legacy format: project nested x_tuple to nested array" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group foo (LIST) { + | repeated group array (LIST) { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + // note optional of result, and field rename + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group array { + | repeated binary array (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list in group legacy format: project x_tuple to nested 3-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group foo (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated 
group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve: binary array to 3-level nesting" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | required int32 x; + |} + """.stripMargin) + + // inner list is `binary array` + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated binary array (UTF8); + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + + } + + "resolve: identity 3-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | required int32 x; + |} + 
""".stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(fileType, fileType) + projected shouldEqual fileType + + } + + "resolve nested list: project inner legacy array to 3-level nesting" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + | required int32 x; + |} + """.stripMargin) + + // inner list is `binary array` + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated binary array (UTF8); + | } + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve projected struct in list: repeated group element to 3-level nesting" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated group list { + | required group element { + | required binary foo (UTF8); + | required binary bar (UTF8); + | required binary zing (UTF8); + | } + | } + | } + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group element { + | optional binary foo (UTF8); + 
| required binary zing (UTF8); + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group country_codes (LIST) { + | repeated group list { + | required group element { + | optional binary foo (UTF8); + | required binary zing (UTF8); + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve standard 3-level list to 2-level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message scalding_schema { + | required group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated binary array (UTF8); + | } + | } + | } + | required int32 x; + |} + """.stripMargin) + + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group array_of_country_codes (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated binary array (UTF8); + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve list in group containing list" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | optional group connect_delays (LIST) { + | repeated group list { + | required group element { + | optional binary description (UTF8); + | optional binary created_by (UTF8); + | optional group currencies (LIST) { + | repeated group 
list { + | required binary element (UTF8); + | } + | } + | } + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group connect_delays (LIST) { + | repeated group connect_delays_tuple { + | optional binary description (UTF8); + | optional group currencies (LIST) { + | repeated binary currencies_tuple (UTF8); + | } + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group connect_delays (LIST) { + | repeated group list { + | required group element { + | optional binary description (UTF8); + | optional group currencies (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + | } + |} + """.stripMargin) + projected shouldEqual expected + } + + "resolve projection of different level" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | required binary zing (UTF8); + | required binary bar (UTF8); + | } + | } + | } + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | optional binary bar (UTF8); + | required binary zing (UTF8); + | } + | } + | } + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | optional binary bar (UTF8); + | required binary zing (UTF8); + | } + | } + | } + |} + 
""".stripMargin) + projected shouldEqual expected + } + + "resolve does not support backward compat: project nested 3-level to x_tuple" in { + val fileType = MessageTypeParser.parseMessageType( + """ + |message scalding_schema { + | required group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType( + """ + |message SampleProjection { + | optional group foo (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + | optional int32 x; + |} + """.stripMargin) + val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + projected shouldEqual requestedProjection + } + } +} diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala index ab53c9695b..7cd6053bb5 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala @@ -7,7 +7,7 @@ import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatfor import com.twitter.scalding.typed.TypedPipe import com.twitter.scalding.{Args, Job} import org.apache.parquet.io.InvalidRecordException -import org.apache.parquet.schema.MessageTypeParser +import org.apache.parquet.schema.{MessageTypeParser} import org.scalatest.{Matchers, WordSpec} class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPlatformTest { @@ -90,481 +90,6 @@ class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPl } } - "ScroogeReadSupport 
resolving list format" should { - "resolve list legacy format: project x_tuple to legacy array" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group country_codes (LIST) { - | repeated binary array (UTF8); - | } - | required int32 x; - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated binary country_codes_tuple (UTF8); - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - // note optional of result, and field rename - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated binary array (UTF8); - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve list legacy format: project x_tuple to 3-level" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group country_codes (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | required int32 x; - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated binary country_codes_tuple (UTF8); - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - // note optional of result, and field rename - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve list legacy format: project nested x_tuple to nested legacy array" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { 
- | required group foo (LIST) { - | repeated group array (LIST) { - | repeated binary array (UTF8); - | } - | } - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group foo_tuple (LIST) { - | repeated binary foo_tuple_tuple (UTF8); - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - // note optional of result, and field rename - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group array { - | repeated binary array (UTF8); - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve list legacy format: project nested x_tuple to nested array" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group foo (LIST) { - | repeated group array (LIST) { - | repeated binary array (UTF8); - | } - | } - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group foo_tuple (LIST) { - | repeated binary foo_tuple_tuple (UTF8); - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - // note optional of result, and field rename - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group array { - | repeated binary array (UTF8); - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve list in group legacy format: project x_tuple to nested 3-level" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group foo (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated group list { - | 
required binary element (UTF8); - | } - | } - | } - | } - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group foo_tuple (LIST) { - | repeated binary foo_tuple_tuple (UTF8); - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve: binary array to 3-level nesting" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group country_codes (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | required int32 x; - |} - """.stripMargin) - - // inner list is `binary array` - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated binary array (UTF8); - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - - } - - "resolve: identity 3-level" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group country_codes (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | required int32 x; - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, fileType) 
- projected shouldEqual fileType - - } - - "resolve nested list: project inner legacy array to 3-level nesting" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group array_of_country_codes (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | } - | } - | required int32 x; - |} - """.stripMargin) - - // inner list is `binary array` - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group array_of_country_codes (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated binary array (UTF8); - | } - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group array_of_country_codes (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve projected struct in list: repeated group element to 3-level nesting" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group country_codes (LIST) { - | repeated group list { - | required group element { - | required binary foo (UTF8); - | required binary bar (UTF8); - | required binary zing (UTF8); - | } - | } - | } - | required int32 x; - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated group element { - | optional binary foo (UTF8); - | required binary zing (UTF8); - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - - 
val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group country_codes (LIST) { - | repeated group list { - | required group element { - | optional binary foo (UTF8); - | required binary zing (UTF8); - | } - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve standard 3-level list to 2-level" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message scalding_schema { - | required group array_of_country_codes (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated binary array (UTF8); - | } - | } - | } - | required int32 x; - |} - """.stripMargin) - - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group array_of_country_codes (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group array_of_country_codes (LIST) { - | repeated group list { - | required group element (LIST) { - | repeated binary array (UTF8); - | } - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve list in group containing list" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | optional group connect_delays (LIST) { - | repeated group list { - | required group element { - | optional binary description (UTF8); - | optional binary created_by (UTF8); - | optional group currencies (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | } - | } - | } - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group 
connect_delays (LIST) { - | repeated group connect_delays_tuple { - | optional binary description (UTF8); - | optional group currencies (LIST) { - | repeated binary currencies_tuple (UTF8); - | } - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group connect_delays (LIST) { - | repeated group list { - | required group element { - | optional binary description (UTF8); - | optional group currencies (LIST) { - | repeated group list { - | required binary element (UTF8); - | } - | } - | } - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - - "resolve projection of different level" in { - val fileType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | optional group foo (LIST) { - | repeated group list { - | required group element { - | required binary zing (UTF8); - | required binary bar (UTF8); - | } - | } - | } - |} - """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group list { - | required group element { - | optional binary bar (UTF8); - | required binary zing (UTF8); - | } - | } - | } - |} - """.stripMargin) - val projected = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleProjection { - | optional group foo (LIST) { - | repeated group list { - | required group element { - | optional binary bar (UTF8); - | required binary zing (UTF8); - | } - | } - | } - |} - """.stripMargin) - projected shouldEqual expected - } - } - "ScroogeReadSupport" should { "write using typedparquet and read using parquet scrooge" in { HadoopPlatformJobTest(new WriteToTypedParquetTupleJob(_), cluster) From ca67991ecc1674d713bcb9601a4319fe70d9dbc7 Mon Sep 17 00:00:00 2001 
From: Mick Jermsurawong Date: Mon, 30 Sep 2019 08:21:09 -0700 Subject: [PATCH 08/34] handle map legacy format --- .../parquet/scrooge/ScroogeReadSupport.java | 2 +- ...CollectionFormatForwardCompatibility.scala | 217 +++---- .../scrooge/ParquetListFormatRule.scala | 94 ++- .../scrooge/ParquetMapFormatRule.scala | 54 ++ ...ctionFormatForwardCompatibilityTests.scala | 537 ++++++++++++++---- 5 files changed, 675 insertions(+), 229 deletions(-) create mode 100644 scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatRule.scala diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index 126e661dc6..1bc9040ba8 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -130,7 +130,7 @@ public static MessageType getSchemaForRead(MessageType fileMessageType, String p */ public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { assertGroupsAreCompatible(fileMessageType, projectedMessageType); - return ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage( + return ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( projectedMessageType, fileMessageType ); } diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index a55f49078f..4d0699a0e6 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ 
b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -2,136 +2,151 @@ package com.twitter.scalding.parquet.scrooge import java.util +import org.apache.parquet.schema.Type.Repetition import org.apache.parquet.schema.{GroupType, MessageType, Type} +import org.apache.parquet.thrift.DecodingSchemaMismatchException import org.slf4j.LoggerFactory -private[scrooge] case class GroupUnwrapped(wrappers: Seq[GroupType], repeatedType: Type) - object ParquetCollectionFormatForwardCompatibility { private val LOGGER = LoggerFactory.getLogger(getClass) - private val SOURCE_LIST_RULES = List( - PrimitiveElementRule, PrimitiveArrayRule, - GroupElementRule, GroupArrayRule, - TupleRule, StandardRule - ) - - private val TARGET_LIST_RULES = SOURCE_LIST_RULES.filterNot(_ == TupleRule) - /** * Create a forward-compatible schema, using content from source type with format from target type. * @param sourceType source type with legacy format * @param targetType target type to which source is converted to */ - def forwardCompatibleMessage(sourceType: MessageType, targetType: MessageType): MessageType = { - val groupResult = forwardCompatibleType(sourceType, targetType).asGroupType() + def formatForwardCompatibleMessage(sourceType: MessageType, targetType: MessageType): MessageType = { + val groupResult = formatForwardCompatibleType(sourceType, targetType).asGroupType() LOGGER.info("Making source schema to be compatible with target" + s"\nSource:\n${sourceType}\nTarget:\n${targetType}\nResult:\n${groupResult}") new MessageType(groupResult.getName, groupResult.getFields) } - private def wrapElementAsRepeatedType(rule: ParquetListFormatRule, repeatedType: Type, elementType: Type): Type = { - rule.createCompliantRepeatedType( - elementType, - rule.elementName(repeatedType), - // if repeated or required, it is required - !elementType.isRepetition(Type.Repetition.OPTIONAL), elementType.getOriginalType) - } - - private 
def forwardCompatibleType(sourceType: Type, targetType: Type): Type = { - if (sourceType.isPrimitive || targetType.isPrimitive) { - return sourceType - } - - val sourceUnwrapped = unwrapGroup(sourceType.asGroupType) - val targetUnwrapped = unwrapGroup(targetType.asGroupType) - - if (targetUnwrapped.isDefined && sourceUnwrapped.isDefined) { - val targetRepeated = targetUnwrapped.get.repeatedType - val sourceRepeated = sourceUnwrapped.get.repeatedType - - val sourceRule = findFirstRule(SOURCE_LIST_RULES, sourceRepeated, "source") - val targetRule = findFirstRule(TARGET_LIST_RULES, targetRepeated, "target") - - val repeatedFormatted = (sourceRule, targetRule) match { - case (Some(sRule), Some(tRule)) => { - val sourceElement = sRule.elementType(sourceRepeated) - val targetElement = tRule.elementType(targetRepeated) - // Recurse on the element. This is to handle nested list - val forwardCompatElement = forwardCompatibleType(sourceElement, targetElement) - // Wrap the solved element with current source structure, and do actual conversion work - val repeatedResolved = wrapElementAsRepeatedType(sRule, sourceRepeated, forwardCompatElement) - convertToTarget(targetRepeated, repeatedResolved) + /** + * Traverse source/target schemas and format nodes of list or map. + * The formatting is not to one-to-one node swapping from source to target, + * this is because the subset fields of source node and its optional/required must + * be maintained in the formatted result. 
+ */ + private def formatForwardCompatibleType(sourceType: Type, targetType: Type): Type = { + (unwrapGroup(sourceType), unwrapGroup(targetType)) match { + case _ if sourceType.isPrimitive || targetType.isPrimitive => + // Base case + sourceType + case ( + GroupUnwrapped(sourceWrappers, Some(sourceRepeatedListType), None), + GroupUnwrapped(_, Some(targetRepeatedListType), None) + ) => + // Format list + val sourceRule = ParquetListFormatRule.findFirstListRule(sourceRepeatedListType, Source) + val targetRule = ParquetListFormatRule.findFirstListRule(targetRepeatedListType, Target) + + val formattedRepeated = (sourceRule, targetRule) match { + case (Some(sourceRule), Some(targetRule)) => { + val sourceElement = sourceRule.elementType(sourceRepeatedListType) + val targetElement = targetRule.elementType(targetRepeatedListType) + // Recurse on the element instead of `repeated` type because list still can have + // different formats at repeated type + val forwardCompatElement = formatForwardCompatibleType(sourceElement, targetElement) + // Wrap the solved element with current source structure, and do actual conversion work + val forwardCompatRepeated = ParquetListFormatRule.wrapElementAsRepeatedType( + sourceRule, + sourceRepeatedListType, + forwardCompatElement + ) + ParquetListFormatRule.formatForwardCompatibleRepeatedType( + forwardCompatRepeated, + targetRepeatedListType) + } + case _ => sourceRepeatedListType // No-op } - case _ => sourceRepeated // No-op - } - - // Wrapped the solved repeated type in its original groups, - // describing field name and whether it's optional/required - sourceUnwrapped.get.wrappers.foldRight(repeatedFormatted) { - (wrapper, group) => wrapper.withNewFields(group) - } - } else { - val sourceGroup = sourceType.asGroupType - val targetGroup = targetType.asGroupType - - val resultFields = new util.ArrayList[Type] - import scala.collection.JavaConversions._ - for (sourceField <- sourceGroup.getFields) { - if 
(!targetGroup.containsField(sourceField.getName)) { // This can happen when - // 1) projecting optional field over non-existent target schema - // 2) field is a part of legacy map format - // Make no assertions (separation of responsibility) and just include it - resultFields.add(sourceField) + // Wrapped the formatted repeated type in its original groups, + // describing field name and whether it's optional/required + sourceWrappers.foldRight(formattedRepeated) { + (wrapper, group) => wrapper.withNewFields(group) } - else { - val fieldIndex = targetGroup.getFieldIndex(sourceField.getName) - val targetField = targetGroup.getFields.get(fieldIndex) - resultFields.add(forwardCompatibleType(sourceField, targetField)) + case ( + GroupUnwrapped(sourceWrappers, None, Some(sourceRepeatedMapType)), + GroupUnwrapped(_, None, Some(targetRepeatedMapType)) + ) => + // Format map + val forwardCompatRepeated = formatForwardCompatibleType(sourceRepeatedMapType, targetRepeatedMapType) + val formattedRepeated = ParquetMapFormatRule.formatForwardCompatibleRepeatedType( + forwardCompatRepeated, + targetRepeatedMapType + ) + // Wrapped the formatted repeated type in its original groups, + // describing field name and whether it's optional/required + sourceWrappers.foldRight(formattedRepeated) { + (wrapper, group) => wrapper.withNewFields(group) } - } - sourceGroup.withNewFields(resultFields) + case _ => + // Field projection + val sourceGroup = sourceType.asGroupType + val targetGroup = targetType.asGroupType + + val resultFields = new util.ArrayList[Type] + import scala.collection.JavaConversions._ + for (sourceField <- sourceGroup.getFields) { + if (!targetGroup.containsField(sourceField.getName)) { + if (!sourceField.isRepetition(Repetition.OPTIONAL)) { + throw new DecodingSchemaMismatchException( + s"Found non-optional source field ${sourceField.getName}:\n$sourceField\n\n" + + s"not present in the given target type:\n${targetGroup}" + ) + } + resultFields.add(sourceField) + } + 
else { + val fieldIndex = targetGroup.getFieldIndex(sourceField.getName) + val targetField = targetGroup.getFields.get(fieldIndex) + resultFields.add(formatForwardCompatibleType(sourceField, targetField)) + } + } + sourceGroup.withNewFields(resultFields) } } - private def findFirstRule(rules: Seq[ParquetListFormatRule], - repeatedType: Type, - debuggingTypeSource: String): Option[ParquetListFormatRule] = { - val ruleFound = rules.find(rule => rule.check(repeatedType)) - if (ruleFound.isEmpty) { - LOGGER.warn(s"Unable to find matching rule for $debuggingTypeSource schema:\n$repeatedType") - } - ruleFound - } + private case class GroupUnwrapped(wrappers: Seq[GroupType], + repeatedListType: Option[Type] = None, + repeatedMapType: Option[Type] = None) - private def convertToTarget(repeatedTargetType: Type, repeatedSourceType: Type): Type = { - val sourceRuleMaybe = findFirstRule(SOURCE_LIST_RULES, repeatedSourceType, "source") - val targetRuleMaybe = findFirstRule(TARGET_LIST_RULES, repeatedTargetType, "target") - if (sourceRuleMaybe == targetRuleMaybe || sourceRuleMaybe.isEmpty || targetRuleMaybe.isEmpty) { - repeatedSourceType - } else { - val sourceRule = sourceRuleMaybe.get - val targetRule = targetRuleMaybe.get - val elementType = sourceRule.elementType(repeatedSourceType) - targetRule.createCompliantRepeatedType( - typ= elementType, - name= elementType.getName, - isElementRequired= sourceRule.isElementRequired(repeatedSourceType), - originalType= sourceRule.elementOriginalType(repeatedSourceType) + private def unwrapGroup(typ: Type, wrappers: Seq[GroupType] = Seq()): GroupUnwrapped = { + if (typ.isPrimitive) { + GroupUnwrapped( + wrappers, + repeatedListType=None, + repeatedMapType=None + ) + } else if (typ.asGroupType.getFieldCount != 1) { + GroupUnwrapped( + wrappers :+ typ.asGroupType(), + repeatedListType=None, + repeatedMapType=None ) - } - } - - private def unwrapGroup(typ: Type, wrappers: Seq[GroupType] = Seq()): Option[GroupUnwrapped] = { - if 
(typ.isPrimitive || typ.asGroupType.getFieldCount != 1) { - None } else { - if (ParquetListFormatRule.isGroupList(typ)) { // when it is repeated - Some(GroupUnwrapped(wrappers :+ typ.asGroupType(), typ.asGroupType.getFields.get(0))) + // Note the field count is strictly 1 here, and the wrappers will be used later + // to wrap back the formatted results. + if (ParquetListFormatRule.isGroupList(typ)) { + GroupUnwrapped( + wrappers :+ typ.asGroupType(), + repeatedListType=Some(typ.asGroupType.getFields.get(0)), + repeatedMapType = None + ) + } else if (ParquetMapFormatRule.isGroupMap(typ)) { + GroupUnwrapped( + wrappers :+ typ.asGroupType(), + repeatedListType=None, + repeatedMapType=Some(typ.asGroupType.getFields.get(0)) + ) } else { unwrapGroup(typ.asGroupType.getFields.get(0), wrappers :+ typ.asGroupType()) } } } } + +trait ParquetCollectionFormatRule { + def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, targetRepeatedMapType: Type): Type +} \ No newline at end of file diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala index 77cc646328..11576ec949 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala @@ -3,13 +3,77 @@ package com.twitter.scalding.parquet.scrooge import java.util import org.apache.parquet.schema.{GroupType, OriginalType, PrimitiveType, Type} +import org.slf4j.LoggerFactory + +/** + * Rule to convert parquet schema of legacy list type to standard one + * namely 3-level list structure as recommended in + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + * + * More specifically this handles converting from parquet file created by + * {{@code 
org.apache.parquet.thrift.ThriftSchemaConvertVisitor}} which always suffix + * list element with "_tuple". + */ +private[scrooge] object ParquetListFormatRule extends ParquetCollectionFormatRule { + + private val LOGGER = LoggerFactory.getLogger(getClass) + + def formatForwardCompatibleRepeatedType(repeatedSourceType: Type, repeatedTargetType: Type) = { + val sourceRuleMaybe = findFirstListRule(repeatedSourceType, Source) + val targetRuleMaybe = findFirstListRule(repeatedTargetType, Target) + if (sourceRuleMaybe == targetRuleMaybe || sourceRuleMaybe.isEmpty || targetRuleMaybe.isEmpty) repeatedSourceType else { + val sourceRule = sourceRuleMaybe.get + val targetRule = targetRuleMaybe.get + val elementType = sourceRule.elementType(repeatedSourceType) + targetRule.createCompliantRepeatedType( + typ= elementType, + name= elementType.getName, + isElementRequired= sourceRule.isElementRequired(repeatedSourceType), + originalType= sourceRule.elementOriginalType(repeatedSourceType) + ) + } + } -object ParquetListFormatRule { def isGroupList(projection: Type): Boolean = { if (projection.isPrimitive) return false val groupProjection = projection.asGroupType (groupProjection.getOriginalType eq OriginalType.LIST) && groupProjection.getFieldCount == 1 && groupProjection.getFields.get(0).isRepetition(Type.Repetition.REPEATED) } + + def findFirstListRule(repeatedType: Type, + sourceOrTarget: SourceOrTarget): Option[ParquetListFormatRule] = { + val ruleFound = sourceOrTarget.rules.find(rule => rule.check(repeatedType)) + if (ruleFound.isEmpty) LOGGER.warn(s"Unable to find matching rule for ${sourceOrTarget.name} schema:\n$repeatedType") + ruleFound + } + + def wrapElementAsRepeatedType(rule: ParquetListFormatRule, repeatedType: Type, elementType: Type): Type = rule.createCompliantRepeatedType( + elementType, + rule.elementName(repeatedType), + // if repeated or required, it is required + !elementType.isRepetition(Type.Repetition.OPTIONAL), + elementType.getOriginalType) +} + 
+private[scrooge] sealed trait SourceOrTarget { + def rules: Seq[ParquetListFormatRule] + def name: String +} + +private[scrooge] object Source extends SourceOrTarget { + override val rules: Seq[ParquetListFormatRule] = Seq( + PrimitiveElementRule, PrimitiveArrayRule, + GroupElementRule, GroupArrayRule, + TupleRule, StandardRule + ) + + override def name: String = "source" +} + +private[scrooge] object Target extends SourceOrTarget { + override def rules: Seq[ParquetListFormatRule] = Source.rules.filterNot(_ == TupleRule) + + override def name: String = "target" } private[scrooge] sealed trait ParquetListFormatRule { @@ -56,11 +120,11 @@ private[scrooge] sealed trait PrimitiveListRule extends ParquetListFormatRule { } } -object PrimitiveElementRule extends PrimitiveListRule { +private[scrooge] object PrimitiveElementRule extends PrimitiveListRule { override def constantElementName: String = "element" } -object PrimitiveArrayRule extends PrimitiveListRule { +private[scrooge] object PrimitiveArrayRule extends PrimitiveListRule { override def constantElementName: String = "array" } @@ -100,15 +164,15 @@ private[scrooge] sealed trait GroupListRule extends ParquetListFormatRule { } } -object GroupElementRule extends GroupListRule { +private[scrooge] object GroupElementRule extends GroupListRule { override def constantElementName: String = "element" } -object GroupArrayRule extends GroupListRule { +private[scrooge] object GroupArrayRule extends GroupListRule { override def constantElementName: String = "array" } -object TupleRule extends ParquetListFormatRule { +private[scrooge] object TupleRule extends ParquetListFormatRule { override def check(repeatedType: Type): Boolean = repeatedType.getName.endsWith("_tuple") override def elementName(repeatedType: Type): String = { @@ -126,11 +190,11 @@ object TupleRule extends ParquetListFormatRule { override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { 
val suffixed_name = name + "_tuple" if (typ.isPrimitive) new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, suffixed_name, originalType) - else new GroupType(Type.Repetition.REPEATED, suffixed_name, OriginalType.LIST, typ.asGroupType.getFields) + else new GroupType(Type.Repetition.REPEATED, suffixed_name, originalType, typ.asGroupType.getFields) } } -object StandardRule extends ParquetListFormatRule { +private[scrooge] object StandardRule extends ParquetListFormatRule { /** * repeated group list { * element; @@ -150,21 +214,17 @@ object StandardRule extends ParquetListFormatRule { override def elementName(repeatedType: Type): String = "element" - override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { + override def createCompliantRepeatedType(originalElementType: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { val repetition = if (isElementRequired) Type.Repetition.REQUIRED else Type.Repetition.OPTIONAL - val elementType = if (typ.isPrimitive) { - - new PrimitiveType(repetition, typ.asPrimitiveType.getPrimitiveTypeName, "element", originalType) + val elementType = if (originalElementType.isPrimitive) { + new PrimitiveType(repetition, originalElementType.asPrimitiveType.getPrimitiveTypeName, "element", originalType) } else { - val listType = if (ParquetListFormatRule.isGroupList(typ)) OriginalType.LIST else null new GroupType( repetition, "element", - listType, - // TODO: generalize this - // we cannot flatten `list` - if (typ.asGroupType.getName == "list") util.Arrays.asList(typ) else typ.asGroupType.getFields) + originalType, + originalElementType.asGroupType.getFields) } new GroupType(Type.Repetition.REPEATED, "list", util.Arrays.asList(elementType)) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatRule.scala 
b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatRule.scala new file mode 100644 index 0000000000..2737e21651 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatRule.scala @@ -0,0 +1,54 @@ +package com.twitter.scalding.parquet.scrooge + +import org.apache.parquet.schema.{OriginalType, Type} + +/** + * Rule to format parquet schema of legacy map type to standard target + * with repeated type of `key_value` without annotation + * as recommended in + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + * + * Source with legacy format created by + * {@code org.apache.parquet.schema.ConversionPatterns} has repeated `map` field + * annotated with (MAP_KEY_VALUE) + */ +private[scrooge] object ParquetMapFormatRule extends ParquetCollectionFormatRule { + + /** + * Handle legacy type when + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + * @param sourceRepeatedMapType + * @param targetRepeatedMapType + * @return + */ + def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, targetRepeatedMapType: Type) = { + val isLegacyToStandardFormat = isStandardRepeatedType(targetRepeatedMapType) && + isLegacyRepeatedType(sourceRepeatedMapType) + if (isLegacyToStandardFormat) { + targetRepeatedMapType.asGroupType().withNewFields(sourceRepeatedMapType.asGroupType().getFields) + } else { + sourceRepeatedMapType + } + } + + def isGroupMap(typ: Type): Boolean = { + if (typ.isPrimitive) { + false + } else { + val groupType = typ.asGroupType + (groupType.getOriginalType == OriginalType.MAP) && + (groupType.getFieldCount == 1) && + groupType.getFields.get(0).isRepetition(Type.Repetition.REPEATED) && + (isLegacyRepeatedType(groupType.getFields.get(0)) || + isStandardRepeatedType(groupType.getFields.get(0))) + } + } + + private def isLegacyRepeatedType(repeatedType: Type) = { + ((repeatedType.getName == "map") && 
(repeatedType.getOriginalType == OriginalType.MAP_KEY_VALUE)) + } + + private def isStandardRepeatedType(repeatedType: Type) = { + (repeatedType.getName == "key_value") && (repeatedType.getOriginalType == null) + } +} diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala index 3a9f96217a..70a0b5dbd3 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -3,7 +3,7 @@ package com.twitter.scalding.parquet.scrooge import java.util import org.apache.parquet.schema.MessageTypeParser -import org.apache.parquet.thrift.ThriftSchemaConverter +import org.apache.parquet.thrift.{DecodingSchemaMismatchException, ThriftSchemaConverter} import org.apache.parquet.thrift.struct.ThriftField.Requirement import org.apache.parquet.thrift.struct.{ThriftField, ThriftType} import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType @@ -12,10 +12,9 @@ import org.scalatest.{Matchers, WordSpec} class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Matchers { - "ScroogeReadSupport resolving map format" should { - + "Format forward compat: resolving map format" should { "map identity: " in { - val fileType = MessageTypeParser.parseMessageType( + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group map (MAP) { @@ -29,8 +28,8 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(fileType, fileType) - projected shouldEqual 
fileType + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(targetType, targetType) + solved shouldEqual targetType } "map identity: string key, struct value" in { @@ -55,8 +54,8 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(message, message) - projected shouldEqual message + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(message, message) + solved shouldEqual message } "map identity: string kye, list string value" in { @@ -77,26 +76,124 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin ) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(message, message) - projected shouldEqual message + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(message, message) + solved shouldEqual message + } + + "resolve map legacy format: original type (MAP_KEY_VALUE) to explicit name key_value" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required int32 value; + | } + | } + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group map_field (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional int32 value; + | } + | } + |} + """.stripMargin) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | optional int32 value; + | } + | } + |} + 
""".stripMargin) + solved shouldEqual expected + } + + "resolve map legacy format: map of map" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group map_of_map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required group value (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required group value { + | required binary _id (UTF8); + | required int32 x; + | } + | } + | } + | } + | } + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group map_of_map_field (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | required group value (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | required group value { + | optional int32 x; + | } + | } + | } + | } + | } + |} + """.stripMargin) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group map_of_map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required group value (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required group value { + | optional int32 x; + | } + | } + | } + | } + | } + |} + """.stripMargin) + solved shouldEqual expected } - } - private def schemaFromThriftMap(mapValueType: ThriftType) = { - val mapType = new MapType( - new ThriftField("NOT_USED_KEY", 4, Requirement.REQUIRED, new ThriftType.StringType), - new ThriftField("NOT_USED_VALUE", 5, Requirement.REQUIRED, - mapValueType) - ) - new ThriftSchemaConverter().convert( - new StructType(util.Arrays.asList( - new ThriftField("map_field", 6, Requirement.REQUIRED, mapType) - ), StructOrUnionType.STRUCT)) + def schemaFromThriftMap(mapValueType: ThriftType) = { + val mapType = new MapType( + 
new ThriftField("NOT_USED_KEY", 4, Requirement.REQUIRED, new ThriftType.StringType), + new ThriftField("NOT_USED_VALUE", 5, Requirement.REQUIRED, + mapValueType) + ) + new ThriftSchemaConverter().convert( + new StructType(util.Arrays.asList( + new ThriftField("map_field", 6, Requirement.REQUIRED, mapType) + ), StructOrUnionType.STRUCT)) + } } - "ScroogeReadSupport resolving list format" should { - "resolve list legacy format: project x_tuple to legacy array" in { - val fileType = MessageTypeParser.parseMessageType( + "Format forward compat: resolving list format" should { + "resolve list legacy format: format x_tuple to legacy array" in { + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group country_codes (LIST) { @@ -105,29 +202,29 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated binary country_codes_tuple (UTF8); | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated binary array (UTF8); | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } - "resolve list legacy format: project x_tuple to 3-level" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve list legacy format: format x_tuple to 3-level" in { + val targetType = MessageTypeParser.parseMessageType( """ |message 
spark_schema { | required group country_codes (LIST) { @@ -138,19 +235,19 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated binary country_codes_tuple (UTF8); | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated group list { | required binary element (UTF8); @@ -158,11 +255,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } - "resolve list legacy format: project nested x_tuple to nested legacy array" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve list legacy format: format nested x_tuple to nested legacy array" in { + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group foo (LIST) { @@ -172,9 +269,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group foo_tuple (LIST) { | repeated binary foo_tuple_tuple (UTF8); @@ -182,11 +279,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends 
WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group array { | repeated binary array (UTF8); @@ -194,11 +291,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } - "resolve list legacy format: project nested x_tuple to nested array" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve list legacy format: format nested x_tuple to nested array" in { + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group foo (LIST) { @@ -208,9 +305,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group foo_tuple (LIST) { | repeated binary foo_tuple_tuple (UTF8); @@ -218,11 +315,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group 
array { | repeated binary array (UTF8); @@ -230,11 +327,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } - "resolve list in group legacy format: project x_tuple to nested 3-level" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve list in group legacy format: format x_tuple to nested 3-level" in { + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group foo (LIST) { @@ -248,9 +345,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group foo_tuple (LIST) { | repeated binary foo_tuple_tuple (UTF8); @@ -258,10 +355,10 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group list { | required group element (LIST) { @@ -273,11 +370,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } "resolve: binary array to 3-level nesting" in { - val fileType = MessageTypeParser.parseMessageType( + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group country_codes (LIST) { @@ -290,19 +387,19 @@ class 
ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) // inner list is `binary array` - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated binary array (UTF8); | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated group list { | required binary element (UTF8); @@ -310,12 +407,12 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } "resolve: identity 3-level" in { - val fileType = MessageTypeParser.parseMessageType( + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group country_codes (LIST) { @@ -326,13 +423,13 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(fileType, fileType) - projected shouldEqual fileType + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(targetType, targetType) + solved shouldEqual targetType } - "resolve nested list: project inner legacy array to 3-level nesting" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve nested list: format inner legacy array to 3-level nesting" in { + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group array_of_country_codes 
(LIST) { @@ -349,9 +446,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) // inner list is `binary array` - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -361,11 +458,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -377,11 +474,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } - "resolve projected struct in list: repeated group element to 3-level nesting" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve solved struct in list: repeated group element to 3-level nesting" in { + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | required group country_codes (LIST) { @@ -396,9 +493,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated group element { | optional binary foo (UTF8); @@ -407,11 +504,11 @@ 
class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group country_codes (LIST) { | repeated group list { | required group element { @@ -422,11 +519,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } "resolve standard 3-level list to 2-level" in { - val fileType = MessageTypeParser.parseMessageType( + val targetType = MessageTypeParser.parseMessageType( """ |message scalding_schema { | required group array_of_country_codes (LIST) { @@ -440,9 +537,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -454,11 +551,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -468,11 +565,11 @@ class 
ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } "resolve list in group containing list" in { - val fileType = MessageTypeParser.parseMessageType( + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | optional group connect_delays (LIST) { @@ -490,9 +587,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group connect_delays (LIST) { | repeated group connect_delays_tuple { | optional binary description (UTF8); @@ -503,10 +600,10 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group connect_delays (LIST) { | repeated group list { | required group element { @@ -521,11 +618,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } - "resolve projection of different level" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve projection: format 2-level source to 3-level target" in { + val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { | optional group foo (LIST) { @@ -533,28 +630,27 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required group element { | required binary zing 
(UTF8); | required binary bar (UTF8); + | required binary baz (UTF8); | } | } | } |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { - | repeated group list { - | required group element { - | optional binary bar (UTF8); - | required binary zing (UTF8); - | } + | repeated group element { + | optional binary bar (UTF8); + | required binary zing (UTF8); | } | } |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group list { | required group element { @@ -565,11 +661,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - projected shouldEqual expected + solved shouldEqual expected } - "resolve does not support backward compat: project nested 3-level to x_tuple" in { - val fileType = MessageTypeParser.parseMessageType( + "resolve does not support backward compat: format nested 3-level to x_tuple" in { + val targetType = MessageTypeParser.parseMessageType( """ |message scalding_schema { | required group foo (LIST) { @@ -580,9 +676,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val requestedProjection = MessageTypeParser.parseMessageType( + val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleProjection { + |message SampleSource { | optional group foo (LIST) { | repeated group list { | required group element (LIST) { @@ -595,8 +691,229 @@ class ParquetCollectionFormatForwardCompatibilityTests 
extends WordSpec with Mat | optional int32 x; |} """.stripMargin) - val projected = ParquetCollectionFormatForwardCompatibility.forwardCompatibleMessage(requestedProjection, fileType) - projected shouldEqual requestedProjection + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + solved shouldEqual sourceType + } + } + + "Format forward compat: resolving mixed collection" should { + "format map of list" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required group value { + | optional group foo (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | required int32 x; + | } + | } + | } + |} + | + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message ParquetSchema { + | required group map_field (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value { + | optional group foo (LIST) { + | repeated binary foo_tuple (UTF8); + | } + | optional int32 x; + | } + | } + | } + |} + """.stripMargin) + + val solved = ScroogeReadSupport.getSchemaForRead(targetType, sourceType) + val expected = MessageTypeParser.parseMessageType( + """ + |message ParquetSchema { + | required group map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | optional group value { + | optional group foo (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | optional int32 x; + | } + | } + | } + |} + """.stripMargin) + solved shouldEqual expected + } + + "format list of map" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group list_of_map (LIST) { + | repeated group list { + | required group element (MAP) { + | repeated group key_value { + | required binary key 
(UTF8); + | required group value { + | required binary _id (UTF8); + | required double created; + | } + | } + | } + | } + | } + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group list_of_map (LIST) { + | repeated group element_tuple (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value { + | optional double created; + | } + | } + | } + | } + |} + """.stripMargin) + + val solved = ScroogeReadSupport.getSchemaForRead(targetType, sourceType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group list_of_map (LIST) { + | repeated group list { + | required group element (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | optional group value { + | optional double created; + | } + | } + | } + | } + | } + |} + """.stripMargin) + solved shouldEqual expected + } + } + + "Format forward compat: check extra non-optional field projection" should { + "throws on missing (MAP_KEY_VALUE) annotation causing projection of non-existent field" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required int32 value; + | } + | } + |} + """.stripMargin) + // `map` isn't annotated with `MAP_KEY_VALUE`, and is thus treated as + // an actual field which then fails projection + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group map_field (MAP) { + | repeated group map { + | required binary key (UTF8); + | optional int32 value; + | } + | } + |} + """.stripMargin) + + val e = intercept[DecodingSchemaMismatchException] { + ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( + sourceType, + targetType + ) + } + + e.getMessage should include("non-optional source field map:") + } + 
+ "throws on missing `repeated` causing projection of non-existent field" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | optional group foo (LIST) { + | repeated group list { + | required group element { + | required binary zing (UTF8); + | } + | } + | } + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | optional group foo (LIST) { + | required group element { + | optional binary zing (UTF8); + | } + | } + |} + """.stripMargin) + + val e = intercept[DecodingSchemaMismatchException] { + ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + } + + e.getMessage should include("non-optional source field element:") + } + + "throws on required but non-existent in target" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | required int32 value; + | } + | } + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group map_field (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional int32 value; + | required int32 bogus_field; + | } + | } + |} + """.stripMargin) + + val e = intercept[DecodingSchemaMismatchException] { + ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( + sourceType, + targetType + ) + } + + e.getMessage should include("non-optional source field bogus_field:") } } } From b7363b86f99ebd31db853362dbcb2b6b159cfb7a Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Mon, 30 Sep 2019 08:23:19 -0700 Subject: [PATCH 09/34] format import --- .../scalding/parquet/scrooge/ScroogeReadSupportTests.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala index 7cd6053bb5..5e8ed5b541 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala @@ -7,7 +7,7 @@ import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatfor import com.twitter.scalding.typed.TypedPipe import com.twitter.scalding.{Args, Job} import org.apache.parquet.io.InvalidRecordException -import org.apache.parquet.schema.{MessageTypeParser} +import org.apache.parquet.schema.MessageTypeParser import org.scalatest.{Matchers, WordSpec} class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPlatformTest { From 2996317c7a5d1e94029db473f080bddf7a05b06f Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Mon, 30 Sep 2019 08:30:24 -0700 Subject: [PATCH 10/34] improve docs --- .../scrooge/ParquetListFormatRule.scala | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala index 11576ec949..32f47cfc6e 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala @@ -55,6 +55,9 @@ private[scrooge] object ParquetListFormatRule extends ParquetCollectionFormatRul elementType.getOriginalType) } +/** + * Helper to specify supported source/target conversion. 
+ */ private[scrooge] sealed trait SourceOrTarget { def rules: Seq[ParquetListFormatRule] def name: String @@ -76,6 +79,16 @@ private[scrooge] object Target extends SourceOrTarget { override def name: String = "target" } +/** + * Rule allowing conversion from one format to other format by + * 1) detect which format is the repeated list type. + * 2) decompose the repeated type into element and other info. + * 3) construct compliant repeated type from the given element and other info. + * For example, + * if source repeated type matches Rule 1, and target type matches Rule 2. + * Rule 1 will decompose the source type, and + * Rule 2 will take that information to construct repeated element in Rule 2 format. + */ private[scrooge] sealed trait ParquetListFormatRule { def elementType(repeatedType: Type): Type @@ -93,11 +106,11 @@ private[scrooge] sealed trait ParquetListFormatRule { originalType: OriginalType): Type } -/** - * repeated int32 [element|array]; - */ -private[scrooge] sealed trait PrimitiveListRule extends ParquetListFormatRule { +private[scrooge] sealed trait PrimitiveListRule extends ParquetListFormatRule { + /** + * repeated int32 [element|array]; + */ def constantElementName: String override def elementType(repeatedType: Type): Type = repeatedType @@ -128,14 +141,13 @@ private[scrooge] object PrimitiveArrayRule extends PrimitiveListRule { override def constantElementName: String = "array" } -/** - * repeated group [element|array] { - * required binary str (UTF8); - * required int32 num; - * } - */ private[scrooge] sealed trait GroupListRule extends ParquetListFormatRule { - + /** + * repeated group [element|array] { + * required binary str (UTF8); + * required int32 num; + * } + */ def constantElementName: String override def isElementRequired(repeatedType: Type): Boolean = { From e02d0a81b41bcdb96c95c3029488ae6017ffee2a Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Mon, 30 Sep 2019 10:22:12 -0700 Subject: [PATCH 11/34] fix style warning --- 
...CollectionFormatForwardCompatibility.scala | 13 +++--- .../scrooge/ParquetListFormatRule.scala | 46 ++++++++++++------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index 4d0699a0e6..1adc341a62 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -7,6 +7,8 @@ import org.apache.parquet.schema.{GroupType, MessageType, Type} import org.apache.parquet.thrift.DecodingSchemaMismatchException import org.slf4j.LoggerFactory +import scala.collection.mutable + object ParquetCollectionFormatForwardCompatibility { private val LOGGER = LoggerFactory.getLogger(getClass) @@ -86,9 +88,8 @@ object ParquetCollectionFormatForwardCompatibility { val sourceGroup = sourceType.asGroupType val targetGroup = targetType.asGroupType - val resultFields = new util.ArrayList[Type] - import scala.collection.JavaConversions._ - for (sourceField <- sourceGroup.getFields) { + import scala.collection.JavaConverters._ + val resultFields = sourceGroup.getFields.asScala.map { sourceField => if (!targetGroup.containsField(sourceField.getName)) { if (!sourceField.isRepetition(Repetition.OPTIONAL)) { throw new DecodingSchemaMismatchException( @@ -96,15 +97,15 @@ object ParquetCollectionFormatForwardCompatibility { s"not present in the given target type:\n${targetGroup}" ) } - resultFields.add(sourceField) + sourceField } else { val fieldIndex = targetGroup.getFieldIndex(sourceField.getName) val targetField = targetGroup.getFields.get(fieldIndex) - resultFields.add(formatForwardCompatibleType(sourceField, 
targetField)) + formatForwardCompatibleType(sourceField, targetField) } } - sourceGroup.withNewFields(resultFields) + sourceGroup.withNewFields(resultFields.asJava) } } diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala index 32f47cfc6e..b044f31b98 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala @@ -19,25 +19,37 @@ private[scrooge] object ParquetListFormatRule extends ParquetCollectionFormatRul private val LOGGER = LoggerFactory.getLogger(getClass) def formatForwardCompatibleRepeatedType(repeatedSourceType: Type, repeatedTargetType: Type) = { - val sourceRuleMaybe = findFirstListRule(repeatedSourceType, Source) - val targetRuleMaybe = findFirstListRule(repeatedTargetType, Target) - if (sourceRuleMaybe == targetRuleMaybe || sourceRuleMaybe.isEmpty || targetRuleMaybe.isEmpty) repeatedSourceType else { - val sourceRule = sourceRuleMaybe.get - val targetRule = targetRuleMaybe.get - val elementType = sourceRule.elementType(repeatedSourceType) - targetRule.createCompliantRepeatedType( - typ= elementType, - name= elementType.getName, - isElementRequired= sourceRule.isElementRequired(repeatedSourceType), - originalType= sourceRule.elementOriginalType(repeatedSourceType) - ) + ( + findFirstListRule(repeatedSourceType, Source), + findFirstListRule(repeatedTargetType, Target) + ) match { + case (Some(sourceRule), Some(targetRule)) => { + if (sourceRule == targetRule) { + repeatedSourceType + } else { + val elementType = sourceRule.elementType(repeatedSourceType) + targetRule.createCompliantRepeatedType( + typ = elementType, + name = elementType.getName, + isElementRequired = sourceRule.isElementRequired(repeatedSourceType), + 
originalType = sourceRule.elementOriginalType(repeatedSourceType) + ) + } + } + case _ => repeatedSourceType } } def isGroupList(projection: Type): Boolean = { - if (projection.isPrimitive) return false - val groupProjection = projection.asGroupType - (groupProjection.getOriginalType eq OriginalType.LIST) && groupProjection.getFieldCount == 1 && groupProjection.getFields.get(0).isRepetition(Type.Repetition.REPEATED) + if (projection.isPrimitive) { + false + } else { + val groupProjection = projection.asGroupType + groupProjection.getOriginalType == OriginalType.LIST && + groupProjection.getFieldCount == 1 && + groupProjection.getFields.get(0).isRepetition(Type.Repetition.REPEATED) + } + } def findFirstListRule(repeatedType: Type, @@ -222,7 +234,9 @@ private[scrooge] object StandardRule extends ParquetListFormatRule { override def elementType(repeatedType: Type): Type = firstField(repeatedType.asGroupType) - override private[scrooge] def isElementRequired(repeatedType: Type): Boolean = elementType(repeatedType).getRepetition eq Type.Repetition.REQUIRED + override private[scrooge] def isElementRequired(repeatedType: Type): Boolean = { + elementType(repeatedType).getRepetition == Type.Repetition.REQUIRED + } override def elementName(repeatedType: Type): String = "element" From 4d40c06d469d4c5a4cfa32a756e1a31b9f021f9d Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Mon, 30 Sep 2019 23:50:23 -0700 Subject: [PATCH 12/34] address PR feedback --- ...CollectionFormatForwardCompatibility.scala | 142 ++++++------------ ...tRule.scala => ParquetListFormatter.scala} | 89 ++++++----- ...atRule.scala => ParquetMapFormatter.scala} | 35 +++-- 3 files changed, 118 insertions(+), 148 deletions(-) rename scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/{ParquetListFormatRule.scala => ParquetListFormatter.scala} (77%) rename scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/{ParquetMapFormatRule.scala => 
ParquetMapFormatter.scala} (54%) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index 1adc341a62..99656ab927 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -1,26 +1,26 @@ package com.twitter.scalding.parquet.scrooge -import java.util - +import scala.collection.JavaConverters._ import org.apache.parquet.schema.Type.Repetition import org.apache.parquet.schema.{GroupType, MessageType, Type} import org.apache.parquet.thrift.DecodingSchemaMismatchException import org.slf4j.LoggerFactory -import scala.collection.mutable +import scala.reflect.ClassTag object ParquetCollectionFormatForwardCompatibility { - private val LOGGER = LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) /** * Create a forward-compatible schema, using content from source type with format from target type. + * * @param sourceType source type with legacy format * @param targetType target type to which source is converted to */ def formatForwardCompatibleMessage(sourceType: MessageType, targetType: MessageType): MessageType = { val groupResult = formatForwardCompatibleType(sourceType, targetType).asGroupType() - LOGGER.info("Making source schema to be compatible with target" + + logger.debug("Making source schema to be compatible with target" + s"\nSource:\n${sourceType}\nTarget:\n${targetType}\nResult:\n${groupResult}") new MessageType(groupResult.getName, groupResult.getFields) } @@ -32,63 +32,16 @@ object ParquetCollectionFormatForwardCompatibility { * be maintained in the formatted result. 
*/ private def formatForwardCompatibleType(sourceType: Type, targetType: Type): Type = { - (unwrapGroup(sourceType), unwrapGroup(targetType)) match { - case _ if sourceType.isPrimitive || targetType.isPrimitive => - // Base case + (findCollectionGroup(sourceType), findCollectionGroup(targetType)) match { + case (Some(sourceGroup: ListGroup), Some(targetGroup: ListGroup)) => + formatForwardCompatibleCollectionGroup[ListGroup](sourceGroup, targetGroup) + case (Some(sourceGroup: MapGroup), Some(targetGroup: MapGroup)) => + formatForwardCompatibleCollectionGroup[MapGroup](sourceGroup, targetGroup) + case _ if sourceType.isPrimitive || targetType.isPrimitive => // Base case sourceType - case ( - GroupUnwrapped(sourceWrappers, Some(sourceRepeatedListType), None), - GroupUnwrapped(_, Some(targetRepeatedListType), None) - ) => - // Format list - val sourceRule = ParquetListFormatRule.findFirstListRule(sourceRepeatedListType, Source) - val targetRule = ParquetListFormatRule.findFirstListRule(targetRepeatedListType, Target) - - val formattedRepeated = (sourceRule, targetRule) match { - case (Some(sourceRule), Some(targetRule)) => { - val sourceElement = sourceRule.elementType(sourceRepeatedListType) - val targetElement = targetRule.elementType(targetRepeatedListType) - // Recurse on the element instead of `repeated` type because list still can have - // different formats at repeated type - val forwardCompatElement = formatForwardCompatibleType(sourceElement, targetElement) - // Wrap the solved element with current source structure, and do actual conversion work - val forwardCompatRepeated = ParquetListFormatRule.wrapElementAsRepeatedType( - sourceRule, - sourceRepeatedListType, - forwardCompatElement - ) - ParquetListFormatRule.formatForwardCompatibleRepeatedType( - forwardCompatRepeated, - targetRepeatedListType) - } - case _ => sourceRepeatedListType // No-op - } - // Wrapped the formatted repeated type in its original groups, - // describing field name and whether it's 
optional/required - sourceWrappers.foldRight(formattedRepeated) { - (wrapper, group) => wrapper.withNewFields(group) - } - case ( - GroupUnwrapped(sourceWrappers, None, Some(sourceRepeatedMapType)), - GroupUnwrapped(_, None, Some(targetRepeatedMapType)) - ) => - // Format map - val forwardCompatRepeated = formatForwardCompatibleType(sourceRepeatedMapType, targetRepeatedMapType) - val formattedRepeated = ParquetMapFormatRule.formatForwardCompatibleRepeatedType( - forwardCompatRepeated, - targetRepeatedMapType - ) - // Wrapped the formatted repeated type in its original groups, - // describing field name and whether it's optional/required - sourceWrappers.foldRight(formattedRepeated) { - (wrapper, group) => wrapper.withNewFields(group) - } - case _ => - // Field projection + case _ => // Field projection val sourceGroup = sourceType.asGroupType val targetGroup = targetType.asGroupType - - import scala.collection.JavaConverters._ val resultFields = sourceGroup.getFields.asScala.map { sourceField => if (!targetGroup.containsField(sourceField.getName)) { if (!sourceField.isRepetition(Repetition.OPTIONAL)) { @@ -109,45 +62,42 @@ object ParquetCollectionFormatForwardCompatibility { } } - private case class GroupUnwrapped(wrappers: Seq[GroupType], - repeatedListType: Option[Type] = None, - repeatedMapType: Option[Type] = None) + private def formatForwardCompatibleCollectionGroup[T <: CollectionGroup](sourceGroup: T, + targetGroup: T) + (implicit t: ClassTag[T]): GroupType = { - private def unwrapGroup(typ: Type, wrappers: Seq[GroupType] = Seq()): GroupUnwrapped = { - if (typ.isPrimitive) { - GroupUnwrapped( - wrappers, - repeatedListType=None, - repeatedMapType=None - ) - } else if (typ.asGroupType.getFieldCount != 1) { - GroupUnwrapped( - wrappers :+ typ.asGroupType(), - repeatedListType=None, - repeatedMapType=None - ) - } else { - // Note the field count is strictly 1 here, and the wrappers will be used later - // to wrap back the formatted results. 
- if (ParquetListFormatRule.isGroupList(typ)) { - GroupUnwrapped( - wrappers :+ typ.asGroupType(), - repeatedListType=Some(typ.asGroupType.getFields.get(0)), - repeatedMapType = None - ) - } else if (ParquetMapFormatRule.isGroupMap(typ)) { - GroupUnwrapped( - wrappers :+ typ.asGroupType(), - repeatedListType=None, - repeatedMapType=Some(typ.asGroupType.getFields.get(0)) - ) - } else { - unwrapGroup(typ.asGroupType.getFields.get(0), wrappers :+ typ.asGroupType()) - } + val formatter = t.runtimeClass.asInstanceOf[Class[T]] match { + case c if c == classOf[MapGroup] => ParquetMapFormatter + case c if c == classOf[ListGroup] => ParquetListFormatter } + val formattedRepeated = formatter.formatForwardCompatibleRepeatedType( + sourceGroup.repeatedType, + targetGroup.repeatedType, + formatForwardCompatibleType(_, _)) + // Wrapped the formatted repeated type in its original groups, + // describing field name and whether it's optional/required + sourceGroup.groupWrapper.withNewFields(formattedRepeated) + } + + private def findCollectionGroup(typ: Type): Option[CollectionGroup] = { + ParquetListFormatter.extractGroup(typ).orElse(ParquetMapFormatter.extractGroup(typ)) } } -trait ParquetCollectionFormatRule { - def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, targetRepeatedMapType: Type): Type -} \ No newline at end of file +private[scrooge] trait ParquetCollectionFormatter { + def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, + targetRepeatedMapType: Type, + recursiveSolver: (Type, Type) => Type): Type + + def extractGroup(typ: Type): Option[CollectionGroup] +} + +private[scrooge] sealed trait CollectionGroup { + def groupWrapper: GroupType + + def repeatedType: Type +} + +private[scrooge] case class MapGroup(groupWrapper: GroupType, repeatedType: Type) extends CollectionGroup + +private[scrooge] case class ListGroup(groupWrapper: GroupType, repeatedType: Type) extends CollectionGroup \ No newline at end of file diff --git 
a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala similarity index 77% rename from scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala rename to scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index b044f31b98..64a98e3bc3 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatRule.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -6,7 +6,7 @@ import org.apache.parquet.schema.{GroupType, OriginalType, PrimitiveType, Type} import org.slf4j.LoggerFactory /** - * Rule to convert parquet schema of legacy list type to standard one + * Formatter parquet schema of legacy list type to standard one * namely 3-level list structure as recommended in * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists * @@ -14,57 +14,63 @@ import org.slf4j.LoggerFactory * {{@code org.apache.parquet.thrift.ThriftSchemaConvertVisitor}} which always suffix * list element with "_tuple". 
*/ -private[scrooge] object ParquetListFormatRule extends ParquetCollectionFormatRule { +private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter { private val LOGGER = LoggerFactory.getLogger(getClass) - def formatForwardCompatibleRepeatedType(repeatedSourceType: Type, repeatedTargetType: Type) = { + def formatForwardCompatibleRepeatedType(repeatedSourceType: Type, + repeatedTargetType: Type, + recursiveSolver: (Type, Type) => Type) = { ( - findFirstListRule(repeatedSourceType, Source), - findFirstListRule(repeatedTargetType, Target) + findRule(repeatedSourceType, Source), + findRule(repeatedTargetType, Target) ) match { case (Some(sourceRule), Some(targetRule)) => { - if (sourceRule == targetRule) { - repeatedSourceType - } else { - val elementType = sourceRule.elementType(repeatedSourceType) - targetRule.createCompliantRepeatedType( - typ = elementType, - name = elementType.getName, - isElementRequired = sourceRule.isElementRequired(repeatedSourceType), - originalType = sourceRule.elementOriginalType(repeatedSourceType) - ) - } + + val sourceElementType = sourceRule.elementType(repeatedSourceType) + val sourceElementIsRequired = sourceRule.isElementRequired(repeatedSourceType) + val sourceElementOriginalType = sourceRule.elementOriginalType(repeatedSourceType) + + val targetElementType = targetRule.elementType(repeatedTargetType) + val forwardCompatElementType = recursiveSolver(sourceElementType, targetElementType) + + targetRule.createCompliantRepeatedType( + elementType = forwardCompatElementType, + elementName = forwardCompatElementType.getName, + isElementRequired = sourceElementIsRequired, + elementOriginalType = sourceElementOriginalType + ) } + case _ => repeatedSourceType } } - def isGroupList(projection: Type): Boolean = { - if (projection.isPrimitive) { + def extractGroup(typ: Type) : Option[ListGroup] = { + if (isListGroup(typ)) { + Some(ListGroup(typ.asGroupType(), typ.asGroupType().getFields.get(0))) + } else { + None + } + } 
+ + private def isListGroup(typ: Type): Boolean = { + if (typ.isPrimitive) { false } else { - val groupProjection = projection.asGroupType + val groupProjection = typ.asGroupType groupProjection.getOriginalType == OriginalType.LIST && groupProjection.getFieldCount == 1 && groupProjection.getFields.get(0).isRepetition(Type.Repetition.REPEATED) } - } - def findFirstListRule(repeatedType: Type, - sourceOrTarget: SourceOrTarget): Option[ParquetListFormatRule] = { - val ruleFound = sourceOrTarget.rules.find(rule => rule.check(repeatedType)) + private def findRule(repeatedType: Type, + sourceOrTarget: SourceOrTarget): Option[ParquetListFormatRule] = { + val ruleFound = sourceOrTarget.rules.find(rule => rule.appliesToType(repeatedType)) if (ruleFound.isEmpty) LOGGER.warn(s"Unable to find matching rule for ${sourceOrTarget.name} schema:\n$repeatedType") ruleFound } - - def wrapElementAsRepeatedType(rule: ParquetListFormatRule, repeatedType: Type, elementType: Type): Type = rule.createCompliantRepeatedType( - elementType, - rule.elementName(repeatedType), - // if repeated or required, it is required - !elementType.isRepetition(Type.Repetition.OPTIONAL), - elementType.getOriginalType) } /** @@ -110,12 +116,12 @@ private[scrooge] sealed trait ParquetListFormatRule { private[scrooge] def isElementRequired(repeatedType: Type): Boolean - private[scrooge] def check(typ: Type): Boolean + private[scrooge] def appliesToType(repeatedType: Type): Boolean - private[scrooge] def createCompliantRepeatedType(typ: Type, - name: String, + private[scrooge] def createCompliantRepeatedType(elementType: Type, + elementName: String, isElementRequired: Boolean, - originalType: OriginalType): Type + elementOriginalType: OriginalType): Type } @@ -135,7 +141,7 @@ private[scrooge] sealed trait PrimitiveListRule extends ParquetListFormatRule { true } - override def check(repeatedType: Type): Boolean = + override def appliesToType(repeatedType: Type): Boolean = repeatedType.isPrimitive && 
repeatedType.getName == this.constantElementName override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { @@ -174,7 +180,7 @@ private[scrooge] sealed trait GroupListRule extends ParquetListFormatRule { override def elementName(repeatedType: Type): String = this.constantElementName - override def check(repeatedType: Type): Boolean = { + override def appliesToType(repeatedType: Type): Boolean = { if (repeatedType.isPrimitive) false else { val groupType = repeatedType.asGroupType @@ -197,10 +203,13 @@ private[scrooge] object GroupArrayRule extends GroupListRule { } private[scrooge] object TupleRule extends ParquetListFormatRule { - override def check(repeatedType: Type): Boolean = repeatedType.getName.endsWith("_tuple") + private val tupleSuffix = "_tuple" + + override def appliesToType(repeatedType: Type): Boolean = repeatedType.getName.endsWith(tupleSuffix) override def elementName(repeatedType: Type): String = { - repeatedType.getName.substring(0, repeatedType.getName.length - 6) + // Since `appliesToType` + repeatedType.getName.substring(0, repeatedType.getName.length - tupleSuffix.length) } override def elementType(repeatedType: Type): Type = repeatedType @@ -212,7 +221,7 @@ private[scrooge] object TupleRule extends ParquetListFormatRule { } override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { - val suffixed_name = name + "_tuple" + val suffixed_name = name + tupleSuffix if (typ.isPrimitive) new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, suffixed_name, originalType) else new GroupType(Type.Repetition.REPEATED, suffixed_name, originalType, typ.asGroupType.getFields) } @@ -224,7 +233,7 @@ private[scrooge] object StandardRule extends ParquetListFormatRule { * element; * } */ - override def check(repeatedField: Type): Boolean = { + override def appliesToType(repeatedField: 
Type): Boolean = { if (repeatedField.isPrimitive || !(repeatedField.getName == "list")) { false } else { diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatRule.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala similarity index 54% rename from scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatRule.scala rename to scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index 2737e21651..529184b8d4 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatRule.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -3,7 +3,7 @@ package com.twitter.scalding.parquet.scrooge import org.apache.parquet.schema.{OriginalType, Type} /** - * Rule to format parquet schema of legacy map type to standard target + * Format parquet schema of legacy map type to standard target * with repeated type of `key_value` without annotation * as recommended in * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps @@ -12,26 +12,37 @@ import org.apache.parquet.schema.{OriginalType, Type} * {@code org.apache.parquet.schema.ConversionPatterns} has repeated `map` field * annotated with (MAP_KEY_VALUE) */ -private[scrooge] object ParquetMapFormatRule extends ParquetCollectionFormatRule { +private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { /** * Handle legacy type when * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + * * @param sourceRepeatedMapType * @param targetRepeatedMapType - * @return */ - def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, targetRepeatedMapType: Type) = { - val isLegacyToStandardFormat = isStandardRepeatedType(targetRepeatedMapType) && - 
isLegacyRepeatedType(sourceRepeatedMapType) - if (isLegacyToStandardFormat) { - targetRepeatedMapType.asGroupType().withNewFields(sourceRepeatedMapType.asGroupType().getFields) + def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, + targetRepeatedMapType: Type, + recursiveSolver: (Type, Type) => Type) = { + + val solvedRepeatedType = recursiveSolver(sourceRepeatedMapType, targetRepeatedMapType) + if (isLegacyRepeatedType(sourceRepeatedMapType) && + isStandardRepeatedType(targetRepeatedMapType)) { + targetRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) + } else { + solvedRepeatedType + } + } + + def extractGroup(typ: Type): Option[MapGroup] = { + if (isMapGroup(typ)) { + Some(MapGroup(typ.asGroupType(), typ.asGroupType().getFields.get(0))) } else { - sourceRepeatedMapType + None } } - def isGroupMap(typ: Type): Boolean = { + private def isMapGroup(typ: Type): Boolean = { if (typ.isPrimitive) { false } else { @@ -40,12 +51,12 @@ private[scrooge] object ParquetMapFormatRule extends ParquetCollectionFormatRule (groupType.getFieldCount == 1) && groupType.getFields.get(0).isRepetition(Type.Repetition.REPEATED) && (isLegacyRepeatedType(groupType.getFields.get(0)) || - isStandardRepeatedType(groupType.getFields.get(0))) + isStandardRepeatedType(groupType.getFields.get(0))) } } private def isLegacyRepeatedType(repeatedType: Type) = { - ((repeatedType.getName == "map") && (repeatedType.getOriginalType == OriginalType.MAP_KEY_VALUE)) + (repeatedType.getName == "map") && (repeatedType.getOriginalType == OriginalType.MAP_KEY_VALUE) } private def isStandardRepeatedType(repeatedType: Type) = { From 6e551e50d593fbacf3c1c2e89d80fb5e0b091d18 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Mon, 30 Sep 2019 23:51:50 -0700 Subject: [PATCH 13/34] undo import wildcard --- .../twitter/scalding/parquet/scrooge/ScroogeReadSupport.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index 1bc9040ba8..98e994ec6b 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -37,7 +37,9 @@ import org.apache.parquet.thrift.projection.ThriftProjectionException; import org.apache.parquet.thrift.struct.ThriftType; -import java.util.*; +import java.util.List; +import java.util.Map; +import java.util.Set; /** * Read support for Scrooge From 436def83764a72ba934e01ca8e7dea7ae97493fc Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Tue, 1 Oct 2019 00:07:44 -0700 Subject: [PATCH 14/34] fix warning descenents of sealed traits --- .../scrooge/ParquetCollectionFormatForwardCompatibility.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index 99656ab927..b8c1975c94 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -98,6 +98,6 @@ private[scrooge] sealed trait CollectionGroup { def repeatedType: Type } -private[scrooge] case class MapGroup(groupWrapper: GroupType, repeatedType: Type) extends CollectionGroup +private[scrooge] sealed case class MapGroup(groupWrapper: GroupType, repeatedType: Type) extends CollectionGroup -private[scrooge] case class ListGroup(groupWrapper: GroupType, repeatedType: Type) 
extends CollectionGroup \ No newline at end of file +private[scrooge] sealed case class ListGroup(groupWrapper: GroupType, repeatedType: Type) extends CollectionGroup \ No newline at end of file From 7ba2305b5f16f5cbadf05bb746b56cd8c6ef045c Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Tue, 1 Oct 2019 00:07:55 -0700 Subject: [PATCH 15/34] remove duplicate test --- ...ctionFormatForwardCompatibilityTests.scala | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala index 70a0b5dbd3..621ec26820 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -294,42 +294,6 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve list legacy format: format nested x_tuple to nested array" in { - val targetType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { - | required group foo (LIST) { - | repeated group array (LIST) { - | repeated binary array (UTF8); - | } - | } - |} - """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( - """ - |message SampleSource { - | optional group foo (LIST) { - | repeated group foo_tuple (LIST) { - | repeated binary foo_tuple_tuple (UTF8); - | } - | } - |} - """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) - // note optional of result, and field rename - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleSource { - | optional 
group foo (LIST) { - | repeated group array { - | repeated binary array (UTF8); - | } - | } - |} - """.stripMargin) - solved shouldEqual expected - } - "resolve list in group legacy format: format x_tuple to nested 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ From d2f2fb5d4e32b3774ebcb882e5db50a8f50ccf57 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Tue, 1 Oct 2019 00:23:41 -0700 Subject: [PATCH 16/34] improve test names and remove one duplicate --- ...ctionFormatForwardCompatibilityTests.scala | 107 ++++++++---------- 1 file changed, 46 insertions(+), 61 deletions(-) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala index 621ec26820..c7b9621d51 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -13,7 +13,7 @@ import org.scalatest.{Matchers, WordSpec} class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Matchers { "Format forward compat: resolving map format" should { - "map identity: " in { + "map identity" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -32,7 +32,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual targetType } - "map identity: string key, struct value" in { + "map identity from thrift struct: string key, struct value" in { val listType = new ListType(new ThriftField("list", 2, Requirement.REQUIRED, new ThriftType.StringType)) val children = new ThriftField("foo", 3, Requirement.REQUIRED, listType) val mapValueType = new 
StructType(util.Arrays.asList(children), @@ -58,7 +58,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual message } - "map identity: string kye, list string value" in { + "map identity from thrift struct: string kye, list string value" in { val listType = new ListType(new ThriftField("list", 2, Requirement.REQUIRED, new ThriftType.StringType)) val message = schemaFromThriftMap(listType) message shouldEqual MessageTypeParser.parseMessageType( @@ -80,7 +80,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual message } - "resolve map legacy format: original type (MAP_KEY_VALUE) to explicit name key_value" in { + "format map legacy: original type (MAP_KEY_VALUE) to standard format key_value" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -118,7 +118,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve map legacy format: map of map" in { + "format map legacy map of map" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -192,7 +192,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat } "Format forward compat: resolving list format" should { - "resolve list legacy format: format x_tuple to legacy array" in { + "format x_tuple to primitive array" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -211,7 +211,6 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) - // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ |message SampleSource { @@ -223,7 +222,37 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat 
solved shouldEqual expected } - "resolve list legacy format: format x_tuple to 3-level" in { + "format x_tuple to primitive element" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group country_codes (LIST) { + | repeated binary element (UTF8); + | } + | required int32 x; + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | optional group country_codes (LIST) { + | repeated binary country_codes_tuple (UTF8); + | } + |} + """.stripMargin) + val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | optional group country_codes (LIST) { + | repeated binary element (UTF8); + | } + |} + """.stripMargin) + solved shouldEqual expected + } + + "format x_tuple to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -258,7 +287,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve list legacy format: format nested x_tuple to nested legacy array" in { + "format nested x_tuple to group array" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -294,7 +323,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve list in group legacy format: format x_tuple to nested 3-level" in { + "format nested x_tuple to nested 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -337,7 +366,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve: binary array to 3-level nesting" in { + "format binary array to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ |message 
spark_schema { @@ -375,7 +404,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat } - "resolve: identity 3-level" in { + "format 3-level to 3-level (identity)" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -389,10 +418,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(targetType, targetType) solved shouldEqual targetType - } - "resolve nested list: format inner legacy array to 3-level nesting" in { + "format nested primitive array to nested 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -441,7 +469,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve solved struct in list: repeated group element to 3-level nesting" in { + "format element group to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -486,7 +514,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve standard 3-level list to 2-level" in { + "format 3-level to nested primitive array" in { val targetType = MessageTypeParser.parseMessageType( """ |message scalding_schema { @@ -532,7 +560,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve list in group containing list" in { + "format x_tuple in group to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -585,50 +613,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "resolve projection: format 2-level source to 3-level target" in { - val targetType = MessageTypeParser.parseMessageType( - """ - |message spark_schema { 
- | optional group foo (LIST) { - | repeated group list { - | required group element { - | required binary zing (UTF8); - | required binary bar (UTF8); - | required binary baz (UTF8); - | } - | } - | } - |} - """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( - """ - |message SampleSource { - | optional group foo (LIST) { - | repeated group element { - | optional binary bar (UTF8); - | required binary zing (UTF8); - | } - | } - |} - """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) - val expected = MessageTypeParser.parseMessageType( - """ - |message SampleSource { - | optional group foo (LIST) { - | repeated group list { - | required group element { - | optional binary bar (UTF8); - | required binary zing (UTF8); - | } - | } - | } - |} - """.stripMargin) - solved shouldEqual expected - } - - "resolve does not support backward compat: format nested 3-level to x_tuple" in { + "does not format 3-level to x_tuple" in { val targetType = MessageTypeParser.parseMessageType( """ |message scalding_schema { From 9c96d23082f0df072b77b2fa0694ee743aba5058 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Tue, 1 Oct 2019 10:32:31 -0700 Subject: [PATCH 17/34] add docs --- ...CollectionFormatForwardCompatibility.scala | 41 +++++++++++++++---- .../scrooge/ParquetListFormatter.scala | 3 +- .../parquet/scrooge/ParquetMapFormatter.scala | 2 +- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index b8c1975c94..08a7017747 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ 
b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -8,7 +8,23 @@ import org.slf4j.LoggerFactory import scala.reflect.ClassTag -object ParquetCollectionFormatForwardCompatibility { +/** + * Format source parquet schema to have collection types--list and map--in the same structure + * as parquet schema. This is currently used in [[ScroogeReadSupport]] to format source projection + * schema to target file schema from parquet data. + * The sources with different collection format may come from: + * 1) Thrift struct via [[org.apache.parquet.thrift.ThriftSchemaConvertVisitor]] which always + * describe list with `_tuple` format, and map which has `MAP_KEY_VALUE` annotation. + * 2) User-supplied schema string via config key + * [[org.apache.parquet.hadoop.api.ReadSupport.PARQUET_READ_SCHEMA]] + * + * The strategy of this class is to first assume that the source schema is a sub-graph of target + * schema in terms of field names. However, the data types for collection can differ in + * graph structure between the two schemas. We then need to: + * 1) traverse the two schemas until we find the collection type indicated by `repeated` type. + * 2) delegate the collection types found to respective list/map formatter. + */ +private[scrooge] object ParquetCollectionFormatForwardCompatibility { private val logger = LoggerFactory.getLogger(getClass) @@ -74,9 +90,9 @@ object ParquetCollectionFormatForwardCompatibility { sourceGroup.repeatedType, targetGroup.repeatedType, formatForwardCompatibleType(_, _)) - // Wrapped the formatted repeated type in its original groups, - // describing field name and whether it's optional/required - sourceGroup.groupWrapper.withNewFields(formattedRepeated) + // Wrap the formatted repeated type in its original group. 
+ // This maintains the field name, and optional/required information + sourceGroup.groupType.withNewFields(formattedRepeated) } private def findCollectionGroup(typ: Type): Option[CollectionGroup] = { @@ -93,11 +109,22 @@ private[scrooge] trait ParquetCollectionFormatter { } private[scrooge] sealed trait CollectionGroup { - def groupWrapper: GroupType + /** + * Type for the collection. + * For example, given the schema, + * required group my_list (LIST) { + * repeated group list { + * optional binary element (UTF8); + * } + * } + * [[groupType]] refers to this whole schema + * [[repeatedType]] refers to inner `repeated` schema + */ + def groupType: GroupType def repeatedType: Type } -private[scrooge] sealed case class MapGroup(groupWrapper: GroupType, repeatedType: Type) extends CollectionGroup +private[scrooge] sealed case class MapGroup(groupType: GroupType, repeatedType: Type) extends CollectionGroup -private[scrooge] sealed case class ListGroup(groupWrapper: GroupType, repeatedType: Type) extends CollectionGroup \ No newline at end of file +private[scrooge] sealed case class ListGroup(groupType: GroupType, repeatedType: Type) extends CollectionGroup \ No newline at end of file diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index 64a98e3bc3..4bd93a9cdf 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -11,7 +11,7 @@ import org.slf4j.LoggerFactory * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists * * More specifically this handles converting from parquet file created by - * {{@code org.apache.parquet.thrift.ThriftSchemaConvertVisitor}} which always suffix + * 
[[org.apache.parquet.thrift.ThriftSchemaConvertVisitor]] which always suffix * list element with "_tuple". */ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter { @@ -208,7 +208,6 @@ private[scrooge] object TupleRule extends ParquetListFormatRule { override def appliesToType(repeatedType: Type): Boolean = repeatedType.getName.endsWith(tupleSuffix) override def elementName(repeatedType: Type): String = { - // Since `appliesToType` repeatedType.getName.substring(0, repeatedType.getName.length - tupleSuffix.length) } diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index 529184b8d4..8f07ad8125 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -9,7 +9,7 @@ import org.apache.parquet.schema.{OriginalType, Type} * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps * * Source with legacy format created by - * {@code org.apache.parquet.schema.ConversionPatterns} has repeated `map` field + * [[org.apache.parquet.schema.ConversionPatterns]] has repeated `map` field * annotated with (MAP_KEY_VALUE) */ private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { From 09bd4ddb8294a2942e008017557047799427137d Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Wed, 2 Oct 2019 17:15:12 -0700 Subject: [PATCH 18/34] Check on schema type mismatch --- ...CollectionFormatForwardCompatibility.scala | 9 +++-- ...ctionFormatForwardCompatibilityTests.scala | 33 +++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala 
b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index 08a7017747..2c8ffb8457 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -49,12 +49,17 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { */ private def formatForwardCompatibleType(sourceType: Type, targetType: Type): Type = { (findCollectionGroup(sourceType), findCollectionGroup(targetType)) match { + case _ if sourceType.isPrimitive && targetType.isPrimitive => + sourceType + case _ if sourceType.isPrimitive != targetType.isPrimitive => + throw new DecodingSchemaMismatchException( + s"Found schema mismatch between source type ${sourceType.getName}:\n$sourceType\n\n" + + s"and target type:\n${targetType}" + ) case (Some(sourceGroup: ListGroup), Some(targetGroup: ListGroup)) => formatForwardCompatibleCollectionGroup[ListGroup](sourceGroup, targetGroup) case (Some(sourceGroup: MapGroup), Some(targetGroup: MapGroup)) => formatForwardCompatibleCollectionGroup[MapGroup](sourceGroup, targetGroup) - case _ if sourceType.isPrimitive || targetType.isPrimitive => // Base case - sourceType case _ => // Field projection val sourceGroup = sourceType.asGroupType val targetGroup = targetType.asGroupType diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala index c7b9621d51..56f6eaf729 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala +++ 
b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -865,4 +865,37 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat e.getMessage should include("non-optional source field bogus_field:") } } + + "Schema mismatch" should { + "throws exception" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group foo { + | repeated group bar { + | required binary _id (UTF8); + | required double created; + | } + | } + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group foo { + | required binary bar (UTF8); + | } + |} + """.stripMargin) + + val e = intercept[DecodingSchemaMismatchException] { + ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( + targetType, + sourceType + ) + } + + e.getMessage should include("Found schema mismatch") + } + } } From 83b68980de6f5c4666f4a2b6fc0746f5c1c9e236 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Wed, 2 Oct 2019 19:24:09 -0700 Subject: [PATCH 19/34] explicit rename from source/target to projected read schema and file schema --- .../parquet/scrooge/ScroogeReadSupport.java | 2 +- ...CollectionFormatForwardCompatibility.scala | 121 ++++++++++-------- ...ctionFormatForwardCompatibilityTests.scala | 48 +++---- 3 files changed, 91 insertions(+), 80 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index 98e994ec6b..181913a304 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -132,7 +132,7 @@ public static MessageType 
getSchemaForRead(MessageType fileMessageType, String p */ public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { assertGroupsAreCompatible(fileMessageType, projectedMessageType); - return ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( + return ParquetCollectionFormatForwardCompatibility.projectFileSchema( projectedMessageType, fileMessageType ); } diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index 2c8ffb8457..d4e20b7630 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -9,18 +9,19 @@ import org.slf4j.LoggerFactory import scala.reflect.ClassTag /** - * Format source parquet schema to have collection types--list and map--in the same structure - * as parquet schema. This is currently used in [[ScroogeReadSupport]] to format source projection - * schema to target file schema from parquet data. - * The sources with different collection format may come from: + * Project file schema to have collection types--list and map--in the same structure + * as projected read schema. This is currently used in [[ScroogeReadSupport]] where projected + * read schema can come from: * 1) Thrift struct via [[org.apache.parquet.thrift.ThriftSchemaConvertVisitor]] which always * describe list with `_tuple` format, and map which has `MAP_KEY_VALUE` annotation. 
* 2) User-supplied schema string via config key * [[org.apache.parquet.hadoop.api.ReadSupport.PARQUET_READ_SCHEMA]] * - * The strategy of this class is to first assume that the source schema is a sub-graph of target - * schema in terms of field names. However, the data types for collection can differ in - * graph structure between the two schemas. We then need to: + * The strategy of this class is to first assume that the projected read schema is a "sub-graph" of + * file schema in terms of field names. (An optional field of the projected read schema that is missing from the file schema is allowed, and is + * kept as-is in the projected file schema.) However, the data types for collection can differ in + * graph structure between the two schemas. + * We thus need to: * 1) traverse the two schemas until we find the collection type indicated by `repeated` type. * 2) delegate the collection types found to respective list/map formatter. */ @@ -29,87 +30,97 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { private val logger = LoggerFactory.getLogger(getClass) /** - * Create a forward-compatible schema, using content from source type with format from target type. + * Project file schema to contain the same fields as the given projected read schema. + * The result projected file schema should have the same optional/required fields as the + * projected read schema, but maintain collection type format for the file schema.
* - * @param sourceType source type with legacy format - * @param targetType target type to which source is converted to + * @param projectedReadSchema read schema specifying field projection + * @param fileSchema file schema to be projected */ - def formatForwardCompatibleMessage(sourceType: MessageType, targetType: MessageType): MessageType = { - val groupResult = formatForwardCompatibleType(sourceType, targetType).asGroupType() - logger.debug("Making source schema to be compatible with target" + - s"\nSource:\n${sourceType}\nTarget:\n${targetType}\nResult:\n${groupResult}") - new MessageType(groupResult.getName, groupResult.getFields) + def projectFileSchema(projectedReadSchema: MessageType, fileSchema: MessageType): MessageType = { + val projectedFileSchema = projectFileType(projectedReadSchema, fileSchema).asGroupType() + logger.debug(s"Projected read schema:\n${projectedReadSchema}\n" + + s"File schema:\n${fileSchema}\n" + + s"Projected file schema:\n${projectedFileSchema}") + new MessageType(projectedFileSchema.getName, projectedFileSchema.getFields) } /** - * Traverse source/target schemas and format nodes of list or map. - * The formatting is not to one-to-one node swapping from source to target, - * this is because the subset fields of source node and its optional/required must - * be maintained in the formatted result. + * Traverse given schemas and format node for list or map of projected read type to structure + * of file schema. The formatting is not to one-to-one node swapping between the two schemas + * because of the projection requirement. 
*/ - private def formatForwardCompatibleType(sourceType: Type, targetType: Type): Type = { - (findCollectionGroup(sourceType), findCollectionGroup(targetType)) match { - case _ if sourceType.isPrimitive && targetType.isPrimitive => - sourceType - case _ if sourceType.isPrimitive != targetType.isPrimitive => + private def projectFileType(projectedReadType: Type, fileType: Type): Type = { + (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match { + case _ if projectedReadType.isPrimitive && fileType.isPrimitive => + projectedReadType + case _ if projectedReadType.isPrimitive != fileType.isPrimitive => throw new DecodingSchemaMismatchException( - s"Found schema mismatch between source type ${sourceType.getName}:\n$sourceType\n\n" + - s"and target type:\n${targetType}" + s"Found schema mismatch between projected read type:\n$projectedReadType\n" + + s"and file type:\n${fileType}" ) - case (Some(sourceGroup: ListGroup), Some(targetGroup: ListGroup)) => - formatForwardCompatibleCollectionGroup[ListGroup](sourceGroup, targetGroup) - case (Some(sourceGroup: MapGroup), Some(targetGroup: MapGroup)) => - formatForwardCompatibleCollectionGroup[MapGroup](sourceGroup, targetGroup) + case (Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => + projectFileGroup[ListGroup](projectedReadGroup, fileGroup) + case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => + projectFileGroup[MapGroup](projectedReadGroup, fileGroup) case _ => // Field projection - val sourceGroup = sourceType.asGroupType - val targetGroup = targetType.asGroupType - val resultFields = sourceGroup.getFields.asScala.map { sourceField => - if (!targetGroup.containsField(sourceField.getName)) { - if (!sourceField.isRepetition(Repetition.OPTIONAL)) { + val projectedReadGroupType = projectedReadType.asGroupType + val fileGroupType = fileType.asGroupType + val projectedReadFields = projectedReadGroupType.getFields.asScala.map { projectedReadField => + if 
(!fileGroupType.containsField(projectedReadField.getName)) { + if (!projectedReadField.isRepetition(Repetition.OPTIONAL)) { throw new DecodingSchemaMismatchException( - s"Found non-optional source field ${sourceField.getName}:\n$sourceField\n\n" + - s"not present in the given target type:\n${targetGroup}" + s"Found non-optional projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + + s"not present in the given file group type:\n${fileGroupType}" ) } - sourceField + projectedReadField } else { - val fieldIndex = targetGroup.getFieldIndex(sourceField.getName) - val targetField = targetGroup.getFields.get(fieldIndex) - formatForwardCompatibleType(sourceField, targetField) + val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName) + val fileField = fileGroupType.getFields.get(fileFieldIndex) + projectFileType(projectedReadField, fileField) } } - sourceGroup.withNewFields(resultFields.asJava) + projectedReadGroupType.withNewFields(projectedReadFields.asJava) } } - private def formatForwardCompatibleCollectionGroup[T <: CollectionGroup](sourceGroup: T, - targetGroup: T) - (implicit t: ClassTag[T]): GroupType = { + private def projectFileGroup[T <: CollectionGroup](projectedReadGroup: T, + fileGroup: T)(implicit t: ClassTag[T]): GroupType = { val formatter = t.runtimeClass.asInstanceOf[Class[T]] match { case c if c == classOf[MapGroup] => ParquetMapFormatter case c if c == classOf[ListGroup] => ParquetListFormatter } - val formattedRepeated = formatter.formatForwardCompatibleRepeatedType( - sourceGroup.repeatedType, - targetGroup.repeatedType, - formatForwardCompatibleType(_, _)) - // Wrap the formatted repeated type in its original group. 
- // This maintains the field name, and optional/required information - sourceGroup.groupType.withNewFields(formattedRepeated) + val projectedFileRepeatedType = formatter.formatForwardCompatibleRepeatedType( + projectedReadGroup.repeatedType, + fileGroup.repeatedType, + projectFileType(_, _)) + // Respect optional/required from the projected read group. + projectedReadGroup.groupType.withNewFields(projectedFileRepeatedType) } - private def findCollectionGroup(typ: Type): Option[CollectionGroup] = { + private def extractCollectionGroup(typ: Type): Option[CollectionGroup] = { ParquetListFormatter.extractGroup(typ).orElse(ParquetMapFormatter.extractGroup(typ)) } } private[scrooge] trait ParquetCollectionFormatter { - def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, - targetRepeatedMapType: Type, + /** + * Format source repeated type in the structure of target repeated type. + * @param sourceRepeatedType repeated type from which the formatted result get content + * @param targetRepeatedType repeated type from which the formatted result get the structure + * @param recursiveSolver solver for the inner content of the repeated type + * @return formatted result + */ + def formatForwardCompatibleRepeatedType(sourceRepeatedType: Type, + targetRepeatedType: Type, recursiveSolver: (Type, Type) => Type): Type + /** + * Extract collection group containing repeated type of different formats. 
+ */ def extractGroup(typ: Type): Option[CollectionGroup] } diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala index 56f6eaf729..63e7b0ff8c 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -28,7 +28,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(targetType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(targetType, targetType) solved shouldEqual targetType } @@ -54,7 +54,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(message, message) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(message, message) solved shouldEqual message } @@ -76,7 +76,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin ) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(message, message) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(message, message) solved shouldEqual message } @@ -103,7 +103,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = 
ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ |message SampleSource { @@ -156,7 +156,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ |message SampleSource { @@ -210,7 +210,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ |message SampleSource { @@ -240,7 +240,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ |message SampleSource { @@ -272,7 +272,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ @@ -308,7 +308,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved 
= ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ @@ -348,7 +348,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ |message SampleSource { @@ -388,7 +388,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ @@ -416,7 +416,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(targetType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(targetType, targetType) solved shouldEqual targetType } @@ -450,7 +450,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ @@ -496,7 +496,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat 
| } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ @@ -543,7 +543,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ @@ -592,7 +592,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ |message SampleSource { @@ -640,7 +640,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | optional int32 x; |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) solved shouldEqual sourceType } } @@ -790,13 +790,13 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( + ParquetCollectionFormatForwardCompatibility.projectFileSchema( sourceType, targetType ) } - e.getMessage should include("non-optional source field map:") + e.getMessage should include("non-optional projected read field map:") } "throws on missing 
`repeated` causing projection of non-existent field" in { @@ -824,10 +824,10 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage(sourceType, targetType) + ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) } - e.getMessage should include("non-optional source field element:") + e.getMessage should include("non-optional projected read field element:") } "throws on required but non-existent in target" in { @@ -856,13 +856,13 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( + ParquetCollectionFormatForwardCompatibility.projectFileSchema( sourceType, targetType ) } - e.getMessage should include("non-optional source field bogus_field:") + e.getMessage should include("non-optional projected read field bogus_field:") } } @@ -889,7 +889,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.formatForwardCompatibleMessage( + ParquetCollectionFormatForwardCompatibility.projectFileSchema( targetType, sourceType ) From d0ed5d620fffa7ebb85b3d5e6ecc4cb38c841765 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Thu, 3 Oct 2019 09:53:36 -0700 Subject: [PATCH 20/34] support creating _tuple format and generalize compat in all directions --- ...CollectionFormatForwardCompatibility.scala | 35 +++++--- .../scrooge/ParquetListFormatter.scala | 80 +++++++------------ .../parquet/scrooge/ParquetMapFormatter.scala | 14 ++-- 3 files changed, 60 insertions(+), 69 deletions(-) diff --git 
a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index d4e20b7630..984f475992 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -38,7 +38,7 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { * @param fileSchema file schema to be projected */ def projectFileSchema(projectedReadSchema: MessageType, fileSchema: MessageType): MessageType = { - val projectedFileSchema = projectFileType(projectedReadSchema, fileSchema).asGroupType() + val projectedFileSchema = projectFileType(projectedReadSchema, fileSchema, FieldContext()).asGroupType() logger.debug(s"Projected read schema:\n${projectedReadSchema}\n" + s"File schema:\n${fileSchema}\n" + s"Projected file schema:\n${projectedFileSchema}") @@ -50,7 +50,7 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { * of file schema. The formatting is not to one-to-one node swapping between the two schemas * because of the projection requirement. 
*/ - private def projectFileType(projectedReadType: Type, fileType: Type): Type = { + private def projectFileType(projectedReadType: Type, fileType: Type, fieldContext: FieldContext): Type = { (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match { case _ if projectedReadType.isPrimitive && fileType.isPrimitive => projectedReadType @@ -60,9 +60,9 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { s"and file type:\n${fileType}" ) case (Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => - projectFileGroup[ListGroup](projectedReadGroup, fileGroup) + projectFileGroup[ListGroup](projectedReadGroup, fileGroup, fieldContext) case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => - projectFileGroup[MapGroup](projectedReadGroup, fileGroup) + projectFileGroup[MapGroup](projectedReadGroup, fileGroup, fieldContext) case _ => // Field projection val projectedReadGroupType = projectedReadType.asGroupType val fileGroupType = fileType.asGroupType @@ -79,7 +79,7 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { else { val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName) val fileField = fileGroupType.getFields.get(fileFieldIndex) - projectFileType(projectedReadField, fileField) + projectFileType(projectedReadField, fileField, FieldContext(projectedReadField.getName)) } } projectedReadGroupType.withNewFields(projectedReadFields.asJava) @@ -87,16 +87,21 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { } private def projectFileGroup[T <: CollectionGroup](projectedReadGroup: T, - fileGroup: T)(implicit t: ClassTag[T]): GroupType = { + fileGroup: T, + fieldContext: FieldContext)(implicit t: ClassTag[T]): GroupType = { - val formatter = t.runtimeClass.asInstanceOf[Class[T]] match { - case c if c == classOf[MapGroup] => ParquetMapFormatter - case c if c == classOf[ListGroup] => ParquetListFormatter + val (formatter, 
updatedFieldContext) = t.runtimeClass.asInstanceOf[Class[T]] match { + case c if c == classOf[MapGroup] => + (ParquetMapFormatter, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1)) + case c if c == classOf[ListGroup] => + (ParquetListFormatter, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1)) } + val projectedFileRepeatedType = formatter.formatForwardCompatibleRepeatedType( projectedReadGroup.repeatedType, fileGroup.repeatedType, - projectFileType(_, _)) + updatedFieldContext, + projectFileType(_, _, _)) // Respect optional/required from the projected read group. projectedReadGroup.groupType.withNewFields(projectedFileRepeatedType) } @@ -116,7 +121,8 @@ private[scrooge] trait ParquetCollectionFormatter { */ def formatForwardCompatibleRepeatedType(sourceRepeatedType: Type, targetRepeatedType: Type, - recursiveSolver: (Type, Type) => Type): Type + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type): Type /** * Extract collection group containing repeated type of different formats. @@ -124,6 +130,13 @@ private[scrooge] trait ParquetCollectionFormatter { def extractGroup(typ: Type): Option[CollectionGroup] } +/** + * Helper class to carry information from the field. Currently it only contains specific to list collection + * @param name field name + * @param nestedListLevel li + */ +private[scrooge] case class FieldContext(name: String="", nestedListLevel: Int=0) + private[scrooge] sealed trait CollectionGroup { /** * Type for the collection. 
diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index 4bd93a9cdf..c96069eb8e 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -18,27 +18,31 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter private val LOGGER = LoggerFactory.getLogger(getClass) + private val rules: Seq[ParquetListFormatRule] = Seq( + PrimitiveElementRule, PrimitiveArrayRule, + GroupElementRule, GroupArrayRule, + TupleRule, StandardRule + ) + def formatForwardCompatibleRepeatedType(repeatedSourceType: Type, repeatedTargetType: Type, - recursiveSolver: (Type, Type) => Type) = { + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type) = { ( - findRule(repeatedSourceType, Source), - findRule(repeatedTargetType, Target) + findRule(repeatedSourceType), + findRule(repeatedTargetType) ) match { case (Some(sourceRule), Some(targetRule)) => { - val sourceElementType = sourceRule.elementType(repeatedSourceType) - val sourceElementIsRequired = sourceRule.isElementRequired(repeatedSourceType) - val sourceElementOriginalType = sourceRule.elementOriginalType(repeatedSourceType) - val targetElementType = targetRule.elementType(repeatedTargetType) - val forwardCompatElementType = recursiveSolver(sourceElementType, targetElementType) + val forwardCompatElementType = recursiveSolver(sourceElementType, targetElementType, fieldContext) targetRule.createCompliantRepeatedType( elementType = forwardCompatElementType, - elementName = forwardCompatElementType.getName, - isElementRequired = sourceElementIsRequired, - elementOriginalType = sourceElementOriginalType + elementName = 
sourceRule.elementName(repeatedSourceType), + isElementRequired = sourceRule.isElementRequired(repeatedSourceType), + elementOriginalType = sourceRule.elementOriginalType(repeatedSourceType), + fieldContext=fieldContext ) } @@ -65,38 +69,13 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter } } - private def findRule(repeatedType: Type, - sourceOrTarget: SourceOrTarget): Option[ParquetListFormatRule] = { - val ruleFound = sourceOrTarget.rules.find(rule => rule.appliesToType(repeatedType)) - if (ruleFound.isEmpty) LOGGER.warn(s"Unable to find matching rule for ${sourceOrTarget.name} schema:\n$repeatedType") + private def findRule(repeatedType: Type): Option[ParquetListFormatRule] = { + val ruleFound = rules.find(rule => rule.appliesToType(repeatedType)) + if (ruleFound.isEmpty) logger.warn(s"Unable to find matching rule for repeated type:\n$repeatedType") ruleFound } } -/** - * Helper to specify supported source/target conversion. - */ -private[scrooge] sealed trait SourceOrTarget { - def rules: Seq[ParquetListFormatRule] - def name: String -} - -private[scrooge] object Source extends SourceOrTarget { - override val rules: Seq[ParquetListFormatRule] = Seq( - PrimitiveElementRule, PrimitiveArrayRule, - GroupElementRule, GroupArrayRule, - TupleRule, StandardRule - ) - - override def name: String = "source" -} - -private[scrooge] object Target extends SourceOrTarget { - override def rules: Seq[ParquetListFormatRule] = Source.rules.filterNot(_ == TupleRule) - - override def name: String = "target" -} - /** * Rule allowing conversion from one format to other format by * 1) detect which format is the repeated list type. 
@@ -121,7 +100,8 @@ private[scrooge] sealed trait ParquetListFormatRule { private[scrooge] def createCompliantRepeatedType(elementType: Type, elementName: String, isElementRequired: Boolean, - elementOriginalType: OriginalType): Type + elementOriginalType: OriginalType, + fieldContext: FieldContext): Type } @@ -144,9 +124,9 @@ private[scrooge] sealed trait PrimitiveListRule extends ParquetListFormatRule { override def appliesToType(repeatedType: Type): Boolean = repeatedType.isPrimitive && repeatedType.getName == this.constantElementName - override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { - if (!isElementRequired) throw new IllegalArgumentException("Primitive list format can only take required element") - if (!typ.isPrimitive) throw new IllegalArgumentException(String.format("Primitive list format cannot take group, but is given %s", typ)) + override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { + if (!isElementRequired) throw new IllegalArgumentException(s"Primitive ${constantElementName} list format can only take required element") + if (!typ.isPrimitive) throw new IllegalArgumentException(s"Primitive list format cannot take group, but is given $typ") new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, this.constantElementName, originalType) } } @@ -188,9 +168,10 @@ private[scrooge] sealed trait GroupListRule extends ParquetListFormatRule { } } - override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { - if (typ.isPrimitive) new GroupType(Type.Repetition.REPEATED, this.constantElementName, typ) - else new GroupType(Type.Repetition.REPEATED, this.constantElementName, typ.asGroupType.getFields) + override def createCompliantRepeatedType(typ: Type, name: String, 
isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { + if (!isElementRequired) throw new IllegalArgumentException(s"Group ${constantElementName} list format can only take required element") + if (typ.isPrimitive) throw new IllegalArgumentException(s"Group list format cannot take primitive type, but is given $typ") + else new GroupType(Type.Repetition.REPEATED, this.constantElementName, originalType, typ.asGroupType.getFields) } } @@ -219,8 +200,9 @@ private[scrooge] object TupleRule extends ParquetListFormatRule { true } - override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { - val suffixed_name = name + tupleSuffix + override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { + // nested list has type name of the form: `field_original_name_tuple_tuple..._tuple` for the depth of list + val suffixed_name = (List(fieldContext.name) ++ (1 to fieldContext.nestedListLevel).toList.map(_ => "tuple")).mkString("_") if (typ.isPrimitive) new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, suffixed_name, originalType) else new GroupType(Type.Repetition.REPEATED, suffixed_name, originalType, typ.asGroupType.getFields) } @@ -248,7 +230,7 @@ private[scrooge] object StandardRule extends ParquetListFormatRule { override def elementName(repeatedType: Type): String = "element" - override def createCompliantRepeatedType(originalElementType: Type, name: String, isElementRequired: Boolean, originalType: OriginalType): Type = { + override def createCompliantRepeatedType(originalElementType: Type, name: String, isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { val repetition = if (isElementRequired) Type.Repetition.REQUIRED else Type.Repetition.OPTIONAL val elementType = if 
(originalElementType.isPrimitive) { diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index 8f07ad8125..41a452be10 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -15,7 +15,7 @@ import org.apache.parquet.schema.{OriginalType, Type} private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { /** - * Handle legacy type when + * Handle map format compatibility * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps * * @param sourceRepeatedMapType @@ -23,15 +23,11 @@ private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { */ def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, targetRepeatedMapType: Type, - recursiveSolver: (Type, Type) => Type) = { + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type) = { - val solvedRepeatedType = recursiveSolver(sourceRepeatedMapType, targetRepeatedMapType) - if (isLegacyRepeatedType(sourceRepeatedMapType) && - isStandardRepeatedType(targetRepeatedMapType)) { - targetRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) - } else { - solvedRepeatedType - } + val solvedRepeatedType = recursiveSolver(sourceRepeatedMapType, targetRepeatedMapType, fieldContext) + targetRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) } def extractGroup(typ: Type): Option[MapGroup] = { From 5de8b64cbbf84c41481f5aada102d0aacd058e58 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Thu, 3 Oct 2019 10:03:17 -0700 Subject: [PATCH 21/34] support legacy spark list of nullable elements --- .../scrooge/ParquetListFormatter.scala | 
71 +++++++++++++++---- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index c96069eb8e..93467e9bad 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -21,7 +21,7 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter private val rules: Seq[ParquetListFormatRule] = Seq( PrimitiveElementRule, PrimitiveArrayRule, GroupElementRule, GroupArrayRule, - TupleRule, StandardRule + TupleRule, StandardRule, SparkLegacyNullableElementRule ) def formatForwardCompatibleRepeatedType(repeatedSourceType: Type, @@ -208,17 +208,18 @@ private[scrooge] object TupleRule extends ParquetListFormatRule { } } -private[scrooge] object StandardRule extends ParquetListFormatRule { - /** - * repeated group list { - * element; - * } - */ + +private[scrooge] sealed trait ThreeLevelRule extends ParquetListFormatRule { + + def constantElementName: String + + def constantRepeatedGroupName: String + override def appliesToType(repeatedField: Type): Boolean = { - if (repeatedField.isPrimitive || !(repeatedField.getName == "list")) { + if (repeatedField.isPrimitive || !(repeatedField.getName == constantRepeatedGroupName)) { false } else { - elementType(repeatedField).getName == "element" + elementType(repeatedField).getName == constantElementName } } @@ -228,25 +229,69 @@ private[scrooge] object StandardRule extends ParquetListFormatRule { elementType(repeatedType).getRepetition == Type.Repetition.REQUIRED } - override def elementName(repeatedType: Type): String = "element" + override def elementName(repeatedType: Type): String = constantElementName override def 
createCompliantRepeatedType(originalElementType: Type, name: String, isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { val repetition = if (isElementRequired) Type.Repetition.REQUIRED else Type.Repetition.OPTIONAL val elementType = if (originalElementType.isPrimitive) { - new PrimitiveType(repetition, originalElementType.asPrimitiveType.getPrimitiveTypeName, "element", originalType) + new PrimitiveType(repetition, originalElementType.asPrimitiveType.getPrimitiveTypeName, constantElementName, originalType) } else { new GroupType( repetition, - "element", + constantElementName, originalType, originalElementType.asGroupType.getFields) } - new GroupType(Type.Repetition.REPEATED, "list", util.Arrays.asList(elementType)) + new GroupType(Type.Repetition.REPEATED, constantRepeatedGroupName, util.Arrays.asList(elementType)) } private def firstField(groupType: GroupType): Type = { groupType.getFields.get(0) } } + +/** + * Standard parquet list format. + * repeated group list { + * element; + * } + */ +private[scrooge] object StandardRule extends ThreeLevelRule { + + def constantElementName = "element" + + def constantRepeatedGroupName = "list" +} + +/** + * Spark legacy format when element is nullable. + * repeated group bag { + * optional array; + * } + * Documentation on Spark is incorrect at the time of writing. It indicates `optional group bag`, + * but it should be `repeated group bag`, and optional element. 
+ * https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala#L345-L355 + * Writing Dataset[Seq[List]] in Spark with default encoder and legacy mode on will give + * + * message spark_schema { + * optional group value (LIST) { + * repeated group bag { + * optional binary array (UTF8); + * } + * } + * } + */ +private[scrooge] object SparkLegacyNullableElementRule extends ThreeLevelRule { + override def constantElementName: String = "array" + + override def constantRepeatedGroupName: String = "bag" + + override def createCompliantRepeatedType(originalElementType: Type, name: String, isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { + if (isElementRequired) { + throw new IllegalArgumentException(s"Spark legacy mode for nullable element cannot take required element. Found: ${originalElementType}") + } + super.createCompliantRepeatedType(originalElementType, name, isElementRequired, originalType, fieldContext) + } +} From db937c3d84a75d44de19cdb41d419bf244925eea Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Thu, 3 Oct 2019 10:03:31 -0700 Subject: [PATCH 22/34] improve docs --- .../scrooge/ParquetListFormatter.scala | 49 ++++++++++++------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index 93467e9bad..b4370ee93c 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -16,7 +16,7 @@ import org.slf4j.LoggerFactory */ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter { - private val LOGGER = 
LoggerFactory.getLogger(getClass) + private val logger = LoggerFactory.getLogger(getClass) private val rules: Seq[ParquetListFormatRule] = Seq( PrimitiveElementRule, PrimitiveArrayRule, @@ -104,19 +104,20 @@ private[scrooge] sealed trait ParquetListFormatRule { fieldContext: FieldContext): Type } - +/** + * Rule 1 in https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + * Although documentation only mentions `element` primitive and not for `array`, + * Spark does write out with primitive `array` when legacy write format is enabled. + * repeated int32 [element|array]; + */ private[scrooge] sealed trait PrimitiveListRule extends ParquetListFormatRule { - /** - * repeated int32 [element|array]; - */ + def constantElementName: String override def elementType(repeatedType: Type): Type = repeatedType override private[scrooge] def isElementRequired(repeatedType: Type) = { - // According to Rule 1 from, - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules - // "the repeated field is not a group, + // According to rule 1, "the repeated field is not a group, // then its type is the element type and elements are required." true } @@ -139,18 +140,21 @@ private[scrooge] object PrimitiveArrayRule extends PrimitiveListRule { override def constantElementName: String = "array" } +/** + * Rule 2 in https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + * Although documentation only mentions `element` group and not for `array`, + * Spark does write out with group `array` when legacy write format is enabled. 
+ * repeated group [element|array] { + * required binary str (UTF8); + * required int32 num; + * } + */ private[scrooge] sealed trait GroupListRule extends ParquetListFormatRule { - /** - * repeated group [element|array] { - * required binary str (UTF8); - * required int32 num; - * } - */ + def constantElementName: String override def isElementRequired(repeatedType: Type): Boolean = { - // According Rule 2 from - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + // According Rule 2, // "If the repeated field is a group with multiple fields, // then its type is the element type and elements are required." true @@ -183,7 +187,18 @@ private[scrooge] object GroupArrayRule extends GroupListRule { override def constantElementName: String = "array" } +/** + * Rule 3 in https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + * Although the documentation only mentions group with one field, the generated schema from thrift struct + * does write out both primitive type and group type with multiple fields. + * repeated group my_list_field_tuple { + * required binary str (UTF8); + * } + * This repeated type implies the field name is `my_list_field`. This is the only format where + * info is not fully self-contained. 
+ */ private[scrooge] object TupleRule extends ParquetListFormatRule { + private val tupleSuffix = "_tuple" override def appliesToType(repeatedType: Type): Boolean = repeatedType.getName.endsWith(tupleSuffix) @@ -195,8 +210,6 @@ private[scrooge] object TupleRule extends ParquetListFormatRule { override def elementType(repeatedType: Type): Type = repeatedType override private[scrooge] def isElementRequired(repeatedType: Type) = { - // According to Rule 3 from - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules true } From 0c38af848372306aaf559fae2fb3527e0137e5fe Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Thu, 3 Oct 2019 10:03:54 -0700 Subject: [PATCH 23/34] add tests for all supported list compat conversions --- ...ctionFormatForwardCompatibilityTests.scala | 337 +++++++++++++++++- 1 file changed, 333 insertions(+), 4 deletions(-) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala index 63e7b0ff8c..fd1960a5af 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -12,6 +12,240 @@ import org.scalatest.{Matchers, WordSpec} class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Matchers { + /** + * Helper wrapper to specify repetition string for exhaustive tests + */ + case class TestRepetitions(projectedReadRepetition1: String, projectedReadRepetition2: String, + fileRepetition1: String, fileRepetition2: String) + def feasibleRepetitions = { + for { + projectedRepetition1 <- Seq("required", "optional") + projectedRepetition2 <- 
Seq("required", "optional") + fileRepetition1 <- Seq("required", "optional") + fileRepetition2 <- Seq("required", "optional") + // when file type is optional, required projected type is breaking + if !(fileRepetition1 == "optional" && projectedRepetition1 == "required") + if !(fileRepetition2 == "optional" && projectedRepetition2 == "required") + } yield { + TestRepetitions(projectedRepetition1, projectedRepetition2, fileRepetition1, fileRepetition2) + } + } + + /** + * The following functions of different list formats are equivalent schemas to describe: + * {{ + * x: Int + * foo_string_list: Seq[Int] + * foo_struct_list: Option[Seq[Struct]] + * foo_list_of_list: Seq[Seq[Long]] + * y: Int + * foo_optional_list: + * }} + */ + def listElementRule(repetition1: String, repetition2: String) = ( + s""" + |message schema { + | $repetition1 int32 x; + | required group foo_string_list (LIST) { + | repeated int32 element; + | } + | optional group foo_struct_list (LIST) { + | repeated group element { + | required binary str (UTF8); + | ${repetition2} int32 num; + | } + | } + | required group foo_list_of_list (LIST) { + | repeated group element (LIST) { + | repeated int64 element; + | } + | } + | $repetition2 int32 y; + |} + """.stripMargin) + + def listArrayRule(repetition1: String, repetition2: String) = ( + s""" + |message schema { + | $repetition1 int32 x; + | required group foo_string_list (LIST) { + | repeated int32 array; + | } + | optional group foo_struct_list (LIST) { + | repeated group array { + | required binary str (UTF8); + | ${repetition2} int32 num; + | } + | } + | required group foo_list_of_list (LIST) { + | repeated group array (LIST) { + | repeated int64 array; + | } + | } + | $repetition2 int32 y; + |} + """.stripMargin) + + def listTupleRule(repetition1: String, repetition2: String) = ( + s""" + |message schema { + | $repetition1 int32 x; + | required group foo_string_list (LIST) { + | repeated int32 foo_string_list_tuple; + | } + | optional group 
foo_struct_list (LIST) { + | repeated group foo_struct_list_tuple { + | required binary str (UTF8); + | ${repetition2} int32 num; + | } + | } + | required group foo_list_of_list (LIST) { + | repeated group foo_list_of_list_tuple (LIST) { + | repeated int64 foo_list_of_list_tuple_tuple; + | } + | } + | $repetition2 int32 y; + |} + """.stripMargin) + + def listStandardRule(repetition1: String, repetition2: String, nullableElement:Boolean=false) = { + val requiredOrOptional = if (nullableElement) "optional" else "required" + (s""" + |message schema { + | $repetition1 int32 x; + | required group foo_string_list (LIST) { + | repeated group list { + | $requiredOrOptional int32 element; + | } + | } + | optional group foo_struct_list (LIST) { + | repeated group list { + | $requiredOrOptional group element { + | required binary str (UTF8); + | ${repetition2} int32 num; + | } + | } + | } + | required group foo_list_of_list (LIST) { + | repeated group list { + | $requiredOrOptional group element (LIST) { + | repeated group list { + | $requiredOrOptional int64 element; + | } + | } + | } + | } + | $repetition2 int32 y; + |} + """.stripMargin) + } + + val requiredElementRules = Seq( + ("element", listElementRule(_, _)), + ("array", listArrayRule(_, _)), + ("tuple", listTupleRule(_, _)), + ("standard", (from: String, to: String) => listStandardRule(from, to, nullableElement = false)) + ) + for { + (projectedReadRuleName, projectedReadSchemaFunc) <- requiredElementRules + (fileRuleName, fileSchemaFunc) <- requiredElementRules + } yield { + s"Format from: [${projectedReadRuleName}] to: [${fileRuleName}]" should { + "take option/require specifications from projected read schema" in { + for { + feasibleRepetition <- feasibleRepetitions + } yield { + val projectedRepetition1 = feasibleRepetition.projectedReadRepetition1 + val projectedRepetition2 = feasibleRepetition.projectedReadRepetition2 + val projectedReadSchema = 
MessageTypeParser.parseMessageType(projectedReadSchemaFunc(projectedRepetition1, projectedRepetition2)) + + val fileRepetition1 = feasibleRepetition.fileRepetition1 + val fileRepetition2 = feasibleRepetition.fileRepetition2 + val fileSchema = MessageTypeParser.parseMessageType(fileSchemaFunc(fileRepetition1, fileRepetition2)) + + val expectedProjectedFileSchema = MessageTypeParser.parseMessageType( + fileSchemaFunc(projectedRepetition1, projectedRepetition2)) + + expectedProjectedFileSchema shouldEqual ParquetCollectionFormatForwardCompatibility + .projectFileSchema(projectedReadSchema, fileSchema) + } + } + } + } + + def listSparkLegacyNullableElementRule(repetition1: String, repetition2: String) = ( + s""" + |message schema { + | $repetition1 int32 x; + | required group foo_string_list (LIST) { + | repeated group bag { + | optional int32 array; + | } + | } + | optional group foo_struct_list (LIST) { + | repeated group bag { + | optional group array { + | required binary str (UTF8); + | ${repetition2} int32 num; + | } + | } + | } + | required group foo_list_of_list (LIST) { + | repeated group bag { + | optional group array (LIST) { + | repeated group bag { + | optional int64 array; + | } + | } + | } + | } + | $repetition2 int32 y; + |} + """.stripMargin) + + "Format compat for list with nullable element" should { + "format from spark legacy write, with nullable elements, to standard" in { + for { + feasibleRepetition <- feasibleRepetitions + } yield { + val projectedRepetition1 = feasibleRepetition.projectedReadRepetition1 + val projectedRepetition2 = feasibleRepetition.projectedReadRepetition2 + val projectedReadSchema = MessageTypeParser.parseMessageType(listSparkLegacyNullableElementRule(projectedRepetition1, projectedRepetition2)) + + val fileRepetition1 = feasibleRepetition.fileRepetition1 + val fileRepetition2 = feasibleRepetition.fileRepetition2 + val fileSchema = MessageTypeParser.parseMessageType(listStandardRule(fileRepetition1, + fileRepetition2, + 
nullableElement = true) + ) + val expectedProjectedFileSchema = MessageTypeParser.parseMessageType(listStandardRule(projectedRepetition1, projectedRepetition2, nullableElement = true)) + expectedProjectedFileSchema shouldEqual ParquetCollectionFormatForwardCompatibility + .projectFileSchema(projectedReadSchema, fileSchema) + } + } + + "failed to format required element to spark legacy write with nullable element" in { + for { + feasibleRepetition <- feasibleRepetitions + (_, requiredElementSchemaFunc) <- requiredElementRules + } yield { + val projectedRepetition1 = feasibleRepetition.projectedReadRepetition1 + val projectedRepetition2 = feasibleRepetition.projectedReadRepetition2 + val fileSchema = MessageTypeParser.parseMessageType(listSparkLegacyNullableElementRule(projectedRepetition1, projectedRepetition2)) + + val fileRepetition1 = feasibleRepetition.fileRepetition1 + val fileRepetition2 = feasibleRepetition.fileRepetition2 + val projectedReadSchema = MessageTypeParser.parseMessageType(requiredElementSchemaFunc(fileRepetition1, + fileRepetition2) + ) + val e = intercept[IllegalArgumentException] { + ParquetCollectionFormatForwardCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + } + e.getMessage should include("Spark legacy mode for nullable element cannot take required element") + } + } + } + "Format forward compat: resolving map format" should { "map identity" in { val targetType = MessageTypeParser.parseMessageType( @@ -314,7 +548,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """ |message SampleSource { | optional group foo (LIST) { - | repeated group array { + | repeated group array (LIST) { | repeated binary array (UTF8); | } | } @@ -613,7 +847,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "does not format 3-level to x_tuple" in { + "format 3-level to x_tuple" in { val targetType = MessageTypeParser.parseMessageType( """ |message 
scalding_schema { @@ -641,11 +875,52 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) - solved shouldEqual sourceType + solved shouldEqual MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + | optional int32 x; + |} + """.stripMargin) } } "Format forward compat: resolving mixed collection" should { + "list of map identity from thrift struct" in { + val mapType = new MapType( + new ThriftField("NOT_USED_KEY", 4, Requirement.REQUIRED, new ThriftType.StringType), + new ThriftField("NOT_USED_VALUE", 5, Requirement.REQUIRED, new ThriftType.I64Type)) + val message = new ThriftSchemaConverter().convert( + new StructType(util.Arrays.asList( + new ThriftField("list_of_map", 2, Requirement.REQUIRED, new ListType( + new ThriftField("NOT_USED_ELEMENT", 2, Requirement.REQUIRED, mapType)) + ) + ), StructOrUnionType.STRUCT)) + + message shouldEqual MessageTypeParser.parseMessageType( + """ + |message ParquetSchema { + | required group list_of_map (LIST) = 2 { + | repeated group list_of_map_tuple (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional int64 value; + | } + | } + | } + |} + | + """.stripMargin + ) + + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(message, message) + solved shouldEqual message + } + "format map of list" in { val targetType = MessageTypeParser.parseMessageType( """ @@ -705,7 +980,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat solved shouldEqual expected } - "format list of map" in { + "format list of map: tuple_x to standard" in { val targetType = MessageTypeParser.parseMessageType( """ |message spark_schema { @@ -760,6 +1035,60 @@ class 
ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) solved shouldEqual expected } + + "format list of map: standard to tuple_x" in { + val targetType = MessageTypeParser.parseMessageType( + """ + |message spark_schema { + | required group list_of_map (LIST) { + | repeated group list_of_map_tuple (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | required group value { + | required binary _id (UTF8); + | required double created; + | } + | } + | } + | } + |} + """.stripMargin) + val sourceType = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group list_of_map (LIST) { + | repeated group list { + | required group element (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | optional group value { + | optional double created; + | } + | } + | } + | } + | } + |} + """.stripMargin) + + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val expected = MessageTypeParser.parseMessageType( + """ + |message SampleSource { + | required group list_of_map (LIST) { + | repeated group list_of_map_tuple (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value { + | optional double created; + | } + | } + | } + | } + |} + """.stripMargin) + solved shouldEqual expected + } } "Format forward compat: check extra non-optional field projection" should { From 3700a85e203fd19aad7612fd17e1b2eee8abfddc Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 05:51:19 -0700 Subject: [PATCH 24/34] improve docs --- ...CollectionFormatForwardCompatibility.scala | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala 
b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index 984f475992..5237e89ade 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -9,21 +9,25 @@ import org.slf4j.LoggerFactory import scala.reflect.ClassTag /** - * Project file schema to have collection types--list and map--in the same structure - * as projected read schema. This is currently used in [[ScroogeReadSupport]] where projected - * read schema can come from: + * Project file schema based on projected read schema which may contain different format + * of collection group--list and map. This is currently used in [[ScroogeReadSupport]] where + * projected read schema can come from: * 1) Thrift struct via [[org.apache.parquet.thrift.ThriftSchemaConvertVisitor]] which always * describe list with `_tuple` format, and map which has `MAP_KEY_VALUE` annotation. * 2) User-supplied schema string via config key * [[org.apache.parquet.hadoop.api.ReadSupport.PARQUET_READ_SCHEMA]] * - * The strategy of this class is to first assume that the projected read schema is a "sub-graph" of - * file schema in terms of field names. (We allow optional field in projected read schema to be in - * the projected file schema.) However, the data types for collection can differ in - * graph structure between the two schemas. - * We thus need to: - * 1) traverse the two schemas until we find the collection type indicated by `repeated` type. - * 2) delegate the collection types found to respective list/map formatter. + * By definition, the projected read schema is a "sub-graph" of file schema in terms of field names. 
+ * (We do allow optional field in projected read schema to be in + * the projected file schema, even if file schema may not originally contain it.) + * The graphs of the two schemas may, however, differ for list and map type because of multiple + * legacy formats and the canonical one. This class supports all directions of conversion. + * + * The projection strategy is: + * 1) traverse the two schemas and maintain only the fields in the read schema. + * 2) find collection type indicated by `repeated` type, and delegate it to respective list/map formatter. + * 3) wrap back the formatted repeated type with group type from projected read schema. This + * means the optional/required remains the same as that from projected read schema. */ private[scrooge] object ParquetCollectionFormatForwardCompatibility { @@ -31,8 +35,8 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { /** * Project file schema to contain the same fields as the given projected read schema. - * The result projected file schema should have the same optional/required fields as the - * projected read schema, but maintain collection type format for the file schema. + * The result is projected file schema with the same optional/required fields as the + * projected read schema, but collection type format as the file schema. * * @param projectedReadSchema read schema specifying field projection * @param fileSchema file schema to be projected @@ -46,9 +50,10 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { } /** - * Traverse given schemas and format node for list or map of projected read type to structure - * of file schema. The formatting is not to one-to-one node swapping between the two schemas - * because of the projection requirement. + * Main recursion to get projected file type. Traverse given schemas, filter out unneeded + * fields, and format read schema's list/map node to file schema's structure. 
+ * The formatting of repeated type is not to one-to-one node swapping because we also have to + * handle projection and possible nested collection types in the repeated type. */ private def projectFileType(projectedReadType: Type, fileType: Type, fieldContext: FieldContext): Type = { (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match { From 3b7255baa278aa8b0e5aecf3b9ec58abcf7c1a21 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 05:56:57 -0700 Subject: [PATCH 25/34] remove classtag inference --- ...CollectionFormatForwardCompatibility.scala | 48 +++++++++---------- .../scrooge/ParquetListFormatter.scala | 8 ++-- .../parquet/scrooge/ParquetMapFormatter.scala | 8 ++-- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala index 5237e89ade..5ac9f7c101 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala @@ -6,8 +6,6 @@ import org.apache.parquet.schema.{GroupType, MessageType, Type} import org.apache.parquet.thrift.DecodingSchemaMismatchException import org.slf4j.LoggerFactory -import scala.reflect.ClassTag - /** * Project file schema based on projected read schema which may contain different format * of collection group--list and map. 
This is currently used in [[ScroogeReadSupport]] where @@ -65,10 +63,20 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { s"and file type:\n${fileType}" ) case (Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => - projectFileGroup[ListGroup](projectedReadGroup, fileGroup, fieldContext) + projectFileGroup( + projectedReadGroup, + fileGroup, + fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), + formatter=ParquetListFormatter + ) case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => - projectFileGroup[MapGroup](projectedReadGroup, fileGroup, fieldContext) - case _ => // Field projection + projectFileGroup( + projectedReadGroup, + fileGroup, + fieldContext, + formatter=ParquetMapFormatter + ) + case _ => // Struct projection val projectedReadGroupType = projectedReadType.asGroupType val fileGroupType = fileType.asGroupType val projectedReadFields = projectedReadGroupType.getFields.asScala.map { projectedReadField => @@ -80,8 +88,7 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { ) } projectedReadField - } - else { + } else { val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName) val fileField = fileGroupType.getFields.get(fileFieldIndex) projectFileType(projectedReadField, fileField, FieldContext(projectedReadField.getName)) @@ -91,21 +98,14 @@ private[scrooge] object ParquetCollectionFormatForwardCompatibility { } } - private def projectFileGroup[T <: CollectionGroup](projectedReadGroup: T, - fileGroup: T, - fieldContext: FieldContext)(implicit t: ClassTag[T]): GroupType = { - - val (formatter, updatedFieldContext) = t.runtimeClass.asInstanceOf[Class[T]] match { - case c if c == classOf[MapGroup] => - (ParquetMapFormatter, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1)) - case c if c == classOf[ListGroup] => - (ParquetListFormatter, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1)) - } - - val 
projectedFileRepeatedType = formatter.formatForwardCompatibleRepeatedType( + private def projectFileGroup(projectedReadGroup: CollectionGroup, + fileGroup: CollectionGroup, + fieldContext: FieldContext, + formatter: ParquetCollectionFormatter): GroupType = { + val projectedFileRepeatedType = formatter.formatCompatibleRepeatedType( projectedReadGroup.repeatedType, fileGroup.repeatedType, - updatedFieldContext, + fieldContext, projectFileType(_, _, _)) // Respect optional/required from the projected read group. projectedReadGroup.groupType.withNewFields(projectedFileRepeatedType) @@ -124,10 +124,10 @@ private[scrooge] trait ParquetCollectionFormatter { * @param recursiveSolver solver for the inner content of the repeated type * @return formatted result */ - def formatForwardCompatibleRepeatedType(sourceRepeatedType: Type, - targetRepeatedType: Type, - fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type): Type + def formatCompatibleRepeatedType(sourceRepeatedType: Type, + targetRepeatedType: Type, + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type): Type /** * Extract collection group containing repeated type of different formats. 
diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index b4370ee93c..e95a5c61b2 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -24,10 +24,10 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter TupleRule, StandardRule, SparkLegacyNullableElementRule ) - def formatForwardCompatibleRepeatedType(repeatedSourceType: Type, - repeatedTargetType: Type, - fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type) = { + def formatCompatibleRepeatedType(repeatedSourceType: Type, + repeatedTargetType: Type, + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type) = { ( findRule(repeatedSourceType), findRule(repeatedTargetType) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index 41a452be10..779674e34e 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -21,10 +21,10 @@ private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { * @param sourceRepeatedMapType * @param targetRepeatedMapType */ - def formatForwardCompatibleRepeatedType(sourceRepeatedMapType: Type, - targetRepeatedMapType: Type, - fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type) = { + def formatCompatibleRepeatedType(sourceRepeatedMapType: Type, + targetRepeatedMapType: Type, + 
fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type) = { val solvedRepeatedType = recursiveSolver(sourceRepeatedMapType, targetRepeatedMapType, fieldContext) targetRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) From 3a9be2e317b3bedb99be60b2920f563f76f97972 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 06:04:23 -0700 Subject: [PATCH 26/34] rename schema name and use consistent method --- ...ctionFormatForwardCompatibilityTests.scala | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala index fd1960a5af..8eb93b7055 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala @@ -250,7 +250,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "map identity" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group map (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -317,7 +317,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format map legacy: original type (MAP_KEY_VALUE) to standard format key_value" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group map_field (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -355,7 +355,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with 
Mat "format map legacy map of map" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group map_of_map_field (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -429,7 +429,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format x_tuple to primitive array" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group country_codes (LIST) { | repeated binary array (UTF8); | } @@ -459,7 +459,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format x_tuple to primitive element" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group country_codes (LIST) { | repeated binary element (UTF8); | } @@ -489,7 +489,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format x_tuple to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group country_codes (LIST) { | repeated group list { | required binary element (UTF8); @@ -524,7 +524,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format nested x_tuple to group array" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group foo (LIST) { | repeated group array (LIST) { | repeated binary array (UTF8); @@ -560,7 +560,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format nested x_tuple to nested 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group foo (LIST) { | repeated group list { | required group element (LIST) { @@ -603,7 +603,7 @@ class 
ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format binary array to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group country_codes (LIST) { | repeated group list { | required binary element (UTF8); @@ -641,7 +641,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format 3-level to 3-level (identity)" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group country_codes (LIST) { | repeated group list { | required binary element (UTF8); @@ -657,7 +657,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format nested primitive array to nested 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -706,7 +706,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format element group to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group country_codes (LIST) { | repeated group list { | required group element { @@ -751,7 +751,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format 3-level to nested primitive array" in { val targetType = MessageTypeParser.parseMessageType( """ - |message scalding_schema { + |message FileSchema { | required group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -765,7 +765,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group array_of_country_codes 
(LIST) { | repeated group list { | required group element (LIST) { @@ -781,7 +781,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -797,7 +797,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format x_tuple in group to 3-level" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | optional group connect_delays (LIST) { | repeated group list { | required group element { @@ -815,7 +815,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group connect_delays (LIST) { | repeated group connect_delays_tuple { | optional binary description (UTF8); @@ -829,7 +829,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group connect_delays (LIST) { | repeated group list { | required group element { @@ -850,7 +850,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format 3-level to x_tuple" in { val targetType = MessageTypeParser.parseMessageType( """ - |message scalding_schema { + |message FileSchema { | required group foo (LIST) { | repeated group foo_tuple (LIST) { | repeated binary foo_tuple_tuple (UTF8); @@ -861,7 +861,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val sourceType = 
MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group foo (LIST) { | repeated group list { | required group element (LIST) { @@ -877,7 +877,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) solved shouldEqual MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group foo (LIST) { | repeated group foo_tuple (LIST) { | repeated binary foo_tuple_tuple (UTF8); @@ -924,7 +924,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format map of list" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group map_field (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -958,7 +958,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val solved = ScroogeReadSupport.getSchemaForRead(targetType, sourceType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ |message ParquetSchema { @@ -983,7 +983,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format list of map: tuple_x to standard" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group list_of_map (LIST) { | repeated group list { | required group element (MAP) { @@ -1001,7 +1001,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group list_of_map (LIST) { | repeated group element_tuple (MAP) { | repeated group 
map (MAP_KEY_VALUE) { @@ -1015,10 +1015,10 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val solved = ScroogeReadSupport.getSchemaForRead(targetType, sourceType) + val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group list_of_map (LIST) { | repeated group list { | required group element (MAP) { @@ -1039,7 +1039,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "format list of map: standard to tuple_x" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group list_of_map (LIST) { | repeated group list_of_map_tuple (MAP) { | repeated group map (MAP_KEY_VALUE) { @@ -1055,7 +1055,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group list_of_map (LIST) { | repeated group list { | required group element (MAP) { @@ -1074,7 +1074,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group list_of_map (LIST) { | repeated group list_of_map_tuple (MAP) { | repeated group map (MAP_KEY_VALUE) { @@ -1095,7 +1095,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "throws on missing (MAP_KEY_VALUE) annotation causing projection of non-existent field" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group 
map_field (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -1108,7 +1108,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat // an actual field which then fails projection val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group map_field (MAP) { | repeated group map { | required binary key (UTF8); @@ -1131,7 +1131,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "throws on missing `repeated` causing projection of non-existent field" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | optional group foo (LIST) { | repeated group list { | required group element { @@ -1143,7 +1143,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group foo (LIST) { | required group element { | optional binary zing (UTF8); @@ -1162,7 +1162,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "throws on required but non-existent in target" in { val targetType = MessageTypeParser.parseMessageType( """ - |message spark_schema { + |message FileSchema { | required group map_field (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -1173,7 +1173,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group map_field (MAP) { | repeated group map (MAP_KEY_VALUE) { | required binary key (UTF8); @@ -1199,7 +1199,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat "throws exception" in { val targetType = MessageTypeParser.parseMessageType( 
""" - |message spark_schema { + |message FileSchema { | required group foo { | repeated group bar { | required binary _id (UTF8); @@ -1210,7 +1210,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val sourceType = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group foo { | required binary bar (UTF8); | } From 5b5cf2ba91fc092d2322b23fa0a75dc2ea8b95d2 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 06:34:05 -0700 Subject: [PATCH 27/34] file rename to drop "forward" compat --- .../twitter/scalding/parquet/scrooge/ScroogeReadSupport.java | 2 +- ...ibility.scala => ParquetCollectionFormatCompatibility.scala} | 2 +- ...ts.scala => ParquetCollectionFormatCompatibilityTests.scala} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/{ParquetCollectionFormatForwardCompatibility.scala => ParquetCollectionFormatCompatibility.scala} (99%) rename scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/{ParquetCollectionFormatForwardCompatibilityTests.scala => ParquetCollectionFormatCompatibilityTests.scala} (100%) diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index 181913a304..c884620de4 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -132,7 +132,7 @@ public static MessageType getSchemaForRead(MessageType fileMessageType, String p */ public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { assertGroupsAreCompatible(fileMessageType, 
projectedMessageType); - return ParquetCollectionFormatForwardCompatibility.projectFileSchema( + return ParquetCollectionFormatCompatibility.projectFileSchema( projectedMessageType, fileMessageType ); } diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala similarity index 99% rename from scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala rename to scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala index 5ac9f7c101..3b57806c32 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala @@ -27,7 +27,7 @@ import org.slf4j.LoggerFactory * 3) wrap back the formatted repeated type with group type from projected read schema. This * means the optional/required remains the same as that from projected read schema. 
*/ -private[scrooge] object ParquetCollectionFormatForwardCompatibility { +private[scrooge] object ParquetCollectionFormatCompatibility { private val logger = LoggerFactory.getLogger(getClass) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala similarity index 100% rename from scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatForwardCompatibilityTests.scala rename to scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala From e29c7e9d02da0ae5877c1065e355a8cafa4ec45f Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 06:35:05 -0700 Subject: [PATCH 28/34] test rename and make variables consistent --- ...etCollectionFormatCompatibilityTests.scala | 293 ++++++++++-------- 1 file changed, 166 insertions(+), 127 deletions(-) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala index 8eb93b7055..275d25066d 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala @@ -10,7 +10,7 @@ import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType import org.apache.parquet.thrift.struct.ThriftType.{ListType, MapType, StructType} import org.scalatest.{Matchers, WordSpec} -class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Matchers { +class 
ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { /** * Helper wrapper to specify repetition string for exhaustive tests @@ -146,11 +146,13 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat ("tuple", listTupleRule(_, _)), ("standard", (from: String, to: String) => listStandardRule(from, to, nullableElement = false)) ) + + // All possible format pairs of non-nullable list for { (projectedReadRuleName, projectedReadSchemaFunc) <- requiredElementRules (fileRuleName, fileSchemaFunc) <- requiredElementRules } yield { - s"Format from: [${projectedReadRuleName}] to: [${fileRuleName}]" should { + s"Format compat for list with non-nullable element from: [${projectedReadRuleName}] to: [${fileRuleName}]" should { "take option/require specifications from projected read schema" in { for { feasibleRepetition <- feasibleRepetitions @@ -166,7 +168,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat val expectedProjectedFileSchema = MessageTypeParser.parseMessageType( fileSchemaFunc(projectedRepetition1, projectedRepetition2)) - expectedProjectedFileSchema shouldEqual ParquetCollectionFormatForwardCompatibility + expectedProjectedFileSchema shouldEqual ParquetCollectionFormatCompatibility .projectFileSchema(projectedReadSchema, fileSchema) } } @@ -219,7 +221,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat nullableElement = true) ) val expectedProjectedFileSchema = MessageTypeParser.parseMessageType(listStandardRule(projectedRepetition1, projectedRepetition2, nullableElement = true)) - expectedProjectedFileSchema shouldEqual ParquetCollectionFormatForwardCompatibility + expectedProjectedFileSchema shouldEqual ParquetCollectionFormatCompatibility .projectFileSchema(projectedReadSchema, fileSchema) } } @@ -239,16 +241,16 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat fileRepetition2) ) val e = 
intercept[IllegalArgumentException] { - ParquetCollectionFormatForwardCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) } e.getMessage should include("Spark legacy mode for nullable element cannot take required element") } } } - "Format forward compat: resolving map format" should { + "Format compat for map" should { "map identity" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group map (MAP) { @@ -262,8 +264,8 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(targetType, targetType) - solved shouldEqual targetType + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(fileSchema, fileSchema) + fileSchema shouldEqual projectedFileSchema } "map identity from thrift struct: string key, struct value" in { @@ -288,8 +290,8 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(message, message) - solved shouldEqual message + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(message, message) + message shouldEqual projectedFileSchema } "map identity from thrift struct: string kye, list string value" in { @@ -310,12 +312,12 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin ) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(message, message) - solved shouldEqual message + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(message, message) + message shouldEqual projectedFileSchema } - "format map legacy: original type (MAP_KEY_VALUE) to standard 
format key_value" in { - val targetType = MessageTypeParser.parseMessageType( + "format map legacy (MAP_KEY_VALUE) to standard format key_value" in { + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group map_field (MAP) { @@ -326,9 +328,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group map_field (MAP) { | repeated group map (MAP_KEY_VALUE) { | required binary key (UTF8); @@ -337,10 +339,36 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { + | required group map_field (MAP) { + | repeated group key_value { + | required binary key (UTF8); + | optional int32 value; + | } + | } + |} + """.stripMargin) + expected shouldEqual projectedFileSchema + } + + "format map standard key_value to legacy (MAP_KEY_VALUE)" in { + val fileSchema = MessageTypeParser.parseMessageType( + """ + |message FileSchema { + | required group map_field (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | required int32 value; + | } + | } + |} + """.stripMargin) + val projectedReadSchema = MessageTypeParser.parseMessageType( + """ + |message ProjectedReadSchema { | required group map_field (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -349,11 +377,23 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - 
solved shouldEqual expected + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val expected = MessageTypeParser.parseMessageType( + """ + |message ProjectedReadSchema { + | required group map_field (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional int32 value; + | } + | } + |} + """.stripMargin) + expected shouldEqual projectedFileSchema } "format map legacy map of map" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group map_of_map_field (MAP) { @@ -372,9 +412,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group map_of_map_field (MAP) { | repeated group map (MAP_KEY_VALUE) { | required binary key (UTF8); @@ -390,10 +430,10 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | required group map_of_map_field (MAP) { | repeated group key_value { | required binary key (UTF8); @@ -409,7 +449,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } def schemaFromThriftMap(mapValueType: ThriftType) = { @@ -425,9 +465,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat 
} } - "Format forward compat: resolving list format" should { + "Format compat for list" should { "format x_tuple to primitive array" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group country_codes (LIST) { @@ -436,28 +476,28 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated binary country_codes_tuple (UTF8); | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated binary array (UTF8); | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format x_tuple to primitive element" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group country_codes (LIST) { @@ -466,28 +506,28 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated binary country_codes_tuple (UTF8); | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, 
targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated binary element (UTF8); | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format x_tuple to 3-level" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group country_codes (LIST) { @@ -498,19 +538,19 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated binary country_codes_tuple (UTF8); | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated group list { | required binary element (UTF8); @@ -518,11 +558,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format nested x_tuple to group array" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group foo (LIST) { @@ -532,9 +572,9 @@ class 
ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group foo (LIST) { | repeated group foo_tuple (LIST) { | repeated binary foo_tuple_tuple (UTF8); @@ -542,11 +582,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group foo (LIST) { | repeated group array (LIST) { | repeated binary array (UTF8); @@ -554,11 +594,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format nested x_tuple to nested 3-level" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group foo (LIST) { @@ -572,9 +612,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group foo (LIST) { | repeated group foo_tuple (LIST) { | repeated binary foo_tuple_tuple (UTF8); @@ -582,10 +622,10 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = 
ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group foo (LIST) { | repeated group list { | required group element (LIST) { @@ -597,11 +637,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format binary array to 3-level" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group country_codes (LIST) { @@ -614,19 +654,19 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) // inner list is `binary array` - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated binary array (UTF8); | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated group list { | required binary element (UTF8); @@ -634,12 +674,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected - + expected shouldEqual projectedFileSchema } "format 3-level to 3-level (identity)" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema 
= MessageTypeParser.parseMessageType( """ |message FileSchema { | required group country_codes (LIST) { @@ -650,12 +689,12 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(targetType, targetType) - solved shouldEqual targetType + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(fileSchema, fileSchema) + fileSchema shouldEqual projectedFileSchema } "format nested primitive array to nested 3-level" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group array_of_country_codes (LIST) { @@ -672,9 +711,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) // inner list is `binary array` - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -684,11 +723,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group array_of_country_codes (LIST) { | repeated group list { | required group element (LIST) { @@ -700,11 +739,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format element 
group to 3-level" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group country_codes (LIST) { @@ -719,9 +758,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated group element { | optional binary foo (UTF8); @@ -730,11 +769,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ - |message SampleSource { + |message ProjectedReadSchema { | optional group country_codes (LIST) { | repeated group list { | required group element { @@ -745,11 +784,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format 3-level to nested primitive array" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group array_of_country_codes (LIST) { @@ -763,7 +802,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | optional group array_of_country_codes (LIST) { @@ -777,7 +816,7 @@ class 
ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ @@ -791,11 +830,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format x_tuple in group to 3-level" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | optional group connect_delays (LIST) { @@ -813,7 +852,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | optional group connect_delays (LIST) { @@ -826,7 +865,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -844,11 +883,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format 3-level to x_tuple" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group foo (LIST) { @@ -859,7 +898,7 @@ class 
ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | required int32 x; |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | optional group foo (LIST) { @@ -874,8 +913,8 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | optional int32 x; |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) - solved shouldEqual MessageTypeParser.parseMessageType( + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | optional group foo (LIST) { @@ -885,11 +924,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } | optional int32 x; |} - """.stripMargin) + """.stripMargin) shouldEqual projectedFileSchema } } - "Format forward compat: resolving mixed collection" should { + "Format compat for mixed collection" should { "list of map identity from thrift struct" in { val mapType = new MapType( new ThriftField("NOT_USED_KEY", 4, Requirement.REQUIRED, new ThriftType.StringType), @@ -917,12 +956,12 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin ) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(message, message) - solved shouldEqual message + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(message, message) + message shouldEqual projectedFileSchema } "format map of list" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group map_field (MAP) { @@ -941,7 +980,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} | 
""".stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ParquetSchema { | required group map_field (MAP) { @@ -958,7 +997,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ |message ParquetSchema { @@ -977,11 +1016,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format list of map: tuple_x to standard" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group list_of_map (LIST) { @@ -999,7 +1038,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | required group list_of_map (LIST) { @@ -1015,7 +1054,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -1033,11 +1072,11 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } "format 
list of map: standard to tuple_x" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group list_of_map (LIST) { @@ -1053,7 +1092,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | required group list_of_map (LIST) { @@ -1071,7 +1110,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat |} """.stripMargin) - val solved = ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -1087,13 +1126,13 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - solved shouldEqual expected + expected shouldEqual projectedFileSchema } } - "Format forward compat: check extra non-optional field projection" should { + "Format compat: check extra non-optional field projection" should { "throws on missing (MAP_KEY_VALUE) annotation causing projection of non-existent field" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group map_field (MAP) { @@ -1106,7 +1145,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) // `map` isn't annotated with `MAP_KEY_VALUE`, and is thus treated as // an actual field which then fails projection - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | required group map_field (MAP) 
{ @@ -1119,9 +1158,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.projectFileSchema( - sourceType, - targetType + ParquetCollectionFormatCompatibility.projectFileSchema( + projectedReadSchema, + fileSchema ) } @@ -1129,7 +1168,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat } "throws on missing `repeated` causing projection of non-existent field" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | optional group foo (LIST) { @@ -1141,7 +1180,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | optional group foo (LIST) { @@ -1153,14 +1192,14 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.projectFileSchema(sourceType, targetType) + ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) } e.getMessage should include("non-optional projected read field element:") } "throws on required but non-existent in target" in { - val targetType = MessageTypeParser.parseMessageType( + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group map_field (MAP) { @@ -1171,7 +1210,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | required group map_field (MAP) { @@ 
-1185,9 +1224,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.projectFileSchema( - sourceType, - targetType + ParquetCollectionFormatCompatibility.projectFileSchema( + projectedReadSchema, + fileSchema ) } @@ -1196,8 +1235,8 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat } "Schema mismatch" should { - "throws exception" in { - val targetType = MessageTypeParser.parseMessageType( + "throws exception on inconsistent type between primitive and group" in { + val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { | required group foo { @@ -1208,7 +1247,7 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat | } |} """.stripMargin) - val sourceType = MessageTypeParser.parseMessageType( + val projectedReadSchema = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { | required group foo { @@ -1218,9 +1257,9 @@ class ParquetCollectionFormatForwardCompatibilityTests extends WordSpec with Mat """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatForwardCompatibility.projectFileSchema( - targetType, - sourceType + ParquetCollectionFormatCompatibility.projectFileSchema( + fileSchema, + projectedReadSchema ) } From cd7e69c60a22adabc772f1da0d567fd258c102a2 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 07:17:05 -0700 Subject: [PATCH 29/34] make names file/read oriented --- .../parquet/scrooge/ScroogeReadSupport.java | 2 +- ...ParquetCollectionFormatCompatibility.scala | 45 +++++++--------- .../scrooge/ParquetListFormatter.scala | 52 +++++++++---------- .../parquet/scrooge/ParquetMapFormatter.scala | 32 +++++------- 4 files changed, 58 insertions(+), 73 deletions(-) diff --git 
a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java index c884620de4..f23a76c8c3 100644 --- a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -133,7 +133,7 @@ public static MessageType getSchemaForRead(MessageType fileMessageType, String p public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { assertGroupsAreCompatible(fileMessageType, projectedMessageType); return ParquetCollectionFormatCompatibility.projectFileSchema( - projectedMessageType, fileMessageType + fileMessageType, projectedMessageType ); } diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala index 3b57806c32..026a648444 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala @@ -1,11 +1,12 @@ package com.twitter.scalding.parquet.scrooge -import scala.collection.JavaConverters._ import org.apache.parquet.schema.Type.Repetition import org.apache.parquet.schema.{GroupType, MessageType, Type} import org.apache.parquet.thrift.DecodingSchemaMismatchException import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ + /** * Project file schema based on projected read schema which may contain different format * of collection group--list and map. 
This is currently used in [[ScroogeReadSupport]] where @@ -39,8 +40,8 @@ private[scrooge] object ParquetCollectionFormatCompatibility { * @param projectedReadSchema read schema specifying field projection * @param fileSchema file schema to be projected */ - def projectFileSchema(projectedReadSchema: MessageType, fileSchema: MessageType): MessageType = { - val projectedFileSchema = projectFileType(projectedReadSchema, fileSchema, FieldContext()).asGroupType() + def projectFileSchema(fileSchema: MessageType, projectedReadSchema: MessageType): MessageType = { + val projectedFileSchema = projectFileType(fileSchema, projectedReadSchema, FieldContext()).asGroupType() logger.debug(s"Projected read schema:\n${projectedReadSchema}\n" + s"File schema:\n${fileSchema}\n" + s"Projected file schema:\n${projectedFileSchema}") @@ -53,7 +54,7 @@ private[scrooge] object ParquetCollectionFormatCompatibility { * The formatting of repeated type is not to one-to-one node swapping because we also have to * handle projection and possible nested collection types in the repeated type. 
*/ - private def projectFileType(projectedReadType: Type, fileType: Type, fieldContext: FieldContext): Type = { + private def projectFileType(fileType: Type, projectedReadType: Type, fieldContext: FieldContext): Type = { (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match { case _ if projectedReadType.isPrimitive && fileType.isPrimitive => projectedReadType @@ -63,19 +64,9 @@ private[scrooge] object ParquetCollectionFormatCompatibility { s"and file type:\n${fileType}" ) case (Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => - projectFileGroup( - projectedReadGroup, - fileGroup, - fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), - formatter=ParquetListFormatter - ) + projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), formatter=ParquetListFormatter) case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => - projectFileGroup( - projectedReadGroup, - fileGroup, - fieldContext, - formatter=ParquetMapFormatter - ) + projectFileGroup(fileGroup, projectedReadGroup, fieldContext, formatter=ParquetMapFormatter) case _ => // Struct projection val projectedReadGroupType = projectedReadType.asGroupType val fileGroupType = fileType.asGroupType @@ -91,22 +82,23 @@ private[scrooge] object ParquetCollectionFormatCompatibility { } else { val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName) val fileField = fileGroupType.getFields.get(fileFieldIndex) - projectFileType(projectedReadField, fileField, FieldContext(projectedReadField.getName)) + projectFileType(fileField, projectedReadField, FieldContext(projectedReadField.getName)) } } projectedReadGroupType.withNewFields(projectedReadFields.asJava) } } - private def projectFileGroup(projectedReadGroup: CollectionGroup, - fileGroup: CollectionGroup, + private def projectFileGroup(fileGroup: CollectionGroup, + projectedReadGroup: CollectionGroup, fieldContext: 
FieldContext, - formatter: ParquetCollectionFormatter): GroupType = { + formatter: ParquetCollectionFormatter) = { val projectedFileRepeatedType = formatter.formatCompatibleRepeatedType( - projectedReadGroup.repeatedType, fileGroup.repeatedType, + projectedReadGroup.repeatedType, fieldContext, - projectFileType(_, _, _)) + projectFileType + ) // Respect optional/required from the projected read group. projectedReadGroup.groupType.withNewFields(projectedFileRepeatedType) } @@ -119,13 +111,14 @@ private[scrooge] object ParquetCollectionFormatCompatibility { private[scrooge] trait ParquetCollectionFormatter { /** * Format source repeated type in the structure of target repeated type. - * @param sourceRepeatedType repeated type from which the formatted result get content - * @param targetRepeatedType repeated type from which the formatted result get the structure + * + * @param readRepeatedType repeated type from which the formatted result get content + * @param fileRepeatedType repeated type from which the formatted result get the structure * @param recursiveSolver solver for the inner content of the repeated type * @return formatted result */ - def formatCompatibleRepeatedType(sourceRepeatedType: Type, - targetRepeatedType: Type, + def formatCompatibleRepeatedType(fileRepeatedType: Type, + readRepeatedType: Type, fieldContext: FieldContext, recursiveSolver: (Type, Type, FieldContext) => Type): Type diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index e95a5c61b2..2ade4c2497 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -1,16 +1,16 @@ package com.twitter.scalding.parquet.scrooge -import java.util - import 
org.apache.parquet.schema.{GroupType, OriginalType, PrimitiveType, Type} import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ + /** - * Formatter parquet schema of legacy list type to standard one - * namely 3-level list structure as recommended in - * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + * Format parquet list schema of read type to structure of file type. + * The supported formats are in `rules` of [[ParquetListFormatRule]]. + * Please see documentation for each rule. * - * More specifically this handles converting from parquet file created by + * In a common use case, read schema form thrift struct has tuple format created by * [[org.apache.parquet.thrift.ThriftSchemaConvertVisitor]] which always suffix * list element with "_tuple". */ @@ -24,29 +24,29 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter TupleRule, StandardRule, SparkLegacyNullableElementRule ) - def formatCompatibleRepeatedType(repeatedSourceType: Type, - repeatedTargetType: Type, + def formatCompatibleRepeatedType(fileRepeatedType: Type, + readRepeatedType: Type, fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type) = { + recursiveSolver: (Type, Type, FieldContext) => Type): Type = { ( - findRule(repeatedSourceType), - findRule(repeatedTargetType) + findRule(fileRepeatedType), + findRule(readRepeatedType) ) match { - case (Some(sourceRule), Some(targetRule)) => { - val sourceElementType = sourceRule.elementType(repeatedSourceType) - val targetElementType = targetRule.elementType(repeatedTargetType) - val forwardCompatElementType = recursiveSolver(sourceElementType, targetElementType, fieldContext) - - targetRule.createCompliantRepeatedType( - elementType = forwardCompatElementType, - elementName = sourceRule.elementName(repeatedSourceType), - isElementRequired = sourceRule.isElementRequired(repeatedSourceType), - elementOriginalType = 
sourceRule.elementOriginalType(repeatedSourceType), + case (Some(fileRule), Some(readRule)) => { + val readElementType = readRule.elementType(readRepeatedType) + val fileElementType = fileRule.elementType(fileRepeatedType) + val solvedElementType = recursiveSolver(fileElementType, readElementType, fieldContext) + + fileRule.createCompliantRepeatedType( + elementType = solvedElementType, + elementName = readRule.elementName(readRepeatedType), + isElementRequired = readRule.isElementRequired(readRepeatedType), + elementOriginalType = readRule.elementOriginalType(readRepeatedType), fieldContext=fieldContext ) } - case _ => repeatedSourceType + case _ => readRepeatedType } } @@ -82,9 +82,9 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter * 2) decompose the repeated type into element and other info. * 3) construct compliant repeated type from the given element and other info. * For example, - * if source repeated type matches Rule 1, and target type matches Rule 2. - * Rule 1 will decompose the source type, and - * Rule 2 will take that information to construct repeated element in Rule 2 format. + * if read repeated type matches Rule 1, and file type matches Rule 2. + * Rule 1 will decompose the read type, and + * Rule 2 will take that information to construct repeated element in Rule 2 of file type format. 
*/ private[scrooge] sealed trait ParquetListFormatRule { def elementType(repeatedType: Type): Type @@ -257,7 +257,7 @@ private[scrooge] sealed trait ThreeLevelRule extends ParquetListFormatRule { originalElementType.asGroupType.getFields) } - new GroupType(Type.Repetition.REPEATED, constantRepeatedGroupName, util.Arrays.asList(elementType)) + new GroupType(Type.Repetition.REPEATED, constantRepeatedGroupName, Seq(elementType).asJava) } private def firstField(groupType: GroupType): Type = { diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index 779674e34e..ce9ed5498d 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -3,31 +3,23 @@ package com.twitter.scalding.parquet.scrooge import org.apache.parquet.schema.{OriginalType, Type} /** - * Format parquet schema of legacy map type to standard target - * with repeated type of `key_value` without annotation - * as recommended in + * Format parquet map schema of read type to structure of file type. 
+ * The supported formats are: + * 1) Standard repeated type of `key_value` without annotation + * 2) Legacy repeated `map field annotated with (MAP_KEY_VALUE) + * as described in * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps * - * Source with legacy format created by - * [[org.apache.parquet.schema.ConversionPatterns]] has repeated `map` field - * annotated with (MAP_KEY_VALUE) + * In a common use case, read schema from thrift struct has legacy format 2) created by + * [[org.apache.parquet.schema.ConversionPatterns]] */ private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { - /** - * Handle map format compatibility - * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps - * - * @param sourceRepeatedMapType - * @param targetRepeatedMapType - */ - def formatCompatibleRepeatedType(sourceRepeatedMapType: Type, - targetRepeatedMapType: Type, - fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type) = { - - val solvedRepeatedType = recursiveSolver(sourceRepeatedMapType, targetRepeatedMapType, fieldContext) - targetRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) + def formatCompatibleRepeatedType(fileRepeatedMapType: Type, + readRepeatedMapType: Type, + fieldContext: FieldContext, recursiveSolver: (Type, Type, FieldContext) => Type): Type = { + val solvedRepeatedType = recursiveSolver(fileRepeatedMapType, readRepeatedMapType, fieldContext) + fileRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) } def extractGroup(typ: Type): Option[MapGroup] = { From b05f13c42e4721c65ea87d1caad860d1de7d2572 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 07:17:25 -0700 Subject: [PATCH 30/34] improve test make sure formatted type is still compatible with given file schema --- ...etCollectionFormatCompatibilityTests.scala | 78 +++++++++---------- 1 file changed, 37 insertions(+), 41 
deletions(-) diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala index 275d25066d..7ab4359b07 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala @@ -2,7 +2,7 @@ package com.twitter.scalding.parquet.scrooge import java.util -import org.apache.parquet.schema.MessageTypeParser +import org.apache.parquet.schema.{MessageType, MessageTypeParser} import org.apache.parquet.thrift.{DecodingSchemaMismatchException, ThriftSchemaConverter} import org.apache.parquet.thrift.struct.ThriftField.Requirement import org.apache.parquet.thrift.struct.{ThriftField, ThriftType} @@ -12,6 +12,13 @@ import org.scalatest.{Matchers, WordSpec} class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { + private def testProjectAndAssertCompatibility(fileSchema: MessageType, + projectedReadSchema: MessageType) = { + val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(fileSchema, projectedReadSchema) + ScroogeReadSupport.assertGroupsAreCompatible(fileSchema, projectedFileSchema) + projectedFileSchema + } + /** * Helper wrapper to specify repetition string for exhaustive tests */ @@ -168,8 +175,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { val expectedProjectedFileSchema = MessageTypeParser.parseMessageType( fileSchemaFunc(projectedRepetition1, projectedRepetition2)) - expectedProjectedFileSchema shouldEqual ParquetCollectionFormatCompatibility - .projectFileSchema(projectedReadSchema, fileSchema) + expectedProjectedFileSchema shouldEqual testProjectAndAssertCompatibility(fileSchema, 
projectedReadSchema) } } } @@ -221,8 +227,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { nullableElement = true) ) val expectedProjectedFileSchema = MessageTypeParser.parseMessageType(listStandardRule(projectedRepetition1, projectedRepetition2, nullableElement = true)) - expectedProjectedFileSchema shouldEqual ParquetCollectionFormatCompatibility - .projectFileSchema(projectedReadSchema, fileSchema) + expectedProjectedFileSchema shouldEqual testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) } } @@ -241,7 +246,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { fileRepetition2) ) val e = intercept[IllegalArgumentException] { - ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) } e.getMessage should include("Spark legacy mode for nullable element cannot take required element") } @@ -264,7 +269,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(fileSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, fileSchema) fileSchema shouldEqual projectedFileSchema } @@ -290,7 +295,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(message, message) + val projectedFileSchema = testProjectAndAssertCompatibility(message, message) message shouldEqual projectedFileSchema } @@ -312,7 +317,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin ) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(message, message) + val projectedFileSchema = testProjectAndAssertCompatibility(message, message) message 
shouldEqual projectedFileSchema } @@ -339,7 +344,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -377,7 +382,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -430,7 +435,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -484,7 +489,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -514,7 +519,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = 
testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -546,7 +551,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ @@ -582,7 +587,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) // note optional of result, and field rename val expected = MessageTypeParser.parseMessageType( """ @@ -622,7 +627,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -662,7 +667,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ @@ -689,7 +694,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | required int32 x; |} """.stripMargin) - val 
projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(fileSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, fileSchema) fileSchema shouldEqual projectedFileSchema } @@ -723,7 +728,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ @@ -769,7 +774,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ @@ -816,7 +821,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ @@ -865,7 +870,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -913,7 +918,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | optional int32 x; |} """.stripMargin) - val projectedFileSchema = 
ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -956,7 +961,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin ) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(message, message) + val projectedFileSchema = testProjectAndAssertCompatibility(message, message) message shouldEqual projectedFileSchema } @@ -997,7 +1002,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ParquetSchema { @@ -1054,7 +1059,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -1110,7 +1115,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { |} """.stripMargin) - val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) val expected = MessageTypeParser.parseMessageType( """ |message ProjectedReadSchema { @@ -1158,10 +1163,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin) val e = 
intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatCompatibility.projectFileSchema( - projectedReadSchema, - fileSchema - ) + testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) } e.getMessage should include("non-optional projected read field map:") @@ -1192,7 +1194,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatCompatibility.projectFileSchema(projectedReadSchema, fileSchema) + testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) } e.getMessage should include("non-optional projected read field element:") @@ -1224,10 +1226,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatCompatibility.projectFileSchema( - projectedReadSchema, - fileSchema - ) + testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) } e.getMessage should include("non-optional projected read field bogus_field:") @@ -1257,10 +1256,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin) val e = intercept[DecodingSchemaMismatchException] { - ParquetCollectionFormatCompatibility.projectFileSchema( - fileSchema, - projectedReadSchema - ) + testProjectAndAssertCompatibility(projectedReadSchema, fileSchema) } e.getMessage should include("Found schema mismatch") From 7bb0382c0c04f99362f3a875456c12a71aea2782 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Fri, 4 Oct 2019 10:31:01 -0700 Subject: [PATCH 31/34] check for field optional/required --- ...ParquetCollectionFormatCompatibility.scala | 8 +- ...etCollectionFormatCompatibilityTests.scala | 163 +++++++++++------- 2 files changed, 105 insertions(+), 66 deletions(-) diff --git 
a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala index 026a648444..a16807d213 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala @@ -82,6 +82,12 @@ private[scrooge] object ParquetCollectionFormatCompatibility { } else { val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName) val fileField = fileGroupType.getFields.get(fileFieldIndex) + if (fileField.isRepetition(Repetition.OPTIONAL) && projectedReadField.isRepetition(Repetition.REQUIRED)) { + throw new DecodingSchemaMismatchException( + s"Found required projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + + s"on optional file field:\n${fileField}" + ) + } projectFileType(fileField, projectedReadField, FieldContext(projectedReadField.getName)) } } @@ -111,7 +117,7 @@ private[scrooge] object ParquetCollectionFormatCompatibility { private[scrooge] trait ParquetCollectionFormatter { /** * Format source repeated type in the structure of target repeated type. 
- * + * * @param readRepeatedType repeated type from which the formatted result get content * @param fileRepeatedType repeated type from which the formatted result get the structure * @param recursiveSolver solver for the inner content of the repeated type diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala index 7ab4359b07..8611f67d00 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala @@ -147,35 +147,28 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin) } - val requiredElementRules = Seq( + val listRequiredElementRules = Seq( ("element", listElementRule(_, _)), ("array", listArrayRule(_, _)), ("tuple", listTupleRule(_, _)), ("standard", (from: String, to: String) => listStandardRule(from, to, nullableElement = false)) ) - // All possible format pairs of non-nullable list + // All possible format pairs of list with non-nullable element for { - (projectedReadRuleName, projectedReadSchemaFunc) <- requiredElementRules - (fileRuleName, fileSchemaFunc) <- requiredElementRules + (projectedReadRuleName, projectedReadSchemaFunc) <- listRequiredElementRules + (fileRuleName, fileSchemaFunc) <- listRequiredElementRules } yield { - s"Format compat for list with non-nullable element from: [${projectedReadRuleName}] to: [${fileRuleName}]" should { + s"Project for list with non-nullable element file: [${fileRuleName}] read: [${projectedReadRuleName}]" should { "take option/require specifications from projected read schema" in { for { feasibleRepetition <- feasibleRepetitions } yield { - val 
projectedRepetition1 = feasibleRepetition.projectedReadRepetition1 - val projectedRepetition2 = feasibleRepetition.projectedReadRepetition2 - val projectedReadSchema = MessageTypeParser.parseMessageType(projectedReadSchemaFunc(projectedRepetition1, projectedRepetition2)) - - val fileRepetition1 = feasibleRepetition.fileRepetition1 - val fileRepetition2 = feasibleRepetition.fileRepetition2 - val fileSchema = MessageTypeParser.parseMessageType(fileSchemaFunc(fileRepetition1, fileRepetition2)) - - val expectedProjectedFileSchema = MessageTypeParser.parseMessageType( - fileSchemaFunc(projectedRepetition1, projectedRepetition2)) - - expectedProjectedFileSchema shouldEqual testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) + testProjectedFileSchemaHasReadSchemaRepetitions( + fileSchemaFunc, + projectedReadSchemaFunc, + feasibleRepetition + ) } } } @@ -211,50 +204,61 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { |} """.stripMargin) - "Format compat for list with nullable element" should { - "format from spark legacy write, with nullable elements, to standard" in { + private def testProjectedFileSchemaHasReadSchemaRepetitions( + fileSchemaFunc: (String, String) => String, + projectedReadSchemaFunc: (String, String) => String, + feasibleRepetition: TestRepetitions): Any = { + + val projectedReadSchema = MessageTypeParser.parseMessageType( + projectedReadSchemaFunc( + feasibleRepetition.projectedReadRepetition1, + feasibleRepetition.projectedReadRepetition2) + ) + val fileSchema = MessageTypeParser.parseMessageType( + fileSchemaFunc( + feasibleRepetition.fileRepetition1, + feasibleRepetition.fileRepetition2) + ) + val expectedProjectedFileSchema = MessageTypeParser.parseMessageType( + fileSchemaFunc( + feasibleRepetition.projectedReadRepetition1, + feasibleRepetition.projectedReadRepetition2) + ) + expectedProjectedFileSchema shouldEqual testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) + } + + "Project 
for list with nullable element" should { + "file: standard, read: spark legacy write, with nullable elements" in { for { feasibleRepetition <- feasibleRepetitions } yield { - val projectedRepetition1 = feasibleRepetition.projectedReadRepetition1 - val projectedRepetition2 = feasibleRepetition.projectedReadRepetition2 - val projectedReadSchema = MessageTypeParser.parseMessageType(listSparkLegacyNullableElementRule(projectedRepetition1, projectedRepetition2)) - - val fileRepetition1 = feasibleRepetition.fileRepetition1 - val fileRepetition2 = feasibleRepetition.fileRepetition2 - val fileSchema = MessageTypeParser.parseMessageType(listStandardRule(fileRepetition1, - fileRepetition2, - nullableElement = true) + testProjectedFileSchemaHasReadSchemaRepetitions( + fileSchemaFunc = listStandardRule(_, _, nullableElement = true), + projectedReadSchemaFunc = listSparkLegacyNullableElementRule, + feasibleRepetition ) - val expectedProjectedFileSchema = MessageTypeParser.parseMessageType(listStandardRule(projectedRepetition1, projectedRepetition2, nullableElement = true)) - expectedProjectedFileSchema shouldEqual testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) } } - "failed to format required element to spark legacy write with nullable element" in { + "failed to format file: required element, read: legacy write with nullable element" in { for { feasibleRepetition <- feasibleRepetitions - (_, requiredElementSchemaFunc) <- requiredElementRules + (_, requiredElementSchemaFunc) <- listRequiredElementRules } yield { - val projectedRepetition1 = feasibleRepetition.projectedReadRepetition1 - val projectedRepetition2 = feasibleRepetition.projectedReadRepetition2 - val fileSchema = MessageTypeParser.parseMessageType(listSparkLegacyNullableElementRule(projectedRepetition1, projectedRepetition2)) - - val fileRepetition1 = feasibleRepetition.fileRepetition1 - val fileRepetition2 = feasibleRepetition.fileRepetition2 - val projectedReadSchema = 
MessageTypeParser.parseMessageType(requiredElementSchemaFunc(fileRepetition1, - fileRepetition2) - ) val e = intercept[IllegalArgumentException] { - testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) + testProjectedFileSchemaHasReadSchemaRepetitions( + fileSchemaFunc = listSparkLegacyNullableElementRule, + projectedReadSchemaFunc = requiredElementSchemaFunc, + feasibleRepetition + ) } e.getMessage should include("Spark legacy mode for nullable element cannot take required element") } } } - "Format compat for map" should { - "map identity" in { + "Project for map" should { + "file/read identity" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -273,7 +277,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { fileSchema shouldEqual projectedFileSchema } - "map identity from thrift struct: string key, struct value" in { + "file/read identity from thrift struct (string key, struct value)" in { val listType = new ListType(new ThriftField("list", 2, Requirement.REQUIRED, new ThriftType.StringType)) val children = new ThriftField("foo", 3, Requirement.REQUIRED, listType) val mapValueType = new StructType(util.Arrays.asList(children), @@ -299,7 +303,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { message shouldEqual projectedFileSchema } - "map identity from thrift struct: string kye, list string value" in { + "file/read identity from thrift struct (string key, list string value)" in { val listType = new ListType(new ThriftField("list", 2, Requirement.REQUIRED, new ThriftType.StringType)) val message = schemaFromThriftMap(listType) message shouldEqual MessageTypeParser.parseMessageType( @@ -321,7 +325,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { message shouldEqual projectedFileSchema } - "format map legacy (MAP_KEY_VALUE) to standard format key_value" in { + "file: standard key_value, read: legacy 
(MAP_KEY_VALUE)" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -359,7 +363,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format map standard key_value to legacy (MAP_KEY_VALUE)" in { + "file: legacy (MAP_KEY_VALUE), read: standard key_value" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -397,7 +401,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format map legacy map of map" in { + "map of map, file: standard key_value, read: legacy (MAP_KEY_VALUE)" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -471,7 +475,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { } "Format compat for list" should { - "format x_tuple to primitive array" in { + "file: primitive array, read: x_tuple" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -501,7 +505,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format x_tuple to primitive element" in { + "file: primitive element, read: x_tuple" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -531,7 +535,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format x_tuple to 3-level" in { + "file: 3-level, read: x_tuple" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -566,7 +570,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format nested x_tuple to group array" in { + "file: group array, read: nested x_tuple" in { val fileSchema = MessageTypeParser.parseMessageType( 
""" |message FileSchema { @@ -602,7 +606,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format nested x_tuple to nested 3-level" in { + "file: nested 3-level, read: nested x_tuple" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -645,7 +649,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format binary array to 3-level" in { + "file: 3-level, read: binary array" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -682,7 +686,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format 3-level to 3-level (identity)" in { + "file: 3-level (identity), read: 3-level" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -698,7 +702,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { fileSchema shouldEqual projectedFileSchema } - "format nested primitive array to nested 3-level" in { + "file: nested 3-level, read: nested primitive array" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -747,7 +751,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format element group to 3-level" in { + "file: 3-level, read: element group" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -792,7 +796,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format 3-level to nested primitive array" in { + "file: nested primitive array, read: 3-level" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -838,7 +842,7 @@ class 
ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format x_tuple in group to 3-level" in { + "file: 3-level, read: x_tuple in group" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -891,7 +895,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format 3-level to x_tuple" in { + "file: x_tuple, read: 3-level" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -934,7 +938,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { } "Format compat for mixed collection" should { - "list of map identity from thrift struct" in { + "list of map: file/read identity from thrift struct" in { val mapType = new MapType( new ThriftField("NOT_USED_KEY", 4, Requirement.REQUIRED, new ThriftType.StringType), new ThriftField("NOT_USED_VALUE", 5, Requirement.REQUIRED, new ThriftType.I64Type)) @@ -965,7 +969,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { message shouldEqual projectedFileSchema } - "format map of list" in { + "map of list, file: standard, read: thrift-generated" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -1024,7 +1028,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format list of map: tuple_x to standard" in { + "file: standard, read: list of map: tuple_x" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema { @@ -1080,7 +1084,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { expected shouldEqual projectedFileSchema } - "format list of map: standard to tuple_x" in { + "file: tuple_x, read: list of map: standard" in { val fileSchema = MessageTypeParser.parseMessageType( """ |message FileSchema 
{ @@ -1261,5 +1265,34 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { e.getMessage should include("Found schema mismatch") } + + "throws exception optional group in file schema but required group in read schema" in { + val fileSchema = MessageTypeParser.parseMessageType( + """ + |message FileSchema { + | required group foo { + | repeated group bar { + | required binary _id (UTF8); + | required double created; + | } + | } + |} + """.stripMargin) + val projectedReadSchema = MessageTypeParser.parseMessageType( + """ + |message ProjectedReadSchema { + | optional group foo { + | required binary bar (UTF8); + | } + |} + """.stripMargin) + + val e = intercept[DecodingSchemaMismatchException] { + testProjectAndAssertCompatibility(projectedReadSchema, fileSchema) + } + + e.getMessage should include ("Found required projected read field foo") + e.getMessage should include ("on optional file field") + } } } From 6365db8a79ecbfdb4b0f63a1e48721970fabd945 Mon Sep 17 00:00:00 2001 From: joshrosen-stripe <48632449+joshrosen-stripe@users.noreply.github.com> Date: Sat, 5 Oct 2019 15:03:15 -0700 Subject: [PATCH 32/34] Review suggestions for https://github.com/twitter/scalding/pull/1921 (#1) * Add more comments; add explicit else conditions to some if cases; lift base case out of pattern match * Formatting changes * More minor formatting. 
--- ...ParquetCollectionFormatCompatibility.scala | 86 ++++++++++++------- .../scrooge/ParquetListFormatter.scala | 32 ++++--- .../parquet/scrooge/ParquetMapFormatter.scala | 5 +- 3 files changed, 75 insertions(+), 48 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala index a16807d213..9bf03aeb78 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala @@ -37,8 +37,8 @@ private[scrooge] object ParquetCollectionFormatCompatibility { * The result is projected file schema with the same optional/required fields as the * projected read schema, but collection type format as the file schema. * - * @param projectedReadSchema read schema specifying field projection * @param fileSchema file schema to be projected + * @param projectedReadSchema read schema specifying field projection */ def projectFileSchema(fileSchema: MessageType, projectedReadSchema: MessageType): MessageType = { val projectedFileSchema = projectFileType(fileSchema, projectedReadSchema, FieldContext()).asGroupType() @@ -55,50 +55,70 @@ private[scrooge] object ParquetCollectionFormatCompatibility { * handle projection and possible nested collection types in the repeated type. 
*/ private def projectFileType(fileType: Type, projectedReadType: Type, fieldContext: FieldContext): Type = { - (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match { - case _ if projectedReadType.isPrimitive && fileType.isPrimitive => + if (projectedReadType.isPrimitive || fileType.isPrimitive) { + // Base-cases to handle primitive types: + if (projectedReadType.isPrimitive && fileType.isPrimitive) { + // The field is a primitive in both schemas projectedReadType - case _ if projectedReadType.isPrimitive != fileType.isPrimitive => + } else { + // The field is primitive in one schema but non-primitive in the othe other throw new DecodingSchemaMismatchException( s"Found schema mismatch between projected read type:\n$projectedReadType\n" + s"and file type:\n${fileType}" ) - case (Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => - projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), formatter=ParquetListFormatter) - case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => - projectFileGroup(fileGroup, projectedReadGroup, fieldContext, formatter=ParquetMapFormatter) - case _ => // Struct projection - val projectedReadGroupType = projectedReadType.asGroupType - val fileGroupType = fileType.asGroupType - val projectedReadFields = projectedReadGroupType.getFields.asScala.map { projectedReadField => - if (!fileGroupType.containsField(projectedReadField.getName)) { - if (!projectedReadField.isRepetition(Repetition.OPTIONAL)) { - throw new DecodingSchemaMismatchException( - s"Found non-optional projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + - s"not present in the given file group type:\n${fileGroupType}" - ) - } - projectedReadField - } else { - val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName) - val fileField = fileGroupType.getFields.get(fileFieldIndex) - if 
(fileField.isRepetition(Repetition.OPTIONAL) && projectedReadField.isRepetition(Repetition.REQUIRED)) { - throw new DecodingSchemaMismatchException( - s"Found required projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + - s"on optional file field:\n${fileField}" - ) + } + } else { + // Recursive cases to handle non-primitives (lists, maps, and structs): + (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match { + case (Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => + projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), formatter=ParquetListFormatter) + case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => + projectFileGroup(fileGroup, projectedReadGroup, fieldContext, formatter=ParquetMapFormatter) + case _ => // Struct projection + val projectedReadGroupType = projectedReadType.asGroupType + val fileGroupType = fileType.asGroupType + val projectedReadFields = projectedReadGroupType.getFields.asScala.map { projectedReadField => + if (!fileGroupType.containsField(projectedReadField.getName)) { + // The projected read schema includes a field which is missing from the file schema. + if (projectedReadField.isRepetition(Repetition.OPTIONAL)) { + // The missing field is optional in the projected read schema. 
Since the file schema + // doesn't contain this field there are no collection compatibility concerns to worry + // about and we can simply use the supplied schema: + projectedReadField + } else { + // The missing field is repeated or required, which is an error: + throw new DecodingSchemaMismatchException( + s"Found non-optional projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + + s"not present in the given file group type:\n${fileGroupType}" + ) + } + } else { + // The field is present in both schemas, so first check that the schemas specify compatible repetition + // values for the field, then recursively process the fields: + val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName) + val fileField = fileGroupType.getFields.get(fileFieldIndex) + if (fileField.isRepetition(Repetition.OPTIONAL) && projectedReadField.isRepetition(Repetition.REQUIRED)) { + // The field is optional in the file schema but required in the projected read schema; this is an error: + throw new DecodingSchemaMismatchException( + s"Found required projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + + s"on optional file field:\n${fileField}" + ) + } else { + // The field's repetitions are compatible in both schemas (e.g. 
optional in both schemas or required + // in both), so recursively process the field: + projectFileType(fileField, projectedReadField, FieldContext(projectedReadField.getName)) + } } - projectFileType(fileField, projectedReadField, FieldContext(projectedReadField.getName)) } - } - projectedReadGroupType.withNewFields(projectedReadFields.asJava) + projectedReadGroupType.withNewFields(projectedReadFields.asJava) + } } } private def projectFileGroup(fileGroup: CollectionGroup, projectedReadGroup: CollectionGroup, fieldContext: FieldContext, - formatter: ParquetCollectionFormatter) = { + formatter: ParquetCollectionFormatter): GroupType = { val projectedFileRepeatedType = formatter.formatCompatibleRepeatedType( fileGroup.repeatedType, projectedReadGroup.repeatedType, @@ -118,8 +138,8 @@ private[scrooge] trait ParquetCollectionFormatter { /** * Format source repeated type in the structure of target repeated type. * - * @param readRepeatedType repeated type from which the formatted result get content * @param fileRepeatedType repeated type from which the formatted result get the structure + * @param readRepeatedType repeated type from which the formatted result get content * @param recursiveSolver solver for the inner content of the repeated type * @return formatted result */ diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index 2ade4c2497..1bdadde908 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -19,19 +19,20 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter private val logger = LoggerFactory.getLogger(getClass) private val rules: Seq[ParquetListFormatRule] = Seq( - 
PrimitiveElementRule, PrimitiveArrayRule, - GroupElementRule, GroupArrayRule, - TupleRule, StandardRule, SparkLegacyNullableElementRule + PrimitiveElementRule, + PrimitiveArrayRule, + GroupElementRule, + GroupArrayRule, + TupleRule, + StandardRule, + SparkLegacyNullableElementRule ) def formatCompatibleRepeatedType(fileRepeatedType: Type, readRepeatedType: Type, fieldContext: FieldContext, recursiveSolver: (Type, Type, FieldContext) => Type): Type = { - ( - findRule(fileRepeatedType), - findRule(readRepeatedType) - ) match { + (findRule(fileRepeatedType), findRule(readRepeatedType)) match { case (Some(fileRule), Some(readRule)) => { val readElementType = readRule.elementType(readRepeatedType) val fileElementType = fileRule.elementType(fileRepeatedType) @@ -42,7 +43,7 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter elementName = readRule.elementName(readRepeatedType), isElementRequired = readRule.isElementRequired(readRepeatedType), elementOriginalType = readRule.elementOriginalType(readRepeatedType), - fieldContext=fieldContext + fieldContext = fieldContext ) } @@ -165,8 +166,9 @@ private[scrooge] sealed trait GroupListRule extends ParquetListFormatRule { override def elementName(repeatedType: Type): String = this.constantElementName override def appliesToType(repeatedType: Type): Boolean = { - if (repeatedType.isPrimitive) false - else { + if (repeatedType.isPrimitive) { + false + } else { val groupType = repeatedType.asGroupType groupType.getFields.size > 0 && groupType.getName == this.constantElementName } @@ -216,8 +218,11 @@ private[scrooge] object TupleRule extends ParquetListFormatRule { override def createCompliantRepeatedType(typ: Type, name: String, isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { // nested list has type name of the form: `field_original_name_tuple_tuple..._tuple` for the depth of list val suffixed_name = (List(fieldContext.name) ++ (1 to 
fieldContext.nestedListLevel).toList.map(_ => "tuple")).mkString("_") - if (typ.isPrimitive) new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, suffixed_name, originalType) - else new GroupType(Type.Repetition.REPEATED, suffixed_name, originalType, typ.asGroupType.getFields) + if (typ.isPrimitive) { + new PrimitiveType(Type.Repetition.REPEATED, typ.asPrimitiveType.getPrimitiveTypeName, suffixed_name, originalType) + } else { + new GroupType(Type.Repetition.REPEATED, suffixed_name, originalType, typ.asGroupType.getFields) + } } } @@ -304,7 +309,8 @@ private[scrooge] object SparkLegacyNullableElementRule extends ThreeLevelRule { override def createCompliantRepeatedType(originalElementType: Type, name: String, isElementRequired: Boolean, originalType: OriginalType, fieldContext: FieldContext): Type = { if (isElementRequired) { throw new IllegalArgumentException(s"Spark legacy mode for nullable element cannot take required element. Found: ${originalElementType}") + } else { + super.createCompliantRepeatedType(originalElementType, name, isElementRequired, originalType, fieldContext) } - super.createCompliantRepeatedType(originalElementType, name, isElementRequired, originalType, fieldContext) } } diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index ce9ed5498d..f7b96c4b0a 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -6,7 +6,7 @@ import org.apache.parquet.schema.{OriginalType, Type} * Format parquet map schema of read type to structure of file type. 
* The supported formats are: * 1) Standard repeated type of `key_value` without annotation - * 2) Legacy repeated `map field annotated with (MAP_KEY_VALUE) + * 2) Legacy repeated `map` field annotated with (MAP_KEY_VALUE) * as described in * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps * @@ -17,7 +17,8 @@ private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { def formatCompatibleRepeatedType(fileRepeatedMapType: Type, readRepeatedMapType: Type, - fieldContext: FieldContext, recursiveSolver: (Type, Type, FieldContext) => Type): Type = { + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type): Type = { val solvedRepeatedType = recursiveSolver(fileRepeatedMapType, readRepeatedMapType, fieldContext) fileRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) } From b6c8a928a24f702a401b5224b2c16ec65512c813 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Sat, 5 Oct 2019 15:50:01 -0700 Subject: [PATCH 33/34] improve code coverage and remove dead code after restructuring --- ...ParquetCollectionFormatCompatibility.scala | 6 +- .../scrooge/ParquetListFormatter.scala | 19 ++- .../parquet/scrooge/ParquetMapFormatter.scala | 25 ++-- ...etCollectionFormatCompatibilityTests.scala | 119 ++++++++++++++++-- 4 files changed, 130 insertions(+), 39 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala index 9bf03aeb78..feccf6f1a0 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala @@ -69,7 +69,7 @@ private[scrooge] object 
ParquetCollectionFormatCompatibility { } } else { // Recursive cases to handle non-primitives (lists, maps, and structs): - (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match { + (extractCollectionGroup(projectedReadType.asGroupType()), extractCollectionGroup(fileType.asGroupType())) match { case (Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), formatter=ParquetListFormatter) case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => @@ -129,7 +129,7 @@ private[scrooge] object ParquetCollectionFormatCompatibility { projectedReadGroup.groupType.withNewFields(projectedFileRepeatedType) } - private def extractCollectionGroup(typ: Type): Option[CollectionGroup] = { + private def extractCollectionGroup(typ: GroupType): Option[CollectionGroup] = { ParquetListFormatter.extractGroup(typ).orElse(ParquetMapFormatter.extractGroup(typ)) } } @@ -151,7 +151,7 @@ private[scrooge] trait ParquetCollectionFormatter { /** * Extract collection group containing repeated type of different formats. 
*/ - def extractGroup(typ: Type): Option[CollectionGroup] + def extractGroup(typ: GroupType): Option[CollectionGroup] } /** diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index 1bdadde908..e237664f1a 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -51,23 +51,18 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter } } - def extractGroup(typ: Type) : Option[ListGroup] = { - if (isListGroup(typ)) { - Some(ListGroup(typ.asGroupType(), typ.asGroupType().getFields.get(0))) + def extractGroup(groupType: GroupType) : Option[ListGroup] = { + if (isListGroup(groupType)) { + Some(ListGroup(groupType, groupType.getFields.get(0))) } else { None } } - private def isListGroup(typ: Type): Boolean = { - if (typ.isPrimitive) { - false - } else { - val groupProjection = typ.asGroupType - groupProjection.getOriginalType == OriginalType.LIST && - groupProjection.getFieldCount == 1 && - groupProjection.getFields.get(0).isRepetition(Type.Repetition.REPEATED) - } + private def isListGroup(groupType: GroupType): Boolean = { + groupType.getOriginalType == OriginalType.LIST && + groupType.getFieldCount == 1 && + groupType.getFields.get(0).isRepetition(Type.Repetition.REPEATED) } private def findRule(repeatedType: Type): Option[ParquetListFormatRule] = { diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index f7b96c4b0a..f21a8c505e 100644 --- 
a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.parquet.scrooge -import org.apache.parquet.schema.{OriginalType, Type} +import org.apache.parquet.schema.{GroupType, OriginalType, Type} /** * Format parquet map schema of read type to structure of file type. @@ -23,25 +23,20 @@ private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { fileRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) } - def extractGroup(typ: Type): Option[MapGroup] = { - if (isMapGroup(typ)) { - Some(MapGroup(typ.asGroupType(), typ.asGroupType().getFields.get(0))) + def extractGroup(groupType: GroupType): Option[MapGroup] = { + if (isMapGroup(groupType)) { + Some(MapGroup(groupType, groupType.getFields.get(0))) } else { None } } - private def isMapGroup(typ: Type): Boolean = { - if (typ.isPrimitive) { - false - } else { - val groupType = typ.asGroupType - (groupType.getOriginalType == OriginalType.MAP) && - (groupType.getFieldCount == 1) && - groupType.getFields.get(0).isRepetition(Type.Repetition.REPEATED) && - (isLegacyRepeatedType(groupType.getFields.get(0)) || - isStandardRepeatedType(groupType.getFields.get(0))) - } + private def isMapGroup(groupType: GroupType): Boolean = { + (groupType.getOriginalType == OriginalType.MAP) && + (groupType.getFieldCount == 1) && + groupType.getFields.get(0).isRepetition(Type.Repetition.REPEATED) && + (isLegacyRepeatedType(groupType.getFields.get(0)) || + isStandardRepeatedType(groupType.getFields.get(0))) } private def isLegacyRepeatedType(repeatedType: Type) = { diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala 
b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala index 8611f67d00..26e4bd589c 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala @@ -228,15 +228,26 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { } "Project for list with nullable element" should { - "file: standard, read: spark legacy write, with nullable elements" in { - for { - feasibleRepetition <- feasibleRepetitions - } yield { - testProjectedFileSchemaHasReadSchemaRepetitions( - fileSchemaFunc = listStandardRule(_, _, nullableElement = true), - projectedReadSchemaFunc = listSparkLegacyNullableElementRule, - feasibleRepetition - ) + val listNullableElementRules = Seq( + ("spark-legacy", listSparkLegacyNullableElementRule(_, _)), + ("standard-with", (from: String, to: String) => listStandardRule(from, to, nullableElement = true)) + ) + for { + (projectedReadRuleName, projectedReadSchemaFunc) <- listNullableElementRules + (fileRuleName, fileSchemaFunc) <- listNullableElementRules + } yield { + s"file: [${fileRuleName}] read: [${projectedReadRuleName}]" should { + "take option/require specifications from projected read schema" in { + for { + feasibleRepetition <- feasibleRepetitions + } yield { + testProjectedFileSchemaHasReadSchemaRepetitions( + fileSchemaFunc, + projectedReadSchemaFunc, + feasibleRepetition + ) + } + } } } @@ -935,6 +946,96 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { |} """.stripMargin) shouldEqual projectedFileSchema } + + "file: absent, read: optional list " in { + val fileSchema = MessageTypeParser.parseMessageType( + """ + |message FileSchema { + | required group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated 
binary foo_tuple_tuple (UTF8); + | } + | } + | required int32 x; + |} + """.stripMargin) + val projectedReadSchema = MessageTypeParser.parseMessageType( + """ + |message ProjectedReadSchema { + | optional group foo (LIST) { + | repeated group list { + | required group element (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | } + | } + | optional group foo_optional (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | optional int32 x; + |} + """.stripMargin) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) + MessageTypeParser.parseMessageType( + """ + |message ProjectedReadSchema { + | optional group foo (LIST) { + | repeated group foo_tuple (LIST) { + | repeated binary foo_tuple_tuple (UTF8); + | } + | } + | optional group foo_optional (LIST) { + | repeated group list { + | required binary element (UTF8); + | } + | } + | optional int32 x; + |} + """.stripMargin) shouldEqual projectedFileSchema + } + + "file: 3-level, read: unknown to return read type" in { + val fileSchema = MessageTypeParser.parseMessageType( + """ + |message FileSchema { + | required group country_codes (LIST) { + | repeated group list { + | required group element { + | required binary foo (UTF8); + | } + | } + | } + | required int32 x; + |} + """.stripMargin) + val projectedReadSchema = MessageTypeParser.parseMessageType( + """ + |message ProjectedReadSchema { + | optional group country_codes (LIST) { + | repeated group unknown_element_format { + | optional binary foo (UTF8); + | } + | } + |} + """.stripMargin) + val projectedFileSchema = testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) + + val expected = MessageTypeParser.parseMessageType( + """ + |message ProjectedReadSchema { + | optional group country_codes (LIST) { + | repeated group unknown_element_format { + | optional binary foo (UTF8); + | } + | } + |} + """.stripMargin) + expected shouldEqual 
projectedFileSchema + } } "Format compat for mixed collection" should { From 579905ccf7961b7333dbb1d09a322cd698ca7f09 Mon Sep 17 00:00:00 2001 From: Mick Jermsurawong Date: Sat, 5 Oct 2019 15:52:33 -0700 Subject: [PATCH 34/34] auto-format from running sbt test --- ...ParquetCollectionFormatCompatibility.scala | 32 +++++----- .../scrooge/ParquetListFormatter.scala | 25 ++++---- .../parquet/scrooge/ParquetMapFormatter.scala | 8 +-- ...etCollectionFormatCompatibilityTests.scala | 62 +++++++------------ 4 files changed, 53 insertions(+), 74 deletions(-) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala index feccf6f1a0..669afdf2cc 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibility.scala @@ -1,7 +1,7 @@ package com.twitter.scalding.parquet.scrooge import org.apache.parquet.schema.Type.Repetition -import org.apache.parquet.schema.{GroupType, MessageType, Type} +import org.apache.parquet.schema.{ GroupType, MessageType, Type } import org.apache.parquet.thrift.DecodingSchemaMismatchException import org.slf4j.LoggerFactory @@ -64,16 +64,15 @@ private[scrooge] object ParquetCollectionFormatCompatibility { // The field is primitive in one schema but non-primitive in the othe other throw new DecodingSchemaMismatchException( s"Found schema mismatch between projected read type:\n$projectedReadType\n" + - s"and file type:\n${fileType}" - ) + s"and file type:\n${fileType}") } } else { // Recursive cases to handle non-primitives (lists, maps, and structs): (extractCollectionGroup(projectedReadType.asGroupType()), extractCollectionGroup(fileType.asGroupType())) match { case 
(Some(projectedReadGroup: ListGroup), Some(fileGroup: ListGroup)) => - projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), formatter=ParquetListFormatter) + projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1), formatter = ParquetListFormatter) case (Some(projectedReadGroup: MapGroup), Some(fileGroup: MapGroup)) => - projectFileGroup(fileGroup, projectedReadGroup, fieldContext, formatter=ParquetMapFormatter) + projectFileGroup(fileGroup, projectedReadGroup, fieldContext, formatter = ParquetMapFormatter) case _ => // Struct projection val projectedReadGroupType = projectedReadType.asGroupType val fileGroupType = fileType.asGroupType @@ -89,8 +88,7 @@ private[scrooge] object ParquetCollectionFormatCompatibility { // The missing field is repeated or required, which is an error: throw new DecodingSchemaMismatchException( s"Found non-optional projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + - s"not present in the given file group type:\n${fileGroupType}" - ) + s"not present in the given file group type:\n${fileGroupType}") } } else { // The field is present in both schemas, so first check that the schemas specify compatible repetition @@ -101,8 +99,7 @@ private[scrooge] object ParquetCollectionFormatCompatibility { // The field is optional in the file schema but required in the projected read schema; this is an error: throw new DecodingSchemaMismatchException( s"Found required projected read field ${projectedReadField.getName}:\n$projectedReadField\n\n" + - s"on optional file field:\n${fileField}" - ) + s"on optional file field:\n${fileField}") } else { // The field's repetitions are compatible in both schemas (e.g. 
optional in both schemas or required // in both), so recursively process the field: @@ -116,15 +113,14 @@ private[scrooge] object ParquetCollectionFormatCompatibility { } private def projectFileGroup(fileGroup: CollectionGroup, - projectedReadGroup: CollectionGroup, - fieldContext: FieldContext, - formatter: ParquetCollectionFormatter): GroupType = { + projectedReadGroup: CollectionGroup, + fieldContext: FieldContext, + formatter: ParquetCollectionFormatter): GroupType = { val projectedFileRepeatedType = formatter.formatCompatibleRepeatedType( fileGroup.repeatedType, projectedReadGroup.repeatedType, fieldContext, - projectFileType - ) + projectFileType) // Respect optional/required from the projected read group. projectedReadGroup.groupType.withNewFields(projectedFileRepeatedType) } @@ -144,9 +140,9 @@ private[scrooge] trait ParquetCollectionFormatter { * @return formatted result */ def formatCompatibleRepeatedType(fileRepeatedType: Type, - readRepeatedType: Type, - fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type): Type + readRepeatedType: Type, + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type): Type /** * Extract collection group containing repeated type of different formats. 
@@ -159,7 +155,7 @@ private[scrooge] trait ParquetCollectionFormatter { * @param name field name * @param nestedListLevel li */ -private[scrooge] case class FieldContext(name: String="", nestedListLevel: Int=0) +private[scrooge] case class FieldContext(name: String = "", nestedListLevel: Int = 0) private[scrooge] sealed trait CollectionGroup { /** diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala index e237664f1a..f12adbf47b 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetListFormatter.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.parquet.scrooge -import org.apache.parquet.schema.{GroupType, OriginalType, PrimitiveType, Type} +import org.apache.parquet.schema.{ GroupType, OriginalType, PrimitiveType, Type } import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ @@ -25,13 +25,12 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter GroupArrayRule, TupleRule, StandardRule, - SparkLegacyNullableElementRule - ) + SparkLegacyNullableElementRule) def formatCompatibleRepeatedType(fileRepeatedType: Type, - readRepeatedType: Type, - fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type): Type = { + readRepeatedType: Type, + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type): Type = { (findRule(fileRepeatedType), findRule(readRepeatedType)) match { case (Some(fileRule), Some(readRule)) => { val readElementType = readRule.elementType(readRepeatedType) @@ -43,15 +42,14 @@ private[scrooge] object ParquetListFormatter extends ParquetCollectionFormatter elementName = readRule.elementName(readRepeatedType), isElementRequired = 
readRule.isElementRequired(readRepeatedType), elementOriginalType = readRule.elementOriginalType(readRepeatedType), - fieldContext = fieldContext - ) + fieldContext = fieldContext) } case _ => readRepeatedType } } - def extractGroup(groupType: GroupType) : Option[ListGroup] = { + def extractGroup(groupType: GroupType): Option[ListGroup] = { if (isListGroup(groupType)) { Some(ListGroup(groupType, groupType.getFields.get(0))) } else { @@ -94,10 +92,10 @@ private[scrooge] sealed trait ParquetListFormatRule { private[scrooge] def appliesToType(repeatedType: Type): Boolean private[scrooge] def createCompliantRepeatedType(elementType: Type, - elementName: String, - isElementRequired: Boolean, - elementOriginalType: OriginalType, - fieldContext: FieldContext): Type + elementName: String, + isElementRequired: Boolean, + elementOriginalType: OriginalType, + fieldContext: FieldContext): Type } /** @@ -221,7 +219,6 @@ private[scrooge] object TupleRule extends ParquetListFormatRule { } } - private[scrooge] sealed trait ThreeLevelRule extends ParquetListFormatRule { def constantElementName: String diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala index f21a8c505e..9a10db843c 100644 --- a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetMapFormatter.scala @@ -1,6 +1,6 @@ package com.twitter.scalding.parquet.scrooge -import org.apache.parquet.schema.{GroupType, OriginalType, Type} +import org.apache.parquet.schema.{ GroupType, OriginalType, Type } /** * Format parquet map schema of read type to structure of file type. 
@@ -16,9 +16,9 @@ import org.apache.parquet.schema.{GroupType, OriginalType, Type} private[scrooge] object ParquetMapFormatter extends ParquetCollectionFormatter { def formatCompatibleRepeatedType(fileRepeatedMapType: Type, - readRepeatedMapType: Type, - fieldContext: FieldContext, - recursiveSolver: (Type, Type, FieldContext) => Type): Type = { + readRepeatedMapType: Type, + fieldContext: FieldContext, + recursiveSolver: (Type, Type, FieldContext) => Type): Type = { val solvedRepeatedType = recursiveSolver(fileRepeatedMapType, readRepeatedMapType, fieldContext) fileRepeatedMapType.asGroupType().withNewFields(solvedRepeatedType.asGroupType().getFields) } diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala index 26e4bd589c..948e85d61d 100644 --- a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetCollectionFormatCompatibilityTests.scala @@ -2,18 +2,18 @@ package com.twitter.scalding.parquet.scrooge import java.util -import org.apache.parquet.schema.{MessageType, MessageTypeParser} -import org.apache.parquet.thrift.{DecodingSchemaMismatchException, ThriftSchemaConverter} +import org.apache.parquet.schema.{ MessageType, MessageTypeParser } +import org.apache.parquet.thrift.{ DecodingSchemaMismatchException, ThriftSchemaConverter } import org.apache.parquet.thrift.struct.ThriftField.Requirement -import org.apache.parquet.thrift.struct.{ThriftField, ThriftType} +import org.apache.parquet.thrift.struct.{ ThriftField, ThriftType } import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType -import org.apache.parquet.thrift.struct.ThriftType.{ListType, MapType, StructType} -import 
org.scalatest.{Matchers, WordSpec} +import org.apache.parquet.thrift.struct.ThriftType.{ ListType, MapType, StructType } +import org.scalatest.{ Matchers, WordSpec } class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { private def testProjectAndAssertCompatibility(fileSchema: MessageType, - projectedReadSchema: MessageType) = { + projectedReadSchema: MessageType) = { val projectedFileSchema = ParquetCollectionFormatCompatibility.projectFileSchema(fileSchema, projectedReadSchema) ScroogeReadSupport.assertGroupsAreCompatible(fileSchema, projectedFileSchema) projectedFileSchema @@ -23,7 +23,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { * Helper wrapper to specify repetition string for exhaustive tests */ case class TestRepetitions(projectedReadRepetition1: String, projectedReadRepetition2: String, - fileRepetition1: String, fileRepetition2: String) + fileRepetition1: String, fileRepetition2: String) def feasibleRepetitions = { for { projectedRepetition1 <- Seq("required", "optional") @@ -115,7 +115,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { |} """.stripMargin) - def listStandardRule(repetition1: String, repetition2: String, nullableElement:Boolean=false) = { + def listStandardRule(repetition1: String, repetition2: String, nullableElement: Boolean = false) = { val requiredOrOptional = if (nullableElement) "optional" else "required" (s""" |message schema { @@ -151,8 +151,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { ("element", listElementRule(_, _)), ("array", listArrayRule(_, _)), ("tuple", listTupleRule(_, _)), - ("standard", (from: String, to: String) => listStandardRule(from, to, nullableElement = false)) - ) + ("standard", (from: String, to: String) => listStandardRule(from, to, nullableElement = false))) // All possible format pairs of list with non-nullable element for { @@ -167,8 +166,7 @@ class 
ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { testProjectedFileSchemaHasReadSchemaRepetitions( fileSchemaFunc, projectedReadSchemaFunc, - feasibleRepetition - ) + feasibleRepetition) } } } @@ -205,33 +203,29 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { """.stripMargin) private def testProjectedFileSchemaHasReadSchemaRepetitions( - fileSchemaFunc: (String, String) => String, - projectedReadSchemaFunc: (String, String) => String, - feasibleRepetition: TestRepetitions): Any = { + fileSchemaFunc: (String, String) => String, + projectedReadSchemaFunc: (String, String) => String, + feasibleRepetition: TestRepetitions): Any = { val projectedReadSchema = MessageTypeParser.parseMessageType( projectedReadSchemaFunc( feasibleRepetition.projectedReadRepetition1, - feasibleRepetition.projectedReadRepetition2) - ) + feasibleRepetition.projectedReadRepetition2)) val fileSchema = MessageTypeParser.parseMessageType( fileSchemaFunc( feasibleRepetition.fileRepetition1, - feasibleRepetition.fileRepetition2) - ) + feasibleRepetition.fileRepetition2)) val expectedProjectedFileSchema = MessageTypeParser.parseMessageType( fileSchemaFunc( feasibleRepetition.projectedReadRepetition1, - feasibleRepetition.projectedReadRepetition2) - ) + feasibleRepetition.projectedReadRepetition2)) expectedProjectedFileSchema shouldEqual testProjectAndAssertCompatibility(fileSchema, projectedReadSchema) } "Project for list with nullable element" should { val listNullableElementRules = Seq( ("spark-legacy", listSparkLegacyNullableElementRule(_, _)), - ("standard-with", (from: String, to: String) => listStandardRule(from, to, nullableElement = true)) - ) + ("standard-with", (from: String, to: String) => listStandardRule(from, to, nullableElement = true))) for { (projectedReadRuleName, projectedReadSchemaFunc) <- listNullableElementRules (fileRuleName, fileSchemaFunc) <- listNullableElementRules @@ -244,8 +238,7 @@ class 
ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { testProjectedFileSchemaHasReadSchemaRepetitions( fileSchemaFunc, projectedReadSchemaFunc, - feasibleRepetition - ) + feasibleRepetition) } } } @@ -260,8 +253,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { testProjectedFileSchemaHasReadSchemaRepetitions( fileSchemaFunc = listSparkLegacyNullableElementRule, projectedReadSchemaFunc = requiredElementSchemaFunc, - feasibleRepetition - ) + feasibleRepetition) } e.getMessage should include("Spark legacy mode for nullable element cannot take required element") } @@ -329,8 +321,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } | } |} - """.stripMargin - ) + """.stripMargin) val projectedFileSchema = testProjectAndAssertCompatibility(message, message) message shouldEqual projectedFileSchema @@ -476,12 +467,10 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { val mapType = new MapType( new ThriftField("NOT_USED_KEY", 4, Requirement.REQUIRED, new ThriftType.StringType), new ThriftField("NOT_USED_VALUE", 5, Requirement.REQUIRED, - mapValueType) - ) + mapValueType)) new ThriftSchemaConverter().convert( new StructType(util.Arrays.asList( - new ThriftField("map_field", 6, Requirement.REQUIRED, mapType) - ), StructOrUnionType.STRUCT)) + new ThriftField("map_field", 6, Requirement.REQUIRED, mapType)), StructOrUnionType.STRUCT)) } } @@ -1046,9 +1035,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { val message = new ThriftSchemaConverter().convert( new StructType(util.Arrays.asList( new ThriftField("list_of_map", 2, Requirement.REQUIRED, new ListType( - new ThriftField("NOT_USED_ELEMENT", 2, Requirement.REQUIRED, mapType)) - ) - ), StructOrUnionType.STRUCT)) + new ThriftField("NOT_USED_ELEMENT", 2, Requirement.REQUIRED, mapType)))), StructOrUnionType.STRUCT)) message shouldEqual MessageTypeParser.parseMessageType( 
""" @@ -1063,8 +1050,7 @@ class ParquetCollectionFormatCompatibilityTests extends WordSpec with Matchers { | } |} | - """.stripMargin - ) + """.stripMargin) val projectedFileSchema = testProjectAndAssertCompatibility(message, message) message shouldEqual projectedFileSchema