diff --git a/model/countries/BR/BR-parsing-rules.yaml b/model/countries/BR/BR-parsing-rules.yaml index 9168462..14aeb1a 100644 --- a/model/countries/BR/BR-parsing-rules.yaml +++ b/model/countries/BR/BR-parsing-rules.yaml @@ -19,6 +19,14 @@ regex_definitions: kFloorLiteralRe: # Regex for literal for a floor regex_fragment: andar + + # Regular expression to match 5-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{5}) + + # Regular expression to match 3-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{3}) capture_definitions: ParseBuildingLocation: @@ -165,6 +173,10 @@ parsing_definitions: output: unit-name parts: [ {regex_reference: kUnitNameValueRe} ] + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression + test_regex_definitions: # Tests for kBuildingValueRe - id: "kBuildingValueRe: plain number" @@ -324,3 +336,15 @@ test_parsing_definitions: unit-name: "12" floor: "1" landmark: "foo" +- id: "Zip code with separator" + type: postal-code + input: "12345-678" + output: + postal-code-prefix: "12345" + postal-code-suffix: "678" +- id: "Zip code without separator" + type: postal-code + input: "12345678" + output: + postal-code-prefix: "12345" + postal-code-suffix: "678" diff --git a/model/countries/CA/CA-formatting-rules.yaml b/model/countries/CA/CA-formatting-rules.yaml index 1844c67..3ae98d6 100644 --- a/model/countries/CA/CA-formatting-rules.yaml +++ b/model/countries/CA/CA-formatting-rules.yaml @@ -11,6 +11,10 @@ formatting-rules: - skip: country # redundant with country-name - skip: street-address # redundant with street-address-alternative-1 + postal-code: + - postal-code-prefix + - postal-code-suffix + examples: - id: name comment: | @@ -33,7 +37,8 @@ examples: address-line2: Apt. 306 locality1: Ottawa admin-area1: ON - postal-code: M5H 2J9 + postal-code-prefix: M5H + postal-code-suffix: 2J9 country: CA country-name: Canada diff --git a/model/countries/CA/CA-parsing-rules.yaml b/model/countries/CA/CA-parsing-rules.yaml new file mode 100644 index 0000000..4c7110c --- /dev/null +++ b/model/countries/CA/CA-parsing-rules.yaml @@ -0,0 +1,32 @@ +regex_definitions: + # Regular expression to match the separator between + # zip code prefix and suffix. + kZipCodeSeparatorsRe: + regex_reference: kWhitespaceSeparator + + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_fragment: (?:[ABCEGHJ-NPRSTVXY]\d[ABCEGHJ-NPRSTV-Z]) + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d[ABCEGHJ-NPRSTV-Z]\d) + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression + +test_parsing_definitions: +- id: "Zip code with separator" + type: postal-code + input: "K1A 0B1" + output: + postal-code-prefix: "K1A" + postal-code-suffix: "0B1" +- id: "Zip code without separator" + type: postal-code + input: "K1A0B1" + output: + postal-code-prefix: "K1A" + postal-code-suffix: "0B1" diff --git a/model/countries/NL/NL-formatting-rules.yaml b/model/countries/NL/NL-formatting-rules.yaml index 2ab81f0..f1e0fbc 100644 --- a/model/countries/NL/NL-formatting-rules.yaml +++ b/model/countries/NL/NL-formatting-rules.yaml @@ -24,6 +24,10 @@ formatting-rules: - separator: "-" - unit + postal-code: + - postal-code-prefix + - postal-code-suffix + examples: - id: name comment: | @@ -46,7 +50,8 @@ examples: unit: A building-and-unit: 10-A locality1: Amsterdam - postal-code: 1234 AB + postal-code-prefix: 1234 + postal-code-suffix: AB country: NL country-name: Netherlands output: @@ -65,7 +70,8 @@ examples: building: 10 building-and-unit: 10 locality1: Amsterdam - postal-code: 1234 AB + postal-code-prefix: 1234 + postal-code-suffix: AB country: NL country-name: Netherlands output: diff --git a/model/countries/NL/NL-parsing-rules.yaml b/model/countries/NL/NL-parsing-rules.yaml index ae4b211..b7a689e 100644 --- a/model/countries/NL/NL-parsing-rules.yaml +++ b/model/countries/NL/NL-parsing-rules.yaml @@ -9,6 +9,14 @@ regex_definitions: kHouseNumberAndUnitSeparator: regex_fragment: (?:[-\s/,]*) + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_fragment: (?:[1-9]\d{3}) + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_fragment: (?:[A-Z]{2}) + capture_definitions: ParseBuildingLocation: capture: @@ -31,7 +39,7 @@ capture_definitions: - capture: output: unit-name parts: [ {regex_reference: kUnitValueRe} ] - quantifier: MATCH_OPTIONAL + quantifier: MATCH_OPTIONAL parsing_definitions: building-location: @@ -44,6 +52,10 @@ parsing_definitions: output: street-address-alternative-1 parts: - capture_reference: ParseBuildingLocation + + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression test_parsing_definitions: - id: "Test 1" @@ -241,3 +253,15 @@ test_parsing_definitions: building: "146" unit: "A-02" unit-name: "A-02" +- id: "Test 19: zip code with separator" + type: postal-code + input: "1234 AB" + output: + postal-code-prefix: "1234" + postal-code-suffix: "AB" +- id: "Test 20: zip code without separator" + type: postal-code + input: "1234AB" + output: + postal-code-prefix: "1234" + postal-code-suffix: "AB" diff --git a/model/countries/PL/PL-parsing-rules.yaml b/model/countries/PL/PL-parsing-rules.yaml index 4df0e56..7ede4eb 100644 --- a/model/countries/PL/PL-parsing-rules.yaml +++ b/model/countries/PL/PL-parsing-rules.yaml @@ -24,6 +24,14 @@ regex_definitions: # Regular expression to match separator of house/building number and unit/apartment number. kHouseNumberAndUnitSeparator: regex_fragment: (?:^|[/\s]+) + + # Regular expression to match 2-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{2}) + + # Regular expression to match 3-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{3}) capture_definitions: ParseBuildingLocation: @@ -85,6 +93,10 @@ parsing_definitions: unit: decomposition: capture_reference: ParseUnitWithOptionalPrefix + + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression test_parsing_definitions: @@ -222,4 +234,16 @@ test_parsing_definitions: building: "9A" unit: "m.10" unit-type: "m." - unit-name: "10" \ No newline at end of file + unit-name: "10" +- id: "Test 15: zip code with separator" + type: postal-code + input: "00-843" + output: + postal-code-prefix: "00" + postal-code-suffix: "843" +- id: "Test 16: zip code without separator" + type: postal-code + input: "00843" + output: + postal-code-prefix: "00" + postal-code-suffix: "843" diff --git a/model/countries/US/US-formatting-rules.yaml b/model/countries/US/US-formatting-rules.yaml index 6ff9c31..828d8bf 100644 --- a/model/countries/US/US-formatting-rules.yaml +++ b/model/countries/US/US-formatting-rules.yaml @@ -35,7 +35,8 @@ examples: address-line2: Apt. 10, Club of Autofillers locality1: New York City admin-area1: NY - postal-code: 11367 + postal-code-prefix: 11367 + postal-code-suffix: 4100 country: US country-name: USA @@ -45,5 +46,5 @@ examples: text: | 1234 Main St. Apt. 10, Club of Autofillers - New York City, NY 11367 - USA \ No newline at end of file + New York City, NY 11367-4100 + USA diff --git a/model/countries/US/US-parsing-rules.yaml b/model/countries/US/US-parsing-rules.yaml new file mode 100644 index 0000000..cbd6893 --- /dev/null +++ b/model/countries/US/US-parsing-rules.yaml @@ -0,0 +1,39 @@ +regex_definitions: + # Regular expression to match 5-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{5}) + + # Regular expression to match 4-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{4}) + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression + +test_parsing_definitions: +- id: "Zip code with suffix" + type: postal-code + input: "90210-5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: "Zip code without suffix" + type: postal-code + input: "90210" + output: + postal-code-prefix: "90210" + postal-code-suffix: "" +- id: "Zip code with extra spaces" + type: postal-code + input: "90210 - 5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: "Zip code without separator" + type: postal-code + input: "902105555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" diff --git a/model/countries/global/global-descriptions.yaml b/model/countries/global/global-descriptions.yaml index 22690ab..84437fe 100644 --- a/model/countries/global/global-descriptions.yaml +++ b/model/countries/global/global-descriptions.yaml @@ -13,6 +13,8 @@ short-descriptions: country: 2-letter country code country-name: Name of a country postal-code: Postal code + postal-code-prefix: Postal code prefix + postal-code-suffix: Postal code suffix admin-area1: Biggest type of admin area if a country has multiple levels admin-area2: 2nd biggest type of admin area if a country has multiple levels admin-area3: 3rd biggest type of admin area if a country has multiple levels diff --git a/model/countries/global/global-formatting-rules.yaml b/model/countries/global/global-formatting-rules.yaml index 6252aaf..560cdf7 100644 --- a/model/countries/global/global-formatting-rules.yaml +++ b/model/countries/global/global-formatting-rules.yaml @@ -109,3 +109,8 @@ formatting-rules: - tel-local-prefix - separator: "" - tel-local-suffix + + postal-code: + - postal-code-prefix + - separator: "-" + - postal-code-suffix diff --git a/model/countries/global/global-model.yaml b/model/countries/global/global-model.yaml index 7e2858b..fff6adb 100644 --- a/model/countries/global/global-model.yaml +++ b/model/countries/global/global-model.yaml @@ -56,7 +56,9 @@ concepts: - admin-area2 - admin-area3 - admin-area4 - - postal-code + - postal-code: + - postal-code-prefix + - postal-code-suffix - country - country-name # Company related diff --git a/model/countries/global/global-parsing-rules.yaml b/model/countries/global/global-parsing-rules.yaml index 01ab9f4..34fd886 100644 --- a/model/countries/global/global-parsing-rules.yaml +++ b/model/countries/global/global-parsing-rules.yaml @@ -7,6 +7,11 @@ regex_definitions: kCommaOrWhitespaceSeparator: regex_fragment: (?:^|[,\s]+) + # Regular expression pattern to match a sequence of alphanumeric chars. + kAlphaNumericSequenceRe: + regex_fragment: + (?:[^\W_]+) + # Regular expressions pattern of common two-character CJK last names. # Korean names are written in Hangul. # Chinese names are written in their traditional and simplified version. @@ -179,6 +184,19 @@ regex_definitions: regex_fragment: |- ^(?:[A-Z]\.?(?:(?:\s|-)?[A-Z]\.?)*)$ + # Regular expression pattern to match the separator between + # zip code prefix and suffix. + kZipCodeSeparatorsRe: + regex_fragment: (?:[\s-]+) + + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_reference: kAlphaNumericSequenceRe + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_reference: kAlphaNumericSequenceRe + capture_definitions: # Returns an expression to parse a CJK name that includes one separator. # The full name is parsed into |name|, the part of the name before the @@ -420,6 +438,40 @@ capture_definitions: no_capture: parts: [ {regex_fragment: '\A\s*'} ] + # Returns an expression to parse `postal-code` into `postal-code-prefix` + # and `postal-code-suffix`, separator is mandatory. + ParsePostalCodeMandatorySeparatorExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - separator: {regex_reference: kZipCodeSeparatorsRe} + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL + + # Returns an expression to parse `postal-code` into `postal-code-prefix` + # and `postal-code-suffix`, separator is optional. + ParsePostalCodeOptionalSeparatorExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kZipCodeSeparatorsRe} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL + + parsing_definitions: name: # If the name is a CJK name, try to match in the following order: @@ -462,6 +514,10 @@ parsing_definitions: - decomposition: {capture_reference: ParseLastCommaFirstMiddleNameExpression} - decomposition: {capture_reference: ParseFirstMiddleLastNameExpression} + postal-code: + decomposition: + capture_reference: ParsePostalCodeMandatorySeparatorExpression + test_capture_definitions: - id: "Chinese name, Unihan" capture_name: ParseSeparatedCjkNameExpression @@ -914,3 +970,33 @@ test_parsing_definitions: # family-name-first: "" # family-name-conjunction: "" # family-name-second: "孫" +- id: US zip code with suffix + type: postal-code + input: "90210-5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: US zip code without suffix + type: postal-code + input: "90210" + output: + postal-code-prefix: "90210" + postal-code-suffix: "" +- id: Canadian postal code + type: postal-code + input: "M5V 2T6" + output: + postal-code-prefix: "M5V" + postal-code-suffix: "2T6" +- id: UK postal code + type: postal-code + input: "SW1A 0AA" + output: + postal-code-prefix: "SW1A" + postal-code-suffix: "0AA" +- id: Polish postal code + type: postal-code + input: "00-950" + output: + postal-code-prefix: "00" + postal-code-suffix: "950"