From 29f9cc9570454baf80e66f323bf66f51160f35f2 Mon Sep 17 00:00:00 2001 From: m-maryia Date: Fri, 11 Jul 2025 18:20:50 +0200 Subject: [PATCH 1/5] introduce zip prefix and suffix --- model/build_internal_version.sh | 1 + model/countries/BR/BR-parsing-rules.yaml | 40 +++++++++++++ model/countries/CA/CA-formatting-rules.yaml | 7 ++- model/countries/CA/CA-parsing-rules.yaml | 44 ++++++++++++++ model/countries/NL/NL-formatting-rules.yaml | 10 +++- model/countries/NL/NL-parsing-rules.yaml | 42 ++++++++++++- model/countries/PL/PL-parsing-rules.yaml | 42 ++++++++++++- model/countries/US/US-formatting-rules.yaml | 7 ++- model/countries/US/US-parsing-rules.yaml | 56 +++++++++++++++++ .../countries/global/global-descriptions.yaml | 2 + .../global/global-formatting-rules.yaml | 5 ++ model/countries/global/global-model.yaml | 4 +- .../global/global-parsing-rules.yaml | 60 +++++++++++++++++++ model/vendor | 1 + 14 files changed, 312 insertions(+), 9 deletions(-) create mode 120000 model/build_internal_version.sh create mode 100644 model/countries/CA/CA-parsing-rules.yaml create mode 100644 model/countries/US/US-parsing-rules.yaml create mode 120000 model/vendor diff --git a/model/build_internal_version.sh b/model/build_internal_version.sh new file mode 120000 index 0000000..ebee58d --- /dev/null +++ b/model/build_internal_version.sh @@ -0,0 +1 @@ +/Users/mmaryia/autofill-i18n-model/build_internal_version.sh \ No newline at end of file diff --git a/model/countries/BR/BR-parsing-rules.yaml b/model/countries/BR/BR-parsing-rules.yaml index 9168462..744942e 100644 --- a/model/countries/BR/BR-parsing-rules.yaml +++ b/model/countries/BR/BR-parsing-rules.yaml @@ -19,6 +19,14 @@ regex_definitions: kFloorLiteralRe: # Regex for literal for a floor regex_fragment: andar + + # Regular expression to match 5-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{5}) + + # Regular expression to match 3-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{3}) capture_definitions: ParseBuildingLocation: @@ -97,6 +105,22 @@ capture_definitions: parts: - regex_fragment: '[^,\n]+' + ParsePostalCodeExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kZipCodeSeparatorsRe} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL + parsing_definitions: building-location: decomposition: @@ -165,6 +189,10 @@ parsing_definitions: output: unit-name parts: [ {regex_reference: kUnitNameValueRe} ] + postal-code: + decomposition: + capture_reference: ParsePostalCodeExpression + test_regex_definitions: # Tests for kBuildingValueRe - id: "kBuildingValueRe: plain number" @@ -324,3 +352,15 @@ test_parsing_definitions: unit-name: "12" floor: "1" landmark: "foo" +- id: "zip code with separator" + type: postal-code + input: "12345-678" + output: + postal-code-prefix: "12345" + postal-code-suffix: "678" +- id: "zip code without separator" + type: postal-code + input: "12345678" + output: + postal-code-prefix: "12345" + postal-code-suffix: "678" diff --git a/model/countries/CA/CA-formatting-rules.yaml b/model/countries/CA/CA-formatting-rules.yaml index 1844c67..3ae98d6 100644 --- a/model/countries/CA/CA-formatting-rules.yaml +++ b/model/countries/CA/CA-formatting-rules.yaml @@ -11,6 +11,10 @@ formatting-rules: - skip: country # redundant with country-name - skip: street-address # redundant with street-address-alternative-1 + postal-code: + - postal-code-prefix + - postal-code-suffix + examples: - id: name comment: | @@ -33,7 +37,8 @@ examples: address-line2: Apt. 306 locality1: Ottawa admin-area1: ON - postal-code: M5H 2J9 + postal-code-prefix: M5H + postal-code-suffix: 2J9 country: CA country-name: Canada diff --git a/model/countries/CA/CA-parsing-rules.yaml b/model/countries/CA/CA-parsing-rules.yaml new file mode 100644 index 0000000..e86e34b --- /dev/null +++ b/model/countries/CA/CA-parsing-rules.yaml @@ -0,0 +1,44 @@ +regex_definitions: + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_fragment: (?:[ABCEGHJ-NPRSTVXY]\d[ABCEGHJ-NPRSTV-Z]) + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d[ABCEGHJ-NPRSTV-Z]\d) + +capture_definitions: + ParsePostalCodeExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kWhitespaceSeparator} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeExpression + +test_parsing_definitions: +- id: "zip code" + type: postal-code + input: "K1A 0B1" + output: + postal-code-prefix: "K1A" + postal-code-suffix: "0B1" +- id: "zip code without separator" + type: postal-code + input: "K1A0B1" + output: + postal-code-prefix: "K1A" + postal-code-suffix: "0B1" diff --git a/model/countries/NL/NL-formatting-rules.yaml b/model/countries/NL/NL-formatting-rules.yaml index 2ab81f0..f1e0fbc 100644 --- a/model/countries/NL/NL-formatting-rules.yaml +++ b/model/countries/NL/NL-formatting-rules.yaml @@ -24,6 +24,10 @@ formatting-rules: - separator: "-" - unit + postal-code: + - postal-code-prefix + - postal-code-suffix + examples: - id: name comment: | @@ -46,7 +50,8 @@ examples: unit: A building-and-unit: 10-A locality1: Amsterdam - postal-code: 1234 AB + postal-code-prefix: 1234 + postal-code-suffix: AB country: NL country-name: Netherlands output: @@ -65,7 +70,8 @@ examples: building: 10 building-and-unit: 10 locality1: Amsterdam - postal-code: 1234 AB + postal-code-prefix: 1234 + postal-code-suffix: AB country: NL country-name: Netherlands output: diff --git a/model/countries/NL/NL-parsing-rules.yaml b/model/countries/NL/NL-parsing-rules.yaml index ae4b211..161ec94 100644 --- a/model/countries/NL/NL-parsing-rules.yaml +++ b/model/countries/NL/NL-parsing-rules.yaml @@ -9,6 +9,14 @@ regex_definitions: kHouseNumberAndUnitSeparator: regex_fragment: (?:[-\s/,]*) + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_fragment: (?:[1-9]\d{3}) + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_fragment: (?:[A-Z]{2}) + capture_definitions: ParseBuildingLocation: capture: @@ -31,7 +39,23 @@ capture_definitions: - capture: output: unit-name parts: [ {regex_reference: kUnitValueRe} ] - quantifier: MATCH_OPTIONAL + quantifier: MATCH_OPTIONAL + + ParsePostalCodeExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kWhitespaceSeparator} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL parsing_definitions: building-location: @@ -44,6 +68,10 @@ parsing_definitions: output: street-address-alternative-1 parts: - capture_reference: ParseBuildingLocation + + postal-code: + decomposition: + capture_reference: ParsePostalCodeExpression test_parsing_definitions: - id: "Test 1" @@ -241,3 +269,15 @@ test_parsing_definitions: building: "146" unit: "A-02" unit-name: "A-02" +- id: "zip code with separator" + type: postal-code + input: "1234 AB" + output: + postal-code-prefix: "1234" + postal-code-suffix: "AB" +- id: "zip code without separator" + type: postal-code + input: "1234AB" + output: + postal-code-prefix: "1234" + postal-code-suffix: "AB" diff --git a/model/countries/PL/PL-parsing-rules.yaml b/model/countries/PL/PL-parsing-rules.yaml index 4df0e56..b06a13d 100644 --- a/model/countries/PL/PL-parsing-rules.yaml +++ b/model/countries/PL/PL-parsing-rules.yaml @@ -24,6 +24,14 @@ regex_definitions: # Regular expression to match separator of house/building number and unit/apartment number. kHouseNumberAndUnitSeparator: regex_fragment: (?:^|[/\s]+) + + # Regular expression to match 2-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{2}) + + # Regular expression to match 3-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{3}) capture_definitions: ParseBuildingLocation: @@ -57,6 +65,22 @@ capture_definitions: output: unit-name parts: [{regex_reference: kUnitNameValueRe}] quantifier: MATCH_OPTIONAL + + ParsePostalCodeExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kZipCodeSeparatorsRe} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL parsing_definitions: @@ -85,6 +109,10 @@ parsing_definitions: unit: decomposition: capture_reference: ParseUnitWithOptionalPrefix + + postal-code: + decomposition: + capture_reference: ParsePostalCodeExpression test_parsing_definitions: @@ -222,4 +250,16 @@ test_parsing_definitions: building: "9A" unit: "m.10" unit-type: "m." - unit-name: "10" \ No newline at end of file + unit-name: "10" +- id: "zip code with separator" + type: postal-code + input: "00-843" + output: + postal-code-prefix: "00" + postal-code-suffix: "843" +- id: "zip code without separator" + type: postal-code + input: "00843" + output: + postal-code-prefix: "00" + postal-code-suffix: "843" diff --git a/model/countries/US/US-formatting-rules.yaml b/model/countries/US/US-formatting-rules.yaml index 6ff9c31..828d8bf 100644 --- a/model/countries/US/US-formatting-rules.yaml +++ b/model/countries/US/US-formatting-rules.yaml @@ -35,7 +35,8 @@ examples: address-line2: Apt. 10, Club of Autofillers locality1: New York City admin-area1: NY - postal-code: 11367 + postal-code-prefix: 11367 + postal-code-suffix: 4100 country: US country-name: USA @@ -45,5 +46,5 @@ examples: text: | 1234 Main St. Apt. 10, Club of Autofillers - New York City, NY 11367 - USA \ No newline at end of file + New York City, NY 11367-4100 + USA diff --git a/model/countries/US/US-parsing-rules.yaml b/model/countries/US/US-parsing-rules.yaml new file mode 100644 index 0000000..cfac58a --- /dev/null +++ b/model/countries/US/US-parsing-rules.yaml @@ -0,0 +1,56 @@ +regex_definitions: + # Regular expression to match 5-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{5}) + + # Regular expression to match 4-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{4}) + +capture_definitions: + ParsePostalCodeExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kZipCodeSeparatorsRe} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeExpression + +test_parsing_definitions: +- id: "zip code with suffix" + type: postal-code + input: "90210-5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: "zip code without suffix" + type: postal-code + input: "90210" + output: + postal-code-prefix: "90210" + postal-code-suffix: "" +- id: "zip code with extra spaces" + type: postal-code + input: "90210 - 5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: "zip code without separator" + type: postal-code + input: "902105555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" diff --git a/model/countries/global/global-descriptions.yaml b/model/countries/global/global-descriptions.yaml index 22690ab..84437fe 100644 --- a/model/countries/global/global-descriptions.yaml +++ b/model/countries/global/global-descriptions.yaml @@ -13,6 +13,8 @@ short-descriptions: country: 2-letter country code country-name: Name of a country postal-code: Postal code + postal-code-prefix: Postal code prefix + postal-code-suffix: Postal code suffix admin-area1: Biggest type of admin area if a country has multiple levels admin-area2: 2nd biggest type of admin area if a country has multiple levels admin-area3: 3rd biggest type of admin area if a country has multiple levels diff --git a/model/countries/global/global-formatting-rules.yaml b/model/countries/global/global-formatting-rules.yaml index 6252aaf..560cdf7 100644 --- a/model/countries/global/global-formatting-rules.yaml +++ b/model/countries/global/global-formatting-rules.yaml @@ -109,3 +109,8 @@ formatting-rules: - tel-local-prefix - separator: "" - tel-local-suffix + + postal-code: + - postal-code-prefix + - separator: "-" + - postal-code-suffix diff --git a/model/countries/global/global-model.yaml b/model/countries/global/global-model.yaml index 7e2858b..fff6adb 100644 --- a/model/countries/global/global-model.yaml +++ b/model/countries/global/global-model.yaml @@ -56,7 +56,9 @@ concepts: - admin-area2 - admin-area3 - admin-area4 - - postal-code + - postal-code: + - postal-code-prefix + - postal-code-suffix - country - country-name # Company related diff --git a/model/countries/global/global-parsing-rules.yaml b/model/countries/global/global-parsing-rules.yaml index 01ab9f4..2291d03 100644 --- a/model/countries/global/global-parsing-rules.yaml +++ b/model/countries/global/global-parsing-rules.yaml @@ -178,6 +178,16 @@ regex_definitions: kMiddleNameInitialsCharacteristicsRe: regex_fragment: |- ^(?:[A-Z]\.?(?:(?:\s|-)?[A-Z]\.?)*)$ + + # Regular expression pattern to match a sequence of alphanumeric chars. + kAlphaNumericSequenceRe: + regex_fragment: + (?:[^\W_]+) + + # Regular expression pattern to match the separator between + # zip code prefix and suffix. + kZipCodeSeparatorsRe: + regex_fragment: (?:[\s-]+) capture_definitions: # Returns an expression to parse a CJK name that includes one separator. @@ -420,6 +430,22 @@ capture_definitions: no_capture: parts: [ {regex_fragment: '\A\s*'} ] + # Returns an expression to parse `postal-code` into `postal-code-prefix` + # and `postal-code-suffix`. + ParsePostalCodeExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kAlphaNumericSequenceRe} ] + - separator: {regex_reference: kZipCodeSeparatorsRe} + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kAlphaNumericSequenceRe} ] + quantifier: MATCH_OPTIONAL + + parsing_definitions: name: # If the name is a CJK name, try to match in the following order: @@ -462,6 +488,10 @@ parsing_definitions: - decomposition: {capture_reference: ParseLastCommaFirstMiddleNameExpression} - decomposition: {capture_reference: ParseFirstMiddleLastNameExpression} + postal-code: + decomposition: + capture_reference: ParsePostalCodeExpression + test_capture_definitions: - id: "Chinese name, Unihan" capture_name: ParseSeparatedCjkNameExpression @@ -914,3 +944,33 @@ test_parsing_definitions: # family-name-first: "" # family-name-conjunction: "" # family-name-second: "孫" +- id: US zip code with suffix. + type: postal-code + input: "90210-5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: US zip code without suffix. + type: postal-code + input: "90210" + output: + postal-code-prefix: "90210" + postal-code-suffix: "" +- id: Canadian postal code. + type: postal-code + input: "M5V 2T6" + output: + postal-code-prefix: "M5V" + postal-code-suffix: "2T6" +- id: UK postal code. + type: postal-code + input: "SW1A 0AA" + output: + postal-code-prefix: "SW1A" + postal-code-suffix: "0AA" +- id: Polish postal code. + type: postal-code + input: "00-950" + output: + postal-code-prefix: "00" + postal-code-suffix: "950" diff --git a/model/vendor b/model/vendor new file mode 120000 index 0000000..3f91278 --- /dev/null +++ b/model/vendor @@ -0,0 +1 @@ +/Users/mmaryia/autofill-i18n-model/vendor/ \ No newline at end of file From 2868bfef7532e882a2f20669da4c82c37ea8ee12 Mon Sep 17 00:00:00 2001 From: m-maryia Date: Fri, 11 Jul 2025 18:20:50 +0200 Subject: [PATCH 2/5] introduce zip prefix and suffix --- model/countries/BR/BR-parsing-rules.yaml | 24 ++++++ model/countries/CA/CA-formatting-rules.yaml | 7 +- model/countries/CA/CA-parsing-rules.yaml | 32 +++++++ model/countries/NL/NL-formatting-rules.yaml | 10 ++- model/countries/NL/NL-parsing-rules.yaml | 26 +++++- model/countries/PL/PL-parsing-rules.yaml | 26 +++++- model/countries/US/US-formatting-rules.yaml | 7 +- model/countries/US/US-parsing-rules.yaml | 39 +++++++++ .../countries/global/global-descriptions.yaml | 2 + .../global/global-formatting-rules.yaml | 5 ++ model/countries/global/global-model.yaml | 4 +- .../global/global-parsing-rules.yaml | 86 +++++++++++++++++++ 12 files changed, 259 insertions(+), 9 deletions(-) create mode 100644 model/countries/CA/CA-parsing-rules.yaml create mode 100644 model/countries/US/US-parsing-rules.yaml diff --git a/model/countries/BR/BR-parsing-rules.yaml b/model/countries/BR/BR-parsing-rules.yaml index 9168462..fc18dc4 100644 --- a/model/countries/BR/BR-parsing-rules.yaml +++ b/model/countries/BR/BR-parsing-rules.yaml @@ -19,6 +19,14 @@ regex_definitions: kFloorLiteralRe: # Regex for literal for a floor regex_fragment: andar + + # Regular expression to match 5-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{5}) + + # Regular expression to match 3-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{3}) capture_definitions: ParseBuildingLocation: @@ -165,6 +173,10 @@ parsing_definitions: output: unit-name parts: [ {regex_reference: kUnitNameValueRe} ] + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression + test_regex_definitions: # Tests for kBuildingValueRe - id: "kBuildingValueRe: plain number" @@ -324,3 +336,15 @@ test_parsing_definitions: unit-name: "12" floor: "1" landmark: "foo" +- id: "zip code with separator" + type: postal-code + input: "12345-678" + output: + postal-code-prefix: "12345" + postal-code-suffix: "678" +- id: "zip code without separator" + type: postal-code + input: "12345678" + output: + postal-code-prefix: "12345" + postal-code-suffix: "678" diff --git a/model/countries/CA/CA-formatting-rules.yaml b/model/countries/CA/CA-formatting-rules.yaml index 1844c67..3ae98d6 100644 --- a/model/countries/CA/CA-formatting-rules.yaml +++ b/model/countries/CA/CA-formatting-rules.yaml @@ -11,6 +11,10 @@ formatting-rules: - skip: country # redundant with country-name - skip: street-address # redundant with street-address-alternative-1 + postal-code: + - postal-code-prefix + - postal-code-suffix + examples: - id: name comment: | @@ -33,7 +37,8 @@ examples: address-line2: Apt. 306 locality1: Ottawa admin-area1: ON - postal-code: M5H 2J9 + postal-code-prefix: M5H + postal-code-suffix: 2J9 country: CA country-name: Canada diff --git a/model/countries/CA/CA-parsing-rules.yaml b/model/countries/CA/CA-parsing-rules.yaml new file mode 100644 index 0000000..81e8bb2 --- /dev/null +++ b/model/countries/CA/CA-parsing-rules.yaml @@ -0,0 +1,32 @@ +regex_definitions: + # Regular expression to match the separator between + # zip code prefix and suffix. + kZipCodeSeparatorsRe: + regex_reference: kWhitespaceSeparator + + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_fragment: (?:[ABCEGHJ-NPRSTVXY]\d[ABCEGHJ-NPRSTV-Z]) + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d[ABCEGHJ-NPRSTV-Z]\d) + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression + +test_parsing_definitions: +- id: "zip code" + type: postal-code + input: "K1A 0B1" + output: + postal-code-prefix: "K1A" + postal-code-suffix: "0B1" +- id: "zip code without separator" + type: postal-code + input: "K1A0B1" + output: + postal-code-prefix: "K1A" + postal-code-suffix: "0B1" diff --git a/model/countries/NL/NL-formatting-rules.yaml b/model/countries/NL/NL-formatting-rules.yaml index 2ab81f0..f1e0fbc 100644 --- a/model/countries/NL/NL-formatting-rules.yaml +++ b/model/countries/NL/NL-formatting-rules.yaml @@ -24,6 +24,10 @@ formatting-rules: - separator: "-" - unit + postal-code: + - postal-code-prefix + - postal-code-suffix + examples: - id: name comment: | @@ -46,7 +50,8 @@ examples: unit: A building-and-unit: 10-A locality1: Amsterdam - postal-code: 1234 AB + postal-code-prefix: 1234 + postal-code-suffix: AB country: NL country-name: Netherlands output: @@ -65,7 +70,8 @@ examples: building: 10 building-and-unit: 10 locality1: Amsterdam - postal-code: 1234 AB + postal-code-prefix: 1234 + postal-code-suffix: AB country: NL country-name: Netherlands output: diff --git a/model/countries/NL/NL-parsing-rules.yaml b/model/countries/NL/NL-parsing-rules.yaml index ae4b211..badaec4 100644 --- a/model/countries/NL/NL-parsing-rules.yaml +++ b/model/countries/NL/NL-parsing-rules.yaml @@ -9,6 +9,14 @@ regex_definitions: kHouseNumberAndUnitSeparator: regex_fragment: (?:[-\s/,]*) + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_fragment: (?:[1-9]\d{3}) + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_fragment: (?:[A-Z]{2}) + capture_definitions: ParseBuildingLocation: capture: @@ -31,7 +39,7 @@ capture_definitions: - capture: output: unit-name parts: [ {regex_reference: kUnitValueRe} ] - quantifier: MATCH_OPTIONAL + quantifier: MATCH_OPTIONAL parsing_definitions: building-location: @@ -44,6 +52,10 @@ parsing_definitions: output: street-address-alternative-1 parts: - capture_reference: ParseBuildingLocation + + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression test_parsing_definitions: - id: "Test 1" @@ -241,3 +253,15 @@ test_parsing_definitions: building: "146" unit: "A-02" unit-name: "A-02" +- id: "zip code with separator" + type: postal-code + input: "1234 AB" + output: + postal-code-prefix: "1234" + postal-code-suffix: "AB" +- id: "zip code without separator" + type: postal-code + input: "1234AB" + output: + postal-code-prefix: "1234" + postal-code-suffix: "AB" diff --git a/model/countries/PL/PL-parsing-rules.yaml b/model/countries/PL/PL-parsing-rules.yaml index 4df0e56..f9a0e8a 100644 --- a/model/countries/PL/PL-parsing-rules.yaml +++ b/model/countries/PL/PL-parsing-rules.yaml @@ -24,6 +24,14 @@ regex_definitions: # Regular expression to match separator of house/building number and unit/apartment number. kHouseNumberAndUnitSeparator: regex_fragment: (?:^|[/\s]+) + + # Regular expression to match 2-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{2}) + + # Regular expression to match 3-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{3}) capture_definitions: ParseBuildingLocation: @@ -85,6 +93,10 @@ parsing_definitions: unit: decomposition: capture_reference: ParseUnitWithOptionalPrefix + + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression test_parsing_definitions: @@ -222,4 +234,16 @@ test_parsing_definitions: building: "9A" unit: "m.10" unit-type: "m." - unit-name: "10" \ No newline at end of file + unit-name: "10" +- id: "zip code with separator" + type: postal-code + input: "00-843" + output: + postal-code-prefix: "00" + postal-code-suffix: "843" +- id: "zip code without separator" + type: postal-code + input: "00843" + output: + postal-code-prefix: "00" + postal-code-suffix: "843" diff --git a/model/countries/US/US-formatting-rules.yaml b/model/countries/US/US-formatting-rules.yaml index 6ff9c31..828d8bf 100644 --- a/model/countries/US/US-formatting-rules.yaml +++ b/model/countries/US/US-formatting-rules.yaml @@ -35,7 +35,8 @@ examples: address-line2: Apt. 10, Club of Autofillers locality1: New York City admin-area1: NY - postal-code: 11367 + postal-code-prefix: 11367 + postal-code-suffix: 4100 country: US country-name: USA @@ -45,5 +46,5 @@ examples: text: | 1234 Main St. Apt. 10, Club of Autofillers - New York City, NY 11367 - USA \ No newline at end of file + New York City, NY 11367-4100 + USA diff --git a/model/countries/US/US-parsing-rules.yaml b/model/countries/US/US-parsing-rules.yaml new file mode 100644 index 0000000..317e1d1 --- /dev/null +++ b/model/countries/US/US-parsing-rules.yaml @@ -0,0 +1,39 @@ +regex_definitions: + # Regular expression to match 5-digit zip prefix + kZipPrefixValueRe: + regex_fragment: (?:\d{5}) + + # Regular expression to match 4-digit zip suffix + kZipSuffixValueRe: + regex_fragment: (?:\d{4}) + +parsing_definitions: + postal-code: + decomposition: + capture_reference: ParsePostalCodeOptionalSeparatorExpression + +test_parsing_definitions: +- id: "zip code with suffix" + type: postal-code + input: "90210-5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: "zip code without suffix" + type: postal-code + input: "90210" + output: + postal-code-prefix: "90210" + postal-code-suffix: "" +- id: "zip code with extra spaces" + type: postal-code + input: "90210 - 5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: "zip code without separator" + type: postal-code + input: "902105555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" diff --git a/model/countries/global/global-descriptions.yaml b/model/countries/global/global-descriptions.yaml index 22690ab..84437fe 100644 --- a/model/countries/global/global-descriptions.yaml +++ b/model/countries/global/global-descriptions.yaml @@ -13,6 +13,8 @@ short-descriptions: country: 2-letter country code country-name: Name of a country postal-code: Postal code + postal-code-prefix: Postal code prefix + postal-code-suffix: Postal code suffix admin-area1: Biggest type of admin area if a country has multiple levels admin-area2: 2nd biggest type of admin area if a country has multiple levels admin-area3: 3rd biggest type of admin area if a country has multiple levels diff --git a/model/countries/global/global-formatting-rules.yaml b/model/countries/global/global-formatting-rules.yaml index 6252aaf..560cdf7 100644 --- a/model/countries/global/global-formatting-rules.yaml +++ b/model/countries/global/global-formatting-rules.yaml @@ -109,3 +109,8 @@ formatting-rules: - tel-local-prefix - separator: "" - tel-local-suffix + + postal-code: + - postal-code-prefix + - separator: "-" + - postal-code-suffix diff --git a/model/countries/global/global-model.yaml b/model/countries/global/global-model.yaml index 7e2858b..fff6adb 100644 --- a/model/countries/global/global-model.yaml +++ b/model/countries/global/global-model.yaml @@ -56,7 +56,9 @@ concepts: - admin-area2 - admin-area3 - admin-area4 - - postal-code + - postal-code: + - postal-code-prefix + - postal-code-suffix - country - country-name # Company related diff --git a/model/countries/global/global-parsing-rules.yaml b/model/countries/global/global-parsing-rules.yaml index 01ab9f4..1121071 100644 --- a/model/countries/global/global-parsing-rules.yaml +++ b/model/countries/global/global-parsing-rules.yaml @@ -178,6 +178,24 @@ regex_definitions: kMiddleNameInitialsCharacteristicsRe: regex_fragment: |- ^(?:[A-Z]\.?(?:(?:\s|-)?[A-Z]\.?)*)$ + + # Regular expression pattern to match a sequence of alphanumeric chars. + kAlphaNumericSequenceRe: + regex_fragment: + (?:[^\W_]+) + + # Regular expression pattern to match the separator between + # zip code prefix and suffix. + kZipCodeSeparatorsRe: + regex_fragment: (?:[\s-]+) + + # Regular expression to match zip prefix + kZipPrefixValueRe: + regex_reference: kAlphaNumericSequenceRe + + # Regular expression to match zip suffix + kZipSuffixValueRe: + regex_reference: kAlphaNumericSequenceRe capture_definitions: # Returns an expression to parse a CJK name that includes one separator. @@ -420,6 +438,40 @@ capture_definitions: no_capture: parts: [ {regex_fragment: '\A\s*'} ] + # Returns an expression to parse `postal-code` into `postal-code-prefix` + # and `postal-code-suffix`, separator is mandatory. + ParsePostalCodeMandatorySeparatorExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - separator: {regex_reference: kZipCodeSeparatorsRe} + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL + + # Returns an expression to parse `postal-code` into `postal-code-prefix` + # and `postal-code-suffix`, separator is optional. + ParsePostalCodeOptionalSeparatorExpression: + capture: + output: postal-code + parts: + - capture: + output: postal-code-prefix + parts: [ {regex_reference: kZipPrefixValueRe} ] + - no_capture: + parts: + - separator: {regex_reference: kZipCodeSeparatorsRe} + quantifier: MATCH_OPTIONAL + - capture: + output: postal-code-suffix + parts: [ {regex_reference: kZipSuffixValueRe} ] + quantifier: MATCH_OPTIONAL + + parsing_definitions: name: # If the name is a CJK name, try to match in the following order: @@ -462,6 +514,10 @@ parsing_definitions: - decomposition: {capture_reference: ParseLastCommaFirstMiddleNameExpression} - decomposition: {capture_reference: ParseFirstMiddleLastNameExpression} + postal-code: + decomposition: + capture_reference: ParsePostalCodeMandatorySeparatorExpression + test_capture_definitions: - id: "Chinese name, Unihan" capture_name: ParseSeparatedCjkNameExpression @@ -914,3 +970,33 @@ test_parsing_definitions: # family-name-first: "" # family-name-conjunction: "" # family-name-second: "孫" +- id: US zip code with suffix. + type: postal-code + input: "90210-5555" + output: + postal-code-prefix: "90210" + postal-code-suffix: "5555" +- id: US zip code without suffix. + type: postal-code + input: "90210" + output: + postal-code-prefix: "90210" + postal-code-suffix: "" +- id: Canadian postal code. + type: postal-code + input: "M5V 2T6" + output: + postal-code-prefix: "M5V" + postal-code-suffix: "2T6" +- id: UK postal code. + type: postal-code + input: "SW1A 0AA" + output: + postal-code-prefix: "SW1A" + postal-code-suffix: "0AA" +- id: Polish postal code. + type: postal-code + input: "00-950" + output: + postal-code-prefix: "00" + postal-code-suffix: "950" From 714afb7d005112a2fd8f4241ebc101f9c6d703cd Mon Sep 17 00:00:00 2001 From: m-maryia Date: Mon, 14 Jul 2025 16:24:09 +0200 Subject: [PATCH 3/5] remove symlinks and redundant parsing definition --- model/build_internal_version.sh | 1 - model/countries/PL/PL-parsing-rules.yaml | 16 ---------------- model/vendor | 1 - 3 files changed, 18 deletions(-) delete mode 120000 model/build_internal_version.sh delete mode 120000 model/vendor diff --git a/model/build_internal_version.sh b/model/build_internal_version.sh deleted file mode 120000 index ebee58d..0000000 --- a/model/build_internal_version.sh +++ /dev/null @@ -1 +0,0 @@ -/Users/mmaryia/autofill-i18n-model/build_internal_version.sh \ No newline at end of file diff --git a/model/countries/PL/PL-parsing-rules.yaml b/model/countries/PL/PL-parsing-rules.yaml index d1dbab8..f9a0e8a 100644 --- a/model/countries/PL/PL-parsing-rules.yaml +++ b/model/countries/PL/PL-parsing-rules.yaml @@ -65,22 +65,6 @@ capture_definitions: output: unit-name parts: [{regex_reference: kUnitNameValueRe}] quantifier: MATCH_OPTIONAL - - ParsePostalCodeExpression: - capture: - output: postal-code - parts: - - capture: - output: postal-code-prefix - parts: [ {regex_reference: kZipPrefixValueRe} ] - - no_capture: - parts: - - separator: {regex_reference: kZipCodeSeparatorsRe} - quantifier: MATCH_OPTIONAL - - capture: - output: postal-code-suffix - parts: [ {regex_reference: kZipSuffixValueRe} ] - quantifier: MATCH_OPTIONAL parsing_definitions: diff --git a/model/vendor b/model/vendor deleted file mode 120000 index 3f91278..0000000 --- a/model/vendor +++ /dev/null @@ -1 +0,0 @@ -/Users/mmaryia/autofill-i18n-model/vendor/ \ No newline at end of file From d4c7a2ddd84e9c75b13ddb67e63aad23cd6248e2 Mon Sep 17 00:00:00 2001 From: m-maryia Date: Mon, 14 Jul 2025 16:48:51 +0200 Subject: [PATCH 4/5] fix test ids --- model/countries/BR/BR-parsing-rules.yaml | 4 ++-- model/countries/CA/CA-parsing-rules.yaml | 4 ++-- model/countries/NL/NL-parsing-rules.yaml | 4 ++-- model/countries/PL/PL-parsing-rules.yaml | 4 ++-- model/countries/US/US-parsing-rules.yaml | 8 ++++---- model/countries/global/global-parsing-rules.yaml | 10 +++++----- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/model/countries/BR/BR-parsing-rules.yaml b/model/countries/BR/BR-parsing-rules.yaml index fc18dc4..14aeb1a 100644 --- a/model/countries/BR/BR-parsing-rules.yaml +++ b/model/countries/BR/BR-parsing-rules.yaml @@ -336,13 +336,13 @@ test_parsing_definitions: unit-name: "12" floor: "1" landmark: "foo" -- id: "zip code with separator" +- id: "Zip code with separator" type: postal-code input: "12345-678" output: postal-code-prefix: "12345" postal-code-suffix: "678" -- id: "zip code without separator" +- id: "Zip code without separator" type: postal-code input: "12345678" output: diff --git a/model/countries/CA/CA-parsing-rules.yaml b/model/countries/CA/CA-parsing-rules.yaml index 81e8bb2..4c7110c 100644 --- a/model/countries/CA/CA-parsing-rules.yaml +++ b/model/countries/CA/CA-parsing-rules.yaml @@ -18,13 +18,13 @@ parsing_definitions: capture_reference: ParsePostalCodeOptionalSeparatorExpression test_parsing_definitions: -- id: "zip code" +- id: "Zip code with separator" type: postal-code input: "K1A 0B1" output: postal-code-prefix: "K1A" postal-code-suffix: "0B1" -- id: "zip code without separator" +- id: "Zip code without separator" type: postal-code input: "K1A0B1" output: diff --git a/model/countries/NL/NL-parsing-rules.yaml b/model/countries/NL/NL-parsing-rules.yaml index badaec4..b7a689e 100644 --- a/model/countries/NL/NL-parsing-rules.yaml +++ b/model/countries/NL/NL-parsing-rules.yaml @@ -253,13 +253,13 @@ test_parsing_definitions: building: "146" unit: "A-02" unit-name: "A-02" -- id: "zip code with separator" +- id: "Test 19: zip code with separator" type: postal-code input: "1234 AB" output: postal-code-prefix: "1234" postal-code-suffix: "AB" -- id: "zip code without separator" +- id: "Test 20: zip code without separator" type: postal-code input: "1234AB" output: diff --git a/model/countries/PL/PL-parsing-rules.yaml b/model/countries/PL/PL-parsing-rules.yaml index f9a0e8a..7ede4eb 100644 --- a/model/countries/PL/PL-parsing-rules.yaml +++ b/model/countries/PL/PL-parsing-rules.yaml @@ -235,13 +235,13 @@ test_parsing_definitions: unit: "m.10" unit-type: "m." unit-name: "10" -- id: "zip code with separator" +- id: "Test 15: zip code with separator" type: postal-code input: "00-843" output: postal-code-prefix: "00" postal-code-suffix: "843" -- id: "zip code without separator" +- id: "Test 16: zip code without separator" type: postal-code input: "00843" output: diff --git a/model/countries/US/US-parsing-rules.yaml b/model/countries/US/US-parsing-rules.yaml index 317e1d1..cbd6893 100644 --- a/model/countries/US/US-parsing-rules.yaml +++ b/model/countries/US/US-parsing-rules.yaml @@ -13,25 +13,25 @@ parsing_definitions: capture_reference: ParsePostalCodeOptionalSeparatorExpression test_parsing_definitions: -- id: "zip code with suffix" +- id: "Zip code with suffix" type: postal-code input: "90210-5555" output: postal-code-prefix: "90210" postal-code-suffix: "5555" -- id: "zip code without suffix" +- id: "Zip code without suffix" type: postal-code input: "90210" output: postal-code-prefix: "90210" postal-code-suffix: "" -- id: "zip code with extra spaces" +- id: "Zip code with extra spaces" type: postal-code input: "90210 - 5555" output: postal-code-prefix: "90210" postal-code-suffix: "5555" -- id: "zip code without separator" +- id: "Zip code without separator" type: postal-code input: "902105555" output: diff --git a/model/countries/global/global-parsing-rules.yaml b/model/countries/global/global-parsing-rules.yaml index 1121071..d28508d 100644 --- a/model/countries/global/global-parsing-rules.yaml +++ b/model/countries/global/global-parsing-rules.yaml @@ -970,31 +970,31 @@ test_parsing_definitions: # family-name-first: "" # family-name-conjunction: "" # family-name-second: "孫" -- id: US zip code with suffix. +- id: US zip code with suffix type: postal-code input: "90210-5555" output: postal-code-prefix: "90210" postal-code-suffix: "5555" -- id: US zip code without suffix. +- id: US zip code without suffix type: postal-code input: "90210" output: postal-code-prefix: "90210" postal-code-suffix: "" -- id: Canadian postal code. +- id: Canadian postal code type: postal-code input: "M5V 2T6" output: postal-code-prefix: "M5V" postal-code-suffix: "2T6" -- id: UK postal code. +- id: UK postal code type: postal-code input: "SW1A 0AA" output: postal-code-prefix: "SW1A" postal-code-suffix: "0AA" -- id: Polish postal code. +- id: Polish postal code type: postal-code input: "00-950" output: From fdf34327f3ae80633f64785a31560571298bd154 Mon Sep 17 00:00:00 2001 From: m-maryia Date: Mon, 21 Jul 2025 18:05:29 +0200 Subject: [PATCH 5/5] move kAlphaNumericSequenceRe to the top of file --- model/countries/global/global-parsing-rules.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/model/countries/global/global-parsing-rules.yaml b/model/countries/global/global-parsing-rules.yaml index d28508d..34fd886 100644 --- a/model/countries/global/global-parsing-rules.yaml +++ b/model/countries/global/global-parsing-rules.yaml @@ -7,6 +7,11 @@ regex_definitions: kCommaOrWhitespaceSeparator: regex_fragment: (?:^|[,\s]+) + # Regular expression pattern to match a sequence of alphanumeric chars. + kAlphaNumericSequenceRe: + regex_fragment: + (?:[^\W_]+) + # Regular expressions pattern of common two-character CJK last names. # Korean names are written in Hangul. # Chinese names are written in their traditional and simplified version. @@ -178,11 +183,6 @@ regex_definitions: kMiddleNameInitialsCharacteristicsRe: regex_fragment: |- ^(?:[A-Z]\.?(?:(?:\s|-)?[A-Z]\.?)*)$ - - # Regular expression pattern to match a sequence of alphanumeric chars. - kAlphaNumericSequenceRe: - regex_fragment: - (?:[^\W_]+) # Regular expression pattern to match the separator between # zip code prefix and suffix.