From 4b1f352e542581e5c73752bd90ab674dff6b4161 Mon Sep 17 00:00:00 2001 From: Peter Johnson <738069+missinglink@users.noreply.github.com> Date: Mon, 9 Mar 2026 15:11:47 +0100 Subject: [PATCH] feat(dedupe): squash empire and country --- helper/diffPlaces.js | 19 ++++ test/unit/middleware/dedupe.js | 168 +++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 0a3c195b7..136c90db7 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -95,6 +95,14 @@ function isParentHierarchyDifferent(item1, item2){ } } + // special case to consider empires and country as the same for deduplication purposes + if ( + (item1.layer === 'empire' && item2.layer === 'country') || + (item1.layer === 'country' && item2.layer === 'empire') + ) { + return false; + } + // special handling of postal codes, which we consider to be strictly // unique within a single country/dependency regardless of the rest of // the hierarchy (ie. we ignore other parent properties) @@ -351,6 +359,17 @@ function layerDependentNormalization(names, layer) { }); } + // empire / country USA synonyms + if (layer === 'empire' || layer === 'country') { + _.forEach(names, (value, lang) => { + copy[lang] = field.getArrayValue(value).map(name => { + return name + .replace(/^(united states) of america$/i, '$1') + .trim(); + }); + }); + } + // county if( layer === 'county' ){ _.forEach(names, (value, lang) => { diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index f8872ee4b..d3ea2178e 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -821,6 +821,174 @@ module.exports.tests.priority = function(test, common) { }); }); + test('real-world test New Zealand: empire vs country', function (t) { + var req = { + clean: { + text: 'New Zealand', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { + 'default': 'New Zealand' + }, + 'source': 'whosonfirst', + 'source_id': '136253053', + 'layer': 'empire', + 'parent': { + 'empire_id': 136253053 + }, + }, + { + 'name': { + 'default': [ 'New Zealand', 'Aotearoa' ], + }, + 'source': 'whosonfirst', + 'source_id': '85633345', + 'layer': 'country', + 'parent': { + 'continent_id': 102191583, + 'country_id': 85633345 + }, + } + ] + }; + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'results have fewer items than before'); + t.equal(res.data[0].layer, 'country', 'empire result removed'); + t.end(); + }); + }); + + test('real-world test New Zealand: empire vs country - inverted order', function (t) { + var req = { + clean: { + text: 'New Zealand', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { + 'default': [ 'New Zealand', 'Aotearoa' ], + }, + 'source': 'whosonfirst', + 'source_id': '85633345', + 'layer': 'country', + 'parent': { + 'continent_id': 102191583, + 'country_id': 85633345 + }, + }, + { + 'name': { + 'default': 'New Zealand' + }, + 'source': 'whosonfirst', + 'source_id': '136253053', + 'layer': 'empire', + 'parent': { + 'empire_id': 136253053 + }, + } + ] + }; + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'results have fewer items than before'); + t.equal(res.data[0].layer, 'country', 'empire result removed'); + t.end(); + }); + }); + + test('real-world test United States: empire vs country', function (t) { + var req = { + clean: { + text: 'United States', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { + 'default': 'United States' + }, + 'source': 'whosonfirst', + 'source_id': '85633793', + 'layer': 'country', + 'parent': { + 'empire_id': 136253057, + 'country_id': 85633793 + }, + }, + { + 'name': { + 'default': 'United States of America', + }, + 'source': 'whosonfirst', + 'source_id': '136253057', + 'layer': 'empire', + 'parent': { + 'empire_id': 136253057, + }, + } + ] + }; + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'results have fewer items than before'); + t.equal(res.data[0].layer, 'country', 'empire result removed'); + t.end(); + }); + }); + + test('real-world test United States: empire vs country - inverted order', function (t) { + var req = { + clean: { + text: 'United States', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { + 'default': 'United States of America', + }, + 'source': 'whosonfirst', + 'source_id': '136253057', + 'layer': 'empire', + 'parent': { + 'empire_id': 136253057, + }, + }, + { + 'name': { + 'default': 'United States' + }, + 'source': 'whosonfirst', + 'source_id': '85633793', + 'layer': 'country', + 'parent': { + 'empire_id': 136253057, + 'country_id': 85633793 + }, + } + ] + }; + + dedupe(req, res, function () { + t.equal(res.data.length, 1, 'results have fewer items than before'); + t.equal(res.data[0].layer, 'country', 'empire result removed'); + t.end(); + }); + }); + test('A->B B->C dependency graph', function (t) { var req = { clean: {