From 1e9711f5202c9e09d6dab7e7eafdcc076ed16d09 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 7 Nov 2025 11:08:04 -0700 Subject: [PATCH 01/12] Add aggregated and concat dup detection strategies. Update front end dataqa plugin to follow the new schema for p_dups. --- kardia-app/modules/base/dups/address.qy | 62 ++++ kardia-app/modules/base/dups/concat.qy | 62 ++++ kardia-app/modules/base/dups/dups.cluster | 52 +++ kardia-app/modules/base/dups/email.qy | 64 ++++ kardia-app/modules/base/dups/get/addresses.qy | 25 ++ kardia-app/modules/base/dups/get/concats.qy | 56 +++ kardia-app/modules/base/dups/get/emails.qy | 18 + kardia-app/modules/base/dups/get/names.qy | 25 ++ kardia-app/modules/base/dups/get/phones.qy | 21 ++ kardia-app/modules/base/dups/globals.qy | 18 + kardia-app/modules/base/dups/name.qy | 53 +++ kardia-app/modules/base/dups/phone.qy | 62 ++++ kardia-app/modules/base/dups/update.qy | 350 ++++++++++++++++++ .../base/plugin_base_dataqa_duplicates.cmp | 134 +++++-- kardia-app/modules/base/update_duplicates.qy | 28 -- 15 files changed, 970 insertions(+), 60 deletions(-) create mode 100644 kardia-app/modules/base/dups/address.qy create mode 100644 kardia-app/modules/base/dups/concat.qy create mode 100644 kardia-app/modules/base/dups/dups.cluster create mode 100644 kardia-app/modules/base/dups/email.qy create mode 100644 kardia-app/modules/base/dups/get/addresses.qy create mode 100644 kardia-app/modules/base/dups/get/concats.qy create mode 100644 kardia-app/modules/base/dups/get/emails.qy create mode 100644 kardia-app/modules/base/dups/get/names.qy create mode 100644 kardia-app/modules/base/dups/get/phones.qy create mode 100644 kardia-app/modules/base/dups/globals.qy create mode 100644 kardia-app/modules/base/dups/name.qy create mode 100644 kardia-app/modules/base/dups/phone.qy create mode 100644 kardia-app/modules/base/dups/update.qy delete mode 100644 kardia-app/modules/base/update_duplicates.qy diff --git a/kardia-app/modules/base/dups/address.qy 
b/kardia-app/modules/base/dups/address.qy new file mode 100644 index 00000000..74484b30 --- /dev/null +++ b/kardia-app/modules/base/dups/address.qy @@ -0,0 +1,62 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'address.qy start' + + -- Get the list of addresses we will be searching for duplicates. + DECLARE collection addresses scope application; + + -- Assume the list of addresses was already computed by the caller. + -- INSERT INTO collection addresses SELECT :key, :address FROM /apps/kardia/modules/base/dups/get/addresses.qy ; + + -- Declare an object for storing values locally. + declare object value; + + -- Count data (the row count is passed to globals.qy below). + SELECT + :value:num_data = count(1) + FROM + collection addresses + ; + + -- Compute global values (k and the clustering algorithm). + SELECT + :value:k = :k, + :value:algorithm = :algorithm + FROM + expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) + ; + + -- Detect dups: one row per partner pair, keeping the highest similarity reported for that pair. + SELECT + key1 = :p1:p_partner_key, + key2 = :p2:p_partner_key, + sim = max(:d:sim) + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, + /apps/kardia/data/Kardia_DB/p_location/rows l1, + /apps/kardia/data/Kardia_DB/p_partner/rows p2, + /apps/kardia/data/Kardia_DB/p_location/rows l2, + expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:algorithm + + '&k=' + :value:k + + '&field=addresses' + + '&data=address' + + '/dups' + ) d + WHERE + :p1:p_partner_key = :l1:p_partner_key + AND :p2:p_partner_key = :l2:p_partner_key + AND :p1:p_partner_key = :d:key1 + AND :p2:p_partner_key = :d:key2 + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization (the predicate is repeated on purpose; presumably a planner hint, TODO confirm) + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization + GROUP BY + :p1:p_partner_key, + :p2:p_partner_key + ; + print 'address.qy done' + "; + } diff --git a/kardia-app/modules/base/dups/concat.qy b/kardia-app/modules/base/dups/concat.qy new file mode 100644 index 00000000..93a5998b --- /dev/null 
+++ b/kardia-app/modules/base/dups/concat.qy @@ -0,0 +1,62 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'concat.qy start' + declare object value; + + -- Count data (runs get/concats.qy directly; no shared collection is declared for concats here). + SELECT + :value:num_data = count(1) + FROM + /apps/kardia/modules/base/dups/get/concats.qy + ; + + -- Compute global values (k and the clustering algorithm). + SELECT + :value:k = :k, + :value:algorithm = :algorithm + FROM + expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) + ; + + -- Detect dups. NOTE(review): e1, ph1, l1, e2, ph2 and l2 are outer-joined but never referenced in the select list or filtered by contact type; confirm they are needed. + SELECT + key1 = :p1:p_partner_key, + key2 = :p2:p_partner_key, + sim = max(:d:sim) + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, + /apps/kardia/data/Kardia_DB/p_contact_info/rows e1, + /apps/kardia/data/Kardia_DB/p_contact_info/rows ph1, + /apps/kardia/data/Kardia_DB/p_location/rows l1, + /apps/kardia/data/Kardia_DB/p_partner/rows p2, + /apps/kardia/data/Kardia_DB/p_contact_info/rows e2, + /apps/kardia/data/Kardia_DB/p_contact_info/rows ph2, + /apps/kardia/data/Kardia_DB/p_location/rows l2, + expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:algorithm + + '&k=' + :value:k + + '&field=concats' + + '&data=data' + + '/concat_dups' + ) d + WHERE + :p1:p_partner_key = :d:key1 + AND :p2:p_partner_key = :d:key2 + AND :p1:p_partner_key *= :e1:p_partner_key + AND :p1:p_partner_key *= :ph1:p_partner_key + AND :p1:p_partner_key *= :l1:p_partner_key + AND :p2:p_partner_key *= :e2:p_partner_key + AND :p2:p_partner_key *= :ph2:p_partner_key + AND :p2:p_partner_key *= :l2:p_partner_key + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization (the predicate is repeated on purpose; presumably a planner hint, TODO confirm) + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization + GROUP BY + :p1:p_partner_key, + :p2:p_partner_key + ; + print 'concat.qy done' + "; + } diff --git a/kardia-app/modules/base/dups/dups.cluster b/kardia-app/modules/base/dups/dups.cluster new file mode 100644 index 00000000..b1aab6be --- /dev/null +++ 
b/kardia-app/modules/base/dups/dups.cluster @@ -0,0 +1,52 @@ +$Version=2$ +get_dups "system/cluster" + { + algorithm "cluster/parameter" { type = string; style = notnull; } + k "cluster/parameter" { type = integer; style = notnull; } + field "cluster/parameter" { type = string; style = notnull; } + data "cluster/parameter" { type = string; style = notnull; } + + source = runserver('/apps/kardia/modules/base/dups/get/' + :parameters:field + '.qy'); + key_attr = "key"; + data_attr = runserver(:parameters:data); + + kmeans "cluster/cluster" + { + algorithm = "k-means"; + similarity_measure = "cosine"; + num_clusters = runserver(:parameters:k); + min_improvement = 0.0001; + max_iterations = 32; + } + + none "cluster/cluster" + { + algorithm = "none"; + } + + dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "cosine"; + threshold = 0.7; + } + + phone_dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "levenshtein"; + threshold = 0.7; + } + + concat_dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "cosine"; + threshold = 0.65; + } + } + + +// --\'sql = " -- Syntax highlighting. +// --"sql = " -- Syntax highlighting. +// '// '" // Terminate highlighting hacks. diff --git a/kardia-app/modules/base/dups/email.qy b/kardia-app/modules/base/dups/email.qy new file mode 100644 index 00000000..0cef2607 --- /dev/null +++ b/kardia-app/modules/base/dups/email.qy @@ -0,0 +1,64 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'email.qy start' + + -- Get the list of emails we will be searching for duplicates. + DECLARE collection emails scope application; + + -- Assume the list of emails was already computed by the caller. + -- INSERT INTO collection emails SELECT :key, :email FROM /apps/kardia/modules/base/dups/get/emails.qy ; + + -- Declare an object for storing values locally. + declare object value; + + -- Count data (the row count is passed to globals.qy below). 
+ SELECT + :value:num_data = count(1) + FROM + collection emails + ; + + -- Compute global values (k and the clustering algorithm). + SELECT + :value:k = :k, + :value:algorithm = :algorithm + FROM + expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) + ; + + -- Detect dups: one row per partner pair, keeping the highest similarity reported for that pair. + SELECT + key1 = :p1:p_partner_key, + key2 = :p2:p_partner_key, + sim = max(:d:sim) + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, + /apps/kardia/data/Kardia_DB/p_contact_info/rows c1, + /apps/kardia/data/Kardia_DB/p_partner/rows p2, + /apps/kardia/data/Kardia_DB/p_contact_info/rows c2, + expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:algorithm + + '&k=' + :value:k + + '&field=emails' + + '&data=email' + + '/dups' + ) d + WHERE + :p1:p_partner_key = :c1:p_partner_key + AND :p2:p_partner_key = :c2:p_partner_key + AND :p1:p_partner_key = :d:key1 + AND :p2:p_partner_key = :d:key2 + AND :c1:p_contact_type = 'E' + AND :c2:p_contact_type = 'E' + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization (the predicate is repeated on purpose; presumably a planner hint, TODO confirm) + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization + GROUP BY + :p1:p_partner_key, + :p2:p_partner_key + ; + print 'email.qy done' + "; + } diff --git a/kardia-app/modules/base/dups/get/addresses.qy b/kardia-app/modules/base/dups/get/addresses.qy new file mode 100644 index 00000000..6364e083 --- /dev/null +++ b/kardia-app/modules/base/dups/get/addresses.qy @@ -0,0 +1,25 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'get/addresses.qy start' + SELECT + key = :p:p_partner_key, + address = '' -- All address fields are concatenated with no separator between them. + + isnull(:l:p_in_care_of, '') + + isnull(:l:p_address_1, '') + + isnull(:l:p_address_2, '') + + isnull(:l:p_address_3, '') + + isnull(:l:p_city, '') + + isnull(:l:p_state_province, '') + + isnull(:l:p_country_code, '') + + isnull(:l:p_postal_code, '') + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p, + /apps/kardia/data/Kardia_DB/p_location/rows l 
+ WHERE + :p:p_partner_key = :l:p_partner_key + ; + print 'get/addresses.qy end' + "; + } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/concats.qy b/kardia-app/modules/base/dups/get/concats.qy new file mode 100644 index 00000000..dd93dd80 --- /dev/null +++ b/kardia-app/modules/base/dups/get/concats.qy @@ -0,0 +1,56 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'get/concats.qy start' + SELECT + key = :p:p_partner_key, + data = '' + -- Name (preferred name is added only when given name and surname are each longer than one character and both differ from it) + + isnull(:p_given_name, '') + + isnull(condition( + char_length(isnull(:p_given_name, '')) > 1 + AND char_length(isnull(:p_surname, '')) > 1 + AND :p_given_name != :p_preferred_name + AND :p_surname != :p_preferred_name, + :p_preferred_name, + '' + ), '') + + isnull(:p_surname, '') + + isnull(:p_org_name, '') + + '; ' + + -- Email (contact rows of type E) + + isnull(:e:p_contact_data, '') + + '; ' + + -- Phone (contact rows of type P or C) + + isnull(:ph:p_phone_country, '') + + isnull(:ph:p_phone_area_city, '') + + isnull(:ph:p_contact_data, '') + + '; ' + + -- Address (fields are concatenated with no separator between them) + + isnull(:l:p_in_care_of, '') + + isnull(:l:p_address_1, '') + + isnull(:l:p_address_2, '') + + isnull(:l:p_address_3, '') + + isnull(:l:p_city, '') + + isnull(:l:p_state_province, '') + + isnull(:l:p_country_code, '') + + isnull(:l:p_postal_code, '') + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p, + /apps/kardia/data/Kardia_DB/p_contact_info/rows e, + /apps/kardia/data/Kardia_DB/p_contact_info/rows ph, + /apps/kardia/data/Kardia_DB/p_location/rows l + WHERE + :p:p_partner_key *= :e:p_partner_key + AND :p:p_partner_key *= :ph:p_partner_key + AND :p:p_partner_key *= :l:p_partner_key + AND :e:p_contact_type = 'E' + AND (:ph:p_contact_type = 'P' OR :ph:p_contact_type = 'C') + ; + print 'get/concats.qy end' + "; + } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/emails.qy b/kardia-app/modules/base/dups/get/emails.qy new file mode 100644 index 00000000..9d7bccdc --- /dev/null +++ b/kardia-app/modules/base/dups/get/emails.qy @@ 
-0,0 +1,18 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'get/emails.qy start' + SELECT + key = :p:p_partner_key, + email = :c:p_contact_data + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p, + /apps/kardia/data/Kardia_DB/p_contact_info/rows c + WHERE + :p:p_partner_key = :c:p_partner_key + AND :c:p_contact_type = 'E' + ; + print 'get/emails.qy end' + "; + } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/names.qy b/kardia-app/modules/base/dups/get/names.qy new file mode 100644 index 00000000..7f2b2f57 --- /dev/null +++ b/kardia-app/modules/base/dups/get/names.qy @@ -0,0 +1,25 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'get/names.qy start' + SELECT + key = :p_partner_key, + name = '' -- Preferred name is added only when given name and surname are each longer than one character and both differ from it. + + isnull(:p_given_name, '') + + isnull(condition( + char_length(isnull(:p_given_name, '')) > 1 + AND char_length(isnull(:p_surname, '')) > 1 + AND :p_given_name != :p_preferred_name + AND :p_surname != :p_preferred_name, + :p_preferred_name, + '' + ), '') + + isnull(:p_surname, '') + + isnull(:p_org_name, '') + FROM + /apps/kardia/data/Kardia_DB/p_partner/rows + ; + print 'get/names.qy end' + "; + } diff --git a/kardia-app/modules/base/dups/get/phones.qy b/kardia-app/modules/base/dups/get/phones.qy new file mode 100644 index 00000000..2f8bdac6 --- /dev/null +++ b/kardia-app/modules/base/dups/get/phones.qy @@ -0,0 +1,21 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'get/phones.qy start' + SELECT + key = :p:p_partner_key, + phone = '' + + isnull(:c:p_phone_country, '') + + isnull(:c:p_phone_area_city, '') + + isnull(:c:p_contact_data, '') + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p, + /apps/kardia/data/Kardia_DB/p_contact_info/rows c + WHERE + :p:p_partner_key = :c:p_partner_key + AND ((:c:p_contact_type = 'P') + (:c:p_contact_type = 'C')) -- Sum of two comparisons; presumably acts as a logical OR, like the P or C filter in get/concats.qy (TODO confirm). + ; + print 'get/phones.qy end' + "; + } \ No newline at end of file diff --git 
a/kardia-app/modules/base/dups/globals.qy b/kardia-app/modules/base/dups/globals.qy new file mode 100644 index 00000000..f489cafd --- /dev/null +++ b/kardia-app/modules/base/dups/globals.qy @@ -0,0 +1,18 @@ +$Version=2$ +dups_test_query "system/query" + { + num_data "query/parameter" { type = integer; style = strnull; min=1; } + + sql = " + SELECT + -- Compute the k value for clustering. + k = constrain(convert(integer, power(logn(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)), + + -- Compute which clustering algorithm to use (no clustering is better for small amounts of data). + algorithm = condition(:parameters:num_data > 100000, 'kmeans', 'none') + FROM + /apps/kardia/data/Kardia_DB/_a_alphabet/rows + LIMIT 1 + ; + "; + } diff --git a/kardia-app/modules/base/dups/name.qy b/kardia-app/modules/base/dups/name.qy new file mode 100644 index 00000000..17adfe32 --- /dev/null +++ b/kardia-app/modules/base/dups/name.qy @@ -0,0 +1,53 @@ +$Version=2$ +name_dups "system/query" + { + sql = " + print 'name.qy start' + + -- Get the list of names we will be searching for duplicates. + DECLARE collection names scope application; + + -- Assume the list of names was already computed by the caller. + -- INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; + + -- Declare an object for storing values locally. + declare object value; + + -- Count data (the row count is passed to globals.qy below). + SELECT + :value:num_data = count(1) + FROM + collection names + ; + + -- Compute global values (k and the clustering algorithm). + SELECT + :value:k = :k, + :value:algorithm = :algorithm + FROM + expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) + ; + + -- Detect dups. 
+ SELECT + key1 = :p1:p_partner_key, + key2 = :p2:p_partner_key, + sim = :c:sim + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, + /apps/kardia/data/Kardia_DB/p_partner/rows p2, + expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:algorithm + + '&k=' + :value:k + + '&field=names' + + '&data=name' + + '/dups' + ) c + WHERE + :c:key1 = :p1:p_partner_key + AND :c:key2 = :p2:p_partner_key + ; + print 'name.qy end' + "; + } diff --git a/kardia-app/modules/base/dups/phone.qy b/kardia-app/modules/base/dups/phone.qy new file mode 100644 index 00000000..6d84242c --- /dev/null +++ b/kardia-app/modules/base/dups/phone.qy @@ -0,0 +1,62 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'phone.qy start' + + -- Get the list of phones we will be searching for duplicates. + DECLARE collection phones scope application; + + -- Assume the list of phones was already computed by the caller. + -- INSERT INTO collection phones SELECT :key, :phone FROM /apps/kardia/modules/base/dups/get/phones.qy ; + + -- Declare an object for storing values locally. + declare object value; + + -- Count data (the row count is passed to globals.qy below). + SELECT + :value:num_data = count(1) + FROM + collection phones + ; + + -- Compute global values (k and the clustering algorithm). + SELECT + :value:k = :k, + :value:algorithm = :algorithm + FROM + expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) + ; + + -- Detect dups. NOTE(review): c1 and c2 are not filtered by p_contact_type here, unlike the E filter in email.qy; confirm that is intended. 
+ SELECT + key1 = :p1:p_partner_key, + key2 = :p2:p_partner_key, + sim = max(:d:sim) + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, + /apps/kardia/data/Kardia_DB/p_contact_info/rows c1, + /apps/kardia/data/Kardia_DB/p_partner/rows p2, + /apps/kardia/data/Kardia_DB/p_contact_info/rows c2, + expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:algorithm + + '&k=' + :value:k + + '&field=phones' + + '&data=phone' + + '/phone_dups' + ) d + WHERE + :p1:p_partner_key = :c1:p_partner_key + AND :p2:p_partner_key = :c2:p_partner_key + AND :p1:p_partner_key = :d:key1 + AND :p2:p_partner_key = :d:key2 + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization (the predicate is repeated on purpose; presumably a planner hint, TODO confirm) + AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization + GROUP BY + :p1:p_partner_key, + :p2:p_partner_key + ; + print 'phone.qy done' + "; + } diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy new file mode 100644 index 00000000..9964daa6 --- /dev/null +++ b/kardia-app/modules/base/dups/update.qy @@ -0,0 +1,350 @@ +$Version=2$ +update_duplicates "system/query" + { + // db "query/parameter" { type=string; default="/apps/kardia/data/Kardia_DB"; } + + sql = " + -- Get collections for each set of possible dups. + -- These collections are used by the following scripts (such as + -- dups/name.qy, dups/email.qy, etc.), avoiding the need to reopen + -- the query file over and over again. They are emptied first, then refilled. 
+ DECLARE collection names scope application; + DECLARE collection emails scope application; + DECLARE collection phones scope application; + DECLARE collection addresses scope application; + DELETE FROM collection names; + DELETE FROM collection emails; + DELETE FROM collection phones; + DELETE FROM collection addresses; + INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; + INSERT INTO collection emails SELECT :key, :email FROM /apps/kardia/modules/base/dups/get/emails.qy ; + INSERT INTO collection phones SELECT :key, :phone FROM /apps/kardia/modules/base/dups/get/phones.qy ; + INSERT INTO collection addresses SELECT :key, :address FROM /apps/kardia/modules/base/dups/get/addresses.qy ; + + -- Get dups for each field (name, email, phone, address). + DECLARE collection name_dups; + DECLARE collection email_dups; + DECLARE collection phone_dups; + DECLARE collection address_dups; + INSERT INTO collection name_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/name.qy ; + INSERT INTO collection email_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/email.qy ; + INSERT INTO collection phone_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/phone.qy ; + INSERT INTO collection address_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/address.qy ; + + -- [name_dups <- email_dups] Add an entry to name_dups for each entry in email_dups, so the pair is present when the four dups collections are joined in the aggregate step. + INSERT INTO + collection name_dups + SELECT + key1 = :email:key1, + key2 = :email:key2, + sim = condition(:name:sim > 0.0, + :name:sim, + isnull(cos_compare(:n1:name, :n2:name), -1.0) -- Negative value will be dropped from the average (presumably by aggregate_similarities; TODO confirm). 
+ ) + FROM + identity collection email_dups email, + collection name_dups name, + collection names n1, + collection names n2 + WHERE + :email:key1 *= :name:key1 + AND :email:key2 *= :name:key2 + AND :email:key1 *= :n1:key + AND :email:key2 *= :n2:key + ON duplicate -- Skip entries that already exist in name_dups. + :key1, + :key2 + UPDATE SET + :sim = condition(:name:sim > 0.0, + :name:sim, + isnull(cos_compare(:n1:name, :n2:name), -1.0) -- Negative value will be dropped from the average. + ) + ; + + -- [name_dups <- phone_dups] Add an entry to name_dups for each entry in phone_dups. + INSERT INTO + collection name_dups + SELECT + key1 = :phone:key1, + key2 = :phone:key2, + sim = condition(:name:sim > 0.0, + :name:sim, + max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. + ) + FROM + identity collection phone_dups phone, + collection name_dups name, + collection names n1, + collection names n2 + WHERE + :phone:key1 *= :name:key1 + AND :phone:key2 *= :name:key2 + AND :phone:key1 *= :n1:key + AND :phone:key2 *= :n2:key + GROUP BY -- Group on the identity source; name is outer-joined, so its keys are null for every pair not yet in name_dups. + :phone:key1, + :phone:key2 + ON duplicate -- Skip entries that already exist in name_dups. + :key1, + :key2 + UPDATE SET + :sim = condition(:name:sim > 0.0, + :name:sim, + max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. + ) + ; + + -- [name_dups <- address_dups] Add an entry to name_dups for each entry in address_dups. + INSERT INTO + collection name_dups + SELECT + key1 = :address:key1, + key2 = :address:key2, + sim = condition(:name:sim > 0.0, + :name:sim, + max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. 
+ ) + FROM + identity collection address_dups address, + collection name_dups name, + collection names n1, + collection names n2 + WHERE + :address:key1 *= :name:key1 + AND :address:key2 *= :name:key2 + AND :address:key1 *= :n1:key + AND :address:key2 *= :n2:key + GROUP BY -- Group on the identity source; name is outer-joined, so its keys are null for every pair not yet in name_dups. + :address:key1, + :address:key2 + ON duplicate -- Skip entries that already exist in name_dups. + :key1, + :key2 + UPDATE SET + :sim = condition(:name:sim > 0.0, + :name:sim, + max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. + ) + ; + + + -- [email_dups <- name_dups] Add an entry to email_dups for each entry in name_dups. + INSERT INTO + collection email_dups + SELECT + key1 = :name:key1, + key2 = :name:key2, + sim = condition(:email:sim > 0.0, + :email:sim, + max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) -- Negative value will be dropped from the average. + ) + FROM + identity collection name_dups name, + collection email_dups email, + collection emails e1, + collection emails e2 + WHERE + :name:key1 *= :email:key1 + AND :name:key2 *= :email:key2 + AND :name:key1 *= :e1:key + AND :name:key2 *= :e2:key + GROUP BY + :name:key1, + :name:key2 + ON duplicate -- Skip duplicate entries. + :key1, + :key2 + UPDATE SET + :sim = condition(:email:sim > 0.0, + :email:sim, + max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) -- Negative value will be dropped from the average. + ) + ; + + -- [phone_dups <- name_dups] Add an entry to phone_dups for each entry in name_dups. + INSERT INTO + collection phone_dups + SELECT + key1 = :name:key1, + key2 = :name:key2, + sim = condition(:phone:sim > 0.0, + :phone:sim, + max(isnull(cos_compare(:ph1:phone, :ph2:phone), -1.0)) -- Negative value will be dropped from the average. 
+ ) + FROM + identity collection name_dups name, + collection phone_dups phone, + collection phones ph1, + collection phones ph2 + WHERE + :name:key1 *= :phone:key1 + AND :name:key2 *= :phone:key2 + AND :name:key1 *= :ph1:key + AND :name:key2 *= :ph2:key + GROUP BY + :name:key1, + :name:key2 + ON duplicate -- Skip duplicate entries. + :key1, + :key2 + UPDATE SET + :sim = condition(:phone:sim > 0.0, + :phone:sim, + max(isnull(cos_compare(:ph1:phone, :ph2:phone), -1.0)) -- Negative value will be dropped from the average. + ) + ; + + -- [address_dups <- name_dups] Add an entry to address_dups for each entry in name_dups. + INSERT INTO + collection address_dups + SELECT + key1 = :name:key1, + key2 = :name:key2, + sim = condition(:address:sim > 0.0, + :address:sim, + max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) -- Negative value will be dropped from the average. + ) + FROM + identity collection name_dups name, + collection address_dups address, + collection addresses l1, + collection addresses l2 + WHERE + :name:key1 *= :address:key1 + AND :name:key2 *= :address:key2 + AND :name:key1 *= :l1:key + AND :name:key2 *= :l2:key + GROUP BY + :name:key1, + :name:key2 + ON duplicate -- Skip duplicate entries. + :key1, + :key2 + UPDATE SET + :sim = condition(:address:sim > 0.0, + :address:sim, + max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) -- Negative value will be dropped from the average. + ) + ; + + DECLARE collection all_dups; + + -- Aggregate dups: combine the four per-field similarities of each pair into one score and a readable reason. NOTE(review): the select passes :name:name1 and :name:name2, but name_dups is only ever filled with key1, key2 and sim; confirm. 
+ INSERT INTO + collection all_dups + SELECT + key1 = :name:key1, + key2 = :name:key2, + sim = aggregate_similarities( + :name:sim, + :email:sim, + :phone:sim, + :address:sim, + :name:key1 + ' ' + :name:name1, + :name:key2 + ' ' + :name:name2 + ), + reason = '' + + condition(:name:sim > 0.0, 'Name (' + round(:name:sim * 100, 2) + '%)\\n', '') + + condition(:email:sim > 0.0, 'Email (' + round(:email:sim * 100, 2) + '%)\\n', '') + + condition(:phone:sim > 0.0, 'Phone (' + round(:phone:sim * 100, 2) + '%)\\n', '') + + condition(:address:sim > 0.0, 'Address (' + round(:address:sim * 100, 2) + '%)\\n', '') + - '\\n' + FROM + identity collection name_dups name, + collection email_dups email, + collection phone_dups phone, + collection address_dups address + WHERE + :name:key1 = :email:key1 + AND :name:key2 = :email:key2 + AND :name:key1 = :phone:key1 + AND :name:key2 = :phone:key2 + AND :name:key1 = :address:key1 + AND :name:key2 = :address:key2 + ON duplicate + :key1, + :key2 + UPDATE SET + :sim = :sim, + :reason = :reason, + :s_date_modified = getdate(), + :s_modified_by = user_name() + ; + + + -- Full Concat Dups: pairs found by comparing the whole concatenated record (concat.qy). NOTE(review): a newly inserted reason ends with an exclamation mark while the ON duplicate path uses a newline; confirm that is intended. + DECLARE collection concat_dups; + INSERT INTO collection concat_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/concat.qy ; + INSERT INTO + collection all_dups + SELECT + key1 = :concat:key1, + key2 = :concat:key2, + sim = :concat:sim, + reason = 'All (' + round(:concat:sim * 100, 2) + '%)!' + FROM + identity collection concat_dups concat + WHERE + :concat:key1 != :concat:key2 + ON duplicate + :key1, + :key2 + UPDATE SET + :sim = condition(:sim > :concat:sim, :sim, :concat:sim), + :reason = 'All (' + round(:concat:sim * 100, 2) + '%)\\n' + :reason, + :s_date_modified = getdate(), + :s_modified_by = user_name() + ; + + + -- Free application-scoped collections. + DELETE FROM collection names; + DELETE FROM collection emails; + DELETE FROM collection phones; + DELETE FROM collection addresses; + + -- Remove all data from the dups table (for debugging). NOTE(review): this unconditional delete empties p_dup on every run; confirm it should ship. 
+ delete from /apps/kardia/data/Kardia_DB/p_dup/rows ; + + -- Insert dups into p_dup table (only pairs whose similarity is above 0.6). + INSERT INTO + /apps/kardia/data/Kardia_DB/p_dup/rows + SELECT + p_partner_key = :key1, + p_dup_partner_key = :key2, + p_match_quality = :sim, + p_reason = :reason, + + -- Required fields. + s_date_created = getdate(), + s_created_by = user_name(), + s_date_modified = getdate(), + s_modified_by = user_name() + FROM + collection all_dups + WHERE + :sim > 0.6 + ON duplicate + :p_partner_key, + :p_dup_partner_key + UPDATE SET + :p_match_quality = :sim, + :p_reason = :reason, + :s_date_modified = getdate(), + :s_modified_by = user_name() + ; + + -- Remove stale data from the dups table (currently disabled; the statement below is commented out). +-- delete +-- from +-- identity /apps/kardia/data/Kardia_DB/p_dup/rows d, +-- /apps/kardia/data/Kardia_DB/p_partner/rows p1, +-- /apps/kardia/data/Kardia_DB/p_partner/rows p2 +-- where +-- :d:p_partner_key *= :p1:p_partner_key +-- AND :d:p_dup_partner_key *= :p2:p_partner_key +-- AND(:d:s_date_modified < isnull(:p1:s_date_modified, getdate()) +-- OR :d:s_date_modified < isnull(:p2:s_date_modified, getdate())) +-- ; + "; + } diff --git a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp index 12537d6c..6a29160f 100644 --- a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp +++ b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp @@ -112,31 +112,53 @@ plugin_base_dataqa_duplicates "widget/component-decl" :d:p_partner_key, :d:p_dup_partner_key, :d:p_match_quality, - --match = convert(integer, round(:d:p_match_quality * 100)), - match = round(:d:p_match_quality, 3), - disp_name_1 = condition(char_length(rtrim(:p1:p_org_name)) > 0, :p1:p_org_name + ' ' + condition(char_length(:p1:p_given_name + :p1:p_surname) > 0, '- ', ''), '') + isnull(:p1:p_given_name + ' ','') + isnull(:p1:p_surname + ' ',''), - disp_name_2 = condition(char_length(rtrim(:p2:p_org_name)) > 0, :p2:p_org_name + ' ' + 
condition(char_length(:p2:p_given_name + :p2:p_surname) > 0, '- ', ''), '') + isnull(:p2:p_given_name + ' ','') + isnull(:p2:p_surname + ' ',''), + match = round(:d:p_match_quality, 4), + disp_name_1 = '' + + condition(char_length(rtrim(:p1:p_org_name)) > 0, + :p1:p_org_name + ' ' + condition(char_length(:p1:p_given_name + :p1:p_surname) > 0, '- ', ''), + '') + + isnull(:p1:p_given_name + ' ', '') + + isnull(:p1:p_surname + ' ', ''), + disp_name_2 = '' + + condition(char_length(rtrim(:p2:p_org_name)) > 0, + :p2:p_org_name + ' ' + condition(char_length(:p2:p_given_name + :p2:p_surname) > 0, '- ', ''), + '') + + isnull(:p2:p_given_name + ' ', '') + + isnull(:p2:p_surname + ' ', ''), stat_1 = :p1:p_status_code, stat_2 = :p2:p_status_code, - --loc_1 = :l1:p_city + ', ' + :l1:p_state_province, - --loc_2 = :l2:p_city + ', ' + :l2:p_state_province, - --contact_1 = isnull(:c1:p_phone_country + ' ','') + isnull(:c1:p_phone_area_city + ' ','') + isnull(:c1:p_contact_data,''), - --contact_2 = isnull(:c2:p_phone_country + ' ','') + isnull(:c2:p_phone_area_city + ' ','') + isnull(:c2:p_contact_data,''), ploc_1 = substitute(isnull(:af1:p_format, '[:p_in_care_of]\n[:p_address_1]\n[:p_address_2]\n[:p_address_3]\n[:p_city], [:p_state_province] [:p_postal_code]\n[:p_country_name]'), 'l=pl1,p=p1,ctry1'), ploc_2 = substitute(isnull(:af2:p_format, '[:p_in_care_of]\n[:p_address_1]\n[:p_address_2]\n[:p_address_3]\n[:p_city], [:p_state_province] [:p_postal_code]\n[:p_country_name]'), 'l=pl2,p=p2,ctry2'), - type = condition(:d:p_partner_key != :d:p_dup_partner_key, 'Duplicate Partner', condition(:d:p_contact_id is not null, 'Duplicate Contact', condition(:d:p_location_id is not null, 'Duplicate Address', ''))), + + -- Dupe type info. 
+ reason = :d:p_reason, nondup = condition(:nd:p_partner_key is not null, 1, 0), - associated = condition(:p1:p_parent_key = :p2:p_partner_key or :p1:p_partner_key = :p2:p_parent_key or :p1:p_parent_key = :p2:p_parent_key, 1, 0), - merged = condition(:p1:p_merged_with = :p2:p_partner_key or :p1:p_partner_key = :p2:p_merged_with, 1, 0), - relation = (select condition(:r:p_partner_key = :d:p_partner_key, :rt:p_relation_type_label + '/' + :rt:p_relation_type_rev_label, :rt:p_relation_type_rev_label + '/' + :rt:p_relation_type_label) from collection rels r, collection reltype rt where ((:r:p_partner_key = :d:p_partner_key and :r:p_relation_key = :d:p_dup_partner_key) or (:r:p_partner_key = :d:p_dup_partner_key and :r:p_relation_key = :d:p_partner_key)) and :rt:p_relation_type = :r:p_relation_type ) + associated = condition( + :p1:p_parent_key = :p2:p_partner_key + or :p1:p_partner_key = :p2:p_parent_key + or :p1:p_parent_key = :p2:p_parent_key, + 1, 0), + merged = condition( + :p1:p_merged_with = :p2:p_partner_key + or :p1:p_partner_key = :p2:p_merged_with, + 1, 0), + relation = ( + select + condition(:r:p_partner_key = :d:p_partner_key, + :rt:p_relation_type_label + '/' + :rt:p_relation_type_rev_label, + :rt:p_relation_type_rev_label + '/' + :rt:p_relation_type_label + ) + from + collection rels r, + collection reltype rt + where :rt:p_relation_type = :r:p_relation_type + and ((:r:p_partner_key = :d:p_partner_key and :r:p_relation_key = :d:p_dup_partner_key) or + (:r:p_partner_key = :d:p_dup_partner_key and :r:p_relation_key = :d:p_partner_key )) + ) from identity /apps/kardia/data/Kardia_DB/p_dup/rows d, /apps/kardia/data/Kardia_DB/p_partner/rows p1, /apps/kardia/data/Kardia_DB/p_partner/rows p2, - --/apps/kardia/data/Kardia_DB/p_location/rows l1, - --/apps/kardia/data/Kardia_DB/p_location/rows l2, - --/apps/kardia/data/Kardia_DB/p_contact_info/rows c1, - --/apps/kardia/data/Kardia_DB/p_contact_info/rows c2, /apps/kardia/data/Kardia_DB/p_location/rows pl1, 
/apps/kardia/data/Kardia_DB/p_location/rows pl2, /apps/kardia/data/Kardia_DB/p_country/rows ctry1, @@ -148,16 +170,6 @@ plugin_base_dataqa_duplicates "widget/component-decl" (:info:partnerlist is null or charindex(',' + rtrim(:d:p_partner_key) + ',', ',' + :info:partnerlist + ',') > 0 or charindex(',' + rtrim(:d:p_dup_partner_key) + ',', ',' + :info:partnerlist + ',') > 0) and :p1:p_partner_key = :d:p_partner_key and :p2:p_partner_key = :d:p_dup_partner_key and - --:l1:p_partner_key =* :d:p_partner_key and - --:l1:p_location_id =* :d:p_location_id and - --:l1:p_revision_id =* :d:p_revision_id and - --:l2:p_partner_key =* :d:p_dup_partner_key and - --:l2:p_location_id =* :d:p_dup_location_id and - --:l2:p_revision_id =* :d:p_dup_revision_id and - --:c1:p_partner_key =* :d:p_partner_key and - --:c1:p_contact_id =* :d:p_contact_id and - --:c2:p_partner_key =* :d:p_dup_partner_key and - --:c2:p_contact_id =* :d:p_dup_contact_id and :pl1:p_partner_key =* :d:p_partner_key and :pl2:p_partner_key =* :d:p_dup_partner_key and :pl1:p_country_code *= :ctry1:p_country_code and @@ -167,7 +179,7 @@ plugin_base_dataqa_duplicates "widget/component-decl" :af2:p_country_code =* :ctry2:p_country_code and :af2:p_address_set = 'STANDARD' and :nd:p_partner_key =* :d:p_partner_key and - :nd:p_nondup_partner_key =* :p_dup_partner_key + :nd:p_nondup_partner_key =* :d:p_dup_partner_key group by :d:p_match_quality desc, :d:p_partner_key, @@ -209,11 +221,69 @@ plugin_base_dataqa_duplicates "widget/component-decl" row_shadow_angle=135; nodata_message="(no duplicates to show)"; - //t_match "widget/table-column" { title="%"; value=runclient(:dupsosrc:match + '%'); width=60; style=bold; } - t_match "widget/table-column" { width=80; title = "%"; type=progress; padding=4; style=bold; fieldname=match; bar_color=runclient(condition(:dupsosrc:match >= 0.90, '#59b550', condition(:dupsosrc:match < 0.80, '#d96066', '#DDB261'))); bar_padding=3; bar_textcolor=black; } - t_type "widget/table-column" { 
title="%"; value=runclient(:dupsosrc:type + condition(:dupsosrc:merged, '\nAlready Merged', '') + condition(:dupsosrc:associated, '\nAssociated', '') + condition(:dupsosrc:relation is not null, '\nRelated: ' + :dupsosrc:relation, '')); width=180; style=bold; align=center; wrap=yes; } - t_par2 "widget/table-column" { title="Partner"; value=runclient(:dupsosrc:p_dup_partner_key + ' ' + :dupsosrc:disp_name_2); width=250; style=bold; caption_value=runclient(isnull(:dupsosrc:ploc_2 + '\n', '') + condition(:dupsosrc:stat_2 = 'O', '(obsolete)', '') - '\n'); wrap=yes; } - t_par1 "widget/table-column" { title="Partner"; value=runclient(condition(:dupsosrc:p_partner_key = :dupsosrc:p_dup_partner_key, '', :dupsosrc:p_partner_key + ' ' + :dupsosrc:disp_name_1)); width=250; style=bold; caption_value=runclient(condition(:dupsosrc:p_partner_key = :dupsosrc:p_dup_partner_key, '', isnull(:dupsosrc:ploc_1 + '\n', '') + condition(:dupsosrc:stat_1 = 'O', '(obsolete)', '') - '\n')); wrap=yes; } + // Similarity progress bar. + t_match "widget/table-column" + { + width = 80; + style = bold; + padding = 4; + title = "%"; + + type = progress; + fieldname = match; + + bar_color = runclient(condition(:dupsosrc:match >= 0.90, '#59b550', condition(:dupsosrc:match < 0.80, '#d96066', '#DDB261'))); + bar_padding = 3; + bar_textcolor = black; + } + + // Dulicate reason, and other information about the type of duplicate. + t_type "widget/table-column" + { + width = 180; + style = bold; + align = center; + wrap = yes; + title = "%"; + + value = runclient( + :dupsosrc:reason + + condition(:dupsosrc:merged, '\nAlready Merged', '') + + condition(:dupsosrc:associated, '\nAssociated', '') + + condition(:dupsosrc:relation is not null, '\nRelated: ' + :dupsosrc:relation, '') + ); + } + + // Duplicate record 1. 
+ t_dup1 "widget/table-column" + { + width = 250; + style = bold; + wrap = yes; + title = "Partner"; + + value = runclient(:dupsosrc:p_dup_partner_key + ' ' + :dupsosrc:disp_name_2); + caption_value = runclient(isnull(:dupsosrc:ploc_2 + '\n', '') + condition(:dupsosrc:stat_2 = 'O', '(obsolete)', '') - '\n'); + } + + // Duplicate record 2. + t_dup2 "widget/table-column" + { + width = 250; + style = bold; + wrap = yes; + title = "Partner"; + + /// TODO: Israel - Remove comment below. + // This code used to check :dupsosrc:p_partner_key = :dupsosrc:p_dup_partner_key. + // The new algorithm verifies that this condition is always false, so it was removed. + value = runclient(:dupsosrc:p_partner_key + ' ' + :dupsosrc:disp_name_1); + caption_value = runclient( + isnull(:dupsosrc:ploc_1 + '\n', '') + + condition(:dupsosrc:stat_1 = 'O', '(obsolete)', '') + - '\n' + ); + } ops_detail "widget/table-row-detail" { diff --git a/kardia-app/modules/base/update_duplicates.qy b/kardia-app/modules/base/update_duplicates.qy deleted file mode 100644 index 058861dd..00000000 --- a/kardia-app/modules/base/update_duplicates.qy +++ /dev/null @@ -1,28 +0,0 @@ -$Version=2$ -update_duplicates "system/query" - { - sql = " delete - /apps/kardia/data/Kardia_DB/p_dup/rows - ; - - insert - /apps/kardia/data/Kardia_DB/p_dup/rows - select - p_partner_key = :p_partner_key_1, - p_dup_partner_key = :p_partner_key_2, - p_match_quality = :priority, - p_location_id = nullif(:p_location_id_1, ''), - p_dup_location_id = nullif(:p_location_id_2, ''), - p_revision_id = nullif(:p_revision_id_1, ''), - p_dup_revision_id = nullif(:p_revision_id_2, ''), - p_contact_id = nullif(:p_contact_id_1, ''), - p_dup_contact_id = nullif(:p_contact_id_2, ''), - --p_comment = :ca1 + ', ' + :ca2 + ', ' + :cc1 + ', ' + :cc2, - s_date_created = getdate(), - s_created_by = user_name(), - s_date_modified = getdate(), - s_modified_by = user_name() - from - /apps/kardia/modules/base/duplicate_checking.qy - "; - } From 
5ab24da27124243e23fd00b5c916f7792bf801dc Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 13 Nov 2025 16:56:50 -0700 Subject: [PATCH 02/12] Add double metaphone. Add helpful comments. Add smother color gradient to dups UI. Ignore blank emails, phones, and addresses. Fix a bug where cos_compare() was used for phone numbers instead of lev_compare(). Abstract values into an object in update.qy. Remove unhelpful optimization attempts. Remove unhelpful comments. --- kardia-app/modules/base/dups/address.qy | 2 - kardia-app/modules/base/dups/dups.cluster | 30 ++- kardia-app/modules/base/dups/email.qy | 2 - kardia-app/modules/base/dups/get/addresses.qy | 2 + kardia-app/modules/base/dups/get/concats.qy | 61 +++-- kardia-app/modules/base/dups/get/emails.qy | 4 +- .../modules/base/dups/get/name_metas.qy | 25 ++ kardia-app/modules/base/dups/get/phones.qy | 2 + kardia-app/modules/base/dups/name_meta.qy | 53 +++++ kardia-app/modules/base/dups/phone.qy | 2 - kardia-app/modules/base/dups/update.qy | 220 ++++++++++++------ .../base/plugin_base_dataqa_duplicates.cmp | 49 +++- 12 files changed, 351 insertions(+), 101 deletions(-) create mode 100644 kardia-app/modules/base/dups/get/name_metas.qy create mode 100644 kardia-app/modules/base/dups/name_meta.qy diff --git a/kardia-app/modules/base/dups/address.qy b/kardia-app/modules/base/dups/address.qy index 74484b30..30dfd52d 100644 --- a/kardia-app/modules/base/dups/address.qy +++ b/kardia-app/modules/base/dups/address.qy @@ -51,8 +51,6 @@ dups_test_query "system/query" AND :p2:p_partner_key = :l2:p_partner_key AND :p1:p_partner_key = :d:key1 AND :p2:p_partner_key = :d:key2 - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization GROUP BY :p1:p_partner_key, :p2:p_partner_key diff --git a/kardia-app/modules/base/dups/dups.cluster b/kardia-app/modules/base/dups/dups.cluster index b1aab6be..c1b3ba2a 100644 --- 
a/kardia-app/modules/base/dups/dups.cluster +++ b/kardia-app/modules/base/dups/dups.cluster @@ -1,15 +1,18 @@ $Version=2$ get_dups "system/cluster" { + // Declare parameters. algorithm "cluster/parameter" { type = string; style = notnull; } k "cluster/parameter" { type = integer; style = notnull; } field "cluster/parameter" { type = string; style = notnull; } data "cluster/parameter" { type = string; style = notnull; } + // Declare data source. source = runserver('/apps/kardia/modules/base/dups/get/' + :parameters:field + '.qy'); key_attr = "key"; data_attr = runserver(:parameters:data); + // A cluster for searching with clustering. kmeans "cluster/cluster" { algorithm = "k-means"; @@ -18,19 +21,34 @@ get_dups "system/cluster" min_improvement = 0.0001; max_iterations = 32; } - + + // A "cluster" for searching without clustering. none "cluster/cluster" { algorithm = "none"; } + // Default duplicate search, used for names, emails, and addresses. dups "cluster/search" { source = runserver(:parameters:algorithm); similarity_measure = "cosine"; threshold = 0.7; } - + + // Double Metaphone search. + // Double Metaphone is prone to false positives, so it uses a higher + // threshold to make them slightly less bad. + meta_dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "levenshtein"; + threshold = 0.8; + } + + // Phone search. + // Searching for duplicate phone numbers uses edit distance similarity + // instead of cosine to give more accurate results. phone_dups "cluster/search" { source = runserver(:parameters:algorithm); @@ -38,6 +56,9 @@ get_dups "system/cluster" threshold = 0.7; } + // Concat search. + // The concatenation strategy is faster than the aggregation strategy, so + // we can use a slightly lower threshold to detect just a few more dups. 
concat_dups "cluster/search" { source = runserver(:parameters:algorithm); @@ -45,8 +66,3 @@ get_dups "system/cluster" threshold = 0.65; } } - - -// --\'sql = " -- Syntax highlighting. -// --"sql = " -- Syntax highlighting. -// '// '" // Terminate highlighting hacks. diff --git a/kardia-app/modules/base/dups/email.qy b/kardia-app/modules/base/dups/email.qy index 0cef2607..fcaa00f6 100644 --- a/kardia-app/modules/base/dups/email.qy +++ b/kardia-app/modules/base/dups/email.qy @@ -53,8 +53,6 @@ dups_test_query "system/query" AND :p2:p_partner_key = :d:key2 AND :c1:p_contact_type = 'E' AND :c2:p_contact_type = 'E' - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization GROUP BY :p1:p_partner_key, :p2:p_partner_key diff --git a/kardia-app/modules/base/dups/get/addresses.qy b/kardia-app/modules/base/dups/get/addresses.qy index 6364e083..a739d4ce 100644 --- a/kardia-app/modules/base/dups/get/addresses.qy +++ b/kardia-app/modules/base/dups/get/addresses.qy @@ -19,6 +19,8 @@ dups_test_query "system/query" /apps/kardia/data/Kardia_DB/p_location/rows l WHERE :p:p_partner_key = :l:p_partner_key + AND :l:p_address_1 is not null + AND :l:p_address_1 != '' ; print 'get/addresses.qy end' "; diff --git a/kardia-app/modules/base/dups/get/concats.qy b/kardia-app/modules/base/dups/get/concats.qy index dd93dd80..f2c08313 100644 --- a/kardia-app/modules/base/dups/get/concats.qy +++ b/kardia-app/modules/base/dups/get/concats.qy @@ -3,10 +3,17 @@ dups_test_query "system/query" { sql = " print 'get/concats.qy start' + + declare object value; + SELECT :value:boundary_character = '`'; + + declare collection temp; + + INSERT INTO + collection temp SELECT key = :p:p_partner_key, - data = '' - -- Name + name = '' + isnull(:p_given_name, '') + isnull(condition( char_length(isnull(:p_given_name, '')) > 1 @@ -17,20 +24,24 @@ dups_test_query "system/query" '' ), '') + 
isnull(:p_surname, '') - + isnull(:p_org_name, '') - + '; ' - - -- Email - + isnull(:e:p_contact_data, '') - + '; ' - - -- Phone - + isnull(:ph:p_phone_country, '') + + isnull(:p_org_name, ''), + name_meta = '' + + isnull(metaphone(:p_given_name), '') + + isnull(condition( + char_length(isnull(:p_given_name, '')) > 1 + AND char_length(isnull(:p_surname, '')) > 1 + AND :p_given_name != :p_preferred_name + AND :p_surname != :p_preferred_name, + metaphone(:p_preferred_name), + '' + ), '') + + isnull(metaphone(:p_surname), ''), + email = isnull(:e:p_contact_data, ''), + phone = '' + + isnull(:ph:p_phone_country, '') + isnull(:ph:p_phone_area_city, '') - + isnull(:ph:p_contact_data, '') - + '; ' - - -- Address + + isnull(:ph:p_contact_data, ''), + address = '' + isnull(:l:p_in_care_of, '') + isnull(:l:p_address_1, '') + isnull(:l:p_address_2, '') @@ -51,6 +62,24 @@ dups_test_query "system/query" AND :e:p_contact_type = 'E' AND (:ph:p_contact_type = 'P' OR :ph:p_contact_type = 'C') ; + + -- Nonzero numbers are used as boundary markers for the meta parts + -- because they do not appear in metaphones. This helps to reduce + -- false positives from boundary characters falsely matching. 
+ SELECT + key = :key, + data = '' + + :name + :value:boundary_character + + :name_meta + '1' + + :name_meta + '1' + + :name_meta + '1' + + :name_meta + '1' + + :email + :value:boundary_character + + :phone + :value:boundary_character + + :address + FROM + collection temp + print 'get/concats.qy end' "; - } \ No newline at end of file + } diff --git a/kardia-app/modules/base/dups/get/emails.qy b/kardia-app/modules/base/dups/get/emails.qy index 9d7bccdc..78aba5d8 100644 --- a/kardia-app/modules/base/dups/get/emails.qy +++ b/kardia-app/modules/base/dups/get/emails.qy @@ -11,7 +11,9 @@ dups_test_query "system/query" /apps/kardia/data/Kardia_DB/p_contact_info/rows c WHERE :p:p_partner_key = :c:p_partner_key - AND :c:p_contact_type = 'E' + AND :c:p_contact_type = 'E' + AND :c:p_contact_data is not null + AND :c:p_contact_data != '' ; print 'get/emails.qy end' "; diff --git a/kardia-app/modules/base/dups/get/name_metas.qy b/kardia-app/modules/base/dups/get/name_metas.qy new file mode 100644 index 00000000..e733dbae --- /dev/null +++ b/kardia-app/modules/base/dups/get/name_metas.qy @@ -0,0 +1,25 @@ +$Version=2$ +dups_test_query "system/query" + { + sql = " + print 'get/name_metas.qy start' + SELECT + key = :p_partner_key, + name_meta = '' + + isnull(metaphone(:p_given_name), '') + + isnull(condition( + :p_preferred_name != :p_given_name + AND :p_preferred_name != :p_surname, + metaphone(:p_preferred_name), + '' + ), '') + + isnull(metaphone(:p_surname), '') + FROM + /apps/kardia/data/Kardia_DB/p_partner/rows + WHERE + char_length(isnull(:p_given_name, '')) > 1 + AND char_length(isnull(:p_surname, '')) > 1 + ; + print 'get/name_metas.qy end' + "; + } diff --git a/kardia-app/modules/base/dups/get/phones.qy b/kardia-app/modules/base/dups/get/phones.qy index 2f8bdac6..44ff7116 100644 --- a/kardia-app/modules/base/dups/get/phones.qy +++ b/kardia-app/modules/base/dups/get/phones.qy @@ -15,6 +15,8 @@ dups_test_query "system/query" WHERE :p:p_partner_key = :c:p_partner_key AND 
((:c:p_contact_type = 'P') + (:c:p_contact_type = 'C')) + AND :c:p_contact_data is not null + AND :c:p_contact_data != '' ; print 'get/phones.qy end' "; diff --git a/kardia-app/modules/base/dups/name_meta.qy b/kardia-app/modules/base/dups/name_meta.qy new file mode 100644 index 00000000..a57348fa --- /dev/null +++ b/kardia-app/modules/base/dups/name_meta.qy @@ -0,0 +1,53 @@ +$Version=2$ +name_dups "system/query" + { + sql = " + print 'name_meta.qy start' + + -- Get the list of name_metas we will be searching for duplicates. + DECLARE collection name_metas scope application; + + -- Assume the list of name_metas was already computed by the caller. + -- INSERT INTO collection name_metas SELECT :key, :name_meta FROM /apps/kardia/modules/base/dups/get/name_metas.qy ; + + -- Declare an object for storing values locally. + declare object value; + + -- Count data. + SELECT + :value:num_data = count(1) + FROM + collection name_metas + ; + + -- Compute global values. + SELECT + :value:k = :k, + :value:algorithm = :algorithm + FROM + expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) + ; + + -- Detect dups.
+ SELECT + key1 = :p1:p_partner_key, + key2 = :p2:p_partner_key, + sim = :c:sim + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, + /apps/kardia/data/Kardia_DB/p_partner/rows p2, + expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:algorithm + + '&k=' + :value:k + + '&field=name_metas' + + '&data=name_meta' + + '/meta_dups' + ) c + WHERE + :c:key1 = :p1:p_partner_key + AND :c:key2 = :p2:p_partner_key + ; + print 'name_meta.qy end' + "; + } diff --git a/kardia-app/modules/base/dups/phone.qy b/kardia-app/modules/base/dups/phone.qy index 6d84242c..111f6791 100644 --- a/kardia-app/modules/base/dups/phone.qy +++ b/kardia-app/modules/base/dups/phone.qy @@ -51,8 +51,6 @@ dups_test_query "system/query" AND :p2:p_partner_key = :c2:p_partner_key AND :p1:p_partner_key = :d:key1 AND :p2:p_partner_key = :d:key2 - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization GROUP BY :p1:p_partner_key, :p2:p_partner_key diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy index 9964daa6..cc245370 100644 --- a/kardia-app/modules/base/dups/update.qy +++ b/kardia-app/modules/base/dups/update.qy @@ -1,45 +1,98 @@ $Version=2$ update_duplicates "system/query" { - // db "query/parameter" { type=string; default="/apps/kardia/data/Kardia_DB"; } - sql = " + DECLARE object value; + + -- The number of places to show after the decimal point for reason + -- similarity percentages. + SELECT :value:reason_decimals = 1; + + -- The minimum similarity for an attribute to be considered one of + -- the reasons that two records were matched as duplicates. + SELECT :value:min_reason_sim = 0.10; + + -- The minimum aggregated similarity threshold that must be reached + -- for a duplicate to be added to the table and displayed to the user.
+ -- Duplicates which aggregate to a lower similarity than this are + -- dropped before being added even if one attribute happens to have a + -- high similarity. + -- For other similarity thresholds, see `dups.cluster`. + SELECT :value:min_total_sim = 0.60; + + -- Get collections for each set of possible dups. -- These collections are used by the following scripts (such as -- dups/name.qy, dups/email.qy, etc.), avoiding the need to reopen -- the query file over and over again. DECLARE collection names scope application; + DECLARE collection name_metas scope application; DECLARE collection emails scope application; DECLARE collection phones scope application; DECLARE collection addresses scope application; DELETE FROM collection names; + DELETE FROM collection name_metas; DELETE FROM collection emails; DELETE FROM collection phones; DELETE FROM collection addresses; - INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; - INSERT INTO collection emails SELECT :key, :email FROM /apps/kardia/modules/base/dups/get/emails.qy ; - INSERT INTO collection phones SELECT :key, :phone FROM /apps/kardia/modules/base/dups/get/phones.qy ; - INSERT INTO collection addresses SELECT :key, :address FROM /apps/kardia/modules/base/dups/get/addresses.qy ; - + INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; + INSERT INTO collection name_metas SELECT :key, :name_meta FROM /apps/kardia/modules/base/dups/get/name_metas.qy ; + INSERT INTO collection emails SELECT :key, :email FROM /apps/kardia/modules/base/dups/get/emails.qy ; + INSERT INTO collection phones SELECT :key, :phone FROM /apps/kardia/modules/base/dups/get/phones.qy ; + INSERT INTO collection addresses SELECT :key, :address FROM /apps/kardia/modules/base/dups/get/addresses.qy ; + -- Get dups for each field (name, email, phone, address).
DECLARE collection name_dups; + DECLARE collection name_meta_dups; DECLARE collection email_dups; DECLARE collection phone_dups; DECLARE collection address_dups; - INSERT INTO collection name_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/name.qy ; - INSERT INTO collection email_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/email.qy ; - INSERT INTO collection phone_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/phone.qy ; - INSERT INTO collection address_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/address.qy ; - + INSERT INTO collection name_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/name.qy ; + INSERT INTO collection name_meta_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/name_meta.qy ; + INSERT INTO collection email_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/email.qy ; + INSERT INTO collection phone_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/phone.qy ; + INSERT INTO collection address_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/address.qy ; + + + -- [name_dups <- name_meta_dups] Add an entry to name_dups for each entry in name_meta_dups. + INSERT INTO + collection name_dups + SELECT + key1 = :name_meta:key1, + key2 = :name_meta:key2, + sim = condition(:name:sim >= 0.0, + :name:sim, + isnull(cos_compare(:n1:name, :n2:name), -1.0) + ) + FROM + identity collection name_meta_dups name_meta, + collection name_dups name, + collection names n1, + collection names n2 + WHERE + :name_meta:key1 *= :name:key1 + AND :name_meta:key2 *= :name:key2 + AND :name_meta:key1 *= :n1:key + AND :name_meta:key2 *= :n2:key + ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). 
+ :key1, + :key2 + UPDATE SET + :sim = condition(:name:sim >= 0.0, + :name:sim, + isnull(cos_compare(:n1:name, :n2:name), -1.0) + ) + ; + -- [name_dups <- email_dups] Add an entry to name_dups for each entry in email_dups. INSERT INTO collection name_dups SELECT key1 = :email:key1, key2 = :email:key2, - sim = condition(:name:sim > 0.0, + sim = condition(:name:sim >= 0.0, :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) -- Negative value will be dropped from the average. + isnull(cos_compare(:n1:name, :n2:name), -1.0) ) FROM identity collection email_dups email, @@ -51,13 +104,13 @@ update_duplicates "system/query" AND :email:key2 *= :name:key2 AND :email:key1 *= :n1:key AND :email:key2 *= :n2:key - ON duplicate -- Skip entries that already exist in name_dups. + ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim > 0.0, + :sim = condition(:name:sim >= 0.0, :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) -- Negative value will be dropped from the average. + isnull(cos_compare(:n1:name, :n2:name), -1.0) ) ; @@ -67,9 +120,9 @@ update_duplicates "system/query" SELECT key1 = :phone:key1, key2 = :phone:key2, - sim = condition(:name:sim > 0.0, + sim = condition(:name:sim >= 0.0, :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. + max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) ) FROM identity collection phone_dups phone, @@ -84,13 +137,13 @@ update_duplicates "system/query" GROUP BY :name:key1, :name:key2 - ON duplicate -- Skip entries that already exist in name_dups. + ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim > 0.0, + :sim = condition(:name:sim >= 0.0, :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. 
+ max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) ) ; @@ -100,9 +153,9 @@ update_duplicates "system/query" SELECT key1 = :address:key1, key2 = :address:key2, - sim = condition(:name:sim > 0.0, + sim = condition(:name:sim >= 0.0, :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. + max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) ) FROM identity collection address_dups address, @@ -117,26 +170,59 @@ update_duplicates "system/query" GROUP BY :name:key1, :name:key2 - ON duplicate -- Skip entries that already exist in name_dups. + ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim > 0.0, + :sim = condition(:name:sim >= 0.0, :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) -- Negative value will be dropped from the average. + max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) ) ; + -- [name_meta_dups <- name_dups] Add an entry to name_meta_dups for each entry in name_dups. + INSERT INTO + collection name_meta_dups + SELECT + key1 = :name:key1, + key2 = :name:key2, + sim = condition(:name_meta:sim >= 0.0, + :name_meta:sim, + isnull(lev_compare(:nm1:name_meta, :nm2:name_meta), -1.0) + ) + FROM + identity collection name_dups name, + collection name_meta_dups name_meta, + collection name_metas nm1, + collection name_metas nm2 + WHERE + :name:key1 *= :name_meta:key1 + AND :name:key2 *= :name_meta:key2 + AND :name:key1 *= :nm1:key + AND :name:key2 *= :nm2:key + GROUP BY + :name:key1, + :name:key2 + ON duplicate -- Skip duplicate entries. + :key1, + :key2 + UPDATE SET + :sim = condition(:name_meta:sim >= 0.0, + :name_meta:sim, + isnull(lev_compare(:nm1:name_meta, :nm2:name_meta), -1.0) + ) + ; + -- [email_dups <- name_dups] Add an entry to email_dups for each entry in name_dups. 
INSERT INTO collection email_dups SELECT key1 = :name:key1, key2 = :name:key2, - sim = condition(:email:sim > 0.0, + sim = condition(:email:sim >= 0.0, :email:sim, - max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) -- Negative value will be dropped from the average. + max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) ) FROM identity collection name_dups name, @@ -155,9 +241,9 @@ update_duplicates "system/query" :key1, :key2 UPDATE SET - :sim = condition(:email:sim > 0.0, + :sim = condition(:email:sim >= 0.0, :email:sim, - max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) -- Negative value will be dropped from the average. + max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) ) ; @@ -167,9 +253,9 @@ update_duplicates "system/query" SELECT key1 = :name:key1, key2 = :name:key2, - sim = condition(:phone:sim > 0.0, + sim = condition(:phone:sim >= 0.0, :phone:sim, - max(isnull(cos_compare(:ph1:phone, :ph2:phone), -1.0)) -- Negative value will be dropped from the average. + max(isnull(lev_compare(:ph1:phone, :ph2:phone), -1.0)) ) FROM identity collection name_dups name, @@ -188,9 +274,9 @@ update_duplicates "system/query" :key1, :key2 UPDATE SET - :sim = condition(:phone:sim > 0.0, + :sim = condition(:phone:sim >= 0.0, :phone:sim, - max(isnull(cos_compare(:ph1:phone, :ph2:phone), -1.0)) -- Negative value will be dropped from the average. + max(isnull(lev_compare(:ph1:phone, :ph2:phone), -1.0)) ) ; @@ -200,9 +286,9 @@ update_duplicates "system/query" SELECT key1 = :name:key1, key2 = :name:key2, - sim = condition(:address:sim > 0.0, + sim = condition(:address:sim >= 0.0, :address:sim, - max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) -- Negative value will be dropped from the average. 
+ max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) ) FROM identity collection name_dups name, @@ -221,9 +307,9 @@ update_duplicates "system/query" :key1, :key2 UPDATE SET - :sim = condition(:address:sim > 0.0, + :sim = condition(:address:sim >= 0.0, :address:sim, - max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) -- Negative value will be dropped from the average. + max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) ) ; @@ -235,63 +321,63 @@ update_duplicates "system/query" SELECT key1 = :name:key1, key2 = :name:key2, - sim = aggregate_similarities( - :name:sim, - :email:sim, - :phone:sim, - :address:sim, - :name:key1 + ' ' + :name:name1, - :name:key2 + ' ' + :name:name2 + sim = (0.0 + + condition(constrain(:name:sim, :name_meta:sim * 0.9, 1.0) >= 0.0, constrain(:name:sim, :name_meta:sim * 0.9, 1.0), 0.0) + + condition(:email:sim >= 0.0, :email:sim, 0.0) + + condition(:phone:sim >= 0.0, :phone:sim, 0.0) + + condition(:address:sim >= 0.0, :address:sim, 0.0) + ) / (constrain(0.0 -- Constrain to prevent divide by 0 when NANs wander into the data from alternate dimensions. + + condition(:name:sim >= 0.0 OR :name_meta:sim >= 0.0, 1.0, 0.0) + + condition(:email:sim >= 0.0, 1.0, 0.0) + + condition(:phone:sim >= 0.0, 1.0, 0.0) + + condition(:address:sim >= 0.0, 1.0, 0.0) + , 0.0000001, convert(double, NULL)) -- Necessary because Centrallix division does not handle NAN properly.
), reason = '' - + condition(:name:sim > 0.0, 'Name (' + round(:name:sim * 100, 2) + '%)\\n', '') - + condition(:email:sim > 0.0, 'Email (' + round(:email:sim * 100, 2) + '%)\\n', '') - + condition(:phone:sim > 0.0, 'Phone (' + round(:phone:sim * 100, 2) + '%)\\n', '') - + condition(:address:sim > 0.0, 'Address (' + round(:address:sim * 100, 2) + '%)\\n', '') + + condition(:name:sim > :value:min_reason_sim, 'Name (' + round(:name:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:name_meta:sim > :value:min_reason_sim AND :name_meta:sim > :name:sim, 'Phonetic Name (' + round(:name_meta:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:email:sim > :value:min_reason_sim, 'Email (' + round(:email:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:phone:sim > :value:min_reason_sim, 'Phone (' + round(:phone:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:address:sim > :value:min_reason_sim, 'Address (' + round(:address:sim * 100, :value:reason_decimals) + '%)\\n', '') - '\\n' FROM identity collection name_dups name, + collection name_meta_dups name_meta, collection email_dups email, collection phone_dups phone, collection address_dups address WHERE - :name:key1 = :email:key1 + :name:key1 = :name_meta:key1 + AND :name:key2 = :name_meta:key2 + AND :name:key1 = :email:key1 AND :name:key2 = :email:key2 AND :name:key1 = :phone:key1 AND :name:key2 = :phone:key2 AND :name:key1 = :address:key1 AND :name:key2 = :address:key2 - ON duplicate - :key1, - :key2 - UPDATE SET - :sim = :sim, - :reason = :reason, - :s_date_modified = getdate(), - :s_modified_by = user_name() ; -- Full Concat Dups DECLARE collection concat_dups; - insert into collection concat_dups select * from /apps/kardia/modules/base/dups/concat.qy ; + INSERT INTO collection concat_dups select * from /apps/kardia/modules/base/dups/concat.qy ; INSERT INTO collection all_dups SELECT key1 = :concat:key1, key2 = :concat:key2, sim = :concat:sim, - reason = 'All (' 
+ round(:concat:sim * 100, 2) + '%)!' + reason = 'All (' + round(:concat:sim * 100, :value:reason_decimals) + '%)' FROM identity collection concat_dups concat - WHERE + WHERE -- We should never get identical dups, but somehow we do. This fixes that. :concat:key1 != :concat:key2 ON duplicate :key1, :key2 UPDATE SET :sim = condition(:sim > :concat:sim, :sim, :concat:sim), - :reason = 'All (' + round(:concat:sim * 100, 2) + '%)\\n' + :reason, + :reason = 'All (' + round(:concat:sim * 100, :value:reason_decimals) + '%)\\n' + :reason, :s_date_modified = getdate(), :s_modified_by = user_name() ; @@ -323,7 +409,7 @@ update_duplicates "system/query" FROM collection all_dups WHERE - :sim > 0.6 + :sim > :value:min_total_sim ON duplicate :p_partner_key, :p_dup_partner_key @@ -335,12 +421,12 @@ update_duplicates "system/query" ; -- Remove stale data from the dups table. --- delete --- from +-- DELETE +-- FROM -- identity /apps/kardia/data/Kardia_DB/p_dup/rows d, -- /apps/kardia/data/Kardia_DB/p_partner/rows p1, -- /apps/kardia/data/Kardia_DB/p_partner/rows p2 --- where +-- WHERE -- :d:p_partner_key *= :p1:p_partner_key -- AND :d:p_dup_partner_key *= :p2:p_partner_key -- AND(:d:s_date_modified < isnull(:p1:s_date_modified, getdate()) diff --git a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp index 6a29160f..ee50f624 100644 --- a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp +++ b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp @@ -232,9 +232,53 @@ plugin_base_dataqa_duplicates "widget/component-decl" type = progress; fieldname = match; - bar_color = runclient(condition(:dupsosrc:match >= 0.90, '#59b550', condition(:dupsosrc:match < 0.80, '#d96066', '#DDB261'))); + bar_color = runclient( + condition(:dupsosrc:match >= 1.00, '#63c85b', + condition(:dupsosrc:match >= 0.95, '#86c65c', + condition(:dupsosrc:match >= 0.90, '#aac45d', + condition(:dupsosrc:match >= 0.85, '#cdc15e', + 
condition(:dupsosrc:match >= 0.80, '#f0bf5f', + condition(:dupsosrc:match >= 0.75, '#edaa63', + condition(:dupsosrc:match >= 0.70, '#ea9467', + condition(:dupsosrc:match >= 0.65, '#e77f6b', '#e46a6f' + ))))))))); bar_padding = 3; bar_textcolor = black; + + // The above bar_color values were generated using this script in JS. + // The hex function was generated by GPT5-mini using t3.chat with direct + // oversight from Israel, who wrote the rest of the code there after. + // + // function hex(v) { + // const clamp = (x, a = 0, b = 1) => Math.min(b, Math.max(a, x)); + // const lerp = (a, b, t) => a + (b - a) * t; + // const toHex2 = (n) => Math.round(n).toString(16).padStart(2, "0"); + // + // const t = clamp(v); + // + // const R = { r: 0xe4, g: 0x6a, b: 0x6f }; // #e46a6f + // const Y = { r: 0xf0, g: 0xbf, b: 0x5f }; // #f0bf5f + // const G = { r: 0x63, g: 0xc8, b: 0x5b }; // #63c85b + // + // if (t <= 0.6) return `#${toHex2(R.r)}${toHex2(R.g)}${toHex2(R.b)}`; + // if (t <= 0.8) { + // const u = (t - 0.6) / 0.2; + // const r = lerp(R.r, Y.r, u); + // const g = lerp(R.g, Y.g, u); + // const b = lerp(R.b, Y.b, u); + // return `#${toHex2(r)}${toHex2(g)}${toHex2(b)}`; + // } + // const u = (t - 0.8) / 0.2; + // const r = lerp(Y.r, G.r, u); + // const g = lerp(Y.g, G.g, u); + // const b = lerp(Y.b, G.b, u); + // return `#${toHex2(r)}${toHex2(g)}${toHex2(b)}`; + // } + // + // const hex_r = (sim, step) => (sim < 0.6) + // ? ` '${hex(sim)}'\n\t\t ` + // : `\n\t\t\tcondition(:dupsosrc:match >= ${Math.round(sim * 1000) / 1000}, '${hex(sim)}',${hex_r(sim - step, step)})`; + // console.log(`runclient(${hex_r(1.0, 0.05)});`); } // Dulicate reason, and other information about the type of duplicate. @@ -274,9 +318,6 @@ plugin_base_dataqa_duplicates "widget/component-decl" wrap = yes; title = "Partner"; - /// TODO: Israel - Remove comment below. - // This code used to check :dupsosrc:p_partner_key = :dupsosrc:p_dup_partner_key. 
- // The new algorithm verifies that this condition is always false, so it was removed. value = runclient(:dupsosrc:p_partner_key + ' ' + :dupsosrc:disp_name_1); caption_value = runclient( isnull(:dupsosrc:ploc_1 + '\n', '') From 58819c5ba196eaa520c372739283e6d0ddcc59d5 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 14 Nov 2025 11:48:53 -0700 Subject: [PATCH 03/12] Add ClusterDriverRequirements (forgot to commit them before). Add known issues to string similarity documentation. Clean up and organize todos. Clean up testing code in several files. --- kardia-app/modules/base/dups/update.qy | 1 + kardia-scripts/cron/update_duplicates.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy index cc245370..092694ce 100644 --- a/kardia-app/modules/base/dups/update.qy +++ b/kardia-app/modules/base/dups/update.qy @@ -392,6 +392,7 @@ update_duplicates "system/query" -- Remove all data from the dups table (for debugging). delete from /apps/kardia/data/Kardia_DB/p_dup/rows ; + -- TODO: Greg - We should fix this upsert. -- Insert dups into p_dup table. INSERT INTO /apps/kardia/data/Kardia_DB/p_dup/rows diff --git a/kardia-scripts/cron/update_duplicates.sh b/kardia-scripts/cron/update_duplicates.sh index f241a5c2..b9c5b927 100755 --- a/kardia-scripts/cron/update_duplicates.sh +++ b/kardia-scripts/cron/update_duplicates.sh @@ -14,5 +14,5 @@ # Make sure /usr/local binaries are in the $PATH export PATH=$PATH:/usr/local/sbin:/usr/local/bin +# TODO: Greg - We need to update this cron to call the new file. /usr/local/bin/test_obj -c /usr/local/etc/centrallix.conf -u kardia -p $(cat /usr/local/etc/centrallix/kardia-auth) -q -C 'ls /apps/kardia/modules/base/update_duplicates.qy' 2>/dev/null >/dev/null - From 039a92eb42155a61741a0b1d5db140d762484e1f Mon Sep 17 00:00:00 2001 From: Israel Date: Mon, 17 Nov 2025 11:09:19 -0700 Subject: [PATCH 04/12] Clean up. 
--- kardia-app/modules/base/dups/address.qy | 7 ++----- kardia-app/modules/base/dups/concat.qy | 4 +--- kardia-app/modules/base/dups/dups.cluster | 2 +- kardia-app/modules/base/dups/email.qy | 7 ++----- kardia-app/modules/base/dups/get/addresses.qy | 4 +--- kardia-app/modules/base/dups/get/concats.qy | 6 +----- kardia-app/modules/base/dups/get/emails.qy | 4 +--- kardia-app/modules/base/dups/get/name_metas.qy | 4 +--- kardia-app/modules/base/dups/get/names.qy | 4 +--- kardia-app/modules/base/dups/get/phones.qy | 4 +--- kardia-app/modules/base/dups/globals.qy | 2 +- kardia-app/modules/base/dups/name.qy | 7 ++----- kardia-app/modules/base/dups/name_meta.qy | 7 ++----- kardia-app/modules/base/dups/phone.qy | 5 +---- 14 files changed, 18 insertions(+), 49 deletions(-) diff --git a/kardia-app/modules/base/dups/address.qy b/kardia-app/modules/base/dups/address.qy index 30dfd52d..4b2c766c 100644 --- a/kardia-app/modules/base/dups/address.qy +++ b/kardia-app/modules/base/dups/address.qy @@ -1,13 +1,11 @@ $Version=2$ -dups_test_query "system/query" +address_duplicates "system/query" { sql = " - print 'address.qy start' - -- Get the list of addresses we will be searching for duplicates. DECLARE collection addresses scope application; - -- Assume the listof addresses was already computed by the caller. + -- Assume the list of addresses was already computed by the caller. -- INSERT INTO collection addresses SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/addresses.qy ; -- Declare an object for storing values locally. 
@@ -55,6 +53,5 @@ dups_test_query "system/query" :p1:p_partner_key, :p2:p_partner_key ; - print 'address.qy done' "; } diff --git a/kardia-app/modules/base/dups/concat.qy b/kardia-app/modules/base/dups/concat.qy index 93a5998b..e0653341 100644 --- a/kardia-app/modules/base/dups/concat.qy +++ b/kardia-app/modules/base/dups/concat.qy @@ -1,8 +1,7 @@ $Version=2$ -dups_test_query "system/query" +concat_duplicates "system/query" { sql = " - print 'concat.qy start' declare object value; -- Count data. @@ -57,6 +56,5 @@ dups_test_query "system/query" :p1:p_partner_key, :p2:p_partner_key ; - print 'concat.qy done' "; } diff --git a/kardia-app/modules/base/dups/dups.cluster b/kardia-app/modules/base/dups/dups.cluster index c1b3ba2a..016376ec 100644 --- a/kardia-app/modules/base/dups/dups.cluster +++ b/kardia-app/modules/base/dups/dups.cluster @@ -1,5 +1,5 @@ $Version=2$ -get_dups "system/cluster" +cluster_dups "system/cluster" { // Declare parameters. algorithm "cluster/parameter" { type = string; style = notnull; } diff --git a/kardia-app/modules/base/dups/email.qy b/kardia-app/modules/base/dups/email.qy index fcaa00f6..007bf1ca 100644 --- a/kardia-app/modules/base/dups/email.qy +++ b/kardia-app/modules/base/dups/email.qy @@ -1,13 +1,11 @@ $Version=2$ -dups_test_query "system/query" +email_duplicates "system/query" { sql = " - print 'email.qy start' - -- Get the list of emails we will be searching for duplicates. DECLARE collection emails scope application; - -- Assume the listof emails was already computed by the caller. + -- Assume the list of emails was already computed by the caller. -- INSERT INTO collection emails SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/emails.qy ; -- Declare an object for storing values locally. 
@@ -57,6 +55,5 @@ dups_test_query "system/query" :p1:p_partner_key, :p2:p_partner_key ; - print 'email.qy done' "; } diff --git a/kardia-app/modules/base/dups/get/addresses.qy b/kardia-app/modules/base/dups/get/addresses.qy index a739d4ce..d24c73bf 100644 --- a/kardia-app/modules/base/dups/get/addresses.qy +++ b/kardia-app/modules/base/dups/get/addresses.qy @@ -1,8 +1,7 @@ $Version=2$ -dups_test_query "system/query" +get_addresses "system/query" { sql = " - print 'get/addresses.qy start' SELECT key = :p:p_partner_key, address = '' @@ -22,6 +21,5 @@ dups_test_query "system/query" AND :l:p_address_1 is not null AND :l:p_address_1 != '' ; - print 'get/addresses.qy end' "; } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/concats.qy b/kardia-app/modules/base/dups/get/concats.qy index f2c08313..cf41bb6d 100644 --- a/kardia-app/modules/base/dups/get/concats.qy +++ b/kardia-app/modules/base/dups/get/concats.qy @@ -1,9 +1,7 @@ $Version=2$ -dups_test_query "system/query" +get_concats "system/query" { sql = " - print 'get/concats.qy start' - declare object value; SELECT :value:boundary_character = '`'; @@ -79,7 +77,5 @@ dups_test_query "system/query" + :address FROM collection temp - - print 'get/concats.qy end' "; } diff --git a/kardia-app/modules/base/dups/get/emails.qy b/kardia-app/modules/base/dups/get/emails.qy index 78aba5d8..2162e773 100644 --- a/kardia-app/modules/base/dups/get/emails.qy +++ b/kardia-app/modules/base/dups/get/emails.qy @@ -1,8 +1,7 @@ $Version=2$ -dups_test_query "system/query" +get_emails "system/query" { sql = " - print 'get/emails.qy start' SELECT key = :p:p_partner_key, email = :c:p_contact_data @@ -15,6 +14,5 @@ dups_test_query "system/query" AND :c:p_contact_data is not null AND :c:p_contact_data != '' ; - print 'get/emails.qy end' "; } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/name_metas.qy b/kardia-app/modules/base/dups/get/name_metas.qy index e733dbae..459db1b7 100644 --- 
a/kardia-app/modules/base/dups/get/name_metas.qy +++ b/kardia-app/modules/base/dups/get/name_metas.qy @@ -1,8 +1,7 @@ $Version=2$ -dups_test_query "system/query" +get_name_metas "system/query" { sql = " - print 'get/name_metas.qy start' SELECT key = :p_partner_key, name_meta = '' @@ -20,6 +19,5 @@ dups_test_query "system/query" char_length(isnull(:p_given_name, '')) > 1 AND char_length(isnull(:p_surname, '')) > 1 ; - print 'get/name_metas.qy end' "; } diff --git a/kardia-app/modules/base/dups/get/names.qy b/kardia-app/modules/base/dups/get/names.qy index 7f2b2f57..b8501d37 100644 --- a/kardia-app/modules/base/dups/get/names.qy +++ b/kardia-app/modules/base/dups/get/names.qy @@ -1,8 +1,7 @@ $Version=2$ -dups_test_query "system/query" +get_names "system/query" { sql = " - print 'get/names.qy start' SELECT key = :p_partner_key, name = '' @@ -20,6 +19,5 @@ dups_test_query "system/query" FROM /apps/kardia/data/Kardia_DB/p_partner/rows ; - print 'get/names.qy end' "; } diff --git a/kardia-app/modules/base/dups/get/phones.qy b/kardia-app/modules/base/dups/get/phones.qy index 44ff7116..6a4e56af 100644 --- a/kardia-app/modules/base/dups/get/phones.qy +++ b/kardia-app/modules/base/dups/get/phones.qy @@ -1,8 +1,7 @@ $Version=2$ -dups_test_query "system/query" +get_phones "system/query" { sql = " - print 'get/phones.qy start' SELECT key = :p:p_partner_key, phone = '' @@ -18,6 +17,5 @@ dups_test_query "system/query" AND :p:p_contact_data is not null AND :p:p_contact_data != '' ; - print 'get/phones.qy end' "; } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/globals.qy b/kardia-app/modules/base/dups/globals.qy index f489cafd..8c30cb5a 100644 --- a/kardia-app/modules/base/dups/globals.qy +++ b/kardia-app/modules/base/dups/globals.qy @@ -1,5 +1,5 @@ $Version=2$ -dups_test_query "system/query" +duplicate_checking_globals "system/query" { num_data "query/parameter" { type = integer; style = strnull; min=1; } diff --git a/kardia-app/modules/base/dups/name.qy 
b/kardia-app/modules/base/dups/name.qy index 17adfe32..7b158e3b 100644 --- a/kardia-app/modules/base/dups/name.qy +++ b/kardia-app/modules/base/dups/name.qy @@ -1,13 +1,11 @@ $Version=2$ -name_dups "system/query" +name_duplicates "system/query" { sql = " - print 'name.qy end' - -- Get the list of names we will be searching for duplicates. DECLARE collection names scope application; - -- Assume the listof names was already computed by the caller. + -- Assume the list of names was already computed by the caller. -- INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; -- Declare an object for storing values locally. @@ -48,6 +46,5 @@ name_dups "system/query" :c:key1 = :p1:p_partner_key AND :c:key2 = :p2:p_partner_key ; - print 'name.qy end' "; } diff --git a/kardia-app/modules/base/dups/name_meta.qy b/kardia-app/modules/base/dups/name_meta.qy index a57348fa..4b56e2fa 100644 --- a/kardia-app/modules/base/dups/name_meta.qy +++ b/kardia-app/modules/base/dups/name_meta.qy @@ -1,13 +1,11 @@ $Version=2$ -name_dups "system/query" +name_duplicates "system/query" { sql = " - print 'name.qy end' - -- Get the list of name_metas we will be searching for duplicates. DECLARE collection name_metas scope application; - -- Assume the listof name_metas was already computed by the caller. + -- Assume the list of name_metas was already computed by the caller. -- INSERT INTO collection name_metas SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/name_metas.qy ; -- Declare an object for storing values locally. 
@@ -48,6 +46,5 @@ name_dups "system/query" :c:key1 = :p1:p_partner_key AND :c:key2 = :p2:p_partner_key ; - print 'name.qy end' "; } diff --git a/kardia-app/modules/base/dups/phone.qy b/kardia-app/modules/base/dups/phone.qy index 111f6791..6679e397 100644 --- a/kardia-app/modules/base/dups/phone.qy +++ b/kardia-app/modules/base/dups/phone.qy @@ -2,12 +2,10 @@ $Version=2$ dups_test_query "system/query" { sql = " - print 'phone.qy start' - -- Get the list of phones we will be searching for duplicates. DECLARE collection phones scope application; - -- Assume the listof phones was already computed by the caller. + -- Assume the list of phones was already computed by the caller. -- INSERT INTO collection phones SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/phones.qy ; -- Declare an object for storing values locally. @@ -55,6 +53,5 @@ dups_test_query "system/query" :p1:p_partner_key, :p2:p_partner_key ; - print 'phone.qy done' "; } From 8376437ad808c912ad47eec88a3ab1af7c5b92fa Mon Sep 17 00:00:00 2001 From: Israel Date: Wed, 19 Nov 2025 12:04:09 -0700 Subject: [PATCH 05/12] Rewrite dup acquisition queries for much simpler and faster code. Rearchitect dupe acquisition queries, inlining them in update.qy. Add comments get/.qy files to explain how the data is marshalled. Add print statements to update.qy which help with debugging. Add a comment to update.qy explaining the strategies used. Add code to compute individual-field similarities for concat dups, improving reasons displayed to the DB Admin. Rename globals.qy to cluster_params.qy. 
--- kardia-app/modules/base/dups/address.qy | 57 ---- .../dups/{globals.qy => cluster_params.qy} | 13 +- kardia-app/modules/base/dups/concat.qy | 60 ---- kardia-app/modules/base/dups/email.qy | 59 ---- kardia-app/modules/base/dups/get/addresses.qy | 38 ++- kardia-app/modules/base/dups/get/concats.qy | 24 +- kardia-app/modules/base/dups/get/emails.qy | 17 +- .../modules/base/dups/get/name_metas.qy | 9 + kardia-app/modules/base/dups/get/names.qy | 8 + kardia-app/modules/base/dups/get/phones.qy | 24 +- kardia-app/modules/base/dups/name.qy | 50 --- kardia-app/modules/base/dups/name_meta.qy | 50 --- kardia-app/modules/base/dups/phone.qy | 57 ---- kardia-app/modules/base/dups/update.qy | 305 +++++++++++++++--- 14 files changed, 349 insertions(+), 422 deletions(-) delete mode 100644 kardia-app/modules/base/dups/address.qy rename kardia-app/modules/base/dups/{globals.qy => cluster_params.qy} (73%) delete mode 100644 kardia-app/modules/base/dups/concat.qy delete mode 100644 kardia-app/modules/base/dups/email.qy delete mode 100644 kardia-app/modules/base/dups/name.qy delete mode 100644 kardia-app/modules/base/dups/name_meta.qy delete mode 100644 kardia-app/modules/base/dups/phone.qy diff --git a/kardia-app/modules/base/dups/address.qy b/kardia-app/modules/base/dups/address.qy deleted file mode 100644 index 4b2c766c..00000000 --- a/kardia-app/modules/base/dups/address.qy +++ /dev/null @@ -1,57 +0,0 @@ -$Version=2$ -address_duplicates "system/query" - { - sql = " - -- Get the list of addresses we will be searching for duplicates. - DECLARE collection addresses scope application; - - -- Assume the list of addresses was already computed by the caller. - -- INSERT INTO collection addresses SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/addresses.qy ; - - -- Declare an object for storing values locally. - declare object value; - - -- Count data. - SELECT - :value:num_data = count(1) - FROM - collection addresses - ; - - -- Compute global values. 
- SELECT - :value:k = :k, - :value:algorithm = :algorithm - FROM - expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) - ; - - -- Detect dups. - SELECT - key1 = :p1:p_partner_key, - key2 = :p2:p_partner_key, - sim = max(:d:sim) - FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, - /apps/kardia/data/Kardia_DB/p_location/rows l1, - /apps/kardia/data/Kardia_DB/p_partner/rows p2, - /apps/kardia/data/Kardia_DB/p_location/rows l2, - expression ( - '/apps/kardia/modules/base/dups/dups.cluster' - + '?algorithm=' + :value:algorithm - + '&k=' + :value:k - + '&field=addresses' - + '&data=address' - + '/dups' - ) d - WHERE - :p1:p_partner_key = :l1:p_partner_key - AND :p2:p_partner_key = :l2:p_partner_key - AND :p1:p_partner_key = :d:key1 - AND :p2:p_partner_key = :d:key2 - GROUP BY - :p1:p_partner_key, - :p2:p_partner_key - ; - "; - } diff --git a/kardia-app/modules/base/dups/globals.qy b/kardia-app/modules/base/dups/cluster_params.qy similarity index 73% rename from kardia-app/modules/base/dups/globals.qy rename to kardia-app/modules/base/dups/cluster_params.qy index 8c30cb5a..bc30bab4 100644 --- a/kardia-app/modules/base/dups/globals.qy +++ b/kardia-app/modules/base/dups/cluster_params.qy @@ -1,15 +1,18 @@ $Version=2$ duplicate_checking_globals "system/query" { - num_data "query/parameter" { type = integer; style = strnull; min=1; } + // Computes parameters for clustering and searching, based on the provided + // size of the data. + + num_data "query/parameter" { type = integer; style = strnull; min = 1; } sql = " SELECT - -- Compute the k value for clustering. - k = constrain(convert(integer, power(logn(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)), - -- Compute which clustering algorithm to use (no clustering is better for small amounts of data). 
- algorithm = condition(:parameters:num_data > 100000, 'kmeans', 'none') + algorithm = condition(:parameters:num_data > 100000, 'kmeans', 'none'), + + -- Compute the k value for clustering. + k = constrain(convert(integer, power(logn(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)) FROM /apps/kardia/data/Kardia_DB/_a_alphabet/rows LIMIT 1 diff --git a/kardia-app/modules/base/dups/concat.qy b/kardia-app/modules/base/dups/concat.qy deleted file mode 100644 index e0653341..00000000 --- a/kardia-app/modules/base/dups/concat.qy +++ /dev/null @@ -1,60 +0,0 @@ -$Version=2$ -concat_duplicates "system/query" - { - sql = " - declare object value; - - -- Count data. - SELECT - :value:num_data = count(1) - FROM - /apps/kardia/modules/base/dups/get/concats.qy - ; - - -- Compute global values. - SELECT - :value:k = :k, - :value:algorithm = :algorithm - FROM - expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) - ; - - -- Detect dups. - SELECT - key1 = :p1:p_partner_key, - key2 = :p2:p_partner_key, - sim = max(:d:sim) - FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, - /apps/kardia/data/Kardia_DB/p_contact_info/rows e1, - /apps/kardia/data/Kardia_DB/p_contact_info/rows ph1, - /apps/kardia/data/Kardia_DB/p_location/rows l1, - /apps/kardia/data/Kardia_DB/p_partner/rows p2, - /apps/kardia/data/Kardia_DB/p_contact_info/rows e2, - /apps/kardia/data/Kardia_DB/p_contact_info/rows ph2, - /apps/kardia/data/Kardia_DB/p_location/rows l2, - expression ( - '/apps/kardia/modules/base/dups/dups.cluster' - + '?algorithm=' + :value:algorithm - + '&k=' + :value:k - + '&field=concats' - + '&data=data' - + '/concat_dups' - ) d - WHERE - :p1:p_partner_key = :d:key1 - AND :p2:p_partner_key = :d:key2 - AND :p1:p_partner_key *= :e1:p_partner_key - AND :p1:p_partner_key *= :ph1:p_partner_key - AND :p1:p_partner_key *= :l1:p_partner_key - AND :p2:p_partner_key *= :e2:p_partner_key - AND :p2:p_partner_key *= :ph2:p_partner_key - AND 
:p2:p_partner_key *= :l2:p_partner_key - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization - AND :d:sim is not null AND :d:sim is not null AND :d:sim is not null -- Query optimization - GROUP BY - :p1:p_partner_key, - :p2:p_partner_key - ; - "; - } diff --git a/kardia-app/modules/base/dups/email.qy b/kardia-app/modules/base/dups/email.qy deleted file mode 100644 index 007bf1ca..00000000 --- a/kardia-app/modules/base/dups/email.qy +++ /dev/null @@ -1,59 +0,0 @@ -$Version=2$ -email_duplicates "system/query" - { - sql = " - -- Get the list of emails we will be searching for duplicates. - DECLARE collection emails scope application; - - -- Assume the list of emails was already computed by the caller. - -- INSERT INTO collection emails SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/emails.qy ; - - -- Declare an object for storing values locally. - declare object value; - - -- Count data. - SELECT - :value:num_data = count(1) - FROM - collection emails - ; - - -- Compute global values. - SELECT - :value:k = :k, - :value:algorithm = :algorithm - FROM - expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) - ; - - -- Detect dups. 
- SELECT - key1 = :p1:p_partner_key, - key2 = :p2:p_partner_key, - sim = max(:d:sim) - FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, - /apps/kardia/data/Kardia_DB/p_contact_info/rows c1, - /apps/kardia/data/Kardia_DB/p_partner/rows p2, - /apps/kardia/data/Kardia_DB/p_contact_info/rows c2, - expression ( - '/apps/kardia/modules/base/dups/dups.cluster' - + '?algorithm=' + :value:algorithm - + '&k=' + :value:k - + '&field=emails' - + '&data=email' - + '/dups' - ) d - WHERE - :p1:p_partner_key = :c1:p_partner_key - AND :p2:p_partner_key = :c2:p_partner_key - AND :p1:p_partner_key = :d:key1 - AND :p2:p_partner_key = :d:key2 - AND :c1:p_contact_type = 'E' - AND :c2:p_contact_type = 'E' - GROUP BY - :p1:p_partner_key, - :p2:p_partner_key - ; - "; - } diff --git a/kardia-app/modules/base/dups/get/addresses.qy b/kardia-app/modules/base/dups/get/addresses.qy index d24c73bf..e0db6797 100644 --- a/kardia-app/modules/base/dups/get/addresses.qy +++ b/kardia-app/modules/base/dups/get/addresses.qy @@ -1,25 +1,35 @@ $Version=2$ get_addresses "system/query" { + // To get the address, we need to concatenate several fields found in the + // p_location table: + // - p_in_care_of: For sending mail to a recipient without an address who + // will have a different person/organization receive the + // mail for them. The address of that entity is provided. + // - p_address_1, p_address_2, & p_address_3: Up to 3 lines of an address. + // - p_city, p_state_province: The city and state (respectively). + // - p_country_code, p_postal_code: The country and postal code (respectively). + // + // If the p_address_1 field is null or does not exist, the 'address' is ignored. + // This is very common because many systems in Centrallix assume that every + // record has an address, so every record has an associated address, even if it + // is almost completely blank.
sql = " SELECT - key = :p:p_partner_key, + key = :p_partner_key, address = '' - + isnull(:l:p_in_care_of, '') - + isnull(:l:p_address_1, '') - + isnull(:l:p_address_2, '') - + isnull(:l:p_address_3, '') - + isnull(:l:p_city, '') - + isnull(:l:p_state_province, '') - + isnull(:l:p_country_code, '') - + isnull(:l:p_postal_code, '') + + isnull(:p_in_care_of, '') + + isnull(:p_address_1, '') + + isnull(:p_address_2, '') + + isnull(:p_address_3, '') + + isnull(:p_city, '') + + isnull(:p_state_province, '') + + isnull(:p_country_code, '') + + isnull(:p_postal_code, '') FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p, - /apps/kardia/data/Kardia_DB/p_location/rows l + identity /apps/kardia/data/Kardia_DB/p_location/rows WHERE - :p:p_partner_key = :l:p_partner_key - AND :l:p_address_1 is not null - AND :l:p_address_1 != '' + char_length(isnull(:p_address_1, '')) > 1 ; "; } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/concats.qy b/kardia-app/modules/base/dups/get/concats.qy index cf41bb6d..2fc914a1 100644 --- a/kardia-app/modules/base/dups/get/concats.qy +++ b/kardia-app/modules/base/dups/get/concats.qy @@ -1,10 +1,15 @@ $Version=2$ get_concats "system/query" { + // All data found in the files names.qy, name_metas.qy, emails.qy, + // phones.qy, and addresses.qy is concatenated together here (although + // those files aren't read here for performance reasons), producing a + // single concatenated string with all the information for a person. + // This string is produced for every p_partner record in the + // database. Each record can have up to one email, phone number, and + // address, so multiple records are produced for a given p_partner record + // if various combinations of this contact information are possible.
sql = " - declare object value; - SELECT :value:boundary_character = '`'; - declare collection temp; INSERT INTO @@ -26,9 +31,7 @@ get_concats "system/query" name_meta = '' + isnull(metaphone(:p_given_name), '') + isnull(condition( - char_length(isnull(:p_given_name, '')) > 1 - AND char_length(isnull(:p_surname, '')) > 1 - AND :p_given_name != :p_preferred_name + :p_given_name != :p_preferred_name AND :p_surname != :p_preferred_name, metaphone(:p_preferred_name), '' @@ -58,7 +61,7 @@ get_concats "system/query" AND :p:p_partner_key *= :ph:p_partner_key AND :p:p_partner_key *= :l:p_partner_key AND :e:p_contact_type = 'E' - AND (:ph:p_contact_type = 'P' OR :ph:p_contact_type = 'C') + AND ((:ph:p_contact_type = 'P') + (:ph:p_contact_type = 'C')) ; -- Nonzero numbers are used as boundary markers for the meta parts @@ -67,13 +70,12 @@ get_concats "system/query" SELECT key = :key, data = '' - + :name + :value:boundary_character - + :name_meta + '1' + + :name + '`' + :name_meta + '1' + :name_meta + '1' + :name_meta + '1' - + :email + :value:boundary_character - + :phone + :value:boundary_character + + :email + '`' + + :phone + '`' + :address FROM collection temp diff --git a/kardia-app/modules/base/dups/get/emails.qy b/kardia-app/modules/base/dups/get/emails.qy index 2162e773..73294e77 100644 --- a/kardia-app/modules/base/dups/get/emails.qy +++ b/kardia-app/modules/base/dups/get/emails.qy @@ -1,18 +1,19 @@ $Version=2$ get_emails "system/query" { + // The email field is incredibly simple to get. We simply query for all + // p_contact_info records with a p_contact_type of E (for email) and read + // the email directly from the p_contact_data field. If this field is null + // or does not exist, the 'email' is ignored. 
sql = " SELECT - key = :p:p_partner_key, - email = :c:p_contact_data + key = :p_partner_key, + email = :p_contact_data FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p, - /apps/kardia/data/Kardia_DB/p_contact_info/rows c + identity /apps/kardia/data/Kardia_DB/p_contact_info/rows WHERE - :p:p_partner_key = :c:p_partner_key - AND :c:p_contact_type = 'E' - AND :c:p_contact_data is not null - AND :c:p_contact_data != '' + :p_contact_type = 'E' + AND char_length(isnull(:p_contact_data, '')) > 1 ; "; } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/name_metas.qy b/kardia-app/modules/base/dups/get/name_metas.qy index 459db1b7..1491f62f 100644 --- a/kardia-app/modules/base/dups/get/name_metas.qy +++ b/kardia-app/modules/base/dups/get/name_metas.qy @@ -1,6 +1,15 @@ $Version=2$ get_name_metas "system/query" { + // We calculate the metaphone value separately for the p_given_name, + // p_preferred_name, and p_surname fields. As with names, the + // p_preferred_name field is ignored if it matches the given name or + // the surname. + // + // Note: p_org_name is not considered because we determined that it was + // unlikely that someone would have to guess the spelling of an + // organization's name from how it sounded. Also, such names tend + // to be long and ill suited for the double metaphone algorithm. sql = " SELECT key = :p_partner_key, diff --git a/kardia-app/modules/base/dups/get/names.qy b/kardia-app/modules/base/dups/get/names.qy index b8501d37..7bad7fac 100644 --- a/kardia-app/modules/base/dups/get/names.qy +++ b/kardia-app/modules/base/dups/get/names.qy @@ -1,6 +1,14 @@ $Version=2$ get_names "system/query" { + // We concatenate four fields to produce the name: p_given_name, + // p_preferred_name, p_surname, and p_org_name. p_preferred_name is + // ignored if it adds no new information because it matches the given + // name or the surname, and it is also ignored for organizations.
We + // detect organizations because they do not have a p_given_name or a + // p_surname value and only the p_org_name field is considered for them + // (it's long enough on it's own, anyway). However, p_org_name is still + // considered for people, too. sql = " SELECT key = :p_partner_key, diff --git a/kardia-app/modules/base/dups/get/phones.qy b/kardia-app/modules/base/dups/get/phones.qy index 6a4e56af..f06f2cb2 100644 --- a/kardia-app/modules/base/dups/get/phones.qy +++ b/kardia-app/modules/base/dups/get/phones.qy @@ -1,21 +1,25 @@ $Version=2$ get_phones "system/query" { + // The phone field can be found by querying for all p_contact_info records + // with a p_contact_type of P (for phone) or C (for cellphone), both of + // which we treat the same for the purposes of this algorithm. Then, we + // read the country code / area code from the p_phone_country field, the + // second 3digit code from the p_phone_area_city field, and finally the + // last four digits form the p_contact_data field. If this field is null + // or does not exist, the 'phone number' is ignored. 
sql = " SELECT - key = :p:p_partner_key, + key = :p_partner_key, phone = '' - + isnull(:c:p_phone_country, '') - + isnull(:c:p_phone_area_city, '') - + isnull(:c:p_contact_data, '') + + isnull(:p_phone_country, '') + + isnull(:p_phone_area_city, '') + + isnull(:p_contact_data, '') FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p, - /apps/kardia/data/Kardia_DB/p_contact_info/rows c + identity /apps/kardia/data/Kardia_DB/p_contact_info/rows WHERE - :p:p_partner_key = :c:p_partner_key - AND ((:c:p_contact_type = 'P') + (:c:p_contact_type = 'C')) - AND :p:p_contact_data is not null - AND :p:p_contact_data != '' + ((:p_contact_type = 'P') + (:p_contact_type = 'C')) + AND char_length(isnull(:p_contact_data, '')) > 1 ; "; } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/name.qy b/kardia-app/modules/base/dups/name.qy deleted file mode 100644 index 7b158e3b..00000000 --- a/kardia-app/modules/base/dups/name.qy +++ /dev/null @@ -1,50 +0,0 @@ -$Version=2$ -name_duplicates "system/query" - { - sql = " - -- Get the list of names we will be searching for duplicates. - DECLARE collection names scope application; - - -- Assume the list of names was already computed by the caller. - -- INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; - - -- Declare an object for storing values locally. - declare object value; - - -- Count data. - SELECT - :value:num_data = count(1) - FROM - collection names - ; - - -- Compute global values. - SELECT - :value:k = :k, - :value:algorithm = :algorithm - FROM - expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) - ; - - -- Detect dups. 
- SELECT - key1 = :p1:p_partner_key, - key2 = :p2:p_partner_key, - sim = :c:sim - FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, - /apps/kardia/data/Kardia_DB/p_partner/rows p2, - expression ( - '/apps/kardia/modules/base/dups/dups.cluster' - + '?algorithm=' + :value:algorithm - + '&k=' + :value:k - + '&field=names' - + '&data=name' - + '/dups' - ) c - WHERE - :c:key1 = :p1:p_partner_key - AND :c:key2 = :p2:p_partner_key - ; - "; - } diff --git a/kardia-app/modules/base/dups/name_meta.qy b/kardia-app/modules/base/dups/name_meta.qy deleted file mode 100644 index 4b56e2fa..00000000 --- a/kardia-app/modules/base/dups/name_meta.qy +++ /dev/null @@ -1,50 +0,0 @@ -$Version=2$ -name_duplicates "system/query" - { - sql = " - -- Get the list of name_metas we will be searching for duplicates. - DECLARE collection name_metas scope application; - - -- Assume the list of name_metas was already computed by the caller. - -- INSERT INTO collection name_metas SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/name_metas.qy ; - - -- Declare an object for storing values locally. - declare object value; - - -- Count data. - SELECT - :value:num_data = count(1) - FROM - collection name_metas - ; - - -- Compute global values. - SELECT - :value:k = :k, - :value:algorithm = :algorithm - FROM - expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) - ; - - -- Detect dups. 
- SELECT - key1 = :p1:p_partner_key, - key2 = :p2:p_partner_key, - sim = :c:sim - FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, - /apps/kardia/data/Kardia_DB/p_partner/rows p2, - expression ( - '/apps/kardia/modules/base/dups/dups.cluster' - + '?algorithm=' + :value:algorithm - + '&k=' + :value:k - + '&field=name_metas' - + '&data=name_meta' - + '/meta_dups' - ) c - WHERE - :c:key1 = :p1:p_partner_key - AND :c:key2 = :p2:p_partner_key - ; - "; - } diff --git a/kardia-app/modules/base/dups/phone.qy b/kardia-app/modules/base/dups/phone.qy deleted file mode 100644 index 6679e397..00000000 --- a/kardia-app/modules/base/dups/phone.qy +++ /dev/null @@ -1,57 +0,0 @@ -$Version=2$ -dups_test_query "system/query" - { - sql = " - -- Get the list of phones we will be searching for duplicates. - DECLARE collection phones scope application; - - -- Assume the list of phones was already computed by the caller. - -- INSERT INTO collection phones SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/phones.qy ; - - -- Declare an object for storing values locally. - declare object value; - - -- Count data. - SELECT - :value:num_data = count(1) - FROM - collection phones - ; - - -- Compute global values. - SELECT - :value:k = :k, - :value:algorithm = :algorithm - FROM - expression ('/apps/kardia/modules/base/dups/globals.qy?num_data=' + :value:num_data) - ; - - -- Detect dups. 
- SELECT - key1 = :p1:p_partner_key, - key2 = :p2:p_partner_key, - sim = max(:d:sim) - FROM - identity /apps/kardia/data/Kardia_DB/p_partner/rows p1, - /apps/kardia/data/Kardia_DB/p_contact_info/rows c1, - /apps/kardia/data/Kardia_DB/p_partner/rows p2, - /apps/kardia/data/Kardia_DB/p_contact_info/rows c2, - expression ( - '/apps/kardia/modules/base/dups/dups.cluster' - + '?algorithm=' + :value:algorithm - + '&k=' + :value:k - + '&field=phones' - + '&data=phone' - + '/phone_dups' - ) d - WHERE - :p1:p_partner_key = :c1:p_partner_key - AND :p2:p_partner_key = :c2:p_partner_key - AND :p1:p_partner_key = :d:key1 - AND :p2:p_partner_key = :d:key2 - GROUP BY - :p1:p_partner_key, - :p2:p_partner_key - ; - "; - } diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy index 092694ce..0f072c91 100644 --- a/kardia-app/modules/base/dups/update.qy +++ b/kardia-app/modules/base/dups/update.qy @@ -1,6 +1,47 @@ $Version=2$ update_duplicates "system/query" { + // This script updates the p_dup table (/apps/kardia/data/Kardia_DB/p_dup/rows) + // by detecting duplicates using two strategies (aggregation and concatenation) + // and then upserting the results into the table. + // + // The aggregation strategy first searches for dups on several groups of fields + // which it fetches using name.qy, name_metas.qy, emails.qy, phones.qy, and + // addresses.qy. We invoke the dups.cluster file to get a list of dups for each + // of these field groups (name_dups, email_dups, etc.). Then, add each dup found + // by the other strategies to name_dups by computing its name similarity. Next, + // we add each dup in name_dups to each other collection by computing the + // similarity of the respective fields. Thus, every dup exists in all five + // collections. Finally, we aggregate these collections using this equation: + // `average(max(name_sim, name_meta_sim * 0.9), email_sim, phone_sim, address_sim)` + // If a field is missing (e.g. 
either possible dup record doesn't have an email), + // we use a value of -1, and all values less than 0 are ignored by the average. + // (This avoids overlooking records that are missing a lot of data.) + // + // The concatenation strategy is simpler. We simply concatenate all relevant + // fields (collection concats), then search for dups (collection concat_dups) + // on the concatenated data. + // + // Each strategy has pros and cons. + // - Aggregation avoids overlooking records when one is missing information. + // - Aggregation provides clearer reasoning to the database administrator. + // - Aggregation can use Levenstein for some fields (name meta and phone), + // and cosine for others. Contamination uses the same measure for everything. + // - Concatenation finds dups where someone's email is similar to another + // person's name, which can tip us off to relationships we'd otherwise miss. + // - Concatenation is slightly faster, so it can be run with a lower threshold, + // making it more sensitive to near dups in some cases. + // - Concatenation is prone to cryptic false positives. + // + // In short, aggregation gives us more control, allowing us to find far more dups, + // but it comes at a cost of greater complexity, slower compute times, and missing + // certain specific cases of duplicates. + // + // Note: If concat_dups detects a dup, we also run the other field checks on it + // even though concatenation similarity is not used in the aggregation equation. + // This is because it helps to enhance the reason field with additional info + // that the database administrator can see at a glance. + sql = " DECLARE object value; @@ -8,10 +49,6 @@ update_duplicates "system/query" -- similarity percentages. SELECT :value:reason_decimals = 1; - -- The minimum similarity for a attributes to be considered one of - -- the reasons that two records were matched as duplicates.
- SELECT :value:min_reason_sim = 0.10; - -- The minimum aggregated similarity threshold that must be reached -- for a duplicate to be added to the table and displayed to the user. -- Duplicates which aggregate to a lower similarity than this do are @@ -20,40 +57,189 @@ update_duplicates "system/query" -- For other similarity thresholds, see `dups.cluster`. SELECT :value:min_total_sim = 0.60; - + print 'Getting data...' -- Get collections for each set of possible dups. - -- These collections are used by the following scripts (such as - -- dups/name.qy, dups/email.qy, etc.), avoiding the need to reopen - -- the query file over and over again. - DECLARE collection names scope application; - DECLARE collection name_metas scope application; - DECLARE collection emails scope application; - DECLARE collection phones scope application; - DECLARE collection addresses scope application; - DELETE FROM collection names; - DELETE FROM collection name_metas; - DELETE FROM collection emails; - DELETE FROM collection phones; - DELETE FROM collection addresses; + DECLARE collection names; + DECLARE collection name_metas; + DECLARE collection emails; + DECLARE collection phones; + DECLARE collection addresses; + DECLARE collection concats; INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; INSERT INTO collection name_metas SELECT :key, :name_meta FROM /apps/kardia/modules/base/dups/get/name_metas.qy ; INSERT INTO collection emails SELECT :key, :email FROM /apps/kardia/modules/base/dups/get/emails.qy ; INSERT INTO collection phones SELECT :key, :phone FROM /apps/kardia/modules/base/dups/get/phones.qy ; INSERT INTO collection addresses SELECT :key, :address FROM /apps/kardia/modules/base/dups/get/addresses.qy ; + INSERT INTO collection concats SELECT :key, :data FROM /apps/kardia/modules/base/dups/get/concats.qy ; + + print 'Counting data...' + -- Count data. 
+ SELECT :value:names_count = count(1) FROM collection names; + SELECT :value:name_metas_count = count(1) FROM collection name_metas; + SELECT :value:emails_count = count(1) FROM collection emails; + SELECT :value:phones_count = count(1) FROM collection phones; + SELECT :value:addresses_count = count(1) FROM collection addresses; + SELECT :value:concats_count = count(1) FROM collection concats; + + print 'Computing search parameters...' + -- Compute searching parameters. + SELECT + :value:names_algorithm = :names:algorithm, + :value:name_metas_algorithm = :name_metas:algorithm, + :value:emails_algorithm = :emails:algorithm, + :value:phones_algorithm = :phones:algorithm, + :value:addresses_algorithm = :addresses:algorithm, + :value:concats_algorithm = :concats:algorithm, + :value:names_k = :names:k, + :value:name_metas_k = :name_metas:k, + :value:emails_k = :emails:k, + :value:phones_k = :phones:k, + :value:addresses_k = :addresses:k, + :value:concats_k = :concats:k + FROM + expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:names_count) names, + expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:name_metas_count) name_metas, + expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:emails_count) emails, + expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:phones_count) phones, + expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:addresses_count) addresses, + expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:concats_count) concats + ; - -- Get dups for each field (name, email, phone, address). + + print 'Searching for name dups...' + -- Get name_dups using the dups.cluster file. 
DECLARE collection name_dups; + INSERT INTO + collection name_dups + SELECT + :key1, + :key2, + :sim + FROM + identity expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:names_algorithm + + '&k=' + :value:names_k + + '&field=names' + + '&data=name' + + '/dups' + ) + ; + + print 'Searching for name_meta dups...' + -- Get name_meta_dups using the dups.cluster file. DECLARE collection name_meta_dups; + INSERT INTO + collection name_meta_dups + SELECT + key1 = :key1, + key2 = :key2, + sim = :sim + FROM + identity expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:name_metas_algorithm + + '&k=' + :value:name_metas_k + + '&field=name_metas' + + '&data=name_meta' + + '/meta_dups' + ) + ; + + print 'Searching for email dups...' + -- Get email_dups using the dups.cluster file. DECLARE collection email_dups; + INSERT INTO + collection email_dups + SELECT + key1 = :d:key1, + key2 = :d:key2, + sim = max(:d:sim) + FROM + identity expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:emails_algorithm + + '&k=' + :value:emails_k + + '&field=emails' + + '&data=email' + + '/dups' + ) d + GROUP BY + :d:key1, + :d:key2 + ; + + print 'Searching for phone dups...' + -- Get phone_dups using the dups.cluster file. DECLARE collection phone_dups; + INSERT INTO + collection phone_dups + SELECT + key1 = :d:key1, + key2 = :d:key2, + sim = max(:d:sim) + FROM + expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:phones_algorithm + + '&k=' + :value:phones_k + + '&field=phones' + + '&data=phone' + + '/phone_dups' + ) d + GROUP BY + :d:key1, + :d:key2 + ; + + print 'Searching for address dups...' + -- Get address_dups using the dups.cluster file.
DECLARE collection address_dups; - INSERT INTO collection name_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/name.qy ; - INSERT INTO collection name_meta_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/name_meta.qy ; - INSERT INTO collection email_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/email.qy ; - INSERT INTO collection phone_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/phone.qy ; - INSERT INTO collection address_dups SELECT :key1, :key2, :sim FROM /apps/kardia/modules/base/dups/address.qy ; + INSERT INTO + collection address_dups + SELECT + key1 = :d:key1, + key2 = :d:key2, + sim = max(:d:sim) + FROM + identity expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:addresses_algorithm + + '&k=' + :value:addresses_k + + '&field=addresses' + + '&data=address' + + '/dups' + ) d + GROUP BY + :d:key1, + :d:key2 + ; + + print 'Searching for concat dups...' + -- Get concat_dups using the dups.cluster file. + DECLARE COLLECTION concat_dups; + INSERT INTO + collection concat_dups + SELECT + key1 = :d:key1, + key2 = :d:key2, + sim = max(:d:sim) + FROM + identity expression ( + '/apps/kardia/modules/base/dups/dups.cluster' + + '?algorithm=' + :value:concats_algorithm + + '&k=' + :value:concats_k + + '&field=concats' + + '&data=data' + + '/concat_dups' + ) d + GROUP BY + :d:key1, + :d:key2 + ; + print 'Adding other collections dups to name dups...' -- [name_dups <- name_meta_dups] Add an entry to name_dups for each entry in name_meta_dups. INSERT INTO collection name_dups @@ -180,7 +366,41 @@ update_duplicates "system/query" ) ; + -- [name_dups <- concat_dups] Add an entry to name_dups for each entry in concat_dups. + -- This isn't strictly necessary because concat_dups isn't used in the final aggregation. 
+ -- However, it can sometimes provide additional information in the 'reason' column of the + -- UI, so this feels like it's worth the small cost in extra computation. + INSERT INTO + collection name_dups + SELECT + key1 = :concat:key1, + key2 = :concat:key2, + sim = condition(:name:sim >= 0.0, + :name:sim, + isnull(cos_compare(:n1:name, :n2:name), -1.0) + ) + FROM + identity collection concat_dups concat, + collection name_dups name, + collection names n1, + collection names n2 + WHERE + :concat:key1 *= :name:key1 + AND :concat:key2 *= :name:key2 + AND :concat:key1 *= :n1:key + AND :concat:key2 *= :n2:key + ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). + :key1, + :key2 + UPDATE SET + :sim = condition(:name:sim >= 0.0, + :name:sim, + isnull(cos_compare(:n1:name, :n2:name), -1.0) + ) + ; + + print 'Adding name dups to other collections...' -- [name_meta_dups <- name_dups] Add an entry to name_meta_dups for each entry in name_dups. INSERT INTO collection name_meta_dups @@ -313,8 +533,17 @@ update_duplicates "system/query" ) ; + -- Free application-scoped collections. + DELETE FROM collection names; + DELETE FROM collection name_metas; + DELETE FROM collection emails; + DELETE FROM collection phones; + DELETE FROM collection addresses; + + -- Declare a collection to hold all dups found by both strategies (aggregation and concatenation). DECLARE collection all_dups; + print 'Aggregating dups...' -- Aggregate dups. INSERT INTO collection all_dups @@ -334,11 +563,11 @@ update_duplicates "system/query" , 0.0000001, convert(double, NULL)) -- Necessary because centrallix devision does not handle NAN properly. 
), reason = '' - + condition(:name:sim > :value:min_reason_sim, 'Name (' + round(:name:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:name_meta:sim > :value:min_reason_sim AND :name_meta:sim > :name:sim, 'Phonetic Name (' + round(:name_meta:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:email:sim > :value:min_reason_sim, 'Email (' + round(:email:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:phone:sim > :value:min_reason_sim, 'Phone (' + round(:phone:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:address:sim > :value:min_reason_sim, 'Address (' + round(:address:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:name:sim > 0.0, 'Name (' + round(:name:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:name_meta:sim > 0.0 AND (:name_meta:sim - 0.00001) > :name:sim, 'Phonetic Name (' + round(:name_meta:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:email:sim > 0.0, 'Email (' + round(:email:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:phone:sim > 0.0, 'Phone (' + round(:phone:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:address:sim > 0.0, 'Address (' + round(:address:sim * 100, :value:reason_decimals) + '%)\\n', '') - '\\n' FROM identity collection name_dups name, @@ -358,9 +587,8 @@ update_duplicates "system/query" ; + print 'Adding concatenation dups...' -- Full Concat Dups - DECLARE collection concat_dups; - INSERT INTO collection concat_dups select * from /apps/kardia/modules/base/dups/concat.qy ; INSERT INTO collection all_dups SELECT @@ -370,8 +598,6 @@ update_duplicates "system/query" reason = 'All (' + round(:concat:sim * 100, :value:reason_decimals) + '%)' FROM identity collection concat_dups concat - WHERE -- We should never get identical dups, but somehow we do. This fixes that. 
- :concat:key1 != :concat:key2 ON duplicate :key1, :key2 @@ -383,15 +609,10 @@ update_duplicates "system/query" ; - -- Free application-scoped collections. - DELETE FROM collection names; - DELETE FROM collection emails; - DELETE FROM collection phones; - DELETE FROM collection addresses; - -- Remove all data from the dups table (for debugging). delete from /apps/kardia/data/Kardia_DB/p_dup/rows ; + print 'Storing dups...' -- TODO: Greg - We should fix this upsert. -- Insert dups into p_dup table. INSERT INTO @@ -409,8 +630,9 @@ update_duplicates "system/query" s_modified_by = user_name() FROM collection all_dups - WHERE - :sim > :value:min_total_sim + WHERE -- We should never get identical dups, but somehow we do. This fixes that issue. + :all_dups:key1 != :all_dups:key2 + AND :sim > :value:min_total_sim ON duplicate :p_partner_key, :p_dup_partner_key @@ -433,5 +655,6 @@ update_duplicates "system/query" -- AND(:d:s_date_modified < isnull(:p1:s_date_modified, getdate()) -- OR :d:s_date_modified < isnull(:p2:s_date_modified, getdate())) -- ; + print 'Update complete' "; } From 080c6de92735389959da4d2f4cf5c07903276eee Mon Sep 17 00:00:00 2001 From: Israel Date: Wed, 19 Nov 2025 12:09:21 -0700 Subject: [PATCH 06/12] UI Update: Add last updated date. Add handling for edge case: missing dup reason. Add handling for some nondup edge cases. Organize confusing joins. Add some friendly flair when all dups are resolved. 
--- .../base/plugin_base_dataqa_duplicates.cmp | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp index ee50f624..9b6691ea 100644 --- a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp +++ b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp @@ -44,15 +44,25 @@ plugin_base_dataqa_duplicates "widget/component-decl" width=958; height=633; spacing=10; - - dupslbl "widget/component" + + dups_label "widget/pane" { - path="/apps/kardia/modules/base/section_label.cmp"; - height=26; - fl_height=0; - text = runclient("Potential Duplicate Partners..."); + height = 32; width = 958; + fl_height = 0; + border_radius = 8; + widget_class = "label"; + + lbl "widget/label" + { + x = 6; y = 6; + width = 190; height = 18; + font_size = 13; + + widget_class = "label"; + text = "Potential Duplicate Partners..."; + } } - + dupsosrc "widget/osrc" { showobs "widget/parameter" { type=integer; default=runclient(:show_obs:value); } @@ -132,7 +142,7 @@ plugin_base_dataqa_duplicates "widget/component-decl" -- Dupe type info. 
reason = :d:p_reason, - nondup = condition(:nd:p_partner_key is not null, 1, 0), + nondup = condition(char_length(isnull(:nd:p_partner_key, '')) > 1 and char_length(isnull(:nd:p_nondup_partner_key, '')) > 1, 1, 0), associated = condition( :p1:p_parent_key = :p2:p_partner_key or :p1:p_partner_key = :p2:p_parent_key @@ -168,20 +178,19 @@ plugin_base_dataqa_duplicates "widget/component-decl" /apps/kardia/data/Kardia_DB/p_nondup/rows nd where (:info:partnerlist is null or charindex(',' + rtrim(:d:p_partner_key) + ',', ',' + :info:partnerlist + ',') > 0 or charindex(',' + rtrim(:d:p_dup_partner_key) + ',', ',' + :info:partnerlist + ',') > 0) and - :p1:p_partner_key = :d:p_partner_key and - :p2:p_partner_key = :d:p_dup_partner_key and - :pl1:p_partner_key =* :d:p_partner_key and - :pl2:p_partner_key =* :d:p_dup_partner_key and - :pl1:p_country_code *= :ctry1:p_country_code and - :af1:p_country_code =* :ctry1:p_country_code and - :af1:p_address_set = 'STANDARD' and - :pl2:p_country_code *= :ctry2:p_country_code and - :af2:p_country_code =* :ctry2:p_country_code and - :af2:p_address_set = 'STANDARD' and - :nd:p_partner_key =* :d:p_partner_key and - :nd:p_nondup_partner_key =* :d:p_dup_partner_key + :d:p_partner_key = :p1:p_partner_key and + :d:p_dup_partner_key = :p2:p_partner_key and + :d:p_partner_key *= :pl1:p_partner_key and + :d:p_dup_partner_key *= :pl2:p_partner_key and + :d:p_partner_key *= :nd:p_partner_key and + :d:p_dup_partner_key *= :nd:p_nondup_partner_key and + :pl1:p_country_code *= :ctry1:p_country_code and + :pl2:p_country_code *= :ctry2:p_country_code and + :af1:p_country_code =* :ctry1:p_country_code and + :af2:p_country_code =* :ctry2:p_country_code and + :af1:p_address_set = 'STANDARD' and + :af2:p_address_set = 'STANDARD' group by - :d:p_match_quality desc, :d:p_partner_key, :d:p_dup_partner_key order by @@ -219,7 +228,7 @@ plugin_base_dataqa_duplicates "widget/component-decl" row_shadow_offset=1; row_shadow_color="#a0a0a0"; row_shadow_angle=135; - 
nodata_message="(no duplicates to show)"; + nodata_message="(no duplicates to show, congrats!)"; // Similarity progress bar. t_match "widget/table-column" @@ -291,7 +300,7 @@ plugin_base_dataqa_duplicates "widget/component-decl" title = "%"; value = runclient( - :dupsosrc:reason + isnull(:dupsosrc:reason, "Reason missing") + condition(:dupsosrc:merged, '\nAlready Merged', '') + condition(:dupsosrc:associated, '\nAssociated', '') + condition(:dupsosrc:relation is not null, '\nRelated: ' + :dupsosrc:relation, '') @@ -472,6 +481,32 @@ plugin_base_dataqa_duplicates "widget/component-decl" } } } + + last_computed_osrc "widget/osrc" + { + sql = " + SELECT + display = 'Last updated:\n' + max(:s_date_modified), + dif_days = datediff(day, max(:s_date_modified), getdate()) + FROM + /apps/kardia/data/Kardia_DB/p_dup/rows + ; + "; + + last_computed_form "widget/form" + { + dups_last_updated_label "widget/label" + { + x = 840; y = 0; + height = 32; width = 110; + font_size = 11; + align = left; valign = middle; + fgcolor = runclient(condition(:last_computed_osrc:dif_days > 7, "#FF1133", "#CCCCCC")); + style = runclient(condition(:last_computed_osrc:dif_days > 7, bold, none)); + fieldname = display; + } + } + } popover_ask_comment "widget/childwindow" { From d01bf496134bc67572542f52654b8bd86a175745 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 13:56:49 -0700 Subject: [PATCH 07/12] Add more print statements. --- kardia-app/modules/base/dups/update.qy | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy index 0f072c91..67a69d9a 100644 --- a/kardia-app/modules/base/dups/update.qy +++ b/kardia-app/modules/base/dups/update.qy @@ -240,6 +240,7 @@ update_duplicates "system/query" print 'Adding other collections dups to name dups...' + print '[name_dups <- name_meta_dups]' -- [name_dups <- name_meta_dups] Add an entry to name_dups for each entry in name_meta_dups. 
INSERT INTO collection name_dups @@ -270,6 +271,7 @@ update_duplicates "system/query" ) ; + print '[name_dups <- email_dups]' -- [name_dups <- email_dups] Add an entry to name_dups for each entry in email_dups. INSERT INTO collection name_dups @@ -300,6 +302,7 @@ update_duplicates "system/query" ) ; + print '[name_dups <- phone_dups]' -- [name_dups <- phone_dups] Add an entry to name_dups for each entry in phone_dups. INSERT INTO collection name_dups @@ -333,6 +336,7 @@ update_duplicates "system/query" ) ; + print '[name_dups <- address_dups]' -- [name_dups <- address_dups] Add an entry to name_dups for each entry in address_dups. INSERT INTO collection name_dups @@ -366,6 +370,7 @@ update_duplicates "system/query" ) ; + print '[name_dups <- concat_dups]' -- [name_dups <- concat_dups] Add an entry to name_dups for each entry in concat_dups. -- This isn't strictly necessary because concat_dups isn't used in the final aggregation. -- However, it can sometimes provide additional information in the 'reason' column of the @@ -401,6 +406,7 @@ update_duplicates "system/query" print 'Adding name dups to other collections...' + print '[name_meta_dups <- name_dups]' -- [name_meta_dups <- name_dups] Add an entry to name_meta_dups for each entry in name_dups. INSERT INTO collection name_meta_dups @@ -434,6 +440,7 @@ update_duplicates "system/query" ) ; + print '[email_dups <- name_dups]' -- [email_dups <- name_dups] Add an entry to email_dups for each entry in name_dups. INSERT INTO collection email_dups @@ -467,6 +474,7 @@ update_duplicates "system/query" ) ; + print '[phone_dups <- name_dups]' -- [phone_dups <- name_dups] Add an entry to phone_dups for each entry in name_dups. INSERT INTO collection phone_dups @@ -500,6 +508,7 @@ update_duplicates "system/query" ) ; + print '[address_dups <- name_dups]' -- [address_dups <- name_dups] Add an entry to address_dups for each entry in name_dups. 
INSERT INTO collection address_dups From 6f8ce572ba832fd332db4fff8f320a61bc88d348 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 14:31:48 -0700 Subject: [PATCH 08/12] Reduce max complete search threshold to 50000 records. --- kardia-app/modules/base/dups/cluster_params.qy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kardia-app/modules/base/dups/cluster_params.qy b/kardia-app/modules/base/dups/cluster_params.qy index bc30bab4..ff5564b5 100644 --- a/kardia-app/modules/base/dups/cluster_params.qy +++ b/kardia-app/modules/base/dups/cluster_params.qy @@ -9,7 +9,7 @@ duplicate_checking_globals "system/query" sql = " SELECT -- Compute which clustering algorithm to use (no clustering is better for small amounts of data). - algorithm = condition(:parameters:num_data > 100000, 'kmeans', 'none'), + algorithm = condition(:parameters:num_data > 50000, 'kmeans', 'none'), -- Compute the k value for clustering. k = constrain(convert(integer, power(logn(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)) From b13fa2aec25b37bfcfc6e8952a595e0e8bfe1d37 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 15:56:30 -0700 Subject: [PATCH 09/12] Fix collections containing a name attribute, which was misinterpreted by the object system. --- kardia-app/modules/base/dups/get/concats.qy | 8 +- .../base/dups/get/{names.qy => name_strs.qy} | 6 +- kardia-app/modules/base/dups/update.qy | 476 +++++++++--------- 3 files changed, 248 insertions(+), 242 deletions(-) rename kardia-app/modules/base/dups/get/{names.qy => name_strs.qy} (80%) diff --git a/kardia-app/modules/base/dups/get/concats.qy b/kardia-app/modules/base/dups/get/concats.qy index 2fc914a1..d9ae6c78 100644 --- a/kardia-app/modules/base/dups/get/concats.qy +++ b/kardia-app/modules/base/dups/get/concats.qy @@ -9,6 +9,10 @@ get_concats "system/query" // database. 
Each record can have up to one email, phone number, and // address, so multiple records are produced for a given p_partner record // if various combonations of this contact information are possible. + // + // Note: Don't make the mistake of calling an attribute "name" or it might + // accidentally become the canonical name of that object in the object + // system, causing a ton of stuff to break in subtle and confusing ways. sql = " declare collection temp; @@ -16,7 +20,7 @@ get_concats "system/query" collection temp SELECT key = :p:p_partner_key, - name = '' + name_str = '' + isnull(:p_given_name, '') + isnull(condition( char_length(isnull(:p_given_name, '')) > 1 @@ -70,7 +74,7 @@ get_concats "system/query" SELECT key = :key, data = '' - + :name + '`' + + :name_str + '`' + :name_meta + '1' + :name_meta + '1' + :name_meta + '1' diff --git a/kardia-app/modules/base/dups/get/names.qy b/kardia-app/modules/base/dups/get/name_strs.qy similarity index 80% rename from kardia-app/modules/base/dups/get/names.qy rename to kardia-app/modules/base/dups/get/name_strs.qy index 7bad7fac..dfb7cc02 100644 --- a/kardia-app/modules/base/dups/get/names.qy +++ b/kardia-app/modules/base/dups/get/name_strs.qy @@ -9,10 +9,14 @@ get_names "system/query" // p_surname value and only the p_org_name field is considered for them // (it's long enough on it's own, anyway). However, p_org_name is still // considered for people, too. + // + // Note: Don't make the mistake of calling an attribute "name" or it might + // accidentally become the canonical name of that object in the object + // system, causing a ton of stuff to break in subtle and confusing ways. 
sql = " SELECT key = :p_partner_key, - name = '' + name_str = '' + isnull(:p_given_name, '') + isnull(condition( char_length(isnull(:p_given_name, '')) > 1 diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy index 67a69d9a..6a69196d 100644 --- a/kardia-app/modules/base/dups/update.qy +++ b/kardia-app/modules/base/dups/update.qy @@ -6,14 +6,14 @@ update_duplicates "system/query" // and then upserting the results into the table. // // The aggregation strategy first searches for dups on several groups of fields - // which it fetches using name.qy, name_metas.qy, emails.qy, phones.qy, and + // which it fetches using name_str.qy, name_metas.qy, emails.qy, phones.qy, and // addresses.qy. We invoke the dups.cluster file to get a list of dups for each - // of these field groups (name_dups, email_dups, etc.). Then, add each dup found - // by the other strategies to name_dups by computing its name similarity. Next, - // we add each dup in name_dups to each other collection by computing the + // of these field groups (name_str_dups, email_dups, etc.). Then, add each dup + // found by the other strategies to name_str_dups by computing its name similarity. + // Next, we add each dup in name_str_dups to each other collection by computing the // similarity of the respective fields. Thus, every dup exists in all five // collections. Finally, we aggregate these collections using this equation: - // `average(max(name_sim, name_meta_sim * 0.9), email_sim, phone_sim, address_sim)` + // `average(max(name_str_sim, name_meta_sim * 0.9), email_sim, phone_sim, address_sim)` // If a field is missing (e.g. either possible dup record doesn't have an email), // we use a value of -1, and all values less than 0 are ignored by the average. // (This avoids overlooking records that are missing a lot of data.) @@ -25,7 +25,7 @@ update_duplicates "system/query" // Each strategy has pros and cons. 
// - Aggregation avoids overlooking records when one is missing information. // - Aggregation provides clearer reasoning to the database administrator. - // - Aggregation can use Levenstein for some fields (name meta and phone), + // - Aggregation can use Levenshtein for some fields (name_meta and phone), // and cosine for others. Contamination uses the same measure for everything. // - Concatenation finds dups where someone's email is similar to another // person's name, which can tip us off to relationships we'd otherwise miss. @@ -59,13 +59,13 @@ update_duplicates "system/query" print 'Getting data...' -- Get collections for each set of possible dups. - DECLARE collection names; + DECLARE collection name_strs; DECLARE collection name_metas; DECLARE collection emails; DECLARE collection phones; DECLARE collection addresses; DECLARE collection concats; - INSERT INTO collection names SELECT :key, :name FROM /apps/kardia/modules/base/dups/get/names.qy ; + INSERT INTO collection name_strs SELECT :key, :name_str FROM /apps/kardia/modules/base/dups/get/name_strs.qy ; INSERT INTO collection name_metas SELECT :key, :name_meta FROM /apps/kardia/modules/base/dups/get/name_metas.qy ; INSERT INTO collection emails SELECT :key, :email FROM /apps/kardia/modules/base/dups/get/emails.qy ; INSERT INTO collection phones SELECT :key, :phone FROM /apps/kardia/modules/base/dups/get/phones.qy ; @@ -74,7 +74,7 @@ update_duplicates "system/query" print 'Counting data...' -- Count data. - SELECT :value:names_count = count(1) FROM collection names; + SELECT :value:name_strs_count = count(1) FROM collection name_strs; SELECT :value:name_metas_count = count(1) FROM collection name_metas; SELECT :value:emails_count = count(1) FROM collection emails; SELECT :value:phones_count = count(1) FROM collection phones; @@ -84,20 +84,20 @@ update_duplicates "system/query" print 'Computing search parameters...' -- Compute searching parameters.
SELECT - :value:names_algorithm = :names:algorithm, + :value:name_strs_algorithm = :name_strs:algorithm, :value:name_metas_algorithm = :name_metas:algorithm, :value:emails_algorithm = :emails:algorithm, :value:phones_algorithm = :phones:algorithm, :value:addresses_algorithm = :addresses:algorithm, :value:concats_algorithm = :concats:algorithm, - :value:names_k = :names:k, + :value:name_strs_k = :name_strs:k, :value:name_metas_k = :name_metas:k, :value:emails_k = :emails:k, :value:phones_k = :phones:k, :value:addresses_k = :addresses:k, :value:concats_k = :concats:k FROM - expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:names_count) names, + expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:name_strs_count) name_strs, expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:name_metas_count) name_metas, expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:emails_count) emails, expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:phones_count) phones, @@ -106,11 +106,11 @@ update_duplicates "system/query" ; - print 'Searching for name dups...' - -- Get name_dups using the dups.cluster file. - DECLARE collection name_dups; + print 'Searching for name_str_dups...' + -- Get name_str_dups using the dups.cluster file. + DECLARE collection name_str_dups; INSERT INTO - collection name_dups + collection name_str_dups SELECT :key1, :key2, @@ -118,10 +118,10 @@ update_duplicates "system/query" FROM identity expression ( '/apps/kardia/modules/base/dups/dups.cluster' - + '?algorithm=' + :value:names_algorithm - + '&k=' + :value:names_k - + '&field=names' - + '&data=name' + + '?algorithm=' + :value:name_strs_algorithm + + '&k=' + :value:name_strs_k + + '&field=name_strs' + + '&data=name_str' + '/dups' ) ; @@ -239,311 +239,311 @@ update_duplicates "system/query" ; - print 'Adding other collections dups to name dups...' 
- print '[name_dups <- name_meta_dups]' - -- [name_dups <- name_meta_dups] Add an entry to name_dups for each entry in name_meta_dups. + print 'Adding other collections dups to name_str_dups...' + print '[name_str_dups <- name_meta_dups]' + -- [name_str_dups <- name_meta_dups] Add an entry to name_str_dups for each entry in name_meta_dups. INSERT INTO - collection name_dups + collection name_str_dups SELECT - key1 = :name_meta:key1, - key2 = :name_meta:key2, - sim = condition(:name:sim >= 0.0, - :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) + key1 = :name_meta_dup:key1, + key2 = :name_meta_dup:key2, + sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0) ) FROM - identity collection name_meta_dups name_meta, - collection name_dups name, - collection names n1, - collection names n2 + identity collection name_meta_dups name_meta_dup, + collection name_str_dups name_str_dup, + collection name_strs name_str1, + collection name_strs name_str2 WHERE - :name_meta:key1 *= :name:key1 - AND :name_meta:key2 *= :name:key2 - AND :name_meta:key1 *= :n1:key - AND :name_meta:key2 *= :n2:key + :name_meta_dup:key1 *= :name_str_dup:key1 + AND :name_meta_dup:key2 *= :name_str_dup:key2 + AND :name_meta_dup:key1 *= :name_str1:key + AND :name_meta_dup:key2 *= :name_str2:key ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim >= 0.0, - :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) + :sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0) ) ; - print '[name_dups <- email_dups]' - -- [name_dups <- email_dups] Add an entry to name_dups for each entry in email_dups. + print '[name_str_dups <- email_dups]' + -- [name_str_dups <- email_dups] Add an entry to name_str_dups for each entry in email_dups. 
INSERT INTO - collection name_dups + collection name_str_dups SELECT - key1 = :email:key1, - key2 = :email:key2, - sim = condition(:name:sim >= 0.0, - :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) + key1 = :email_dup:key1, + key2 = :email_dup:key2, + sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0) ) FROM - identity collection email_dups email, - collection name_dups name, - collection names n1, - collection names n2 + identity collection email_dups email_dup, + collection name_str_dups name_str_dup, + collection name_strs name_str1, + collection name_strs name_str2 WHERE - :email:key1 *= :name:key1 - AND :email:key2 *= :name:key2 - AND :email:key1 *= :n1:key - AND :email:key2 *= :n2:key + :email_dup:key1 *= :name_str_dup:key1 + AND :email_dup:key2 *= :name_str_dup:key2 + AND :email_dup:key1 *= :name_str1:key + AND :email_dup:key2 *= :name_str2:key ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim >= 0.0, - :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) + :sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0) ) ; - print '[name_dups <- phone_dups]' - -- [name_dups <- phone_dups] Add an entry to name_dups for each entry in phone_dups. + print '[name_str_dups <- phone_dups]' + -- [name_str_dups <- phone_dups] Add an entry to name_str_dups for each entry in phone_dups. 
INSERT INTO - collection name_dups + collection name_str_dups SELECT - key1 = :phone:key1, - key2 = :phone:key2, - sim = condition(:name:sim >= 0.0, - :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) + key1 = :phone_dup:key1, + key2 = :phone_dup:key2, + sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0)) ) FROM - identity collection phone_dups phone, - collection name_dups name, - collection names n1, - collection names n2 + identity collection phone_dups phone_dup, + collection name_str_dups name_str_dup, + collection name_strs name_str1, + collection name_strs name_str2 WHERE - :phone:key1 *= :name:key1 - AND :phone:key2 *= :name:key2 - AND :phone:key1 *= :n1:key - AND :phone:key2 *= :n2:key + :phone_dup:key1 *= :name_str_dup:key1 + AND :phone_dup:key2 *= :name_str_dup:key2 + AND :phone_dup:key1 *= :name_str1:key + AND :phone_dup:key2 *= :name_str2:key GROUP BY - :name:key1, - :name:key2 + :name_str_dup:key1, + :name_str_dup:key2 ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim >= 0.0, - :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) + :sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0)) ) ; - print '[name_dups <- address_dups]' - -- [name_dups <- address_dups] Add an entry to name_dups for each entry in address_dups. + print '[name_str_dups <- address_dups]' + -- [name_str_dups <- address_dups] Add an entry to name_str_dups for each entry in address_dups. 
INSERT INTO - collection name_dups + collection name_str_dups SELECT - key1 = :address:key1, - key2 = :address:key2, - sim = condition(:name:sim >= 0.0, - :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) + key1 = :address_dup:key1, + key2 = :address_dup:key2, + sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0)) ) FROM - identity collection address_dups address, - collection name_dups name, - collection names n1, - collection names n2 + identity collection address_dups address_dup, + collection name_str_dups name_str_dup, + collection name_strs name_str1, + collection name_strs name_str2 WHERE - :address:key1 *= :name:key1 - AND :address:key2 *= :name:key2 - AND :address:key1 *= :n1:key - AND :address:key2 *= :n2:key + :address_dup:key1 *= :name_str_dup:key1 + AND :address_dup:key2 *= :name_str_dup:key2 + AND :address_dup:key1 *= :name_str1:key + AND :address_dup:key2 *= :name_str2:key GROUP BY - :name:key1, - :name:key2 + :name_str_dup:key1, + :name_str_dup:key2 ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim >= 0.0, - :name:sim, - max(isnull(cos_compare(:n1:name, :n2:name), -1.0)) + :sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0)) ) ; - print '[name_dups <- concat_dups]' - -- [name_dups <- concat_dups] Add an entry to name_dups for each entry in concat_dups. + print '[name_str_dups <- concat_dups]' + -- [name_str_dups <- concat_dups] Add an entry to name_str_dups for each entry in concat_dups. -- This isn't strictly necessary because concat_dups isn't used in the final aggregation. -- However, it can sometimes provide additional information in the 'reason' column of the -- UI, so this feels like it's worth the small cost in extra computation. 
INSERT INTO - collection name_dups + collection name_str_dups SELECT - key1 = :concat:key1, - key2 = :concat:key2, - sim = condition(:name:sim >= 0.0, - :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) + key1 = :concat_dup:key1, + key2 = :concat_dup:key2, + sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0) ) FROM - identity collection concat_dups concat, - collection name_dups name, - collection names n1, - collection names n2 + identity collection concat_dups concat_dup, + collection name_str_dups name_str_dup, + collection name_strs name_str1, + collection name_strs name_str2 WHERE - :concat:key1 *= :name:key1 - AND :concat:key2 *= :name:key2 - AND :concat:key1 *= :n1:key - AND :concat:key2 *= :n2:key + :concat_dup:key1 *= :name_str_dup:key1 + AND :concat_dup:key2 *= :name_str_dup:key2 + AND :concat_dup:key1 *= :name_str1:key + AND :concat_dup:key2 *= :name_str2:key ON duplicate -- Update entries that already exist (in case we have a cross-cluster match). :key1, :key2 UPDATE SET - :sim = condition(:name:sim >= 0.0, - :name:sim, - isnull(cos_compare(:n1:name, :n2:name), -1.0) + :sim = condition(:name_str_dup:sim >= 0.0, + :name_str_dup:sim, + isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0) ) ; - print 'Adding name dups to other collections...' - print '[name_meta_dups <- name_dups]' - -- [name_meta_dups <- name_dups] Add an entry to name_meta_dups for each entry in name_dups. + print 'Adding name_str_dups to other collections...' + print '[name_meta_dups <- name_str_dups]' + -- [name_meta_dups <- name_str_dups] Add an entry to name_meta_dups for each entry in name_str_dups. 
INSERT INTO collection name_meta_dups SELECT - key1 = :name:key1, - key2 = :name:key2, - sim = condition(:name_meta:sim >= 0.0, - :name_meta:sim, - isnull(lev_compare(:nm1:name_meta, :nm2:name_meta), -1.0) + key1 = :name_str_dup:key1, + key2 = :name_str_dup:key2, + sim = condition(:name_meta_dup:sim >= 0.0, + :name_meta_dup:sim, + isnull(lev_compare(:name_meta1:name_meta, :name_meta2:name_meta), -1.0) ) FROM - identity collection name_dups name, - collection name_meta_dups name_meta, - collection name_metas nm1, - collection name_metas nm2 + identity collection name_str_dups name_str_dup, + collection name_meta_dups name_meta_dup, + collection name_metas name_meta1, + collection name_metas name_meta2 WHERE - :name:key1 *= :name_meta:key1 - AND :name:key2 *= :name_meta:key2 - AND :name:key1 *= :nm1:key - AND :name:key2 *= :nm2:key + :name_str_dup:key1 *= :name_meta_dup:key1 + AND :name_str_dup:key2 *= :name_meta_dup:key2 + AND :name_str_dup:key1 *= :name_meta1:key + AND :name_str_dup:key2 *= :name_meta2:key GROUP BY - :name:key1, - :name:key2 + :name_str_dup:key1, + :name_str_dup:key2 ON duplicate -- Skip duplicate entries. :key1, :key2 UPDATE SET - :sim = condition(:name_meta:sim >= 0.0, - :name_meta:sim, - isnull(lev_compare(:nm1:name_meta, :nm2:name_meta), -1.0) + :sim = condition(:name_meta_dup:sim >= 0.0, + :name_meta_dup:sim, + isnull(lev_compare(:name_meta1:name_meta, :name_meta2:name_meta), -1.0) ) ; - print '[email_dups <- name_dups]' - -- [email_dups <- name_dups] Add an entry to email_dups for each entry in name_dups. + print '[email_dups <- name_str_dups]' + -- [email_dups <- name_str_dups] Add an entry to email_dups for each entry in name_str_dups. 
INSERT INTO collection email_dups SELECT - key1 = :name:key1, - key2 = :name:key2, - sim = condition(:email:sim >= 0.0, - :email:sim, - max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) + key1 = :name_str_dup:key1, + key2 = :name_str_dup:key2, + sim = condition(:email_dup:sim >= 0.0, + :email_dup:sim, + max(isnull(cos_compare(:email1:email, :email2:email), -1.0)) ) FROM - identity collection name_dups name, - collection email_dups email, - collection emails e1, - collection emails e2 + identity collection name_str_dups name_str_dup, + collection email_dups email_dup, + collection emails email1, + collection emails email2 WHERE - :name:key1 *= :email:key1 - AND :name:key2 *= :email:key2 - AND :name:key1 *= :e1:key - AND :name:key2 *= :e2:key + :name_str_dup:key1 *= :email_dup:key1 + AND :name_str_dup:key2 *= :email_dup:key2 + AND :name_str_dup:key1 *= :email1:key + AND :name_str_dup:key2 *= :email2:key GROUP BY - :name:key1, - :name:key2 + :name_str_dup:key1, + :name_str_dup:key2 ON duplicate -- Skip duplicate entries. :key1, :key2 UPDATE SET - :sim = condition(:email:sim >= 0.0, - :email:sim, - max(isnull(cos_compare(:e1:email, :e2:email), -1.0)) + :sim = condition(:email_dup:sim >= 0.0, + :email_dup:sim, + max(isnull(cos_compare(:email1:email, :email2:email), -1.0)) ) ; - print '[phone_dups <- name_dups]' - -- [phone_dups <- name_dups] Add an entry to phone_dups for each entry in name_dups. + print '[phone_dups <- name_str_dups]' + -- [phone_dups <- name_str_dups] Add an entry to phone_dups for each entry in name_str_dups. 
INSERT INTO collection phone_dups SELECT - key1 = :name:key1, - key2 = :name:key2, - sim = condition(:phone:sim >= 0.0, - :phone:sim, - max(isnull(lev_compare(:ph1:phone, :ph2:phone), -1.0)) + key1 = :name_str_dup:key1, + key2 = :name_str_dup:key2, + sim = condition(:phone_dup:sim >= 0.0, + :phone_dup:sim, + max(isnull(lev_compare(:phone1:phone, :phone2:phone), -1.0)) ) FROM - identity collection name_dups name, - collection phone_dups phone, - collection phones ph1, - collection phones ph2 + identity collection name_str_dups name_str_dup, + collection phone_dups phone_dup, + collection phones phone1, + collection phones phone2 WHERE - :name:key1 *= :phone:key1 - AND :name:key2 *= :phone:key2 - AND :name:key1 *= :ph1:key - AND :name:key2 *= :ph2:key + :name_str_dup:key1 *= :phone_dup:key1 + AND :name_str_dup:key2 *= :phone_dup:key2 + AND :name_str_dup:key1 *= :phone1:key + AND :name_str_dup:key2 *= :phone2:key GROUP BY - :name:key1, - :name:key2 + :name_str_dup:key1, + :name_str_dup:key2 ON duplicate -- Skip duplicate entries. :key1, :key2 UPDATE SET - :sim = condition(:phone:sim >= 0.0, - :phone:sim, - max(isnull(lev_compare(:ph1:phone, :ph2:phone), -1.0)) + :sim = condition(:phone_dup:sim >= 0.0, + :phone_dup:sim, + max(isnull(lev_compare(:phone1:phone, :phone2:phone), -1.0)) ) ; - print '[address_dups <- name_dups]' - -- [address_dups <- name_dups] Add an entry to address_dups for each entry in name_dups. + print '[address_dups <- name_str_dups]' + -- [address_dups <- name_str_dups] Add an entry to address_dups for each entry in name_str_dups. 
INSERT INTO collection address_dups SELECT - key1 = :name:key1, - key2 = :name:key2, - sim = condition(:address:sim >= 0.0, - :address:sim, - max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) + key1 = :name_str_dup:key1, + key2 = :name_str_dup:key2, + sim = condition(:address_dup:sim >= 0.0, + :address_dup:sim, + max(isnull(cos_compare(:address1:address, :address2:address), -1.0)) ) FROM - identity collection name_dups name, - collection address_dups address, - collection addresses l1, - collection addresses l2 + identity collection name_str_dups name_str_dup, + collection address_dups address_dup, + collection addresses address1, + collection addresses address2 WHERE - :name:key1 *= :address:key1 - AND :name:key2 *= :address:key2 - AND :name:key1 *= :l1:key - AND :name:key2 *= :l2:key + :name_str_dup:key1 *= :address_dup:key1 + AND :name_str_dup:key2 *= :address_dup:key2 + AND :name_str_dup:key1 *= :address1:key + AND :name_str_dup:key2 *= :address2:key GROUP BY - :name:key1, - :name:key2 + :name_str_dup:key1, + :name_str_dup:key2 ON duplicate -- Skip duplicate entries. :key1, :key2 UPDATE SET - :sim = condition(:address:sim >= 0.0, - :address:sim, - max(isnull(cos_compare(:l1:address, :l2:address), -1.0)) + :sim = condition(:address_dup:sim >= 0.0, + :address_dup:sim, + max(isnull(cos_compare(:address1:address, :address2:address), -1.0)) ) ; -- Free application-scoped collections. 
- DELETE FROM collection names; + DELETE FROM collection name_strs; DELETE FROM collection name_metas; DELETE FROM collection emails; DELETE FROM collection phones; @@ -557,42 +557,42 @@ update_duplicates "system/query" INSERT INTO collection all_dups SELECT - key1 = :name:key1, - key2 = :name:key2, + key1 = :name_str_dup:key1, + key2 = :name_str_dup:key2, sim = (0.0 - + condition(constrain(:name:sim, :name_meta:sim * 0.9, 1.0) >= 0.0, constrain(:name:sim, :name_meta:sim * 0.9, 1.0), 0.0) - + condition(:email:sim >= 0.0, :email:sim, 0.0) - + condition(:phone:sim >= 0.0, :phone:sim, 0.0) - + condition(:address:sim >= 0.0, :address:sim, 0.0) + + condition(constrain(:name_str_dup:sim, :name_meta_dup:sim * 0.9, 1.0) >= 0.0, constrain(:name_str_dup:sim, :name_meta_dup:sim * 0.9, 1.0), 0.0) + + condition(:email_dup:sim >= 0.0, :email_dup:sim, 0.0) + + condition(:phone_dup:sim >= 0.0, :phone_dup:sim, 0.0) + + condition(:address_dup:sim >= 0.0, :address_dup:sim, 0.0) ) / (constrain(0.0 -- Constrain to prevent divide by 0 when NANs wander into the data from alternate dimensions. - + condition(:name:sim >= 0.0 OR :name_meta:sim >= 0.0, 1.0, 0.0) - + condition(:email:sim >= 0.0, 1.0, 0.0) - + condition(:phone:sim >= 0.0, 1.0, 0.0) - + condition(:address:sim >= 0.0, 1.0, 0.0) + + condition(:name_str_dup:sim >= 0.0 OR :name_meta_dup:sim >= 0.0, 1.0, 0.0) + + condition(:email_dup:sim >= 0.0, 1.0, 0.0) + + condition(:phone_dup:sim >= 0.0, 1.0, 0.0) + + condition(:address_dup:sim >= 0.0, 1.0, 0.0) , 0.0000001, convert(double, NULL)) -- Necessary because centrallix devision does not handle NAN properly. 
), reason = '' - + condition(:name:sim > 0.0, 'Name (' + round(:name:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:name_meta:sim > 0.0 AND (:name_meta:sim - 0.00001) > :name:sim, 'Phonetic Name (' + round(:name_meta:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:email:sim > 0.0, 'Email (' + round(:email:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:phone:sim > 0.0, 'Phone (' + round(:phone:sim * 100, :value:reason_decimals) + '%)\\n', '') - + condition(:address:sim > 0.0, 'Address (' + round(:address:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:name_str_dup:sim > 0.0, 'Name (' + round(:name_str_dup:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:name_meta_dup:sim > 0.0 AND (:name_meta_dup:sim - 0.00001) > :name_str_dup:sim, 'Phonetic Name (' + round(:name_meta_dup:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:email_dup:sim > 0.0, 'Email (' + round(:email_dup:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:phone_dup:sim > 0.0, 'Phone (' + round(:phone_dup:sim * 100, :value:reason_decimals) + '%)\\n', '') + + condition(:address_dup:sim > 0.0, 'Address (' + round(:address_dup:sim * 100, :value:reason_decimals) + '%)\\n', '') - '\\n' FROM - identity collection name_dups name, - collection name_meta_dups name_meta, - collection email_dups email, - collection phone_dups phone, - collection address_dups address + identity collection name_str_dups name_str_dup, + collection name_meta_dups name_meta_dup, + collection email_dups email_dup, + collection phone_dups phone_dup, + collection address_dups address_dup WHERE - :name:key1 = :name_meta:key1 - AND :name:key2 = :name_meta:key2 - AND :name:key1 = :email:key1 - AND :name:key2 = :email:key2 - AND :name:key1 = :phone:key1 - AND :name:key2 = :phone:key2 - AND :name:key1 = :address:key1 - AND :name:key2 = :address:key2 + :name_str_dup:key1 = :name_meta_dup:key1 + AND :name_str_dup:key2 = 
:name_meta_dup:key2 + AND :name_str_dup:key1 = :email_dup:key1 + AND :name_str_dup:key2 = :email_dup:key2 + AND :name_str_dup:key1 = :phone_dup:key1 + AND :name_str_dup:key2 = :phone_dup:key2 + AND :name_str_dup:key1 = :address_dup:key1 + AND :name_str_dup:key2 = :address_dup:key2 ; @@ -601,20 +601,18 @@ update_duplicates "system/query" INSERT INTO collection all_dups SELECT - key1 = :concat:key1, - key2 = :concat:key2, - sim = :concat:sim, - reason = 'All (' + round(:concat:sim * 100, :value:reason_decimals) + '%)' + key1 = :concat_dup:key1, + key2 = :concat_dup:key2, + sim = :concat_dup:sim, + reason = 'All (' + round(:concat_dup:sim * 100, :value:reason_decimals) + '%)' FROM - identity collection concat_dups concat + identity collection concat_dups concat_dup ON duplicate :key1, :key2 UPDATE SET - :sim = condition(:sim > :concat:sim, :sim, :concat:sim), - :reason = 'All (' + round(:concat:sim * 100, :value:reason_decimals) + '%)\\n' + :reason, - :s_date_modified = getdate(), - :s_modified_by = user_name() + :sim = condition(:sim > :concat_dup:sim, :sim, :concat_dup:sim), + :reason = 'All (' + round(:concat_dup:sim * 100, :value:reason_decimals) + '%)\\n' + :reason ; From 6c360801e7085e38bbc77f3e33453bc0ecb40752 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 16:21:45 -0700 Subject: [PATCH 10/12] Minor improvements and clean up. --- kardia-app/modules/base/dups/update.qy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy index 6a69196d..93b36f1c 100644 --- a/kardia-app/modules/base/dups/update.qy +++ b/kardia-app/modules/base/dups/update.qy @@ -81,7 +81,7 @@ update_duplicates "system/query" SELECT :value:addresses_count = count(1) FROM collection addresses; SELECT :value:concats_count = count(1) FROM collection concats; - print 'Computing search parameters...' + print 'Computing parameters...' -- Compute searching parameters. 
SELECT :value:name_strs_algorithm = :name_strs:algorithm, From d3dd3b14b3db86e49569e37c5f39effa9daef800 Mon Sep 17 00:00:00 2001 From: Lightning11wins Date: Thu, 11 Dec 2025 16:38:00 -0700 Subject: [PATCH 11/12] Rename logn() to log() to prevent regression from changes in Centrallix. --- kardia-app/modules/base/dups/cluster_params.qy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kardia-app/modules/base/dups/cluster_params.qy b/kardia-app/modules/base/dups/cluster_params.qy index ff5564b5..5d5a9143 100644 --- a/kardia-app/modules/base/dups/cluster_params.qy +++ b/kardia-app/modules/base/dups/cluster_params.qy @@ -12,7 +12,7 @@ duplicate_checking_globals "system/query" algorithm = condition(:parameters:num_data > 50000, 'kmeans', 'none'), -- Compute the k value for clustering. - k = constrain(convert(integer, power(logn(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)) + k = constrain(convert(integer, power(log(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)) FROM /apps/kardia/data/Kardia_DB/_a_alphabet/rows LIMIT 1 From adbc6a9edbe90db4658dfad5b14c32c5c9cc92ef Mon Sep 17 00:00:00 2001 From: Lightning11wins Date: Tue, 24 Mar 2026 13:40:57 -0600 Subject: [PATCH 12/12] Add notes and fix typos. --- kardia-app/modules/base/dups/cluster_params.qy | 2 +- kardia-scripts/cron/update_duplicates.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kardia-app/modules/base/dups/cluster_params.qy b/kardia-app/modules/base/dups/cluster_params.qy index 5d5a9143..e0d6f592 100644 --- a/kardia-app/modules/base/dups/cluster_params.qy +++ b/kardia-app/modules/base/dups/cluster_params.qy @@ -9,7 +9,7 @@ duplicate_checking_globals "system/query" sql = " SELECT -- Compute which clustering algorithm to use (no clustering is better for small amounts of data). 
- algorithm = condition(:parameters:num_data > 50000, 'kmeans', 'none'), + algorithm = condition(:parameters:num_data > 50000, 'k-means', 'none'), -- Compute the k value for clustering. k = constrain(convert(integer, power(log(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)) diff --git a/kardia-scripts/cron/update_duplicates.sh b/kardia-scripts/cron/update_duplicates.sh index b9c5b927..792423f3 100755 --- a/kardia-scripts/cron/update_duplicates.sh +++ b/kardia-scripts/cron/update_duplicates.sh @@ -15,4 +15,5 @@ export PATH=$PATH:/usr/local/sbin:/usr/local/bin # TODO: Greg - We need to update this cron to call the new file. +# I use: csv select * from /apps/kardia/modules/base/dups/update.qy /usr/local/bin/test_obj -c /usr/local/etc/centrallix.conf -u kardia -p $(cat /usr/local/etc/centrallix/kardia-auth) -q -C 'ls /apps/kardia/modules/base/update_duplicates.qy' 2>/dev/null >/dev/null