Skip to content
Open
21 changes: 21 additions & 0 deletions kardia-app/modules/base/dups/cluster_params.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
$Version=2$
duplicate_checking_globals "system/query"
{
    // Derives the clustering settings for a duplicate search from the amount
    // of data that will be searched.  The result has two attributes:
    //   algorithm - 'k-means' when num_data is above 50000, otherwise 'none'.
    //   k         - the number of clusters to request.

    // How many data records the search will cover.
    num_data "query/parameter"
    {
        type = integer;
        style = strnull;
        min = 1;
    }

    // No column of _a_alphabet is read below; together with LIMIT 1 the table
    // only serves to make the SELECT yield a single row of computed values.
    //
    // NOTE(review): constrain(x, 2, NULL) presumably clamps k to a minimum of
    // 2 with no upper bound, and log(x, 36) presumably is log base 36 --
    // confirm against the expression function reference.
    sql = "
        SELECT
            -- Compute which clustering algorithm to use (no clustering is better for small amounts of data).
            algorithm = condition(:parameters:num_data > 50000, 'k-means', 'none'),

            -- Compute the k value for clustering.
            k = constrain(convert(integer, power(log(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL))
        FROM
            /apps/kardia/data/Kardia_DB/_a_alphabet/rows
        LIMIT 1
        ;
    ";
}
68 changes: 68 additions & 0 deletions kardia-app/modules/base/dups/dups.cluster
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
$Version=2$
cluster_dups "system/cluster"
{
    // Cluster and search definitions shared by every duplicate search.
    //
    // Parameters:
    //   algorithm - selects which cluster object below the searches read
    //               from: 'kmeans' or 'none'.  The algorithm name 'k-means'
    //               is also accepted and is mapped to the kmeans cluster.
    //   k         - number of clusters used by the kmeans cluster.
    //   field     - base name (without '.qy') of the query under dups/get/
    //               that supplies the data, e.g. 'emails' or 'concats'.
    //   data      - name of the attribute in that query's results that holds
    //               the text to compare, e.g. 'email' or 'data'.
    algorithm "cluster/parameter" { type = string; style = notnull; }
    k "cluster/parameter" { type = integer; style = notnull; }
    field "cluster/parameter" { type = string; style = notnull; }
    data "cluster/parameter" { type = string; style = notnull; }

    // Declare data source.
    // NOTE(review): 'field' is spliced directly into a file path.  Callers
    // should only pass the known query names; confirm nothing untrusted can
    // reach this parameter.
    source = runserver('/apps/kardia/modules/base/dups/get/' + :parameters:field + '.qy');
    key_attr = "key";
    data_attr = runserver(:parameters:data);

    // A cluster for searching with clustering.
    kmeans "cluster/cluster"
    {
        algorithm = "k-means";
        similarity_measure = "cosine";
        num_clusters = runserver(:parameters:k);
        min_improvement = 0.0001;
        max_iterations = 32;
    }

    // A "cluster" for searching without clustering.
    none "cluster/cluster"
    {
        algorithm = "none";
    }

    // In every search below, 'source' must name one of the cluster objects
    // above.  The algorithm name 'k-means' contains a hyphen and so does not
    // match the object name 'kmeans'; it is translated here so that either
    // spelling selects the kmeans cluster.  Any other value is passed through
    // unchanged.

    // Default duplicate search, used for names, emails, and addresses.
    dups "cluster/search"
    {
        source = runserver(condition(:parameters:algorithm = 'k-means', 'kmeans', :parameters:algorithm));
        similarity_measure = "cosine";
        threshold = 0.7;
    }

    // Double Metaphone search.
    // Double Metaphone is prone to false positives, so it uses a higher
    // threshold to make them slightly less bad.
    meta_dups "cluster/search"
    {
        source = runserver(condition(:parameters:algorithm = 'k-means', 'kmeans', :parameters:algorithm));
        similarity_measure = "levenshtein";
        threshold = 0.8;
    }

    // Phone search.
    // Searching for duplicate phone numbers uses edit distance similarity
    // instead of cosine to give more accurate results.
    phone_dups "cluster/search"
    {
        source = runserver(condition(:parameters:algorithm = 'k-means', 'kmeans', :parameters:algorithm));
        similarity_measure = "levenshtein";
        threshold = 0.7;
    }

    // Concat search.
    // The concatenation strategy is faster than the aggregation strategy, so
    // we can use a slightly lower threshold to detect just a few more dups.
    concat_dups "cluster/search"
    {
        source = runserver(condition(:parameters:algorithm = 'k-means', 'kmeans', :parameters:algorithm));
        similarity_measure = "cosine";
        threshold = 0.65;
    }
}
35 changes: 35 additions & 0 deletions kardia-app/modules/base/dups/get/addresses.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
$Version=2$
get_addresses "system/query"
{
    // Produces one comparable address string per usable p_location row.
    //
    // The string is made by joining the following p_location fields end to
    // end, with no separator, treating any null field as empty:
    // - p_in_care_of: For sending mail to a recipient without an address who
    //                 will have a different person/organization receive the
    //                 mail for them. The address of that entity is provided.
    // - p_address_1, p_address_2, & p_address_3: Up to 3 lines of an address.
    // - p_city, p_state_province: The city and state (respectively).
    // - p_country_code, p_postal_code: The country and postal code (respectively).
    //
    // Rows whose p_address_1 is null or at most one character long are
    // skipped.  Such rows are very common because many systems in Centrallix
    // assume that every record has an address, so every record has an
    // associated address row, even if it is almost completely blank.
    sql = "
        SELECT
            key = :p_partner_key,
            address = ''
                + isnull(:p_in_care_of, '')
                + isnull(:p_address_1, '')
                + isnull(:p_address_2, '')
                + isnull(:p_address_3, '')
                + isnull(:p_city, '')
                + isnull(:p_state_province, '')
                + isnull(:p_country_code, '')
                + isnull(:p_postal_code, '')
        FROM
            identity /apps/kardia/data/Kardia_DB/p_location/rows
        WHERE
            char_length(isnull(:p_address_1, '')) > 1
        ;
    ";
}
87 changes: 87 additions & 0 deletions kardia-app/modules/base/dups/get/concats.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
$Version=2$
get_concats "system/query"
{
    // Builds a single concatenated string holding all of the comparable
    // information for a partner: the same data that names.qy, name_metas.qy,
    // emails.qy, phones.qy, and addresses.qy produce individually.  Those
    // files are not read here, for performance reasons; their expressions are
    // repeated instead.
    //
    // Rows are produced from every p_partner record.  Each output row carries
    // at most one email, one phone number, and one address, so a partner with
    // several of these yields one row per combination of them.
    //
    // NOTE(review): the '*=' joins are presumably outer joins, so a partner
    // lacking an email, phone number, or address still yields a row -- confirm.
    //
    // Every p_partner attribute is written with its ':p:' source qualifier so
    // that its resolution does not depend on which of the four sources an
    // unqualified name would default to.
    //
    // Note: Don't make the mistake of calling an attribute "name" or it might
    //       accidentally become the canonical name of that object in the object
    //       system, causing a ton of stuff to break in subtle and confusing ways.
    sql = "
        declare collection temp;

        INSERT INTO
            collection temp
        SELECT
            key = :p:p_partner_key,
            name_str = ''
                + isnull(:p:p_given_name, '')
                + isnull(condition(
                    char_length(isnull(:p:p_given_name, '')) > 1
                    AND char_length(isnull(:p:p_surname, '')) > 1
                    AND :p:p_given_name != :p:p_preferred_name
                    AND :p:p_surname != :p:p_preferred_name,
                    :p:p_preferred_name,
                    ''
                ), '')
                + isnull(:p:p_surname, '')
                + isnull(:p:p_org_name, ''),
            name_meta = ''
                + isnull(metaphone(:p:p_given_name), '')
                + isnull(condition(
                    :p:p_given_name != :p:p_preferred_name
                    AND :p:p_surname != :p:p_preferred_name,
                    metaphone(:p:p_preferred_name),
                    ''
                ), '')
                + isnull(metaphone(:p:p_surname), ''),
            email = isnull(:e:p_contact_data, ''),
            phone = ''
                + isnull(:ph:p_phone_country, '')
                + isnull(:ph:p_phone_area_city, '')
                + isnull(:ph:p_contact_data, ''),
            address = ''
                + isnull(:l:p_in_care_of, '')
                + isnull(:l:p_address_1, '')
                + isnull(:l:p_address_2, '')
                + isnull(:l:p_address_3, '')
                + isnull(:l:p_city, '')
                + isnull(:l:p_state_province, '')
                + isnull(:l:p_country_code, '')
                + isnull(:l:p_postal_code, '')
        FROM
            identity /apps/kardia/data/Kardia_DB/p_partner/rows p,
            /apps/kardia/data/Kardia_DB/p_contact_info/rows e,
            /apps/kardia/data/Kardia_DB/p_contact_info/rows ph,
            /apps/kardia/data/Kardia_DB/p_location/rows l
        WHERE
            :p:p_partner_key *= :e:p_partner_key
            AND :p:p_partner_key *= :ph:p_partner_key
            AND :p:p_partner_key *= :l:p_partner_key
            AND :e:p_contact_type = 'E'
            AND ((:ph:p_contact_type = 'P') + (:ph:p_contact_type = 'C'))
        ;

        -- The name metaphone is included three times, each copy followed by
        -- the marker '1'.  A nonzero digit is used as the boundary marker for
        -- the meta parts because nonzero digits do not appear in metaphones.
        -- This helps to reduce false positives from boundary characters
        -- falsely matching.  The other parts are separated by a backtick.
        SELECT
            key = :key,
            data = ''
                + :name_str + '`'
                + :name_meta + '1'
                + :name_meta + '1'
                + :name_meta + '1'
                + :email + '`'
                + :phone + '`'
                + :address
        FROM
            collection temp
        ;
    ";
}
19 changes: 19 additions & 0 deletions kardia-app/modules/base/dups/get/emails.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
$Version=2$
get_emails "system/query"
{
    // Produces one row per email address on file.
    //
    // Emails need no assembly: every p_contact_info record with a
    // p_contact_type of E (for email) holds the address in p_contact_data,
    // which is returned as-is.  Records whose p_contact_data is null or at
    // most one character long are skipped.
    sql = "
        SELECT
            key = :p_partner_key,
            email = :p_contact_data
        FROM
            identity /apps/kardia/data/Kardia_DB/p_contact_info/rows
        WHERE
            :p_contact_type = 'E'
            AND char_length(isnull(:p_contact_data, '')) > 1
        ;
    ";
}
32 changes: 32 additions & 0 deletions kardia-app/modules/base/dups/get/name_metas.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
$Version=2$
get_name_metas "system/query"
{
    // Produces the phonetic (metaphone) form of each partner's name.
    //
    // The metaphone value is computed separately for p_given_name,
    // p_preferred_name, and p_surname, and the results are joined end to end.
    // As with the plain name strings, p_preferred_name is left out when it
    // matches the given name or the surname.  Only partners whose given name
    // and surname are both longer than one character are included.
    //
    // Note: p_org_name is not considered because we determined that it is
    //       unlikely that someone would have to guess the spelling of an
    //       organization's name from how it sounds.  Also, such names tend
    //       to be long and ill suited for the double metaphone algorithm.
    sql = "
        SELECT
            key = :p_partner_key,
            name_meta = ''
                + isnull(metaphone(:p_given_name), '')
                + isnull(condition(
                    :p_preferred_name != :p_given_name
                    AND :p_preferred_name != :p_surname,
                    metaphone(:p_preferred_name),
                    ''
                ), '')
                + isnull(metaphone(:p_surname), '')
        FROM
            /apps/kardia/data/Kardia_DB/p_partner/rows
        WHERE
            char_length(isnull(:p_given_name, '')) > 1
            AND char_length(isnull(:p_surname, '')) > 1
        ;
    ";
}
35 changes: 35 additions & 0 deletions kardia-app/modules/base/dups/get/name_strs.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
$Version=2$
get_names "system/query"
{
    // Produces one comparable name string per p_partner record.
    //
    // Four fields are joined end to end: p_given_name, p_preferred_name,
    // p_surname, and p_org_name, with null fields treated as empty.
    // p_preferred_name is left out when it adds no new information because it
    // matches the given name or the surname.  It is also left out for
    // organizations, which are recognized by not having a p_given_name or a
    // p_surname longer than one character; for them only p_org_name
    // contributes (it's long enough on its own, anyway).  p_org_name is still
    // included for people, too.
    //
    // NOTE(review): unlike the other queries under dups/get/, this one has no
    // WHERE filter, so a partner with no name fields set yields an empty
    // name_str -- confirm that is intended.
    //
    // Note: Don't make the mistake of calling an attribute "name" or it might
    //       accidentally become the canonical name of that object in the object
    //       system, causing a ton of stuff to break in subtle and confusing ways.
    sql = "
        SELECT
            key = :p_partner_key,
            name_str = ''
                + isnull(:p_given_name, '')
                + isnull(condition(
                    char_length(isnull(:p_given_name, '')) > 1
                    AND char_length(isnull(:p_surname, '')) > 1
                    AND :p_given_name != :p_preferred_name
                    AND :p_surname != :p_preferred_name,
                    :p_preferred_name,
                    ''
                ), '')
                + isnull(:p_surname, '')
                + isnull(:p_org_name, '')
        FROM
            /apps/kardia/data/Kardia_DB/p_partner/rows
        ;
    ";
}
25 changes: 25 additions & 0 deletions kardia-app/modules/base/dups/get/phones.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
$Version=2$
get_phones "system/query"
{
    // Produces one comparable phone number string per phone on file.
    //
    // Phone numbers come from p_contact_info records with a p_contact_type of
    // P (for phone) or C (for cellphone); both are treated the same for the
    // purposes of this algorithm.  The number is assembled by joining
    // p_phone_country, p_phone_area_city, and p_contact_data, in that order,
    // with null parts treated as empty.
    // NOTE(review): presumably these hold the country code, the area/city
    // code, and the remainder of the number respectively -- confirm.
    //
    // Records whose p_contact_data is null or at most one character long are
    // skipped.  The two type comparisons are added together rather than
    // combined with OR; the sum is nonzero when either one matches.
    sql = "
        SELECT
            key = :p_partner_key,
            phone = ''
                + isnull(:p_phone_country, '')
                + isnull(:p_phone_area_city, '')
                + isnull(:p_contact_data, '')
        FROM
            identity /apps/kardia/data/Kardia_DB/p_contact_info/rows
        WHERE
            ((:p_contact_type = 'P') + (:p_contact_type = 'C'))
            AND char_length(isnull(:p_contact_data, '')) > 1
        ;
    ";
}
Loading