From d862ef58aa4b5dec52b6d0f71067c016a5af2084 Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 04:20:19 +0200 Subject: [PATCH 1/9] Update package.json --- package.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/package.json b/package.json index 4df5388..7a9a947 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,9 @@ "author": "Daniel Pfeiffer", "repository": "mediabounds/fuzzy-predicate", "license": "MIT", + "dependencies": { + "leven": "^2.1.0" + }, "devDependencies": { "eslint": "^3.13.1", "eslint-config-google": "^0.6.0", From c79dcc3bd7d4ddec5e16e4c883df2bf6a62da9ac Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 04:27:02 +0200 Subject: [PATCH 2/9] add levenshtein distance test Add new levenshtein distance matching --- test/test.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test.js b/test/test.js index e11e560..45c7075 100644 --- a/test/test.js +++ b/test/test.js @@ -106,6 +106,12 @@ describe("fuzzy-predicate", function() { results = haystack.filter(fuzzy("JOHN_DOE")); assert.deepStrictEqual(results, ["John Doe", "To John Doe", "john-doe"]); }); + + it("matches strings containing the query using levenshtein distance", function() { + var results = haystack.filter(fuzzy("Jane",{leven: 2})); + assert.deepStrictEqual(results, ["Jane Smith"]); + }); + }); context("given an array of numbers", function() { From a7b8e383cbe56853e35288b3302d77c17e2cdd64 Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 04:44:15 +0200 Subject: [PATCH 3/9] feat(Index): Add Levenshtein distance matches Add levenshtein distance to determine matches. 
This enables truly fuzzy matches even if the strings have a little difference --- index.js | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/index.js b/index.js index 35b2188..c448862 100644 --- a/index.js +++ b/index.js @@ -1,3 +1,5 @@ +var Leven = require ('leven') + module.exports = fuzzy; /** @@ -14,10 +16,11 @@ module.exports = fuzzy; * * @param {string|number} query The filter query to use to reduce an array down to objects matching the query. * @param {string|array=} keys Optionally restrict the search to a set of keys; only applied when filtering objects. + * @param {number} leven Use levenshtein distance to determine if the match is acceptable * * @return {Function} A filter predicate suitable for passing to Array.prototype.filter. */ -function fuzzy(query, keys) { +function fuzzy(query, keys, leven) { if (typeof query !== "string" && (typeof query !== "number" || isNaN(query))) { throw new TypeError("The query is required and must be a string or number"); @@ -27,12 +30,15 @@ function fuzzy(query, keys) { keys = []; } else if (typeof keys === "string") { keys = [keys]; + } else if (typeof keys === "number") { + leven = keys; + keys = []; } else if (!Array.isArray(keys)) { throw new TypeError("keys should either be an array or a single value as a string"); } return function(element) { - return _search(element, query, keys); + return _search(element, query, keys, leven); }; } @@ -42,19 +48,24 @@ function fuzzy(query, keys) { * @param {*} haystack Searches this object for the needle. * @param {string|number} needle The value to search for within the haystack. * @param {array} keys Restrict searching an object to only match values associated with the specified keys. + * @param {number} leven Use levenshtein distance to determine if the match is acceptable * * @return {boolean} True if a match was found; false otherwise. 
*/ -function _search(haystack, needle, keys) { +function _search(haystack, needle, keys, leven) { switch (typeof haystack) { case "number": return haystack == needle; // eslint-disable-line eqeqeq case "string": - return _normalize(haystack).indexOf(_normalize(needle)) >= 0; + if (!!leven) { + return Leven(_normalize(haystack), (_normalize(needle))) <= leven; + } else { + return _normalize(haystack).indexOf(_normalize(needle)) >= 0; + } case "object": for (var key in haystack) { if (haystack.hasOwnProperty(key) && (keys.length === 0 || keys.indexOf(key) >= 0)) { - if (_search(haystack[key], needle, keys)) { + if (_search(haystack[key], needle, keys, leven)) { return true; } } From 98c6134be8e0eebe7219f22320ef8664a400d076 Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 04:47:18 +0200 Subject: [PATCH 4/9] Use leven as a number instead --- test/test.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test.js b/test/test.js index 45c7075..31934d6 100644 --- a/test/test.js +++ b/test/test.js @@ -107,8 +107,8 @@ describe("fuzzy-predicate", function() { assert.deepStrictEqual(results, ["John Doe", "To John Doe", "john-doe"]); }); - it("matches strings containing the query using levenshtein distance", function() { - var results = haystack.filter(fuzzy("Jane",{leven: 2})); + it("matches strings containing the query using given levenshtein distance", function() { + var results = haystack.filter(fuzzy("Jaine", 1)); assert.deepStrictEqual(results, ["Jane Smith"]); }); From d9489eb4cdff04e00b16886abb897127c3bdf48d Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 04:53:26 +0200 Subject: [PATCH 5/9] feat(Readme): Add levenshtein option documentation --- ReadMe.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ReadMe.md b/ReadMe.md index 80d1dc1..134fada 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -21,6 +21,13 @@ var data = [ var result = data.filter(fuzzy("dan")); 
+console.log(result); +// [{ +// name: "Dan Smith" +// }] + +result = data.filter(fuzzy("dun", 1)); + console.log(result); // [{ // name: "Dan Smith" @@ -144,11 +151,12 @@ This time, `result` would contain only one element: Documentation ------------ -**fuzzy(query, keys)** +**fuzzy(query, keys, leven)** Returns a filter predicate (function) suitable for passing to [`Array.prototype.filter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter). * `query`: The filter query to use to reduce an array down to objects matching the query. This can be a string or a number. * `keys`: Optionally restrict the search to a set of keys; only applied when filtering objects. This can be a string containing the name of a single key, or an array of keys. +* `leven`: The (levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance) used to consider matches. This means that differences such as "Dan" and "Dun" can be tolerated. This is a number that determines the levenshtein threshold. ### Normalization What makes this a "fuzzy" filter is that it is looking for values that _somewhat_ match the query—not exact matches. @@ -170,4 +178,4 @@ Contributing ------------ I welcome pull requests containing bug fixes and documentation improvements for `fuzzy-predicate`. Be sure to run the tests before submitting any changes. -And although I consider `fuzzy-predicate` to be _mostly_ feature complete, I welcome discussion on how it could be a more useful tool (e.g. if callers could customize how normalization worked). \ No newline at end of file +And although I consider `fuzzy-predicate` to be _mostly_ feature complete, I welcome discussion on how it could be a more useful tool (e.g. if callers could customize how normalization worked). 
From f05257de809941a007443a6a93426db063149159 Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 07:13:06 +0200 Subject: [PATCH 6/9] changed from leven to string similarity changed the matching package from leven to string similarity --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7a9a947..cdcaeac 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,7 @@ "repository": "mediabounds/fuzzy-predicate", "license": "MIT", "dependencies": { - "leven": "^2.1.0" + "string-similarity": "^3.0.0" }, "devDependencies": { "eslint": "^3.13.1", From a0c5f4a785928e1135cd3fdb138ec192dcec86fb Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 07:34:57 +0200 Subject: [PATCH 7/9] fix(String): Use String Similarity instead of leven String Similarity Returns a fraction between 0 and 1, which indicates the degree of similarity between the two strings. 0 indicates completely different strings, 1 indicates identical strings and that is easier to determine the fuzziness of the match --- index.js | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/index.js b/index.js index c448862..f7f92e6 100644 --- a/index.js +++ b/index.js @@ -1,4 +1,4 @@ -var Leven = require ('leven') +var StringSimilarity = require ('string-similarity') module.exports = fuzzy; @@ -10,17 +10,19 @@ module.exports = fuzzy; * - matches any string that contains the query being insensitive to punctuation, spacing, and capitalization * - matches numbers exactly * - matches objects who contains a value fuzzy-matching the query + * - if similarity threshold is set, then it does a truly fuzzy match by comparing the difference in the strings * * When filtering an array of objects, the fuzzy matching can optionally be restricted to only * match values that are associated with the specified key or keys. 
* * @param {string|number} query The filter query to use to reduce an array down to objects matching the query. * @param {string|array=} keys Optionally restrict the search to a set of keys; only applied when filtering objects. - * @param {number} leven Use levenshtein distance to determine if the match is acceptable + * @param {number} threshold Minimum similarity score, a fraction between 0 and 1, that a string must reach to count as a match. + * A score of 0 means completely different strings; a score of 1 means identical strings. * * @return {Function} A filter predicate suitable for passing to Array.prototype.filter. */ -function fuzzy(query, keys, leven) { +function fuzzy(query, keys, threshold) { if (typeof query !== "string" && (typeof query !== "number" || isNaN(query))) { throw new TypeError("The query is required and must be a string or number"); @@ -31,14 +33,14 @@ function fuzzy(query, keys, leven) { } else if (typeof keys === "string") { keys = [keys]; } else if (typeof keys === "number") { - leven = keys; + threshold = keys; keys = []; } else if (!Array.isArray(keys)) { throw new TypeError("keys should either be an array or a single value as a string"); } return function(element) { - return _search(element, query, keys, leven); + return _search(element, query, keys, threshold); }; } @@ -52,20 +54,20 @@ function fuzzy(query, keys, leven) { * * @return {boolean} True if a match was found; false otherwise.
*/ -function _search(haystack, needle, keys, leven) { +function _search(haystack, needle, keys, threshold) { switch (typeof haystack) { case "number": return haystack == needle; // eslint-disable-line eqeqeq case "string": - if (!!leven) { - return Leven(_normalize(haystack), (_normalize(needle))) <= leven; + if (!!threshold) { + return StringSimilarity.compareTwoStrings(_normalize(haystack), (_normalize(needle))) >= threshold; } else { return _normalize(haystack).indexOf(_normalize(needle)) >= 0; } case "object": for (var key in haystack) { if (haystack.hasOwnProperty(key) && (keys.length === 0 || keys.indexOf(key) >= 0)) { - if (_search(haystack[key], needle, keys, leven)) { + if (_search(haystack[key], needle, keys, threshold)) { return true; } } From 1e926608c1cfc220d72082d3dba235cb009521c5 Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 07:36:47 +0200 Subject: [PATCH 8/9] Update ReadMe.md --- ReadMe.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ReadMe.md b/ReadMe.md index 134fada..82fb82d 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -26,7 +26,7 @@ console.log(result); // name: "Dan Smith" // }] -result = data.filter(fuzzy("dun", 1)); +result = data.filter(fuzzy("dun", 0.2)); console.log(result); // [{ From 36a13008bbe5d13eb656614e17a390ecf32c3761 Mon Sep 17 00:00:00 2001 From: Emmanuel Mahuni Date: Tue, 12 Mar 2019 07:49:05 +0200 Subject: [PATCH 9/9] Update ReadMe.md --- ReadMe.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ReadMe.md b/ReadMe.md index 82fb82d..9bce160 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -156,7 +156,7 @@ Returns a filter predicate (function) suitable for passing to [`Array.prototype. * `query`: The filter query to use to reduce an array down to objects matching the query. This can be a string or a number. * `keys`: Optionally restrict the search to a set of keys; only applied when filtering objects. 
This can be a string containing the name of a single key, or an array of keys. -* `leven`: The (levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance) used to consider matches. This means that differences such as "Dan" and "Dun" can be tolerated. This is a number that determines the levenshtein threshold. +* `threshold`: The [Dice's Coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) threshold used to consider matches. This means that differences such as "In my camp Dan has a fire" and "Dun has fire" can be tolerated. It's a fraction between 0 and 1, which indicates the degree of similarity between the needle and haystack. 0 indicates completely different strings, 1 indicates identical strings. ### Normalization What makes this a "fuzzy" filter is that it is looking for values that _somewhat_ match the query—not exact matches. @@ -166,7 +166,7 @@ When comparing strings, the needle (the query) and the haystack value are both n 1. Convert the string to a lowercase string 2. Remove all non-word characters (characters matching the `\W` regex and underscores) -Then, instead of checking string equality, it checks to see if the haystack value contains the needle value (using `indexOf`). If it does, it's considered a match. +Then, instead of checking string equality, it checks to see if the haystack value contains the needle value (using `indexOf`). If it does, it's considered a match. If `threshold` is supplied, the containment check is replaced by a [Dice's Coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) comparison: the similarity between the needle and the haystack value is computed, and if it is greater than or equal to the threshold, it's a match. This process only applies when comparing strings; numbers must be exactly equal to be considered a match. @@ -179,3 +179,7 @@ Contributing I welcome pull requests containing bug fixes and documentation improvements for `fuzzy-predicate`.
Be sure to run the tests before submitting any changes. And although I consider `fuzzy-predicate` to be _mostly_ feature complete, I welcome discussion on how it could be a more useful tool (e.g. if callers could customize how normalization worked). + +Contributors +------------ +[Emmanuel Mahuni](https://github.com/emahuni) - Added Dice algorithm option for truly fuzzy matches