diff --git a/ReadMe.md b/ReadMe.md index 80d1dc1..9bce160 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -21,6 +21,13 @@ var data = [ var result = data.filter(fuzzy("dan")); +console.log(result); +// [{ +// name: "Dan Smith" +// }] + +result = data.filter(fuzzy("dun", 0.2)); + console.log(result); // [{ // name: "Dan Smith" @@ -144,11 +151,12 @@ This time, `result` would contain only one element: Documentation ------------ -**fuzzy(query, keys)** +**fuzzy(query, keys, threshold)** Returns a filter predicate (function) suitable for passing to [`Array.prototype.filter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/filter). * `query`: The filter query to use to reduce an array down to objects matching the query. This can be a string or a number. * `keys`: Optionally restrict the search to a set of keys; only applied when filtering objects. This can be a string containing the name of a single key, or an array of keys. +* `threshold`: The [Dice's Coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) threshold used to consider matches. This means that differences such as "In my camp Dan has a fire" and "Dun has fire" can be tolerated. It's a fraction between 0 and 1, which indicates the degree of similarity between the needle and haystack. 0 indicates completely different strings, 1 indicates identical strings. ### Normalization What makes this a "fuzzy" filter is that it is looking for values that _somewhat_ match the query—not exact matches. @@ -158,7 +166,7 @@ When comparing strings, the needle (the query) and the haystack value are both n 1. Convert the string to a lowercase string 2. Remove all non-word characters (characters matching the `\W` regex and underscores) -Then, instead of checking string equality, it checks to see if the haystack value contains the needle value (using `indexOf`). If it does, it's considered a match. 
+Then, instead of checking string equality, it checks to see if the haystack value contains the needle value (using `indexOf`). If it does, it's considered a match. If a `threshold` is supplied, the [Dice's Coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) of the normalized needle and haystack is computed instead, and the value is considered a match when that similarity is greater than or equal to the threshold. This process only applies when comparing strings; numbers must be exactly equal to be considered a match. @@ -170,4 +178,8 @@ Contributing ------------ I welcome pull requests containing bug fixes and documentation improvements for `fuzzy-predicate`. Be sure to run the tests before submitting any changes. -And although I consider `fuzzy-predicate` to be _mostly_ feature complete, I welcome discussion on how it could be a more useful tool (e.g. if callers could customize how normalization worked). \ No newline at end of file +And although I consider `fuzzy-predicate` to be _mostly_ feature complete, I welcome discussion on how it could be a more useful tool (e.g. if callers could customize how normalization worked). 
+ +Contributors +------------ +[Emmanuel Mahuni](https://github.com/emahuni) - Added Dice algorithm option for truly fuzzy matches diff --git a/index.js b/index.js index 35b2188..f7f92e6 100644 --- a/index.js +++ b/index.js @@ -1,3 +1,5 @@ +var StringSimilarity = require ('string-similarity') + module.exports = fuzzy; /** @@ -8,16 +10,19 @@ module.exports = fuzzy; * - matches any string that contains the query being insensitive to punctuation, spacing, and capitalization * - matches numbers exactly * - matches objects who contains a value fuzzy-matching the query + * - if a similarity threshold is set, then it does a truly fuzzy match by comparing how similar the strings are * * When filtering an array of objects, the fuzzy matching can optionally be restricted to only * match values that are associated with the specified key or keys. * * @param {string|number} query The filter query to use to reduce an array down to objects matching the query. * @param {string|array=} keys Optionally restrict the search to a set of keys; only applied when filtering objects. + * @param {number=} threshold Optional minimum similarity required for a string to match: a fraction between 0 and 1, which indicates the degree of similarity between the two strings. + * 0 indicates completely different strings, 1 indicates identical strings. * * @return {Function} A filter predicate suitable for passing to Array.prototype.filter. 
*/ -function fuzzy(query, keys) { +function fuzzy(query, keys, threshold) { if (typeof query !== "string" && (typeof query !== "number" || isNaN(query))) { throw new TypeError("The query is required and must be a string or number"); @@ -27,12 +32,15 @@ function fuzzy(query, keys) { keys = []; } else if (typeof keys === "string") { keys = [keys]; + } else if (typeof keys === "number") { + threshold = keys; + keys = []; } else if (!Array.isArray(keys)) { throw new TypeError("keys should either be an array or a single value as a string"); } return function(element) { - return _search(element, query, keys); + return _search(element, query, keys, threshold); }; } @@ -42,19 +50,24 @@ function fuzzy(query, keys) { * @param {*} haystack Searches this object for the needle. * @param {string|number} needle The value to search for within the haystack. * @param {array} keys Restrict searching an object to only match values associated with the specified keys. + * @param {number} threshold Minimum Dice's Coefficient similarity (a fraction between 0 and 1) for a string to be considered a match; when falsy, a substring match is used instead * * @return {boolean} True if a match was found; false otherwise. 
*/ -function _search(haystack, needle, keys) { +function _search(haystack, needle, keys, threshold) { switch (typeof haystack) { case "number": return haystack == needle; // eslint-disable-line eqeqeq case "string": - return _normalize(haystack).indexOf(_normalize(needle)) >= 0; + if (!!threshold) { + return StringSimilarity.compareTwoStrings(_normalize(haystack), (_normalize(needle))) >= threshold; + } else { + return _normalize(haystack).indexOf(_normalize(needle)) >= 0; + } case "object": for (var key in haystack) { if (haystack.hasOwnProperty(key) && (keys.length === 0 || keys.indexOf(key) >= 0)) { - if (_search(haystack[key], needle, keys)) { + if (_search(haystack[key], needle, keys, threshold)) { return true; } } diff --git a/package.json b/package.json index 4df5388..cdcaeac 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,9 @@ "author": "Daniel Pfeiffer", "repository": "mediabounds/fuzzy-predicate", "license": "MIT", + "dependencies": { + "string-similarity": "^3.0.0" + }, "devDependencies": { "eslint": "^3.13.1", "eslint-config-google": "^0.6.0", diff --git a/test/test.js b/test/test.js index e11e560..31934d6 100644 --- a/test/test.js +++ b/test/test.js @@ -106,6 +106,12 @@ describe("fuzzy-predicate", function() { results = haystack.filter(fuzzy("JOHN_DOE")); assert.deepStrictEqual(results, ["John Doe", "To John Doe", "john-doe"]); }); + + it("matches strings containing the query using given levenshtein distance", function() { + var results = haystack.filter(fuzzy("Jaine", 1)); + assert.deepStrictEqual(results, ["Jane Smith"]); + }); + }); context("given an array of numbers", function() {