quambene · quambene · Nov 28, 2023 · Nov 28, 2023 · Nov 29, 2023 · Nov 29, 2023
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -20,6 +20,7 @@ anyhow = "1.0.72"
 thiserror = "1.0.50"
 log = "0.4.19"
 env_logger = "0.10.0"
+chrono = "0.4.26"
 dirs = "5.0.1"
 url = "2.5.0"
 clap = { version = "4.3.19", features = ["derive"] }
@@ -28,15 +29,15 @@ uuid = { version = "1.4.1", features = ["v4"] }
 serde = { version = "1.0.175", features = ["derive"] }
 serde_json = "1.0.103"
 html5ever = "0.24.1"
+html2md = "0.2.14"
+readability = "0.2.0"
 regex = "1.9.1"
+sublime_fuzzy = "0.7.0"
+similar = "2.2.1"
 colored = "2.0.4"
 lz4 = "1.24.0"
-similar = "2.2.1"
-chrono = "0.4.26"
 tokio = { version = "1.32.0", features = ["rt-multi-thread", "macros", "fs", "time", "test-util"] }
 futures = "0.3.28"
-html2md = "0.2.14"
-readability = "0.2.0"
 async-trait = "0.1.73"
 
 [dev-dependencies]

diff --git a/src/args.rs b/src/args.rs
@@ -18,6 +18,9 @@ pub struct Args {
     /// Print only URLs of bookmarks with selected lines.
     #[arg(short = 'l', long)]
     pub files_with_matches: bool,
+    /// Use fuzzy matching for search.
+    #[arg(short = 'f', long)]
+    pub fuzzy_search: bool,
     #[command(subcommand)]
     pub subcommands: Option<Subcommands>,
 }

diff --git a/src/cache.rs b/src/cache.rs
@@ -155,7 +155,7 @@ impl Caching for Cache {
 
     fn open(&self, bookmark: &TargetBookmark) -> Result<Option<File>, BogrepError> {
         let cache_path = self.bookmark_path(&bookmark.id);
-        debug!("Open website: {}", cache_path.display());
+        debug!("Open cache: {}", cache_path.display());
 
         if cache_path.exists() {
             let cache_file = utils::open_file(&cache_path)?;

diff --git a/src/cmd/search.rs b/src/cmd/search.rs
@@ -10,12 +10,13 @@ use std::{
     borrow::Cow,
     io::{self, BufRead},
 };
+use sublime_fuzzy::{FuzzySearch, Scoring};
 
 /// Maximum number of characters per line displayed in the search result.
 const MAX_COLUMNS: usize = 1000;
 
 pub fn search(pattern: &str, config: &Config, args: &Args) -> Result<(), anyhow::Error> {
-    debug!("{:?}", pattern);
+    debug!("{args:?}");
 
     let cache_mode = CacheMode::new(&args.mode, &config.settings.cache_mode);
     let cache = Cache::new(&config.cache_path, cache_mode);
@@ -59,34 +60,52 @@ fn search_bookmarks(
     for bookmark in bookmarks.values() {
         if let Some(cache_file) = cache.open(bookmark)? {
             let reader = io::BufReader::new(cache_file);
-            let matched_lines = find_matches(reader, &regex)?;
-
-            if matched_lines.len() == 1 {
-                matches += 1;
-                println!("Match in bookmark: {}", bookmark.url.blue());
-            } else if matched_lines.len() > 1 {
-                matches += 1;
-                println!("Matches in bookmark: {}", bookmark.url.blue());
-            }
+            if args.fuzzy_search {
+                let matched_lines = find_fuzzy_matches(reader, pattern, args.ignore_case)?;
+
+                if matched_lines.len() == 1 {
+                    matches += 1;
+                    println!("Match in bookmark: {}", bookmark.url.blue());
+                } else if matched_lines.len() > 1 {
+                    matches += 1;
+                    println!("Matches in bookmark: {}", bookmark.url.blue());
+                }
 
-            if !args.files_with_matches {
-                for matched_line in &matched_lines {
-                    println!("{}", color_matches(matched_line, &regex));
+                if !args.files_with_matches {
+                    for (matched_line, matched_chars) in &matched_lines {
+                        println!("{}\n", color_substring(matched_line, matched_chars));
+                    }
                 }
-            }
+            } else {
+                let matched_lines = find_regex_matches(reader, &regex)?;
+
+                if matched_lines.len() == 1 {
+                    matches += 1;
+                    println!("Match in bookmark: {}", bookmark.url.blue());
+                } else if matched_lines.len() > 1 {
+                    matches += 1;
+                    println!("Matches in bookmark: {}", bookmark.url.blue());
+                }
+
+                if !args.files_with_matches {
+                    for matched_line in &matched_lines {
+                        println!("{}\n", color_matches(matched_line, &regex));
+                    }
+                }
+            };
         }
     }
 
     Ok(matches)
 }
 
 /// Find the matched lines for the regex in a file.
-fn find_matches(reader: impl BufRead, regex: &Regex) -> Result<Vec<String>, anyhow::Error> {
+fn find_regex_matches(reader: impl BufRead, regex: &Regex) -> Result<Vec<String>, anyhow::Error> {
+    let mut start_index;
+    let mut end_index;
     let mut matched_lines = vec![];
 
     for line in reader.lines() {
-        let start_index;
-        let end_index;
         let line = line?;
 
         if regex.is_match(&line) {
@@ -102,10 +121,52 @@ fn find_matches(reader: impl BufRead, regex: &Regex) -> Result<Vec<String>, anyh
                 end_index = line.len();
             }
 
-            if let Some(truncated_line) = line.get(start_index..end_index) {
-                matched_lines.push(truncated_line.to_owned());
+            let matched_line = if let Some(truncated_line) = line.get(start_index..end_index) {
+                truncated_line.to_owned()
             } else {
-                matched_lines.push(line.to_owned());
+                line
+            };
+
+            matched_lines.push(matched_line);
+        }
+    }
+
+    Ok(matched_lines)
+}
+
+// TODO: truncate line to max 1000 characters
+fn find_fuzzy_matches(
+    reader: impl BufRead,
+    search_pattern: &str,
+    is_case_insensitive: bool,
+) -> Result<Vec<(String, Vec<usize>)>, anyhow::Error> {
+    let mut matched_lines = vec![];
+    let scoring = Scoring {
+        bonus_consecutive: 1024,
+        bonus_word_start: 0,
+        ..Scoring::default()
+    };
+
+    for line in reader.lines() {
+        let line = line?;
+
+        let best_match = if is_case_insensitive {
+            FuzzySearch::new(search_pattern, &line)
+                .case_insensitive()
+                .score_with(&scoring)
+                .best_match()
+        } else {
+            FuzzySearch::new(search_pattern, &line)
+                .case_sensitive()
+                .score_with(&scoring)
+                .best_match()
+        };
+
+        if let Some(best_match) = best_match {
+            debug!("best match: {best_match:?}");
+            if best_match.score() > 5000 {
+                let matched_indices = best_match.matched_indices().cloned().collect();
+                matched_lines.push((line, matched_indices));
             }
         }
     }
@@ -120,19 +181,67 @@ fn color_matches<'a>(matched_line: &'a str, regex: &Regex) -> Cow<'a, str> {
     })
 }
 
+/// Returns `matched_line` with every character listed in `indices` rendered
+/// bold red. `indices` are character positions in ascending order.
+// NOTE(review): assumes sublime_fuzzy reports `char` offsets, not bytes — confirm.
+fn color_substring(matched_line: &str, indices: &[usize]) -> String {
+    let mut colored_string = String::with_capacity(matched_line.len());
+    let mut matched = indices.iter().copied().peekable();
+
+    // Walk the line once so the text after the last match is kept and
+    // multi-byte characters are never sliced at a non-boundary offset.
+    for (position, ch) in matched_line.chars().enumerate() {
+        if matched.next_if_eq(&position).is_some() {
+            let colored_char = ch.to_string().as_str().bold().red().to_string();
+            colored_string.push_str(&colored_char);
+        } else {
+            colored_string.push(ch);
+        }
+    }
+    colored_string
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use std::io::Cursor;
 
     #[tokio::test]
-    async fn test_find_matches() {
+    async fn test_find_matches_regex_case_insensitive() {
+        // Only line 4 contains the phrase; `(?i)` lets the lowercase pattern hit it.
+        let haystack = Cursor::new("line 1\nine 2\nline 3\nline 4 Reed-Solomon code\nline 5");
+        let case_insensitive = Regex::new("(?i)reed-solomon code").unwrap();
+
+        let outcome = find_regex_matches(haystack, &case_insensitive);
+        assert!(outcome.is_ok());
+
+        let hits = outcome.unwrap();
+        let expected = vec!["line 4 Reed-Solomon code"];
+        assert_eq!(hits, expected);
+    }
+
+    #[tokio::test]
+    async fn test_find_matches_regex_case_sensitive() {
+        // The phrase is capitalised on line 3 and lowercase on line 4; only line 3 is expected.
+        let text = "line 1\nine 2\nline 3 Reed-Solomon code\nline 4 reed-solomon code\nline 5";
+        let case_sensitive = Regex::new("Reed-Solomon code").unwrap();
+
+        let outcome = find_regex_matches(Cursor::new(text), &case_sensitive);
+        assert!(outcome.is_ok());
+
+        let hits = outcome.unwrap();
+        let expected = vec!["line 3 Reed-Solomon code"];
+        assert_eq!(hits, expected);
+    }
+
+    #[tokio::test]
+    async fn test_find_matches_regex_case_insensitive_more_than_1000_chars() {
         let content = r#"To avoid errors, you once again add extra information. Here, you send they-value that corresponds to another predeterminedx-coordinate. If the three points do not fall on the same line, there’s an error. And to figure out where the error is, you just send one more value — meaning you’ve sent four numbers total, rather than the six required by the previous method.The advantage grows with the size of the message. Let’s say you want to send a longer message — 1,000 numbers. The less efficient code would require sending 2,000 numbers to identify an error, and 3,000 to correct it. But if you use the code that involves interpolating a polynomial through given points, you only need 1,001 numbers to find the error, and 1,002 to correct it. (You can add more points to identify and correct more potential errors.) As the length of your message increases, the difference in efficiency between the two codes grows starker.The more efficient code is called a Reed-Solomon code. Since its introduction in 1960, mathematicians have made further breakthroughs, developing algorithms that can correct more errors with greater efficiency. “It’s very elegant, clean, concrete,” saidSwastik Kopparty, a mathematician and computer scientist at the University of Toronto. “It can be taught to a second-year undergraduate in half an hour.”Reed-Solomon codes have been particularly useful for storing and transmitting information electronically. But the same concept has also been essential in cryptography and distributed computing.Take secret sharing: Let’s say you want to distribute a secret among several parties such that no one person can access the entire secret, but together they can. (Imagine an encryption key, for instance, or a missile launch code.) 
You encode the numbers in a polynomial, evaluate that polynomial at a predetermined set of points, and distribute each of the results to a different person.Most recently, Reed-Solomon codes have been employed in areas like cloud computing and blockchain technology. Say you need to run a computation that’s too complicated for your laptop, so you have a large computational cluster run it — but now you need to verify that the computation you get back is correct. Reed-Solomon codes let you ask for additional information that the cluster likely won’t be able to produce if it hasn’t done the computation correctly. “This works magically,” saidJade Nardi, a research fellow at the Mathematics Institute of Rennes in France. “This process is really wonderful, and the way it relies on [these codes] blows my mind.”But Reed-Solomon codes also have an important constraint. They’re constructed in such a way that you can only evaluate your polynomial at a fixed (and usually relatively small) set of values. That is, you’re limited to using a certain set of numbers to encode your message. The size of that set, or alphabet, in turn restricts the length of the messages you can send — and the bigger you try to make your alphabet, the more computational power you’ll need to decode those messages.And so mathematicians sought an even more optimal code.Future CodesA more general, more powerful code would allow you to store or send longer messages without needing to increase the size of your alphabet. To do this, mathematicians devised codes that involve interpolating a function — which lives in a special space associated to a more complicated curve — through given points on that curve. These so-called algebraic geometry codes “came out of nowhere, and they’re better than any other code we know how to make [with a smaller alphabet],” Kopparty said. “This beats everything. It was a real shock.”There’s just one problem. 
In practice, implementing a Reed-Solomon code is much, much easier than implementing an algebraic geometry code. “This is state-of-the-art, but it’s still under investigation to really turn into something practical,” said the cryptologistSimon Abelard. “It involves quite abstract mathematics, and it’s hard to handle these codes on a computer.”For now, that’s not worrisome: In real-world applications, Reed-Solomon codes and related forms of error correction are sufficient. But that might not always be the case. For instance, if powerful quantum computers become available in the future, they’ll be able tobreak today’s cryptography protocols. As a result, researchers have been searching for schemes that can resist quantum attacks. One top contender for such schemes would require something stronger than Reed-Solomon codes. Certain versions of algebraic geometry codes might just work. Other researchers are hopeful about the role algebraic geometry codes might play in cloud computing.But even in the absence of such potential uses, “in the history of mathematics, sometimes you discover new things that really don’t have applications nowadays,” saidElena Berardini, a researcher at Eindhoven University of Technology in the Netherlands who works on algebraic geometry codes. “But then after 50 years, you find that it might be useful for something completely unexpected” — just like the ancient problem of interpolation itself."#;
         let cursor = Cursor::new(content);
         let re = "(?i)reed-solomon code".to_string();
         let regex = Regex::new(&re).unwrap();
 
-        let res = find_matches(cursor, &regex);
+        let res = find_regex_matches(cursor, &regex);
         assert!(res.is_ok());
         let matched_lines = res.unwrap();
         assert_eq!(