Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/main/java/Unit_testing/IIndexableContent.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package Unit_testing;

/**
 * Created by Avi Hayun on 12/10/2014.
 * Marker (tagging) interface: implementing it flags a piece of parsed content
 * as eligible for indexing by a store. It deliberately declares no methods —
 * each store implementation decides what its indexable content carries.
 */
public interface IIndexableContent {
}
21 changes: 21 additions & 0 deletions src/main/java/Unit_testing/IStore.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package Unit_testing;

import org.apache.solr.client.solrj.SolrServerException;

import java.io.IOException;

/**
 * Created by Avi Hayun on 12/10/2014.
 * Interface representing the store type one wants to index one's content in
 */
public interface IStore {
    /**
     * Parses the HTML in order to extract only the wanted content.
     *
     * @param html the raw HTML of a fetched page
     * @return the store-specific indexable representation of that page
     */
    IIndexableContent parseIndexableContent(String html);

    /**
     * Adds the indexable content into the store.
     *
     * Narrowed from {@code throws Exception}: the file already imported
     * {@code IOException} and {@code SolrServerException} (previously unused),
     * and the only implementation throws exactly these. Callers that caught
     * {@code Exception} still compile.
     *
     * @param indexableContent the content to index; expected to be the type
     *                         returned by {@link #parseIndexableContent(String)}
     * @throws IOException         if communicating with the store fails
     * @throws SolrServerException if the store rejects the request
     */
    void add(IIndexableContent indexableContent) throws IOException, SolrServerException;
}
145 changes: 56 additions & 89 deletions src/main/java/Unit_testing/MyCrawler.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
package Unit_testing;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import Unit_testing.solr_impl.SolrStore;
import org.apache.solr.common.SolrInputDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
Expand All @@ -30,91 +23,65 @@
*/
public class MyCrawler extends WebCrawler {

private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
+ "|png|tiff?|mid|mp2|mp3|mp4"
+ "|wav|avi|mov|mpeg|ram|m4v|pdf"
+ "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

/*
* Copy on write list for testing across different threads.
* To do : replace with auto-commit or Async-io operation since commits are expensive).
*/
private List<SolrInputDocument> documentsIndexed = new CopyOnWriteArrayList<SolrInputDocument>();

String url = "http://localhost:8983/solr/collection1";


private int NO_OF_DOCUMENT_TO_COMMIT = 1;
/**
* You should implement this function to specify whether
* the given url should be crawled or not (based on your
* crawling logic).
*/
@Override
public boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}
/*
* Copy on write list for testing across different threads.
* To do : replace with auto-commit or Async-io operation since commits are expensive).
*/
private List<SolrInputDocument> documentsIndexed = new CopyOnWriteArrayList<SolrInputDocument>();
private static AtomicInteger numOfPagesIndexed = new AtomicInteger(0);
IStore store = new SolrStore();

private int NO_OF_DOCUMENT_TO_COMMIT = 3;

/**
* You should implement this function to specify whether
* the given url should be crawled or not (based on your
* crawling logic).
*/
@Override
public boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}

/**
* This function is called when a page is fetched and ready
* to be processed by your program.
*/
@Override
public void visit(Page page) {
String url = page.getWebURL().getURL();
System.out.println("URL: " + url);

if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
String html = htmlParseData.getHtml();

// Parsing Tags out of Jsoup
IIndexableContent indexableContent = store.parseIndexableContent(html);

/**
* This function is called when a page is fetched and ready
* to be processed by your program.
*/
@Override
public void visit(Page page) {
String url = page.getWebURL().getURL();
System.out.println("URL: " + url);
int currentIndex = numOfPagesIndexed.incrementAndGet();
// documentsIndexed.add(doSolrInputDocument);

if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
String text = htmlParseData.getText();
String html = htmlParseData.getHtml();
List<WebURL> links = htmlParseData.getOutgoingUrls();
// Parsing Tags out of Jsoup
Document doc = Jsoup.parse(html);
SolrInputDocument doSolrInputDocument = new SolrInputDocument();
doSolrInputDocument.setField("id", page.hashCode());
Elements linksList = doc.getElementsByTag("a");
String serverUrl = "http://localhost:8983/solr/collection1";
SolrServer solr = new HttpSolrServer(serverUrl);

// To do : replace the logic with async-io for faster execution.
for (Element link : linksList) {
String linkHref = link.attr("href");
System.out.println(linkHref + "printed attribute \n");
String linkText = link.text();
System.out.println(linkText + "printed text \n");
doSolrInputDocument.setField("features", linkHref);;
}

Elements paragraphList = doc.getElementsByTag("p");
for (Element parElement : paragraphList) {
String paragraphText = parElement.text();
System.out.println(paragraphText + "printed para text \n");
doSolrInputDocument.setField("features", paragraphText);
}

documentsIndexed.add(doSolrInputDocument);

/*
* Reducing the number of commits.
* To do : Replace commit with auto-commit on server side.
* http://stackoverflow.com/questions/17654266/solr-autocommit-vs-autosoftcommit
* To do : Replace add with async-io (Akka) since adds are blocking the thread.
*/
if(documentsIndexed.size() > NO_OF_DOCUMENT_TO_COMMIT) {
try {
solr.add(doSolrInputDocument);

solr.commit(true, true);
} catch(Exception e) {
System.out.println(e.getMessage());
e.printStackTrace();
}
}
System.out.println("Text length: " + text.length());
System.out.println("Html length: " + html.length());
System.out.println("Number of outgoing links: " + links.size());
}
/*
* Reducing the number of commits.
* To do : Replace commit with auto-commit on server side.
* http://stackoverflow.com/questions/17654266/solr-autocommit-vs-autosoftcommit
* To do : Replace add with async-io (Akka) since adds are blocking the thread.
*/
if(currentIndex <= NO_OF_DOCUMENT_TO_COMMIT) {
try {
store.add(indexableContent);
} catch(Exception e) {
System.out.println(e.getMessage());
e.printStackTrace();
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package Unit_testing;
package Unit_testing.solr_impl;

import java.util.List;

Expand Down
21 changes: 21 additions & 0 deletions src/main/java/Unit_testing/solr_impl/SolrIndexableContent.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package Unit_testing.solr_impl;

import Unit_testing.IIndexableContent;
import org.apache.solr.common.SolrInputDocument;

/**
 * Created by Avi Hayun on 12/10/2014.
 * Solr-backed indexable content: a thin wrapper around the
 * {@code SolrInputDocument} holding the fields extracted from a page.
 */
public class SolrIndexableContent implements IIndexableContent {

    // Backing Solr document; starts empty and is populated by the store.
    private SolrInputDocument solrDocument = new SolrInputDocument();

    /** Returns the wrapped Solr document. */
    public SolrInputDocument getDoSolrInputDocument() {
        return this.solrDocument;
    }

    /** Replaces the wrapped Solr document. */
    public void setDoSolrInputDocument(SolrInputDocument doSolrInputDocument) {
        this.solrDocument = doSolrInputDocument;
    }
}
60 changes: 60 additions & 0 deletions src/main/java/Unit_testing/solr_impl/SolrStore.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package Unit_testing.solr_impl;

import Unit_testing.IIndexableContent;
import Unit_testing.IStore;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
* Created by Avi Hayun on 12/10/2014.
* Solr implementation
*/
public class SolrStore implements IStore {
String serverUrl = "http://localhost:8983/solr/collection1";
SolrServer solr = new HttpSolrServer(serverUrl);

@Override
public SolrIndexableContent parseIndexableContent(String html) {
SolrIndexableContent sic = new SolrIndexableContent();

Document doc = Jsoup.parse(html);
SolrInputDocument doSolrInputDocument = new SolrInputDocument();
doSolrInputDocument.setField("id", html.hashCode());
Elements linksList = doc.getElementsByTag("a");

// To do : replace the logic with async-io for faster execution.
for (Element link : linksList) {
String linkHref = link.attr("href");
System.out.println("Link attribute:" + linkHref + " \n");
String linkText = link.text();
System.out.println("Link text: " + linkText + " \n");
doSolrInputDocument.setField("features", linkHref);
}

Elements paragraphList = doc.getElementsByTag("p");
for (Element parElement : paragraphList) {
String paragraphText = parElement.text();
System.out.println("Paragraph text: " + paragraphText + " \n");
doSolrInputDocument.setField("features", paragraphText);
}

sic.setDoSolrInputDocument(doSolrInputDocument);
return sic;
}

@Override
public void add(IIndexableContent indexableContent) throws IOException, SolrServerException {
SolrIndexableContent solrIndexableContent = (SolrIndexableContent) indexableContent;

solr.add(solrIndexableContent.getDoSolrInputDocument());
solr.commit(true, true);
}
}