Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/main/java/Unit_testing/IIndexableContent.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package Unit_testing;

/**
 * Created by Avi Hayun on 12/10/2014.
 * Marker (tagging) interface: implementing it flags a piece of parsed content
 * as eligible for indexing by a store. It deliberately declares no methods —
 * each store implementation decides what its indexable content carries.
 */
public interface IIndexableContent {
}
21 changes: 21 additions & 0 deletions src/main/java/Unit_testing/IStore.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package Unit_testing;

import org.apache.solr.client.solrj.SolrServerException;

import java.io.IOException;

/**
 * Created by Avi Hayun on 12/10/2014.
 * Interface representing the store type one wants to index one's content in
 */
public interface IStore {
    /**
     * Parses the HTML in order to extract only the wanted content.
     *
     * @param html the raw HTML of a fetched page
     * @return the store-specific indexable representation of that page
     */
    IIndexableContent parseIndexableContent(String html);

    /**
     * Adds the indexable content into the store.
     *
     * Narrowed from {@code throws Exception}: the file already imported
     * {@code IOException} and {@code SolrServerException} (previously unused),
     * and the only implementation throws exactly these. Callers that caught
     * {@code Exception} still compile.
     *
     * @param indexableContent the content to index; expected to be the type
     *                         returned by {@link #parseIndexableContent(String)}
     * @throws IOException         if communicating with the store fails
     * @throws SolrServerException if the store rejects the request
     */
    void add(IIndexableContent indexableContent) throws IOException, SolrServerException;
}
145 changes: 56 additions & 89 deletions src/main/java/Unit_testing/MyCrawler.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
package Unit_testing;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import Unit_testing.solr_impl.SolrStore;
import org.apache.solr.common.SolrInputDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
Expand All @@ -30,91 +23,65 @@
*/
public class MyCrawler extends WebCrawler {

private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
+ "|png|tiff?|mid|mp2|mp3|mp4"
+ "|wav|avi|mov|mpeg|ram|m4v|pdf"
+ "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

/*
* Copy on write list for testing across different threads.
* To do : replace with auto-commit or Async-io operation since commits are expensive).
*/
private List<SolrInputDocument> documentsIndexed = new CopyOnWriteArrayList<SolrInputDocument>();

String url = "http://localhost:8983/solr/collection1";


private int NO_OF_DOCUMENT_TO_COMMIT = 1;
/**
* You should implement this function to specify whether
* the given url should be crawled or not (based on your
* crawling logic).
*/
@Override
public boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}
/*
* Copy on write list for testing across different threads.
* To do : replace with auto-commit or Async-io operation since commits are expensive).
*/
private List<SolrInputDocument> documentsIndexed = new CopyOnWriteArrayList<SolrInputDocument>();
private static AtomicInteger numOfPagesIndexed = new AtomicInteger(0);
IStore store = new SolrStore();

private int NO_OF_DOCUMENT_TO_COMMIT = 3;

/**
* You should implement this function to specify whether
* the given url should be crawled or not (based on your
* crawling logic).
*/
@Override
public boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}

/**
* This function is called when a page is fetched and ready
* to be processed by your program.
*/
@Override
public void visit(Page page) {
String url = page.getWebURL().getURL();
System.out.println("URL: " + url);

if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
String html = htmlParseData.getHtml();

// Parsing Tags out of Jsoup
IIndexableContent indexableContent = store.parseIndexableContent(html);

/**
* This function is called when a page is fetched and ready
* to be processed by your program.
*/
@Override
public void visit(Page page) {
String url = page.getWebURL().getURL();
System.out.println("URL: " + url);
int currentIndex = numOfPagesIndexed.incrementAndGet();
// documentsIndexed.add(doSolrInputDocument);

if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
String text = htmlParseData.getText();
String html = htmlParseData.getHtml();
List<WebURL> links = htmlParseData.getOutgoingUrls();
// Parsing Tags out of Jsoup
Document doc = Jsoup.parse(html);
SolrInputDocument doSolrInputDocument = new SolrInputDocument();
doSolrInputDocument.setField("id", page.hashCode());
Elements linksList = doc.getElementsByTag("a");
String serverUrl = "http://localhost:8983/solr/collection1";
SolrServer solr = new HttpSolrServer(serverUrl);

// To do : replace the logic with async-io for faster execution.
for (Element link : linksList) {
String linkHref = link.attr("href");
System.out.println(linkHref + "printed attribute \n");
String linkText = link.text();
System.out.println(linkText + "printed text \n");
doSolrInputDocument.setField("features", linkHref);;
}

Elements paragraphList = doc.getElementsByTag("p");
for (Element parElement : paragraphList) {
String paragraphText = parElement.text();
System.out.println(paragraphText + "printed para text \n");
doSolrInputDocument.setField("features", paragraphText);
}

documentsIndexed.add(doSolrInputDocument);

/*
* Reducing the number of commits.
* To do : Replace commit with auto-commit on server side.
* http://stackoverflow.com/questions/17654266/solr-autocommit-vs-autosoftcommit
* To do : Replace add with async-io (Akka) since adds are blocking the thread.
*/
if(documentsIndexed.size() > NO_OF_DOCUMENT_TO_COMMIT) {
try {
solr.add(doSolrInputDocument);

solr.commit(true, true);
} catch(Exception e) {
System.out.println(e.getMessage());
e.printStackTrace();
}
}
System.out.println("Text length: " + text.length());
System.out.println("Html length: " + html.length());
System.out.println("Number of outgoing links: " + links.size());
}
/*
* Reducing the number of commits.
* To do : Replace commit with auto-commit on server side.
* http://stackoverflow.com/questions/17654266/solr-autocommit-vs-autosoftcommit
* To do : Replace add with async-io (Akka) since adds are blocking the thread.
*/
if(currentIndex <= NO_OF_DOCUMENT_TO_COMMIT) {
try {
store.add(indexableContent);
} catch(Exception e) {
System.out.println(e.getMessage());
e.printStackTrace();
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package Unit_testing;
package Unit_testing.solr_impl;

import java.util.List;

Expand Down
21 changes: 21 additions & 0 deletions src/main/java/Unit_testing/solr_impl/SolrIndexableContent.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package Unit_testing.solr_impl;

import Unit_testing.IIndexableContent;
import org.apache.solr.common.SolrInputDocument;

/**
 * Created by Avi Hayun on 12/10/2014.
 * Solr-backed indexable content: a thin wrapper around the
 * {@code SolrInputDocument} holding the fields extracted from a page.
 */
public class SolrIndexableContent implements IIndexableContent {

    // Backing Solr document; starts empty and is populated by the store.
    private SolrInputDocument solrDocument = new SolrInputDocument();

    /** Returns the wrapped Solr document. */
    public SolrInputDocument getDoSolrInputDocument() {
        return this.solrDocument;
    }

    /** Replaces the wrapped Solr document. */
    public void setDoSolrInputDocument(SolrInputDocument doSolrInputDocument) {
        this.solrDocument = doSolrInputDocument;
    }
}
60 changes: 60 additions & 0 deletions src/main/java/Unit_testing/solr_impl/SolrStore.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package Unit_testing.solr_impl;

import Unit_testing.IIndexableContent;
import Unit_testing.IStore;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
* Created by Avi Hayun on 12/10/2014.
* Solr implementation
*/
public class SolrStore implements IStore {
String serverUrl = "http://localhost:8983/solr/collection1";
SolrServer solr = new HttpSolrServer(serverUrl);

@Override
public SolrIndexableContent parseIndexableContent(String html) {
SolrIndexableContent sic = new SolrIndexableContent();

Document doc = Jsoup.parse(html);
SolrInputDocument doSolrInputDocument = new SolrInputDocument();
doSolrInputDocument.setField("id", html.hashCode());
Elements linksList = doc.getElementsByTag("a");

// To do : replace the logic with async-io for faster execution.
for (Element link : linksList) {
String linkHref = link.attr("href");
System.out.println("Link attribute:" + linkHref + " \n");
String linkText = link.text();
System.out.println("Link text: " + linkText + " \n");
doSolrInputDocument.setField("features", linkHref);
}

Elements paragraphList = doc.getElementsByTag("p");
for (Element parElement : paragraphList) {
String paragraphText = parElement.text();
System.out.println("Paragraph text: " + paragraphText + " \n");
doSolrInputDocument.setField("features", paragraphText);
}

sic.setDoSolrInputDocument(doSolrInputDocument);
return sic;
}

@Override
public void add(IIndexableContent indexableContent) throws IOException, SolrServerException {
SolrIndexableContent solrIndexableContent = (SolrIndexableContent) indexableContent;

solr.add(solrIndexableContent.getDoSolrInputDocument());
solr.commit(true, true);
}
}