diff --git a/README.textile b/README.textile
index 181b495..26227f5 100644
--- a/README.textile
+++ b/README.textile
@@ -16,4 +16,24 @@ See individual readme files under the scripts directory for how to run.
h2. Why is it called Varaha?
-Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar.
\ No newline at end of file
+Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar.
+
+h2. How do I tokenize and tag text?
+
+register ../../lib/stanford-postagger-withModel.jar
+register ../../target/varaha-1.0-SNAPSHOT.jar
+
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tokenized;
+DUMP foo;
+
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences;
+bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged;
+DUMP bar;
+
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens;
+DUMP bar;
\ No newline at end of file
diff --git a/lib/stanford-corenlp-1.3.4.jar b/lib/stanford-corenlp-1.3.4.jar
new file mode 100644
index 0000000..03b6f07
Binary files /dev/null and b/lib/stanford-corenlp-1.3.4.jar differ
diff --git a/lib/stanford-postagger-withModel.jar b/lib/stanford-postagger-withModel.jar
new file mode 100644
index 0000000..938014f
Binary files /dev/null and b/lib/stanford-postagger-withModel.jar differ
diff --git a/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc b/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc
new file mode 100644
index 0000000..3b7b044
Binary files /dev/null and b/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc differ
diff --git a/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc b/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc
new file mode 100644
index 0000000..5964e5a
Binary files /dev/null and b/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc differ
diff --git a/scripts/pos_tagging/data/ten.avro/_SUCCESS b/scripts/pos_tagging/data/ten.avro/_SUCCESS
new file mode 100755
index 0000000..e69de29
diff --git a/scripts/pos_tagging/data/ten.avro/part-r-00000.avro b/scripts/pos_tagging/data/ten.avro/part-r-00000.avro
new file mode 100755
index 0000000..ac3e921
Binary files /dev/null and b/scripts/pos_tagging/data/ten.avro/part-r-00000.avro differ
diff --git a/scripts/pos_tagging/test.pig b/scripts/pos_tagging/test.pig
new file mode 100644
index 0000000..253ae4e
--- /dev/null
+++ b/scripts/pos_tagging/test.pig
@@ -0,0 +1,17 @@
+register ../../lib/stanford-postagger-withModel.jar
+register ../../target/varaha-1.0-SNAPSHOT.jar
+-- Example 1: apply varaha.text.StanfordTokenize to the text field of each review.
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tokenized;
+DUMP foo;
+-- Example 2: FLATTEN the output of SentenceTokenize, then apply StanfordPOSTag to each flattened value.
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences;
+bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged;
+DUMP bar;
+-- Example 3: feed the output of StanfordTokenize directly into StanfordPOSTag in a single FOREACH.
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens;
+DUMP bar;
\ No newline at end of file
diff --git a/src/main/java/varaha/text/EntityRecognize.java b/src/main/java/varaha/text/EntityRecognize.java
new file mode 100644
index 0000000..d627055
--- /dev/null
+++ b/src/main/java/varaha/text/EntityRecognize.java
@@ -0,0 +1,11 @@
+package varaha.text;
+
+/**
+ * Placeholder class: it currently declares no fields, constructors, or
+ * methods.
+ *
+ * NOTE(review): the name suggests a future entity-recognition UDF next to
+ * the tokenizer and POS-tagger functions in this package; confirm intent.
+ */
+public class EntityRecognize {
+}
diff --git a/src/main/java/varaha/text/NGramTokenize.java b/src/main/java/varaha/text/NGramTokenize.java
new file mode 100644
index 0000000..49e9c18
--- /dev/null
+++ b/src/main/java/varaha/text/NGramTokenize.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package varaha.text;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.BagFactory;
+
+import edu.stanford.nlp.process.DocumentPreprocessor;
+
+/**
+ * TokenizeText uses the Lucene libraries StandardAnalyzer class to tokenize a
+ * raw text input. A list of the stopwords used is available {@link StopWords}.
+ * Output is a pig bag containing tokens.
+ *
+ * Example:
+ *
+ * register varaha.jar;
+ * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);
+ * tokenized = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(StanfordTokenize(text)) AS (token:chararray);
+ *