alienrobotwizard · rjurney · Jan 29, 2013 · Jan 29, 2013 · Jan 30, 2013 · Dec 19, 2013
diff --git a/README.textile b/README.textile
@@ -16,4 +16,24 @@ See individual readme files under the scripts directory for how to run.
 
 h2. Why is it called Varaha?
 
-Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar.
+Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar.
+
+h2. How do I tokenize and tag text?
+
+bc.. register ../../lib/stanford-postagger-withModel.jar
+register ../../target/varaha-1.0-SNAPSHOT.jar
+
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tokenized;
+DUMP foo;
+
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences;
+bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged;
+DUMP bar;
+
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens;
+DUMP bar;
diff --git a/lib/stanford-corenlp-1.3.4.jar b/lib/stanford-corenlp-1.3.4.jar
diff --git a/lib/stanford-postagger-withModel.jar b/lib/stanford-postagger-withModel.jar
diff --git a/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc b/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc
diff --git a/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc b/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc
diff --git a/scripts/pos_tagging/data/ten.avro/_SUCCESS b/scripts/pos_tagging/data/ten.avro/_SUCCESS
diff --git a/scripts/pos_tagging/data/ten.avro/part-r-00000.avro b/scripts/pos_tagging/data/ten.avro/part-r-00000.avro
diff --git a/scripts/pos_tagging/test.pig b/scripts/pos_tagging/test.pig
@@ -0,0 +1,17 @@
+register ../../lib/stanford-postagger-withModel.jar;
+register ../../target/varaha-1.0-SNAPSHOT.jar;
+-- Example 1: tokenize every review into a bag of tokens.
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tokenized;
+DUMP foo;
+-- Example 2: split reviews into tokenized sentences, then POS-tag one sentence per record.
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences;
+bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged;
+DUMP bar;
+-- Example 3: tokenize the whole review and POS-tag the resulting bag of tokens in one call.
+reviews = LOAD 'data/ten.avro' USING AvroStorage();
+reviews = LIMIT reviews 1000;
+bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens;
+DUMP bar;
diff --git a/src/main/java/varaha/text/EntityRecognize.java b/src/main/java/varaha/text/EntityRecognize.java
@@ -0,0 +1,11 @@
+package varaha.text;
+
+/**
+ * Placeholder class: it declares no fields or methods yet.
+ *
+ * NOTE(review): the name suggests a future named-entity-recognition UDF next
+ * to the other varaha.text tokenizers and taggers -- confirm the intent, or
+ * leave this file out of the patch until it has an implementation.
+ */
+public class EntityRecognize {
+}
diff --git a/src/main/java/varaha/text/NGramTokenize.java b/src/main/java/varaha/text/NGramTokenize.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package varaha.text;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.BagFactory;
+
+import edu.stanford.nlp.process.DocumentPreprocessor;
+
+/**
+ * NGramTokenize splits raw text into sentences with the Stanford
+ * DocumentPreprocessor and emits the word n-grams of every sentence. Output is
+ * a pig bag with one tuple per sentence; each tuple wraps a bag of single-field
+ * tuples, one per n-gram (its tokens joined by a single space). With n = 1 that
+ * is every token on its own; a sentence shorter than n yields an empty bag.
+ *
+ * <dl>
+ * <dt><b>Example:</b></dt>
+ * <dd><code>
+ * register varaha.jar;<br/>
+ * documents    = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/>
+ * bigrams      = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(varaha.text.NGramTokenize(text, 2)) AS sentence_bigrams;
+ * </code></dd>
+ * </dl>
+ * @author Russell Jurney
+ */
+public class NGramTokenize extends EvalFunc<DataBag> {
+    private static final TupleFactory tupleFactory = TupleFactory.getInstance();
+    private static final BagFactory bagFactory = BagFactory.getInstance();
+
+    /**
+     * @param input (text, n): the text to split (via toString()) and the n-gram size
+     * @return bag of per-sentence n-gram bags, or null when the text or n is missing or null
+     * @throws IOException if n is not a number or is smaller than 1
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 2 || input.isNull(0) || input.isNull(1))
+            return null;
+        if (!(input.get(1) instanceof Number) || ((Number) input.get(1)).intValue() < 1)
+            throw new IOException("NGramTokenize: n must be a positive number, got " + input.get(1));
+        int n = ((Number) input.get(1)).intValue();
+        DataBag bagOfSentences = bagFactory.newDefaultBag();
+        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(input.get(0).toString()));
+        for (List<?> sentence : dp) {
+            DataBag sentenceBag = bagFactory.newDefaultBag();
+            for (int start = 0; start <= sentence.size() - n; start++) {
+                StringBuilder gram = new StringBuilder(sentence.get(start).toString());
+                for (int i = start + 1; i < start + n; i++)
+                    gram.append(' ').append(sentence.get(i).toString());
+                sentenceBag.add(tupleFactory.newTuple(gram.toString()));
+            }
+            bagOfSentences.add(tupleFactory.newTuple(sentenceBag));
+        }
+        return bagOfSentences;
+    }
+}
diff --git a/src/main/java/varaha/text/SentenceTokenize.java b/src/main/java/varaha/text/SentenceTokenize.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package varaha.text;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.BagFactory;
+
+import edu.stanford.nlp.process.DocumentPreprocessor;
+
+/**
+ * SentenceTokenize uses the Stanford DocumentPreprocessor to split raw text
+ * into sentences and each sentence into tokens. Output is a pig bag with one
+ * tuple per sentence; each of those tuples holds a bag of single-field token
+ * tuples.
+ *
+ * <dl>
+ * <dt><b>Example:</b></dt>
+ * <dd><code>
+ * register varaha.jar;<br/>
+ * documents    = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/>
+ * sentences    = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS sentence;
+ * </code></dd>
+ * </dl>
+ *
+ * @author Russell Jurney
+ */
+public class SentenceTokenize extends EvalFunc<DataBag> {
+
+    private static TupleFactory tupleFactory = TupleFactory.getInstance();
+    private static BagFactory bagFactory = BagFactory.getInstance();
+
+    /**
+     * Splits the text held in the first field into sentences of tokens.
+     * @param input tuple whose first field is the text (converted with toString())
+     * @return bag of sentence tuples, or null when the input or its first field is null or absent
+     * @throws IOException propagated from reading the tuple field
+     */
+    public DataBag exec(Tuple input) throws IOException {
+        boolean missing = input == null || input.size() < 1 || input.isNull(0);
+        if (missing) {
+            return null;
+        }
+
+        DataBag sentences = bagFactory.newDefaultBag();
+        DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(input.get(0).toString()));
+        for (List<?> sentence : splitter) {
+            DataBag tokens = bagFactory.newDefaultBag();
+            for (Object token : sentence) {
+                tokens.add(tupleFactory.newTuple(token.toString()));
+            }
+            sentences.add(tupleFactory.newTuple(tokens));
+        }
+        return sentences;
+    }
+}
diff --git a/src/main/java/varaha/text/StanfordPOSTag.java b/src/main/java/varaha/text/StanfordPOSTag.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package varaha.text;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import edu.stanford.nlp.ling.TaggedWord;
+import edu.stanford.nlp.process.DocumentPreprocessor;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.BagFactory;
+
+import edu.stanford.nlp.tagger.maxent.MaxentTagger;
+import edu.stanford.nlp.process.PTBTokenizer;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.process.CoreLabelTokenFactory;
+import edu.stanford.nlp.ling.Word;
+
+/**
+ * StanfordPOSTag uses the Stanford Maximum Entropy Tagger class to Part-Of-Speech tag its
+ * input. The input is either raw text (a String), which is split into sentences first, or a
+ * bag of single-field token tuples, which is tagged as one sentence. Output is a pig bag
+ * containing two-field tuples, of the format (word, tag).
+ *
+ * <dl>
+ * <dt><b>Example:</b></dt>
+ * <dd><code>
+ * register varaha.jar;<br/>
+ * documents    = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/>
+ * tagged       = FOREACH documents GENERATE doc_id AS doc_id, varaha.text.StanfordPOSTag(text)
+ *                                  AS (b:bag{token:tuple(word:chararray, tag:chararray)});
+ * </code></dd>
+ * </dl>
+ *
+ * @author Russell Jurney
+ */
+public class StanfordPOSTag extends EvalFunc<DataBag> {
+
+    private static final String MODEL_PATH =
+        "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
+
+    private static TupleFactory tupleFactory = TupleFactory.getInstance();
+    private static BagFactory bagFactory = BagFactory.getInstance();
+    // Created on the first exec() call that needs it and reused afterwards.
+    private static MaxentTagger tagger;
+
+    /**
+     * Tags the first field of the input tuple.
+     *
+     * @param input tuple whose first field is a String of raw text or a DataBag of token tuples
+     * @return bag of (word, tag) tuples, or null when the input or its first field is null or absent
+     * @throws IOException if the tagger model cannot be loaded or the field has another type
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 1 || input.isNull(0))
+            return null;
+
+        if (tagger == null) {
+            try {
+                tagger = new MaxentTagger(MODEL_PATH);
+            } catch (Exception e) {
+                // Fail here with the cause attached instead of hitting a null tagger further down.
+                throw new IOException("Exception loading language model: " + MODEL_PATH, e);
+            }
+        }
+
+        // Output bag
+        DataBag bagOfTokens = bagFactory.newDefaultBag();
+
+        Object inThing = input.get(0);
+        if (inThing instanceof String) {
+            DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader((String) inThing));
+            // Raw List as before: the sentence element type is not imported by this file.
+            for (List sentence : dp) {
+                List<TaggedWord> taggedSentence = tagger.apply(sentence);
+                addTaggedWords(taggedSentence, bagOfTokens);
+            }
+        } else if (inThing instanceof DataBag) {
+            // The whole bag is treated as one already-tokenized sentence; null tokens are skipped.
+            List<Word> sentence = new ArrayList<Word>();
+            for (Tuple t : (DataBag) inThing) {
+                if (t.get(0) != null) {
+                    sentence.add(new Word(t.get(0).toString()));
+                }
+            }
+            List<TaggedWord> taggedSentence = tagger.apply(sentence);
+            addTaggedWords(taggedSentence, bagOfTokens);
+        } else {
+            throw new IOException("StanfordPOSTag expects a chararray or a bag of tokens, got "
+                + inThing.getClass().getName());
+        }
+
+        return bagOfTokens;
+    }
+
+    /**
+     * Appends one (word, tag) tuple per tagged word to the bag, each exactly once.
+     * @param taggedSentence tagger output for one sentence
+     * @param bag            bag receiving the tuples
+     */
+    private static void addTaggedWords(List<TaggedWord> taggedSentence, DataBag bag) {
+        for (TaggedWord word : taggedSentence) {
+            bag.add(tupleFactory.newTuple(Arrays.asList(word.word(), word.tag())));
+        }
+    }
+}
diff --git a/src/main/java/varaha/text/StanfordTokenize.java b/src/main/java/varaha/text/StanfordTokenize.java
@@ -54,7 +54,7 @@
  * </dl>
  * 
  * @see
- * @author Jacob Perkins
+ * @author Russell Jurney
  *
  */
 public class StanfordTokenize extends EvalFunc<DataBag> {
@@ -74,10 +74,12 @@ public DataBag exec(Tuple input) throws IOException {
 
         for (CoreLabel label; ptbt.hasNext(); ) {
           label = (CoreLabel)ptbt.next();
-          Tuple termText = tupleFactory.newTuple(label.toString());
-          bagOfTokens.add(termText);
-        }
-
+          if(label.value().length() > 2)
+          {
+            Tuple termText = tupleFactory.newTuple(label.word());
+            bagOfTokens.add(termText);
+          }
+        }  
         return bagOfTokens;
     }
 }