diff --git a/README.textile b/README.textile index 181b495..26227f5 100644 --- a/README.textile +++ b/README.textile @@ -16,4 +16,24 @@ See individual readme files under the scripts directory for how to run. h2. Why is it called Varaha? -Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar. \ No newline at end of file +Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar. + +h2. How do I tokenize and tag text? + +register ../../lib/stanford-postagger-withModel.jar +register ../../target/varaha-1.0-SNAPSHOT.jar + +reviews = LOAD 'data/ten.avro' USING AvroStorage; +foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tagged; +DUMP foo + +reviews = LOAD 'data/ten.avro' USING AvroStorage(); +reviews = LIMIT reviews 1000; +bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences; +bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged; +DUMP bar + +reviews = LOAD 'data/ten.avro' USING AvroStorage(); +reviews = LIMIT reviews 1000; +bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens; +DUMP bar \ No newline at end of file diff --git a/lib/stanford-corenlp-1.3.4.jar b/lib/stanford-corenlp-1.3.4.jar new file mode 100644 index 0000000..03b6f07 Binary files /dev/null and b/lib/stanford-corenlp-1.3.4.jar differ diff --git a/lib/stanford-postagger-withModel.jar b/lib/stanford-postagger-withModel.jar new file mode 100644 index 0000000..938014f Binary files /dev/null and b/lib/stanford-postagger-withModel.jar differ diff --git a/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc b/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/scripts/pos_tagging/data/ten.avro/._SUCCESS.crc differ diff --git a/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc 
b/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc new file mode 100644 index 0000000..5964e5a Binary files /dev/null and b/scripts/pos_tagging/data/ten.avro/.part-r-00000.avro.crc differ diff --git a/scripts/pos_tagging/data/ten.avro/_SUCCESS b/scripts/pos_tagging/data/ten.avro/_SUCCESS new file mode 100755 index 0000000..e69de29 diff --git a/scripts/pos_tagging/data/ten.avro/part-r-00000.avro b/scripts/pos_tagging/data/ten.avro/part-r-00000.avro new file mode 100755 index 0000000..ac3e921 Binary files /dev/null and b/scripts/pos_tagging/data/ten.avro/part-r-00000.avro differ diff --git a/scripts/pos_tagging/test.pig b/scripts/pos_tagging/test.pig new file mode 100644 index 0000000..253ae4e --- /dev/null +++ b/scripts/pos_tagging/test.pig @@ -0,0 +1,17 @@ +register ../../lib/stanford-postagger-withModel.jar +register ../../target/varaha-1.0-SNAPSHOT.jar + +reviews = LOAD 'data/ten.avro' USING AvroStorage; +foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tokenized; +DUMP foo + +reviews = LOAD 'data/ten.avro' USING AvroStorage(); +reviews = LIMIT reviews 1000; +bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences; +bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged; +DUMP bar + +reviews = LOAD 'data/ten.avro' USING AvroStorage(); +reviews = LIMIT reviews 1000; +bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens; +DUMP bar \ No newline at end of file diff --git a/src/main/java/varaha/text/EntityRecognize.java b/src/main/java/varaha/text/EntityRecognize.java new file mode 100644 index 0000000..d627055 --- /dev/null +++ b/src/main/java/varaha/text/EntityRecognize.java @@ -0,0 +1,11 @@ +package varaha.text; + +/** + * Created with IntelliJ IDEA. 
+ * User: rjurney + * Date: 12/24/13 + * Time: 1:54 PM + * To change this template use File | Settings | File Templates. + */ +public class EntityRecognize { +} diff --git a/src/main/java/varaha/text/NGramTokenize.java b/src/main/java/varaha/text/NGramTokenize.java new file mode 100644 index 0000000..49e9c18 --- /dev/null +++ b/src/main/java/varaha/text/NGramTokenize.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package varaha.text; + +import java.io.IOException; +import java.io.StringReader; +import java.util.*; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.Tuple; +import org.apache.pig.data.TupleFactory; +import org.apache.pig.data.DataBag; +import org.apache.pig.data.BagFactory; + +import edu.stanford.nlp.process.DocumentPreprocessor; + +/** + * TokenizeText uses the Lucene libraries StandardAnalyzer class to tokenize a + * raw text input. A list of the stopwords used is available {@link StopWords}. + * Output is a pig bag containing tokens. + * + *
Example:
+ *
+ * register varaha.jar;
+ * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);
+ * tokenized = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(NGramTokenizer(text, 2)) AS (sentence:bag{token:tuple(word:chararray)}); + *
+ *
+ * <p>Implementation note: sentence splitting and word tokenizing are delegated to
+ * Stanford's {@code DocumentPreprocessor}; no Lucene analyzer or stop-word list is
+ * applied. The returned bag holds one tuple per sentence, and each of those tuples
+ * wraps a bag of single-field word tuples. The gram size passed as the second
+ * argument is validated but not applied yet.
+ *
+ * @see SentenceTokenize
+ * @author Russell Jurney
+ *
+ */
+public class NGramTokenizer extends EvalFunc<DataBag> {
+
+    // NOTE(review): this public class is called NGramTokenizer while the patch adds it as
+    // NGramTokenize.java; javac requires the two names to match, so one must be renamed.
+
+    private static final TupleFactory tupleFactory = TupleFactory.getInstance();
+    private static final BagFactory bagFactory = BagFactory.getInstance();
+
+    /**
+     * Splits the text in field 0 into sentences and every sentence into words.
+     *
+     * @param input tuple of (text, n), where n is the requested gram size
+     * @return a bag with one tuple per sentence, each wrapping a bag of one-field word
+     *         tuples; {@code null} when the tuple is null, empty, or its first field is null
+     * @throws IOException if the gram size is missing, null or not numeric
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 1 || input.isNull(0)) {
+            return null;
+        }
+
+        // Fail with a readable message instead of the NullPointerException /
+        // ClassCastException an unchecked (Integer) cast produces.
+        // TODO: the gram size is not applied yet; only per-sentence word bags are built.
+        checkGramSize(input);
+
+        StringReader textInput = new StringReader(input.get(0).toString());
+        DataBag bagOfSentences = bagFactory.newDefaultBag();
+
+        // DocumentPreprocessor iterates over sentences; each sentence is a list of words.
+        for (List<?> sentence : new DocumentPreprocessor(textInput)) {
+            DataBag sentenceBag = bagFactory.newDefaultBag();
+            for (Object word : sentence) {
+                sentenceBag.add(tupleFactory.newTuple(word.toString()));
+            }
+            bagOfSentences.add(tupleFactory.newTuple(sentenceBag));
+        }
+        return bagOfSentences;
+    }
+
+    /** Verifies that field 1 of the input tuple is present and holds a number. */
+    private static void checkGramSize(Tuple input) throws IOException {
+        if (input.size() < 2 || input.isNull(1)) {
+            throw new IOException("NGramTokenizer expects (text, n) but no gram size was supplied");
+        }
+        Object gramSize = input.get(1);
+        if (!(gramSize instanceof Number)) {
+            throw new IOException("NGramTokenizer gram size must be numeric, got "
+                    + gramSize.getClass().getName());
+        }
+    }
+}
diff --git a/src/main/java/varaha/text/SentenceTokenize.java b/src/main/java/varaha/text/SentenceTokenize.java new file mode 100644 index 0000000..5e9a059 --- /dev/null +++ b/src/main/java/varaha/text/SentenceTokenize.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package varaha.text; + +import java.io.IOException; +import java.io.StringReader; +import java.util.*; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.Tuple; +import org.apache.pig.data.TupleFactory; +import org.apache.pig.data.DataBag; +import org.apache.pig.data.BagFactory; + +import edu.stanford.nlp.process.DocumentPreprocessor; + +/** + * TokenizeText uses the Lucene libraries StandardAnalyzer class to tokenize a + * raw text input. A list of the stopwords used is available {@link StopWords}. + * Output is a pig bag containing tokens. + * + *
Example:
+ *
+ * register varaha.jar;
+ * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);
+ * tokenized = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(SentenceTokenize(text)) AS (sentence:bag{token:tuple(word:chararray)}); + *
+ *
+ * <p>Implementation note: sentence splitting and word tokenizing are delegated to
+ * Stanford's {@code DocumentPreprocessor}; no Lucene analyzer or stop-word list is
+ * applied. The returned bag holds one tuple per sentence, and each of those tuples
+ * wraps a bag of single-field word tuples.
+ *
+ * @see StanfordTokenize
+ * @author Russell Jurney
+ *
+ */
+public class SentenceTokenize extends EvalFunc<DataBag> {
+
+    private static final TupleFactory tupleFactory = TupleFactory.getInstance();
+    private static final BagFactory bagFactory = BagFactory.getInstance();
+
+    /**
+     * Splits the text in field 0 into sentences and every sentence into words.
+     *
+     * @param input tuple whose first field is the text to split
+     * @return a bag with one tuple per sentence, each wrapping a bag of one-field word
+     *         tuples; {@code null} when the tuple is null, empty, or its first field is null
+     * @throws IOException if the input tuple cannot be read
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 1 || input.isNull(0)) {
+            return null;
+        }
+
+        StringReader textInput = new StringReader(input.get(0).toString());
+        DataBag bagOfSentences = bagFactory.newDefaultBag();
+
+        // DocumentPreprocessor iterates over sentences; each sentence is a list of words.
+        for (List<?> sentence : new DocumentPreprocessor(textInput)) {
+            DataBag sentenceBag = bagFactory.newDefaultBag();
+            for (Object word : sentence) {
+                sentenceBag.add(tupleFactory.newTuple(word.toString()));
+            }
+            bagOfSentences.add(tupleFactory.newTuple(sentenceBag));
+        }
+        return bagOfSentences;
+    }
+}
diff --git a/src/main/java/varaha/text/StanfordPOSTag.java b/src/main/java/varaha/text/StanfordPOSTag.java new file mode 100644 index 0000000..0d9f36f --- /dev/null +++ b/src/main/java/varaha/text/StanfordPOSTag.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package varaha.text; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.process.DocumentPreprocessor; +import org.apache.pig.EvalFunc; +import org.apache.pig.data.Tuple; +import org.apache.pig.data.TupleFactory; +import org.apache.pig.data.DataBag; +import org.apache.pig.data.BagFactory; + +import edu.stanford.nlp.tagger.maxent.MaxentTagger; +import edu.stanford.nlp.process.PTBTokenizer; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.ling.Word; + +/** + * StanfordPOSTagger uses the Stanford Maximum Entropy Tagger class to Part-Of-Speech tag a + * raw text input. Output is a pig bag containing two-field tuples, of the format (word, tag). + * + *
Example:
+ *
+ * register varaha.jar;
+ * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);
+ * tokenized = FOREACH documents GENERATE doc_id AS doc_id, StanfordPOSTag(text) + * AS (b:bag{token:tuple(word:chararray, tag:chararray)}); + *
+ *
+ * <p>Field 0 may be either raw text (a chararray), which is split into sentences with
+ * {@code DocumentPreprocessor} before tagging, or a bag of single-word tuples, which is
+ * tagged as one sentence. Both forms yield one flat bag of (word, tag) tuples.
+ *
+ * @see StanfordTokenize
+ * @author Russell Jurney
+ *
+ */
+public class StanfordPOSTag extends EvalFunc<DataBag> {
+
+    private static final String MODEL_PATH =
+            "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
+
+    private static final TupleFactory tupleFactory = TupleFactory.getInstance();
+    private static final BagFactory bagFactory = BagFactory.getInstance();
+
+    // Created on first use and then reused by every later call.
+    private static MaxentTagger tagger;
+
+    // TODO: also accept a bag of sentence tuples (the un-flattened SentenceTokenize output).
+
+    /**
+     * Part-of-speech tags the text or word bag held in field 0.
+     *
+     * @param input tuple whose first field is a chararray or a bag of single-word tuples
+     * @return a bag of (word, tag) tuples; {@code null} when the tuple is null, empty, or
+     *         its first field is null
+     * @throws IOException if the tagger model cannot be loaded or field 0 has another type
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 1 || input.isNull(0)) {
+            return null;
+        }
+
+        MaxentTagger posTagger = loadTagger();
+        DataBag bagOfTokens = bagFactory.newDefaultBag();
+
+        Object inThing = input.get(0);
+        if (inThing instanceof String) {
+            DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader((String) inThing));
+            // HasWord is spelled out in full because this file does not import it.
+            for (List<edu.stanford.nlp.ling.HasWord> sentence : dp) {
+                // Every tagged word is added exactly once; nothing is re-added after the loop.
+                addTaggedWords(posTagger.apply(sentence), bagOfTokens);
+            }
+        } else if (inThing instanceof DataBag) {
+            List<Word> sentence = new ArrayList<Word>();
+            for (Tuple t : (DataBag) inThing) {
+                // Skip null or empty tuples and null words instead of failing on them.
+                if (t != null && t.size() > 0 && t.get(0) != null) {
+                    sentence.add(new Word(t.get(0).toString()));
+                }
+            }
+            if (!sentence.isEmpty()) {
+                addTaggedWords(posTagger.apply(sentence), bagOfTokens);
+            }
+        } else {
+            throw new IOException("StanfordPOSTag expects a chararray or a bag of word tuples, got "
+                    + inThing.getClass().getName());
+        }
+
+        return bagOfTokens;
+    }
+
+    /** Returns the shared tagger, loading the model on first use. */
+    private static MaxentTagger loadTagger() throws IOException {
+        if (tagger == null) {
+            try {
+                tagger = new MaxentTagger(MODEL_PATH);
+            } catch (Exception e) {
+                // Surface the failure now; continuing would only defer it to a
+                // NullPointerException on the first call that uses the tagger.
+                throw new IOException("Exception loading language model: " + MODEL_PATH, e);
+            }
+        }
+        return tagger;
+    }
+
+    /** Appends one (word, tag) tuple per tagged word to the output bag. */
+    private static void addTaggedWords(List<TaggedWord> taggedSentence, DataBag out) {
+        for (TaggedWord word : taggedSentence) {
+            out.add(tupleFactory.newTuple(Arrays.asList(word.word(), word.tag())));
+        }
+    }
+}
diff --git a/src/main/java/varaha/text/StanfordTokenize.java b/src/main/java/varaha/text/StanfordTokenize.java index e41a99c..25497e8 100644 --- a/src/main/java/varaha/text/StanfordTokenize.java +++ b/src/main/java/varaha/text/StanfordTokenize.java @@ -54,7 +54,7 @@ * * * @see - * @author Jacob Perkins + * @author Russell Jurney * */ public class StanfordTokenize extends EvalFunc { @@ -74,10 +74,12 @@ public DataBag exec(Tuple input) throws IOException { for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel)ptbt.next(); - Tuple termText = tupleFactory.newTuple(label.toString()); - bagOfTokens.add(termText); - } - + if(label.value().length() > 2) + { + Tuple termText = tupleFactory.newTuple(label.word()); + bagOfTokens.add(termText); + } + } return bagOfTokens; } }